From 3359c1e5ddd18de6ace98aacdd0e38550999275e Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 30 May 2019 15:58:12 -0700 Subject: [PATCH 001/926] vp8: restrict 1st pass cpu_used range < 4 isn't meaningful in the first pass; additional analysis will be done, but thrown out, unnecessarily increasing the runtime. Change-Id: Ic3de77e3eaa7a8a3371f76f84693e9655c60fdba --- vp8/vp8_cx_iface.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index d65bf9652e..bda109e7a3 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -370,6 +370,9 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf, #endif oxcf->cpu_used = vp8_cfg.cpu_used; + if (cfg.g_pass == VPX_RC_FIRST_PASS) { + oxcf->cpu_used = VPXMAX(4, oxcf->cpu_used); + } oxcf->encode_breakout = vp8_cfg.static_thresh; oxcf->play_alternate = vp8_cfg.enable_auto_alt_ref; oxcf->noise_sensitivity = vp8_cfg.noise_sensitivity; From 7370cecd8929141adb8140b924d3dd8ac1887d36 Mon Sep 17 00:00:00 2001 From: angiebird Date: Mon, 10 Aug 2020 15:37:24 -0700 Subject: [PATCH 002/926] Close out file in EndEncode() Change-Id: Ib6549f954ce6d5d966eef09a119b46f0cc2f54f7 --- vp9/simple_encode.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index 46b25d1fdf..ba076fd586 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -959,6 +959,10 @@ void SimpleEncode::EndEncode() { impl_ptr_->cpi = nullptr; vpx_img_free(&impl_ptr_->tmp_img); rewind(in_file_); + if (out_file_ != nullptr) { + fclose(out_file_); + out_file_ = nullptr; + } } void SimpleEncode::UpdateKeyFrameGroup(int key_frame_show_index) { From 8b8b15e086dae3ff99c6096c5c6b6b85eb2d017a Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 27 Oct 2020 11:02:13 -0700 Subject: [PATCH 003/926] Add cmd line option to control loopfilter for vpxenc Change-Id: I4f5e6ce2f1b535a586bdb6c9e55a3d49ebf61af4 --- vpxenc.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vpxenc.c b/vpxenc.c index 64288e83d2..8c92f23917 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -465,6 +465,13 @@ static const arg_def_t target_level = ARG_DEF( static const arg_def_t row_mt = ARG_DEF(NULL, "row-mt", 1, "Enable row based non-deterministic multi-threading in VP9"); + +static const arg_def_t disable_loopfilter = + ARG_DEF(NULL, "disable-loopfilter", 1, + "Control Loopfilter in VP9\n" + "0: Loopfilter on for all frames (default)\n" + "1: Loopfilter off for non reference frames\n" + "2: Loopfilter off for all frames"); #endif #if CONFIG_VP9_ENCODER @@ -495,6 +502,7 @@ static const arg_def_t *vp9_args[] = { &cpu_used_vp9, &max_gf_interval, &target_level, &row_mt, + &disable_loopfilter, #if CONFIG_VP9_HIGHBITDEPTH &bitdeptharg, &inbitdeptharg, @@ -527,6 +535,7 @@ static const int vp9_arg_ctrl_map[] = { VP8E_SET_CPUUSED, VP9E_SET_MAX_GF_INTERVAL, VP9E_SET_TARGET_LEVEL, VP9E_SET_ROW_MT, + VP9E_SET_DISABLE_LOOPFILTER, 0 }; #endif From 89ddf6f32a94c636ac9e6de4096295a058269222 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 27 Oct 2020 17:09:08 -0700 Subject: [PATCH 004/926] vp9_ext_ratectrl_test: add missing override for ~ExtRateCtrlTest() Change-Id: I311a400093c8c1ee2c002ba000d0b33c4fde209f --- test/vp9_ext_ratectrl_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index a8c4032d45..8db0a358d0 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -150,7 +150,7 @@ class ExtRateCtrlTest : public 
::libvpx_test::EncoderTest, protected: ExtRateCtrlTest() : EncoderTest(&::libvpx_test::kVP9) {} - ~ExtRateCtrlTest() = default; + ~ExtRateCtrlTest() override = default; void SetUp() override { InitializeConfig(); From 8b27a92490347d7e5e818a9371783c17dc0a4da8 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Sun, 20 Sep 2020 11:57:13 -0700 Subject: [PATCH 005/926] Add a comment about bitdeptharg and inbitdeptharg Add a comment to vp9_args to point out that bitdeptharg and inbitdeptharg do not have a corresponding entry in vp9_arg_ctrl_map and must be listed at the end of vp9_args. Change-Id: Ic9834ab72599c067156ca5a315824c7f0760824a --- vpxenc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vpxenc.c b/vpxenc.c index 8c92f23917..5d7546eb28 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -503,6 +503,9 @@ static const arg_def_t *vp9_args[] = { &cpu_used_vp9, &target_level, &row_mt, &disable_loopfilter, +// NOTE: The entries above have a corresponding entry in vp9_arg_ctrl_map. The +// entries below do not have a corresponding entry in vp9_arg_ctrl_map. They +// must be listed at the end of vp9_args. #if CONFIG_VP9_HIGHBITDEPTH &bitdeptharg, &inbitdeptharg, From 9ab65c55d987be33601c6922ce62a1456b122fe2 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 29 Oct 2020 15:46:21 -0700 Subject: [PATCH 006/926] libs.mk: set LC_ALL=C w/egrep invocations This guarantees consistent interpretation of the character ranges. BUG=webm:1711 Change-Id: Ia9123f079cc7ac248b9eff4d817e2e103d627b2b --- libs.mk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs.mk b/libs.mk index d14439a3da..b5bc35755c 100644 --- a/libs.mk +++ b/libs.mk @@ -420,13 +420,13 @@ ifeq ($(VPX_ARCH_X86)$(VPX_ARCH_X86_64),yes) # YASM $(BUILD_PFX)vpx_config.asm: $(BUILD_PFX)vpx_config.h @echo " [CREATE] $@" - @egrep "#define [A-Z0-9_]+ [01]" $< \ + @LC_ALL=C egrep "#define [A-Z0-9_]+ [01]" $< \ | awk '{print $$2 " equ " $$3}' > $@ else ADS2GAS=$(if $(filter yes,$(CONFIG_GCC)),| $(ASM_CONVERSION)) $(BUILD_PFX)vpx_config.asm: $(BUILD_PFX)vpx_config.h @echo " [CREATE] $@" - @egrep "#define [A-Z0-9_]+ [01]" $< \ + @LC_ALL=C egrep "#define [A-Z0-9_]+ [01]" $< \ | awk '{print $$2 " EQU " $$3}' $(ADS2GAS) > $@ @echo " END" $(ADS2GAS) >> $@ CLEAN-OBJS += $(BUILD_PFX)vpx_config.asm From b1d704f12af9b96b39ce1e1493c36bb4b3a3fb2a Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Thu, 5 Nov 2020 15:26:54 -0800 Subject: [PATCH 007/926] Accumulate frame tpl stats and pass through rate control api Tpl stats are computed at the beginning of encoding the altref frame. We aggregate the tpl stats of all blocks for every frame in the current group of pictures. After the altref frame is encoded, the tpl stats are passed through the encode frame result to the external environment.
Change-Id: I2284f8cf9c45d35ba02f3ea45f0187edbbf48294 --- vp9/encoder/vp9_encoder.c | 53 ++++++++++++++++++++++++++++++++++++++- vp9/encoder/vp9_encoder.h | 19 ++++++++++++++ vp9/simple_encode.cc | 30 ++++++++++++++++++++-- vp9/simple_encode.h | 30 ++++++++++++++++++++++ 4 files changed, 129 insertions(+), 3 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index f2e6ba1acd..f4587d42d9 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1025,6 +1025,7 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { free_partition_info(cpi); free_motion_vector_info(cpi); free_fp_motion_vector_info(cpi); + free_tpl_stats_info(cpi); #endif vp9_free_ref_frame_buffers(cm->buffer_pool); @@ -2665,6 +2666,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, partition_info_init(cpi); motion_vector_info_init(cpi); fp_motion_vector_info_init(cpi); + tpl_stats_info_init(cpi); #endif return cpi; @@ -5306,6 +5308,7 @@ static void update_encode_frame_result( #if CONFIG_RATE_CTRL const PARTITION_INFO *partition_info, const MOTION_VECTOR_INFO *motion_vector_info, + const TplDepStats *tpl_stats_info, #endif // CONFIG_RATE_CTRL ENCODE_FRAME_RESULT *encode_frame_result); #endif // !CONFIG_REALTIME_ONLY @@ -5520,7 +5523,7 @@ static void encode_frame_to_data_rate( cpi->Source, coded_frame_buf, ref_frame_bufs, vp9_get_quantizer(cpi), cm->bit_depth, cpi->oxcf.input_bit_depth, cpi->td.counts, #if CONFIG_RATE_CTRL - cpi->partition_info, cpi->motion_vector_info, + cpi->partition_info, cpi->motion_vector_info, cpi->tpl_stats_info, #endif // CONFIG_RATE_CTRL encode_frame_result); } @@ -7371,6 +7374,48 @@ static void free_tpl_buffer(VP9_COMP *cpi) { } } +#if CONFIG_RATE_CTRL +static void accumulate_frame_tpl_stats(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + int show_frame_count = 0; + int frame_idx; + // Accumulate tpl stats for each frame in the current group of picture. 
+ for (frame_idx = 1; frame_idx < gf_group->gf_group_size; ++frame_idx) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + if (!tpl_frame->is_valid) continue; + + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + const int tpl_stride = tpl_frame->stride; + int64_t intra_cost_base = 0; + int64_t inter_cost_base = 0; + int64_t mc_dep_cost_base = 0; + int64_t mc_ref_cost_base = 0; + int64_t mc_flow_base = 0; + int row, col; + + for (row = 0; row < cm->mi_rows && tpl_frame->is_valid; ++row) { + for (col = 0; col < cm->mi_cols; ++col) { + TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col]; + intra_cost_base += this_stats->intra_cost; + inter_cost_base += this_stats->inter_cost; + mc_dep_cost_base += this_stats->mc_dep_cost; + mc_ref_cost_base += this_stats->mc_ref_cost; + mc_flow_base += this_stats->mc_flow; + } + } + + cpi->tpl_stats_info[show_frame_count].intra_cost = intra_cost_base; + cpi->tpl_stats_info[show_frame_count].inter_cost = inter_cost_base; + cpi->tpl_stats_info[show_frame_count].mc_dep_cost = mc_dep_cost_base; + cpi->tpl_stats_info[show_frame_count].mc_ref_cost = mc_ref_cost_base; + cpi->tpl_stats_info[show_frame_count].mc_flow = mc_flow_base; + + ++show_frame_count; + } +} +#endif // CONFIG_RATE_CTRL + static void setup_tpl_stats(VP9_COMP *cpi) { GF_PICTURE gf_picture[MAX_ARF_GOP_SIZE]; const GF_GROUP *gf_group = &cpi->twopass.gf_group; @@ -7393,6 +7438,10 @@ static void setup_tpl_stats(VP9_COMP *cpi) { dump_tpl_stats(cpi, tpl_group_frames, gf_group, gf_picture, cpi->tpl_bsize); #endif // DUMP_TPL_STATS #endif // CONFIG_NON_GREEDY_MV + +#if CONFIG_RATE_CTRL + accumulate_frame_tpl_stats(cpi); +#endif // CONFIG_RATE_CTRL } void vp9_get_ref_frame_info(FRAME_UPDATE_TYPE update_type, int ref_frame_flags, @@ -7575,6 +7624,7 @@ static void update_encode_frame_result( #if CONFIG_RATE_CTRL const PARTITION_INFO *partition_info, const MOTION_VECTOR_INFO *motion_vector_info, + const TplDepStats *tpl_stats_info, #endif // CONFIG_RATE_CTRL ENCODE_FRAME_RESULT *encode_frame_result) { #if CONFIG_RATE_CTRL @@ -7598,6 +7648,7 @@ static void update_encode_frame_result( copy_frame_counts(counts, &encode_frame_result->frame_counts); encode_frame_result->partition_info = partition_info; encode_frame_result->motion_vector_info = motion_vector_info; + encode_frame_result->tpl_stats_info = tpl_stats_info; if (encode_frame_result->coded_frame.allocated) { yv12_buffer_to_image_buffer(&coded_frame_buf->buf, &encode_frame_result->coded_frame); diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 91cb6f5b14..8763a5e789 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -973,6 +973,7 @@ typedef struct VP9_COMP { PARTITION_INFO *partition_info; MOTION_VECTOR_INFO *motion_vector_info; MOTION_VECTOR_INFO *fp_motion_vector_info; + TplDepStats *tpl_stats_info; RATE_QSTEP_MODEL rq_model[ENCODE_FRAME_TYPES]; #endif @@ -1029,6 +1030,23 @@ static INLINE void free_motion_vector_info(struct VP9_COMP *cpi) { cpi->motion_vector_info = NULL; } +// Allocates memory for the tpl stats information. +// Only called once in vp9_create_compressor(). +static INLINE void tpl_stats_info_init(struct VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + CHECK_MEM_ERROR( + cm, cpi->tpl_stats_info, + (TplDepStats *)vpx_calloc(MAX_LAG_BUFFERS, sizeof(TplDepStats))); + memset(cpi->tpl_stats_info, 0, MAX_LAG_BUFFERS * sizeof(TplDepStats)); +} + +// Frees memory of the tpl stats information. +// Only called once in dealloc_compressor_data(). 
+static INLINE void free_tpl_stats_info(struct VP9_COMP *cpi) { + vpx_free(cpi->tpl_stats_info); + cpi->tpl_stats_info = NULL; +} + // Allocates memory for the first pass motion vector information. // The unit size is each 16x16 block. // Only called once in vp9_create_compressor(). @@ -1091,6 +1109,7 @@ typedef struct ENCODE_FRAME_RESULT { FRAME_COUNTS frame_counts; const PARTITION_INFO *partition_info; const MOTION_VECTOR_INFO *motion_vector_info; + const TplDepStats *tpl_stats_info; IMAGE_BUFFER coded_frame; RATE_QINDEX_HISTORY rq_history; #endif // CONFIG_RATE_CTRL diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index 568df97aaa..afda6e2035 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -207,6 +207,24 @@ static void update_motion_vector_info( } } +static void update_tpl_stats_info(const TplDepStats *input_tpl_stats_info, + const int show_frame_count, + TplStatsInfo *output_tpl_stats_info) { + int frame_idx; + for (frame_idx = 0; frame_idx < show_frame_count; ++frame_idx) { + output_tpl_stats_info[frame_idx].intra_cost = + input_tpl_stats_info[frame_idx].intra_cost; + output_tpl_stats_info[frame_idx].inter_cost = + input_tpl_stats_info[frame_idx].inter_cost; + output_tpl_stats_info[frame_idx].mc_flow = + input_tpl_stats_info[frame_idx].mc_flow; + output_tpl_stats_info[frame_idx].mc_dep_cost = + input_tpl_stats_info[frame_idx].mc_dep_cost; + output_tpl_stats_info[frame_idx].mc_ref_cost = + input_tpl_stats_info[frame_idx].mc_ref_cost; + } +} + static void update_frame_counts(const FRAME_COUNTS *input_counts, FrameCounts *output_counts) { // Init array sizes. @@ -486,6 +504,7 @@ static bool init_encode_frame_result(EncodeFrameResult *encode_frame_result, encode_frame_result->num_cols_4x4); encode_frame_result->motion_vector_info.resize( encode_frame_result->num_rows_4x4 * encode_frame_result->num_cols_4x4); + encode_frame_result->tpl_stats_info.resize(MAX_LAG_BUFFERS); if (encode_frame_result->coding_data.get() == nullptr) { return false; @@ -507,7 +526,7 @@ static void encode_frame_result_update_rq_history( } static void update_encode_frame_result( - EncodeFrameResult *encode_frame_result, + EncodeFrameResult *encode_frame_result, const int show_frame_count, const ENCODE_FRAME_RESULT *encode_frame_info) { encode_frame_result->coding_data_bit_size = encode_frame_result->coding_data_byte_size * 8; @@ -536,6 +555,10 @@ static void update_encode_frame_result( kMotionVectorSubPixelPrecision); update_frame_counts(&encode_frame_info->frame_counts, &encode_frame_result->frame_counts); + if (encode_frame_result->frame_type == kFrameTypeAltRef) { + update_tpl_stats_info(encode_frame_info->tpl_stats_info, show_frame_count, + &encode_frame_result->tpl_stats_info[0]); + } encode_frame_result_update_rq_history(&encode_frame_info->rq_history, encode_frame_result); } @@ -1169,7 +1192,10 @@ void SimpleEncode::EncodeFrame(EncodeFrameResult *encode_frame_result) { abort(); } - update_encode_frame_result(encode_frame_result, &encode_frame_info); + const GroupOfPicture group_of_picture = this->ObserveGroupOfPicture(); + const int show_frame_count = group_of_picture.show_frame_count; + update_encode_frame_result(encode_frame_result, show_frame_count, + &encode_frame_info); PostUpdateState(*encode_frame_result); } else { // TODO(angiebird): Clean up encode_frame_result. 
diff --git a/vp9/simple_encode.h b/vp9/simple_encode.h index ce370a795e..380e8118fc 100644 --- a/vp9/simple_encode.h +++ b/vp9/simple_encode.h @@ -87,6 +87,24 @@ struct MotionVectorInfo { double mv_column[2]; }; +// Accumulated tpl stats of all blocks in one frame. +// For each frame, the tpl stats are computed per 32x32 block. +struct TplStatsInfo { + // Intra complexity: the sum of absolute transform difference (SATD) of + // intra predicted residuals. + int64_t intra_cost; + // Inter complexity: the SATD of inter predicted residuals. + int64_t inter_cost; + // Motion compensated information flow. It measures how much information + // is propagated from the current frame to other frames. + int64_t mc_flow; + // Motion compensated dependency cost. It equals to its own intra_cost + // plus the mc_flow. + int64_t mc_dep_cost; + // Motion compensated reference cost. + int64_t mc_ref_cost; +}; + struct RefFrameInfo { int coding_indexes[kRefFrameTypeMax]; @@ -261,6 +279,18 @@ struct EncodeFrameResult { // Similar to partition info, all 4x4 blocks inside the same partition block // share the same motion vector information. std::vector<MotionVectorInfo> motion_vector_info; + // A vector of the tpl stats information. + // The tpl stats measure the complexity of a frame, as well as the + // informatioin propagated along the motion trajactory between frames, in + // the reference frame structure. + // The tpl stats could be used as a more accurate spatial and temporal + // complexity measure in addition to the first pass stats. + // The vector contains tpl stats for all show frames in a GOP. + // The tpl stats stored in the vector is according to the encoding order. + // For example, suppose there are N show frames for the current GOP. + // Then tpl_stats_info[0] stores the information of the first frame to be + // encoded for this GOP, i.e, the AltRef frame. + std::vector<TplStatsInfo> tpl_stats_info; ImageBuffer coded_frame; // recode_count, q_index_history and rate_history are only available when From 7beafefd16b9d41eaf0bfc09e6bbb843ada9e952 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Wed, 11 Nov 2020 23:11:16 -0800 Subject: [PATCH 008/926] vp9: Allow for disabling loopfilter per spatial layer For SVC: add a parameter to the control SET_SVC_PARAMS to allow for disabling the loopfilter per spatial layer. Note this svc setting will override the setting via VP9E_SET_DISABLE_LOOPFILTER (which should only be used for non-SVC). Add a unit test to handle both the SVC (spatial or temporal layers) and non-SVC (single layer) cases. Change-Id: I4092f01668bae42aac724a6df5b6f6a604337448 --- test/svc_end_to_end_test.cc | 85 +++++++++++++++++++++++++++++- test/svc_test.cc | 13 ++--- vp9/encoder/vp9_svc_layercontext.c | 2 + vp9/encoder/vp9_svc_layercontext.h | 1 + vp9/vp9_cx_iface.c | 1 + vpx/vpx_encoder.h | 1 + 6 files changed, 96 insertions(+), 7 deletions(-) diff --git a/test/svc_end_to_end_test.cc b/test/svc_end_to_end_test.cc index b1ab0d7d95..518824d03f 100644 --- a/test/svc_end_to_end_test.cc +++ b/test/svc_end_to_end_test.cc @@ -494,7 +494,31 @@ class LoopfilterOnePassCbrSvc : public OnePassCbrSvc, virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { PreEncodeFrameHookSetup(video, encoder); - encoder->Control(VP9E_SET_DISABLE_LOOPFILTER, loopfilter_off_); + if (number_temporal_layers_ > 1 || number_spatial_layers_ > 1) { + // Consider 3 cases: + if (loopfilter_off_ == 0) { + // loopfilter is on for all spatial layers on every superframe.
+ for (int i = 0; i < VPX_SS_MAX_LAYERS; ++i) { + svc_params_.loopfilter_ctrl[i] = 0; + } + } else if (loopfilter_off_ == 1) { + // loopfilter is off for non-reference frames for all spatial layers. + for (int i = 0; i < VPX_SS_MAX_LAYERS; ++i) { + svc_params_.loopfilter_ctrl[i] = 1; + } + } else { + // loopfilter is off for all SL0 frames, and off only for non-reference + // frames for SL > 0. + svc_params_.loopfilter_ctrl[0] = 2; + for (int i = 1; i < VPX_SS_MAX_LAYERS; ++i) { + svc_params_.loopfilter_ctrl[i] = 1; + } + } + encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); + } else if (number_temporal_layers_ == 1 && number_spatial_layers_ == 1) { + // For non-SVC mode use the single layer control. + encoder->Control(VP9E_SET_DISABLE_LOOPFILTER, loopfilter_off_); + } } virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { @@ -524,6 +548,35 @@ class LoopfilterOnePassCbrSvc : public OnePassCbrSvc, int num_nonref_frames_; }; +TEST_P(LoopfilterOnePassCbrSvc, OnePassCbrSvc1SL1TLLoopfilterOff) { + SetSvcConfig(1, 1); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_target_bitrate = 800; + cfg_.kf_max_dist = 9999; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + cfg_.ts_rate_decimator[0] = 1; + cfg_.temporal_layering_mode = 0; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + cfg_.rc_target_bitrate = 600; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + if (loopfilter_off_ == 0) + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); + else + EXPECT_EQ(GetMismatchFrames(), 0); +#endif +} + TEST_P(LoopfilterOnePassCbrSvc, OnePassCbrSvc1SL3TLLoopfilterOff) { SetSvcConfig(1, 3); cfg_.rc_buf_initial_sz = 500; @@ -542,7 +595,37 @@ TEST_P(LoopfilterOnePassCbrSvc, OnePassCbrSvc1SL3TLLoopfilterOff) { cfg_.ts_rate_decimator[1] = 2; cfg_.ts_rate_decimator[2] = 1; cfg_.temporal_layering_mode = 3; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + cfg_.rc_target_bitrate = 600; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + if (loopfilter_off_ == 0) + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); + else + EXPECT_EQ(GetMismatchFrames(), 0); +#endif +} +TEST_P(LoopfilterOnePassCbrSvc, OnePassCbrSvc3SL3TLLoopfilterOff) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_target_bitrate = 800; + cfg_.kf_max_dist = 9999; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.temporal_layering_mode = 3; ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, 0, 400); cfg_.rc_target_bitrate = 600; diff --git a/test/svc_test.cc b/test/svc_test.cc index 4798c77183..cbc0abe032 100644 --- a/test/svc_test.cc +++ b/test/svc_test.cc @@ -43,13 +43,14 @@ void OnePassCbrSvc::PreEncodeFrameHookSetup(::libvpx_test::VideoSource *video, svc_params_.max_quantizers[i] = 63; svc_params_.min_quantizers[i] = 0; } - svc_params_.speed_per_layer[0] = base_speed_setting_; - for (int i = 1; i < 
VPX_SS_MAX_LAYERS; ++i) { - svc_params_.speed_per_layer[i] = speed_setting_; + if (number_temporal_layers_ > 1 || number_spatial_layers_ > 1) { + svc_params_.speed_per_layer[0] = base_speed_setting_; + for (int i = 1; i < VPX_SS_MAX_LAYERS; ++i) { + svc_params_.speed_per_layer[i] = speed_setting_; + } + encoder->Control(VP9E_SET_SVC, 1); + encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); } - - encoder->Control(VP9E_SET_SVC, 1); - encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); encoder->Control(VP8E_SET_CPUUSED, speed_setting_); encoder->Control(VP9E_SET_AQ_MODE, 3); encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 300); diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index d85b3632cd..9c75d77263 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -357,6 +357,8 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) { if (is_one_pass_cbr_svc(cpi) && lc->speed > 0) { cpi->oxcf.speed = lc->speed; } + if (lc->loopfilter_ctrl >= 0 || lc->loopfilter_ctrl < 3) + cpi->loopfilter_ctrl = lc->loopfilter_ctrl; // Reset the frames_since_key and frames_to_key counters to their values // before the layer restore. Keep these defined for the stream (not layer). if (cpi->svc.number_temporal_layers > 1 || diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h index e7d9712aae..b12e7e01a7 100644 --- a/vp9/encoder/vp9_svc_layercontext.h +++ b/vp9/encoder/vp9_svc_layercontext.h @@ -71,6 +71,7 @@ typedef struct { int actual_num_seg2_blocks; int counter_encode_maxq_scene_change; uint8_t speed; + int loopfilter_ctrl; } LAYER_CONTEXT; typedef struct SVC { diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index aa13fc9cf1..0ccb6750b6 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1573,6 +1573,7 @@ static vpx_codec_err_t ctrl_set_svc_parameters(vpx_codec_alg_priv_t *ctx, lc->scaling_factor_num = params->scaling_factor_num[sl]; lc->scaling_factor_den = params->scaling_factor_den[sl]; lc->speed = params->speed_per_layer[sl]; + lc->loopfilter_ctrl = params->loopfilter_ctrl[sl]; } } diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index c84d40f7f7..39b2aef625 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -705,6 +705,7 @@ typedef struct vpx_svc_parameters { int scaling_factor_den[VPX_MAX_LAYERS]; /**< Scaling factor-denominator */ int speed_per_layer[VPX_MAX_LAYERS]; /**< Speed setting for each sl */ int temporal_layering_mode; /**< Temporal layering mode */ + int loopfilter_ctrl[VPX_MAX_LAYERS]; /**< Loopfilter ctrl for each sl */ } vpx_svc_extra_cfg_t; /*!\brief Initialize an encoder instance From d4453c73ff9467d5c7cd4ce8b6070dbfa24eff37 Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Fri, 13 Nov 2020 18:13:14 -0800 Subject: [PATCH 009/926] Fix the warning of C90 mixed declarations and code Change-Id: I1a6c57525bbe8bf1a97057ecd64985bc23d1df2e --- vp9/encoder/vp9_encoder.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index f4587d42d9..8d60a0c001 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -7383,8 +7383,6 @@ static void accumulate_frame_tpl_stats(VP9_COMP *cpi) { // Accumulate tpl stats for each frame in the current group of picture. 
for (frame_idx = 1; frame_idx < gf_group->gf_group_size; ++frame_idx) { TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - if (!tpl_frame->is_valid) continue; - TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; const int tpl_stride = tpl_frame->stride; int64_t intra_cost_base = 0; @@ -7394,6 +7392,8 @@ static void accumulate_frame_tpl_stats(VP9_COMP *cpi) { int64_t mc_flow_base = 0; int row, col; + if (!tpl_frame->is_valid) continue; + for (row = 0; row < cm->mi_rows && tpl_frame->is_valid; ++row) { for (col = 0; col < cm->mi_cols; ++col) { TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col]; From 4e7fd0273ade37a1f128ad131ed1d6a263f13d22 Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Fri, 13 Nov 2020 18:17:48 -0800 Subject: [PATCH 010/926] Fix uninitialized warning in resize_test.cc Change-Id: I12a72d3aa57b13dbcbeb037e1deea41529ea4194 --- test/resize_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/resize_test.cc b/test/resize_test.cc index 65b94fa4f6..c57170ff9b 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -271,8 +271,8 @@ class ResizingVideoSource : public ::libvpx_test::DummyVideoSource { protected: virtual void Next() { ++frame_; - unsigned int width; - unsigned int height; + unsigned int width = 0; + unsigned int height = 0; ScaleForFrameNumber(frame_, kInitialWidth, kInitialHeight, &width, &height, flag_codec_, smaller_width_larger_size_); SetSize(width, height); From b5d77a48d740e211a130c8e45d9353ef8c154a47 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 16 Nov 2020 14:12:50 -0800 Subject: [PATCH 011/926] Remove condition on copying svc loopfilter flag Change-Id: Ib37ef0aa3dc0ec73b25332be6d89969093bd7aeb --- vp9/encoder/vp9_svc_layercontext.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 9c75d77263..b6c7c74e17 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -357,8 +357,7 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) { if (is_one_pass_cbr_svc(cpi) && lc->speed > 0) { cpi->oxcf.speed = lc->speed; } - if (lc->loopfilter_ctrl >= 0 || lc->loopfilter_ctrl < 3) - cpi->loopfilter_ctrl = lc->loopfilter_ctrl; + cpi->loopfilter_ctrl = lc->loopfilter_ctrl; // Reset the frames_since_key and frames_to_key counters to their values // before the layer restore. Keep these defined for the stream (not layer). if (cpi->svc.number_temporal_layers > 1 || From ca7a16babc8bed02f060dd98f7297db7f3c90443 Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Fri, 13 Nov 2020 18:32:34 -0800 Subject: [PATCH 012/926] Add doxygen to structs in vpx_ext_ratectrl.h Bug: webm:1707 Change-Id: Ib5f6b6f143f55e5279e39eb386fcd3340211de59 --- vpx/vpx_ext_ratectrl.h | 57 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 11 deletions(-) diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index 5d5a7c92d5..6919f2ac6f 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -17,35 +17,70 @@ extern "C" { #include "./vpx_integer.h" -/*!\cond - TODO(angiebird): document these structures and fields to clear doxygen - warnings.*/ - +/*!\brief Abstract rate control model handler + * + * The encoder will receive the model handler from create_model() defined in + * vpx_rc_funcs_t. 
+ */ typedef void *vpx_rc_model_t; +/*!\brief Encode frame decision made by the external rate control model + * + * The encoder will receive the decision from the external rate control model + * through get_encodeframe_decision() defined in vpx_rc_funcs_t. + */ typedef struct vpx_rc_encodeframe_decision { - int q_index; + int q_index; /**< Quantizer step index [0..255]*/ } vpx_rc_encodeframe_decision_t; +/*!\brief Information for the frame to be encoded. + * + * The encoder will send the information to external rate control model through + * get_encodeframe_decision() defined in vpx_rc_funcs_t. + * + */ typedef struct vpx_rc_encodeframe_info { + /*! + * 0: Key frame + * 1: Inter frame + * 2: Alternate reference frame + * 3: Overlay frame + * 4: Golden frame + */ int frame_type; - int show_index; - int coding_index; - int ref_frame_coding_indexes[3]; + int show_index; /**< display index, starts from zero*/ + int coding_index; /**< coding index, starts from zero*/ + int ref_frame_coding_indexes[3]; /**< three reference frames' coding indices*/ + /*! + * The validity of the three reference frames. + * 0: Invalid + * 1: Valid + */ int ref_frame_valid_list[3]; } vpx_rc_encodeframe_info_t; +/*!\brief Frame coding result + * + * The encoder will send the result to the external rate control model through + * update_encodeframe_result() defined in vpx_rc_funcs_t. + */ typedef struct vpx_rc_encodeframe_result { - int64_t sse; - int64_t bit_count; - int64_t pixel_count; + int64_t sse; /**< sum of squared error of the reconstructed frame */ + int64_t bit_count; /**< number of bits spent on coding the frame*/ + int64_t pixel_count; /**< number of pixels in YUV planes of the frame*/ } vpx_rc_encodeframe_result_t; +/*!\brief Status returned by rate control callback functions. + */ typedef enum vpx_rc_status { vpx_rc_ok = 0, vpx_rc_error = 1, } vpx_rc_status_t; +/*!\cond + TODO(angiebird): document these structures and fields to clear doxygen + warnings.*/ + // This is a mirror of vp9's FIRSTPASS_STATS // Only spatial_layer_id is omitted typedef struct vpx_rc_frame_stats { From c22a783bea8512a3413d9dd4abf82622cd89adcd Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Fri, 13 Nov 2020 19:26:14 -0800 Subject: [PATCH 013/926] Copy first pass stats documentation from AV1 to VP9 Bug: webm:1707 Change-Id: Iae7eaa9ba681272b70b6dad17cd2247edab6ef79 --- vpx/vpx_ext_ratectrl.h | 120 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 114 insertions(+), 6 deletions(-) diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index 6919f2ac6f..494b149554 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -77,45 +77,153 @@ typedef enum vpx_rc_status { vpx_rc_error = 1, } vpx_rc_status_t; -/*!\cond - TODO(angiebird): document these structures and fields to clear doxygen - warnings.*/ - -// This is a mirror of vp9's FIRSTPASS_STATS -// Only spatial_layer_id is omitted +/*!\brief First pass frame stats + * This is a mirror of vp9's FIRSTPASS_STATS except that spatial_layer_id is + * omitted + */ typedef struct vpx_rc_frame_stats { + /*! + * Frame number in display order, if stats are for a single frame. + * No real meaning for a collection of frames. + */ double frame; + /*! + * Weight assigned to this frame (or total weight for the collection of + * frames) currently based on intra factor and brightness factor. This is used + * to distribute bits between easier and harder frames. + */ double weight; + /*! + * Intra prediction error. + */ double intra_error; + /*! 
+ * Best of intra pred error and inter pred error using last frame as ref. + */ double coded_error; + /*! + * Best of intra pred error and inter pred error using golden frame as ref. + */ double sr_coded_error; + /*! + * Estimate the noise energy of the current frame. + */ double frame_noise_energy; + /*! + * Percentage of blocks with inter pred error < intra pred error. + */ double pcnt_inter; + /*! + * Percentage of blocks using (inter prediction and) non-zero motion vectors. + */ double pcnt_motion; + /*! + * Percentage of blocks where golden frame was better than last or intra: + * inter pred error using golden frame < inter pred error using last frame and + * inter pred error using golden frame < intra pred error + */ double pcnt_second_ref; + /*! + * Percentage of blocks where intra and inter prediction errors were very + * close. Note that this is a 'weighted count', that is, the so blocks may be + * weighted by how close the two errors were. + */ double pcnt_neutral; + /*! + * Percentage of blocks that have intra error < inter error and inter error < + * LOW_I_THRESH LOW_I_THRESH = 24000 using bit_depth 8 LOW_I_THRESH = 24000 << + * 4 using bit_depth 10 LOW_I_THRESH = 24000 << 8 using bit_depth 12 + */ double pcnt_intra_low; + /*! + * Percentage of blocks that have intra error < inter error and intra error < + * LOW_I_THRESH but inter error >= LOW_I_THRESH LOW_I_THRESH = 24000 using + * bit_depth 8 LOW_I_THRESH = 24000 << 4 using bit_depth 10 LOW_I_THRESH = + * 24000 << 8 using bit_depth 12 + */ double pcnt_intra_high; + /*! + * Percentage of blocks that have almost no intra error residual + * (i.e. are in effect completely flat and untextured in the intra + * domain). In natural videos this is uncommon, but it is much more + * common in animations, graphics and screen content, so may be used + * as a signal to detect these types of content. + */ double intra_skip_pct; + /*! + * Percentage of blocks that have intra error < SMOOTH_INTRA_THRESH + * SMOOTH_INTRA_THRESH = 4000 using bit_depth 8 + * SMOOTH_INTRA_THRESH = 4000 << 4 using bit_depth 10 + * SMOOTH_INTRA_THRESH = 4000 << 8 using bit_depth 12 + */ double intra_smooth_pct; + /*! + * Image mask rows top and bottom. + */ double inactive_zone_rows; + /*! + * Image mask columns at left and right edges. + */ double inactive_zone_cols; + /*! + * Average of row motion vectors. + */ double MVr; + /*! + * Mean of absolute value of row motion vectors. + */ double mvr_abs; + /*! + * Mean of column motion vectors. + */ double MVc; + /*! + * Mean of absolute value of column motion vectors. + */ double mvc_abs; + /*! + * Variance of row motion vectors. + */ double MVrv; + /*! + * Variance of column motion vectors. + */ double MVcv; + /*! + * Value in range [-1,1] indicating fraction of row and column motion vectors + * that point inwards (negative MV value) or outwards (positive MV value). + * For example, value of 1 indicates, all row/column MVs are inwards. + */ double mv_in_out_count; + /*! + * Duration of the frame / collection of frames. + */ double duration; + /*! + * 1.0 if stats are for a single frame, OR + * Number of frames in this collection for which the stats are accumulated. + */ double count; } vpx_rc_frame_stats_t; +/*!\brief Collection of first pass frame stats + */ typedef struct vpx_rc_firstpass_stats { + /*! + * Pointer to first pass frame stats. + * The pointed array of vpx_rc_frame_stats_t should have length equal to + * number of show frames in the video. + */ vpx_rc_frame_stats_t *frame_stats; + /*! 
+ * Number of show frames in the video. + */ int num_frames; } vpx_rc_firstpass_stats_t; +/*!\cond + TODO(angiebird): document these structures and fields to clear doxygen + warnings.*/ typedef struct vpx_rc_config { int frame_width; int frame_height; From a7731ba488202ea62adfedf3fb49477cafe80b88 Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Fri, 13 Nov 2020 19:40:13 -0800 Subject: [PATCH 014/926] Add doxygen for vpx_rc_config Bug: webm:1707 Change-Id: I65bab6b2b792653e70cb136a5f9a21796e34b829 --- vpx/vpx_ext_ratectrl.h | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index 494b149554..bc0ed98191 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -221,16 +221,18 @@ typedef struct vpx_rc_firstpass_stats { int num_frames; } vpx_rc_firstpass_stats_t; -/*!\cond - TODO(angiebird): document these structures and fields to clear doxygen - warnings.*/ +/*!\brief Encode config sent to external rate control model + */ typedef struct vpx_rc_config { - int frame_width; - int frame_height; - int show_frame_count; + int frame_width; /**< frame width */ + int frame_height; /**< frame height */ + int show_frame_count; /**< number of visible frames in the video */ + /*! + * Target bitrate in kilobytes per second + */ int target_bitrate_kbps; - int frame_rate_num; - int frame_rate_den; + int frame_rate_num; /**< numerator of frame rate */ + int frame_rate_den; /**< denominator of frame rate */ } vpx_rc_config_t; /*!\brief Create an external rate control model callback prototype @@ -295,6 +297,10 @@ typedef vpx_rc_status_t (*vpx_rc_update_encodeframe_result_cb_fn_t)( typedef vpx_rc_status_t (*vpx_rc_delete_model_cb_fn_t)( vpx_rc_model_t rate_ctrl_model); +/*!\cond + TODO(angiebird): document these structures and fields to clear doxygen + warnings.*/ + typedef struct vpx_rc_funcs { vpx_rc_create_model_cb_fn_t create_model; vpx_rc_send_firstpass_stats_cb_fn_t send_firstpass_stats; From 275c2769933d599ae74002610563fe11321668bc Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Tue, 17 Nov 2020 14:37:20 -0800 Subject: [PATCH 015/926] Add doxygen for vpx_rc_funcs_t Change-Id: If75215d574fe0b075add50154a9eece5d387741a --- vpx/vpx_ext_ratectrl.h | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index bc0ed98191..8aee4f4d82 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -297,23 +297,39 @@ typedef vpx_rc_status_t (*vpx_rc_update_encodeframe_result_cb_fn_t)( typedef vpx_rc_status_t (*vpx_rc_delete_model_cb_fn_t)( vpx_rc_model_t rate_ctrl_model); -/*!\cond - TODO(angiebird): document these structures and fields to clear doxygen - warnings.*/ - +/*!\brief Callback function set for external rate control. + * + * The user can enable external rate control by registering + * a set of callback functions with the codec control flag + * VP9E_SET_EXTERNAL_RATE_CONTROL. + */ typedef struct vpx_rc_funcs { + /*! + * Create an external rate control model. + */ vpx_rc_create_model_cb_fn_t create_model; + /*! + * Send first pass stats to the external rate control model. + */ vpx_rc_send_firstpass_stats_cb_fn_t send_firstpass_stats; + /*! + * Get encodeframe decision from the external rate control model. + */ vpx_rc_get_encodeframe_decision_cb_fn_t get_encodeframe_decision; + /*! + * Update encodeframe result to the external rate control model. 
+ */ vpx_rc_update_encodeframe_result_cb_fn_t update_encodeframe_result; + /*! + * Delete the external rate control model. + */ vpx_rc_delete_model_cb_fn_t delete_model; + /*! + * Private data for the external rate control model. + */ void *priv; } vpx_rc_funcs_t; -/*!\endcond - TODO(angiebird): document these structures and fields to clear doxygen - warnings.*/ - #ifdef __cplusplus } // extern "C" #endif From 5b63f0f821e94f8072eb483014cfc33b05978bb9 Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Tue, 17 Nov 2020 15:30:55 -0800 Subject: [PATCH 016/926] Capitalize VPX_RC_OK / VPX_RC_ERROR Change-Id: I526bd6a6c2d2095db564f96d63c7ab7ee4dd90ad --- test/vp9_ext_ratectrl_test.cc | 10 +++++----- vpx/vpx_ext_ratectrl.h | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index 8db0a358d0..01d8019969 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -44,7 +44,7 @@ vpx_rc_status_t rc_create_model(void *priv, EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 24000); EXPECT_EQ(ratectrl_config->frame_rate_num, 30); EXPECT_EQ(ratectrl_config->frame_rate_den, 1); - return vpx_rc_ok; + return VPX_RC_OK; } vpx_rc_status_t rc_send_firstpass_stats( @@ -57,7 +57,7 @@ vpx_rc_status_t rc_send_firstpass_stats( for (int i = 0; i < first_pass_stats->num_frames; ++i) { EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i); } - return vpx_rc_ok; + return VPX_RC_OK; } vpx_rc_status_t rc_get_encodeframe_decision( @@ -120,7 +120,7 @@ vpx_rc_status_t rc_get_encodeframe_decision( } else { frame_decision->q_index = 100; } - return vpx_rc_ok; + return VPX_RC_OK; } vpx_rc_status_t rc_update_encodeframe_result( @@ -135,14 +135,14 @@ vpx_rc_status_t rc_update_encodeframe_result( if (toy_rate_ctrl->coding_index == kLosslessCodingIndex) { EXPECT_EQ(encode_frame_result->sse, 0); } - return vpx_rc_ok; + return VPX_RC_OK; } vpx_rc_status_t rc_delete_model(vpx_rc_model_t rate_ctrl_model) { ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); delete toy_rate_ctrl; - return vpx_rc_ok; + return VPX_RC_OK; } class ExtRateCtrlTest : public ::libvpx_test::EncoderTest, diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index 8aee4f4d82..bb3caa6148 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -73,8 +73,8 @@ typedef struct vpx_rc_encodeframe_result { /*!\brief Status returned by rate control callback functions.
*/ typedef enum vpx_rc_status { - vpx_rc_ok = 0, - vpx_rc_error = 1, + VPX_RC_OK = 0, + VPX_RC_ERROR = 1, } vpx_rc_status_t; /*!\brief First pass frame stats From e56e8dcd6fc9e2b04316be5144c18ca6772f6263 Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Tue, 17 Nov 2020 17:25:31 -0800 Subject: [PATCH 017/926] Add gop_index to vpx_ext_ratectrl.h Bug: webm:1707 Change-Id: I48826d5f3a7cc292825a7f1e30ac6d0f57adc569 --- test/vp9_ext_ratectrl_test.cc | 5 +++++ vp9/encoder/vp9_encoder.c | 4 ++-- vp9/encoder/vp9_ext_ratectrl.c | 3 ++- vp9/encoder/vp9_ext_ratectrl.h | 2 +- vpx/vpx_encoder.h | 4 +++- vpx/vpx_ext_ratectrl.h | 18 ++++++++++++++++-- 6 files changed, 29 insertions(+), 7 deletions(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index 01d8019969..812a18ed22 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -73,6 +73,7 @@ vpx_rc_status_t rc_get_encodeframe_decision( EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); if (encode_frame_info->coding_index == 0) { + EXPECT_EQ(encode_frame_info->gop_index, 0); EXPECT_EQ(encode_frame_info->frame_type, 0 /*kFrameTypeKey*/); EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], 0); // kRefFrameTypeLast @@ -83,6 +84,7 @@ vpx_rc_status_t rc_get_encodeframe_decision( } if (encode_frame_info->coding_index == 1) { + EXPECT_EQ(encode_frame_info->gop_index, 1); EXPECT_EQ(encode_frame_info->frame_type, 2 /*kFrameTypeAltRef*/); EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], 1); // kRefFrameTypeLast @@ -96,10 +98,13 @@ vpx_rc_status_t rc_get_encodeframe_decision( if (encode_frame_info->coding_index >= 2 && encode_frame_info->coding_index < 5) { + // In the first group of pictures, coding_index and gop_index are equal. + EXPECT_EQ(encode_frame_info->gop_index, encode_frame_info->coding_index); EXPECT_EQ(encode_frame_info->frame_type, 1 /*kFrameTypeInter*/); } if (encode_frame_info->coding_index == 5) { + EXPECT_EQ(encode_frame_info->gop_index, 0); EXPECT_EQ(encode_frame_info->frame_type, 3 /*kFrameTypeOverlay*/); EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], 1); // kRefFrameTypeLast diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 8d60a0c001..37f644501d 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4511,8 +4511,8 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest get_ref_frame_bufs(cpi, ref_frame_bufs); vp9_extrc_get_encodeframe_decision( &cpi->ext_ratectrl, cm->current_video_frame, - cm->current_frame_coding_index, update_type, ref_frame_bufs, - ref_frame_flags, &encode_frame_decision); + cm->current_frame_coding_index, gf_group->index, update_type, + ref_frame_bufs, ref_frame_flags, &encode_frame_decision); q = encode_frame_decision.q_index; } diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index 94c2addd25..a6a3e21d3b 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -103,7 +103,7 @@ static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) { } void vp9_extrc_get_encodeframe_decision( - EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, + EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, FRAME_UPDATE_TYPE update_type, RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, vpx_rc_encodeframe_decision_t *encode_frame_decision) { @@ -111,6 +111,7 @@ void vp9_extrc_get_encodeframe_decision( vpx_rc_encodeframe_info_t encode_frame_info; 
encode_frame_info.show_index = show_index; encode_frame_info.coding_index = coding_index; + encode_frame_info.gop_index = gop_index; encode_frame_info.frame_type = extrc_get_frame_type(update_type); vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs, diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index fb6cfe1ac8..6a86218dac 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -33,7 +33,7 @@ void vp9_extrc_send_firstpass_stats(EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info); void vp9_extrc_get_encodeframe_decision( - EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, + EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, FRAME_UPDATE_TYPE update_type, RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, vpx_rc_encodeframe_decision_t *encode_frame_decision); diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index 39b2aef625..da36095775 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -30,6 +30,7 @@ extern "C" { #endif #include "./vpx_codec.h" +#include "./vpx_ext_ratectrl.h" /*! Temporal Scalability: Maximum length of the sequence defining frame * layer membership @@ -57,7 +58,8 @@ extern "C" { * fields to structures */ #define VPX_ENCODER_ABI_VERSION \ - (14 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ + (14 + VPX_CODEC_ABI_VERSION + \ + VPX_EXT_RATECTRL_ABI_VERSION) /**<\hideinitializer*/ /*! \brief Encoder capabilities bitfield * diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index bb3caa6148..5c57cf3319 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -17,6 +17,16 @@ extern "C" { #include "./vpx_integer.h" +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures. + */ +#define VPX_EXT_RATECTRL_ABI_VERSION (1) + /*!\brief Abstract rate control model handler * * The encoder will receive the model handler from create_model() defined in @@ -48,8 +58,12 @@ typedef struct vpx_rc_encodeframe_info { * 4: Golden frame */ int frame_type; - int show_index; /**< display index, starts from zero*/ - int coding_index; /**< coding index, starts from zero*/ + int show_index; /**< display index, starts from zero*/ + int coding_index; /**< coding index, starts from zero*/ + /*! + * index in group of picture, starts from zero. + */ + int gop_index; int ref_frame_coding_indexes[3]; /**< three reference frames' coding indices*/ /*! * The validity of the three reference frames. 
From 2ccee3928d9bb2995dbf634ac6f9f172d4d86f3f Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Thu, 19 Nov 2020 19:55:33 -0800 Subject: [PATCH 018/926] Allow user to set rc_mode and cq_level in SimpleEncode Change-Id: If3f56837e2c78a8b0fe7e0040f297c3f3ddb9c8b --- vp9/simple_encode.cc | 10 ++++++++++ vp9/simple_encode.h | 41 ++++++++++++++++++++++++++--------------- 2 files changed, 36 insertions(+), 15 deletions(-) diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index afda6e2035..d4eb0c669d 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -765,6 +765,16 @@ static void UpdateEncodeConfig(const EncodeConfig &config, SET_STRUCT_VALUE(config, oxcf, ret, encode_breakout); SET_STRUCT_VALUE(config, oxcf, ret, enable_tpl_model); SET_STRUCT_VALUE(config, oxcf, ret, enable_auto_arf); + if (strcmp(config.name, "rc_mode") == 0) { + int rc_mode = atoi(config.value); + if (rc_mode >= VPX_VBR && rc_mode <= VPX_Q) { + oxcf->rc_mode = (enum vpx_rc_mode)rc_mode; + ret = 1; + } else { + fprintf(stderr, "Invalid rc_mode value: %d\n", rc_mode); + } + } + SET_STRUCT_VALUE(config, oxcf, ret, cq_level); if (ret == 0) { fprintf(stderr, "Ignored unsupported encode_config %s\n", config.name); } diff --git a/vp9/simple_encode.h b/vp9/simple_encode.h index 380e8118fc..e3ef3cea95 100644 --- a/vp9/simple_encode.h +++ b/vp9/simple_encode.h @@ -361,21 +361,32 @@ class SimpleEncode { // The following configs in VP9EncoderConfig are allowed to change in this // function. See https://ffmpeg.org/ffmpeg-codecs.html#libvpx for each // config's meaning. - // Configs in VP9EncoderConfig: Equivalent configs in ffmpeg: - // 1 key_freq -g - // 2 two_pass_vbrmin_section -minrate * 100LL / bit_rate - // 3 two_pass_vbrmax_section -maxrate * 100LL / bit_rate - // 4 under_shoot_pct -undershoot-pct - // 5 over_shoot_pct -overshoot-pct - // 6 max_threads -threads - // 7 frame_parallel_decoding_mode -frame-parallel - // 8 tile_column -tile-columns - // 9 arnr_max_frames -arnr-maxframes - // 10 arnr_strength -arnr-strength - // 11 lag_in_frames -rc_lookahead - // 12 encode_breakout -static-thresh - // 13 enable_tpl_model -enable-tpl - // 14 enable_auto_arf -auto-alt-ref + // Configs in VP9EncoderConfig: Equivalent configs in ffmpeg: + // 1 key_freq -g + // 2 two_pass_vbrmin_section -minrate * 100LL / bit_rate + // 3 two_pass_vbrmax_section -maxrate * 100LL / bit_rate + // 4 under_shoot_pct -undershoot-pct + // 5 over_shoot_pct -overshoot-pct + // 6 max_threads -threads + // 7 frame_parallel_decoding_mode -frame-parallel + // 8 tile_column -tile-columns + // 9 arnr_max_frames -arnr-maxframes + // 10 arnr_strength -arnr-strength + // 11 lag_in_frames -rc_lookahead + // 12 encode_breakout -static-thresh + // 13 enable_tpl_model -enable-tpl + // 14 enable_auto_arf -auto-alt-ref + // 15 rc_mode + // Possible Settings: + // 0 - Variable Bit Rate (VPX_VBR) -b:v + // 1 - Constant Bit Rate (VPX_CBR) -b:v -minrate + // -maxrate + // two_pass_vbrmin_section == 100 i.e. bit_rate == minrate == maxrate + // two_pass_vbrmax_section == 100 + // 2 - Constrained Quality (VPX_CQ) -crf -b:v bit_rate + // 3 - Constant Quality (VPX_Q) -crf -b:v 0 + // See https://trac.ffmpeg.org/wiki/Encode/VP9 for more details. + // 16 cq_level see rc_mode for details. 
StatusCode SetEncodeConfig(const char *name, const char *value); // A debug function that dumps configs from VP9EncoderConfig From c341440874f9f469e3861d905ea5f2f725b4f16b Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Fri, 20 Nov 2020 17:41:09 -0800 Subject: [PATCH 019/926] Refine documentation of vpx_ext_ratectrl.h Bug: webm:1707 Change-Id: Iba04b5292c157e22dd8618a79e8c977ec9fc2199 --- vpx/vpx_ext_ratectrl.h | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index 5c57cf3319..dc4d856a8b 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -139,21 +139,23 @@ typedef struct vpx_rc_frame_stats { double pcnt_second_ref; /*! * Percentage of blocks where intra and inter prediction errors were very - * close. Note that this is a 'weighted count', that is, the so blocks may be - * weighted by how close the two errors were. + * close. */ double pcnt_neutral; /*! * Percentage of blocks that have intra error < inter error and inter error < - * LOW_I_THRESH LOW_I_THRESH = 24000 using bit_depth 8 LOW_I_THRESH = 24000 << - * 4 using bit_depth 10 LOW_I_THRESH = 24000 << 8 using bit_depth 12 + * LOW_I_THRESH + * - bit_depth 8: LOW_I_THRESH = 24000 + * - bit_depth 10: LOW_I_THRESH = 24000 << 4 + * - bit_depth 12: LOW_I_THRESH = 24000 << 8 */ double pcnt_intra_low; /*! * Percentage of blocks that have intra error < inter error and intra error < - * LOW_I_THRESH but inter error >= LOW_I_THRESH LOW_I_THRESH = 24000 using - * bit_depth 8 LOW_I_THRESH = 24000 << 4 using bit_depth 10 LOW_I_THRESH = - * 24000 << 8 using bit_depth 12 + * LOW_I_THRESH but inter error >= LOW_I_THRESH LOW_I_THRESH + * - bit_depth 8: LOW_I_THRESH = 24000 + * - bit_depth 10: LOW_I_THRESH = 24000 << 4 + * - bit_depth 12: LOW_I_THRESH = 24000 << 8 */ double pcnt_intra_high; /*! @@ -166,9 +168,9 @@ typedef struct vpx_rc_frame_stats { double intra_skip_pct; /*! * Percentage of blocks that have intra error < SMOOTH_INTRA_THRESH - * SMOOTH_INTRA_THRESH = 4000 using bit_depth 8 - * SMOOTH_INTRA_THRESH = 4000 << 4 using bit_depth 10 - * SMOOTH_INTRA_THRESH = 4000 << 8 using bit_depth 12 + * - bit_depth 8: SMOOTH_INTRA_THRESH = 4000 + * - bit_depth 10: SMOOTH_INTRA_THRESH = 4000 << 4 + * - bit_depth 12: SMOOTH_INTRA_THRESH = 4000 << 8 */ double intra_smooth_pct; /*! @@ -180,7 +182,7 @@ typedef struct vpx_rc_frame_stats { */ double inactive_zone_cols; /*! - * Average of row motion vectors. + * Mean of row motion vectors. */ double MVr; /*! @@ -214,8 +216,8 @@ typedef struct vpx_rc_frame_stats { */ double duration; /*! - * 1.0 if stats are for a single frame, OR - * Number of frames in this collection for which the stats are accumulated. + * 1.0 if stats are for a single frame, or + * number of frames whose stats are accumulated. */ double count; } vpx_rc_frame_stats_t; From 5459c4ab98c3d4377ab8f3379984c809ccf6e7eb Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Tue, 24 Nov 2020 02:55:24 +0000 Subject: [PATCH 020/926] Revert "Close out file in EndEncode()" This reverts commit 7370cecd8929141adb8140b924d3dd8ac1887d36. 
Reason for revert: I accidentally checked in this CL Change-Id: I71ff0b98649070df3edd13b98170a7091541057b --- vp9/simple_encode.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index ba076fd586..46b25d1fdf 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -959,10 +959,6 @@ void SimpleEncode::EndEncode() { impl_ptr_->cpi = nullptr; vpx_img_free(&impl_ptr_->tmp_img); rewind(in_file_); - if (out_file_ != nullptr) { - fclose(out_file_); - out_file_ = nullptr; - } } void SimpleEncode::UpdateKeyFrameGroup(int key_frame_show_index) { From ebac57ce9250aac85701f5258ec54f9cf9bf14a8 Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Wed, 25 Nov 2020 12:58:24 -0800 Subject: [PATCH 021/926] Fix typos in simple_encode.h Change-Id: Id83eff6cc12c441ce991fb1a73820d106311cf5e --- vp9/simple_encode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/simple_encode.h b/vp9/simple_encode.h index e3ef3cea95..8ec7069e83 100644 --- a/vp9/simple_encode.h +++ b/vp9/simple_encode.h @@ -281,7 +281,7 @@ struct EncodeFrameResult { std::vector<MotionVectorInfo> motion_vector_info; // A vector of the tpl stats information. // The tpl stats measure the complexity of a frame, as well as the - informatioin propagated along the motion trajactory between frames, in + information propagated along the motion trajectory between frames, in // the reference frame structure. From ffc179d8bfb836d7b39aaf8595a9051b25a7b437 Mon Sep 17 00:00:00 2001 From: Jeremy Leconte Date: Thu, 10 Dec 2020 17:54:54 +0100 Subject: [PATCH 022/926] Fix nullptr with offset. The error occurs with low resolution when LibvpxVp8Encoder::NumberOfThreads returns 1.
Bug: b:175283098 Change-Id: Icc9387c75f4ac6e4f09f102b3143e83c998c5e38 --- vp8/encoder/encodeframe.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index 2b3d9564ce..2f84381d24 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -343,8 +343,11 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row, const int nsync = cpi->mt_sync_range; vpx_atomic_int rightmost_col = VPX_ATOMIC_INIT(cm->mb_cols + nsync); const vpx_atomic_int *last_row_current_mb_col; - vpx_atomic_int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; + vpx_atomic_int *current_mb_col = NULL; + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0) { + current_mb_col = &cpi->mt_current_mb_col[mb_row]; + } if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0 && mb_row != 0) { last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1]; } else { From 723fca7dd122c94e209864316e01535a23dc6d09 Mon Sep 17 00:00:00 2001 From: Gregor Jasny Date: Fri, 11 Dec 2020 08:00:07 +0100 Subject: [PATCH 023/926] configure: add darwin20 cross-compile support Change-Id: I91c0e832a6e76172397e97413329fd43edc81c78 --- build/make/configure.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build/make/configure.sh b/build/make/configure.sh index 91a64b5041..c4e938fc72 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -919,8 +919,8 @@ process_common_toolchain() { add_ldflags "-mmacosx-version-min=10.15" ;; *-darwin20-*) - add_cflags "-mmacosx-version-min=10.16" - add_ldflags "-mmacosx-version-min=10.16" + add_cflags "-mmacosx-version-min=10.16 -arch ${toolchain%%-*}" + add_ldflags "-mmacosx-version-min=10.16 -arch ${toolchain%%-*}" ;; *-iphonesimulator-*) add_cflags "-miphoneos-version-min=${IOS_VERSION_MIN}" From 8ed23d5f7fd1a003ecd1eb543f3a662772306155 Mon Sep 17 00:00:00 2001 From: Hui Su Date: Tue, 15 Dec 2020 22:40:09 -0800 Subject: [PATCH 024/926] First pass: skip motion search for intra-only BUG=webm:1713 Change-Id: Ibad79cf5d12aa913e8c87a31d7d2124c00958691 --- vp9/encoder/vp9_firstpass.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index de954f7575..2a9cf52898 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -1081,8 +1081,8 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, x->mv_limits.col_max = ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16; - // Other than for the first frame do a motion search. - if (cm->current_video_frame > 0) { + // Other than for intra-only frame do a motion search. + if (!frame_is_intra_only(cm)) { int tmp_err, motion_error, this_motion_error, raw_motion_error; // Assume 0,0 motion with no mv overhead. 
MV mv = { 0, 0 }, tmp_mv = { 0, 0 }; From 67b1d7f1740158e795a26a722491424e6257c0dc Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Thu, 17 Dec 2020 17:32:26 -0800 Subject: [PATCH 025/926] Correct pixel_count in encode_frame_result Change-Id: I3270af4f793f8e453e10d1caf8ffa1a8d5d584a7 --- vp9/encoder/vp9_ext_ratectrl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index a6a3e21d3b..7d553a2ecd 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -134,8 +134,8 @@ void vp9_extrc_update_encodeframe_result(EXT_RATECTRL *ext_ratectrl, vpx_rc_encodeframe_result_t encode_frame_result; encode_frame_result.bit_count = bit_count; encode_frame_result.pixel_count = - source_frame->y_width * source_frame->y_height + - 2 * source_frame->uv_width * source_frame->uv_height; + source_frame->y_crop_width * source_frame->y_crop_height + + 2 * source_frame->uv_crop_width * source_frame->uv_crop_height; #if CONFIG_VP9_HIGHBITDEPTH vpx_calc_highbd_psnr(source_frame, coded_frame, &psnr, bit_depth, input_bit_depth); From 3a38edea2cd114d53914cab017cab2e43a600031 Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Thu, 17 Dec 2020 18:09:55 -0800 Subject: [PATCH 026/926] Fix show_index in vp9_extrc_encodeframe_decision() Change-Id: I93bb1fb3c14126d881d3f691d30875a0062e436c --- test/vp9_ext_ratectrl_test.cc | 3 +++ vp9/encoder/vp9_encoder.c | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index 812a18ed22..4b3693a347 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -73,6 +73,7 @@ vpx_rc_status_t rc_get_encodeframe_decision( EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); if (encode_frame_info->coding_index == 0) { + EXPECT_EQ(encode_frame_info->show_index, 0); EXPECT_EQ(encode_frame_info->gop_index, 0); EXPECT_EQ(encode_frame_info->frame_type, 0 /*kFrameTypeKey*/); EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], @@ -84,6 +85,7 @@ vpx_rc_status_t rc_get_encodeframe_decision( } if (encode_frame_info->coding_index == 1) { + EXPECT_EQ(encode_frame_info->show_index, 4); EXPECT_EQ(encode_frame_info->gop_index, 1); EXPECT_EQ(encode_frame_info->frame_type, 2 /*kFrameTypeAltRef*/); EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], @@ -104,6 +106,7 @@ vpx_rc_status_t rc_get_encodeframe_decision( } if (encode_frame_info->coding_index == 5) { + EXPECT_EQ(encode_frame_info->show_index, 4); EXPECT_EQ(encode_frame_info->gop_index, 0); EXPECT_EQ(encode_frame_info->frame_type, 3 /*kFrameTypeOverlay*/); EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 37f644501d..6968e57919 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4508,9 +4508,11 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; const int ref_frame_flags = get_ref_frame_flags(cpi); RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES]; + const RefCntBuffer *curr_frame_buf = + get_ref_cnt_buffer(cm, cm->new_fb_idx); get_ref_frame_bufs(cpi, ref_frame_bufs); vp9_extrc_get_encodeframe_decision( - &cpi->ext_ratectrl, cm->current_video_frame, + &cpi->ext_ratectrl, curr_frame_buf->frame_index, cm->current_frame_coding_index, gf_group->index, update_type, ref_frame_bufs, ref_frame_flags, &encode_frame_decision); q = 
encode_frame_decision.q_index;
From 576e0801f9281fd54e2c69ad5be5fef7af656011 Mon Sep 17 00:00:00 2001 From: Hui Su Date: Wed, 13 Jan 2021 10:51:39 -0800 Subject: [PATCH 027/926] vpxenc: initialize the image object Otherwise it would cause a problem when calling vpx_img_free() at the end if no frame is read. Change-Id: Ide0ed28eeb142d65d04703442cc4f098ac8edb34 --- vpxenc.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/vpxenc.c b/vpxenc.c index 5d7546eb28..5042e688c9 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -1636,6 +1636,7 @@ int main(int argc, const char **argv_) { int res = 0; memset(&input, 0, sizeof(input)); + memset(&raw, 0, sizeof(raw)); exec_name = argv_[0]; /* Setup default input stream settings */ @@ -1781,14 +1782,10 @@ int main(int argc, const char **argv_) { FOREACH_STREAM(show_stream_config(stream, &global, &input)); if (pass == (global.pass ? global.pass - 1 : 0)) { - if (input.file_type == FILE_TYPE_Y4M) - /*The Y4M reader does its own allocation. - Just initialize this here to avoid problems if we never read any - frames.*/ - memset(&raw, 0, sizeof(raw)); - else + // The Y4M reader does its own allocation. + if (input.file_type != FILE_TYPE_Y4M) { vpx_img_alloc(&raw, input.fmt, input.width, input.height, 32); - + } FOREACH_STREAM(stream->rate_hist = init_rate_histogram( &stream->config.cfg, &global.framerate)); }
From ecbb0e0e2a9b0500db432922b436d1f59ae9b011 Mon Sep 17 00:00:00 2001 From: Elliott Karpilovsky Date: Thu, 14 Jan 2021 14:17:08 -0800 Subject: [PATCH 028/926] Relax constraints on Y4M header parsing Some refactoring and cleanup -- do not count the first 9 bytes against the header limit. Add a unit test. BUG=aomedia:2876 Change-Id: Id897d565e2917b48460cc77cd082cec4c98b42cb --- test/y4m_test.cc | 25 +++ y4minput.c | 394 +++++++++++++++++++++++++---------------- y4minput.h | 12 +- 3 files changed, 241 insertions(+), 190 deletions(-) diff --git a/test/y4m_test.cc b/test/y4m_test.cc index 46cb5cff80..5df389f520 100644 --- a/test/y4m_test.cc +++ b/test/y4m_test.cc @@ -188,4 +188,29 @@ TEST_P(Y4mVideoWriteTest, WriteTest) { INSTANTIATE_TEST_SUITE_P(C, Y4mVideoWriteTest, ::testing::ValuesIn(kY4mTestVectors)); + +static const char kY4MRegularHeader[] = + "YUV4MPEG2 W4 H4 F30:1 Ip A0:0 C420jpeg XYSCSS=420JPEG\n" + "FRAME\n" + "012345678912345601230123"; + +TEST(Y4MHeaderTest, RegularHeader) { + libvpx_test::TempOutFile f; + fwrite(kY4MRegularHeader, 1, sizeof(kY4MRegularHeader), f.file()); + fflush(f.file()); + EXPECT_EQ(0, fseek(f.file(), 0, 0)); + + y4m_input y4m; + EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/NULL, + /*num_skip=*/0, /*only_420=*/0), + 0); + EXPECT_EQ(y4m.pic_w, 4); + EXPECT_EQ(y4m.pic_h, 4); + EXPECT_EQ(y4m.fps_n, 30); + EXPECT_EQ(y4m.fps_d, 1); + EXPECT_EQ(y4m.interlace, 'p'); + EXPECT_EQ(strcmp("420jpeg", y4m.chroma_type), 0); + y4m_input_close(&y4m); +} + } // namespace diff --git a/y4minput.c b/y4minput.c index 007bd9971b..68000768c9 100644 --- a/y4minput.c +++ b/y4minput.c @@ -10,6 +10,7 @@ * Based on code from the OggTheora software codec source code, * Copyright (C) 2002-2010 The Xiph.Org Foundation and contributors.
*/ +#include #include #include #include @@ -784,277 +785,294 @@ static void y4m_convert_null(y4m_input *_y4m, unsigned char *_dst, (void)_aux; } -int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, - int only_420) { +static const char TAG[] = "YUV4MPEG2"; + +int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer, + int num_skip, int only_420) { + // File must start with |TAG|. + char tag_buffer[9]; // 9 == strlen(TAG) char buffer[80] = { 0 }; int ret; int i; + // Read as much as possible from |skip_buffer|, which were characters + // that were previously read from the file to do input-type detection. + assert(num_skip >= 0 && num_skip <= 8); + if (num_skip > 0) { + memcpy(tag_buffer, skip_buffer, num_skip); + } + // Start reading from the file now that the |skip_buffer| is depleted. + if (!file_read(tag_buffer + num_skip, 9 - num_skip, file)) { + return -1; + } + if (memcmp(TAG, tag_buffer, 9) != 0) { + fprintf(stderr, "Error parsing header: must start with %s\n", TAG); + return -1; + } + // Next character must be a space. + if (!file_read(buffer, 1, file) || buffer[0] != ' ') { + fprintf(stderr, "Error parsing header: space must follow %s\n", TAG); + return -1; + } /*Read until newline, or 80 cols, whichever happens first.*/ for (i = 0; i < 79; i++) { - if (_nskip > 0) { - buffer[i] = *_skip++; - _nskip--; - } else { - if (!file_read(buffer + i, 1, _fin)) return -1; - } + if (!file_read(buffer + i, 1, file)) return -1; if (buffer[i] == '\n') break; } /*We skipped too much header data.*/ - if (_nskip > 0) return -1; if (i == 79) { - fprintf(stderr, "Error parsing header; not a YUV2MPEG2 file?\n"); + fprintf(stderr, "Error parsing header; not a YUV4MPEG2 file?\n"); return -1; } buffer[i] = '\0'; - if (memcmp(buffer, "YUV4MPEG", 8)) { - fprintf(stderr, "Incomplete magic for YUV4MPEG file.\n"); - return -1; - } - if (buffer[8] != '2') { - fprintf(stderr, "Incorrect YUV input file version; YUV4MPEG2 required.\n"); - } - ret = y4m_parse_tags(_y4m, buffer + 5); + ret = y4m_parse_tags(y4m_ctx, buffer); if (ret < 0) { fprintf(stderr, "Error parsing YUV4MPEG2 header.\n"); return ret; } - if (_y4m->interlace == '?') { + if (y4m_ctx->interlace == '?') { fprintf(stderr, "Warning: Input video interlacing format unknown; " "assuming progressive scan.\n"); - } else if (_y4m->interlace != 'p') { + } else if (y4m_ctx->interlace != 'p') { fprintf(stderr, "Input video is interlaced; " "Only progressive scan handled.\n"); return -1; } - _y4m->vpx_fmt = VPX_IMG_FMT_I420; - _y4m->bps = 12; - _y4m->bit_depth = 8; - if (strcmp(_y4m->chroma_type, "420") == 0 || - strcmp(_y4m->chroma_type, "420jpeg") == 0) { - _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = - _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = - _y4m->pic_w * _y4m->pic_h + - 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I420; + y4m_ctx->bps = 12; + y4m_ctx->bit_depth = 8; + y4m_ctx->aux_buf = NULL; + y4m_ctx->dst_buf = NULL; + if (strcmp(y4m_ctx->chroma_type, "420") == 0 || + strcmp(y4m_ctx->chroma_type, "420jpeg") == 0) { + y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v = + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = + y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2); /* Natively supported: no conversion required. 
*/ - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_null; - } else if (strcmp(_y4m->chroma_type, "420p10") == 0) { - _y4m->src_c_dec_h = 2; - _y4m->dst_c_dec_h = 2; - _y4m->src_c_dec_v = 2; - _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = - 2 * (_y4m->pic_w * _y4m->pic_h + - 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2)); + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; + } else if (strcmp(y4m_ctx->chroma_type, "420p10") == 0) { + y4m_ctx->src_c_dec_h = 2; + y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 2; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = + 2 * (y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2)); /* Natively supported: no conversion required. */ - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_null; - _y4m->bit_depth = 10; - _y4m->bps = 15; - _y4m->vpx_fmt = VPX_IMG_FMT_I42016; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; + y4m_ctx->bit_depth = 10; + y4m_ctx->bps = 15; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I42016; if (only_420) { fprintf(stderr, "Unsupported conversion from 420p10 to 420jpeg\n"); return -1; } - } else if (strcmp(_y4m->chroma_type, "420p12") == 0) { - _y4m->src_c_dec_h = 2; - _y4m->dst_c_dec_h = 2; - _y4m->src_c_dec_v = 2; - _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = - 2 * (_y4m->pic_w * _y4m->pic_h + - 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2)); + } else if (strcmp(y4m_ctx->chroma_type, "420p12") == 0) { + y4m_ctx->src_c_dec_h = 2; + y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 2; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = + 2 * (y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2)); /* Natively supported: no conversion required. 
*/ - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_null; - _y4m->bit_depth = 12; - _y4m->bps = 18; - _y4m->vpx_fmt = VPX_IMG_FMT_I42016; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; + y4m_ctx->bit_depth = 12; + y4m_ctx->bps = 18; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I42016; if (only_420) { fprintf(stderr, "Unsupported conversion from 420p12 to 420jpeg\n"); return -1; } - } else if (strcmp(_y4m->chroma_type, "420mpeg2") == 0) { - _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = - _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + } else if (strcmp(y4m_ctx->chroma_type, "420mpeg2") == 0) { + y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v = + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; /*Chroma filter required: read into the aux buf first.*/ - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = - 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); - _y4m->convert = y4m_convert_42xmpeg2_42xjpeg; - } else if (strcmp(_y4m->chroma_type, "420paldv") == 0) { - _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = - _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = + 2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2); + y4m_ctx->convert = y4m_convert_42xmpeg2_42xjpeg; + } else if (strcmp(y4m_ctx->chroma_type, "420paldv") == 0) { + y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v = + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; /*Chroma filter required: read into the aux buf first. We need to make two filter passes, so we need some extra space in the aux buffer.*/ - _y4m->aux_buf_sz = 3 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); - _y4m->aux_buf_read_sz = - 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); - _y4m->convert = y4m_convert_42xpaldv_42xjpeg; - } else if (strcmp(_y4m->chroma_type, "422jpeg") == 0) { - _y4m->src_c_dec_h = _y4m->dst_c_dec_h = 2; - _y4m->src_c_dec_v = 1; - _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + y4m_ctx->aux_buf_sz = + 3 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2); + y4m_ctx->aux_buf_read_sz = + 2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2); + y4m_ctx->convert = y4m_convert_42xpaldv_42xjpeg; + } else if (strcmp(y4m_ctx->chroma_type, "422jpeg") == 0) { + y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; /*Chroma filter required: read into the aux buf first.*/ - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = - 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; - _y4m->convert = y4m_convert_422jpeg_420jpeg; - } else if (strcmp(_y4m->chroma_type, "422") == 0) { - _y4m->src_c_dec_h = 2; - _y4m->src_c_dec_v = 1; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = + 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; + y4m_ctx->convert = y4m_convert_422jpeg_420jpeg; + } else if (strcmp(y4m_ctx->chroma_type, "422") == 0) { + y4m_ctx->src_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 1; if (only_420) { - _y4m->dst_c_dec_h = 2; - _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; /*Chroma filter required: read into the aux buf first. 
We need to make two filter passes, so we need some extra space in the aux buffer.*/ - _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; - _y4m->aux_buf_sz = - _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; - _y4m->convert = y4m_convert_422_420jpeg; + y4m_ctx->aux_buf_read_sz = + 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz + + ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; + y4m_ctx->convert = y4m_convert_422_420jpeg; } else { - _y4m->vpx_fmt = VPX_IMG_FMT_I422; - _y4m->bps = 16; - _y4m->dst_c_dec_h = _y4m->src_c_dec_h; - _y4m->dst_c_dec_v = _y4m->src_c_dec_v; - _y4m->dst_buf_read_sz = - _y4m->pic_w * _y4m->pic_h + 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I422; + y4m_ctx->bps = 16; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = + y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; /*Natively supported: no conversion required.*/ - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_null; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; } - } else if (strcmp(_y4m->chroma_type, "422p10") == 0) { - _y4m->src_c_dec_h = 2; - _y4m->src_c_dec_v = 1; - _y4m->vpx_fmt = VPX_IMG_FMT_I42216; - _y4m->bps = 20; - _y4m->bit_depth = 10; - _y4m->dst_c_dec_h = _y4m->src_c_dec_h; - _y4m->dst_c_dec_v = _y4m->src_c_dec_v; - _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h + - 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h); - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_null; + } else if (strcmp(y4m_ctx->chroma_type, "422p10") == 0) { + y4m_ctx->src_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I42216; + y4m_ctx->bps = 20; + y4m_ctx->bit_depth = 10; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = + 2 * (y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h); + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; if (only_420) { fprintf(stderr, "Unsupported conversion from 422p10 to 420jpeg\n"); return -1; } - } else if (strcmp(_y4m->chroma_type, "422p12") == 0) { - _y4m->src_c_dec_h = 2; - _y4m->src_c_dec_v = 1; - _y4m->vpx_fmt = VPX_IMG_FMT_I42216; - _y4m->bps = 24; - _y4m->bit_depth = 12; - _y4m->dst_c_dec_h = _y4m->src_c_dec_h; - _y4m->dst_c_dec_v = _y4m->src_c_dec_v; - _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h + - 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h); - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_null; + } else if (strcmp(y4m_ctx->chroma_type, "422p12") == 0) { + y4m_ctx->src_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I42216; + y4m_ctx->bps = 24; + y4m_ctx->bit_depth = 12; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = + 2 * (y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h); + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; if (only_420) { fprintf(stderr, "Unsupported conversion from 422p12 to 420jpeg\n"); return -1; } - } else if (strcmp(_y4m->chroma_type, "411") == 0) { - _y4m->src_c_dec_h = 4; - _y4m->dst_c_dec_h = 2; - _y4m->src_c_dec_v = 1; - _y4m->dst_c_dec_v = 2; - 
_y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + } else if (strcmp(y4m_ctx->chroma_type, "411") == 0) { + y4m_ctx->src_c_dec_h = 4; + y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; /*Chroma filter required: read into the aux buf first. We need to make two filter passes, so we need some extra space in the aux buffer.*/ - _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 3) / 4) * _y4m->pic_h; - _y4m->aux_buf_sz = - _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; - _y4m->convert = y4m_convert_411_420jpeg; + y4m_ctx->aux_buf_read_sz = 2 * ((y4m_ctx->pic_w + 3) / 4) * y4m_ctx->pic_h; + y4m_ctx->aux_buf_sz = + y4m_ctx->aux_buf_read_sz + ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; + y4m_ctx->convert = y4m_convert_411_420jpeg; fprintf(stderr, "Unsupported conversion from yuv 411\n"); return -1; - } else if (strcmp(_y4m->chroma_type, "444") == 0) { - _y4m->src_c_dec_h = 1; - _y4m->src_c_dec_v = 1; + } else if (strcmp(y4m_ctx->chroma_type, "444") == 0) { + y4m_ctx->src_c_dec_h = 1; + y4m_ctx->src_c_dec_v = 1; if (only_420) { - _y4m->dst_c_dec_h = 2; - _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; /*Chroma filter required: read into the aux buf first. We need to make two filter passes, so we need some extra space in the aux buffer.*/ - _y4m->aux_buf_read_sz = 2 * _y4m->pic_w * _y4m->pic_h; - _y4m->aux_buf_sz = - _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; - _y4m->convert = y4m_convert_444_420jpeg; + y4m_ctx->aux_buf_read_sz = 2 * y4m_ctx->pic_w * y4m_ctx->pic_h; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz + + ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; + y4m_ctx->convert = y4m_convert_444_420jpeg; } else { - _y4m->vpx_fmt = VPX_IMG_FMT_I444; - _y4m->bps = 24; - _y4m->dst_c_dec_h = _y4m->src_c_dec_h; - _y4m->dst_c_dec_v = _y4m->src_c_dec_v; - _y4m->dst_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I444; + y4m_ctx->bps = 24; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = 3 * y4m_ctx->pic_w * y4m_ctx->pic_h; /*Natively supported: no conversion required.*/ - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_null; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; } - } else if (strcmp(_y4m->chroma_type, "444p10") == 0) { - _y4m->src_c_dec_h = 1; - _y4m->src_c_dec_v = 1; - _y4m->vpx_fmt = VPX_IMG_FMT_I44416; - _y4m->bps = 30; - _y4m->bit_depth = 10; - _y4m->dst_c_dec_h = _y4m->src_c_dec_h; - _y4m->dst_c_dec_v = _y4m->src_c_dec_v; - _y4m->dst_buf_read_sz = 2 * 3 * _y4m->pic_w * _y4m->pic_h; - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_null; + } else if (strcmp(y4m_ctx->chroma_type, "444p10") == 0) { + y4m_ctx->src_c_dec_h = 1; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I44416; + y4m_ctx->bps = 30; + y4m_ctx->bit_depth = 10; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = 2 * 3 * y4m_ctx->pic_w * y4m_ctx->pic_h; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; if (only_420) { fprintf(stderr, "Unsupported conversion from 444p10 to 420jpeg\n"); return -1; } - } else if 
(strcmp(_y4m->chroma_type, "444p12") == 0) { - _y4m->src_c_dec_h = 1; - _y4m->src_c_dec_v = 1; - _y4m->vpx_fmt = VPX_IMG_FMT_I44416; - _y4m->bps = 36; - _y4m->bit_depth = 12; - _y4m->dst_c_dec_h = _y4m->src_c_dec_h; - _y4m->dst_c_dec_v = _y4m->src_c_dec_v; - _y4m->dst_buf_read_sz = 2 * 3 * _y4m->pic_w * _y4m->pic_h; - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_null; + } else if (strcmp(y4m_ctx->chroma_type, "444p12") == 0) { + y4m_ctx->src_c_dec_h = 1; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I44416; + y4m_ctx->bps = 36; + y4m_ctx->bit_depth = 12; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = 2 * 3 * y4m_ctx->pic_w * y4m_ctx->pic_h; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; if (only_420) { fprintf(stderr, "Unsupported conversion from 444p12 to 420jpeg\n"); return -1; } - } else if (strcmp(_y4m->chroma_type, "mono") == 0) { - _y4m->src_c_dec_h = _y4m->src_c_dec_v = 0; - _y4m->dst_c_dec_h = _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + } else if (strcmp(y4m_ctx->chroma_type, "mono") == 0) { + y4m_ctx->src_c_dec_h = y4m_ctx->src_c_dec_v = 0; + y4m_ctx->dst_c_dec_h = y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; /*No extra space required, but we need to clear the chroma planes.*/ - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_mono_420jpeg; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_mono_420jpeg; } else { - fprintf(stderr, "Unknown chroma sampling type: %s\n", _y4m->chroma_type); + fprintf(stderr, "Unknown chroma sampling type: %s\n", y4m_ctx->chroma_type); return -1; } /*The size of the final frame buffers is always computed from the destination chroma decimation type.*/ - _y4m->dst_buf_sz = - _y4m->pic_w * _y4m->pic_h + - 2 * ((_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h) * - ((_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v); - if (_y4m->bit_depth == 8) - _y4m->dst_buf = (unsigned char *)malloc(_y4m->dst_buf_sz); + y4m_ctx->dst_buf_sz = + y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + y4m_ctx->dst_c_dec_h - 1) / y4m_ctx->dst_c_dec_h) * + ((y4m_ctx->pic_h + y4m_ctx->dst_c_dec_v - 1) / y4m_ctx->dst_c_dec_v); + if (y4m_ctx->bit_depth == 8) + y4m_ctx->dst_buf = (unsigned char *)malloc(y4m_ctx->dst_buf_sz); else - _y4m->dst_buf = (unsigned char *)malloc(2 * _y4m->dst_buf_sz); + y4m_ctx->dst_buf = (unsigned char *)malloc(2 * y4m_ctx->dst_buf_sz); - if (_y4m->aux_buf_sz > 0) - _y4m->aux_buf = (unsigned char *)malloc(_y4m->aux_buf_sz); + if (y4m_ctx->aux_buf_sz > 0) + y4m_ctx->aux_buf = (unsigned char *)malloc(y4m_ctx->aux_buf_sz); return 0; } diff --git a/y4minput.h b/y4minput.h index a4a8b18dc5..573750d749 100644 --- a/y4minput.h +++ b/y4minput.h @@ -56,8 +56,16 @@ struct y4m_input { unsigned int bit_depth; }; -int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, - int only_420); +/** + * Open the input file, treating it as Y4M. |y4m_ctx| is filled in after + * reading it. The |skip_buffer| indicates bytes that were previously read + * from |file|, to do input-type detection; this buffer will be read before + * the |file| is read. It is of size |num_skip|, which *must* be 8 or less. + * + * Returns 0 on success, -1 on failure. 
+ */ +int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer, + int num_skip, int only_420); void y4m_input_close(y4m_input *_y4m); int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *img); From fe1c96d1113ad73370841f64913dfcd361ff9bf5 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 19 Jan 2021 18:38:23 -0800 Subject: [PATCH 029/926] {highbd_,}loopfilter_neon.c: quiet -Wmaybe-uninitialized Seen with arm-linux-gnueabihf-gcc-8 (8.3.0 & 8.4.0) Without reworking the code or adding an additional branch this warning cannot be silenced otherwise. The loopfilter is only called when needed for a block so these output pixels will be set. BUG=b/176822719 Change-Id: I9cf6e59bd5de901e168867ccbe021d28d0c04933 --- vpx_dsp/arm/highbd_loopfilter_neon.c | 15 +++++++++++++++ vpx_dsp/arm/loopfilter_neon.c | 15 +++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/vpx_dsp/arm/highbd_loopfilter_neon.c b/vpx_dsp/arm/highbd_loopfilter_neon.c index 5530c6425b..8d6e8acc4c 100644 --- a/vpx_dsp/arm/highbd_loopfilter_neon.c +++ b/vpx_dsp/arm/highbd_loopfilter_neon.c @@ -661,6 +661,17 @@ void vpx_highbd_lpf_vertical_8_dual_neon( vpx_highbd_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, bd); } +// Quiet warnings of the form: 'vpx_dsp/arm/highbd_loopfilter_neon.c|675 col 67| +// warning: 'oq1' may be used uninitialized in this function +// [-Wmaybe-uninitialized]', for oq1-op1. Without reworking the code or adding +// an additional branch this warning cannot be silenced otherwise. The +// loopfilter is only called when needed for a block so these output pixels +// will be set. +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + static void lpf_horizontal_16_kernel(uint16_t *s, int p, const uint16x8_t blimit_vec, const uint16x8_t limit_vec, @@ -723,6 +734,10 @@ static void lpf_vertical_16_kernel(uint16_t *s, int p, } } +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic pop +#endif + void vpx_highbd_lpf_horizontal_16_neon(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, diff --git a/vpx_dsp/arm/loopfilter_neon.c b/vpx_dsp/arm/loopfilter_neon.c index 7419cea022..c54e588239 100644 --- a/vpx_dsp/arm/loopfilter_neon.c +++ b/vpx_dsp/arm/loopfilter_neon.c @@ -975,6 +975,17 @@ FUN_LPF_16_KERNEL(_, 8) // lpf_16_kernel FUN_LPF_16_KERNEL(_dual_, 16) // lpf_16_dual_kernel #undef FUN_LPF_16_KERNEL +// Quiet warnings of the form: 'vpx_dsp/arm/loopfilter_neon.c|981 col 42| +// warning: 'oq1' may be used uninitialized in this function +// [-Wmaybe-uninitialized]', for oq1-op1. Without reworking the code or adding +// an additional branch this warning cannot be silenced otherwise. The +// loopfilter is only called when needed for a block so these output pixels +// will be set. 
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + void vpx_lpf_horizontal_16_neon(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6, @@ -1090,3 +1101,7 @@ void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, vget_high_u8(oq0), vget_high_u8(oq1)); } } + +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic pop +#endif From f4fc562489bcee227403ca00f589e70043ebc5dc Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Tue, 19 Jan 2021 17:36:12 -0800 Subject: [PATCH 030/926] Return status in vp9_extrc_create/init/delete Bug: webm:1716 Change-Id: I0b98741db8c639bdddd899fd6ad359da7b916086 --- vp9/encoder/vp9_ext_ratectrl.c | 43 +++++++++++++++++++++++++++------- vp9/encoder/vp9_ext_ratectrl.h | 9 +++---- 2 files changed, 39 insertions(+), 13 deletions(-) diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index 7d553a2ecd..d93abd60d3 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -13,31 +13,56 @@ #include "vp9/common/vp9_common.h" #include "vpx_dsp/psnr.h" -void vp9_extrc_init(EXT_RATECTRL *ext_ratectrl) { vp9_zero(*ext_ratectrl); } +vpx_codec_err_t vp9_extrc_init(EXT_RATECTRL *ext_ratectrl) { + if (ext_ratectrl == NULL) { + return VPX_CODEC_ERROR; + } + vp9_zero(*ext_ratectrl); + return VPX_CODEC_OK; +} -void vp9_extrc_create(vpx_rc_funcs_t funcs, vpx_rc_config_t ratectrl_config, - EXT_RATECTRL *ext_ratectrl) { +vpx_codec_err_t vp9_extrc_create(vpx_rc_funcs_t funcs, + vpx_rc_config_t ratectrl_config, + EXT_RATECTRL *ext_ratectrl) { + vpx_rc_status_t rc_status; vpx_rc_firstpass_stats_t *rc_firstpass_stats; + if (ext_ratectrl == NULL) { + return VPX_CODEC_ERROR; + } vp9_extrc_delete(ext_ratectrl); ext_ratectrl->funcs = funcs; ext_ratectrl->ratectrl_config = ratectrl_config; - ext_ratectrl->funcs.create_model(ext_ratectrl->funcs.priv, - &ext_ratectrl->ratectrl_config, - &ext_ratectrl->model); + rc_status = ext_ratectrl->funcs.create_model(ext_ratectrl->funcs.priv, + &ext_ratectrl->ratectrl_config, + &ext_ratectrl->model); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } rc_firstpass_stats = &ext_ratectrl->rc_firstpass_stats; rc_firstpass_stats->num_frames = ratectrl_config.show_frame_count; rc_firstpass_stats->frame_stats = vpx_malloc(sizeof(*rc_firstpass_stats->frame_stats) * rc_firstpass_stats->num_frames); + if (rc_firstpass_stats->frame_stats == NULL) { + return VPX_CODEC_MEM_ERROR; + } ext_ratectrl->ready = 1; + return VPX_CODEC_OK; } -void vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl) { +vpx_codec_err_t vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl) { + if (ext_ratectrl == NULL) { + return VPX_CODEC_ERROR; + } if (ext_ratectrl->ready) { - ext_ratectrl->funcs.delete_model(ext_ratectrl->model); + vpx_rc_status_t rc_status = + ext_ratectrl->funcs.delete_model(ext_ratectrl->model); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } vpx_free(ext_ratectrl->rc_firstpass_stats.frame_stats); } - vp9_extrc_init(ext_ratectrl); + return vp9_extrc_init(ext_ratectrl); } static void gen_rc_firstpass_stats(const FIRSTPASS_STATS *stats, diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index 6a86218dac..e4d56c0b22 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -22,12 +22,13 @@ typedef 
struct EXT_RATECTRL { vpx_rc_firstpass_stats_t rc_firstpass_stats; } EXT_RATECTRL; -void vp9_extrc_init(EXT_RATECTRL *ext_ratectrl); +vpx_codec_err_t vp9_extrc_init(EXT_RATECTRL *ext_ratectrl); -void vp9_extrc_create(vpx_rc_funcs_t funcs, vpx_rc_config_t ratectrl_config, - EXT_RATECTRL *ext_ratectrl); +vpx_codec_err_t vp9_extrc_create(vpx_rc_funcs_t funcs, + vpx_rc_config_t ratectrl_config, + EXT_RATECTRL *ext_ratectrl); -void vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl); +vpx_codec_err_t vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl); void vp9_extrc_send_firstpass_stats(EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info); From 27f1838519ea1354bb8a038ec4e9d2c6da0da994 Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Tue, 19 Jan 2021 17:48:07 -0800 Subject: [PATCH 031/926] Return status in vp9_extrc_send_firstpass_stats Bug: webm:1716 Change-Id: I96b18436c58ed888fcf677097819cc0093b6f41d --- vp9/encoder/vp9_ext_ratectrl.c | 16 ++++++++++++---- vp9/encoder/vp9_ext_ratectrl.h | 4 ++-- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index d93abd60d3..4a2e1b82dc 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -94,9 +94,13 @@ static void gen_rc_firstpass_stats(const FIRSTPASS_STATS *stats, rc_frame_stats->count = stats->count; } -void vp9_extrc_send_firstpass_stats(EXT_RATECTRL *ext_ratectrl, - const FIRST_PASS_INFO *first_pass_info) { +vpx_codec_err_t vp9_extrc_send_firstpass_stats( + EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info) { + if (ext_ratectrl == NULL) { + return VPX_CODEC_ERROR; + } if (ext_ratectrl->ready) { + vpx_rc_status_t rc_status; vpx_rc_firstpass_stats_t *rc_firstpass_stats = &ext_ratectrl->rc_firstpass_stats; int i; @@ -105,9 +109,13 @@ void vp9_extrc_send_firstpass_stats(EXT_RATECTRL *ext_ratectrl, gen_rc_firstpass_stats(&first_pass_info->stats[i], &rc_firstpass_stats->frame_stats[i]); } - ext_ratectrl->funcs.send_firstpass_stats(ext_ratectrl->model, - rc_firstpass_stats); + rc_status = ext_ratectrl->funcs.send_firstpass_stats(ext_ratectrl->model, + rc_firstpass_stats); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } } + return VPX_CODEC_OK; } static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) { diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index e4d56c0b22..fbb5ebf05d 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -30,8 +30,8 @@ vpx_codec_err_t vp9_extrc_create(vpx_rc_funcs_t funcs, vpx_codec_err_t vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl); -void vp9_extrc_send_firstpass_stats(EXT_RATECTRL *ext_ratectrl, - const FIRST_PASS_INFO *first_pass_info); +vpx_codec_err_t vp9_extrc_send_firstpass_stats( + EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info); void vp9_extrc_get_encodeframe_decision( EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, From d890579a2ec8a8a36b1e75b3fe6662faa99608e5 Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Tue, 19 Jan 2021 17:57:00 -0800 Subject: [PATCH 032/926] Add status in vp9_extrc_get_encodeframe_decision Bug: webm:1716 Change-Id: Ie6d63a68539369c51fefefa528e299b00a967e29 --- vp9/encoder/vp9_ext_ratectrl.c | 12 ++++++++++-- vp9/encoder/vp9_ext_ratectrl.h | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index 4a2e1b82dc..ec6e198af0 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c 
+++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -135,12 +135,16 @@ static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) { } } -void vp9_extrc_get_encodeframe_decision( +vpx_codec_err_t vp9_extrc_get_encodeframe_decision( EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, FRAME_UPDATE_TYPE update_type, RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, vpx_rc_encodeframe_decision_t *encode_frame_decision) { + if (ext_ratectrl == NULL) { + return VPX_CODEC_ERROR; + } if (ext_ratectrl->ready) { + vpx_rc_status_t rc_status; vpx_rc_encodeframe_info_t encode_frame_info; encode_frame_info.show_index = show_index; encode_frame_info.coding_index = coding_index; @@ -151,9 +155,13 @@ void vp9_extrc_get_encodeframe_decision( encode_frame_info.ref_frame_coding_indexes, encode_frame_info.ref_frame_valid_list); - ext_ratectrl->funcs.get_encodeframe_decision( + rc_status = ext_ratectrl->funcs.get_encodeframe_decision( ext_ratectrl->model, &encode_frame_info, encode_frame_decision); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } } + return VPX_CODEC_OK; } void vp9_extrc_update_encodeframe_result(EXT_RATECTRL *ext_ratectrl, diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index fbb5ebf05d..2082cd530e 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -33,7 +33,7 @@ vpx_codec_err_t vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl); vpx_codec_err_t vp9_extrc_send_firstpass_stats( EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info); -void vp9_extrc_get_encodeframe_decision( +vpx_codec_err_t vp9_extrc_get_encodeframe_decision( EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, FRAME_UPDATE_TYPE update_type, RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, From d49700e25b280eba27ab3804cc769aed68ffb3e5 Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Tue, 19 Jan 2021 18:18:20 -0800 Subject: [PATCH 033/926] Add return to vp9_extrc_update_encodeframe_result Bug: webm:1716 Change-Id: Ib016ab5a49c765971366cc8d2b75bcca3ed5bd0f --- vp9/encoder/vp9_ext_ratectrl.c | 23 +++++++++++++++-------- vp9/encoder/vp9_ext_ratectrl.h | 11 +++++------ 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index ec6e198af0..70d6dd9c22 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -164,14 +164,17 @@ vpx_codec_err_t vp9_extrc_get_encodeframe_decision( return VPX_CODEC_OK; } -void vp9_extrc_update_encodeframe_result(EXT_RATECTRL *ext_ratectrl, - int64_t bit_count, - const YV12_BUFFER_CONFIG *source_frame, - const YV12_BUFFER_CONFIG *coded_frame, - uint32_t bit_depth, - uint32_t input_bit_depth) { +vpx_codec_err_t vp9_extrc_update_encodeframe_result( + EXT_RATECTRL *ext_ratectrl, int64_t bit_count, + const YV12_BUFFER_CONFIG *source_frame, + const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth, + uint32_t input_bit_depth) { + if (ext_ratectrl == NULL) { + return VPX_CODEC_ERROR; + } if (ext_ratectrl->ready) { PSNR_STATS psnr; + vpx_rc_status_t rc_status; vpx_rc_encodeframe_result_t encode_frame_result; encode_frame_result.bit_count = bit_count; encode_frame_result.pixel_count = @@ -186,7 +189,11 @@ void vp9_extrc_update_encodeframe_result(EXT_RATECTRL *ext_ratectrl, vpx_calc_psnr(source_frame, coded_frame, &psnr); #endif encode_frame_result.sse = psnr.sse[0]; - ext_ratectrl->funcs.update_encodeframe_result(ext_ratectrl->model, - 
&encode_frame_result); + rc_status = ext_ratectrl->funcs.update_encodeframe_result( + ext_ratectrl->model, &encode_frame_result); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } } + return VPX_CODEC_OK; } diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index 2082cd530e..11e9102a65 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -39,11 +39,10 @@ vpx_codec_err_t vp9_extrc_get_encodeframe_decision( RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, vpx_rc_encodeframe_decision_t *encode_frame_decision); -void vp9_extrc_update_encodeframe_result(EXT_RATECTRL *ext_ratectrl, - int64_t bit_count, - const YV12_BUFFER_CONFIG *source_frame, - const YV12_BUFFER_CONFIG *coded_frame, - uint32_t bit_depth, - uint32_t input_bit_depth); +vpx_codec_err_t vp9_extrc_update_encodeframe_result( + EXT_RATECTRL *ext_ratectrl, int64_t bit_count, + const YV12_BUFFER_CONFIG *source_frame, + const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth, + uint32_t input_bit_depth); #endif // VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_ From f57fa3f1df45ea80049ff831a054ac66a12aebdc Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Tue, 19 Jan 2021 18:45:38 -0800 Subject: [PATCH 034/926] Handle vp9_extrc functions' return status properly Bug: webm:1716 Change-Id: I204cd3ab35b493759808500b799da3b9e55686d4 --- vp9/encoder/vp9_encoder.c | 29 ++++++++++++++++++++++++----- vp9/vp9_cx_iface.c | 6 +++++- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 6968e57919..eea2f18400 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2464,7 +2464,12 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; - vp9_extrc_init(&cpi->ext_ratectrl); + { + vpx_codec_err_t codec_status = vp9_extrc_init(&cpi->ext_ratectrl); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, "vp9_extrc_init() failed"); + } + } #if !CONFIG_REALTIME_ONLY if (oxcf->pass == 1) { @@ -4503,6 +4508,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest } #endif if (cpi->ext_ratectrl.ready) { + vpx_codec_err_t codec_status; const GF_GROUP *gf_group = &cpi->twopass.gf_group; vpx_rc_encodeframe_decision_t encode_frame_decision; FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; @@ -4511,10 +4517,14 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest const RefCntBuffer *curr_frame_buf = get_ref_cnt_buffer(cm, cm->new_fb_idx); get_ref_frame_bufs(cpi, ref_frame_bufs); - vp9_extrc_get_encodeframe_decision( + codec_status = vp9_extrc_get_encodeframe_decision( &cpi->ext_ratectrl, curr_frame_buf->frame_index, cm->current_frame_coding_index, gf_group->index, update_type, ref_frame_bufs, ref_frame_flags, &encode_frame_decision); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_get_encodeframe_decision() failed"); + } q = encode_frame_decision.q_index; } @@ -5489,9 +5499,13 @@ static void encode_frame_to_data_rate( { const RefCntBuffer *coded_frame_buf = get_ref_cnt_buffer(cm, cm->new_fb_idx); - vp9_extrc_update_encodeframe_result( + vpx_codec_err_t codec_status = vp9_extrc_update_encodeframe_result( &cpi->ext_ratectrl, (*size) << 3, cpi->Source, &coded_frame_buf->buf, cm->bit_depth, cpi->oxcf.input_bit_depth); + if (codec_status != VPX_CODEC_OK) { + 
vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_update_encodeframe_result() failed"); + } } #if CONFIG_REALTIME_ONLY (void)encode_frame_result; @@ -5682,8 +5696,13 @@ static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; if (cpi->common.current_frame_coding_index == 0) { - vp9_extrc_send_firstpass_stats(&cpi->ext_ratectrl, - &cpi->twopass.first_pass_info); + VP9_COMMON *cm = &cpi->common; + const vpx_codec_err_t codec_status = vp9_extrc_send_firstpass_stats( + &cpi->ext_ratectrl, &cpi->twopass.first_pass_info); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_send_firstpass_stats() failed"); + } } #if CONFIG_MISMATCH_DEBUG mismatch_move_frame_idx_w(); diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index a73683dfe0..ecfacfaf43 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1744,6 +1744,7 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, if (oxcf->pass == 2) { const FRAME_INFO *frame_info = &cpi->frame_info; vpx_rc_config_t ratectrl_config; + vpx_codec_err_t codec_status; ratectrl_config.frame_width = frame_info->frame_width; ratectrl_config.frame_height = frame_info->frame_height; @@ -1755,7 +1756,10 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, ratectrl_config.frame_rate_num = oxcf->g_timebase.den; ratectrl_config.frame_rate_den = oxcf->g_timebase.num; - vp9_extrc_create(funcs, ratectrl_config, ext_ratectrl); + codec_status = vp9_extrc_create(funcs, ratectrl_config, ext_ratectrl); + if (codec_status != VPX_CODEC_OK) { + return codec_status; + } } return VPX_CODEC_OK; } From b0050f27e27c59312a12c037b263a1bb71df4f3c Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Tue, 19 Jan 2021 18:56:48 -0800 Subject: [PATCH 035/926] Use VPX_CODEC_INVALID_PARAM when ext_ratectrl=NULL Bug: webm:1716 Change-Id: Ic60c367aabfc03d94816e85476895b988aced5f1 --- vp9/encoder/vp9_ext_ratectrl.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index 70d6dd9c22..a27eb653ba 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -15,7 +15,7 @@ vpx_codec_err_t vp9_extrc_init(EXT_RATECTRL *ext_ratectrl) { if (ext_ratectrl == NULL) { - return VPX_CODEC_ERROR; + return VPX_CODEC_INVALID_PARAM; } vp9_zero(*ext_ratectrl); return VPX_CODEC_OK; @@ -27,7 +27,7 @@ vpx_codec_err_t vp9_extrc_create(vpx_rc_funcs_t funcs, vpx_rc_status_t rc_status; vpx_rc_firstpass_stats_t *rc_firstpass_stats; if (ext_ratectrl == NULL) { - return VPX_CODEC_ERROR; + return VPX_CODEC_INVALID_PARAM; } vp9_extrc_delete(ext_ratectrl); ext_ratectrl->funcs = funcs; @@ -52,7 +52,7 @@ vpx_codec_err_t vp9_extrc_create(vpx_rc_funcs_t funcs, vpx_codec_err_t vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl) { if (ext_ratectrl == NULL) { - return VPX_CODEC_ERROR; + return VPX_CODEC_INVALID_PARAM; } if (ext_ratectrl->ready) { vpx_rc_status_t rc_status = @@ -97,7 +97,7 @@ static void gen_rc_firstpass_stats(const FIRSTPASS_STATS *stats, vpx_codec_err_t vp9_extrc_send_firstpass_stats( EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info) { if (ext_ratectrl == NULL) { - return VPX_CODEC_ERROR; + return VPX_CODEC_INVALID_PARAM; } if (ext_ratectrl->ready) { vpx_rc_status_t rc_status; @@ -141,7 +141,7 @@ vpx_codec_err_t vp9_extrc_get_encodeframe_decision( RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int 
ref_frame_flags, vpx_rc_encodeframe_decision_t *encode_frame_decision) { if (ext_ratectrl == NULL) { - return VPX_CODEC_ERROR; + return VPX_CODEC_INVALID_PARAM; } if (ext_ratectrl->ready) { vpx_rc_status_t rc_status; @@ -170,7 +170,7 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result( const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth, uint32_t input_bit_depth) { if (ext_ratectrl == NULL) { - return VPX_CODEC_ERROR; + return VPX_CODEC_INVALID_PARAM; } if (ext_ratectrl->ready) { PSNR_STATS psnr; From 7b93b56ab9b27f3c2a72e05b7ea3b5e85a06f5fa Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 21 Jan 2021 13:07:20 -0800 Subject: [PATCH 036/926] Do not reuse mv in base spatial layer if curr buf same as prev. Bug: b/154890543 Change-Id: Iad5791912f781d225e610a61bc13f3dbaef81bb9 --- vp9/encoder/vp9_encoder.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index f4587d42d9..4823d5f0f1 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -7861,9 +7861,12 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cm->new_fb_idx = get_free_fb(cm); if (cm->new_fb_idx == INVALID_IDX) return -1; - cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx]; - + // If the frame buffer for current frame is the same as previous frame, MV in + // the base layer shouldn't be used as it'll cause data race. + if (cm->cur_frame == cm->prev_frame) { + cpi->svc.use_base_mv = 0; + } // Start with a 0 size frame. *size = 0; From bd8dfea54d10adb2c0b19ccdaa6891757b1e5ae0 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 26 Jan 2021 18:03:17 -0800 Subject: [PATCH 037/926] sad_test: fix compilation w/gcc 4.8.5 use a #define for kDataAlignment as it's used with DECLARE_ALIGNED (__attribute__((aligned(n)))) and this version under CentOS is more strict over integer constants: ../vpx_ports/mem.h:18:72: error: requested alignment is not an integer constant #define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n))) Bug: webm:1690 Change-Id: I8d4661ec1c2c1b1522bdc210689715d2302c7e72 --- test/sad_test.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 34cb26ed11..ee10a46389 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -26,6 +26,10 @@ #include "vpx_ports/msvc.h" #include "vpx_ports/vpx_timer.h" +// const[expr] should be sufficient for DECLARE_ALIGNED but early +// implementations of c++11 appear to have some issues with it. +#define kDataAlignment 32 + template struct TestParams { TestParams(int w, int h, Function f, int bd = -1) @@ -117,9 +121,6 @@ class SADTestBase : public ::testing::TestWithParam { protected: // Handle blocks up to 4 blocks 64x64 with stride up to 128 // crbug.com/webm/1660 - // const[expr] should be sufficient for DECLARE_ALIGNED but early - // implementations of c++11 appear to have some issues with it. 
- enum { kDataAlignment = 32 }; static const int kDataBlockSize = 64 * 128; static const int kDataBufferSize = 4 * kDataBlockSize; From 987dd3a9be85a4221733fd4a320eabce7463e56a Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 26 Jan 2021 18:05:47 -0800 Subject: [PATCH 038/926] vp9_end_to_end_test: fix compile with gcc 4.8.5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit use Values() rather than ValuesIn() with an initializer list as this version of gcc under CentOS fails to deduce the type: ../third_party/googletest/src/include/gtest/gtest-param-test.h:304:29: note: template argument deduction/substitution failed: ../test/vp9_end_to_end_test.cc:346:59: note: couldn't deduce template parameter ‘T’ ::testing::ValuesIn({ 6, 7, 8 })); Bug: webm:1690 Change-Id: I43d9d4777fcd74a4f8fa8bdcd9834cdca5e546ff --- test/vp9_end_to_end_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/vp9_end_to_end_test.cc b/test/vp9_end_to_end_test.cc index 4e3a78fac4..7cc126ea58 100644 --- a/test/vp9_end_to_end_test.cc +++ b/test/vp9_end_to_end_test.cc @@ -342,7 +342,7 @@ VP9_INSTANTIATE_TEST_SUITE(EndToEndTestLarge, VP9_INSTANTIATE_TEST_SUITE(EndToEndNV12, ::testing::Values(::libvpx_test::kRealTime), ::testing::ValuesIn(kTestVectorsNv12), - ::testing::ValuesIn({ 6, 7, 8 })); + ::testing::Values(6, 7, 8)); VP9_INSTANTIATE_TEST_SUITE(EndToEndTestAdaptiveRDThresh, ::testing::Values(5, 6, 7), ::testing::Values(8, 9)); From f46b66ac83279e5403091d307cd3be7d97059949 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 27 Jan 2021 09:41:18 -0800 Subject: [PATCH 039/926] svc: turn off use_base_mv on non base layer. Change-Id: I4a9402f468e54c58081c882ed37f59ee0269c0fc --- vp9/encoder/vp9_encoder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 4823d5f0f1..4750f5b7bc 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -7864,7 +7864,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx]; // If the frame buffer for current frame is the same as previous frame, MV in // the base layer shouldn't be used as it'll cause data race. - if (cm->cur_frame == cm->prev_frame) { + if (cpi->svc.spatial_layer_id > 0 && cm->cur_frame == cm->prev_frame) { cpi->svc.use_base_mv = 0; } // Start with a 0 size frame. From ebb5ffc1d462c70dfb2283a5c7afcb75288c7692 Mon Sep 17 00:00:00 2001 From: Elliott Karpilovsky Date: Thu, 28 Jan 2021 11:22:58 -0800 Subject: [PATCH 040/926] Relax constraints on Y4M header parsing Previous parser assumed that the header would not exceed 80 characters. However, with latest FFMPEG changes, the header of Y4M files can exceed this limit. New parser can parse up to ~200 characters. Arbitrary parsing in future commit. BUG=aomedia:2876 Change-Id: I2ab8a7930cb5b76004e6731321d0ea20ddf333c1 --- test/y4m_test.cc | 26 +++++++++++++++++ y4minput.c | 76 ++++++++++++++++++++++++++++-------------------- 2 files changed, 71 insertions(+), 31 deletions(-) diff --git a/test/y4m_test.cc b/test/y4m_test.cc index 5df389f520..8272263f66 100644 --- a/test/y4m_test.cc +++ b/test/y4m_test.cc @@ -213,4 +213,30 @@ TEST(Y4MHeaderTest, RegularHeader) { y4m_input_close(&y4m); } +// Testing that headers over 100 characters can be parsed. 
+static const char kY4MLongHeader[] = + "YUV4MPEG2 W4 H4 F30:1 Ip A0:0 C420jpeg XYSCSS=420JPEG " + "XCOLORRANGE=LIMITED XSOME_UNKNOWN_METADATA XOTHER_UNKNOWN_METADATA\n" + "FRAME\n" + "012345678912345601230123"; + +TEST(Y4MHeaderTest, LongHeader) { + libvpx_test::TempOutFile f; + fwrite(kY4MLongHeader, 1, sizeof(kY4MLongHeader), f.file()); + fflush(f.file()); + EXPECT_EQ(fseek(f.file(), 0, 0), 0); + + y4m_input y4m; + EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/NULL, + /*num_skip=*/0, /*only_420=*/0), + 0); + EXPECT_EQ(y4m.pic_w, 4); + EXPECT_EQ(y4m.pic_h, 4); + EXPECT_EQ(y4m.fps_n, 30); + EXPECT_EQ(y4m.fps_d, 1); + EXPECT_EQ(y4m.interlace, 'p'); + EXPECT_EQ(strcmp("420jpeg", y4m.chroma_type), 0); + y4m_input_close(&y4m); +} + } // namespace diff --git a/y4minput.c b/y4minput.c index 68000768c9..1983021a1e 100644 --- a/y4minput.c +++ b/y4minput.c @@ -52,15 +52,8 @@ static int file_read(void *buf, size_t size, FILE *file) { } static int y4m_parse_tags(y4m_input *_y4m, char *_tags) { - int got_w; - int got_h; - int got_fps; - int got_interlace; - int got_par; - int got_chroma; char *p; char *q; - got_w = got_h = got_fps = got_interlace = got_par = got_chroma = 0; for (p = _tags;; p = q) { /*Skip any leading spaces.*/ while (*p == ' ') p++; @@ -73,52 +66,74 @@ static int y4m_parse_tags(y4m_input *_y4m, char *_tags) { switch (p[0]) { case 'W': { if (sscanf(p + 1, "%d", &_y4m->pic_w) != 1) return -1; - got_w = 1; break; } case 'H': { if (sscanf(p + 1, "%d", &_y4m->pic_h) != 1) return -1; - got_h = 1; break; } case 'F': { if (sscanf(p + 1, "%d:%d", &_y4m->fps_n, &_y4m->fps_d) != 2) { return -1; } - got_fps = 1; break; } case 'I': { _y4m->interlace = p[1]; - got_interlace = 1; break; } case 'A': { if (sscanf(p + 1, "%d:%d", &_y4m->par_n, &_y4m->par_d) != 2) { return -1; } - got_par = 1; break; } case 'C': { if (q - p > 16) return -1; memcpy(_y4m->chroma_type, p + 1, q - p - 1); _y4m->chroma_type[q - p - 1] = '\0'; - got_chroma = 1; break; } /*Ignore unknown tags.*/ } } - if (!got_w || !got_h || !got_fps) return -1; - if (!got_interlace) _y4m->interlace = '?'; - if (!got_par) _y4m->par_n = _y4m->par_d = 0; - /*Chroma-type is not specified in older files, e.g., those generated by - mplayer.*/ - if (!got_chroma) strcpy(_y4m->chroma_type, "420"); return 0; } +/* Returns 1 if tags were parsed successfully, 0 otherwise. */ +static int parse_tags(y4m_input *y4m_ctx, char *buffer) { + /* Set Y4M tags to defaults, updating them as processing occurs. Mandatory + fields are marked with -1 and will be checked after the tags are parsed. */ + int ret; + y4m_ctx->pic_w = -1; + y4m_ctx->pic_h = -1; + y4m_ctx->fps_n = -1; /* Also serves as marker for fps_d */ + y4m_ctx->par_n = 0; + y4m_ctx->par_d = 0; + y4m_ctx->interlace = '?'; + snprintf(y4m_ctx->chroma_type, sizeof(y4m_ctx->chroma_type), "420"); + + ret = y4m_parse_tags(y4m_ctx, buffer); + if (ret < 0) { + return 0; + } + + /* Check the mandatory fields. 
*/ + if (y4m_ctx->pic_w == -1) { + fprintf(stderr, "Width field missing\n"); + return 0; + } + if (y4m_ctx->pic_h == -1) { + fprintf(stderr, "Height field missing\n"); + return 0; + } + if (y4m_ctx->fps_n == -1) { + fprintf(stderr, "FPS field missing\n"); + return 0; + } + return 1; +} + /*All anti-aliasing filters in the following conversion functions are based on one of two window functions: The 6-tap Lanczos window (for down-sampling and shifts): @@ -786,13 +801,14 @@ static void y4m_convert_null(y4m_input *_y4m, unsigned char *_dst, } static const char TAG[] = "YUV4MPEG2"; +/* Temporary until arbitrary header parsing submitted. */ +#define Y4M_HEADER_BUF_SIZE 200 int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer, int num_skip, int only_420) { // File must start with |TAG|. - char tag_buffer[9]; // 9 == strlen(TAG) - char buffer[80] = { 0 }; - int ret; + char tag_buffer[9]; // 9 == strlen(TAG) + char buffer[Y4M_HEADER_BUF_SIZE] = { 0 }; // Rest of header. int i; // Read as much as possible from |skip_buffer|, which were characters // that were previously read from the file to do input-type detection. @@ -813,21 +829,19 @@ int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer, fprintf(stderr, "Error parsing header: space must follow %s\n", TAG); return -1; } - /*Read until newline, or 80 cols, whichever happens first.*/ - for (i = 0; i < 79; i++) { + /*Read until newline, or Y4M_HEADER_BUF_SIZE cols, whichever happens first.*/ + for (i = 0; i < Y4M_HEADER_BUF_SIZE - 1; i++) { if (!file_read(buffer + i, 1, file)) return -1; if (buffer[i] == '\n') break; } - /*We skipped too much header data.*/ - if (i == 79) { - fprintf(stderr, "Error parsing header; not a YUV4MPEG2 file?\n"); + if (i == Y4M_HEADER_BUF_SIZE - 1) { + fprintf(stderr, "Error parsing header; not a %s file?\n", TAG); return -1; } buffer[i] = '\0'; - ret = y4m_parse_tags(y4m_ctx, buffer); - if (ret < 0) { - fprintf(stderr, "Error parsing YUV4MPEG2 header.\n"); - return ret; + if (!parse_tags(y4m_ctx, buffer)) { + fprintf(stderr, "Error parsing %s header.\n", TAG); + return -1; } if (y4m_ctx->interlace == '?') { fprintf(stderr, From 61edec1efbea1c02d71857e2aff9426d9cd2df4e Mon Sep 17 00:00:00 2001 From: Elliott Karpilovsky Date: Fri, 29 Jan 2021 09:37:31 -0800 Subject: [PATCH 041/926] Relax constraints on Y4M header parsing Previous parser assumed that the header would not exceed 80 characters. However, with latest FFMPEG changes, the header of Y4M files can exceed this limit. New parser can parse an arbitrarily long header, as long each tag is 255 or less characters. BUG=aomedia:2876 Change-Id: I9e6e42c50f4e49251dd697eef8036485ad5a1228 --- y4minput.c | 78 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 54 insertions(+), 24 deletions(-) diff --git a/y4minput.c b/y4minput.c index 1983021a1e..f923eda34a 100644 --- a/y4minput.c +++ b/y4minput.c @@ -100,11 +100,50 @@ static int y4m_parse_tags(y4m_input *_y4m, char *_tags) { return 0; } +// Copy a single tag into the buffer, along with a null character. +// Returns 0 if any file IO errors occur. +static int copy_tag(char *buf, size_t buf_len, char *end_tag, FILE *file) { + size_t i; + assert(buf_len >= 1); + // Skip leading space characters. + do { + if (!file_read(buf, 1, file)) { + return 0; + } + } while (buf[0] == ' '); + + // If we hit the newline, treat this as the "empty" tag. 
+ if (buf[0] == '\n') { + buf[0] = '\0'; + *end_tag = '\n'; + return 1; + } + + // Copy over characters until a space is hit, or the buffer is exhausted. + for (i = 1; i < buf_len; ++i) { + if (!file_read(buf + i, 1, file)) { + return 0; + } + if (buf[i] == ' ' || buf[i] == '\n') { + break; + } + } + if (i == buf_len) { + fprintf(stderr, "Error: Y4M header tags must be less than %lu characters\n", + (unsigned long)i); + return 0; + } + *end_tag = buf[i]; + buf[i] = '\0'; + return 1; +} + /* Returns 1 if tags were parsed successfully, 0 otherwise. */ -static int parse_tags(y4m_input *y4m_ctx, char *buffer) { +static int parse_tags(y4m_input *y4m_ctx, FILE *file) { + char tag[256]; + char end; /* Character denoting the end of the tag, ' ' or '\n'. */ /* Set Y4M tags to defaults, updating them as processing occurs. Mandatory fields are marked with -1 and will be checked after the tags are parsed. */ - int ret; y4m_ctx->pic_w = -1; y4m_ctx->pic_h = -1; y4m_ctx->fps_n = -1; /* Also serves as marker for fps_d */ @@ -113,10 +152,16 @@ static int parse_tags(y4m_input *y4m_ctx, char *buffer) { y4m_ctx->interlace = '?'; snprintf(y4m_ctx->chroma_type, sizeof(y4m_ctx->chroma_type), "420"); - ret = y4m_parse_tags(y4m_ctx, buffer); - if (ret < 0) { - return 0; - } + /* Find one tag at a time. */ + do { + if (!copy_tag(tag, sizeof(tag), &end, file)) { + return 0; + } + /* y4m_parse_tags returns 0 on success. */ + if (y4m_parse_tags(y4m_ctx, tag)) { + return 0; + } + } while (end != '\n'); /* Check the mandatory fields. */ if (y4m_ctx->pic_w == -1) { @@ -801,15 +846,11 @@ static void y4m_convert_null(y4m_input *_y4m, unsigned char *_dst, } static const char TAG[] = "YUV4MPEG2"; -/* Temporary until arbitrary header parsing submitted. */ -#define Y4M_HEADER_BUF_SIZE 200 int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer, int num_skip, int only_420) { // File must start with |TAG|. - char tag_buffer[9]; // 9 == strlen(TAG) - char buffer[Y4M_HEADER_BUF_SIZE] = { 0 }; // Rest of header. - int i; + char tag_buffer[9]; // 9 == strlen(TAG) // Read as much as possible from |skip_buffer|, which were characters // that were previously read from the file to do input-type detection. assert(num_skip >= 0 && num_skip <= 8); @@ -825,23 +866,12 @@ int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer, return -1; } // Next character must be a space. - if (!file_read(buffer, 1, file) || buffer[0] != ' ') { + if (!file_read(tag_buffer, 1, file) || tag_buffer[0] != ' ') { fprintf(stderr, "Error parsing header: space must follow %s\n", TAG); return -1; } - /*Read until newline, or Y4M_HEADER_BUF_SIZE cols, whichever happens first.*/ - for (i = 0; i < Y4M_HEADER_BUF_SIZE - 1; i++) { - if (!file_read(buffer + i, 1, file)) return -1; - if (buffer[i] == '\n') break; - } - if (i == Y4M_HEADER_BUF_SIZE - 1) { - fprintf(stderr, "Error parsing header; not a %s file?\n", TAG); - return -1; - } - buffer[i] = '\0'; - if (!parse_tags(y4m_ctx, buffer)) { + if (!parse_tags(y4m_ctx, file)) { fprintf(stderr, "Error parsing %s header.\n", TAG); - return -1; } if (y4m_ctx->interlace == '?') { fprintf(stderr, From 6c5377fd355f7ae76ddea43f0a5732aa4337d31b Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Wed, 3 Feb 2021 10:04:52 -0800 Subject: [PATCH 042/926] Fix to vpx_temporal_svc_encoder Avoid division by zero. 
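[Editor's note: a minimal sketch, with hypothetical values, of the failure
this guards against. In C, frame_cnt % rc.window_size is undefined behavior
when window_size is 0, so the one-line fix below substitutes a 15-frame
default before the windowed bitrate bookkeeping runs:

    unsigned int frame_cnt = 16;
    unsigned int window_size = 0;            // e.g. never set by the caller
    if (window_size == 0) window_size = 15;  // mirrors the fix below
    if (frame_cnt % window_size == 0) {      // now well defined
      /* shift the measurement window */
    }
]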
Change-Id: Icf3f40aa32fe30f42c46417a1437ebe235e3ac96
---
 examples/vpx_temporal_svc_encoder.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c
index ffeae2abc4..04212e5d7d 100644
--- a/examples/vpx_temporal_svc_encoder.c
+++ b/examples/vpx_temporal_svc_encoder.c
@@ -930,6 +930,7 @@ int main(int argc, char **argv) {
           // Update for short-time encoding bitrate states, for moving window
           // of size rc->window, shifted by rc->window / 2.
           // Ignore first window segment, due to key frame.
+          if (rc.window_size == 0) rc.window_size = 15;
           if (frame_cnt > rc.window_size) {
             sum_bitrate += 0.001 * 8.0 * pkt->data.frame.sz * framerate;
             if (frame_cnt % rc.window_size == 0) {

From 557368a8fa9f839afd7a6ded6a95f18829ff3365 Mon Sep 17 00:00:00 2001
From: Cheng Chen
Date: Mon, 1 Feb 2021 16:57:09 -0800
Subject: [PATCH 043/926] L2E: let external rate control pass in a max frame size

And allow the frame to be recoded when the frame size is larger than
the input max frame size. If the max frame size is not specified, let
vp9 decide whether to recode. The recode follows vp9's current recoding
mechanism.

The rate control api will return the new qindex back to the external
model.

Change-Id: I796fbf713ad50a5b413b0e2501583b565ed2343f
---
 test/vp9_ext_ratectrl_test.cc  |  6 ++++++
 vp9/encoder/vp9_encoder.c      | 35 +++++++++++++++++++++++++++++++---
 vp9/encoder/vp9_ext_ratectrl.c |  3 ++-
 vp9/encoder/vp9_ext_ratectrl.h |  2 +-
 vpx/vpx_ext_ratectrl.h         |  9 ++++++++-
 5 files changed, 49 insertions(+), 6 deletions(-)

diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc
index 4b3693a347..b6b5b2eaec 100644
--- a/test/vp9_ext_ratectrl_test.cc
+++ b/test/vp9_ext_ratectrl_test.cc
@@ -128,6 +128,7 @@ vpx_rc_status_t rc_get_encodeframe_decision(
   } else {
     frame_decision->q_index = 100;
   }
+  frame_decision->max_frame_size = 0;
   return VPX_RC_OK;
 }

@@ -143,6 +144,11 @@ vpx_rc_status_t rc_update_encodeframe_result(
   if (toy_rate_ctrl->coding_index == kLosslessCodingIndex) {
     EXPECT_EQ(encode_frame_result->sse, 0);
   }
+  if (toy_rate_ctrl->coding_index == kLosslessCodingIndex) {
+    EXPECT_EQ(encode_frame_result->actual_encoding_qindex, 0);
+  } else {
+    EXPECT_EQ(encode_frame_result->actual_encoding_qindex, 100);
+  }
   return VPX_RC_OK;
 }

diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 6a21a1c183..2757bc4285 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -4402,6 +4402,17 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
   int qrange_adj = 1;
 #endif

+  // A flag which indicates whether we are recoding the current frame
+  // when the current frame size is larger than the max frame size in the
+  // external rate control model.
+  // This flag doesn't have any impact when external rate control is not used.
+  int ext_rc_recode = 0;
+  // Maximal frame size allowed by the external rate control.
+  // case: 0, we ignore the max frame size limit, and encode with the qindex
+  // passed in by the external rate control model.
+  // case: -1, we take VP9's decision for the max frame size.
+ int ext_rc_max_frame_size = 0; + #if CONFIG_RATE_CTRL const FRAME_UPDATE_TYPE update_type = cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index]; @@ -4507,7 +4518,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest q = cpi->encode_command.external_quantize_index; } #endif - if (cpi->ext_ratectrl.ready) { + if (cpi->ext_ratectrl.ready && !ext_rc_recode) { vpx_codec_err_t codec_status; const GF_GROUP *gf_group = &cpi->twopass.gf_group; vpx_rc_encodeframe_decision_t encode_frame_decision; @@ -4526,6 +4537,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest "vp9_extrc_get_encodeframe_decision() failed"); } q = encode_frame_decision.q_index; + ext_rc_max_frame_size = encode_frame_decision.max_frame_size; } vp9_set_quantizer(cpi, q); @@ -4567,7 +4579,24 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest } if (cpi->ext_ratectrl.ready) { - break; + // In general, for the external rate control, we take the qindex provided + // as input and encode the frame with this qindex faithfully. However, + // in some extreme scenarios, the provided qindex leads to a massive + // overshoot of frame size. In this case, we fall back to VP9's decision + // to pick a new qindex and recode the frame. We return the new qindex + // through the API to the external model. + if (ext_rc_max_frame_size == 0) { + break; + } else if (ext_rc_max_frame_size == -1) { + if (rc->projected_frame_size < rc->max_frame_bandwidth) { + break; + } + } else { + if (rc->projected_frame_size < ext_rc_max_frame_size) { + break; + } + } + ext_rc_recode = 1; } #if CONFIG_RATE_CTRL // This part needs to be after save_coding_context() because @@ -5501,7 +5530,7 @@ static void encode_frame_to_data_rate( get_ref_cnt_buffer(cm, cm->new_fb_idx); vpx_codec_err_t codec_status = vp9_extrc_update_encodeframe_result( &cpi->ext_ratectrl, (*size) << 3, cpi->Source, &coded_frame_buf->buf, - cm->bit_depth, cpi->oxcf.input_bit_depth); + cm->bit_depth, cpi->oxcf.input_bit_depth, cm->base_qindex); if (codec_status != VPX_CODEC_OK) { vpx_internal_error(&cm->error, codec_status, "vp9_extrc_update_encodeframe_result() failed"); diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index a27eb653ba..9f0098ab5a 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -168,7 +168,7 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result( EXT_RATECTRL *ext_ratectrl, int64_t bit_count, const YV12_BUFFER_CONFIG *source_frame, const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth, - uint32_t input_bit_depth) { + uint32_t input_bit_depth, const int actual_encoding_qindex) { if (ext_ratectrl == NULL) { return VPX_CODEC_INVALID_PARAM; } @@ -180,6 +180,7 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result( encode_frame_result.pixel_count = source_frame->y_crop_width * source_frame->y_crop_height + 2 * source_frame->uv_crop_width * source_frame->uv_crop_height; + encode_frame_result.actual_encoding_qindex = actual_encoding_qindex; #if CONFIG_VP9_HIGHBITDEPTH vpx_calc_highbd_psnr(source_frame, coded_frame, &psnr, bit_depth, input_bit_depth); diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index 11e9102a65..2142363085 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -43,6 +43,6 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result( EXT_RATECTRL *ext_ratectrl, int64_t bit_count, const YV12_BUFFER_CONFIG *source_frame, const YV12_BUFFER_CONFIG 
*coded_frame, uint32_t bit_depth,
-    uint32_t input_bit_depth);
+    uint32_t input_bit_depth, int actual_encoding_qindex);

 #endif  // VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_
diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h
index dc4d856a8b..a193e55953 100644
--- a/vpx/vpx_ext_ratectrl.h
+++ b/vpx/vpx_ext_ratectrl.h
@@ -38,9 +38,15 @@ typedef void *vpx_rc_model_t;
  *
  * The encoder will receive the decision from the external rate control model
  * through get_encodeframe_decision() defined in vpx_rc_funcs_t.
+ *
+ * If max_frame_size = 0, the encoding ignores the max frame size limit.
+ * If max_frame_size = -1, the encoding uses VP9's max frame size as the limit.
+ * If the encoded frame size is larger than max_frame_size, the frame is
+ * recoded to meet the size limit, following VP9's recoding principles.
 */
typedef struct vpx_rc_encodeframe_decision {
-  int q_index; /**< Quantizer step index [0..255]*/
+  int q_index;        /**< Quantizer step index [0..255]*/
+  int max_frame_size; /**< Maximal frame size allowed to encode a frame*/
} vpx_rc_encodeframe_decision_t;

/*!\brief Information for the frame to be encoded.
@@ -82,6 +88,7 @@ typedef struct vpx_rc_encodeframe_result {
  int64_t sse; /**< sum of squared error of the reconstructed frame */
  int64_t bit_count; /**< number of bits spent on coding the frame*/
  int64_t pixel_count; /**< number of pixels in YUV planes of the frame*/
+  int actual_encoding_qindex; /**< the actual qindex used to encode the frame*/
} vpx_rc_encodeframe_result_t;

/*!\brief Status returned by rate control callback functions.

From b3506b33076b082ab5baf53deabac185cb255a38 Mon Sep 17 00:00:00 2001
From: James Zern
Date: Tue, 2 Feb 2021 18:14:44 -0800
Subject: [PATCH 044/926] vp8_denoiser_sse2_test: use ASSERT instead of EXPECT

when testing block contents to avoid producing unnecessary output on
failure.

Bug: webm:1718
Change-Id: Ie2cf8245ec8c03556549ad1eea65c8bef15a9735
---
 test/vp8_denoiser_sse2_test.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/vp8_denoiser_sse2_test.cc b/test/vp8_denoiser_sse2_test.cc
index 0197f143f3..ae547f007f 100644
--- a/test/vp8_denoiser_sse2_test.cc
+++ b/test/vp8_denoiser_sse2_test.cc
@@ -87,7 +87,7 @@ TEST_P(VP8DenoiserTest, BitexactCheck) {
   // Check bitexactness.
   for (int h = 0; h < 16; ++h) {
     for (int w = 0; w < 16; ++w) {
-      EXPECT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]);
+      ASSERT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]);
     }
   }

@@ -103,7 +103,7 @@ TEST_P(VP8DenoiserTest, BitexactCheck) {
   // Check bitexactness.
for (int h = 0; h < 16; ++h) { for (int w = 0; w < 16; ++w) { - EXPECT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]); + ASSERT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]); } } } From 158aa20c950bd905252041b7047c72d3aca71766 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Sun, 31 Jan 2021 22:11:33 -0800 Subject: [PATCH 045/926] svc: Unittest for ksvc flexible mode with no updates on TL > 0 Catches tsan issue fixed in: 7b93b56 Change-Id: I34b17c289afd0f8691987a1e4afa533f6c7f2806 --- test/svc_datarate_test.cc | 72 +++++++++++++++++++++++++++++++++++---- 1 file changed, 65 insertions(+), 7 deletions(-) diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc index 0a7d0032c1..3af2255963 100644 --- a/test/svc_datarate_test.cc +++ b/test/svc_datarate_test.cc @@ -84,6 +84,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { prev_frame_width[i] = 320; prev_frame_height[i] = 240; } + ksvc_flex_noupd_tlenh_ = false; } virtual void BeginPassHook(unsigned int /*pass*/) {} @@ -91,9 +92,10 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { // bypass/flexible mode. The pattern corresponds to the pattern // VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in // non-flexible mode, except that we disable inter-layer prediction. - void set_frame_flags_bypass_mode( - int tl, int num_spatial_layers, int is_key_frame, - vpx_svc_ref_frame_config_t *ref_frame_config) { + void set_frame_flags_bypass_mode(int tl, int num_spatial_layers, + int is_key_frame, + vpx_svc_ref_frame_config_t *ref_frame_config, + int noupdate_tlenh) { for (int sl = 0; sl < num_spatial_layers; ++sl) ref_frame_config->update_buffer_slot[sl] = 0; @@ -154,6 +156,9 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { ref_frame_config->update_buffer_slot[sl] |= 1 << ref_frame_config->alt_fb_idx[sl]; } + // Force no update on all spatial layers for temporal enhancement layer + // frames. 
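+      // (Editor's note: update_buffer_slot[sl] is a bitmask over the eight
+      // reference buffers, so clearing it makes spatial layer sl of this
+      // superframe a non-reference frame that nothing later predicts from.)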
+ if (noupdate_tlenh) ref_frame_config->update_buffer_slot[sl] = 0; } } } @@ -244,6 +249,22 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { } } + if (ksvc_flex_noupd_tlenh_) { + vpx_svc_layer_id_t layer_id; + layer_id.spatial_layer_id = 0; + layer_id.temporal_layer_id = (video->frame() % 2 != 0); + temporal_layer_id_ = layer_id.temporal_layer_id; + for (int i = 0; i < number_spatial_layers_; i++) { + layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_; + ref_frame_config.duration[i] = 1; + } + encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); + set_frame_flags_bypass_mode(layer_id.temporal_layer_id, + number_spatial_layers_, 0, &ref_frame_config, + 1); + encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config); + } + if (update_pattern_ && video->frame() >= 100) { vpx_svc_layer_id_t layer_id; if (video->frame() == 100) { @@ -258,7 +279,8 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_; encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); set_frame_flags_bypass_mode(layer_id.temporal_layer_id, - number_spatial_layers_, 0, &ref_frame_config); + number_spatial_layers_, 0, &ref_frame_config, + 0); encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config); } @@ -557,9 +579,14 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { } virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) { - double mismatch_psnr = compute_psnr(img1, img2); - mismatch_psnr_ += mismatch_psnr; - ++mismatch_nframes_; + // TODO(marpan): Look into why an assert is triggered in compute_psnr + // for mismatch frames for the special test case: ksvc_flex_noupd_tlenh. + // Has to do with dropped frames in bypass/flexible svc mode. + if (!ksvc_flex_noupd_tlenh_) { + double mismatch_psnr = compute_psnr(img1, img2); + mismatch_psnr_ += mismatch_psnr; + ++mismatch_nframes_; + } } unsigned int GetMismatchFrames() { return mismatch_nframes_; } @@ -604,6 +631,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { int num_resize_down_; unsigned int prev_frame_width[VPX_MAX_LAYERS]; unsigned int prev_frame_height[VPX_MAX_LAYERS]; + bool ksvc_flex_noupd_tlenh_; private: virtual void SetConfig(const int num_temporal_layer) { @@ -1106,6 +1134,36 @@ TEST_P(DatarateOnePassCbrSvcFrameDropMultiBR, OnePassCbrSvc3SL3TL4Threads) { #endif } +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and +// 2 temporal layers, for KSVC in flexible mode with no update of reference +// frames for all spatial layers on TL > 0 superframes. +// Run HD clip with 4 threads. +TEST_P(DatarateOnePassCbrSvcFrameDropMultiBR, OnePassCbrSvc3SL2TL4ThKSVCFlex) { + SetSvcConfig(3, 2); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 4; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + top_sl_width_ = 1280; + top_sl_height_ = 720; + layer_framedrop_ = 0; + const int bitrates[3] = { 200, 400, 600 }; + cfg_.rc_target_bitrate = bitrates[GET_PARAM(3)]; + ResetModel(); + layer_framedrop_ = GET_PARAM(2); + AssignLayerBitrates(); + ksvc_flex_noupd_tlenh_ = true; + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.58, + 1.2); +} + // Params: speed setting, inter-layer prediction mode. 
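// (Editor's note: the "Params" comments in this file document the order of
// the ::testing::TestWithParam tuple that GET_PARAM() indexes, e.g.
// GET_PARAM(2) and GET_PARAM(3) in the KSVC test above.)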
class DatarateOnePassCbrSvcInterLayerPredSingleBR : public DatarateOnePassCbrSvc, From 0d8354669ac525a27c78bc8c761e98e0f8c3905c Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Wed, 3 Feb 2021 22:09:24 -0800 Subject: [PATCH 046/926] svc: Fix an existing unittest for flexible mode The flag update_pattern_ was being set to 0 (because it was set before reset) instead of 1. And the example flexible mode pattern was not setting non-reference frame on top temporal top spatial. Change-Id: I8aee56ce13cc4e0d614126592f9d0f691fe527b0 --- test/svc_datarate_test.cc | 12 +++++++----- vp9/encoder/vp9_svc_layercontext.c | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc index 3af2255963..95d82ce54e 100644 --- a/test/svc_datarate_test.cc +++ b/test/svc_datarate_test.cc @@ -153,8 +153,8 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { ref_frame_config->reference_last[sl] = 1; ref_frame_config->reference_golden[sl] = 0; ref_frame_config->reference_alt_ref[sl] = 0; - ref_frame_config->update_buffer_slot[sl] |= - 1 << ref_frame_config->alt_fb_idx[sl]; + // Non reference frame on top temporal top spatial. + ref_frame_config->update_buffer_slot[sl] = 0; } // Force no update on all spatial layers for temporal enhancement layer // frames. @@ -275,8 +275,10 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { layer_id.spatial_layer_id = 0; layer_id.temporal_layer_id = (video->frame() % 2 != 0); temporal_layer_id_ = layer_id.temporal_layer_id; - for (int i = 0; i < number_spatial_layers_; i++) + for (int i = 0; i < number_spatial_layers_; i++) { layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_; + ref_frame_config.duration[i] = 1; + } encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); set_frame_flags_bypass_mode(layer_id.temporal_layer_id, number_spatial_layers_, 0, &ref_frame_config, @@ -750,14 +752,14 @@ TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL2TLDynamicPatternChange) { cfg_.g_threads = 1; cfg_.rc_dropframe_thresh = 30; cfg_.kf_max_dist = 9999; - // Change SVC pattern on the fly. - update_pattern_ = 1; ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, 0, 400); top_sl_width_ = 640; top_sl_height_ = 480; cfg_.rc_target_bitrate = 800; ResetModel(); + // Change SVC pattern on the fly. + update_pattern_ = 1; AssignLayerBitrates(); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index b6c7c74e17..f9a0de62a0 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -956,7 +956,7 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { if (cpi->common.frame_type != KEY_FRAME && !cpi->ext_refresh_last_frame && !cpi->ext_refresh_golden_frame && !cpi->ext_refresh_alt_ref_frame) svc->non_reference_frame = 1; - // For non-flexible mode, where update_buffer_slot is used, need to check if + // For flexible mode, where update_buffer_slot is used, need to check if // all buffer slots are not refreshed. 
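   // (Editor's note: VP9E_TEMPORAL_LAYERING_MODE_BYPASS below is the
   // flexible mode, in which the application drives reference refreshes
   // through update_buffer_slot; hence the corrected wording above.)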
if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { if (svc->update_buffer_slot[svc->spatial_layer_id] != 0) From 24bd0733efad6ee63eda3c49ecb730e316eb2483 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 5 Feb 2021 10:21:39 -0800 Subject: [PATCH 047/926] vp8_denoiser_sse2_test: disable BitexactCheck w/gcc-8+ this test fails under gcc 8-10, but not with other compilers Bug: webm:1718 Change-Id: I8c6c7a25c4aaf019a7f91f835a1a2c9a731cfadc --- test/vp8_denoiser_sse2_test.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/vp8_denoiser_sse2_test.cc b/test/vp8_denoiser_sse2_test.cc index ae547f007f..8cb84ddd8e 100644 --- a/test/vp8_denoiser_sse2_test.cc +++ b/test/vp8_denoiser_sse2_test.cc @@ -40,7 +40,12 @@ class VP8DenoiserTest : public ::testing::TestWithParam { int increase_denoising_; }; +// TODO(https://crbug.com/webm/1718): This test fails with gcc 8-10. +#if defined(__GNUC__) && __GNUC__ >= 8 +TEST_P(VP8DenoiserTest, DISABLED_BitexactCheck) { +#else TEST_P(VP8DenoiserTest, BitexactCheck) { +#endif ACMRandom rnd(ACMRandom::DeterministicSeed()); const int count_test_block = 4000; const int stride = 16; From 02392eecccde436a76aca6c86a6fdf643e98eb38 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 17 Feb 2021 17:34:27 -0800 Subject: [PATCH 048/926] Remove two pass related code from svc sample encoder. SVC sample encoder is only supposed to be used for realtime SVC. Bug: webm:1705 Change-Id: I5c0c3491732db3e148073aaf7f90ee8d662b57b5 --- examples/vp9_spatial_svc_encoder.c | 69 +++--------------------------- 1 file changed, 6 insertions(+), 63 deletions(-) diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c index 6305572979..c37e608d17 100644 --- a/examples/vp9_spatial_svc_encoder.c +++ b/examples/vp9_spatial_svc_encoder.c @@ -66,12 +66,6 @@ static const arg_def_t kf_dist_arg = ARG_DEF("k", "kf-dist", 1, "number of frames between keyframes"); static const arg_def_t scale_factors_arg = ARG_DEF("r", "scale-factors", 1, "scale factors (lowest to highest layer)"); -static const arg_def_t passes_arg = - ARG_DEF("p", "passes", 1, "Number of passes (1/2)"); -static const arg_def_t pass_arg = - ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2)"); -static const arg_def_t fpf_name_arg = - ARG_DEF(NULL, "fpf", 1, "First pass statistics file name"); static const arg_def_t min_q_arg = ARG_DEF(NULL, "min-q", 1, "Minimum quantizer"); static const arg_def_t max_q_arg = @@ -125,9 +119,6 @@ static const arg_def_t *svc_args[] = { &frames_arg, &spatial_layers_arg, &kf_dist_arg, &scale_factors_arg, - &passes_arg, - &pass_arg, - &fpf_name_arg, &min_q_arg, &max_q_arg, &min_bitrate_arg, @@ -173,8 +164,6 @@ typedef struct { uint32_t frames_to_skip; struct VpxInputContext input_ctx; stats_io_t rc_stats; - int passes; - int pass; int tune_content; int inter_layer_pred; } AppInput; @@ -197,9 +186,6 @@ static void parse_command_line(int argc, const char **argv_, char **argi = NULL; char **argj = NULL; vpx_codec_err_t res; - int passes = 0; - int pass = 0; - const char *fpf_file_name = NULL; unsigned int min_bitrate = 0; unsigned int max_bitrate = 0; char string_options[1024] = { 0 }; @@ -289,18 +275,6 @@ static void parse_command_line(int argc, const char **argv_, sizeof(string_options) - strlen(string_options) - 1); strncat(string_options, arg.val, sizeof(string_options) - strlen(string_options) - 1); - } else if (arg_match(&arg, &passes_arg, argi)) { - passes = arg_parse_uint(&arg); - if (passes < 1 || passes > 2) { - die("Error: Invalid number of 
passes (%d)\n", passes); - } - } else if (arg_match(&arg, &pass_arg, argi)) { - pass = arg_parse_uint(&arg); - if (pass < 1 || pass > 2) { - die("Error: Invalid pass selected (%d)\n", pass); - } - } else if (arg_match(&arg, &fpf_name_arg, argi)) { - fpf_file_name = arg.val; } else if (arg_match(&arg, &min_q_arg, argi)) { strncat(string_options, " min-quantizers=", sizeof(string_options) - strlen(string_options) - 1); @@ -355,35 +329,7 @@ static void parse_command_line(int argc, const char **argv_, if (strlen(string_options) > 0) vpx_svc_set_options(svc_ctx, string_options + 1); - if (passes == 0 || passes == 1) { - if (pass) { - fprintf(stderr, "pass is ignored since there's only one pass\n"); - } - enc_cfg->g_pass = VPX_RC_ONE_PASS; - } else { - if (pass == 0) { - die("pass must be specified when passes is 2\n"); - } - - if (fpf_file_name == NULL) { - die("fpf must be specified when passes is 2\n"); - } - - if (pass == 1) { - enc_cfg->g_pass = VPX_RC_FIRST_PASS; - if (!stats_open_file(&app_input->rc_stats, fpf_file_name, 0)) { - fatal("Failed to open statistics store"); - } - } else { - enc_cfg->g_pass = VPX_RC_LAST_PASS; - if (!stats_open_file(&app_input->rc_stats, fpf_file_name, 1)) { - fatal("Failed to open statistics store"); - } - enc_cfg->rc_twopass_stats_in = stats_get(&app_input->rc_stats); - } - app_input->passes = passes; - app_input->pass = pass; - } + enc_cfg->g_pass = VPX_RC_ONE_PASS; if (enc_cfg->rc_target_bitrate > 0) { if (min_bitrate > 0) { @@ -1004,13 +950,11 @@ int main(int argc, const char **argv) { info.time_base.numerator = enc_cfg.g_timebase.num; info.time_base.denominator = enc_cfg.g_timebase.den; - if (!(app_input.passes == 2 && app_input.pass == 1)) { - // We don't save the bitstream for the 1st pass on two pass rate control - writer = - vpx_video_writer_open(app_input.output_filename, kContainerIVF, &info); - if (!writer) - die("Failed to open %s for writing\n", app_input.output_filename); - } + writer = + vpx_video_writer_open(app_input.output_filename, kContainerIVF, &info); + if (!writer) + die("Failed to open %s for writing\n", app_input.output_filename); + #if OUTPUT_RC_STATS // Write out spatial layer stream. // TODO(marpan/jianj): allow for writing each spatial and temporal stream. @@ -1230,7 +1174,6 @@ int main(int argc, const char **argv) { #endif if (vpx_codec_destroy(&encoder)) die_codec(&encoder, "Failed to destroy codec"); - if (app_input.passes == 2) stats_close(&app_input.rc_stats, 1); if (writer) { vpx_video_writer_close(writer); } From ebefb90b75f07ea5ab06d6b2a5ea5355c843d266 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Fri, 26 Feb 2021 18:02:24 -0800 Subject: [PATCH 049/926] Remove comments for removed 'active_map' parameter Change-Id: I8635f6121e13089c25e201df033d5bc68e2862b4 --- vp9/encoder/vp9_lookahead.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vp9/encoder/vp9_lookahead.h b/vp9/encoder/vp9_lookahead.h index dbbe3af584..6ac6736673 100644 --- a/vp9/encoder/vp9_lookahead.h +++ b/vp9/encoder/vp9_lookahead.h @@ -82,15 +82,11 @@ int vp9_lookahead_next_show_idx(const struct lookahead_ctx *ctx); * This function will copy the source image into a new framebuffer with * the expected stride/border. * - * If active_map is non-NULL and there is only one frame in the queue, then copy - * only active macroblocks. 
- * * \param[in] ctx Pointer to the lookahead context * \param[in] src Pointer to the image to enqueue * \param[in] ts_start Timestamp for the start of this frame * \param[in] ts_end Timestamp for the end of this frame * \param[in] flags Flags set on this frame - * \param[in] active_map Map that specifies which macroblock is active */ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, int64_t ts_start, int64_t ts_end, int use_highbitdepth, From d0567bd779febe995020668cc7f6c1193e3e41d6 Mon Sep 17 00:00:00 2001 From: Paul Wilkins Date: Wed, 3 Mar 2021 16:45:42 +0000 Subject: [PATCH 050/926] Add fields into RC for Vizier ML experiments. This patch adds fields into the RC data structure for the Vizier. The added fields allow control of some extra rate control parameters and rate distortion. This patch also adds functions to initialize the various parameters though many are not yet used / wired in and for now all are set to default values. Ultimately many will be set through new command line options. Change-Id: I41591bb627d3837d2104fb363845adedbddf2e02 --- vp9/encoder/vp9_encoder.c | 1 + vp9/encoder/vp9_encoder.h | 1 + vp9/encoder/vp9_ext_ratectrl.h | 2 +- vp9/encoder/vp9_firstpass.c | 164 ++++++++++++++++++++++++++++----- vp9/encoder/vp9_ratectrl.h | 13 +++ vp9/encoder/vp9_rd.c | 103 +++++++++++++++++---- vp9/encoder/vp9_rd.h | 14 +++ 7 files changed, 257 insertions(+), 41 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 2757bc4285..ecd15cb011 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2317,6 +2317,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, cpi->frame_info = vp9_get_frame_info(oxcf); vp9_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc); + vp9_init_rd_parameters(cpi); init_frame_indexes(cm); cpi->partition_search_skippable_frame = 0; diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 8763a5e789..12520fb82a 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -746,6 +746,7 @@ typedef struct VP9_COMP { // Ambient reconstruction err target for force key frames int64_t ambient_err; + RD_CONTROL rd_ctrl; RD_OPT rd; CODING_CONTEXT coding_context; diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index 2142363085..74fd68b96d 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -43,6 +43,6 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result( EXT_RATECTRL *ext_ratectrl, int64_t bit_count, const YV12_BUFFER_CONFIG *source_frame, const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth, - uint32_t input_bit_depth, int actual_encoding_qindex); + uint32_t input_bit_depth, const int actual_encoding_qindex); #endif // VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_ diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 2a9cf52898..fe7abef07e 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -54,6 +54,31 @@ #define NCOUNT_INTRA_THRESH 8192 #define NCOUNT_INTRA_FACTOR 3 +#define SR_DIFF_PART 0.0015 +#define INTRA_PART 0.005 +#define DEFAULT_DECAY_LIMIT 0.75 +#define LOW_SR_DIFF_TRHESH 0.1 +#define SR_DIFF_MAX 128.0 +#define LOW_CODED_ERR_PER_MB 10.0 +#define NCOUNT_FRAME_II_THRESH 6.0 +#define BASELINE_ERR_PER_MB 12500.0 +#define GF_MAX_FRAME_BOOST 96.0 + +#ifdef AGGRESSIVE_VBR +#define KF_MAX_FRAME_BOOST 80.0 +#define MAX_KF_TOT_BOOST 4800 +#else +#define KF_MAX_FRAME_BOOST 96.0 +#define MAX_KF_TOT_BOOST 5400 +#endif + +#define ZM_POWER_FACTOR 0.75 +#define 
MINQ_ADJ_LIMIT 48 +#define MINQ_ADJ_LIMIT_CQ 20 +#define HIGH_UNDERSHOOT_RATIO 2 +#define AV_WQ_FACTOR 4.0 +#define DEF_EPMB_LOW 2000.0 + #define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x)-0.000001 : (x) + 0.000001) #if ARF_STATS_OUTPUT @@ -1807,14 +1832,6 @@ void vp9_init_second_pass(VP9_COMP *cpi) { twopass->arnr_strength_adjustment = 0; } -#define SR_DIFF_PART 0.0015 -#define INTRA_PART 0.005 -#define DEFAULT_DECAY_LIMIT 0.75 -#define LOW_SR_DIFF_TRHESH 0.1 -#define SR_DIFF_MAX 128.0 -#define LOW_CODED_ERR_PER_MB 10.0 -#define NCOUNT_FRAME_II_THRESH 6.0 - static double get_sr_decay_rate(const FRAME_INFO *frame_info, const FIRSTPASS_STATS *frame) { double sr_diff = (frame->sr_coded_error - frame->coded_error); @@ -1853,8 +1870,6 @@ static double get_zero_motion_factor(const FRAME_INFO *frame_info, return VPXMIN(sr_decay, zero_motion_pct); } -#define ZM_POWER_FACTOR 0.75 - static double get_prediction_decay_rate(const FRAME_INFO *frame_info, const FIRSTPASS_STATS *frame_stats) { const double sr_decay_rate = get_sr_decay_rate(frame_info, frame_stats); @@ -1942,8 +1957,6 @@ static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats, } } -#define BASELINE_ERR_PER_MB 12500.0 -#define GF_MAX_BOOST 96.0 static double calc_frame_boost(const FRAME_INFO *frame_info, const FIRSTPASS_STATS *this_frame, int avg_frame_qindex, @@ -1965,7 +1978,7 @@ static double calc_frame_boost(const FRAME_INFO *frame_info, // Q correction and scalling frame_boost = frame_boost * boost_q_correction; - return VPXMIN(frame_boost, GF_MAX_BOOST * boost_q_correction); + return VPXMIN(frame_boost, GF_MAX_FRAME_BOOST * boost_q_correction); } static double kf_err_per_mb(VP9_COMP *cpi) { @@ -3159,14 +3172,6 @@ static int test_candidate_kf(const FIRST_PASS_INFO *first_pass_info, #define MIN_SCAN_FRAMES_FOR_KF_BOOST 32 #define KF_ABS_ZOOM_THRESH 6.0 -#ifdef AGGRESSIVE_VBR -#define KF_MAX_FRAME_BOOST 80.0 -#define MAX_KF_TOT_BOOST 4800 -#else -#define KF_MAX_FRAME_BOOST 96.0 -#define MAX_KF_TOT_BOOST 5400 -#endif - int vp9_get_frames_to_next_key(const VP9EncoderConfig *oxcf, const FRAME_INFO *frame_info, const FIRST_PASS_INFO *first_pass_info, @@ -3470,6 +3475,113 @@ static int is_skippable_frame(const VP9_COMP *cpi) { twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1); } +// Configure image size specific vizier parameters. +// Later these will be set via additional command line options +static void init_vizier_params(RATE_CONTROL *const rc, int screen_area) { + if (1) { + // Force defaults for now + rc->active_wq_factor = AV_WQ_FACTOR; + rc->base_err_per_mb = BASELINE_ERR_PER_MB; + rc->sr_default_decay_limit = DEFAULT_DECAY_LIMIT; + rc->sr_diff_part = SR_DIFF_PART; + rc->gf_frame_max_boost = GF_MAX_FRAME_BOOST; + rc->gf_max_total_boost = MAX_GF_BOOST; + rc->kf_err_per_mb = DEF_EPMB_LOW; + rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; // Max for first kf. + rc->kf_frame_max_boost_subs = KF_MAX_FRAME_BOOST / 2; // Max for other kfs. + rc->kf_max_total_boost = MAX_KF_TOT_BOOST; + rc->zm_power_factor = ZM_POWER_FACTOR; + } else { + // Vizer experimental parameters from training. + // Later these will be set via the command line. 
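+    // (Editor's note: the buckets below step through common sizes, QCIF,
+    // QVGA, 360p, 480p, 720p and 1080p, with anything larger falling
+    // through to the defaults; while the "if (1)" above is in place this
+    // branch is dead code.)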
+ if (screen_area <= 176 * 144) { + rc->active_wq_factor = 46.0; + rc->base_err_per_mb = 37597.399760969536; + rc->sr_default_decay_limit = 0.3905639800962774; + rc->sr_diff_part = 0.009599023654146284; + rc->gf_frame_max_boost = 87.27362648627846; + rc->gf_max_total_boost = MAX_GF_BOOST; + rc->kf_err_per_mb = 1854.8255436877148; + rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + rc->kf_frame_max_boost_subs = rc->kf_frame_max_boost_first / 2; + rc->kf_max_total_boost = MAX_KF_TOT_BOOST; + rc->zm_power_factor = 2.93715229184991; + } else if (screen_area <= 320 * 240) { + rc->active_wq_factor = 55.0; + rc->base_err_per_mb = 34525.33177195309; + rc->sr_default_decay_limit = 0.23901360046804604; + rc->sr_diff_part = 0.008581014394766773; + rc->gf_frame_max_boost = 127.34978204980285; + rc->gf_max_total_boost = MAX_GF_BOOST; + rc->kf_err_per_mb = 723.8337508755031; + rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + rc->kf_frame_max_boost_subs = rc->kf_frame_max_boost_first / 2; + rc->kf_max_total_boost = MAX_KF_TOT_BOOST; + rc->zm_power_factor = 3.5299221493593413; + } else if (screen_area <= 640 * 360) { + rc->active_wq_factor = 12.5; + rc->base_err_per_mb = 18823.978018028298; + rc->sr_default_decay_limit = 0.6043527690301296; + rc->sr_diff_part = 0.00343296783885544; + rc->gf_frame_max_boost = 75.17672317013668; + rc->gf_max_total_boost = MAX_GF_BOOST; + rc->kf_err_per_mb = 422.2871502380377; + rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + rc->kf_frame_max_boost_subs = rc->kf_frame_max_boost_first / 2; + rc->kf_max_total_boost = MAX_KF_TOT_BOOST; + rc->zm_power_factor = 2.265742666649307; + } else if (screen_area <= 854 * 480) { + rc->active_wq_factor = 51.5; + rc->base_err_per_mb = 33718.98307662595; + rc->sr_default_decay_limit = 0.33633414970713393; + rc->sr_diff_part = 0.00868988716928333; + rc->gf_frame_max_boost = 85.2868528581522; + rc->gf_max_total_boost = MAX_GF_BOOST; + rc->kf_err_per_mb = 1513.4883914008383; + rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + rc->kf_frame_max_boost_subs = rc->kf_frame_max_boost_first / 2; + rc->kf_max_total_boost = MAX_KF_TOT_BOOST; + rc->zm_power_factor = 3.552278528517416; + } else if (screen_area <= 1280 * 720) { + rc->active_wq_factor = 41.5; + rc->base_err_per_mb = 29527.46375825401; + rc->sr_default_decay_limit = 0.5009117586299728; + rc->sr_diff_part = 0.005007364627260114; + rc->gf_frame_max_boost = GF_MAX_FRAME_BOOST; + rc->gf_max_total_boost = MAX_GF_BOOST; + rc->kf_err_per_mb = 998.6342911785146; + rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + rc->kf_frame_max_boost_subs = rc->kf_frame_max_boost_first / 2; + rc->kf_max_total_boost = MAX_KF_TOT_BOOST; + rc->zm_power_factor = 2.568627575572356; + } else if (screen_area <= 1920 * 1080) { + rc->active_wq_factor = 31.0; + rc->base_err_per_mb = 34474.723463367416; + rc->sr_default_decay_limit = 0.23346886902707745; + rc->sr_diff_part = 0.011431716637966029; + rc->gf_frame_max_boost = 81.00472969483079; + rc->gf_max_total_boost = MAX_GF_BOOST; + rc->kf_err_per_mb = 35931.25734431429; + rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + rc->kf_frame_max_boost_subs = rc->kf_frame_max_boost_first / 2; + rc->kf_max_total_boost = MAX_KF_TOT_BOOST; + rc->zm_power_factor = 5.5776463538431935; + } else { + rc->active_wq_factor = AV_WQ_FACTOR; + rc->base_err_per_mb = BASELINE_ERR_PER_MB; + rc->sr_default_decay_limit = DEFAULT_DECAY_LIMIT; + rc->sr_diff_part = SR_DIFF_PART; + rc->gf_frame_max_boost = GF_MAX_FRAME_BOOST; + rc->gf_max_total_boost = MAX_GF_BOOST; + 
rc->kf_err_per_mb = DEF_EPMB_LOW;
+      rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST;
+      rc->kf_frame_max_boost_subs = rc->kf_frame_max_boost_first / 2;
+      rc->kf_max_total_boost = MAX_KF_TOT_BOOST;
+      rc->zm_power_factor = ZM_POWER_FACTOR;
+    }
+  }
+}
+
 void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
@@ -3480,6 +3592,13 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {

   if (!twopass->stats_in) return;

+  // Configure image size specific Vizier parameters.
+  if (cm->current_video_frame == 0) {
+    unsigned int screen_area = (cm->width * cm->height);
+
+    init_vizier_params(rc, screen_area);
+  }
+
   // If this is an arf frame then we dont want to read the stats file or
   // advance the input pointer as we already have what we need.
   if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
@@ -3605,9 +3724,6 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
   subtract_stats(&twopass->total_left_stats, &this_frame);
 }

-#define MINQ_ADJ_LIMIT 48
-#define MINQ_ADJ_LIMIT_CQ 20
-#define HIGH_UNDERSHOOT_RATIO 2
 void vp9_twopass_postencode_update(VP9_COMP *cpi) {
   TWO_PASS *const twopass = &cpi->twopass;
   RATE_CONTROL *const rc = &cpi->rc;
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index 0120f90a01..7437d309e0 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -204,6 +204,19 @@ typedef struct {
   int preserve_arf_as_gld;
   int preserve_next_arf_as_gld;
   int show_arf_as_gld;
+
+  // Vizier project experimental rate control parameters.
+  double active_wq_factor;
+  double base_err_per_mb;
+  double sr_default_decay_limit;
+  double sr_diff_part;
+  double kf_frame_max_boost_first;  // Max for first kf in a chunk.
+  double kf_frame_max_boost_subs;   // Max for subsequent mid chunk kfs.
+  double kf_max_total_boost;
+  double kf_err_per_mb;
+  double gf_frame_max_boost;
+  double gf_max_total_boost;
+  double zm_power_factor;
 } RATE_CONTROL;

 struct VP9_COMP;
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index 34c74424ce..b126d8708d 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -197,28 +197,99 @@ static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12,
 static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128,
                                                               128, 144, 144 };

+// Configure Vizier RD parameters.
+// Later this function will use passed-in command line values.
+void vp9_init_rd_parameters(VP9_COMP *cpi) {
+  RD_CONTROL *const rdc = &cpi->rd_ctrl;
+  unsigned int screen_area = (cpi->common.width * cpi->common.height);
+
+  // Make sure this function is floating point safe.
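+  // (Editor's note: on 32-bit x86 builds vpx_clear_system_state() issues
+  // emms to reset the shared MMX/x87 register state before the double
+  // arithmetic that follows; on other targets it is a no-op.)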
+ vpx_clear_system_state(); + + if (1) { + // Non/pre-Vizer defaults + rdc->rd_mult_q_sq_inter_low_qp = 4.0; + rdc->rd_mult_q_sq_inter_mid_qp = 4.5; + rdc->rd_mult_q_sq_inter_high_qp = 3.0; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.0; + rdc->rd_mult_q_sq_key_low_qp = 3.5; + rdc->rd_mult_q_sq_key_mid_qp = 4.5; + rdc->rd_mult_q_sq_key_high_qp = 7.5; + } else if (screen_area <= 176 * 144) { + rdc->rd_mult_q_sq_inter_high_qp = 4.295745965132044; + rdc->rd_mult_q_sq_inter_low_qp = 4.0718581295922025; + rdc->rd_mult_q_sq_inter_mid_qp = 4.031435609256739; + rdc->rd_mult_q_sq_key_low_qp = 5.7037775720838155; + rdc->rd_mult_q_sq_key_mid_qp = 4.72424015517201; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.290774097327333; + } else if (screen_area <= 320 * 240) { + rdc->rd_mult_q_sq_inter_high_qp = 4.388244213131458; + rdc->rd_mult_q_sq_inter_low_qp = 4.506676356706102; + rdc->rd_mult_q_sq_inter_mid_qp = 4.489349899621181; + rdc->rd_mult_q_sq_key_low_qp = 4.497000582319771; + rdc->rd_mult_q_sq_key_mid_qp = 4.2825894884789735; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.217074424696166; + } else if (screen_area <= 640 * 360) { + rdc->rd_mult_q_sq_inter_high_qp = 4.3702861603380025; + rdc->rd_mult_q_sq_inter_low_qp = 4.730644123689013; + rdc->rd_mult_q_sq_inter_mid_qp = 4.314589509578551; + rdc->rd_mult_q_sq_key_low_qp = 6.068652999601526; + rdc->rd_mult_q_sq_key_mid_qp = 4.817707474077241; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.576902541873747; + } else if (screen_area <= 854 * 480) { + rdc->rd_mult_q_sq_inter_high_qp = 3.969083125219539; + rdc->rd_mult_q_sq_inter_low_qp = 4.811470143416073; + rdc->rd_mult_q_sq_inter_mid_qp = 4.621618127750201; + rdc->rd_mult_q_sq_key_low_qp = 5.073157238799473; + rdc->rd_mult_q_sq_key_mid_qp = 5.7587672849242635; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.9854544277222566; + } else if (screen_area <= 1280 * 720) { + rdc->rd_mult_q_sq_inter_high_qp = 4.410712348825541; + rdc->rd_mult_q_sq_inter_low_qp = 5.119381136011107; + rdc->rd_mult_q_sq_inter_mid_qp = 4.518613675766538; + rdc->rd_mult_q_sq_key_low_qp = 5.848703119971484; + rdc->rd_mult_q_sq_key_mid_qp = 5.368947246228739; + rdc->rd_mult_q_sq_key_ultralow_qp = 3.9468491666607326; + } else if (screen_area <= 1920 * 1080) { + rdc->rd_mult_q_sq_inter_high_qp = 3.2141187537667797; + rdc->rd_mult_q_sq_inter_low_qp = 6.00569815296199; + rdc->rd_mult_q_sq_inter_mid_qp = 3.932565684947023; + rdc->rd_mult_q_sq_key_low_qp = 10.582906599488298; + rdc->rd_mult_q_sq_key_mid_qp = 6.274162346360692; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.399795006320089; + } +} + int vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) { - // largest dc_quant is 21387, therefore rdmult should always fit in int32_t + const RD_CONTROL *rdc = &cpi->rd_ctrl; const int q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth); - uint32_t rdmult = q * q; + // largest dc_quant is 21387, therefore rdmult should fit in int32_t + int rdmult = q * q; + + // Make sure this function is floating point safe. 
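+  // (Editor's note: at the default settings the factors applied below
+  // reproduce the old integer arithmetic exactly, e.g. 4.5 corresponds to
+  // "rdmult * 4 + rdmult / 2" and 7.5 to "rdmult * 7 + rdmult / 2" in the
+  // removed code.)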
+ vpx_clear_system_state(); if (cpi->common.frame_type != KEY_FRAME) { - if (qindex < 128) - rdmult = rdmult * 4; - else if (qindex < 190) - rdmult = rdmult * 4 + rdmult / 2; - else - rdmult = rdmult * 3; + if (qindex < 128) { + rdmult = (int)((double)rdmult * rdc->rd_mult_q_sq_inter_low_qp); + } else if (qindex < 190) { + rdmult = (int)((double)rdmult * rdc->rd_mult_q_sq_inter_mid_qp); + } else { + rdmult = (int)((double)rdmult * rdc->rd_mult_q_sq_inter_high_qp); + } } else { - if (qindex < 64) - rdmult = rdmult * 4; - else if (qindex <= 128) - rdmult = rdmult * 3 + rdmult / 2; - else if (qindex < 190) - rdmult = rdmult * 4 + rdmult / 2; - else - rdmult = rdmult * 7 + rdmult / 2; + if (qindex < 64) { + rdmult = (int)((double)rdmult * rdc->rd_mult_q_sq_key_ultralow_qp); + } else if (qindex <= 128) { + rdmult = (int)((double)rdmult * rdc->rd_mult_q_sq_key_low_qp); + } else if (qindex < 190) { + rdmult = (int)((double)rdmult * rdc->rd_mult_q_sq_key_mid_qp); + + } else { + rdmult = (int)((double)rdmult * rdc->rd_mult_q_sq_key_high_qp); + } } + #if CONFIG_VP9_HIGHBITDEPTH switch (cpi->common.bit_depth) { case VPX_BITS_10: rdmult = ROUND_POWER_OF_TWO(rdmult, 4); break; diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h index 4c04c95482..2c9f5e7408 100644 --- a/vp9/encoder/vp9_rd.h +++ b/vp9/encoder/vp9_rd.h @@ -101,6 +101,18 @@ typedef enum { THR_INTRA, } THR_MODES_SUB8X8; +typedef struct { + // RD control parameters + // Added for Vizier project. + double rd_mult_q_sq_inter_low_qp; + double rd_mult_q_sq_inter_mid_qp; + double rd_mult_q_sq_inter_high_qp; + double rd_mult_q_sq_key_ultralow_qp; + double rd_mult_q_sq_key_low_qp; + double rd_mult_q_sq_key_mid_qp; + double rd_mult_q_sq_key_high_qp; +} RD_CONTROL; + typedef struct RD_OPT { // Thresh_mult is used to set a threshold for the rd score. A higher value // means that we will accept the best mode so far more often. This number @@ -144,6 +156,8 @@ struct TileDataEnc; struct VP9_COMP; struct macroblock; +void vp9_init_rd_parameters(struct VP9_COMP *cpi); + int vp9_compute_rd_mult_based_on_qindex(const struct VP9_COMP *cpi, int qindex); int vp9_compute_rd_mult(const struct VP9_COMP *cpi, int qindex); From 2570e33eceb7c489851e689a07473d1889206059 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 3 Mar 2021 14:46:48 -0800 Subject: [PATCH 051/926] override assembler with --as option on msvs Bug: webm:1709 Change-Id: I962a64c00042fe95cc1cd845b187f71ad6cfd1b7 --- build/make/configure.sh | 4 ---- build/make/gen_msvs_vcxproj.sh | 10 ++++++---- examples.mk | 1 + libs.mk | 6 ++++++ tools.mk | 1 + 5 files changed, 14 insertions(+), 8 deletions(-) diff --git a/build/make/configure.sh b/build/make/configure.sh index c4e938fc72..81d30a16c7 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -1296,10 +1296,6 @@ EOF enabled optimizations && disabled gprof && check_add_cflags -fomit-frame-pointer ;; vs*) - # When building with Microsoft Visual Studio the assembler is - # invoked directly. Checking at configure time is unnecessary. 
- # Skip the check by setting AS arbitrarily - AS=msvs msvs_arch_dir=x86-msvs case ${tgt_cc##vs} in 14) diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh index bb1c31d230..6f91ad4781 100755 --- a/build/make/gen_msvs_vcxproj.sh +++ b/build/make/gen_msvs_vcxproj.sh @@ -157,6 +157,8 @@ for opt in "$@"; do ;; --lib) proj_kind="lib" ;; + --as=*) as="${optval}" + ;; --src-path-bare=*) src_path_bare=$(fix_path "$optval") src_path_bare=${src_path_bare%/} @@ -247,13 +249,13 @@ libs=${libs// /;} case "$target" in x86_64*) platforms[0]="x64" - asm_Debug_cmdline="yasm -Xvc -g cv8 -f win64 ${yasmincs} "%(FullPath)"" - asm_Release_cmdline="yasm -Xvc -f win64 ${yasmincs} "%(FullPath)"" + asm_Debug_cmdline="${as} -Xvc -gcv8 -f win64 ${yasmincs} "%(FullPath)"" + asm_Release_cmdline="${as} -Xvc -f win64 ${yasmincs} "%(FullPath)"" ;; x86*) platforms[0]="Win32" - asm_Debug_cmdline="yasm -Xvc -g cv8 -f win32 ${yasmincs} "%(FullPath)"" - asm_Release_cmdline="yasm -Xvc -f win32 ${yasmincs} "%(FullPath)"" + asm_Debug_cmdline="${as} -Xvc -gcv8 -f win32 ${yasmincs} "%(FullPath)"" + asm_Release_cmdline="${as} -Xvc -f win32 ${yasmincs} "%(FullPath)"" ;; arm64*) platforms[0]="ARM64" diff --git a/examples.mk b/examples.mk index a28e529359..42886f1e15 100644 --- a/examples.mk +++ b/examples.mk @@ -376,6 +376,7 @@ $(1): $($(1:.$(VCPROJ_SFX)=).SRCS) vpx.$(VCPROJ_SFX) --ver=$$(CONFIG_VS_VERSION)\ --proj-guid=$$($$(@:.$(VCPROJ_SFX)=).GUID)\ --src-path-bare="$(SRC_PATH_BARE)" \ + --as=$$(AS) \ $$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \ --out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \ $$(INTERNAL_LDFLAGS) $$(LDFLAGS) -l$$(CODEC_LIB) $$^ diff --git a/libs.mk b/libs.mk index b5bc35755c..cabd4ed141 100644 --- a/libs.mk +++ b/libs.mk @@ -232,6 +232,7 @@ vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def --ver=$(CONFIG_VS_VERSION) \ --src-path-bare="$(SRC_PATH_BARE)" \ --out=$@ $(CFLAGS) \ + --as=$(AS) \ $(filter $(SRC_PATH_BARE)/vp8/%.c, $(VCPROJ_SRCS)) \ $(filter $(SRC_PATH_BARE)/vp8/%.h, $(VCPROJ_SRCS)) \ $(filter $(SRC_PATH_BARE)/vp9/%.c, $(VCPROJ_SRCS)) \ @@ -262,6 +263,7 @@ vp9rc.$(VCPROJ_SFX): $(RC_RTC_SRCS) --ver=$(CONFIG_VS_VERSION) \ --src-path-bare="$(SRC_PATH_BARE)" \ --out=$@ $(CFLAGS) \ + --as=$(AS) \ $(filter $(SRC_PATH_BARE)/vp9/%.c, $(VCPROJ_SRCS)) \ $(filter $(SRC_PATH_BARE)/vp9/%.cc, $(VCPROJ_SRCS)) \ $(filter $(SRC_PATH_BARE)/vp9/%.h, $(VCPROJ_SRCS)) \ @@ -536,6 +538,7 @@ gtest.$(VCPROJ_SFX): $(SRC_PATH_BARE)/third_party/googletest/src/src/gtest-all.c --proj-guid=EC00E1EC-AF68-4D92-A255-181690D1C9B1 \ --ver=$(CONFIG_VS_VERSION) \ --src-path-bare="$(SRC_PATH_BARE)" \ + --as=$(AS) \ -D_VARIADIC_MAX=10 \ --out=gtest.$(VCPROJ_SFX) $(SRC_PATH_BARE)/third_party/googletest/src/src/gtest-all.cc \ -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" -I"$(SRC_PATH_BARE)/third_party/googletest/src" @@ -552,6 +555,7 @@ test_libvpx.$(VCPROJ_SFX): $(LIBVPX_TEST_SRCS) vpx.$(VCPROJ_SFX) gtest.$(VCPROJ_ --proj-guid=CD837F5F-52D8-4314-A370-895D614166A7 \ --ver=$(CONFIG_VS_VERSION) \ --src-path-bare="$(SRC_PATH_BARE)" \ + --as=$(AS) \ $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \ --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \ -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \ @@ -574,6 +578,7 @@ test_intra_pred_speed.$(VCPROJ_SFX): $(TEST_INTRA_PRED_SPEED_SRCS) vpx.$(VCPROJ_ --proj-guid=CD837F5F-52D8-4314-A370-895D614166A7 \ --ver=$(CONFIG_VS_VERSION) \ --src-path-bare="$(SRC_PATH_BARE)" \ + --as=$(AS) \ $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \ --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \ -I. 
-I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \
@@ -592,6 +597,7 @@ test_rc_interface.$(VCPROJ_SFX): $(RC_INTERFACE_TEST_SRCS) vpx.$(VCPROJ_SFX) \
             -D_VARIADIC_MAX=10 \
             --proj-guid=30458F88-1BC6-4689-B41C-50F3737AAB27 \
             --ver=$(CONFIG_VS_VERSION) \
+            --as=$(AS) \
             --src-path-bare="$(SRC_PATH_BARE)" \
             $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
             --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \
diff --git a/tools.mk b/tools.mk
index 1d005b2acf..dd2ebeb3d5 100644
--- a/tools.mk
+++ b/tools.mk
@@ -79,6 +79,7 @@ $(1): $($(1:.$(VCPROJ_SFX)=).SRCS) vpx.$(VCPROJ_SFX)
             --ver=$$(CONFIG_VS_VERSION)\
             --proj-guid=$$($$(@:.$(VCPROJ_SFX)=).GUID)\
             --src-path-bare="$(SRC_PATH_BARE)" \
+            --as=$$(AS) \
             $$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \
             --out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \
             $$(INTERNAL_LDFLAGS) $$(LDFLAGS) $$^

From f7c386bab0e637dff65c3fe546a83e9564028aff Mon Sep 17 00:00:00 2001
From: Jerome Jiang
Date: Wed, 3 Mar 2021 17:33:30 -0800
Subject: [PATCH 052/926] Use -std=gnu++11 instead of -std=c++11

Cygwin and msys2 have stricter compliance requirements for the standard
C headers.

Bug: webm:1708
Change-Id: I676b1227b9dd304149e50016468df0f057c6a78f
---
 configure                      | 12 ++++++------
 examples/vpx_dec_fuzzer.cc     |  2 +-
 third_party/libwebm/Android.mk |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/configure b/configure
index f7e11aaf2d..da631a45e1 100755
--- a/configure
+++ b/configure
@@ -731,33 +731,33 @@ process_toolchain() {
         soft_enable libyuv
         ;;
     *-android-*)
-      check_add_cxxflags -std=c++11 && soft_enable webm_io
+      check_add_cxxflags -std=gnu++11 && soft_enable webm_io
       soft_enable libyuv
       # GTestLog must be modified to use Android logging utilities.
       ;;
     *-darwin-*)
-      check_add_cxxflags -std=c++11
+      check_add_cxxflags -std=gnu++11
       # iOS/ARM builds do not work with gtest. This does not match
       # x86 targets.
       ;;
     *-iphonesimulator-*)
-      check_add_cxxflags -std=c++11 && soft_enable webm_io
+      check_add_cxxflags -std=gnu++11 && soft_enable webm_io
       soft_enable libyuv
       ;;
     *-win*)
       # Some mingw toolchains don't have pthread available by default.
       # Treat these more like visual studio where threading in gtest
       # would be disabled for the same reason.
-      check_add_cxxflags -std=c++11 && soft_enable unit_tests \
        && soft_enable webm_io
+      check_add_cxxflags -std=gnu++11 && soft_enable unit_tests \
        && soft_enable webm_io
      check_cxx "$@" < Date: Thu, 4 Mar 2021 18:43:29 -0800
Subject: [PATCH 053/926] Check for _WIN32 instead of WIN32.

_WIN32 is predefined for the Windows platform in MSVC, whereas WIN32 is
not, and WIN32 is also not defined in the makefiles.

Change-Id: I8b58e42d891608dbe1e1313dc9629c2be588d9ec
---
 vp8/decoder/threading.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index 561922de32..491e2ce4c1 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -10,7 +10,7 @@

 #include "vpx_config.h"
 #include "vp8_rtcd.h"
-#if !defined(WIN32) && CONFIG_OS_SUPPORT == 1
+#if !defined(_WIN32) && CONFIG_OS_SUPPORT == 1
 #include <unistd.h>
 #endif
 #include "onyxd_int.h"

From f27c62c5dfb75c27130e03c724fc5e6f22e03510 Mon Sep 17 00:00:00 2001
From: Paul Wilkins
Date: Thu, 4 Mar 2021 17:10:09 +0000
Subject: [PATCH 054/926] Further integration for Vizier.

Further integration of Vizier adjustable parameters. This patch
connects up additional configurable two pass rate control parameters
for the Vizier project.

This still needs to be connected up to a command line interface and at
the moment should still be using default values that match previous
behavior.
Do not submit until verified that defaults are all working correctly. Change-Id: If1241c2dba6759395e6efa349c4659a0c345361d --- vp9/encoder/vp9_firstpass.c | 340 +++++++++++++++++++----------------- vp9/encoder/vp9_firstpass.h | 17 +- vp9/encoder/vp9_ratectrl.h | 13 -- 3 files changed, 191 insertions(+), 179 deletions(-) diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index fe7abef07e..6d6aa5087f 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -77,7 +77,6 @@ #define MINQ_ADJ_LIMIT_CQ 20 #define HIGH_UNDERSHOOT_RATIO 2 #define AV_WQ_FACTOR 4.0 -#define DEF_EPMB_LOW 2000.0 #define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x)-0.000001 : (x) + 0.000001) @@ -1833,6 +1832,7 @@ void vp9_init_second_pass(VP9_COMP *cpi) { } static double get_sr_decay_rate(const FRAME_INFO *frame_info, + const TWO_PASS *const twopass, const FIRSTPASS_STATS *frame) { double sr_diff = (frame->sr_coded_error - frame->coded_error); double sr_decay = 1.0; @@ -1854,28 +1854,31 @@ static double get_sr_decay_rate(const FRAME_INFO *frame_info, if ((sr_diff > LOW_SR_DIFF_TRHESH)) { sr_diff = VPXMIN(sr_diff, SR_DIFF_MAX); - sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) - motion_amplitude_part - + sr_decay = 1.0 - (twopass->sr_diff_part * sr_diff) - motion_amplitude_part - (INTRA_PART * modified_pcnt_intra); } - return VPXMAX(sr_decay, DEFAULT_DECAY_LIMIT); + return VPXMAX(sr_decay, twopass->sr_default_decay_limit); } // This function gives an estimate of how badly we believe the prediction // quality is decaying from frame to frame. static double get_zero_motion_factor(const FRAME_INFO *frame_info, + const TWO_PASS *const twopass, const FIRSTPASS_STATS *frame_stats) { const double zero_motion_pct = frame_stats->pcnt_inter - frame_stats->pcnt_motion; - double sr_decay = get_sr_decay_rate(frame_info, frame_stats); + double sr_decay = get_sr_decay_rate(frame_info, twopass, frame_stats); return VPXMIN(sr_decay, zero_motion_pct); } static double get_prediction_decay_rate(const FRAME_INFO *frame_info, + const TWO_PASS *const twopass, const FIRSTPASS_STATS *frame_stats) { - const double sr_decay_rate = get_sr_decay_rate(frame_info, frame_stats); + const double sr_decay_rate = + get_sr_decay_rate(frame_info, twopass, frame_stats); const double zero_motion_factor = (0.95 * pow((frame_stats->pcnt_inter - frame_stats->pcnt_motion), - ZM_POWER_FACTOR)); + twopass->zm_power_factor)); return VPXMAX(zero_motion_factor, (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor))); @@ -1959,6 +1962,7 @@ static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats, static double calc_frame_boost(const FRAME_INFO *frame_info, const FIRSTPASS_STATS *this_frame, + const TWO_PASS *const twopass, int avg_frame_qindex, double this_frame_mv_in_out) { double frame_boost; @@ -1968,7 +1972,7 @@ static double calc_frame_boost(const FRAME_INFO *frame_info, const double active_area = calculate_active_area(frame_info, this_frame); // Underlying boost factor is based on inter error ratio. 
- frame_boost = (BASELINE_ERR_PER_MB * active_area) / + frame_boost = (twopass->base_err_per_mb * active_area) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error); // Small adjustment for cases where there is a zoom out @@ -1978,37 +1982,25 @@ static double calc_frame_boost(const FRAME_INFO *frame_info, // Q correction and scalling frame_boost = frame_boost * boost_q_correction; - return VPXMIN(frame_boost, GF_MAX_FRAME_BOOST * boost_q_correction); -} - -static double kf_err_per_mb(VP9_COMP *cpi) { - const VP9_COMMON *const cm = &cpi->common; - unsigned int screen_area = (cm->width * cm->height); - - // Use a different error per mb factor for calculating boost for - // different formats. - if (screen_area < 1280 * 720) { - return 2000.0; - } else if (screen_area < 1920 * 1080) { - return 500.0; - } - return 250.0; + return VPXMIN(frame_boost, twopass->gf_frame_max_boost * boost_q_correction); } static double calc_kf_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame, double *sr_accumulator, double this_frame_mv_in_out, - double max_boost) { + double zm_factor) { + TWO_PASS *const twopass = &cpi->twopass; double frame_boost; const double lq = vp9_convert_qindex_to_q( cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth); const double boost_q_correction = VPXMIN((0.50 + (lq * 0.015)), 2.00); const double active_area = calculate_active_area(&cpi->frame_info, this_frame); + double max_boost; // Underlying boost factor is based on inter error ratio. - frame_boost = (kf_err_per_mb(cpi) * active_area) / + frame_boost = (twopass->kf_err_per_mb * active_area) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator); // Update the accumulator for second ref error difference. @@ -2027,13 +2019,20 @@ static double calc_kf_frame_boost(VP9_COMP *cpi, // boost calculation. frame_boost = ((frame_boost + 40.0) * boost_q_correction); - return VPXMIN(frame_boost, max_boost * boost_q_correction); + // Maximum allowed boost this frame. May be different for first vs subsequent + // key frames. + max_boost = (cpi->common.current_video_frame == 0) + ? twopass->kf_frame_max_boost_first + : twopass->kf_frame_max_boost_subs; + max_boost *= zm_factor * boost_q_correction; + + return VPXMIN(frame_boost, max_boost); } static int compute_arf_boost(const FRAME_INFO *frame_info, - const FIRST_PASS_INFO *first_pass_info, - int arf_show_idx, int f_frames, int b_frames, - int avg_frame_qindex) { + TWO_PASS *const twopass, int arf_show_idx, + int f_frames, int b_frames, int avg_frame_qindex) { + const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; int i; double boost_score = 0.0; double mv_ratio_accumulator = 0.0; @@ -2064,14 +2063,15 @@ static int compute_arf_boost(const FRAME_INFO *frame_info, // Accumulate the effect of prediction quality decay. if (!flash_detected) { - decay_accumulator *= get_prediction_decay_rate(frame_info, this_frame); + decay_accumulator *= + get_prediction_decay_rate(frame_info, twopass, this_frame); decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR ? MIN_DECAY_FACTOR : decay_accumulator; } - boost_score += decay_accumulator * calc_frame_boost(frame_info, this_frame, - avg_frame_qindex, - this_frame_mv_in_out); + boost_score += decay_accumulator * + calc_frame_boost(frame_info, this_frame, twopass, + avg_frame_qindex, this_frame_mv_in_out); } arf_boost = (int)boost_score; @@ -2104,14 +2104,15 @@ static int compute_arf_boost(const FRAME_INFO *frame_info, // Cumulative effect of prediction quality decay. 
if (!flash_detected) { - decay_accumulator *= get_prediction_decay_rate(frame_info, this_frame); + decay_accumulator *= + get_prediction_decay_rate(frame_info, twopass, this_frame); decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR ? MIN_DECAY_FACTOR : decay_accumulator; } - boost_score += decay_accumulator * calc_frame_boost(frame_info, this_frame, - avg_frame_qindex, - this_frame_mv_in_out); + boost_score += decay_accumulator * + calc_frame_boost(frame_info, this_frame, twopass, + avg_frame_qindex, this_frame_mv_in_out); } arf_boost += (int)boost_score; @@ -2127,8 +2128,8 @@ static int calc_arf_boost(VP9_COMP *cpi, int f_frames, int b_frames) { TWO_PASS *const twopass = &cpi->twopass; const int avg_inter_frame_qindex = cpi->rc.avg_frame_qindex[INTER_FRAME]; int arf_show_idx = get_show_idx(twopass); - return compute_arf_boost(frame_info, &twopass->first_pass_info, arf_show_idx, - f_frames, b_frames, avg_inter_frame_qindex); + return compute_arf_boost(frame_info, twopass, arf_show_idx, f_frames, + b_frames, avg_inter_frame_qindex); } // Calculate a section intra ratio used in setting max loop filter. @@ -2557,10 +2558,11 @@ typedef struct RANGE { * structs. */ static int get_gop_coding_frame_num( - int *use_alt_ref, const FRAME_INFO *frame_info, - const FIRST_PASS_INFO *first_pass_info, const RATE_CONTROL *rc, - int gf_start_show_idx, const RANGE *active_gf_interval, - double gop_intra_factor, int lag_in_frames) { + int *use_alt_ref, const FRAME_INFO *frame_info, TWO_PASS *const twopass, + const RATE_CONTROL *rc, int gf_start_show_idx, + const RANGE *active_gf_interval, double gop_intra_factor, + int lag_in_frames) { + const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; double loop_decay_rate = 1.00; double mv_ratio_accumulator = 0.0; double this_frame_mv_in_out = 0.0; @@ -2603,13 +2605,14 @@ static int get_gop_coding_frame_num( if ((rc->frames_since_key + gop_coding_frames - 1) > 1) { zero_motion_accumulator = VPXMIN(zero_motion_accumulator, - get_zero_motion_factor(frame_info, next_frame)); + get_zero_motion_factor(frame_info, twopass, next_frame)); } // Accumulate the effect of prediction quality decay. if (!flash_detected) { double last_loop_decay_rate = loop_decay_rate; - loop_decay_rate = get_prediction_decay_rate(frame_info, next_frame); + loop_decay_rate = + get_prediction_decay_rate(frame_info, twopass, next_frame); // Break clause to detect very still sections after motion. For example, // a static image after a fade or other transition. @@ -2807,14 +2810,14 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { use_alt_ref = gop_command->use_alt_ref; } else { gop_coding_frames = get_gop_coding_frame_num( - &use_alt_ref, frame_info, first_pass_info, rc, gf_start_show_idx, + &use_alt_ref, frame_info, twopass, rc, gf_start_show_idx, &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames); use_alt_ref &= allow_alt_ref; } } #else gop_coding_frames = get_gop_coding_frame_num( - &use_alt_ref, frame_info, first_pass_info, rc, gf_start_show_idx, + &use_alt_ref, frame_info, twopass, rc, gf_start_show_idx, &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames); use_alt_ref &= allow_alt_ref; #endif @@ -2836,8 +2839,8 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { // Calculate the boost for alt ref. 
rc->gfu_boost = - compute_arf_boost(frame_info, first_pass_info, arf_show_idx, f_frames, - b_frames, avg_inter_frame_qindex); + compute_arf_boost(frame_info, twopass, arf_show_idx, f_frames, b_frames, + avg_inter_frame_qindex); rc->source_alt_ref_pending = 1; } else { const int f_frames = gop_coding_frames - 1; @@ -2847,9 +2850,9 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { const int gld_show_idx = VPXMIN(gf_start_show_idx + 1, fps_get_num_frames(first_pass_info)); const int arf_boost = - compute_arf_boost(frame_info, first_pass_info, gld_show_idx, f_frames, - b_frames, avg_inter_frame_qindex); - rc->gfu_boost = VPXMIN(MAX_GF_BOOST, arf_boost); + compute_arf_boost(frame_info, twopass, gld_show_idx, f_frames, b_frames, + avg_inter_frame_qindex); + rc->gfu_boost = VPXMIN(twopass->gf_max_total_boost, arf_boost); rc->source_alt_ref_pending = 0; } @@ -2952,7 +2955,9 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone), group_av_noise, vbr_group_bits_per_frame); twopass->active_worst_quality = - (tmp_q + (twopass->active_worst_quality * 3)) >> 2; + (int)((tmp_q + (twopass->active_worst_quality * + (twopass->active_wq_factor - 1))) / + twopass->active_wq_factor); #if CONFIG_ALWAYS_ADJUST_BPM // Reset rolling actual and target bits counters for ARF groups. @@ -3174,8 +3179,9 @@ static int test_candidate_kf(const FIRST_PASS_INFO *first_pass_info, int vp9_get_frames_to_next_key(const VP9EncoderConfig *oxcf, const FRAME_INFO *frame_info, - const FIRST_PASS_INFO *first_pass_info, - int kf_show_idx, int min_gf_interval) { + const TWO_PASS *const twopass, int kf_show_idx, + int min_gf_interval) { + const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; double recent_loop_decay[FRAMES_TO_CHECK_DECAY]; int j; int frames_to_key; @@ -3202,7 +3208,8 @@ int vp9_get_frames_to_next_key(const VP9EncoderConfig *oxcf, break; // How fast is the prediction quality decaying? - loop_decay_rate = get_prediction_decay_rate(frame_info, next_frame); + loop_decay_rate = + get_prediction_decay_rate(frame_info, twopass, next_frame); // We want to know something about the recent past... rather than // as used elsewhere where we are concerned with decay in prediction @@ -3289,7 +3296,7 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) { mean_mod_score, av_err); rc->frames_to_key = vp9_get_frames_to_next_key( - oxcf, frame_info, first_pass_info, kf_show_idx, rc->min_gf_interval); + oxcf, frame_info, twopass, kf_show_idx, rc->min_gf_interval); // If there is a max kf interval set by the user we must obey it. // We already breakout of the loop above at 2x max. @@ -3369,9 +3376,9 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) { // Monitor for static sections. // First frame in kf group the second ref indicator is invalid. if (i > 0) { - zero_motion_accumulator = - VPXMIN(zero_motion_accumulator, - get_zero_motion_factor(&cpi->frame_info, &next_frame)); + zero_motion_accumulator = VPXMIN( + zero_motion_accumulator, + get_zero_motion_factor(&cpi->frame_info, twopass, &next_frame)); } else { zero_motion_accumulator = next_frame.pcnt_inter - next_frame.pcnt_motion; @@ -3385,8 +3392,8 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) { // the first key frame or it points to a refernce before the new key // frame. 
if (i < 2) sr_accumulator = 0.0;
-      frame_boost = calc_kf_frame_boost(cpi, &next_frame, &sr_accumulator, 0,
-                                        KF_MAX_FRAME_BOOST * zm_factor);
+      frame_boost =
+          calc_kf_frame_boost(cpi, &next_frame, &sr_accumulator, 0, zm_factor);

       boost_score += frame_boost;

@@ -3415,12 +3422,12 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) {
   // Special case for static / slide show content but dont apply
   // if the kf group is very short.
   if ((zero_motion_accumulator > 0.99) && (rc->frames_to_key > 8)) {
-    rc->kf_boost = MAX_KF_TOT_BOOST;
+    rc->kf_boost = twopass->kf_max_total_boost;
   } else {
     // Apply various clamps for min and max boost
     rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3));
     rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST);
-    rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST);
+    rc->kf_boost = VPXMIN(rc->kf_boost, twopass->kf_max_total_boost);
   }

   // Work out how many bits to allocate for the key frame itself.
@@ -3477,107 +3484,113 @@ static int is_skippable_frame(const VP9_COMP *cpi) {

 // Configure image size specific vizier parameters.
 // Later these will be set via additional command line options
-static void init_vizier_params(RATE_CONTROL *const rc, int screen_area) {
+static void init_vizier_params(TWO_PASS *const twopass, int screen_area) {
   if (1) {
     // Force defaults for now
-    rc->active_wq_factor = AV_WQ_FACTOR;
-    rc->base_err_per_mb = BASELINE_ERR_PER_MB;
-    rc->sr_default_decay_limit = DEFAULT_DECAY_LIMIT;
-    rc->sr_diff_part = SR_DIFF_PART;
-    rc->gf_frame_max_boost = GF_MAX_FRAME_BOOST;
-    rc->gf_max_total_boost = MAX_GF_BOOST;
-    rc->kf_err_per_mb = DEF_EPMB_LOW;
-    rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST;  // Max for first kf.
-    rc->kf_frame_max_boost_subs = KF_MAX_FRAME_BOOST / 2;  // Max for other kfs.
-    rc->kf_max_total_boost = MAX_KF_TOT_BOOST;
-    rc->zm_power_factor = ZM_POWER_FACTOR;
+    twopass->active_wq_factor = AV_WQ_FACTOR;
+    twopass->base_err_per_mb = BASELINE_ERR_PER_MB;
+    twopass->sr_default_decay_limit = DEFAULT_DECAY_LIMIT;
+    twopass->sr_diff_part = SR_DIFF_PART;
+    twopass->gf_frame_max_boost = GF_MAX_FRAME_BOOST;
+    twopass->gf_max_total_boost = MAX_GF_BOOST;
+    if (screen_area < 1280 * 720) {
+      twopass->kf_err_per_mb = 2000.0;
+    } else if (screen_area < 1920 * 1080) {
+      twopass->kf_err_per_mb = 500.0;
+    } else {
+      twopass->kf_err_per_mb = 250.0;
+    }
+    twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST;
+    twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first;
+    twopass->kf_max_total_boost = MAX_KF_TOT_BOOST;
+    twopass->zm_power_factor = ZM_POWER_FACTOR;
  } else {
    // Vizer experimental parameters from training.
    // Later these will be set via the command line.
if (screen_area <= 176 * 144) { - rc->active_wq_factor = 46.0; - rc->base_err_per_mb = 37597.399760969536; - rc->sr_default_decay_limit = 0.3905639800962774; - rc->sr_diff_part = 0.009599023654146284; - rc->gf_frame_max_boost = 87.27362648627846; - rc->gf_max_total_boost = MAX_GF_BOOST; - rc->kf_err_per_mb = 1854.8255436877148; - rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; - rc->kf_frame_max_boost_subs = rc->kf_frame_max_boost_first / 2; - rc->kf_max_total_boost = MAX_KF_TOT_BOOST; - rc->zm_power_factor = 2.93715229184991; + twopass->active_wq_factor = 46.0; + twopass->base_err_per_mb = 37597.399760969536; + twopass->sr_default_decay_limit = 0.3905639800962774; + twopass->sr_diff_part = 0.009599023654146284; + twopass->gf_frame_max_boost = 87.27362648627846; + twopass->gf_max_total_boost = MAX_GF_BOOST; + twopass->kf_err_per_mb = 1854.8255436877148; + twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; + twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; + twopass->zm_power_factor = 2.93715229184991; } else if (screen_area <= 320 * 240) { - rc->active_wq_factor = 55.0; - rc->base_err_per_mb = 34525.33177195309; - rc->sr_default_decay_limit = 0.23901360046804604; - rc->sr_diff_part = 0.008581014394766773; - rc->gf_frame_max_boost = 127.34978204980285; - rc->gf_max_total_boost = MAX_GF_BOOST; - rc->kf_err_per_mb = 723.8337508755031; - rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; - rc->kf_frame_max_boost_subs = rc->kf_frame_max_boost_first / 2; - rc->kf_max_total_boost = MAX_KF_TOT_BOOST; - rc->zm_power_factor = 3.5299221493593413; + twopass->active_wq_factor = 55.0; + twopass->base_err_per_mb = 34525.33177195309; + twopass->sr_default_decay_limit = 0.23901360046804604; + twopass->sr_diff_part = 0.008581014394766773; + twopass->gf_frame_max_boost = 127.34978204980285; + twopass->gf_max_total_boost = MAX_GF_BOOST; + twopass->kf_err_per_mb = 723.8337508755031; + twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; + twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; + twopass->zm_power_factor = 3.5299221493593413; } else if (screen_area <= 640 * 360) { - rc->active_wq_factor = 12.5; - rc->base_err_per_mb = 18823.978018028298; - rc->sr_default_decay_limit = 0.6043527690301296; - rc->sr_diff_part = 0.00343296783885544; - rc->gf_frame_max_boost = 75.17672317013668; - rc->gf_max_total_boost = MAX_GF_BOOST; - rc->kf_err_per_mb = 422.2871502380377; - rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; - rc->kf_frame_max_boost_subs = rc->kf_frame_max_boost_first / 2; - rc->kf_max_total_boost = MAX_KF_TOT_BOOST; - rc->zm_power_factor = 2.265742666649307; + twopass->active_wq_factor = 12.5; + twopass->base_err_per_mb = 18823.978018028298; + twopass->sr_default_decay_limit = 0.6043527690301296; + twopass->sr_diff_part = 0.00343296783885544; + twopass->gf_frame_max_boost = 75.17672317013668; + twopass->gf_max_total_boost = MAX_GF_BOOST; + twopass->kf_err_per_mb = 422.2871502380377; + twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; + twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; + twopass->zm_power_factor = 2.265742666649307; } else if (screen_area <= 854 * 480) { - rc->active_wq_factor = 51.5; - rc->base_err_per_mb = 33718.98307662595; - rc->sr_default_decay_limit = 0.33633414970713393; - rc->sr_diff_part = 0.00868988716928333; - rc->gf_frame_max_boost = 
85.2868528581522; - rc->gf_max_total_boost = MAX_GF_BOOST; - rc->kf_err_per_mb = 1513.4883914008383; - rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; - rc->kf_frame_max_boost_subs = rc->kf_frame_max_boost_first / 2; - rc->kf_max_total_boost = MAX_KF_TOT_BOOST; - rc->zm_power_factor = 3.552278528517416; + twopass->active_wq_factor = 51.5; + twopass->base_err_per_mb = 33718.98307662595; + twopass->sr_default_decay_limit = 0.33633414970713393; + twopass->sr_diff_part = 0.00868988716928333; + twopass->gf_frame_max_boost = 85.2868528581522; + twopass->gf_max_total_boost = MAX_GF_BOOST; + twopass->kf_err_per_mb = 1513.4883914008383; + twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; + twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; + twopass->zm_power_factor = 3.552278528517416; } else if (screen_area <= 1280 * 720) { - rc->active_wq_factor = 41.5; - rc->base_err_per_mb = 29527.46375825401; - rc->sr_default_decay_limit = 0.5009117586299728; - rc->sr_diff_part = 0.005007364627260114; - rc->gf_frame_max_boost = GF_MAX_FRAME_BOOST; - rc->gf_max_total_boost = MAX_GF_BOOST; - rc->kf_err_per_mb = 998.6342911785146; - rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; - rc->kf_frame_max_boost_subs = rc->kf_frame_max_boost_first / 2; - rc->kf_max_total_boost = MAX_KF_TOT_BOOST; - rc->zm_power_factor = 2.568627575572356; + twopass->active_wq_factor = 41.5; + twopass->base_err_per_mb = 29527.46375825401; + twopass->sr_default_decay_limit = 0.5009117586299728; + twopass->sr_diff_part = 0.005007364627260114; + twopass->gf_frame_max_boost = GF_MAX_FRAME_BOOST; + twopass->gf_max_total_boost = MAX_GF_BOOST; + twopass->kf_err_per_mb = 998.6342911785146; + twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; + twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; + twopass->zm_power_factor = 2.568627575572356; } else if (screen_area <= 1920 * 1080) { - rc->active_wq_factor = 31.0; - rc->base_err_per_mb = 34474.723463367416; - rc->sr_default_decay_limit = 0.23346886902707745; - rc->sr_diff_part = 0.011431716637966029; - rc->gf_frame_max_boost = 81.00472969483079; - rc->gf_max_total_boost = MAX_GF_BOOST; - rc->kf_err_per_mb = 35931.25734431429; - rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; - rc->kf_frame_max_boost_subs = rc->kf_frame_max_boost_first / 2; - rc->kf_max_total_boost = MAX_KF_TOT_BOOST; - rc->zm_power_factor = 5.5776463538431935; + twopass->active_wq_factor = 31.0; + twopass->base_err_per_mb = 34474.723463367416; + twopass->sr_default_decay_limit = 0.23346886902707745; + twopass->sr_diff_part = 0.011431716637966029; + twopass->gf_frame_max_boost = 81.00472969483079; + twopass->gf_max_total_boost = MAX_GF_BOOST; + twopass->kf_err_per_mb = 35931.25734431429; + twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; + twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; + twopass->zm_power_factor = 5.5776463538431935; } else { - rc->active_wq_factor = AV_WQ_FACTOR; - rc->base_err_per_mb = BASELINE_ERR_PER_MB; - rc->sr_default_decay_limit = DEFAULT_DECAY_LIMIT; - rc->sr_diff_part = SR_DIFF_PART; - rc->gf_frame_max_boost = GF_MAX_FRAME_BOOST; - rc->gf_max_total_boost = MAX_GF_BOOST; - rc->kf_err_per_mb = DEF_EPMB_LOW; - rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; - rc->kf_frame_max_boost_subs = rc->kf_frame_max_boost_first / 2; - rc->kf_max_total_boost = MAX_KF_TOT_BOOST; - 
rc->zm_power_factor = ZM_POWER_FACTOR; + twopass->active_wq_factor = AV_WQ_FACTOR; + twopass->base_err_per_mb = BASELINE_ERR_PER_MB; + twopass->sr_default_decay_limit = DEFAULT_DECAY_LIMIT; + twopass->sr_diff_part = SR_DIFF_PART; + twopass->gf_frame_max_boost = GF_MAX_FRAME_BOOST; + twopass->gf_max_total_boost = MAX_GF_BOOST; + twopass->kf_err_per_mb = 250.0; + twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; + twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; + twopass->zm_power_factor = ZM_POWER_FACTOR; } } } @@ -3596,7 +3609,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { if (cm->current_video_frame == 0) { unsigned int screen_area = (cm->width * cm->height); - init_vizier_params(rc, screen_area); + init_vizier_params(twopass, screen_area); } // If this is an arf frame then we dont want to read the stats file or @@ -3862,9 +3875,9 @@ void vp9_get_next_group_of_picture(const VP9_COMP *cpi, int *first_is_key_frame, *first_is_key_frame = 0; if (rc.frames_to_key == 0) { - rc.frames_to_key = vp9_get_frames_to_next_key( - &cpi->oxcf, &cpi->frame_info, &cpi->twopass.first_pass_info, - *first_show_idx, rc.min_gf_interval); + rc.frames_to_key = + vp9_get_frames_to_next_key(&cpi->oxcf, &cpi->frame_info, twopass, + *first_show_idx, rc.min_gf_interval); rc.frames_since_key = 0; *first_is_key_frame = 1; } @@ -3903,9 +3916,9 @@ int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf, gop_intra_factor = 1.0; } - frame_count = get_gop_coding_frame_num( - use_alt_ref, frame_info, first_pass_info, rc, show_idx, - &active_gf_interval, gop_intra_factor, oxcf->lag_in_frames); + frame_count = get_gop_coding_frame_num(use_alt_ref, frame_info, twopass, rc, + show_idx, &active_gf_interval, + gop_intra_factor, oxcf->lag_in_frames); *use_alt_ref &= allow_alt_ref; return frame_count; } @@ -3929,7 +3942,7 @@ int vp9_get_coding_frame_num(const VP9EncoderConfig *oxcf, int first_is_key_frame = 0; if (rc.frames_to_key == 0) { rc.frames_to_key = vp9_get_frames_to_next_key( - oxcf, frame_info, first_pass_info, show_idx, rc.min_gf_interval); + oxcf, frame_info, twopass, show_idx, rc.min_gf_interval); rc.frames_since_key = 0; first_is_key_frame = 1; } @@ -3951,8 +3964,7 @@ int vp9_get_coding_frame_num(const VP9EncoderConfig *oxcf, void vp9_get_key_frame_map(const VP9EncoderConfig *oxcf, const FRAME_INFO *frame_info, - const FIRST_PASS_INFO *first_pass_info, - int *key_frame_map) { + const TWO_PASS *const twopass, int *key_frame_map) { int show_idx = 0; RATE_CONTROL rc; vp9_rc_init(oxcf, 1, &rc); @@ -3966,7 +3978,7 @@ void vp9_get_key_frame_map(const VP9EncoderConfig *oxcf, int key_frame_group_size; key_frame_map[show_idx] = 1; key_frame_group_size = vp9_get_frames_to_next_key( - oxcf, frame_info, first_pass_info, show_idx, rc.min_gf_interval); + oxcf, frame_info, twopass, show_idx, rc.min_gf_interval); assert(key_frame_group_size > 0); show_idx += key_frame_group_size; } diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index b1047eab22..6a347c8b14 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -221,6 +221,19 @@ typedef struct { int last_qindex_of_arf_layer[MAX_ARF_LAYERS]; GF_GROUP gf_group; + + // Vizeir project experimental two pass rate control parameters. + double active_wq_factor; + double base_err_per_mb; + double sr_default_decay_limit; + double sr_diff_part; + double kf_err_per_mb; + double kf_frame_max_boost_first; // Max for first kf in a chunk. 
+ double kf_frame_max_boost_subs; // Max for subsequent mid chunk kfs. + int kf_max_total_boost; + int gf_max_total_boost; + double gf_frame_max_boost; + double zm_power_factor; } TWO_PASS; struct VP9_COMP; @@ -249,8 +262,8 @@ void calculate_coded_size(struct VP9_COMP *cpi, int *scaled_frame_width, struct VP9EncoderConfig; int vp9_get_frames_to_next_key(const struct VP9EncoderConfig *oxcf, const FRAME_INFO *frame_info, - const FIRST_PASS_INFO *first_pass_info, - int kf_show_idx, int min_gf_interval); + const TWO_PASS *const twopass, int kf_show_idx, + int min_gf_interval); #if CONFIG_RATE_CTRL /* Call this function to get info about the next group of pictures. * This function should be called after vp9_create_compressor() when encoding diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index 7437d309e0..0120f90a01 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -204,19 +204,6 @@ typedef struct { int preserve_arf_as_gld; int preserve_next_arf_as_gld; int show_arf_as_gld; - - // Vizeir project experimental rate control parameters. - double active_wq_factor; - double base_err_per_mb; - double sr_default_decay_limit; - double sr_diff_part; - double kf_frame_max_boost_first; // Max for first kf in a chunk. - double kf_frame_max_boost_subs; // Max for subsequent mid chunk kfs. - double kf_max_total_boost; - double kf_err_per_mb; - double gf_frame_max_boost; - double gf_max_total_boost; - double zm_power_factor; } RATE_CONTROL; struct VP9_COMP; From 36013909a5ac97bf6f08e28e9471261ff8e133d3 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Mon, 8 Mar 2021 12:25:31 -0800 Subject: [PATCH 055/926] L2E: let vp9 encoder respect external max frame size constraint Change-Id: Ib926e694d4bc4675af1435a32f6316a587756380 --- vp9/encoder/vp9_encoder.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index ecd15cb011..34646465a6 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4398,6 +4398,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest int frame_over_shoot_limit; int frame_under_shoot_limit; int q = 0, q_low = 0, q_high = 0; + int last_q_attempt = 0; int enable_acl; #ifdef AGGRESSIVE_VBR int qrange_adj = 1; @@ -4413,6 +4414,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest // passed in by the external rate control model. // case: -1, we take VP9's decision for the max frame size. int ext_rc_max_frame_size = 0; + const int orig_rc_max_frame_bandwidth = rc->max_frame_bandwidth; #if CONFIG_RATE_CTRL const FRAME_UPDATE_TYPE update_type = @@ -4580,6 +4582,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest } if (cpi->ext_ratectrl.ready) { + last_q_attempt = q; // In general, for the external rate control, we take the qindex provided // as input and encode the frame with this qindex faithfully. However, // in some extreme scenarios, the provided qindex leads to a massive @@ -4597,6 +4600,10 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest break; } } + rc->max_frame_bandwidth = ext_rc_max_frame_size; + // If the current frame size exceeds the ext_rc_max_frame_size, + // we adjust the worst qindex to meet the frame size constraint. 
+ q_high = 255; ext_rc_recode = 1; } #if CONFIG_RATE_CTRL @@ -4796,6 +4803,23 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest rc->projected_frame_size < rc->max_frame_bandwidth) loop = 0; + // Special handling of external max frame size constraint + if (ext_rc_recode) { + // If the largest q is not able to meet the max frame size limit, + // do nothing. + if (rc->projected_frame_size > ext_rc_max_frame_size && + last_q_attempt == 255) { + break; + } + // If VP9's q selection leads to a smaller q, we force it to use + // a larger q to better approximate the external max frame size + // constraint. + if (rc->projected_frame_size > ext_rc_max_frame_size && + q <= last_q_attempt) { + q = VPXMIN(255, last_q_attempt + 1); + } + } + if (loop) { ++loop_count; ++loop_at_this_size; @@ -4809,6 +4833,8 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest if (loop) restore_coding_context(cpi); } while (loop); + rc->max_frame_bandwidth = orig_rc_max_frame_bandwidth; + #ifdef AGGRESSIVE_VBR if (two_pass_first_group_inter(cpi)) { cpi->twopass.active_worst_quality = From cc3444f01c448f1cf6acdd283d65e7ec5d0a9fdd Mon Sep 17 00:00:00 2001 From: Paul Wilkins Date: Tue, 9 Mar 2021 14:07:48 +0000 Subject: [PATCH 056/926] Vizier: Add defaults for > 1080P Previous code did not have sensible defaults for larger image formats. Added defaults for Vizier RD parameters for sizes > 1080P and changed the first pass parameters for large formats to use the 1080P values. No supplied value for rd_mult_q_sq_key_high_qp case yet so set to old hard wired default value. If the Vizier parameters were enabled the lack of sensible defaults caused a large regression for 2K clips in one of our test sets. Change-Id: I306c0cd76eab00d50880c91fadb5842faf6661ff --- vp9/encoder/vp9_firstpass.c | 14 +------------- vp9/encoder/vp9_rd.c | 28 ++++++++++++++-------------- 2 files changed, 15 insertions(+), 27 deletions(-) diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 6d6aa5087f..7c67efe4b7 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -3567,7 +3567,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; twopass->zm_power_factor = 2.568627575572356; - } else if (screen_area <= 1920 * 1080) { + } else { twopass->active_wq_factor = 31.0; twopass->base_err_per_mb = 34474.723463367416; twopass->sr_default_decay_limit = 0.23346886902707745; @@ -3579,18 +3579,6 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; twopass->zm_power_factor = 5.5776463538431935; - } else { - twopass->active_wq_factor = AV_WQ_FACTOR; - twopass->base_err_per_mb = BASELINE_ERR_PER_MB; - twopass->sr_default_decay_limit = DEFAULT_DECAY_LIMIT; - twopass->sr_diff_part = SR_DIFF_PART; - twopass->gf_frame_max_boost = GF_MAX_FRAME_BOOST; - twopass->gf_max_total_boost = MAX_GF_BOOST; - twopass->kf_err_per_mb = 250.0; - twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; - twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; - twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_power_factor = ZM_POWER_FACTOR; } } } diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index b126d8708d..3b2e0b088b 100644 --- a/vp9/encoder/vp9_rd.c +++ 
b/vp9/encoder/vp9_rd.c @@ -206,6 +206,7 @@ void vp9_init_rd_parameters(VP9_COMP *cpi) { // Make sure this function is floating point safe. vpx_clear_system_state(); + rdc->rd_mult_q_sq_key_high_qp = 7.5; // No defined Vizer values yet if (1) { // Non/pre-Vizer defaults rdc->rd_mult_q_sq_inter_low_qp = 4.0; @@ -214,49 +215,48 @@ void vp9_init_rd_parameters(VP9_COMP *cpi) { rdc->rd_mult_q_sq_key_ultralow_qp = 4.0; rdc->rd_mult_q_sq_key_low_qp = 3.5; rdc->rd_mult_q_sq_key_mid_qp = 4.5; - rdc->rd_mult_q_sq_key_high_qp = 7.5; } else if (screen_area <= 176 * 144) { - rdc->rd_mult_q_sq_inter_high_qp = 4.295745965132044; rdc->rd_mult_q_sq_inter_low_qp = 4.0718581295922025; rdc->rd_mult_q_sq_inter_mid_qp = 4.031435609256739; + rdc->rd_mult_q_sq_inter_high_qp = 4.295745965132044; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.290774097327333; rdc->rd_mult_q_sq_key_low_qp = 5.7037775720838155; rdc->rd_mult_q_sq_key_mid_qp = 4.72424015517201; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.290774097327333; } else if (screen_area <= 320 * 240) { - rdc->rd_mult_q_sq_inter_high_qp = 4.388244213131458; rdc->rd_mult_q_sq_inter_low_qp = 4.506676356706102; rdc->rd_mult_q_sq_inter_mid_qp = 4.489349899621181; + rdc->rd_mult_q_sq_inter_high_qp = 4.388244213131458; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.217074424696166; rdc->rd_mult_q_sq_key_low_qp = 4.497000582319771; rdc->rd_mult_q_sq_key_mid_qp = 4.2825894884789735; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.217074424696166; } else if (screen_area <= 640 * 360) { - rdc->rd_mult_q_sq_inter_high_qp = 4.3702861603380025; rdc->rd_mult_q_sq_inter_low_qp = 4.730644123689013; rdc->rd_mult_q_sq_inter_mid_qp = 4.314589509578551; + rdc->rd_mult_q_sq_inter_high_qp = 4.3702861603380025; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.576902541873747; rdc->rd_mult_q_sq_key_low_qp = 6.068652999601526; rdc->rd_mult_q_sq_key_mid_qp = 4.817707474077241; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.576902541873747; } else if (screen_area <= 854 * 480) { - rdc->rd_mult_q_sq_inter_high_qp = 3.969083125219539; rdc->rd_mult_q_sq_inter_low_qp = 4.811470143416073; rdc->rd_mult_q_sq_inter_mid_qp = 4.621618127750201; + rdc->rd_mult_q_sq_inter_high_qp = 3.969083125219539; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.9854544277222566; rdc->rd_mult_q_sq_key_low_qp = 5.073157238799473; rdc->rd_mult_q_sq_key_mid_qp = 5.7587672849242635; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.9854544277222566; } else if (screen_area <= 1280 * 720) { - rdc->rd_mult_q_sq_inter_high_qp = 4.410712348825541; rdc->rd_mult_q_sq_inter_low_qp = 5.119381136011107; rdc->rd_mult_q_sq_inter_mid_qp = 4.518613675766538; + rdc->rd_mult_q_sq_inter_high_qp = 4.410712348825541; + rdc->rd_mult_q_sq_key_ultralow_qp = 3.9468491666607326; rdc->rd_mult_q_sq_key_low_qp = 5.848703119971484; rdc->rd_mult_q_sq_key_mid_qp = 5.368947246228739; - rdc->rd_mult_q_sq_key_ultralow_qp = 3.9468491666607326; - } else if (screen_area <= 1920 * 1080) { - rdc->rd_mult_q_sq_inter_high_qp = 3.2141187537667797; + } else { rdc->rd_mult_q_sq_inter_low_qp = 6.00569815296199; rdc->rd_mult_q_sq_inter_mid_qp = 3.932565684947023; + rdc->rd_mult_q_sq_inter_high_qp = 3.2141187537667797; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.399795006320089; rdc->rd_mult_q_sq_key_low_qp = 10.582906599488298; rdc->rd_mult_q_sq_key_mid_qp = 6.274162346360692; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.399795006320089; } } From 8851ed5787b6e30b1f212b171da6cbaa9778a799 Mon Sep 17 00:00:00 2001 From: Paul Wilkins Date: Tue, 9 Mar 2021 14:47:25 +0000 Subject: [PATCH 057/926] Vizier: Add in field for min kf frame 
boost. Added kf_frame_min_boost field to hold the minimum per frame boost in key frame boost calculations. Replaces hard wired value. To be used in conjunction with and tied to the maximum value. Change-Id: I67a39ecb3f21b5918512a5ccd9a1b214d7971e45 --- vp9/encoder/vp9_firstpass.c | 12 +++++++++++- vp9/encoder/vp9_firstpass.h | 1 + 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 7c67efe4b7..d1044d7815 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -65,9 +65,11 @@ #define GF_MAX_FRAME_BOOST 96.0 #ifdef AGGRESSIVE_VBR +#define KF_MIN_FRAME_BOOST 40.0 #define KF_MAX_FRAME_BOOST 80.0 #define MAX_KF_TOT_BOOST 4800 #else +#define KF_MIN_FRAME_BOOST 40.0 #define KF_MAX_FRAME_BOOST 96.0 #define MAX_KF_TOT_BOOST 5400 #endif @@ -2017,7 +2019,8 @@ static double calc_kf_frame_boost(VP9_COMP *cpi, // The 40.0 value here is an experimentally derived baseline minimum. // This value is in line with the minimum per frame boost in the alt_ref // boost calculation. - frame_boost = ((frame_boost + 40.0) * boost_q_correction); + frame_boost = + ((frame_boost + twopass->kf_frame_min_boost) * boost_q_correction); // Maximum allowed boost this frame. May be different for first vs subsequent // key frames. @@ -3500,6 +3503,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { } else { twopass->kf_err_per_mb = 250.0; } + twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; @@ -3515,6 +3519,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->gf_frame_max_boost = 87.27362648627846; twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 1854.8255436877148; + twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; @@ -3527,6 +3532,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->gf_frame_max_boost = 127.34978204980285; twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 723.8337508755031; + twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; @@ -3539,6 +3545,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->gf_frame_max_boost = 75.17672317013668; twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 422.2871502380377; + twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; @@ -3551,6 +3558,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->gf_frame_max_boost = 85.2868528581522; twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 1513.4883914008383; + twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; @@ -3563,6 +3571,7 @@ static void 
init_vizier_params(TWO_PASS *const twopass, int screen_area) {
     twopass->gf_frame_max_boost = GF_MAX_FRAME_BOOST;
     twopass->gf_max_total_boost = MAX_GF_BOOST;
     twopass->kf_err_per_mb = 998.6342911785146;
+    twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST;
     twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST;
     twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first;
     twopass->kf_max_total_boost = MAX_KF_TOT_BOOST;
@@ -3575,6 +3584,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) {
     twopass->gf_frame_max_boost = 81.00472969483079;
     twopass->gf_max_total_boost = MAX_GF_BOOST;
     twopass->kf_err_per_mb = 35931.25734431429;
+    twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST;
     twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST;
     twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first;
     twopass->kf_max_total_boost = MAX_KF_TOT_BOOST;
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 6a347c8b14..9613f57fb9 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -228,6 +228,7 @@ typedef struct {
   double sr_default_decay_limit;
   double sr_diff_part;
   double kf_err_per_mb;
+  double kf_frame_min_boost;
   double kf_frame_max_boost_first;  // Max for first kf in a chunk.
   double kf_frame_max_boost_subs;   // Max for subsequent mid chunk kfs.
   int kf_max_total_boost;

From cbc4ead58691c899262513025a83f03f5932e50e Mon Sep 17 00:00:00 2001
From: Paul Wilkins
Date: Tue, 9 Mar 2021 15:11:41 +0000
Subject: [PATCH 058/926] Vizier: Added in experimental max KF boost values.

Added the experimental max per frame KF boost values derived
from the Vizier experiments.

These are still all off by default.

When enabled I expect these to cause significant regression as they
fluctuate wildly and in a way that makes no sense from format to
format. I suspect these values reflect overfitting, perhaps from a
subset of training clips with more frequent mid-chunk key frames
and/or short key frame groups.

Also fixed an incorrect value for gf boost for one format.

Experiment to moderate these values and use different values for
first and subsequent KF groups to follow.
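For orientation, the per frame maxima set by this patch feed the clamp
wired up in calc_kf_frame_boost() by the earlier patches in this series.
A minimal sketch of that clamp follows; the standalone helper name is
illustrative only and not part of the patch:

  /* Illustrative sketch only: condensed from calc_kf_frame_boost() as
   * changed above; not code from this patch series. */
  static double clamp_kf_frame_boost(const TWO_PASS *twopass,
                                     double raw_boost, double q_correction,
                                     double zm_factor, int is_first_key_frame) {
    /* Experimentally derived per frame minimum, then Q correction. */
    double boost = (raw_boost + twopass->kf_frame_min_boost) * q_correction;
    /* The cap may differ for the first key frame in a chunk vs subsequent
     * mid chunk key frames; both scale with the zero motion factor. */
    double max_boost = is_first_key_frame ? twopass->kf_frame_max_boost_first
                                          : twopass->kf_frame_max_boost_subs;
    max_boost *= zm_factor * q_correction;
    return VPXMIN(boost, max_boost);
  }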
Change-Id: Ibeb4268957f2edacdb4549d74930255a22a2fcc5 --- vp9/encoder/vp9_firstpass.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index d1044d7815..8c771c7ffc 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -3520,7 +3520,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 1854.8255436877148; twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; - twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_first = 25.5; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; twopass->zm_power_factor = 2.93715229184991; @@ -3533,7 +3533,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 723.8337508755031; twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; - twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_first = 185.0; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; twopass->zm_power_factor = 3.5299221493593413; @@ -3546,7 +3546,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 422.2871502380377; twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; - twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_first = 224.5; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; twopass->zm_power_factor = 2.265742666649307; @@ -3559,7 +3559,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 1513.4883914008383; twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; - twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_first = 28.0; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; twopass->zm_power_factor = 3.552278528517416; @@ -3568,11 +3568,11 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->base_err_per_mb = 29527.46375825401; twopass->sr_default_decay_limit = 0.5009117586299728; twopass->sr_diff_part = 0.005007364627260114; - twopass->gf_frame_max_boost = GF_MAX_FRAME_BOOST; + twopass->gf_frame_max_boost = 81.00472969483079; twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 998.6342911785146; twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; - twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_first = 53.0; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; twopass->zm_power_factor = 2.568627575572356; @@ -3581,11 +3581,11 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->base_err_per_mb = 34474.723463367416; twopass->sr_default_decay_limit = 0.23346886902707745; twopass->sr_diff_part = 0.011431716637966029; - twopass->gf_frame_max_boost = 81.00472969483079; + twopass->gf_frame_max_boost = 213.2940230360479; twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 35931.25734431429; twopass->kf_frame_min_boost 
= KF_MIN_FRAME_BOOST; - twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_first = 419.5; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; twopass->zm_power_factor = 5.5776463538431935; From 24b43c4ea5c2b54e32b107921cb06e89a71f916e Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 8 Mar 2021 16:07:02 -0800 Subject: [PATCH 059/926] Prepare for v1.10.0 release. Update CHANGELOG, AUTHORS, README, libs.mk Bug: webm:1712 Change-Id: Ic99de12b91a92c32f8a9485dcb759c48bc3eccd6 --- .mailmap | 2 ++ AUTHORS | 5 +++++ CHANGELOG | 30 ++++++++++++++++++++++++++++++ README | 17 +++++++++-------- libs.mk | 14 +++++++++++++- 5 files changed, 59 insertions(+), 9 deletions(-) diff --git a/.mailmap b/.mailmap index 8d97500994..376ca83ae3 100644 --- a/.mailmap +++ b/.mailmap @@ -12,6 +12,8 @@ Deb Mukherjee Elliott Karpilovsky Erik Niemeyer Fyodor Kyslov +Gregor Jasny +Gregor Jasny Guillaume Martres Hangyu Kuang Hui Su diff --git a/AUTHORS b/AUTHORS index 352c91feda..e804842f78 100644 --- a/AUTHORS +++ b/AUTHORS @@ -37,6 +37,7 @@ Christian Duvivier Clement Courbet Daniele Castagna Daniel Kang +Daniel Sommermann Dan Zhu Deb Mukherjee Deepa K G @@ -73,6 +74,7 @@ Ivan Maltz Jacek Caban Jacky Chen James Berry +James Touton James Yu James Zern Jan Gerber @@ -82,11 +84,14 @@ Jean-Yves Avenard Jeff Faust Jeff Muizelaar Jeff Petkau +Jeremy Leconte Jerome Jiang Jia Jia Jian Zhou Jim Bankoski +jinbo Jingning Han +Joel Fernandes Joey Parrish Johann Koenig John Koleszar diff --git a/CHANGELOG b/CHANGELOG index e731fc6121..6338caa380 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,33 @@ +2021-03-09 v1.10.0 "Ruddy Duck" + This maintenance release adds support for darwin20 and new codec controls, as + well as numerous bug fixes. + + - Upgrading: + New codec control is added to disable loopfilter for VP9. + + New encoder control is added to disable feature to increase Q on overshoot + detection for CBR. + + Configure support for darwin20 is added. + + New codec control is added for VP9 rate control. The control ID of this + interface is VP9E_SET_EXTERNAL_RATE_CONTROL. To make VP9 use a customized + external rate control model, users will have to implement each callback + function in vpx_rc_funcs_t and register them using libvpx API + vpx_codec_control_() with the control ID. + + - Enhancement: + Use -std=gnu++11 instead of -std=c++11 for c++ files. + + - Bug fixes: + Override assembler with --as option of configure for MSVS. + Fix several compilation issues with gcc 4.8.5. + Fix to resetting rate control for temporal layers. + Fix to the rate control stats of SVC example encoder when number of spatial + layers is 1. + Fix to reusing motion vectors from the base spatial layer in SVC. + 2 pass related flags removed from SVC example encoder. + 2020-07-29 v1.9.0 "Quacking Duck" This release adds support for NV12, a separate library for rate control, as well as incremental improvements. diff --git a/README b/README index 62fef34593..ddbcb9f695 100644 --- a/README +++ b/README @@ -1,4 +1,4 @@ -README - 20 July 2020 +README - 08 March 2021 Welcome to the WebM VP8/VP9 Codec SDK! @@ -10,14 +10,14 @@ COMPILING THE APPLICATIONS/LIBRARIES: 1. Prerequisites * All x86 targets require the Yasm[1] assembler be installed[2]. - * All Windows builds require that Cygwin[3] be installed. - * Building the documentation requires Doxygen[4]. If you do not + * All Windows builds require that Cygwin[3] or MSYS2[4] be installed. 
+  * Building the documentation requires Doxygen[5]. If you do not
     have this package, the install-docs option will be disabled.
-  * Downloading the data for the unit tests requires curl[5] and sha1sum.
+  * Downloading the data for the unit tests requires curl[6] and sha1sum.
     sha1sum is provided via the GNU coreutils, installed by default on
     many *nix platforms, as well as MinGW and Cygwin. If coreutils is not
     available, a compatible version of sha1sum can be built from
-    source[6]. These requirements are optional if not running the unit
+    source[7]. These requirements are optional if not running the unit
     tests.

   [1]: http://www.tortall.net/projects/yasm
@@ -26,9 +26,10 @@ COMPILING THE APPLICATIONS/LIBRARIES:
        yasm-<version>-<arch>.exe to yasm.exe and place it in:
        Program Files (x86)/Microsoft Visual Studio/2017/<version>/Common7/Tools/
   [3]: http://www.cygwin.com
-  [4]: http://www.doxygen.org
-  [5]: http://curl.haxx.se
-  [6]: http://www.microbrew.org/tools/md5sha1sum/
+  [4]: http://www.msys2.org/
+  [5]: http://www.doxygen.org
+  [6]: http://curl.haxx.se
+  [7]: http://www.microbrew.org/tools/md5sha1sum/

 2. Out-of-tree builds
   Out of tree builds are a supported method of building the application. For
diff --git a/libs.mk b/libs.mk
index cabd4ed141..d05eee966d 100644
--- a/libs.mk
+++ b/libs.mk
@@ -287,8 +287,20 @@ OBJS-yes += $(LIBVPX_OBJS)
 LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
 $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
+# Updating version info.
+# https://www.gnu.org/software/libtool/manual/libtool.html#Updating-version-info
+# For libtool: c=<current>, a=<age>, r=<revision>
+# libtool generates .so file as .so.[c-a].a.r, while -version-info c:r:a is
+# passed to libtool.
+#
+# libvpx library file is generated as libvpx.so.<MAJOR>.<MINOR>.<PATCH>
+# MAJOR = c-a, MINOR = a, PATCH = r
+#
+# To determine SO_VERSION_{MAJOR,MINOR,PATCH}, calculate c,a,r with current
+# SO_VERSION_* then follow the rules in the link to determine the new version
+# (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1
 SO_VERSION_MAJOR := 6
-SO_VERSION_MINOR := 3
+SO_VERSION_MINOR := 4
 SO_VERSION_PATCH := 0
 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
 LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib

From 973726c38bb0fd64fb14b1c95caabe52bd39d17d Mon Sep 17 00:00:00 2001
From: Marco Paniconi
Date: Fri, 12 Mar 2021 09:54:03 -0800
Subject: [PATCH 060/926] vp9-rtc: Add postencode_drop control to sample encoder

Change-Id: I1c989f26b0a7b9239adf37df8d96776f33b89a8b
---
 examples/vpx_temporal_svc_encoder.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c
index 04212e5d7d..ad3e79c713 100644
--- a/examples/vpx_temporal_svc_encoder.c
+++ b/examples/vpx_temporal_svc_encoder.c
@@ -831,6 +831,7 @@ int main(int argc, char **argv) {
   } else if (strncmp(encoder->name, "vp9", 3) == 0) {
     vpx_svc_extra_cfg_t svc_params;
     memset(&svc_params, 0, sizeof(svc_params));
+    vpx_codec_control(&codec, VP9E_SET_POSTENCODE_DROP, 0);
     vpx_codec_control(&codec, VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, 0);
     vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed);
     vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);

From 04086a30664d2a3e89d6a6e4e1c18f1a82c8f958 Mon Sep 17 00:00:00 2001
From: "Adam B.
Goode" Date: Wed, 17 Mar 2021 14:11:57 -0500 Subject: [PATCH 061/926] Msvc builds convert to windows path w/msys env Bug: webm:1720 Change-Id: I56689ad408f8086c511e1711dfa9c8d404727b2e --- build/make/msvs_common.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/build/make/msvs_common.sh b/build/make/msvs_common.sh index 27ddf7fd91..3989fec0d5 100644 --- a/build/make/msvs_common.sh +++ b/build/make/msvs_common.sh @@ -9,7 +9,8 @@ ## be found in the AUTHORS file in the root of the source tree. ## -if [ "$(uname -o 2>/dev/null)" = "Cygwin" ] \ +shell_name="$(uname -o 2>/dev/null)" +if [[ "$shell_name" = "Cygwin" || "$shell_name" = "Msys" ]] \ && cygpath --help >/dev/null 2>&1; then FIXPATH='cygpath -m' else From b5e754a840511c9956c033561955745a184495cb Mon Sep 17 00:00:00 2001 From: Paul Wilkins Date: Wed, 10 Mar 2021 14:39:43 +0000 Subject: [PATCH 062/926] Change SR_diff calculation and representation This patch changes the way prediction decay is calculated. We expect that frames that are further from an ALT-REF frame (or Golden Frame) will be less well predicted by that ALT-REF frame. As such it is desirable that they should contribute less to the boost calculation used to assign bits to the ALT_REF. This code looks at the reduction in prediction quality between the last frame and the second reference frame (usually two frames old). We make the assumption that we can accumulate this to get a proxy for the likely loss of prediction quality over multiple frames. Previously the calculation looked at the absolute difference in the coded errors. The issue here is that the meaning of a unit difference is not the same for very complex frames as it is for easy frames. In this patch we scale the decay value based on how the error difference compares to the overall frame complexity as represented by the intra coding error. This was tuned experimentally to give test results that were approximately neutral for our various test sets. There was a slight drop in Overall PSNR but a consistent improvement in SSIM. This balance may be improved with tuning further as it is noteworthy that it was much better on the hd_res set. Results (Overall PSNR, SSIM -ve better) for low_res, ugc360, midres2, ugc480P and hd_res are as follows: 0.173 -0.688 0.118 -0.153 0.132 -0.239 0.261 -0.405 -0.305 -1.109 As part of this adjustment the contribution of motion amplitude was removed. This patch also changes the control mechanism that will be exposed on the command line for use by the Vizier project. The control is now a linear factor which defaults to 1.0, where values < 1.0 mean a lower decay rate and values > 1.0 mean an increased decay rate. This presents a more easily understandable interface for use in optimizing the decay behavior for various formats, where it is clear what a passed in value means relative to the default. With the new decay mechanism the current values for various formats are almost certainly wrong and we still need to define sensible upper and lower bounds for use during future training. 
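In outline, the reworked decay normalizes the second reference error
difference by the frame's intra error before applying the new linear
factor, then clamps at the default decay limit. A minimal sketch,
condensed from the get_sr_decay_rate() change in the diff below; the
standalone helper name is illustrative only:

  /* Illustrative sketch only: condensed from the get_sr_decay_rate()
   * change below; not code from this patch. */
  static double sr_decay_sketch(const TWO_PASS *twopass,
                                const FIRSTPASS_STATS *frame,
                                double modified_pcnt_intra) {
    const double sr_diff = frame->sr_coded_error - frame->coded_error;
    /* Scale the second ref error difference by the frame's intra error so a
     * unit of difference means the same thing for easy and complex frames. */
    const double sr_diff_part =
        twopass->sr_diff_factor * ((sr_diff * 0.25) / frame->intra_error);
    /* INTRA_PART is the existing constant in vp9_firstpass.c; as in the real
     * code this path only applies when sr_diff > LOW_SR_DIFF_TRHESH. */
    const double sr_decay =
        1.0 - sr_diff_part - (INTRA_PART * modified_pcnt_intra);
    return VPXMAX(sr_decay, twopass->sr_default_decay_limit);
  }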
Change-Id: Ib1074bbea97c725cdbf25772ee8ed66831461ce3
---
 vp9/encoder/vp9_firstpass.c | 90 ++++++++++++++++-------------------
 vp9/encoder/vp9_firstpass.h |  4 +-
 vp9/encoder/vp9_rd.c        | 94 +++++++++++++++++++------------------
 3 files changed, 91 insertions(+), 97 deletions(-)

diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 8c771c7ffc..a43099e946 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -58,7 +58,6 @@
 #define INTRA_PART 0.005
 #define DEFAULT_DECAY_LIMIT 0.75
 #define LOW_SR_DIFF_TRHESH 0.1
-#define SR_DIFF_MAX 128.0
 #define LOW_CODED_ERR_PER_MB 10.0
 #define NCOUNT_FRAME_II_THRESH 6.0
 #define BASELINE_ERR_PER_MB 12500.0
@@ -1833,17 +1832,21 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
   twopass->arnr_strength_adjustment = 0;
 }

-static double get_sr_decay_rate(const FRAME_INFO *frame_info,
-                                const TWO_PASS *const twopass,
+/* This function considers how the quality of prediction may be deteriorating
+ * with distance. It compares the coded error for the last frame and the
+ * second reference frame (usually two frames old) and also applies a factor
+ * based on the extent of INTRA coding.
+ *
+ * The decay factor is then used to reduce the contribution of frames further
+ * from the alt-ref or golden frame, to the boost calculation for that
+ * alt-ref or golden frame.
+ */
+static double get_sr_decay_rate(const TWO_PASS *const twopass,
                                 const FIRSTPASS_STATS *frame) {
   double sr_diff = (frame->sr_coded_error - frame->coded_error);
   double sr_decay = 1.0;
   double modified_pct_inter;
   double modified_pcnt_intra;
-  const double motion_amplitude_part =
-      frame->pcnt_motion *
-      ((frame->mvc_abs + frame->mvr_abs) /
-       (frame_info->frame_height + frame_info->frame_width));

   modified_pct_inter = frame->pcnt_inter;
   if ((frame->coded_error > LOW_CODED_ERR_PER_MB) &&
@@ -1855,29 +1858,26 @@ static double get_sr_decay_rate(const FRAME_INFO *frame_info,
   modified_pcnt_intra = 100 * (1.0 - modified_pct_inter);

   if ((sr_diff > LOW_SR_DIFF_TRHESH)) {
-    sr_diff = VPXMIN(sr_diff, SR_DIFF_MAX);
-    sr_decay = 1.0 - (twopass->sr_diff_part * sr_diff) - motion_amplitude_part -
-               (INTRA_PART * modified_pcnt_intra);
+    double sr_diff_part =
+        twopass->sr_diff_factor * ((sr_diff * 0.25) / frame->intra_error);
+    sr_decay = 1.0 - sr_diff_part - (INTRA_PART * modified_pcnt_intra);
   }
   return VPXMAX(sr_decay, twopass->sr_default_decay_limit);
 }

 // This function gives an estimate of how badly we believe the prediction
 // quality is decaying from frame to frame.
-static double get_zero_motion_factor(const FRAME_INFO *frame_info, - const TWO_PASS *const twopass, +static double get_zero_motion_factor(const TWO_PASS *const twopass, const FIRSTPASS_STATS *frame_stats) { const double zero_motion_pct = frame_stats->pcnt_inter - frame_stats->pcnt_motion; - double sr_decay = get_sr_decay_rate(frame_info, twopass, frame_stats); + double sr_decay = get_sr_decay_rate(twopass, frame_stats); return VPXMIN(sr_decay, zero_motion_pct); } -static double get_prediction_decay_rate(const FRAME_INFO *frame_info, - const TWO_PASS *const twopass, +static double get_prediction_decay_rate(const TWO_PASS *const twopass, const FIRSTPASS_STATS *frame_stats) { - const double sr_decay_rate = - get_sr_decay_rate(frame_info, twopass, frame_stats); + const double sr_decay_rate = get_sr_decay_rate(twopass, frame_stats); const double zero_motion_factor = (0.95 * pow((frame_stats->pcnt_inter - frame_stats->pcnt_motion), twopass->zm_power_factor)); @@ -2066,8 +2066,7 @@ static int compute_arf_boost(const FRAME_INFO *frame_info, // Accumulate the effect of prediction quality decay. if (!flash_detected) { - decay_accumulator *= - get_prediction_decay_rate(frame_info, twopass, this_frame); + decay_accumulator *= get_prediction_decay_rate(twopass, this_frame); decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR ? MIN_DECAY_FACTOR : decay_accumulator; @@ -2107,8 +2106,7 @@ static int compute_arf_boost(const FRAME_INFO *frame_info, // Cumulative effect of prediction quality decay. if (!flash_detected) { - decay_accumulator *= - get_prediction_decay_rate(frame_info, twopass, this_frame); + decay_accumulator *= get_prediction_decay_rate(twopass, this_frame); decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR ? MIN_DECAY_FACTOR : decay_accumulator; @@ -2606,16 +2604,14 @@ static int get_gop_coding_frame_num( // Monitor for static sections. if ((rc->frames_since_key + gop_coding_frames - 1) > 1) { - zero_motion_accumulator = - VPXMIN(zero_motion_accumulator, - get_zero_motion_factor(frame_info, twopass, next_frame)); + zero_motion_accumulator = VPXMIN( + zero_motion_accumulator, get_zero_motion_factor(twopass, next_frame)); } // Accumulate the effect of prediction quality decay. if (!flash_detected) { double last_loop_decay_rate = loop_decay_rate; - loop_decay_rate = - get_prediction_decay_rate(frame_info, twopass, next_frame); + loop_decay_rate = get_prediction_decay_rate(twopass, next_frame); // Break clause to detect very still sections after motion. For example, // a static image after a fade or other transition. @@ -3181,7 +3177,6 @@ static int test_candidate_kf(const FIRST_PASS_INFO *first_pass_info, #define KF_ABS_ZOOM_THRESH 6.0 int vp9_get_frames_to_next_key(const VP9EncoderConfig *oxcf, - const FRAME_INFO *frame_info, const TWO_PASS *const twopass, int kf_show_idx, int min_gf_interval) { const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; @@ -3211,8 +3206,7 @@ int vp9_get_frames_to_next_key(const VP9EncoderConfig *oxcf, break; // How fast is the prediction quality decaying? - loop_decay_rate = - get_prediction_decay_rate(frame_info, twopass, next_frame); + loop_decay_rate = get_prediction_decay_rate(twopass, next_frame); // We want to know something about the recent past... 
rather than // as used elsewhere where we are concerned with decay in prediction @@ -3298,8 +3292,8 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) { kf_mod_err = calc_norm_frame_score(oxcf, frame_info, keyframe_stats, mean_mod_score, av_err); - rc->frames_to_key = vp9_get_frames_to_next_key( - oxcf, frame_info, twopass, kf_show_idx, rc->min_gf_interval); + rc->frames_to_key = vp9_get_frames_to_next_key(oxcf, twopass, kf_show_idx, + rc->min_gf_interval); // If there is a max kf interval set by the user we must obey it. // We already breakout of the loop above at 2x max. @@ -3379,9 +3373,9 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) { // Monitor for static sections. // First frame in kf group the second ref indicator is invalid. if (i > 0) { - zero_motion_accumulator = VPXMIN( - zero_motion_accumulator, - get_zero_motion_factor(&cpi->frame_info, twopass, &next_frame)); + zero_motion_accumulator = + VPXMIN(zero_motion_accumulator, + get_zero_motion_factor(twopass, &next_frame)); } else { zero_motion_accumulator = next_frame.pcnt_inter - next_frame.pcnt_motion; @@ -3493,7 +3487,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->active_wq_factor = AV_WQ_FACTOR; twopass->base_err_per_mb = BASELINE_ERR_PER_MB; twopass->sr_default_decay_limit = DEFAULT_DECAY_LIMIT; - twopass->sr_diff_part = SR_DIFF_PART; + twopass->sr_diff_factor = 1.0; twopass->gf_frame_max_boost = GF_MAX_FRAME_BOOST; twopass->gf_max_total_boost = MAX_GF_BOOST; if (screen_area < 1280 * 720) { @@ -3515,7 +3509,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->active_wq_factor = 46.0; twopass->base_err_per_mb = 37597.399760969536; twopass->sr_default_decay_limit = 0.3905639800962774; - twopass->sr_diff_part = 0.009599023654146284; + twopass->sr_diff_factor = 6.4; twopass->gf_frame_max_boost = 87.27362648627846; twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 1854.8255436877148; @@ -3528,7 +3522,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->active_wq_factor = 55.0; twopass->base_err_per_mb = 34525.33177195309; twopass->sr_default_decay_limit = 0.23901360046804604; - twopass->sr_diff_part = 0.008581014394766773; + twopass->sr_diff_factor = 5.73; twopass->gf_frame_max_boost = 127.34978204980285; twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 723.8337508755031; @@ -3541,7 +3535,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->active_wq_factor = 12.5; twopass->base_err_per_mb = 18823.978018028298; twopass->sr_default_decay_limit = 0.6043527690301296; - twopass->sr_diff_part = 0.00343296783885544; + twopass->sr_diff_factor = 2.28; twopass->gf_frame_max_boost = 75.17672317013668; twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 422.2871502380377; @@ -3554,7 +3548,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->active_wq_factor = 51.5; twopass->base_err_per_mb = 33718.98307662595; twopass->sr_default_decay_limit = 0.33633414970713393; - twopass->sr_diff_part = 0.00868988716928333; + twopass->sr_diff_factor = 5.8; twopass->gf_frame_max_boost = 85.2868528581522; twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 1513.4883914008383; @@ -3567,7 +3561,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->active_wq_factor = 41.5; twopass->base_err_per_mb = 29527.46375825401; 
twopass->sr_default_decay_limit = 0.5009117586299728; - twopass->sr_diff_part = 0.005007364627260114; + twopass->sr_diff_factor = 3.33; twopass->gf_frame_max_boost = 81.00472969483079; twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 998.6342911785146; @@ -3580,7 +3574,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->active_wq_factor = 31.0; twopass->base_err_per_mb = 34474.723463367416; twopass->sr_default_decay_limit = 0.23346886902707745; - twopass->sr_diff_part = 0.011431716637966029; + twopass->sr_diff_factor = 7.6; twopass->gf_frame_max_boost = 213.2940230360479; twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 35931.25734431429; @@ -3873,9 +3867,8 @@ void vp9_get_next_group_of_picture(const VP9_COMP *cpi, int *first_is_key_frame, *first_is_key_frame = 0; if (rc.frames_to_key == 0) { - rc.frames_to_key = - vp9_get_frames_to_next_key(&cpi->oxcf, &cpi->frame_info, twopass, - *first_show_idx, rc.min_gf_interval); + rc.frames_to_key = vp9_get_frames_to_next_key( + &cpi->oxcf, twopass, *first_show_idx, rc.min_gf_interval); rc.frames_since_key = 0; *first_is_key_frame = 1; } @@ -3939,8 +3932,8 @@ int vp9_get_coding_frame_num(const VP9EncoderConfig *oxcf, int use_alt_ref; int first_is_key_frame = 0; if (rc.frames_to_key == 0) { - rc.frames_to_key = vp9_get_frames_to_next_key( - oxcf, frame_info, twopass, show_idx, rc.min_gf_interval); + rc.frames_to_key = vp9_get_frames_to_next_key(oxcf, twopass, show_idx, + rc.min_gf_interval); rc.frames_since_key = 0; first_is_key_frame = 1; } @@ -3961,7 +3954,6 @@ int vp9_get_coding_frame_num(const VP9EncoderConfig *oxcf, } void vp9_get_key_frame_map(const VP9EncoderConfig *oxcf, - const FRAME_INFO *frame_info, const TWO_PASS *const twopass, int *key_frame_map) { int show_idx = 0; RATE_CONTROL rc; @@ -3975,8 +3967,8 @@ void vp9_get_key_frame_map(const VP9EncoderConfig *oxcf, while (show_idx < first_pass_info->num_frames) { int key_frame_group_size; key_frame_map[show_idx] = 1; - key_frame_group_size = vp9_get_frames_to_next_key( - oxcf, frame_info, twopass, show_idx, rc.min_gf_interval); + key_frame_group_size = + vp9_get_frames_to_next_key(oxcf, twopass, show_idx, rc.min_gf_interval); assert(key_frame_group_size > 0); show_idx += key_frame_group_size; } diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index 9613f57fb9..624fccd428 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -226,7 +226,7 @@ typedef struct { double active_wq_factor; double base_err_per_mb; double sr_default_decay_limit; - double sr_diff_part; + double sr_diff_factor; double kf_err_per_mb; double kf_frame_min_boost; double kf_frame_max_boost_first; // Max for first kf in a chunk. @@ -262,7 +262,6 @@ void calculate_coded_size(struct VP9_COMP *cpi, int *scaled_frame_width, struct VP9EncoderConfig; int vp9_get_frames_to_next_key(const struct VP9EncoderConfig *oxcf, - const FRAME_INFO *frame_info, const TWO_PASS *const twopass, int kf_show_idx, int min_gf_interval); #if CONFIG_RATE_CTRL @@ -311,7 +310,6 @@ int vp9_get_coding_frame_num(const struct VP9EncoderConfig *oxcf, * number of show frames in the video. 
*/ void vp9_get_key_frame_map(const struct VP9EncoderConfig *oxcf, - const FRAME_INFO *frame_info, const FIRST_PASS_INFO *first_pass_info, int *key_frame_map); #endif // CONFIG_RATE_CTRL diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 3b2e0b088b..d5d668f964 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -201,62 +201,66 @@ static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128, // Later this function will use passed in command line values. void vp9_init_rd_parameters(VP9_COMP *cpi) { RD_CONTROL *const rdc = &cpi->rd_ctrl; - unsigned int screen_area = (cpi->common.width * cpi->common.height); // Make sure this function is floating point safe. vpx_clear_system_state(); rdc->rd_mult_q_sq_key_high_qp = 7.5; // No defined Vizer values yet - if (1) { - // Non/pre-Vizer defaults + + if (0) { + unsigned int screen_area = (cpi->common.width * cpi->common.height); + + if (screen_area <= 176 * 144) { + rdc->rd_mult_q_sq_inter_low_qp = 4.0718581295922025; + rdc->rd_mult_q_sq_inter_mid_qp = 4.031435609256739; + rdc->rd_mult_q_sq_inter_high_qp = 4.295745965132044; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.290774097327333; + rdc->rd_mult_q_sq_key_low_qp = 5.7037775720838155; + rdc->rd_mult_q_sq_key_mid_qp = 4.72424015517201; + } else if (screen_area <= 320 * 240) { + rdc->rd_mult_q_sq_inter_low_qp = 4.506676356706102; + rdc->rd_mult_q_sq_inter_mid_qp = 4.489349899621181; + rdc->rd_mult_q_sq_inter_high_qp = 4.388244213131458; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.217074424696166; + rdc->rd_mult_q_sq_key_low_qp = 4.497000582319771; + rdc->rd_mult_q_sq_key_mid_qp = 4.2825894884789735; + } else if (screen_area <= 640 * 360) { + rdc->rd_mult_q_sq_inter_low_qp = 4.730644123689013; + rdc->rd_mult_q_sq_inter_mid_qp = 4.314589509578551; + rdc->rd_mult_q_sq_inter_high_qp = 4.3702861603380025; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.576902541873747; + rdc->rd_mult_q_sq_key_low_qp = 6.068652999601526; + rdc->rd_mult_q_sq_key_mid_qp = 4.817707474077241; + } else if (screen_area <= 854 * 480) { + rdc->rd_mult_q_sq_inter_low_qp = 4.811470143416073; + rdc->rd_mult_q_sq_inter_mid_qp = 4.621618127750201; + rdc->rd_mult_q_sq_inter_high_qp = 3.969083125219539; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.9854544277222566; + rdc->rd_mult_q_sq_key_low_qp = 5.073157238799473; + rdc->rd_mult_q_sq_key_mid_qp = 5.7587672849242635; + } else if (screen_area <= 1280 * 720) { + rdc->rd_mult_q_sq_inter_low_qp = 5.119381136011107; + rdc->rd_mult_q_sq_inter_mid_qp = 4.518613675766538; + rdc->rd_mult_q_sq_inter_high_qp = 4.410712348825541; + rdc->rd_mult_q_sq_key_ultralow_qp = 3.9468491666607326; + rdc->rd_mult_q_sq_key_low_qp = 5.848703119971484; + rdc->rd_mult_q_sq_key_mid_qp = 5.368947246228739; + } else { + rdc->rd_mult_q_sq_inter_low_qp = 6.00569815296199; + rdc->rd_mult_q_sq_inter_mid_qp = 3.932565684947023; + rdc->rd_mult_q_sq_inter_high_qp = 3.2141187537667797; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.399795006320089; + rdc->rd_mult_q_sq_key_low_qp = 10.582906599488298; + rdc->rd_mult_q_sq_key_mid_qp = 6.274162346360692; + } + } else { + // For now force defaults unless testing rdc->rd_mult_q_sq_inter_low_qp = 4.0; rdc->rd_mult_q_sq_inter_mid_qp = 4.5; rdc->rd_mult_q_sq_inter_high_qp = 3.0; rdc->rd_mult_q_sq_key_ultralow_qp = 4.0; rdc->rd_mult_q_sq_key_low_qp = 3.5; rdc->rd_mult_q_sq_key_mid_qp = 4.5; - } else if (screen_area <= 176 * 144) { - rdc->rd_mult_q_sq_inter_low_qp = 4.0718581295922025; - rdc->rd_mult_q_sq_inter_mid_qp = 4.031435609256739; - rdc->rd_mult_q_sq_inter_high_qp = 
4.295745965132044; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.290774097327333; - rdc->rd_mult_q_sq_key_low_qp = 5.7037775720838155; - rdc->rd_mult_q_sq_key_mid_qp = 4.72424015517201; - } else if (screen_area <= 320 * 240) { - rdc->rd_mult_q_sq_inter_low_qp = 4.506676356706102; - rdc->rd_mult_q_sq_inter_mid_qp = 4.489349899621181; - rdc->rd_mult_q_sq_inter_high_qp = 4.388244213131458; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.217074424696166; - rdc->rd_mult_q_sq_key_low_qp = 4.497000582319771; - rdc->rd_mult_q_sq_key_mid_qp = 4.2825894884789735; - } else if (screen_area <= 640 * 360) { - rdc->rd_mult_q_sq_inter_low_qp = 4.730644123689013; - rdc->rd_mult_q_sq_inter_mid_qp = 4.314589509578551; - rdc->rd_mult_q_sq_inter_high_qp = 4.3702861603380025; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.576902541873747; - rdc->rd_mult_q_sq_key_low_qp = 6.068652999601526; - rdc->rd_mult_q_sq_key_mid_qp = 4.817707474077241; - } else if (screen_area <= 854 * 480) { - rdc->rd_mult_q_sq_inter_low_qp = 4.811470143416073; - rdc->rd_mult_q_sq_inter_mid_qp = 4.621618127750201; - rdc->rd_mult_q_sq_inter_high_qp = 3.969083125219539; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.9854544277222566; - rdc->rd_mult_q_sq_key_low_qp = 5.073157238799473; - rdc->rd_mult_q_sq_key_mid_qp = 5.7587672849242635; - } else if (screen_area <= 1280 * 720) { - rdc->rd_mult_q_sq_inter_low_qp = 5.119381136011107; - rdc->rd_mult_q_sq_inter_mid_qp = 4.518613675766538; - rdc->rd_mult_q_sq_inter_high_qp = 4.410712348825541; - rdc->rd_mult_q_sq_key_ultralow_qp = 3.9468491666607326; - rdc->rd_mult_q_sq_key_low_qp = 5.848703119971484; - rdc->rd_mult_q_sq_key_mid_qp = 5.368947246228739; - } else { - rdc->rd_mult_q_sq_inter_low_qp = 6.00569815296199; - rdc->rd_mult_q_sq_inter_mid_qp = 3.932565684947023; - rdc->rd_mult_q_sq_inter_high_qp = 3.2141187537667797; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.399795006320089; - rdc->rd_mult_q_sq_key_low_qp = 10.582906599488298; - rdc->rd_mult_q_sq_key_mid_qp = 6.274162346360692; } } From b41ffb53f1000ab2227c1736d8c1355aa5081c40 Mon Sep 17 00:00:00 2001 From: "Adam B. Goode" Date: Wed, 17 Mar 2021 14:11:57 -0500 Subject: [PATCH 063/926] Msvc builds convert to windows path w/msys env Bug: webm:1720 Change-Id: I56689ad408f8086c511e1711dfa9c8d404727b2e (cherry picked from commit 04086a30664d2a3e89d6a6e4e1c18f1a82c8f958) --- build/make/msvs_common.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/build/make/msvs_common.sh b/build/make/msvs_common.sh index 27ddf7fd91..3989fec0d5 100644 --- a/build/make/msvs_common.sh +++ b/build/make/msvs_common.sh @@ -9,7 +9,8 @@ ## be found in the AUTHORS file in the root of the source tree. ## -if [ "$(uname -o 2>/dev/null)" = "Cygwin" ] \ +shell_name="$(uname -o 2>/dev/null)" +if [[ "$shell_name" = "Cygwin" || "$shell_name" = "Msys" ]] \ && cygpath --help >/dev/null 2>&1; then FIXPATH='cygpath -m' else From e37ee40f7ee0dbafa41e7d1c32dc34740727c7a1 Mon Sep 17 00:00:00 2001 From: Paul Wilkins Date: Mon, 22 Mar 2021 19:45:10 +0000 Subject: [PATCH 064/926] Convert Vizier RD parameters to normalized factors This patch converts the Vizier custom RD multipliers, to factors that adjust each RD multiplier either side of its default value, where a factor of 1.0 will give the previous default behavior. Ultimately I would like to replace the multiple RD multipliers triggered at different Q thresholds (eg, low, medium, high q) with a function that adjusts the rd behavior smoothly as Q changes. 
Vizier could then be presented with a single adjustment control for each of
key frame and inter frame rd.

The current behavior is problematic. Firstly, having hard threshold Q values
at which rd behavior changes may cause anomalies in the rate distortion
curve, where in some situations, raising Q, for example, may not cause the
expected drop in rate and rise in distortion, because we have crossed a
threshold where the rate distortion multiplier changes sharply and this
alters the balance of bits spent in the prediction and residual parts of
the signal.

Having a single value that is used for a range of Q index values
(e.g. 0-64, 65-128) may also cause problems and over-fitting in the context
of the Vizier ML project. This project tries to optimize the values for
each Q range, for various YT formats, but does so by analyzing the results
of single point encodes on a set of clips. For a given format, all the
clips are encoded with the same parameters (target rate etc) so there is
likely to be clustering with regard to the Q values used. For example, the
training set may give a new value for the Q range 0-64, but most of the
data points used may have Q close to 64.

It will likely require several iterations working with the Vizier team to
get this right. This patch just gives an initial framework for testing.

Change-Id: Iaa4cd5561b95a202bcae7a1d876c4f40ef444fa2
---
 vp9/encoder/vp9_rd.c | 117 ++++++++++++++++++++++++-------------------
 vp9/encoder/vp9_rd.h |  17 +++----
 2 files changed, 74 insertions(+), 60 deletions(-)

diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index d5d668f964..d3bf3d6d4c 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -205,65 +205,74 @@ void vp9_init_rd_parameters(VP9_COMP *cpi) {
   // Make sure this function is floating point safe.
vpx_clear_system_state(); - rdc->rd_mult_q_sq_key_high_qp = 7.5; // No defined Vizer values yet + rdc->rd_mult_key_high_qp_fac = 1.0; // Default: no Vizer values yet if (0) { unsigned int screen_area = (cpi->common.width * cpi->common.height); if (screen_area <= 176 * 144) { - rdc->rd_mult_q_sq_inter_low_qp = 4.0718581295922025; - rdc->rd_mult_q_sq_inter_mid_qp = 4.031435609256739; - rdc->rd_mult_q_sq_inter_high_qp = 4.295745965132044; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.290774097327333; - rdc->rd_mult_q_sq_key_low_qp = 5.7037775720838155; - rdc->rd_mult_q_sq_key_mid_qp = 4.72424015517201; + rdc->rd_mult_inter_low_qp_fac = 1.018; + rdc->rd_mult_inter_mid_qp_fac = 0.896; + rdc->rd_mult_inter_high_qp_fac = 1.432; + rdc->rd_mult_key_ultralow_qp_fac = 1.073; + rdc->rd_mult_key_low_qp_fac = 1.630; + rdc->rd_mult_key_mid_qp_fac = 1.050; } else if (screen_area <= 320 * 240) { - rdc->rd_mult_q_sq_inter_low_qp = 4.506676356706102; - rdc->rd_mult_q_sq_inter_mid_qp = 4.489349899621181; - rdc->rd_mult_q_sq_inter_high_qp = 4.388244213131458; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.217074424696166; - rdc->rd_mult_q_sq_key_low_qp = 4.497000582319771; - rdc->rd_mult_q_sq_key_mid_qp = 4.2825894884789735; + rdc->rd_mult_inter_low_qp_fac = 1.127; + rdc->rd_mult_inter_mid_qp_fac = 0.998; + rdc->rd_mult_inter_high_qp_fac = 1.463; + rdc->rd_mult_key_ultralow_qp_fac = 1.054; + rdc->rd_mult_key_low_qp_fac = 1.285; + rdc->rd_mult_key_mid_qp_fac = 0.952; } else if (screen_area <= 640 * 360) { - rdc->rd_mult_q_sq_inter_low_qp = 4.730644123689013; - rdc->rd_mult_q_sq_inter_mid_qp = 4.314589509578551; - rdc->rd_mult_q_sq_inter_high_qp = 4.3702861603380025; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.576902541873747; - rdc->rd_mult_q_sq_key_low_qp = 6.068652999601526; - rdc->rd_mult_q_sq_key_mid_qp = 4.817707474077241; + rdc->rd_mult_inter_low_qp_fac = 1.183; + rdc->rd_mult_inter_mid_qp_fac = 0.959; + rdc->rd_mult_inter_high_qp_fac = 1.457; + rdc->rd_mult_key_ultralow_qp_fac = 1.144; + rdc->rd_mult_key_low_qp_fac = 1.734; + rdc->rd_mult_key_mid_qp_fac = 1.071; } else if (screen_area <= 854 * 480) { - rdc->rd_mult_q_sq_inter_low_qp = 4.811470143416073; - rdc->rd_mult_q_sq_inter_mid_qp = 4.621618127750201; - rdc->rd_mult_q_sq_inter_high_qp = 3.969083125219539; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.9854544277222566; - rdc->rd_mult_q_sq_key_low_qp = 5.073157238799473; - rdc->rd_mult_q_sq_key_mid_qp = 5.7587672849242635; + rdc->rd_mult_inter_low_qp_fac = 1.203; + rdc->rd_mult_inter_mid_qp_fac = 1.027; + rdc->rd_mult_inter_high_qp_fac = 1.027; + rdc->rd_mult_key_ultralow_qp_fac = 1.246; + rdc->rd_mult_key_low_qp_fac = 1.246; + rdc->rd_mult_key_mid_qp_fac = 1.280; } else if (screen_area <= 1280 * 720) { - rdc->rd_mult_q_sq_inter_low_qp = 5.119381136011107; - rdc->rd_mult_q_sq_inter_mid_qp = 4.518613675766538; - rdc->rd_mult_q_sq_inter_high_qp = 4.410712348825541; - rdc->rd_mult_q_sq_key_ultralow_qp = 3.9468491666607326; - rdc->rd_mult_q_sq_key_low_qp = 5.848703119971484; - rdc->rd_mult_q_sq_key_mid_qp = 5.368947246228739; + rdc->rd_mult_inter_low_qp_fac = 1.280; + rdc->rd_mult_inter_mid_qp_fac = 1.004; + rdc->rd_mult_inter_high_qp_fac = 1.470; + rdc->rd_mult_key_ultralow_qp_fac = 0.987; + rdc->rd_mult_key_low_qp_fac = 1.671; + rdc->rd_mult_key_mid_qp_fac = 1.193; } else { - rdc->rd_mult_q_sq_inter_low_qp = 6.00569815296199; - rdc->rd_mult_q_sq_inter_mid_qp = 3.932565684947023; - rdc->rd_mult_q_sq_inter_high_qp = 3.2141187537667797; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.399795006320089; - rdc->rd_mult_q_sq_key_low_qp = 
10.582906599488298; - rdc->rd_mult_q_sq_key_mid_qp = 6.274162346360692; + rdc->rd_mult_inter_low_qp_fac = 1.50; + rdc->rd_mult_inter_mid_qp_fac = 0.874; + rdc->rd_mult_inter_high_qp_fac = 1.07; + rdc->rd_mult_key_ultralow_qp_fac = 1.1; + rdc->rd_mult_key_low_qp_fac = 2.35; + rdc->rd_mult_key_mid_qp_fac = 0.837; } } else { // For now force defaults unless testing - rdc->rd_mult_q_sq_inter_low_qp = 4.0; - rdc->rd_mult_q_sq_inter_mid_qp = 4.5; - rdc->rd_mult_q_sq_inter_high_qp = 3.0; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.0; - rdc->rd_mult_q_sq_key_low_qp = 3.5; - rdc->rd_mult_q_sq_key_mid_qp = 4.5; + rdc->rd_mult_inter_low_qp_fac = 1.0; + rdc->rd_mult_inter_mid_qp_fac = 1.0; + rdc->rd_mult_inter_high_qp_fac = 1.0; + rdc->rd_mult_key_ultralow_qp_fac = 1.0; + rdc->rd_mult_key_low_qp_fac = 1.0; + rdc->rd_mult_key_mid_qp_fac = 1.0; } } +// Default Rd multiplier values for Q ranges +#define INTER_LOW_QP_RDM 4.0 +#define INTER_MID_QP_RDM 4.5 +#define INTER_HIGH_QP_RDM 3.0 +#define KEY_ULOW_QP_RDM 4.0 +#define KEY_LOW_QP_RDM 3.5 +#define KEY_MID_QP_RDM 4.5 +#define KEY_HIGH_QP_RDM 7.5 + int vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) { const RD_CONTROL *rdc = &cpi->rd_ctrl; const int q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth); @@ -275,22 +284,28 @@ int vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) { if (cpi->common.frame_type != KEY_FRAME) { if (qindex < 128) { - rdmult = (int)((double)rdmult * rdc->rd_mult_q_sq_inter_low_qp); + rdmult = (int)((double)rdmult * INTER_LOW_QP_RDM * + rdc->rd_mult_inter_low_qp_fac); } else if (qindex < 190) { - rdmult = (int)((double)rdmult * rdc->rd_mult_q_sq_inter_mid_qp); + rdmult = (int)((double)rdmult * INTER_MID_QP_RDM * + rdc->rd_mult_inter_mid_qp_fac); } else { - rdmult = (int)((double)rdmult * rdc->rd_mult_q_sq_inter_high_qp); + rdmult = (int)((double)rdmult * INTER_HIGH_QP_RDM * + rdc->rd_mult_inter_high_qp_fac); } } else { if (qindex < 64) { - rdmult = (int)((double)rdmult * rdc->rd_mult_q_sq_key_ultralow_qp); + rdmult = (int)((double)rdmult * KEY_ULOW_QP_RDM * + rdc->rd_mult_key_ultralow_qp_fac); } else if (qindex <= 128) { - rdmult = (int)((double)rdmult * rdc->rd_mult_q_sq_key_low_qp); + rdmult = + (int)((double)rdmult * KEY_LOW_QP_RDM * rdc->rd_mult_key_low_qp_fac); } else if (qindex < 190) { - rdmult = (int)((double)rdmult * rdc->rd_mult_q_sq_key_mid_qp); - + rdmult = + (int)((double)rdmult * KEY_MID_QP_RDM * rdc->rd_mult_key_mid_qp_fac); } else { - rdmult = (int)((double)rdmult * rdc->rd_mult_q_sq_key_high_qp); + rdmult = (int)((double)rdmult * KEY_HIGH_QP_RDM * + rdc->rd_mult_key_high_qp_fac); } } diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h index 2c9f5e7408..4899e1ae0f 100644 --- a/vp9/encoder/vp9_rd.h +++ b/vp9/encoder/vp9_rd.h @@ -102,15 +102,14 @@ typedef enum { } THR_MODES_SUB8X8; typedef struct { - // RD control parameters - // Added for Vizier project. - double rd_mult_q_sq_inter_low_qp; - double rd_mult_q_sq_inter_mid_qp; - double rd_mult_q_sq_inter_high_qp; - double rd_mult_q_sq_key_ultralow_qp; - double rd_mult_q_sq_key_low_qp; - double rd_mult_q_sq_key_mid_qp; - double rd_mult_q_sq_key_high_qp; + // RD multiplier control factors added for Vizier project. 
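// [Editorial note] Each factor below scales one of the fixed default
// multipliers this patch introduces in vp9_rd.c: the effective value is
// default * factor, e.g. INTER_LOW_QP_RDM * rd_mult_inter_low_qp_fac ==
// 4.0 * 1.0 with the defaults, matching the old absolute
// rd_mult_q_sq_inter_low_qp value. The commit is therefore
// behavior-neutral until Vizier supplies trained factors.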
+ double rd_mult_inter_low_qp_fac; + double rd_mult_inter_mid_qp_fac; + double rd_mult_inter_high_qp_fac; + double rd_mult_key_ultralow_qp_fac; + double rd_mult_key_low_qp_fac; + double rd_mult_key_mid_qp_fac; + double rd_mult_key_high_qp_fac; } RD_CONTROL; typedef struct RD_OPT { From deef8955067bcfb7bb9d3d9b1719f178ea8e37a2 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 26 Mar 2021 10:55:29 -0700 Subject: [PATCH 065/926] vp9_ext_ratectrl_test: use uintptr_t for void* value this avoids a warning about differences in size between void* and unsigned int under msvc: vp9_ext_ratectrl_test.cc(40,3): warning C4312: 'reinterpret_cast': conversion from 'const unsigned int' to 'void *' of greater size Change-Id: I5a412ec785ddcaeff2ec71bb83a6048505400293 --- test/vp9_ext_ratectrl_test.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index b6b5b2eaec..60a350b84e 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include "test/codec_factory.h" @@ -20,7 +21,7 @@ namespace { constexpr int kModelMagicNumber = 51396; -constexpr unsigned int PrivMagicNumber = 5566; +constexpr uintptr_t PrivMagicNumber = 5566; constexpr int kFrameNum = 5; constexpr int kLosslessCodingIndex = 2; From c19de35ed2b772fd21092c727c5e090034876d4f Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Tue, 16 Mar 2021 21:27:18 -0700 Subject: [PATCH 066/926] Add command line options for a few rc parameters These rate control parameters are for the Vizier experiment. They are defined as rational numbers. Change-Id: I23f382dd49158db463b75b5ad8a82d8e0d536308 --- vp8/vp8_cx_iface.c | 28 +++++++++++----- vp9/vp9_cx_iface.c | 28 +++++++++++----- vpx/vpx_encoder.h | 84 ++++++++++++++++++++++++++++++++++++++++++++++ vpxenc.c | 71 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 195 insertions(+), 16 deletions(-) diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 32dd3c7708..32bb1d04f2 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -1281,14 +1281,26 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = { VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */ { 0 }, - { 0 }, /* ss_target_bitrate */ - 1, /* ts_number_layers */ - { 0 }, /* ts_target_bitrate */ - { 0 }, /* ts_rate_decimator */ - 0, /* ts_periodicity */ - { 0 }, /* ts_layer_id */ - { 0 }, /* layer_target_bitrate */ - 0 /* temporal_layering_mode */ + { 0 }, /* ss_target_bitrate */ + 1, /* ts_number_layers */ + { 0 }, /* ts_target_bitrate */ + { 0 }, /* ts_rate_decimator */ + 0, /* ts_periodicity */ + { 0 }, /* ts_layer_id */ + { 0 }, /* layer_target_bitrate */ + 0, /* temporal_layering_mode */ + { 0, 0 }, /* active_wq_factor */ + { 0, 0 }, /* base_err_per_mb */ + { 0, 0 }, /* sr_default_decay_limit */ + { 0, 0 }, /* sr_diff_factor */ + { 0, 0 }, /* kf_err_per_mb */ + { 0, 0 }, /* kf_frame_min_boost */ + { 0, 0 }, /* kf_frame_max_boost_first */ + { 0, 0 }, /* kf_frame_max_boost_subs */ + { 0, 0 }, /* kf_max_total_boost */ + { 0, 0 }, /* gf_max_total_boost */ + { 0, 0 }, /* gf_frame_max_boost */ + { 0, 0 }, /* zm_power_factor */ } }, }; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index ecfacfaf43..75dda0bed0 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1883,14 +1883,26 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { VPX_SS_DEFAULT_LAYERS, // ss_number_layers { 0 }, - { 0 }, // ss_target_bitrate - 1, // 
ts_number_layers - { 0 }, // ts_target_bitrate - { 0 }, // ts_rate_decimator - 0, // ts_periodicity - { 0 }, // ts_layer_id - { 0 }, // layer_taget_bitrate - 0 // temporal_layering_mode + { 0 }, // ss_target_bitrate + 1, // ts_number_layers + { 0 }, // ts_target_bitrate + { 0 }, // ts_rate_decimator + 0, // ts_periodicity + { 0 }, // ts_layer_id + { 0 }, // layer_taget_bitrate + 0, // temporal_layering_mode + { 0, 0 }, // active_wq_factor + { 0, 0 }, // base_err_per_mb + { 0, 0 }, // sr_default_decay_limit + { 0, 0 }, // sr_diff_factor + { 0, 0 }, // kf_err_per_mb + { 0, 0 }, // kf_frame_min_boost + { 0, 0 }, // kf_frame_max_boost_first + { 0, 0 }, // kf_frame_max_boost_subs + { 0, 0 }, // kf_max_total_boost + { 0, 0 }, // gf_max_total_boost + { 0, 0 }, // gf_frame_max_boost + { 0, 0 }, // zm_power_factor } }, }; diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index da36095775..accc127f64 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -693,6 +693,90 @@ typedef struct vpx_codec_enc_cfg { * */ int temporal_layering_mode; + + /*!\brief Active worst quality factor. + * + * Rate control parameters, set from external experiment results. + * + */ + vpx_rational_t active_wq_factor; + + /*!\brief Base error per macroblock. + * + * Rate control parameters, set from external experiment results. + * + */ + vpx_rational_t base_err_per_mb; + + /*!\brief Second reference default decay limit. + * + * Rate control parameters, set from external experiment results. + * + */ + vpx_rational_t sr_default_decay_limit; + + /*!\brief Second reference difference factor. + * + * Rate control parameters, set from external experiment results. + * + */ + vpx_rational_t sr_diff_factor; + + /*!\brief Keyframe error per macroblock. + * + * Rate control parameters, set from external experiment results. + * + */ + vpx_rational_t kf_err_per_mb; + + /*!\brief Keyframe minimum boost. + * + * Rate control parameters, set from external experiment results. + * + */ + vpx_rational_t kf_frame_min_boost; + + /*!\brief Keyframe maximum boost, for the first keyframe in a chunk. + * + * Rate control parameters, set from external experiment results. + * + */ + vpx_rational_t kf_frame_max_boost_first; + + /*!\brief Keyframe maximum boost, for subsequent keyframes. + * + * Rate control parameters, set from external experiment results. + * + */ + vpx_rational_t kf_frame_max_boost_subs; + + /*!\brief Keyframe maximum total boost. + * + * Rate control parameters, set from external experiment results. + * + */ + vpx_rational_t kf_max_total_boost; + + /*!\brief Golden frame maximum total boost. + * + * Rate control parameters, set from external experiment results. + * + */ + vpx_rational_t gf_max_total_boost; + + /*!\brief Golden frame maximum boost. + * + * Rate control parameters, set from external experiment results. + * + */ + vpx_rational_t gf_frame_max_boost; + + /*!\brief Zero motion power factor. + * + * Rate control parameters, set from external experiment results. 
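 * [Editorial note] Each of these fields is a vpx_rational_t { num, den };
 * the encoder consumes num / den as a double (e.g. { 45, 10 } means 4.5),
 * which lets the integer-only command line express fractional tuning
 * values.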
+ * + */ + vpx_rational_t zm_power_factor; } vpx_codec_enc_cfg_t; /**< alias for struct vpx_codec_enc_cfg */ /*!\brief vp9 svc extra configure parameters diff --git a/vpxenc.c b/vpxenc.c index 5042e688c9..3c04c64b3b 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -287,6 +287,46 @@ static const arg_def_t *rc_args[] = { &buf_sz, &buf_initial_sz, &buf_optimal_sz, NULL }; +#if CONFIG_VP9_ENCODER +static const arg_def_t active_wq_factor = + ARG_DEF(NULL, "active-wq-factor", 1, "Active worst quality factor"); +static const arg_def_t base_err_per_mb = + ARG_DEF(NULL, "base-err-per-mb", 1, "Base error per macroblock"); +static const arg_def_t sr_default_decay_limit = ARG_DEF( + NULL, "sr-default-decay-limit", 1, "Second reference default decay limit"); +static const arg_def_t sr_diff_factor = + ARG_DEF(NULL, "sr-diff-factor", 1, "Second reference diff factor"); +static const arg_def_t kf_err_per_mb = + ARG_DEF(NULL, "kf-err-per-mb", 1, "Keyframe error per macroblock"); +static const arg_def_t kf_frame_min_boost = + ARG_DEF(NULL, "kf-frame-min-boost", 1, "Keyframe min boost"); +static const arg_def_t kf_frame_max_boost_first = ARG_DEF( + NULL, "kf-frame-max-boost-first", 1, "Max for the first keyframe boost"); +static const arg_def_t kf_frame_max_boost_subs = ARG_DEF( + NULL, "kf-frame-max-boost-subs", 1, "Max for subsequent keyframe boost"); +static const arg_def_t kf_max_total_boost = + ARG_DEF(NULL, "kf-max-total-boost", 1, "Keyframe max total boost"); +static const arg_def_t gf_max_total_boost = + ARG_DEF(NULL, "gf-max-total-boost", 1, "Golden frame max total boost"); +static const arg_def_t gf_frame_max_boost = + ARG_DEF(NULL, "gf-frame-max-boost", 1, "Golden frame max boost"); +static const arg_def_t zm_power_factor = + ARG_DEF(NULL, "zm-power-factor", 1, "Zero motion power factor"); +static const arg_def_t *vizier_rc_args[] = { &active_wq_factor, + &base_err_per_mb, + &sr_default_decay_limit, + &sr_diff_factor, + &kf_err_per_mb, + &kf_frame_min_boost, + &kf_frame_max_boost_first, + &kf_frame_max_boost_subs, + &kf_max_total_boost, + &gf_max_total_boost, + &gf_frame_max_boost, + &zm_power_factor, + NULL }; +#endif + static const arg_def_t bias_pct = ARG_DEF(NULL, "bias-pct", 1, "CBR/VBR bias (0=CBR, 100=VBR)"); static const arg_def_t minsection_pct = @@ -573,6 +613,8 @@ static void show_help(FILE *fout, int shorthelp) { #if CONFIG_VP9_ENCODER fprintf(fout, "\nVP9 Specific Options:\n"); arg_show_usage(fout, vp9_args); + fprintf(fout, "\nVizier Rate Control Options:\n"); + arg_show_usage(fout, vizier_rc_args); #endif fprintf(fout, "\nStream timebase (--timebase):\n" @@ -983,6 +1025,32 @@ static int parse_stream_params(struct VpxEncoderConfig *global, config->cfg.kf_max_dist = arg_parse_uint(&arg); } else if (arg_match(&arg, &kf_disabled, argi)) { config->cfg.kf_mode = VPX_KF_DISABLED; +#if CONFIG_VP9_ENCODER + } else if (arg_match(&arg, &active_wq_factor, argi)) { + config->cfg.active_wq_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &base_err_per_mb, argi)) { + config->cfg.base_err_per_mb = arg_parse_rational(&arg); + } else if (arg_match(&arg, &sr_default_decay_limit, argi)) { + config->cfg.sr_default_decay_limit = arg_parse_rational(&arg); + } else if (arg_match(&arg, &sr_diff_factor, argi)) { + config->cfg.sr_diff_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &kf_err_per_mb, argi)) { + config->cfg.kf_err_per_mb = arg_parse_rational(&arg); + } else if (arg_match(&arg, &kf_frame_min_boost, argi)) { + config->cfg.kf_frame_min_boost = arg_parse_rational(&arg); + } else 
if (arg_match(&arg, &kf_frame_max_boost_first, argi)) { + config->cfg.kf_frame_max_boost_first = arg_parse_rational(&arg); + } else if (arg_match(&arg, &kf_frame_max_boost_subs, argi)) { + config->cfg.kf_frame_max_boost_subs = arg_parse_rational(&arg); + } else if (arg_match(&arg, &kf_max_total_boost, argi)) { + config->cfg.kf_max_total_boost = arg_parse_rational(&arg); + } else if (arg_match(&arg, &gf_max_total_boost, argi)) { + config->cfg.gf_max_total_boost = arg_parse_rational(&arg); + } else if (arg_match(&arg, &gf_frame_max_boost, argi)) { + config->cfg.gf_frame_max_boost = arg_parse_rational(&arg); + } else if (arg_match(&arg, &zm_power_factor, argi)) { + config->cfg.zm_power_factor = arg_parse_rational(&arg); +#endif #if CONFIG_VP9_HIGHBITDEPTH } else if (arg_match(&arg, &test16bitinternalarg, argi)) { if (strcmp(global->codec->name, "vp9") == 0) { @@ -1177,6 +1245,9 @@ static void show_stream_config(struct stream_state *stream, SHOW(kf_mode); SHOW(kf_min_dist); SHOW(kf_max_dist); + // Temporary use for debug + SHOW(active_wq_factor.num); + SHOW(active_wq_factor.den); } static void open_output_file(struct stream_state *stream, From f32829a2e5db3cd1624e8a7a530af84c382762ef Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Tue, 16 Mar 2021 22:42:40 -0700 Subject: [PATCH 067/926] Pass vizier rc parameter values from command line to twopass Change-Id: I02eabeccf2fe4604875820d38e23c2586a63e290 --- vp9/vp9_cx_iface.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 75dda0bed0..f601ca1622 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -638,6 +638,41 @@ static vpx_codec_err_t set_encoder_config( return VPX_CODEC_OK; } +static vpx_codec_err_t set_twopass_params_from_config( + const vpx_codec_enc_cfg_t *const cfg, struct VP9_COMP *cpi) { + if (cpi == NULL) return VPX_CODEC_ERROR; + + cpi->twopass.active_wq_factor = + (double)cfg->active_wq_factor.num / (double)cfg->active_wq_factor.den; + cpi->twopass.base_err_per_mb = + (double)cfg->base_err_per_mb.num / (double)cfg->base_err_per_mb.den; + cpi->twopass.sr_default_decay_limit = + (double)cfg->sr_default_decay_limit.num / + (double)cfg->sr_default_decay_limit.den; + cpi->twopass.sr_diff_factor = + (double)cfg->sr_diff_factor.num / (double)cfg->sr_diff_factor.den; + cpi->twopass.kf_err_per_mb = + (double)cfg->kf_err_per_mb.num / (double)cfg->kf_err_per_mb.den; + cpi->twopass.kf_frame_min_boost = + (double)cfg->kf_frame_min_boost.num / (double)cfg->kf_frame_min_boost.den; + cpi->twopass.kf_frame_max_boost_first = + (double)cfg->kf_frame_max_boost_first.num / + (double)cfg->kf_frame_max_boost_first.den; + cpi->twopass.kf_frame_max_boost_subs = + (double)cfg->kf_frame_max_boost_subs.num / + (double)cfg->kf_frame_max_boost_subs.den; + cpi->twopass.kf_max_total_boost = (int)((double)cfg->kf_max_total_boost.num / + (double)cfg->kf_max_total_boost.den); + cpi->twopass.gf_max_total_boost = (int)((double)cfg->gf_max_total_boost.num / + (double)cfg->gf_max_total_boost.den); + cpi->twopass.gf_frame_max_boost = + (double)cfg->gf_frame_max_boost.num / (double)cfg->gf_frame_max_boost.den; + cpi->twopass.zm_power_factor = + (double)cfg->zm_power_factor.num / (double)cfg->zm_power_factor.den; + + return VPX_CODEC_OK; +} + static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg) { vpx_codec_err_t res; @@ -664,6 +699,7 @@ static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, if (res == VPX_CODEC_OK) { 
ctx->cfg = *cfg; set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); + set_twopass_params_from_config(&ctx->cfg, ctx->cpi); // On profile change, request a key frame force_key |= ctx->cpi->common.profile != ctx->oxcf.profile; vp9_change_config(ctx->cpi, &ctx->oxcf); @@ -696,6 +732,7 @@ static vpx_codec_err_t update_extra_cfg(vpx_codec_alg_priv_t *ctx, if (res == VPX_CODEC_OK) { ctx->extra_cfg = *extra_cfg; set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); + set_twopass_params_from_config(&ctx->cfg, ctx->cpi); vp9_change_config(ctx->cpi, &ctx->oxcf); } return res; @@ -940,6 +977,7 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, #endif priv->cpi = vp9_create_compressor(&priv->oxcf, priv->buffer_pool); if (priv->cpi == NULL) res = VPX_CODEC_MEM_ERROR; + set_twopass_params_from_config(&priv->cfg, priv->cpi); } } From d3aaac367bd716b2db06e774f0a8eea7768dd184 Mon Sep 17 00:00:00 2001 From: Paul Wilkins Date: Wed, 31 Mar 2021 16:58:51 +0100 Subject: [PATCH 068/926] Change calculation of rd multiplier. Change the way the rd multiplier is adjusted for Q and frame type. Previously in VP9 the rd multiplier was adjusted based on crude Q bins and whether the frame was a key frame or inter frame. The Q bins create some problems as they potentially introduce discontinuities in the RD curve. For example, rate rising with a stepwise increase in Q instead of falling. As such, in AV1 they have been removed. A further issue was identified when examining the first round of results from from the Vizier project. Here the multiplier for each Q bin and each frame type was optimized for a training set, for various video formats, using single point encodes at the appropriate YT rates. These initial results appeared to show a trend for increased rd multiplier at higher Q for key frames. This fits with intuition as in this encoding context a higher Q indicates that a clip is harder to encode and frames less well predicted. However, the situation appeared to reverse for inter frames with higher rd multipliers chosen at low Q. My initial suspicion was that this was a result of over fitting, but on closer analysis I realized that this may be more related to frame type within the broader inter frame classification. Specifically frames coded at low Q are predominantly ARF frames, for the mid Q bin there will likely be a mix of ARF and normal inter frames, and for the high Q bin the frames will almost exclusively be normal inter frames from difficult content. ARF frames are inherently less well predicted than other inter frames being further apart and not having access to as many prediction modes. We also know from previous work that ARF frames have a higher incidence of INTRA coding and may well behave more like key frames in this context. This patch replaces the bin based approach with a linear function that applies a small but smooth Q based adjustment. It also splits ARF frames and normal inter frames into separate categories. 
With this done number of parameters that will be exposed for the next round of Vizier training is reduced from 7 to 3 (one adjustment factor each for inter, ARF and key frames) This patch gives net BDATE gains for our test sets even with the baseline / default factors as follows: (% BDRATE change in overall PSNR and SSIM, -ve is better) LowRes -0.231, -0.050 ugc360p 0.160, -0.315 midres2 -0.348, -1.170 hdres2 -0.407, -0.691 Change-Id: I46dd2fea77b1c2849c122f10fd0df74bbd3fcc7f --- vp9/encoder/vp9_rd.c | 114 ++++++++++++++++++------------------------- vp9/encoder/vp9_rd.h | 10 ++-- 2 files changed, 50 insertions(+), 74 deletions(-) diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index d3bf3d6d4c..9efd7425c6 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -205,62 +205,36 @@ void vp9_init_rd_parameters(VP9_COMP *cpi) { // Make sure this function is floating point safe. vpx_clear_system_state(); - rdc->rd_mult_key_high_qp_fac = 1.0; // Default: no Vizer values yet + rdc->rd_mult_arf_qp_fac = 1.0; // Default: No Vizier values yet + // These hard wired estimates for the Vizier values will be removed later + // as the per format factors will be set on the command line. if (0) { unsigned int screen_area = (cpi->common.width * cpi->common.height); if (screen_area <= 176 * 144) { - rdc->rd_mult_inter_low_qp_fac = 1.018; - rdc->rd_mult_inter_mid_qp_fac = 0.896; - rdc->rd_mult_inter_high_qp_fac = 1.432; - rdc->rd_mult_key_ultralow_qp_fac = 1.073; - rdc->rd_mult_key_low_qp_fac = 1.630; - rdc->rd_mult_key_mid_qp_fac = 1.050; + rdc->rd_mult_inter_qp_fac = 0.896; + rdc->rd_mult_key_qp_fac = 1.050; } else if (screen_area <= 320 * 240) { - rdc->rd_mult_inter_low_qp_fac = 1.127; - rdc->rd_mult_inter_mid_qp_fac = 0.998; - rdc->rd_mult_inter_high_qp_fac = 1.463; - rdc->rd_mult_key_ultralow_qp_fac = 1.054; - rdc->rd_mult_key_low_qp_fac = 1.285; - rdc->rd_mult_key_mid_qp_fac = 0.952; + rdc->rd_mult_inter_qp_fac = 0.998; + rdc->rd_mult_key_qp_fac = 0.952; } else if (screen_area <= 640 * 360) { - rdc->rd_mult_inter_low_qp_fac = 1.183; - rdc->rd_mult_inter_mid_qp_fac = 0.959; - rdc->rd_mult_inter_high_qp_fac = 1.457; - rdc->rd_mult_key_ultralow_qp_fac = 1.144; - rdc->rd_mult_key_low_qp_fac = 1.734; - rdc->rd_mult_key_mid_qp_fac = 1.071; + rdc->rd_mult_inter_qp_fac = 0.959; + rdc->rd_mult_key_qp_fac = 1.071; } else if (screen_area <= 854 * 480) { - rdc->rd_mult_inter_low_qp_fac = 1.203; - rdc->rd_mult_inter_mid_qp_fac = 1.027; - rdc->rd_mult_inter_high_qp_fac = 1.027; - rdc->rd_mult_key_ultralow_qp_fac = 1.246; - rdc->rd_mult_key_low_qp_fac = 1.246; - rdc->rd_mult_key_mid_qp_fac = 1.280; + rdc->rd_mult_inter_qp_fac = 1.027; + rdc->rd_mult_key_qp_fac = 1.280; } else if (screen_area <= 1280 * 720) { - rdc->rd_mult_inter_low_qp_fac = 1.280; - rdc->rd_mult_inter_mid_qp_fac = 1.004; - rdc->rd_mult_inter_high_qp_fac = 1.470; - rdc->rd_mult_key_ultralow_qp_fac = 0.987; - rdc->rd_mult_key_low_qp_fac = 1.671; - rdc->rd_mult_key_mid_qp_fac = 1.193; + rdc->rd_mult_inter_qp_fac = 1.004; + rdc->rd_mult_key_qp_fac = 1.193; } else { - rdc->rd_mult_inter_low_qp_fac = 1.50; - rdc->rd_mult_inter_mid_qp_fac = 0.874; - rdc->rd_mult_inter_high_qp_fac = 1.07; - rdc->rd_mult_key_ultralow_qp_fac = 1.1; - rdc->rd_mult_key_low_qp_fac = 2.35; - rdc->rd_mult_key_mid_qp_fac = 0.837; + rdc->rd_mult_inter_qp_fac = 0.874; + rdc->rd_mult_key_qp_fac = 0.837; } } else { // For now force defaults unless testing - rdc->rd_mult_inter_low_qp_fac = 1.0; - rdc->rd_mult_inter_mid_qp_fac = 1.0; - rdc->rd_mult_inter_high_qp_fac = 1.0; 
- rdc->rd_mult_key_ultralow_qp_fac = 1.0; - rdc->rd_mult_key_low_qp_fac = 1.0; - rdc->rd_mult_key_mid_qp_fac = 1.0; + rdc->rd_mult_inter_qp_fac = 1.0; + rdc->rd_mult_key_qp_fac = 1.0; } } @@ -273,6 +247,27 @@ void vp9_init_rd_parameters(VP9_COMP *cpi) { #define KEY_MID_QP_RDM 4.5 #define KEY_HIGH_QP_RDM 7.5 +// Returns the default rd multiplier for inter frames for a given qindex. +// The function here is a first pass estimate based on data from +// a previous Vizer run +static double def_inter_rd_multiplier(int qindex) { + return 4.15 + (0.001 * (double)qindex); +} + +// Returns the default rd multiplier for ARF/Golden Frames for a given qindex. +// The function here is a first pass estimate based on data from +// a previous Vizer run +static double def_arf_rd_multiplier(int qindex) { + return 4.25 + (0.001 * (double)qindex); +} + +// Returns the default rd multiplier for key frames for a given qindex. +// The function here is a first pass estimate based on data from +// a previous Vizer run +static double def_kf_rd_multiplier(int qindex) { + return 4.35 + (0.001 * (double)qindex); +} + int vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) { const RD_CONTROL *rdc = &cpi->rd_ctrl; const int q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth); @@ -282,31 +277,16 @@ int vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) { // Make sure this function is floating point safe. vpx_clear_system_state(); - if (cpi->common.frame_type != KEY_FRAME) { - if (qindex < 128) { - rdmult = (int)((double)rdmult * INTER_LOW_QP_RDM * - rdc->rd_mult_inter_low_qp_fac); - } else if (qindex < 190) { - rdmult = (int)((double)rdmult * INTER_MID_QP_RDM * - rdc->rd_mult_inter_mid_qp_fac); - } else { - rdmult = (int)((double)rdmult * INTER_HIGH_QP_RDM * - rdc->rd_mult_inter_high_qp_fac); - } + if (cpi->common.frame_type == KEY_FRAME) { + double def_rd_q_mult = def_kf_rd_multiplier(qindex); + rdmult = (int)((double)rdmult * def_rd_q_mult * rdc->rd_mult_key_qp_fac); + } else if (!cpi->rc.is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + double def_rd_q_mult = def_arf_rd_multiplier(qindex); + rdmult = (int)((double)rdmult * def_rd_q_mult * rdc->rd_mult_arf_qp_fac); } else { - if (qindex < 64) { - rdmult = (int)((double)rdmult * KEY_ULOW_QP_RDM * - rdc->rd_mult_key_ultralow_qp_fac); - } else if (qindex <= 128) { - rdmult = - (int)((double)rdmult * KEY_LOW_QP_RDM * rdc->rd_mult_key_low_qp_fac); - } else if (qindex < 190) { - rdmult = - (int)((double)rdmult * KEY_MID_QP_RDM * rdc->rd_mult_key_mid_qp_fac); - } else { - rdmult = (int)((double)rdmult * KEY_HIGH_QP_RDM * - rdc->rd_mult_key_high_qp_fac); - } + double def_rd_q_mult = def_inter_rd_multiplier(qindex); + rdmult = (int)((double)rdmult * def_rd_q_mult * rdc->rd_mult_inter_qp_fac); } #if CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h index 4899e1ae0f..d2bc5e60ed 100644 --- a/vp9/encoder/vp9_rd.h +++ b/vp9/encoder/vp9_rd.h @@ -103,13 +103,9 @@ typedef enum { typedef struct { // RD multiplier control factors added for Vizier project. 
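// [Editorial example] With the linear forms above, a key frame at
// qindex 100 now gets def_kf_rd_multiplier(100) = 4.35 + 0.001 * 100 = 4.45
// (further scaled by rd_mult_key_qp_fac), where the old binned code applied
// KEY_LOW_QP_RDM = 3.5 for any qindex from 64 to 128; the sharp steps at
// the bin edges are gone.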
- double rd_mult_inter_low_qp_fac; - double rd_mult_inter_mid_qp_fac; - double rd_mult_inter_high_qp_fac; - double rd_mult_key_ultralow_qp_fac; - double rd_mult_key_low_qp_fac; - double rd_mult_key_mid_qp_fac; - double rd_mult_key_high_qp_fac; + double rd_mult_inter_qp_fac; + double rd_mult_arf_qp_fac; + double rd_mult_key_qp_fac; } RD_CONTROL; typedef struct RD_OPT { From 8b3e575a45792fe490b5bc08c3fe08f01553756b Mon Sep 17 00:00:00 2001 From: Tom Finegan Date: Fri, 2 Apr 2021 09:40:09 -0700 Subject: [PATCH 069/926] Revert "Pass vizier rc parameter values from command line to twopass" This reverts commit f32829a2e5db3cd1624e8a7a530af84c382762ef. BUG=webm:1723 Change-Id: I866cdf288f9873c350b32091515a6d5f4df362a3 --- vp9/vp9_cx_iface.c | 38 -------------------------------------- 1 file changed, 38 deletions(-) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index f601ca1622..75dda0bed0 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -638,41 +638,6 @@ static vpx_codec_err_t set_encoder_config( return VPX_CODEC_OK; } -static vpx_codec_err_t set_twopass_params_from_config( - const vpx_codec_enc_cfg_t *const cfg, struct VP9_COMP *cpi) { - if (cpi == NULL) return VPX_CODEC_ERROR; - - cpi->twopass.active_wq_factor = - (double)cfg->active_wq_factor.num / (double)cfg->active_wq_factor.den; - cpi->twopass.base_err_per_mb = - (double)cfg->base_err_per_mb.num / (double)cfg->base_err_per_mb.den; - cpi->twopass.sr_default_decay_limit = - (double)cfg->sr_default_decay_limit.num / - (double)cfg->sr_default_decay_limit.den; - cpi->twopass.sr_diff_factor = - (double)cfg->sr_diff_factor.num / (double)cfg->sr_diff_factor.den; - cpi->twopass.kf_err_per_mb = - (double)cfg->kf_err_per_mb.num / (double)cfg->kf_err_per_mb.den; - cpi->twopass.kf_frame_min_boost = - (double)cfg->kf_frame_min_boost.num / (double)cfg->kf_frame_min_boost.den; - cpi->twopass.kf_frame_max_boost_first = - (double)cfg->kf_frame_max_boost_first.num / - (double)cfg->kf_frame_max_boost_first.den; - cpi->twopass.kf_frame_max_boost_subs = - (double)cfg->kf_frame_max_boost_subs.num / - (double)cfg->kf_frame_max_boost_subs.den; - cpi->twopass.kf_max_total_boost = (int)((double)cfg->kf_max_total_boost.num / - (double)cfg->kf_max_total_boost.den); - cpi->twopass.gf_max_total_boost = (int)((double)cfg->gf_max_total_boost.num / - (double)cfg->gf_max_total_boost.den); - cpi->twopass.gf_frame_max_boost = - (double)cfg->gf_frame_max_boost.num / (double)cfg->gf_frame_max_boost.den; - cpi->twopass.zm_power_factor = - (double)cfg->zm_power_factor.num / (double)cfg->zm_power_factor.den; - - return VPX_CODEC_OK; -} - static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg) { vpx_codec_err_t res; @@ -699,7 +664,6 @@ static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, if (res == VPX_CODEC_OK) { ctx->cfg = *cfg; set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); - set_twopass_params_from_config(&ctx->cfg, ctx->cpi); // On profile change, request a key frame force_key |= ctx->cpi->common.profile != ctx->oxcf.profile; vp9_change_config(ctx->cpi, &ctx->oxcf); @@ -732,7 +696,6 @@ static vpx_codec_err_t update_extra_cfg(vpx_codec_alg_priv_t *ctx, if (res == VPX_CODEC_OK) { ctx->extra_cfg = *extra_cfg; set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); - set_twopass_params_from_config(&ctx->cfg, ctx->cpi); vp9_change_config(ctx->cpi, &ctx->oxcf); } return res; @@ -977,7 +940,6 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, #endif priv->cpi = 
vp9_create_compressor(&priv->oxcf, priv->buffer_pool);
     if (priv->cpi == NULL) res = VPX_CODEC_MEM_ERROR;
-    set_twopass_params_from_config(&priv->cfg, priv->cpi);
   }
 }

From afe1ba7f3f8d270d0eb215ceeb80e89290e1a37d Mon Sep 17 00:00:00 2001
From: Cheng Chen
Date: Fri, 2 Apr 2021 10:56:29 -0700
Subject: [PATCH 070/926] Pass vizier rc parameter values with range check

This is similar to the change
https://chromium-review.googlesource.com/c/webm/libvpx/+/2771081,
which failed the libvpx nightly test. Here we add a range check to get rid
of the "divided by zero" warning.

BUG=webm:1723

Change-Id: I7712efe7abd4b11cdb725643d51fd1c0a300d924
---
 vp8/vp8_cx_iface.c | 37 ++++++++++++++--------
 vp9/vp9_cx_iface.c | 76 ++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 89 insertions(+), 24 deletions(-)

diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index 32bb1d04f2..3f09ec38a4 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -257,6 +257,19 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
     ERROR("g_threads cannot be bigger than number of token partitions");
 #endif

+  // The range below shall be further tuned.
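  // [Editorial note] These denominator checks guard the num/den divisions
  // in set_twopass_params_from_config() (re-landed in the vp9 part of this
  // patch below), which is where the reverted change's "divided by zero"
  // came from. E.g. a vpxenc option such as --sr-diff-factor=573/100 stores
  // { 573, 100 } here and later becomes 573.0 / 100.0 == 5.73 in
  // cpi->twopass.sr_diff_factor.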
+ RANGE_CHECK(cfg, active_wq_factor.den, 1, 1000); + RANGE_CHECK(cfg, base_err_per_mb.den, 1, 1000); + RANGE_CHECK(cfg, sr_default_decay_limit.den, 1, 1000); + RANGE_CHECK(cfg, sr_diff_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_err_per_mb.den, 1, 1000); + RANGE_CHECK(cfg, kf_frame_min_boost.den, 1, 1000); + RANGE_CHECK(cfg, kf_frame_max_boost_subs.den, 1, 1000); + RANGE_CHECK(cfg, kf_max_total_boost.den, 1, 1000); + RANGE_CHECK(cfg, gf_max_total_boost.den, 1, 1000); + RANGE_CHECK(cfg, gf_frame_max_boost.den, 1, 1000); + RANGE_CHECK(cfg, zm_power_factor.den, 1, 1000); + return VPX_CODEC_OK; } @@ -638,6 +652,41 @@ static vpx_codec_err_t set_encoder_config( return VPX_CODEC_OK; } +static vpx_codec_err_t set_twopass_params_from_config( + const vpx_codec_enc_cfg_t *const cfg, struct VP9_COMP *cpi) { + if (cpi == NULL) return VPX_CODEC_ERROR; + + cpi->twopass.active_wq_factor = + (double)cfg->active_wq_factor.num / (double)cfg->active_wq_factor.den; + cpi->twopass.base_err_per_mb = + (double)cfg->base_err_per_mb.num / (double)cfg->base_err_per_mb.den; + cpi->twopass.sr_default_decay_limit = + (double)cfg->sr_default_decay_limit.num / + (double)cfg->sr_default_decay_limit.den; + cpi->twopass.sr_diff_factor = + (double)cfg->sr_diff_factor.num / (double)cfg->sr_diff_factor.den; + cpi->twopass.kf_err_per_mb = + (double)cfg->kf_err_per_mb.num / (double)cfg->kf_err_per_mb.den; + cpi->twopass.kf_frame_min_boost = + (double)cfg->kf_frame_min_boost.num / (double)cfg->kf_frame_min_boost.den; + cpi->twopass.kf_frame_max_boost_first = + (double)cfg->kf_frame_max_boost_first.num / + (double)cfg->kf_frame_max_boost_first.den; + cpi->twopass.kf_frame_max_boost_subs = + (double)cfg->kf_frame_max_boost_subs.num / + (double)cfg->kf_frame_max_boost_subs.den; + cpi->twopass.kf_max_total_boost = (int)((double)cfg->kf_max_total_boost.num / + (double)cfg->kf_max_total_boost.den); + cpi->twopass.gf_max_total_boost = (int)((double)cfg->gf_max_total_boost.num / + (double)cfg->gf_max_total_boost.den); + cpi->twopass.gf_frame_max_boost = + (double)cfg->gf_frame_max_boost.num / (double)cfg->gf_frame_max_boost.den; + cpi->twopass.zm_power_factor = + (double)cfg->zm_power_factor.num / (double)cfg->zm_power_factor.den; + + return VPX_CODEC_OK; +} + static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg) { vpx_codec_err_t res; @@ -664,6 +713,7 @@ static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, if (res == VPX_CODEC_OK) { ctx->cfg = *cfg; set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); + set_twopass_params_from_config(&ctx->cfg, ctx->cpi); // On profile change, request a key frame force_key |= ctx->cpi->common.profile != ctx->oxcf.profile; vp9_change_config(ctx->cpi, &ctx->oxcf); @@ -696,6 +746,7 @@ static vpx_codec_err_t update_extra_cfg(vpx_codec_alg_priv_t *ctx, if (res == VPX_CODEC_OK) { ctx->extra_cfg = *extra_cfg; set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); + set_twopass_params_from_config(&ctx->cfg, ctx->cpi); vp9_change_config(ctx->cpi, &ctx->oxcf); } return res; @@ -940,6 +991,7 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, #endif priv->cpi = vp9_create_compressor(&priv->oxcf, priv->buffer_pool); if (priv->cpi == NULL) res = VPX_CODEC_MEM_ERROR; + set_twopass_params_from_config(&priv->cfg, priv->cpi); } } @@ -1891,18 +1943,18 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { { 0 }, // ts_layer_id { 0 }, // layer_taget_bitrate 0, // temporal_layering_mode - { 0, 0 }, // active_wq_factor - { 0, 0 }, // 
base_err_per_mb - { 0, 0 }, // sr_default_decay_limit - { 0, 0 }, // sr_diff_factor - { 0, 0 }, // kf_err_per_mb - { 0, 0 }, // kf_frame_min_boost - { 0, 0 }, // kf_frame_max_boost_first - { 0, 0 }, // kf_frame_max_boost_subs - { 0, 0 }, // kf_max_total_boost - { 0, 0 }, // gf_max_total_boost - { 0, 0 }, // gf_frame_max_boost - { 0, 0 }, // zm_power_factor + { 0, 1 }, // active_wq_factor + { 0, 1 }, // base_err_per_mb + { 0, 1 }, // sr_default_decay_limit + { 0, 1 }, // sr_diff_factor + { 0, 1 }, // kf_err_per_mb + { 0, 1 }, // kf_frame_min_boost + { 0, 1 }, // kf_frame_max_boost_first + { 0, 1 }, // kf_frame_max_boost_subs + { 0, 1 }, // kf_max_total_boost + { 0, 1 }, // gf_max_total_boost + { 0, 1 }, // gf_frame_max_boost + { 0, 1 }, // zm_power_factor } }, }; From 0d05ca39f25925eec32cdcea41581c974db3f623 Mon Sep 17 00:00:00 2001 From: Paul Wilkins Date: Tue, 6 Apr 2021 20:08:01 +0100 Subject: [PATCH 071/926] Delete unused constants. Delete some #defines that are no longer needed. Change-Id: I9e4e4df10716598b0d62b0c70f538d4b78a32296 --- vp9/encoder/vp9_rd.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 9efd7425c6..87b9a691a7 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -238,15 +238,6 @@ void vp9_init_rd_parameters(VP9_COMP *cpi) { } } -// Default Rd multiplier values for Q ranges -#define INTER_LOW_QP_RDM 4.0 -#define INTER_MID_QP_RDM 4.5 -#define INTER_HIGH_QP_RDM 3.0 -#define KEY_ULOW_QP_RDM 4.0 -#define KEY_LOW_QP_RDM 3.5 -#define KEY_MID_QP_RDM 4.5 -#define KEY_HIGH_QP_RDM 7.5 - // Returns the default rd multiplier for inter frames for a given qindex. // The function here is a first pass estimate based on data from // a previous Vizer run From ab4383063c79e558ee5d8c8140d38626825ebfec Mon Sep 17 00:00:00 2001 From: Paul Wilkins Date: Tue, 6 Apr 2021 20:05:48 +0100 Subject: [PATCH 072/926] Change zm_factor for Vizier. Changes the exposed zm_factor parameter. This patch alters the meaning of the zm_factor parameter that will be exposed for the Vizier project. The previous power factor was hard to interpret in terms of its meaning and effect and has been replaced by a linear factor. Given that the initial Vizier results suggested a lower zero motion effect for all formats, the default impact has been reduced. 
The patch as it stands gives a modest improvement for PSNR but is slightly down on some sets for SSIM (overall psnr, ssim % bdrate change: -ve is better) lowres -0.111, 0.001 ugc360p -0.282, -0.068 midres2 -0.183, 0.059 hdres2 -0.042, 0.172 Change-Id: Id6566433ceed8470d5fad1f30282daed56de385d --- vp8/vp8_cx_iface.c | 4 ++-- vp9/encoder/vp9_firstpass.c | 30 +++++++++++++++++++----------- vp9/encoder/vp9_firstpass.h | 2 +- vp9/vp9_cx_iface.c | 8 ++++---- vpx/vpx_encoder.h | 2 +- vpxenc.c | 8 ++++---- 6 files changed, 31 insertions(+), 23 deletions(-) diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 3f09ec38a4..1ffd81924f 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -268,7 +268,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(cfg, kf_max_total_boost.den, 1, 1000); RANGE_CHECK(cfg, gf_max_total_boost.den, 1, 1000); RANGE_CHECK(cfg, gf_frame_max_boost.den, 1, 1000); - RANGE_CHECK(cfg, zm_power_factor.den, 1, 1000); + RANGE_CHECK(cfg, zm_factor.den, 1, 1000); return VPX_CODEC_OK; } @@ -1313,7 +1313,7 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = { { 0, 1 }, /* kf_max_total_boost */ { 0, 1 }, /* gf_max_total_boost */ { 0, 1 }, /* gf_frame_max_boost */ - { 0, 1 }, /* zm_power_factor */ + { 0, 1 }, /* zm_factor */ } }, }; diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index a43099e946..f142f2611f 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -73,7 +73,7 @@ #define MAX_KF_TOT_BOOST 5400 #endif -#define ZM_POWER_FACTOR 0.75 +#define DEFAULT_ZM_FACTOR 0.5 #define MINQ_ADJ_LIMIT 48 #define MINQ_ADJ_LIMIT_CQ 20 #define HIGH_UNDERSHOOT_RATIO 2 @@ -1878,9 +1878,17 @@ static double get_zero_motion_factor(const TWO_PASS *const twopass, static double get_prediction_decay_rate(const TWO_PASS *const twopass, const FIRSTPASS_STATS *frame_stats) { const double sr_decay_rate = get_sr_decay_rate(twopass, frame_stats); - const double zero_motion_factor = - (0.95 * pow((frame_stats->pcnt_inter - frame_stats->pcnt_motion), - twopass->zm_power_factor)); + double zero_motion_factor = + twopass->zm_factor * DEFAULT_ZM_FACTOR * + (frame_stats->pcnt_inter - frame_stats->pcnt_motion); + + // Clamp value to range 0.0 to 1.0 + // This should happen anyway if input values are sensibly clamped but checked + // here just in case. + if (zero_motion_factor > 1.0) + zero_motion_factor = 1.0; + else if (zero_motion_factor < 0.0) + zero_motion_factor = 0.0; return VPXMAX(zero_motion_factor, (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor))); @@ -3501,7 +3509,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_power_factor = ZM_POWER_FACTOR; + twopass->zm_factor = 1.0; } else { // Vizer experimental parameters from training. // Later these will be set via the command line. 
@@ -3517,7 +3525,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->kf_frame_max_boost_first = 25.5; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_power_factor = 2.93715229184991; + twopass->zm_factor = 1.0; } else if (screen_area <= 320 * 240) { twopass->active_wq_factor = 55.0; twopass->base_err_per_mb = 34525.33177195309; @@ -3530,7 +3538,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->kf_frame_max_boost_first = 185.0; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_power_factor = 3.5299221493593413; + twopass->zm_factor = 1.0; } else if (screen_area <= 640 * 360) { twopass->active_wq_factor = 12.5; twopass->base_err_per_mb = 18823.978018028298; @@ -3543,7 +3551,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->kf_frame_max_boost_first = 224.5; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_power_factor = 2.265742666649307; + twopass->zm_factor = 1.0; } else if (screen_area <= 854 * 480) { twopass->active_wq_factor = 51.5; twopass->base_err_per_mb = 33718.98307662595; @@ -3556,7 +3564,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->kf_frame_max_boost_first = 28.0; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_power_factor = 3.552278528517416; + twopass->zm_factor = 1.0; } else if (screen_area <= 1280 * 720) { twopass->active_wq_factor = 41.5; twopass->base_err_per_mb = 29527.46375825401; @@ -3569,7 +3577,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->kf_frame_max_boost_first = 53.0; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_power_factor = 2.568627575572356; + twopass->zm_factor = 1.0; } else { twopass->active_wq_factor = 31.0; twopass->base_err_per_mb = 34474.723463367416; @@ -3582,7 +3590,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->kf_frame_max_boost_first = 419.5; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_power_factor = 5.5776463538431935; + twopass->zm_factor = 1.0; } } } diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index 624fccd428..8ec8a44748 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -234,7 +234,7 @@ typedef struct { int kf_max_total_boost; int gf_max_total_boost; double gf_frame_max_boost; - double zm_power_factor; + double zm_factor; } TWO_PASS; struct VP9_COMP; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 7530850ea1..94b1afbcc6 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -360,7 +360,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(cfg, kf_max_total_boost.den, 1, 1000); RANGE_CHECK(cfg, gf_max_total_boost.den, 1, 1000); RANGE_CHECK(cfg, gf_frame_max_boost.den, 1, 1000); - RANGE_CHECK(cfg, zm_power_factor.den, 1, 1000); + RANGE_CHECK(cfg, zm_factor.den, 1, 1000); return VPX_CODEC_OK; } @@ -681,8 +681,8 @@ static vpx_codec_err_t set_twopass_params_from_config( (double)cfg->gf_max_total_boost.den); 
cpi->twopass.gf_frame_max_boost = (double)cfg->gf_frame_max_boost.num / (double)cfg->gf_frame_max_boost.den;
- cpi->twopass.zm_power_factor = - (double)cfg->zm_power_factor.num / (double)cfg->zm_power_factor.den;
+ cpi->twopass.zm_factor = + (double)cfg->zm_factor.num / (double)cfg->zm_factor.den;
return VPX_CODEC_OK; }
@@ -1954,7 +1954,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { { 0, 1 }, // kf_max_total_boost { 0, 1 }, // gf_max_total_boost { 0, 1 }, // gf_frame_max_boost
- { 0, 1 }, // zm_power_factor
+ { 0, 1 }, // zm_factor
} }, };
diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index accc127f64..497051e037 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h
@@ -776,7 +776,7 @@ typedef struct vpx_codec_enc_cfg { * Rate control parameters, set from external experiment results. * */
- vpx_rational_t zm_power_factor;
+ vpx_rational_t zm_factor;
} vpx_codec_enc_cfg_t; /**< alias for struct vpx_codec_enc_cfg */ /*!\brief vp9 svc extra configure parameters
diff --git a/vpxenc.c b/vpxenc.c index 3c04c64b3b..874dddb137 100644 --- a/vpxenc.c +++ b/vpxenc.c
@@ -310,7 +310,7 @@ static const arg_def_t gf_max_total_boost = ARG_DEF(NULL, "gf-max-total-boost", 1, "Golden frame max total boost"); static const arg_def_t gf_frame_max_boost = ARG_DEF(NULL, "gf-frame-max-boost", 1, "Golden frame max boost");
-static const arg_def_t zm_power_factor =
+static const arg_def_t zm_factor =
ARG_DEF(NULL, "zm-power-factor", 1, "Zero motion power factor"); static const arg_def_t *vizier_rc_args[] = { &active_wq_factor, &base_err_per_mb,
@@ -323,7 +323,7 @@ static const arg_def_t *vizier_rc_args[] = { &active_wq_factor, &kf_max_total_boost, &gf_max_total_boost, &gf_frame_max_boost,
- &zm_power_factor,
+ &zm_factor,
NULL }; #endif
@@ -1048,8 +1048,8 @@ static int parse_stream_params(struct VpxEncoderConfig *global, config->cfg.gf_max_total_boost = arg_parse_rational(&arg); } else if (arg_match(&arg, &gf_frame_max_boost, argi)) { config->cfg.gf_frame_max_boost = arg_parse_rational(&arg);
- } else if (arg_match(&arg, &zm_power_factor, argi)) { - config->cfg.zm_power_factor = arg_parse_rational(&arg);
+ } else if (arg_match(&arg, &zm_factor, argi)) { + config->cfg.zm_factor = arg_parse_rational(&arg);
#endif #if CONFIG_VP9_HIGHBITDEPTH } else if (arg_match(&arg, &test16bitinternalarg, argi)) {
From 06fee5a89b7729cc178180e9c6743138faba7e79 Mon Sep 17 00:00:00 2001
From: Cheng Chen
Date: Tue, 6 Apr 2021 17:20:39 -0700
Subject: [PATCH 073/926] Adjust end to end psnr value

A recent change leads to a slight difference in encoding results: d3aaac367 Change calculation of rd multiplier, which was caught by the Jenkins nightly test. Adjust the threshold to silence the test failure.
BUG=webm:1725 Change-Id: I7e8b3a26b72c831ae4d88d0fca681b354314739d --- test/vp9_end_to_end_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/vp9_end_to_end_test.cc b/test/vp9_end_to_end_test.cc index 7cc126ea58..7a85db26a4 100644 --- a/test/vp9_end_to_end_test.cc +++ b/test/vp9_end_to_end_test.cc @@ -31,7 +31,7 @@ const double kPsnrThreshold[][5] = { { 36.0, 37.0, 37.0, 37.0, 37.0 }, { 35.0, 36.0, 36.0, 36.0, 36.0 }, { 34.0, 35.0, 35.0, 35.0, 35.0 }, { 33.0, 34.0, 34.0, 34.0, 34.0 }, { 32.0, 33.0, 33.0, 33.0, 33.0 }, { 28.0, 32.0, 32.0, 32.0, 32.0 }, - { 28.5, 31.0, 31.0, 31.0, 31.0 }, { 27.5, 30.0, 30.0, 30.0, 30.0 }, + { 28.4, 31.0, 31.0, 31.0, 31.0 }, { 27.5, 30.0, 30.0, 30.0, 30.0 }, }; typedef struct { From 7a5596fa78ad41c428c2e3a4196d3b391d4ac77c Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Fri, 2 Apr 2021 14:27:16 -0700 Subject: [PATCH 074/926] Fix compilation for CONFIG_RATE_CTRL Recently, some function signatures have been changed. This change fixes compilation error if --enable-rate-ctrl is used. Change-Id: Ib8e9cb5e181ba1d4a6969883e377f3dd93e9289a --- vp9/encoder/vp9_firstpass.c | 23 ++++++++++++++--------- vp9/encoder/vp9_firstpass.h | 12 ++++++------ vp9/simple_encode.cc | 14 +++++++------- 3 files changed, 27 insertions(+), 22 deletions(-) diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index a43099e946..b3f19b6068 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -2545,6 +2545,9 @@ typedef struct RANGE { * (The following fields will remain unchanged after initialization of encoder.) * rc->static_scene_max_gf_interval * rc->min_gf_interval + * twopass->sr_diff_factor + * twopass->sr_default_decay_limit + * twopass->zm_factor * * Dynamic fields: * (The following fields will be updated before or after coding each frame.) @@ -2559,10 +2562,10 @@ typedef struct RANGE { * structs. 
*/ static int get_gop_coding_frame_num( - int *use_alt_ref, const FRAME_INFO *frame_info, TWO_PASS *const twopass, - const RATE_CONTROL *rc, int gf_start_show_idx, - const RANGE *active_gf_interval, double gop_intra_factor, - int lag_in_frames) { + int *use_alt_ref, const FRAME_INFO *frame_info, + const TWO_PASS *const twopass, const RATE_CONTROL *rc, + int gf_start_show_idx, const RANGE *active_gf_interval, + double gop_intra_factor, int lag_in_frames) { const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; double loop_decay_rate = 1.00; double mv_ratio_accumulator = 0.0; @@ -3868,7 +3871,7 @@ void vp9_get_next_group_of_picture(const VP9_COMP *cpi, int *first_is_key_frame, *first_is_key_frame = 0; if (rc.frames_to_key == 0) { rc.frames_to_key = vp9_get_frames_to_next_key( - &cpi->oxcf, twopass, *first_show_idx, rc.min_gf_interval); + &cpi->oxcf, &cpi->twopass, *first_show_idx, rc.min_gf_interval); rc.frames_since_key = 0; *first_is_key_frame = 1; } @@ -3879,15 +3882,15 @@ void vp9_get_next_group_of_picture(const VP9_COMP *cpi, int *first_is_key_frame, assert(*coding_frame_count < rc.frames_to_key); } else { *coding_frame_count = vp9_get_gop_coding_frame_count( - &cpi->oxcf, &cpi->frame_info, &cpi->twopass.first_pass_info, &rc, - *first_show_idx, multi_layer_arf, allow_alt_ref, *first_is_key_frame, + &cpi->oxcf, &cpi->twopass, &cpi->frame_info, &rc, *first_show_idx, + multi_layer_arf, allow_alt_ref, *first_is_key_frame, *last_gop_use_alt_ref, use_alt_ref); } } int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf, + const TWO_PASS *const twopass, const FRAME_INFO *frame_info, - const FIRST_PASS_INFO *first_pass_info, const RATE_CONTROL *rc, int show_idx, int multi_layer_arf, int allow_alt_ref, int first_is_key_frame, @@ -3917,6 +3920,7 @@ int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf, // Under CONFIG_RATE_CTRL, once the first_pass_info is ready, the number of // coding frames (including show frame and alt ref) can be determined. int vp9_get_coding_frame_num(const VP9EncoderConfig *oxcf, + const TWO_PASS *const twopass, const FRAME_INFO *frame_info, const FIRST_PASS_INFO *first_pass_info, int multi_layer_arf, int allow_alt_ref) { @@ -3939,7 +3943,7 @@ int vp9_get_coding_frame_num(const VP9EncoderConfig *oxcf, } gop_coding_frame_count = vp9_get_gop_coding_frame_count( - oxcf, frame_info, first_pass_info, &rc, show_idx, multi_layer_arf, + oxcf, twopass, frame_info, &rc, show_idx, multi_layer_arf, allow_alt_ref, first_is_key_frame, last_gop_use_alt_ref, &use_alt_ref); rc.source_alt_ref_active = use_alt_ref; @@ -3955,6 +3959,7 @@ int vp9_get_coding_frame_num(const VP9EncoderConfig *oxcf, void vp9_get_key_frame_map(const VP9EncoderConfig *oxcf, const TWO_PASS *const twopass, int *key_frame_map) { + const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; int show_idx = 0; RATE_CONTROL rc; vp9_rc_init(oxcf, 1, &rc); diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index 624fccd428..7586ce868a 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -278,8 +278,8 @@ void vp9_get_next_group_of_picture(const struct VP9_COMP *cpi, /*!\brief Call this function before coding a new group of pictures to get * information about it. 
* \param[in] oxcf Encoder config + * \param[in] twopass Twopass info * \param[in] frame_info Frame info - * \param[in] first_pass_info First pass stats * \param[in] rc Rate control state * \param[in] show_idx Show index of the first frame in the group * \param[in] multi_layer_arf Is multi-layer alternate reference used @@ -292,26 +292,26 @@ void vp9_get_next_group_of_picture(const struct VP9_COMP *cpi, * \return Returns coding frame count */ int vp9_get_gop_coding_frame_count(const struct VP9EncoderConfig *oxcf, + const TWO_PASS *const twopass, const FRAME_INFO *frame_info, - const FIRST_PASS_INFO *first_pass_info, const RATE_CONTROL *rc, int show_idx, int multi_layer_arf, int allow_alt_ref, int first_is_key_frame, int last_gop_use_alt_ref, int *use_alt_ref); int vp9_get_coding_frame_num(const struct VP9EncoderConfig *oxcf, + const TWO_PASS *const twopass, const FRAME_INFO *frame_info, const FIRST_PASS_INFO *first_pass_info, int multi_layer_arf, int allow_alt_ref); /*!\brief Compute a key frame binary map indicates whether key frames appear * in the corresponding positions. The passed in key_frame_map must point to an - * integer array with length equal to first_pass_info->num_frames, which is the - * number of show frames in the video. + * integer array with length equal to twopass->first_pass_info.num_frames, + * which is the number of show frames in the video. */ void vp9_get_key_frame_map(const struct VP9EncoderConfig *oxcf, - const FIRST_PASS_INFO *first_pass_info, - int *key_frame_map); + const TWO_PASS *const twopass, int *key_frame_map); #endif // CONFIG_RATE_CTRL FIRSTPASS_STATS vp9_get_frame_stats(const TWO_PASS *twopass); diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index d4eb0c669d..efdc71eb98 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -1084,8 +1084,7 @@ void SimpleEncode::UpdateKeyFrameGroup(int key_frame_show_index) { const VP9_COMP *cpi = impl_ptr_->cpi; key_frame_group_index_ = 0; key_frame_group_size_ = vp9_get_frames_to_next_key( - &cpi->oxcf, &cpi->frame_info, &cpi->twopass.first_pass_info, - key_frame_show_index, cpi->rc.min_gf_interval); + &cpi->oxcf, &cpi->twopass, key_frame_show_index, cpi->rc.min_gf_interval); assert(key_frame_group_size_ > 0); // Init the reference frame info when a new key frame group appears. InitRefFrameInfo(&ref_frame_info_); @@ -1250,6 +1249,7 @@ int SimpleEncode::GetCodingFrameNum() const { } // These are the default settings for now. + const VP9_COMP *cpi = impl_ptr_->cpi; const int multi_layer_arf = 0; const int allow_alt_ref = 1; vpx_rational_t frame_rate = @@ -1262,11 +1262,13 @@ int SimpleEncode::GetCodingFrameNum() const { fps_init_first_pass_info(&first_pass_info, GetVectorData(impl_ptr_->first_pass_stats), num_frames_); - return vp9_get_coding_frame_num(&oxcf, &frame_info, &first_pass_info, - multi_layer_arf, allow_alt_ref); + return vp9_get_coding_frame_num(&oxcf, &cpi->twopass, &frame_info, + &first_pass_info, multi_layer_arf, + allow_alt_ref); } std::vector SimpleEncode::ComputeKeyFrameMap() const { + const VP9_COMP *cpi = impl_ptr_->cpi; // The last entry of first_pass_stats is the overall stats. 
assert(impl_ptr_->first_pass_stats.size() == num_frames_ + 1); vpx_rational_t frame_rate =
@@ -1274,14 +1276,12 @@ std::vector SimpleEncode::ComputeKeyFrameMap() const { const VP9EncoderConfig oxcf = GetEncodeConfig( frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_, VPX_RC_LAST_PASS, impl_ptr_->encode_config_list);
- FRAME_INFO frame_info = vp9_get_frame_info(&oxcf);
FIRST_PASS_INFO first_pass_info; fps_init_first_pass_info(&first_pass_info, GetVectorData(impl_ptr_->first_pass_stats), num_frames_); std::vector key_frame_map(num_frames_, 0);
- vp9_get_key_frame_map(&oxcf, &frame_info, &first_pass_info, - GetVectorData(key_frame_map));
+ vp9_get_key_frame_map(&oxcf, &cpi->twopass, GetVectorData(key_frame_map));
return key_frame_map; }
From 1c792f2991e632be26414ecb62835bf148c086b4 Mon Sep 17 00:00:00 2001
From: James Zern
Date: Thu, 8 Apr 2021 17:34:16 -0700
Subject: [PATCH 075/926] vpx_image: clear user provided vpx_image_t early

This avoids uninitialized values and potential misuse of them, which could lead to a crash should the function fail. This is the same fix that was applied in libaom: d0cac70b5 Fix a free on invalid ptr when img allocation fails

Bug: webm:1722
Change-Id: If7a8d08c4b010f12e2e1d848613c0fa7328f1f9c
--- vpx/src/vpx_image.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/vpx/src/vpx_image.c b/vpx/src/vpx_image.c index ff496b5d34..2b7411f94f 100644 --- a/vpx/src/vpx_image.c +++ b/vpx/src/vpx_image.c
@@ -24,6 +24,8 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int stride_in_bytes; int align;
+ if (img != NULL) memset(img, 0, sizeof(vpx_image_t)); +
/* Treat align==0 like align==1 */ if (!buf_align) buf_align = 1;
@@ -88,8 +90,6 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt, if (!img) goto fail; img->self_allocd = 1;
- } else { - memset(img, 0, sizeof(vpx_image_t));
} img->img_data = img_data;
From c77a7f60040ff63c8dd778cfdd322cd6d7f62e17 Mon Sep 17 00:00:00 2001
From: Paul Wilkins
Date: Mon, 12 Apr 2021 13:51:44 +0100
Subject: [PATCH 076/926] Removed unused constant

Deleted a #define that is no longer referenced.

Change-Id: If0b132c5a40dd8910f535fffdee7d2d1c7df4748
--- vp9/encoder/vp9_firstpass.c | 1 - 1 file changed, 1 deletion(-)
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index f142f2611f..61ad3b6524 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c
@@ -54,7 +54,6 @@ #define NCOUNT_INTRA_THRESH 8192 #define NCOUNT_INTRA_FACTOR 3
-#define SR_DIFF_PART 0.0015
#define INTRA_PART 0.005 #define DEFAULT_DECAY_LIMIT 0.75 #define LOW_SR_DIFF_TRHESH 0.1
From 1b07aae9e47a5480f78025dc3434331faaa68e33 Mon Sep 17 00:00:00 2001
From: Cheng Chen
Date: Tue, 13 Apr 2021 11:59:40 -0700
Subject: [PATCH 077/926] Set vizier rc parameters

If --use-vizier-rc-params=1 is passed, the rc parameters are overwritten by the passed-in values. If --use-vizier-rc-params=0, the rc parameters remain at their default values.
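As a usage sketch (file names and factor values here are hypothetical, for illustration only), a two-pass encode that overrides the defaults might be invoked as:

  vpxenc --codec=vp9 --passes=2 --target-bitrate=800 \
    --use-vizier-rc-params=1 \
    --active-wq-factor=55/1 \
    --zm-factor=1/1 \
    -o out.webm input.y4m

Each rc parameter is supplied as a rational (num/den) and parsed with arg_parse_rational(); with --use-vizier-rc-params=0, set_twopass_params_from_config() returns early and any such values are ignored in favor of the built-in defaults.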
Change-Id: I7a3e806e0918f49e8970997379a6e99af6bb7cac --- vp8/vp8_cx_iface.c | 2 + vp9/encoder/vp9_firstpass.c | 125 +++++++----------------------------- vp9/encoder/vp9_firstpass.h | 4 ++ vp9/vp9_cx_iface.c | 4 ++ vpx/vpx_encoder.h | 30 +++++++++ vpxenc.c | 10 ++- 6 files changed, 72 insertions(+), 103 deletions(-) diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 1ffd81924f..64d01e535e 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -258,6 +258,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, #endif // The range below shall be further tuned. + RANGE_CHECK(cfg, use_vizier_rc_params, 0, 1); RANGE_CHECK(cfg, active_wq_factor.den, 1, 1000); RANGE_CHECK(cfg, base_err_per_mb.den, 1, 1000); RANGE_CHECK(cfg, sr_default_decay_limit.den, 1, 1000); @@ -1302,6 +1303,7 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = { { 0 }, /* ts_layer_id */ { 0 }, /* layer_target_bitrate */ 0, /* temporal_layering_mode */ + 0, /* use_vizier_rc_params */ { 0, 1 }, /* active_wq_factor */ { 0, 1 }, /* base_err_per_mb */ { 0, 1 }, /* sr_default_decay_limit */ diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 07d8eddd0b..9b3d7425e2 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -3493,109 +3493,32 @@ static int is_skippable_frame(const VP9_COMP *cpi) { // Configure image size specific vizier parameters. // Later these will be set via additional command line options static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { - if (1) { - // Force defaults for now - twopass->active_wq_factor = AV_WQ_FACTOR; - twopass->base_err_per_mb = BASELINE_ERR_PER_MB; - twopass->sr_default_decay_limit = DEFAULT_DECAY_LIMIT; - twopass->sr_diff_factor = 1.0; - twopass->gf_frame_max_boost = GF_MAX_FRAME_BOOST; - twopass->gf_max_total_boost = MAX_GF_BOOST; - if (screen_area < 1280 * 720) { - twopass->kf_err_per_mb = 2000.0; - } else if (screen_area < 1920 * 1080) { - twopass->kf_err_per_mb = 500.0; - } else { - twopass->kf_err_per_mb = 250.0; - } - twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; - twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; - twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; - twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_factor = 1.0; + // When |use_vizier_rc_params| is 1, we expect the rc parameters have been + // initialized by the pass in values. + // Be careful that parameters below are only initialized to 0, if we do not + // pass values to them. It is desired to take care of each parameter when + // using |use_vizier_rc_params|. + if (twopass->use_vizier_rc_params) return; + + // When |use_vizier_rc_params| is 0, use defaults for now. + twopass->active_wq_factor = AV_WQ_FACTOR; + twopass->base_err_per_mb = BASELINE_ERR_PER_MB; + twopass->sr_default_decay_limit = DEFAULT_DECAY_LIMIT; + twopass->sr_diff_factor = 1.0; + twopass->gf_frame_max_boost = GF_MAX_FRAME_BOOST; + twopass->gf_max_total_boost = MAX_GF_BOOST; + if (screen_area < 1280 * 720) { + twopass->kf_err_per_mb = 2000.0; + } else if (screen_area < 1920 * 1080) { + twopass->kf_err_per_mb = 500.0; } else { - // Vizer experimental parameters from training. - // Later these will be set via the command line. 
- if (screen_area <= 176 * 144) { - twopass->active_wq_factor = 46.0; - twopass->base_err_per_mb = 37597.399760969536; - twopass->sr_default_decay_limit = 0.3905639800962774; - twopass->sr_diff_factor = 6.4; - twopass->gf_frame_max_boost = 87.27362648627846; - twopass->gf_max_total_boost = MAX_GF_BOOST; - twopass->kf_err_per_mb = 1854.8255436877148; - twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; - twopass->kf_frame_max_boost_first = 25.5; - twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; - twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_factor = 1.0; - } else if (screen_area <= 320 * 240) { - twopass->active_wq_factor = 55.0; - twopass->base_err_per_mb = 34525.33177195309; - twopass->sr_default_decay_limit = 0.23901360046804604; - twopass->sr_diff_factor = 5.73; - twopass->gf_frame_max_boost = 127.34978204980285; - twopass->gf_max_total_boost = MAX_GF_BOOST; - twopass->kf_err_per_mb = 723.8337508755031; - twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; - twopass->kf_frame_max_boost_first = 185.0; - twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; - twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_factor = 1.0; - } else if (screen_area <= 640 * 360) { - twopass->active_wq_factor = 12.5; - twopass->base_err_per_mb = 18823.978018028298; - twopass->sr_default_decay_limit = 0.6043527690301296; - twopass->sr_diff_factor = 2.28; - twopass->gf_frame_max_boost = 75.17672317013668; - twopass->gf_max_total_boost = MAX_GF_BOOST; - twopass->kf_err_per_mb = 422.2871502380377; - twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; - twopass->kf_frame_max_boost_first = 224.5; - twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; - twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_factor = 1.0; - } else if (screen_area <= 854 * 480) { - twopass->active_wq_factor = 51.5; - twopass->base_err_per_mb = 33718.98307662595; - twopass->sr_default_decay_limit = 0.33633414970713393; - twopass->sr_diff_factor = 5.8; - twopass->gf_frame_max_boost = 85.2868528581522; - twopass->gf_max_total_boost = MAX_GF_BOOST; - twopass->kf_err_per_mb = 1513.4883914008383; - twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; - twopass->kf_frame_max_boost_first = 28.0; - twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; - twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_factor = 1.0; - } else if (screen_area <= 1280 * 720) { - twopass->active_wq_factor = 41.5; - twopass->base_err_per_mb = 29527.46375825401; - twopass->sr_default_decay_limit = 0.5009117586299728; - twopass->sr_diff_factor = 3.33; - twopass->gf_frame_max_boost = 81.00472969483079; - twopass->gf_max_total_boost = MAX_GF_BOOST; - twopass->kf_err_per_mb = 998.6342911785146; - twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; - twopass->kf_frame_max_boost_first = 53.0; - twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; - twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_factor = 1.0; - } else { - twopass->active_wq_factor = 31.0; - twopass->base_err_per_mb = 34474.723463367416; - twopass->sr_default_decay_limit = 0.23346886902707745; - twopass->sr_diff_factor = 7.6; - twopass->gf_frame_max_boost = 213.2940230360479; - twopass->gf_max_total_boost = MAX_GF_BOOST; - twopass->kf_err_per_mb = 35931.25734431429; - twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; - twopass->kf_frame_max_boost_first = 419.5; - twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; - twopass->kf_max_total_boost = 
MAX_KF_TOT_BOOST; - twopass->zm_factor = 1.0; - } + twopass->kf_err_per_mb = 250.0; } + twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; + twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; + twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; + twopass->zm_factor = 1.0; } void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index 441859a342..1418b67a48 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -223,6 +223,10 @@ typedef struct { GF_GROUP gf_group; // Vizeir project experimental two pass rate control parameters. + // When |use_vizier_rc_params| is 1, the following parameters will + // be overwritten by pass in values. Otherwise, they are initialized + // by default values. + int use_vizier_rc_params; double active_wq_factor; double base_err_per_mb; double sr_default_decay_limit; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 94b1afbcc6..e35b6f1e24 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -350,6 +350,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(extra_cfg, color_range, VPX_CR_STUDIO_RANGE, VPX_CR_FULL_RANGE); // The range below shall be further tuned. + RANGE_CHECK(cfg, use_vizier_rc_params, 0, 1); RANGE_CHECK(cfg, active_wq_factor.den, 1, 1000); RANGE_CHECK(cfg, base_err_per_mb.den, 1, 1000); RANGE_CHECK(cfg, sr_default_decay_limit.den, 1, 1000); @@ -654,8 +655,10 @@ static vpx_codec_err_t set_encoder_config( static vpx_codec_err_t set_twopass_params_from_config( const vpx_codec_enc_cfg_t *const cfg, struct VP9_COMP *cpi) { + if (!cfg->use_vizier_rc_params) return VPX_CODEC_OK; if (cpi == NULL) return VPX_CODEC_ERROR; + cpi->twopass.use_vizier_rc_params = cfg->use_vizier_rc_params; cpi->twopass.active_wq_factor = (double)cfg->active_wq_factor.num / (double)cfg->active_wq_factor.den; cpi->twopass.base_err_per_mb = @@ -1943,6 +1946,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { { 0 }, // ts_layer_id { 0 }, // layer_taget_bitrate 0, // temporal_layering_mode + 0, // use_vizier_rc_params { 0, 1 }, // active_wq_factor { 0, 1 }, // base_err_per_mb { 0, 1 }, // sr_default_decay_limit diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index 497051e037..3c9304b37a 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -694,9 +694,17 @@ typedef struct vpx_codec_enc_cfg { */ int temporal_layering_mode; + /*!\brief A flag indicating whether to use external rate control parameters. + * By default is 0. If set to 1, the following parameters will be used in the + * rate control system. + */ + int use_vizier_rc_params; + /*!\brief Active worst quality factor. * * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. * */ vpx_rational_t active_wq_factor; @@ -704,6 +712,8 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Base error per macroblock. * * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. * */ vpx_rational_t base_err_per_mb; @@ -711,6 +721,8 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Second reference default decay limit. * * Rate control parameters, set from external experiment results. 
+ * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. * */ vpx_rational_t sr_default_decay_limit; @@ -718,6 +730,8 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Second reference difference factor. * * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. * */ vpx_rational_t sr_diff_factor; @@ -725,6 +739,8 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Keyframe error per macroblock. * * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. * */ vpx_rational_t kf_err_per_mb; @@ -732,6 +748,8 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Keyframe minimum boost. * * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. * */ vpx_rational_t kf_frame_min_boost; @@ -739,6 +757,8 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Keyframe maximum boost, for the first keyframe in a chunk. * * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. * */ vpx_rational_t kf_frame_max_boost_first; @@ -746,6 +766,8 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Keyframe maximum boost, for subsequent keyframes. * * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. * */ vpx_rational_t kf_frame_max_boost_subs; @@ -753,6 +775,8 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Keyframe maximum total boost. * * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. * */ vpx_rational_t kf_max_total_boost; @@ -760,6 +784,8 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Golden frame maximum total boost. * * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. * */ vpx_rational_t gf_max_total_boost; @@ -767,6 +793,8 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Golden frame maximum boost. * * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. * */ vpx_rational_t gf_frame_max_boost; @@ -774,6 +802,8 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Zero motion power factor. * * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. 
* */ vpx_rational_t zm_factor;
diff --git a/vpxenc.c b/vpxenc.c index 874dddb137..c9e386a06b 100644 --- a/vpxenc.c +++ b/vpxenc.c
@@ -288,6 +288,8 @@ static const arg_def_t *rc_args[] = { }; #if CONFIG_VP9_ENCODER
+static const arg_def_t use_vizier_rc_params = + ARG_DEF(NULL, "use-vizier-rc-params", 1, "Use vizier rc params");
static const arg_def_t active_wq_factor = ARG_DEF(NULL, "active-wq-factor", 1, "Active worst quality factor"); static const arg_def_t base_err_per_mb =
@@ -311,8 +313,9 @@ static const arg_def_t gf_max_total_boost = static const arg_def_t gf_frame_max_boost = ARG_DEF(NULL, "gf-frame-max-boost", 1, "Golden frame max boost"); static const arg_def_t zm_factor =
- ARG_DEF(NULL, "zm-power-factor", 1, "Zero motion power factor"); -static const arg_def_t *vizier_rc_args[] = { &active_wq_factor,
+ ARG_DEF(NULL, "zm-factor", 1, "Zero motion power factor"); +static const arg_def_t *vizier_rc_args[] = { &use_vizier_rc_params, + &active_wq_factor,
&base_err_per_mb, &sr_default_decay_limit, &sr_diff_factor,
@@ -1026,6 +1029,8 @@ static int parse_stream_params(struct VpxEncoderConfig *global, } else if (arg_match(&arg, &kf_disabled, argi)) { config->cfg.kf_mode = VPX_KF_DISABLED; #if CONFIG_VP9_ENCODER
+ } else if (arg_match(&arg, &use_vizier_rc_params, argi)) { + config->cfg.use_vizier_rc_params = arg_parse_int(&arg);
} else if (arg_match(&arg, &active_wq_factor, argi)) { config->cfg.active_wq_factor = arg_parse_rational(&arg); } else if (arg_match(&arg, &base_err_per_mb, argi)) {
@@ -1246,6 +1251,7 @@ static void show_stream_config(struct stream_state *stream, SHOW(kf_min_dist); SHOW(kf_max_dist); // Temporary use for debug
+ SHOW(use_vizier_rc_params);
SHOW(active_wq_factor.num); SHOW(active_wq_factor.den); }
From 665cccfd6ccbc1be2db7d550b68388679b573410 Mon Sep 17 00:00:00 2001
From: Cheng Chen
Date: Wed, 14 Apr 2021 22:15:54 -0700
Subject: [PATCH 078/926] Pass vizier rd parameter values

Add command line options for three rd parameters. They are controlled by --use-vizier-rc-params, together with the other rc parameters. If not set from the command line, the current default values will be used.
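For example (a sketch with hypothetical values), each new option is parsed as a rational and reduced to a double in set_twopass_params_from_config(), exactly as for the existing rc parameters:

  /* --rd-mult-inter-qp-fac=9/10 arrives as a vpx_rational_t and becomes
   * the double 0.9 used by the rd code; 1/1 keeps the default behavior. */
  vpx_rational_t fac = { 9, 10 };
  double rd_mult_inter_qp_fac = (double)fac.num / (double)fac.den; /* 0.9 */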
Change-Id: Ie1b9a98a50326551cc1d5940c4b637cb01a61aa0 --- vp8/vp8_cx_iface.c | 6 ++++++ vp9/encoder/vp9_rd.c | 41 ++++++++++------------------------------- vp9/vp9_cx_iface.c | 12 ++++++++++++ vpx/vpx_encoder.h | 30 ++++++++++++++++++++++++++++++ vpxenc.c | 15 +++++++++++++++ 5 files changed, 73 insertions(+), 31 deletions(-) diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 64d01e535e..872710f138 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -270,6 +270,9 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(cfg, gf_max_total_boost.den, 1, 1000); RANGE_CHECK(cfg, gf_frame_max_boost.den, 1, 1000); RANGE_CHECK(cfg, zm_factor.den, 1, 1000); + RANGE_CHECK(cfg, rd_mult_inter_qp_fac.den, 1, 1000); + RANGE_CHECK(cfg, rd_mult_arf_qp_fac.den, 1, 1000); + RANGE_CHECK(cfg, rd_mult_key_qp_fac.den, 1, 1000); return VPX_CODEC_OK; } @@ -1316,6 +1319,9 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = { { 0, 1 }, /* gf_max_total_boost */ { 0, 1 }, /* gf_frame_max_boost */ { 0, 1 }, /* zm_factor */ + { 1, 1 }, /* rd_mult_inter_qp_fac */ + { 1, 1 }, /* rd_mult_arf_qp_fac */ + { 1, 1 }, /* rd_mult_key_qp_fac */ } }, }; diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 87b9a691a7..9fa3ff1865 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -202,40 +202,19 @@ static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128, void vp9_init_rd_parameters(VP9_COMP *cpi) { RD_CONTROL *const rdc = &cpi->rd_ctrl; + // When |use_vizier_rc_params| is 1, we expect the rd parameters have been + // initialized by the pass in values. + // Be careful that parameters below are only initialized to 1, if we do not + // pass values to them. It is desired to take care of each parameter when + // using |use_vizier_rc_params|. + if (cpi->twopass.use_vizier_rc_params) return; + // Make sure this function is floating point safe. vpx_clear_system_state(); - rdc->rd_mult_arf_qp_fac = 1.0; // Default: No Vizier values yet - - // These hard wired estimates for the Vizier values will be removed later - // as the per format factors will be set on the command line. - if (0) { - unsigned int screen_area = (cpi->common.width * cpi->common.height); - - if (screen_area <= 176 * 144) { - rdc->rd_mult_inter_qp_fac = 0.896; - rdc->rd_mult_key_qp_fac = 1.050; - } else if (screen_area <= 320 * 240) { - rdc->rd_mult_inter_qp_fac = 0.998; - rdc->rd_mult_key_qp_fac = 0.952; - } else if (screen_area <= 640 * 360) { - rdc->rd_mult_inter_qp_fac = 0.959; - rdc->rd_mult_key_qp_fac = 1.071; - } else if (screen_area <= 854 * 480) { - rdc->rd_mult_inter_qp_fac = 1.027; - rdc->rd_mult_key_qp_fac = 1.280; - } else if (screen_area <= 1280 * 720) { - rdc->rd_mult_inter_qp_fac = 1.004; - rdc->rd_mult_key_qp_fac = 1.193; - } else { - rdc->rd_mult_inter_qp_fac = 0.874; - rdc->rd_mult_key_qp_fac = 0.837; - } - } else { - // For now force defaults unless testing - rdc->rd_mult_inter_qp_fac = 1.0; - rdc->rd_mult_key_qp_fac = 1.0; - } + rdc->rd_mult_inter_qp_fac = 1.0; + rdc->rd_mult_arf_qp_fac = 1.0; + rdc->rd_mult_key_qp_fac = 1.0; } // Returns the default rd multiplier for inter frames for a given qindex. 
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index e35b6f1e24..c2ca215f10 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -362,6 +362,9 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(cfg, gf_max_total_boost.den, 1, 1000); RANGE_CHECK(cfg, gf_frame_max_boost.den, 1, 1000); RANGE_CHECK(cfg, zm_factor.den, 1, 1000); + RANGE_CHECK(cfg, rd_mult_inter_qp_fac.den, 1, 1000); + RANGE_CHECK(cfg, rd_mult_arf_qp_fac.den, 1, 1000); + RANGE_CHECK(cfg, rd_mult_key_qp_fac.den, 1, 1000); return VPX_CODEC_OK; } @@ -686,6 +689,12 @@ static vpx_codec_err_t set_twopass_params_from_config( (double)cfg->gf_frame_max_boost.num / (double)cfg->gf_frame_max_boost.den; cpi->twopass.zm_factor = (double)cfg->zm_factor.num / (double)cfg->zm_factor.den; + cpi->rd_ctrl.rd_mult_inter_qp_fac = (double)cfg->rd_mult_inter_qp_fac.num / + (double)cfg->rd_mult_inter_qp_fac.den; + cpi->rd_ctrl.rd_mult_arf_qp_fac = + (double)cfg->rd_mult_arf_qp_fac.num / (double)cfg->rd_mult_arf_qp_fac.den; + cpi->rd_ctrl.rd_mult_key_qp_fac = + (double)cfg->rd_mult_key_qp_fac.num / (double)cfg->rd_mult_key_qp_fac.den; return VPX_CODEC_OK; } @@ -1959,6 +1968,9 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { { 0, 1 }, // gf_max_total_boost { 0, 1 }, // gf_frame_max_boost { 0, 1 }, // zm_factor + { 1, 1 }, // rd_mult_inter_qp_fac + { 1, 1 }, // rd_mult_arf_qp_fac + { 1, 1 }, // rd_mult_key_qp_fac } }, }; diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index 3c9304b37a..255cb693ee 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -807,6 +807,36 @@ typedef struct vpx_codec_enc_cfg { * */ vpx_rational_t zm_factor; + + /*!\brief Rate-distortion multiplier for inter frames. + * The multiplier is a crucial parameter in the calculation of rate distortion + * cost. It is often related to the qp (qindex) value. + * Rate control parameters, could be set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t rd_mult_inter_qp_fac; + + /*!\brief Rate-distortion multiplier for alt-ref frames. + * The multiplier is a crucial parameter in the calculation of rate distortion + * cost. It is often related to the qp (qindex) value. + * Rate control parameters, could be set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t rd_mult_arf_qp_fac; + + /*!\brief Rate-distortion multiplier for key frames. + * The multiplier is a crucial parameter in the calculation of rate distortion + * cost. It is often related to the qp (qindex) value. + * Rate control parameters, could be set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. 
+ * + */ + vpx_rational_t rd_mult_key_qp_fac;
} vpx_codec_enc_cfg_t; /**< alias for struct vpx_codec_enc_cfg */ /*!\brief vp9 svc extra configure parameters
diff --git a/vpxenc.c b/vpxenc.c index c9e386a06b..1a2b4a9214 100644 --- a/vpxenc.c +++ b/vpxenc.c
@@ -314,6 +314,12 @@ static const arg_def_t gf_frame_max_boost = ARG_DEF(NULL, "gf-frame-max-boost", 1, "Golden frame max boost"); static const arg_def_t zm_factor = ARG_DEF(NULL, "zm-factor", 1, "Zero motion power factor");
+static const arg_def_t rd_mult_inter_qp_fac = + ARG_DEF(NULL, "rd-mult-inter-qp-fac", 1, "RD multiplier for inter frames"); +static const arg_def_t rd_mult_arf_qp_fac = + ARG_DEF(NULL, "rd-mult-arf-qp-fac", 1, "RD multiplier for alt-ref frames"); +static const arg_def_t rd_mult_key_qp_fac = + ARG_DEF(NULL, "rd-mult-key-qp-fac", 1, "RD multiplier for key frames");
static const arg_def_t *vizier_rc_args[] = { &use_vizier_rc_params, &active_wq_factor, &base_err_per_mb,
@@ -327,6 +333,9 @@ static const arg_def_t *vizier_rc_args[] = { &use_vizier_rc_params, &gf_max_total_boost, &gf_frame_max_boost, &zm_factor,
+ &rd_mult_inter_qp_fac, + &rd_mult_arf_qp_fac, + &rd_mult_key_qp_fac,
NULL }; #endif
@@ -1055,6 +1064,12 @@ static int parse_stream_params(struct VpxEncoderConfig *global, config->cfg.gf_frame_max_boost = arg_parse_rational(&arg); } else if (arg_match(&arg, &zm_factor, argi)) { config->cfg.zm_factor = arg_parse_rational(&arg);
+ } else if (arg_match(&arg, &rd_mult_inter_qp_fac, argi)) { + config->cfg.rd_mult_inter_qp_fac = arg_parse_rational(&arg); + } else if (arg_match(&arg, &rd_mult_arf_qp_fac, argi)) { + config->cfg.rd_mult_arf_qp_fac = arg_parse_rational(&arg); + } else if (arg_match(&arg, &rd_mult_key_qp_fac, argi)) { + config->cfg.rd_mult_key_qp_fac = arg_parse_rational(&arg);
#endif #if CONFIG_VP9_HIGHBITDEPTH } else if (arg_match(&arg, &test16bitinternalarg, argi)) {
From c911c2d9c51698548c4529a4c35fb495ea95c435 Mon Sep 17 00:00:00 2001
From: Paul Wilkins
Date: Tue, 20 Apr 2021 17:26:22 +0100
Subject: [PATCH 079/926] Further normalization of Vizier parameters.

Further changes to normalize the Vizier command line parameters. The intent is that the default behavior for any given parameter is signaled by the value 1.0 (expressed on the command line as a rational). The final values used in the two pass code are obtained by multiplying the passed-in factors by default values when use_vizier_rc_params is 1. Where use_vizier_rc_params is 0, the values are explicitly set to the defaults. This patch also changes the default value of each parameter to 1.0 even if not set explicitly. This should ensure safe/default behavior if the user sets use_vizier_rc_params to 1 but does not set all the individual parameters.

Change-Id: Ied08b3c22df18f42f446a4cc9363473cad097f69
--- vp8/vp8_cx_iface.c | 38 +++++++-------- vp9/encoder/vp9_firstpass.c | 85 +++++++++++++++++++++------------ vp9/encoder/vp9_firstpass.h | 6 +-- vp9/vp9_cx_iface.c | 74 +++++++++++++++-------------- vpx/vpx_encoder.h | 33 ++++++------- vpxenc.c | 94 ++++++++++++++++++++----------------- 6 files changed, 182 insertions(+), 148 deletions(-)
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 872710f138..6e12e38989 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c
@@ -260,15 +260,15 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, // The range below shall be further tuned.
RANGE_CHECK(cfg, use_vizier_rc_params, 0, 1); RANGE_CHECK(cfg, active_wq_factor.den, 1, 1000); - RANGE_CHECK(cfg, base_err_per_mb.den, 1, 1000); + RANGE_CHECK(cfg, err_per_mb_factor.den, 1, 1000); RANGE_CHECK(cfg, sr_default_decay_limit.den, 1, 1000); RANGE_CHECK(cfg, sr_diff_factor.den, 1, 1000); - RANGE_CHECK(cfg, kf_err_per_mb.den, 1, 1000); - RANGE_CHECK(cfg, kf_frame_min_boost.den, 1, 1000); - RANGE_CHECK(cfg, kf_frame_max_boost_subs.den, 1, 1000); - RANGE_CHECK(cfg, kf_max_total_boost.den, 1, 1000); - RANGE_CHECK(cfg, gf_max_total_boost.den, 1, 1000); - RANGE_CHECK(cfg, gf_frame_max_boost.den, 1, 1000); + RANGE_CHECK(cfg, kf_err_per_mb_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_frame_min_boost_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_frame_max_boost_subs_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_max_total_boost_factor.den, 1, 1000); + RANGE_CHECK(cfg, gf_max_total_boost_factor.den, 1, 1000); + RANGE_CHECK(cfg, gf_frame_max_boost_factor.den, 1, 1000); RANGE_CHECK(cfg, zm_factor.den, 1, 1000); RANGE_CHECK(cfg, rd_mult_inter_qp_fac.den, 1, 1000); RANGE_CHECK(cfg, rd_mult_arf_qp_fac.den, 1, 1000); @@ -1307,18 +1307,18 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = { { 0 }, /* layer_target_bitrate */ 0, /* temporal_layering_mode */ 0, /* use_vizier_rc_params */ - { 0, 1 }, /* active_wq_factor */ - { 0, 1 }, /* base_err_per_mb */ - { 0, 1 }, /* sr_default_decay_limit */ - { 0, 1 }, /* sr_diff_factor */ - { 0, 1 }, /* kf_err_per_mb */ - { 0, 1 }, /* kf_frame_min_boost */ - { 0, 1 }, /* kf_frame_max_boost_first */ - { 0, 1 }, /* kf_frame_max_boost_subs */ - { 0, 1 }, /* kf_max_total_boost */ - { 0, 1 }, /* gf_max_total_boost */ - { 0, 1 }, /* gf_frame_max_boost */ - { 0, 1 }, /* zm_factor */ + { 1, 1 }, /* active_wq_factor */ + { 1, 1 }, /* err_per_mb_factor */ + { 1, 1 }, /* sr_default_decay_limit */ + { 1, 1 }, /* sr_diff_factor */ + { 1, 1 }, /* kf_err_per_mb_factor */ + { 1, 1 }, /* kf_frame_min_boost_factor */ + { 1, 1 }, /* kf_frame_max_boost_first_factor */ + { 1, 1 }, /* kf_frame_max_boost_subs_factor */ + { 1, 1 }, /* kf_max_total_boost_factor */ + { 1, 1 }, /* gf_max_total_boost_factor */ + { 1, 1 }, /* gf_frame_max_boost_factor */ + { 1, 1 }, /* zm_factor */ { 1, 1 }, /* rd_mult_inter_qp_fac */ { 1, 1 }, /* rd_mult_arf_qp_fac */ { 1, 1 }, /* rd_mult_key_qp_fac */ diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 76ef11909c..a4717ad036 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -1878,8 +1878,7 @@ static double get_prediction_decay_rate(const TWO_PASS *const twopass, const FIRSTPASS_STATS *frame_stats) { const double sr_decay_rate = get_sr_decay_rate(twopass, frame_stats); double zero_motion_factor = - twopass->zm_factor * DEFAULT_ZM_FACTOR * - (frame_stats->pcnt_inter - frame_stats->pcnt_motion); + twopass->zm_factor * (frame_stats->pcnt_inter - frame_stats->pcnt_motion); // Clamp value to range 0.0 to 1.0 // This should happen anyway if input values are sensibly clamped but checked @@ -1981,7 +1980,7 @@ static double calc_frame_boost(const FRAME_INFO *frame_info, const double active_area = calculate_active_area(frame_info, this_frame); // Underlying boost factor is based on inter error ratio. 
- frame_boost = (twopass->base_err_per_mb * active_area) / + frame_boost = (twopass->err_per_mb * active_area) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error); // Small adjustment for cases where there is a zoom out @@ -2027,7 +2026,7 @@ static double calc_kf_frame_boost(VP9_COMP *cpi, // This value is in line with the minimum per frame boost in the alt_ref // boost calculation. frame_boost = - ((frame_boost + twopass->kf_frame_min_boost) * boost_q_correction); + (frame_boost + twopass->kf_frame_min_boost) * boost_q_correction; // Maximum allowed boost this frame. May be different for first vs subsequent // key frames. @@ -2861,7 +2860,7 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { const int arf_boost = compute_arf_boost(frame_info, twopass, gld_show_idx, f_frames, b_frames, avg_inter_frame_qindex); - rc->gfu_boost = VPXMIN(twopass->gf_max_total_boost, arf_boost); + rc->gfu_boost = VPXMIN((int)twopass->gf_max_total_boost, arf_boost); rc->source_alt_ref_pending = 0; } @@ -3429,12 +3428,12 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) { // Special case for static / slide show content but dont apply // if the kf group is very short. if ((zero_motion_accumulator > 0.99) && (rc->frames_to_key > 8)) { - rc->kf_boost = twopass->kf_max_total_boost; + rc->kf_boost = (int)(twopass->kf_max_total_boost); } else { // Apply various clamps for min and max oost rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3)); rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST); - rc->kf_boost = VPXMIN(rc->kf_boost, twopass->kf_max_total_boost); + rc->kf_boost = VPXMIN(rc->kf_boost, (int)(twopass->kf_max_total_boost)); } // Work out how many bits to allocate for the key frame itself. @@ -3492,32 +3491,56 @@ static int is_skippable_frame(const VP9_COMP *cpi) { // Configure image size specific vizier parameters. // Later these will be set via additional command line options static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { - // When |use_vizier_rc_params| is 1, we expect the rc parameters have been - // initialized by the pass in values. - // Be careful that parameters below are only initialized to 0, if we do not - // pass values to them. It is desired to take care of each parameter when - // using |use_vizier_rc_params|. - if (twopass->use_vizier_rc_params) return; - - // When |use_vizier_rc_params| is 0, use defaults for now. 
- twopass->active_wq_factor = AV_WQ_FACTOR; - twopass->base_err_per_mb = BASELINE_ERR_PER_MB; - twopass->sr_default_decay_limit = DEFAULT_DECAY_LIMIT; - twopass->sr_diff_factor = 1.0; - twopass->gf_frame_max_boost = GF_MAX_FRAME_BOOST; - twopass->gf_max_total_boost = MAX_GF_BOOST; - if (screen_area < 1280 * 720) { - twopass->kf_err_per_mb = 2000.0; - } else if (screen_area < 1920 * 1080) { - twopass->kf_err_per_mb = 500.0; + // When |use_vizier_rc_params| is 1, we expect the rc parameters below to + // have been initialised on the command line as adjustment factors such + // that a factor of 1.0 will match the default behavior when + // |use_vizier_rc_params| is 0 + if (twopass->use_vizier_rc_params) { + twopass->active_wq_factor *= AV_WQ_FACTOR; + twopass->err_per_mb *= BASELINE_ERR_PER_MB; + twopass->sr_default_decay_limit *= DEFAULT_DECAY_LIMIT; + twopass->sr_diff_factor *= 1.0; + twopass->gf_frame_max_boost *= GF_MAX_FRAME_BOOST; + twopass->gf_max_total_boost *= MAX_GF_BOOST; + twopass->kf_frame_min_boost *= KF_MIN_FRAME_BOOST; + twopass->kf_frame_max_boost_first *= KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_subs *= KF_MAX_FRAME_BOOST; + twopass->kf_max_total_boost *= MAX_KF_TOT_BOOST; + twopass->zm_factor *= DEFAULT_ZM_FACTOR; + + // Correction for the fact that the kf_err_per_mb_factor default is + // already different for different video formats and ensures that a passed + // in value of 1.0 on the vizier command line will still match the current + // default. + if (screen_area < 1280 * 720) { + twopass->kf_err_per_mb *= 2000.0; + } else if (screen_area < 1920 * 1080) { + twopass->kf_err_per_mb *= 500.0; + } else { + twopass->kf_err_per_mb *= 250.0; + } } else { - twopass->kf_err_per_mb = 250.0; + // When |use_vizier_rc_params| is 0, use defaults. + twopass->active_wq_factor = AV_WQ_FACTOR; + twopass->err_per_mb = BASELINE_ERR_PER_MB; + twopass->sr_default_decay_limit = DEFAULT_DECAY_LIMIT; + twopass->sr_diff_factor = 1.0; + twopass->gf_frame_max_boost = GF_MAX_FRAME_BOOST; + twopass->gf_max_total_boost = MAX_GF_BOOST; + twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; + twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_subs = KF_MAX_FRAME_BOOST; + twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; + twopass->zm_factor = DEFAULT_ZM_FACTOR; + + if (screen_area < 1280 * 720) { + twopass->kf_err_per_mb = 2000.0; + } else if (screen_area < 1920 * 1080) { + twopass->kf_err_per_mb = 500.0; + } else { + twopass->kf_err_per_mb = 250.0; + } } - twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; - twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; - twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; - twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_factor = 1.0; } void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index 1418b67a48..cdbcb52412 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -228,15 +228,15 @@ typedef struct { // by default values. int use_vizier_rc_params; double active_wq_factor; - double base_err_per_mb; + double err_per_mb; double sr_default_decay_limit; double sr_diff_factor; double kf_err_per_mb; double kf_frame_min_boost; double kf_frame_max_boost_first; // Max for first kf in a chunk. double kf_frame_max_boost_subs; // Max for subsequent mid chunk kfs. 
- int kf_max_total_boost; - int gf_max_total_boost; + double kf_max_total_boost; + double gf_max_total_boost; double gf_frame_max_boost; double zm_factor; } TWO_PASS; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index c2ca215f10..c700620ef3 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -352,15 +352,15 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, // The range below shall be further tuned. RANGE_CHECK(cfg, use_vizier_rc_params, 0, 1); RANGE_CHECK(cfg, active_wq_factor.den, 1, 1000); - RANGE_CHECK(cfg, base_err_per_mb.den, 1, 1000); + RANGE_CHECK(cfg, err_per_mb_factor.den, 1, 1000); RANGE_CHECK(cfg, sr_default_decay_limit.den, 1, 1000); RANGE_CHECK(cfg, sr_diff_factor.den, 1, 1000); - RANGE_CHECK(cfg, kf_err_per_mb.den, 1, 1000); - RANGE_CHECK(cfg, kf_frame_min_boost.den, 1, 1000); - RANGE_CHECK(cfg, kf_frame_max_boost_subs.den, 1, 1000); - RANGE_CHECK(cfg, kf_max_total_boost.den, 1, 1000); - RANGE_CHECK(cfg, gf_max_total_boost.den, 1, 1000); - RANGE_CHECK(cfg, gf_frame_max_boost.den, 1, 1000); + RANGE_CHECK(cfg, kf_err_per_mb_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_frame_min_boost_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_frame_max_boost_subs_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_max_total_boost_factor.den, 1, 1000); + RANGE_CHECK(cfg, gf_max_total_boost_factor.den, 1, 1000); + RANGE_CHECK(cfg, gf_frame_max_boost_factor.den, 1, 1000); RANGE_CHECK(cfg, zm_factor.den, 1, 1000); RANGE_CHECK(cfg, rd_mult_inter_qp_fac.den, 1, 1000); RANGE_CHECK(cfg, rd_mult_arf_qp_fac.den, 1, 1000); @@ -662,31 +662,35 @@ static vpx_codec_err_t set_twopass_params_from_config( if (cpi == NULL) return VPX_CODEC_ERROR; cpi->twopass.use_vizier_rc_params = cfg->use_vizier_rc_params; + + // The values set here are factors that will be applied to default values + // to get the final value used in the two pass code. 1.0 will hence + // match the default behaviour when not using passed in values. 
cpi->twopass.active_wq_factor = (double)cfg->active_wq_factor.num / (double)cfg->active_wq_factor.den; - cpi->twopass.base_err_per_mb = - (double)cfg->base_err_per_mb.num / (double)cfg->base_err_per_mb.den; + cpi->twopass.err_per_mb = + (double)cfg->err_per_mb_factor.num / (double)cfg->err_per_mb_factor.den; cpi->twopass.sr_default_decay_limit = (double)cfg->sr_default_decay_limit.num / (double)cfg->sr_default_decay_limit.den; cpi->twopass.sr_diff_factor = (double)cfg->sr_diff_factor.num / (double)cfg->sr_diff_factor.den; - cpi->twopass.kf_err_per_mb = - (double)cfg->kf_err_per_mb.num / (double)cfg->kf_err_per_mb.den; - cpi->twopass.kf_frame_min_boost = - (double)cfg->kf_frame_min_boost.num / (double)cfg->kf_frame_min_boost.den; + cpi->twopass.kf_err_per_mb = (double)cfg->kf_err_per_mb_factor.num / + (double)cfg->kf_err_per_mb_factor.den; + cpi->twopass.kf_frame_min_boost = (double)cfg->kf_frame_min_boost_factor.num / + (double)cfg->kf_frame_min_boost_factor.den; cpi->twopass.kf_frame_max_boost_first = - (double)cfg->kf_frame_max_boost_first.num / - (double)cfg->kf_frame_max_boost_first.den; + (double)cfg->kf_frame_max_boost_first_factor.num / + (double)cfg->kf_frame_max_boost_first_factor.den; cpi->twopass.kf_frame_max_boost_subs = - (double)cfg->kf_frame_max_boost_subs.num / - (double)cfg->kf_frame_max_boost_subs.den; - cpi->twopass.kf_max_total_boost = (int)((double)cfg->kf_max_total_boost.num / - (double)cfg->kf_max_total_boost.den); - cpi->twopass.gf_max_total_boost = (int)((double)cfg->gf_max_total_boost.num / - (double)cfg->gf_max_total_boost.den); - cpi->twopass.gf_frame_max_boost = - (double)cfg->gf_frame_max_boost.num / (double)cfg->gf_frame_max_boost.den; + (double)cfg->kf_frame_max_boost_subs_factor.num / + (double)cfg->kf_frame_max_boost_subs_factor.den; + cpi->twopass.kf_max_total_boost = (double)cfg->kf_max_total_boost_factor.num / + (double)cfg->kf_max_total_boost_factor.den; + cpi->twopass.gf_max_total_boost = (double)cfg->gf_max_total_boost_factor.num / + (double)cfg->gf_max_total_boost_factor.den; + cpi->twopass.gf_frame_max_boost = (double)cfg->gf_frame_max_boost_factor.num / + (double)cfg->gf_frame_max_boost_factor.den; cpi->twopass.zm_factor = (double)cfg->zm_factor.num / (double)cfg->zm_factor.den; cpi->rd_ctrl.rd_mult_inter_qp_fac = (double)cfg->rd_mult_inter_qp_fac.num / @@ -1956,18 +1960,18 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { { 0 }, // layer_taget_bitrate 0, // temporal_layering_mode 0, // use_vizier_rc_params - { 0, 1 }, // active_wq_factor - { 0, 1 }, // base_err_per_mb - { 0, 1 }, // sr_default_decay_limit - { 0, 1 }, // sr_diff_factor - { 0, 1 }, // kf_err_per_mb - { 0, 1 }, // kf_frame_min_boost - { 0, 1 }, // kf_frame_max_boost_first - { 0, 1 }, // kf_frame_max_boost_subs - { 0, 1 }, // kf_max_total_boost - { 0, 1 }, // gf_max_total_boost - { 0, 1 }, // gf_frame_max_boost - { 0, 1 }, // zm_factor + { 1, 1 }, // active_wq_factor + { 1, 1 }, // err_per_mb_factor + { 1, 1 }, // sr_default_decay_limit + { 1, 1 }, // sr_diff_factor + { 1, 1 }, // kf_err_per_mb_factor + { 1, 1 }, // kf_frame_min_boost_factor + { 1, 1 }, // kf_frame_max_boost_first_factor + { 1, 1 }, // kf_frame_max_boost_subs_factor + { 1, 1 }, // kf_max_total_boost_factor + { 1, 1 }, // gf_max_total_boost_factor + { 1, 1 }, // gf_frame_max_boost_factor + { 1, 1 }, // zm_factor { 1, 1 }, // rd_mult_inter_qp_fac { 1, 1 }, // rd_mult_arf_qp_fac { 1, 1 }, // rd_mult_key_qp_fac diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index 255cb693ee..d62d9ee7e1 100644 --- 
a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -709,14 +709,14 @@ typedef struct vpx_codec_enc_cfg { */ vpx_rational_t active_wq_factor; - /*!\brief Base error per macroblock. + /*!\brief Error per macroblock adjustment factor. * * Rate control parameters, set from external experiment results. * Only when |use_vizier_rc_params| is set to 1, the pass in value will be * used. Otherwise, the default value is used. * */ - vpx_rational_t base_err_per_mb; + vpx_rational_t err_per_mb_factor; /*!\brief Second reference default decay limit. * @@ -736,68 +736,69 @@ typedef struct vpx_codec_enc_cfg { */ vpx_rational_t sr_diff_factor; - /*!\brief Keyframe error per macroblock. + /*!\brief Keyframe error per macroblock adjustment factor. * * Rate control parameters, set from external experiment results. * Only when |use_vizier_rc_params| is set to 1, the pass in value will be * used. Otherwise, the default value is used. * */ - vpx_rational_t kf_err_per_mb; + vpx_rational_t kf_err_per_mb_factor; - /*!\brief Keyframe minimum boost. + /*!\brief Keyframe minimum boost adjustment factor. * * Rate control parameters, set from external experiment results. * Only when |use_vizier_rc_params| is set to 1, the pass in value will be * used. Otherwise, the default value is used. * */ - vpx_rational_t kf_frame_min_boost; + vpx_rational_t kf_frame_min_boost_factor; - /*!\brief Keyframe maximum boost, for the first keyframe in a chunk. + /*!\brief Keyframe maximum boost adjustment factor, for the first keyframe + * in a chunk. * * Rate control parameters, set from external experiment results. * Only when |use_vizier_rc_params| is set to 1, the pass in value will be * used. Otherwise, the default value is used. * */ - vpx_rational_t kf_frame_max_boost_first; + vpx_rational_t kf_frame_max_boost_first_factor; - /*!\brief Keyframe maximum boost, for subsequent keyframes. + /*!\brief Keyframe maximum boost adjustment factor, for subsequent keyframes. * * Rate control parameters, set from external experiment results. * Only when |use_vizier_rc_params| is set to 1, the pass in value will be * used. Otherwise, the default value is used. * */ - vpx_rational_t kf_frame_max_boost_subs; + vpx_rational_t kf_frame_max_boost_subs_factor; - /*!\brief Keyframe maximum total boost. + /*!\brief Keyframe maximum total boost adjustment factor. * * Rate control parameters, set from external experiment results. * Only when |use_vizier_rc_params| is set to 1, the pass in value will be * used. Otherwise, the default value is used. * */ - vpx_rational_t kf_max_total_boost; + vpx_rational_t kf_max_total_boost_factor; - /*!\brief Golden frame maximum total boost. + /*!\brief Golden frame maximum total boost adjustment factor. * * Rate control parameters, set from external experiment results. * Only when |use_vizier_rc_params| is set to 1, the pass in value will be * used. Otherwise, the default value is used. * */ - vpx_rational_t gf_max_total_boost; + vpx_rational_t gf_max_total_boost_factor; - /*!\brief Golden frame maximum boost. + /*!\brief Golden frame maximum boost adjustment factor. * * Rate control parameters, set from external experiment results. * Only when |use_vizier_rc_params| is set to 1, the pass in value will be * used. Otherwise, the default value is used. * */ - vpx_rational_t gf_frame_max_boost; + vpx_rational_t gf_frame_max_boost_factor; /*!\brief Zero motion power factor. 
* diff --git a/vpxenc.c b/vpxenc.c index 1a2b4a9214..276ee9b902 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -292,46 +292,52 @@ static const arg_def_t use_vizier_rc_params = ARG_DEF(NULL, "use-vizier-rc-params", 1, "Use vizier rc params"); static const arg_def_t active_wq_factor = ARG_DEF(NULL, "active-wq-factor", 1, "Active worst quality factor"); -static const arg_def_t base_err_per_mb = - ARG_DEF(NULL, "base-err-per-mb", 1, "Base error per macroblock"); +static const arg_def_t err_per_mb_factor = + ARG_DEF(NULL, "err-per-mb-factor", 1, "Error per macroblock factor"); static const arg_def_t sr_default_decay_limit = ARG_DEF( NULL, "sr-default-decay-limit", 1, "Second reference default decay limit"); static const arg_def_t sr_diff_factor = ARG_DEF(NULL, "sr-diff-factor", 1, "Second reference diff factor"); -static const arg_def_t kf_err_per_mb = - ARG_DEF(NULL, "kf-err-per-mb", 1, "Keyframe error per macroblock"); -static const arg_def_t kf_frame_min_boost = - ARG_DEF(NULL, "kf-frame-min-boost", 1, "Keyframe min boost"); -static const arg_def_t kf_frame_max_boost_first = ARG_DEF( - NULL, "kf-frame-max-boost-first", 1, "Max for the first keyframe boost"); -static const arg_def_t kf_frame_max_boost_subs = ARG_DEF( - NULL, "kf-frame-max-boost-subs", 1, "Max for subsequent keyframe boost"); -static const arg_def_t kf_max_total_boost = - ARG_DEF(NULL, "kf-max-total-boost", 1, "Keyframe max total boost"); -static const arg_def_t gf_max_total_boost = - ARG_DEF(NULL, "gf-max-total-boost", 1, "Golden frame max total boost"); -static const arg_def_t gf_frame_max_boost = - ARG_DEF(NULL, "gf-frame-max-boost", 1, "Golden frame max boost"); +static const arg_def_t kf_err_per_mb_factor = ARG_DEF( + NULL, "kf-err-per-mb-factor", 1, "Keyframe error per macroblock factor"); +static const arg_def_t kf_frame_min_boost_factor = + ARG_DEF(NULL, "kf-frame-min-boost-factor", 1, "Keyframe min boost"); +static const arg_def_t kf_frame_max_boost_first_factor = + ARG_DEF(NULL, "kf-frame-max-boost-first-factor", 1, + "Max keyframe boost adjustment factor for first frame"); +static const arg_def_t kf_frame_max_boost_subs_factor = + ARG_DEF(NULL, "kf-frame-max-boost-subs-factor", 1, + "Max boost adjustment factor for subsequent KFs"); +static const arg_def_t kf_max_total_boost_factor = ARG_DEF( + NULL, "kf-max-total-boost-factor", 1, "Keyframe max total boost factor"); +static const arg_def_t gf_max_total_boost_factor = + ARG_DEF(NULL, "gf-max-total-boost-factor", 1, + "Golden frame max total boost factor"); +static const arg_def_t gf_frame_max_boost_factor = + ARG_DEF(NULL, "gf-frame-max-boost-factor", 1, + "Golden frame max per frame boost factor"); static const arg_def_t zm_factor = ARG_DEF(NULL, "zm-factor", 1, "Zero motion power factor"); static const arg_def_t rd_mult_inter_qp_fac = - ARG_DEF(NULL, "rd-mult-inter-qp-fac", 1, "RD multiplier for inter frames"); + ARG_DEF(NULL, "rd-mult-inter-qp-fac", 1, + "RD multiplier adjustment for inter frames"); static const arg_def_t rd_mult_arf_qp_fac = - ARG_DEF(NULL, "rd-mult-arf-qp-fac", 1, "RD multiplier for alt-ref frames"); -static const arg_def_t rd_mult_key_qp_fac = - ARG_DEF(NULL, "rd-mult-key-qp-fac", 1, "RD multiplier for key frames"); + ARG_DEF(NULL, "rd-mult-arf-qp-fac", 1, + "RD multiplier adjustment for alt-ref frames"); +static const arg_def_t rd_mult_key_qp_fac = ARG_DEF( + NULL, "rd-mult-key-qp-fac", 1, "RD multiplier adjustment for key frames"); static const arg_def_t *vizier_rc_args[] = { &use_vizier_rc_params, &active_wq_factor, - &base_err_per_mb, + 
&err_per_mb_factor, &sr_default_decay_limit, &sr_diff_factor, - &kf_err_per_mb, - &kf_frame_min_boost, - &kf_frame_max_boost_first, - &kf_frame_max_boost_subs, - &kf_max_total_boost, - &gf_max_total_boost, - &gf_frame_max_boost, + &kf_err_per_mb_factor, + &kf_frame_min_boost_factor, + &kf_frame_max_boost_first_factor, + &kf_frame_max_boost_subs_factor, + &kf_max_total_boost_factor, + &gf_max_total_boost_factor, + &gf_frame_max_boost_factor, &zm_factor, &rd_mult_inter_qp_fac, &rd_mult_arf_qp_fac, @@ -1042,26 +1048,26 @@ static int parse_stream_params(struct VpxEncoderConfig *global, config->cfg.use_vizier_rc_params = arg_parse_int(&arg); } else if (arg_match(&arg, &active_wq_factor, argi)) { config->cfg.active_wq_factor = arg_parse_rational(&arg); - } else if (arg_match(&arg, &base_err_per_mb, argi)) { - config->cfg.base_err_per_mb = arg_parse_rational(&arg); + } else if (arg_match(&arg, &err_per_mb_factor, argi)) { + config->cfg.err_per_mb_factor = arg_parse_rational(&arg); } else if (arg_match(&arg, &sr_default_decay_limit, argi)) { config->cfg.sr_default_decay_limit = arg_parse_rational(&arg); } else if (arg_match(&arg, &sr_diff_factor, argi)) { config->cfg.sr_diff_factor = arg_parse_rational(&arg); - } else if (arg_match(&arg, &kf_err_per_mb, argi)) { - config->cfg.kf_err_per_mb = arg_parse_rational(&arg); - } else if (arg_match(&arg, &kf_frame_min_boost, argi)) { - config->cfg.kf_frame_min_boost = arg_parse_rational(&arg); - } else if (arg_match(&arg, &kf_frame_max_boost_first, argi)) { - config->cfg.kf_frame_max_boost_first = arg_parse_rational(&arg); - } else if (arg_match(&arg, &kf_frame_max_boost_subs, argi)) { - config->cfg.kf_frame_max_boost_subs = arg_parse_rational(&arg); - } else if (arg_match(&arg, &kf_max_total_boost, argi)) { - config->cfg.kf_max_total_boost = arg_parse_rational(&arg); - } else if (arg_match(&arg, &gf_max_total_boost, argi)) { - config->cfg.gf_max_total_boost = arg_parse_rational(&arg); - } else if (arg_match(&arg, &gf_frame_max_boost, argi)) { - config->cfg.gf_frame_max_boost = arg_parse_rational(&arg); + } else if (arg_match(&arg, &kf_err_per_mb_factor, argi)) { + config->cfg.kf_err_per_mb_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &kf_frame_min_boost_factor, argi)) { + config->cfg.kf_frame_min_boost_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &kf_frame_max_boost_first_factor, argi)) { + config->cfg.kf_frame_max_boost_first_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &kf_frame_max_boost_subs_factor, argi)) { + config->cfg.kf_frame_max_boost_subs_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &kf_max_total_boost_factor, argi)) { + config->cfg.kf_max_total_boost_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &gf_max_total_boost_factor, argi)) { + config->cfg.gf_max_total_boost_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &gf_frame_max_boost_factor, argi)) { + config->cfg.gf_frame_max_boost_factor = arg_parse_rational(&arg); } else if (arg_match(&arg, &zm_factor, argi)) { config->cfg.zm_factor = arg_parse_rational(&arg); } else if (arg_match(&arg, &rd_mult_inter_qp_fac, argi)) { From adc185feb7b7aabd6407cc41ad1f3c1e9c1e8b17 Mon Sep 17 00:00:00 2001 From: Sreerenj Balachandran Date: Wed, 21 Apr 2021 11:34:03 -0700 Subject: [PATCH 080/926] vp8: enc: Fix valid range for under/over_shoot pct The overshoot_pct & undershoot_pct attributes for rate control are expressed as a percentage of the target bitrate, so the range should be 0-100. 
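For reference, RANGE_CHECK_HI in vp8_cx_iface.c is essentially the following (a sketch of the macro as used here; on failure validate_config() records the offending field and returns VPX_CODEC_INVALID_PARAM):

  #define RANGE_CHECK_HI(p, memb, hi)                                      \
    do {                                                                   \
      if (!((p)->memb <= (hi))) ERROR(#memb " out of range [.." #hi "]");  \
    } while (0)

So after this change, a configuration with rc_undershoot_pct or rc_overshoot_pct above 100 is rejected when the encoder is initialized or reconfigured.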
Change-Id: I67af3c8be7ab814c711c2eaf30786f1e2fa4f5a3
---
 vp8/vp8_cx_iface.c | 4 ++--
 vpx/vpx_encoder.h  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index 872710f138..06ee5d7dfb 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -152,8 +152,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
   RANGE_CHECK_HI(cfg, g_lag_in_frames, 25);
 #endif
   RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_Q);
-  RANGE_CHECK_HI(cfg, rc_undershoot_pct, 1000);
-  RANGE_CHECK_HI(cfg, rc_overshoot_pct, 1000);
+  RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100);
+  RANGE_CHECK_HI(cfg, rc_overshoot_pct, 100);
   RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
   RANGE_CHECK(cfg, kf_mode, VPX_KF_DISABLED, VPX_KF_AUTO);
diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h
index 255cb693ee..1682931a7f 100644
--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h
@@ -498,7 +498,7 @@ typedef struct vpx_codec_enc_cfg {
    * undershoot level (current rate vs target) beyond which more aggressive
    * corrective measures are taken.
    *   *
-   * Valid values in the range VP8:0-1000 VP9: 0-100.
+   * Valid values in the range VP8:0-100 VP9: 0-100.
    */
   unsigned int rc_undershoot_pct;
@@ -513,7 +513,7 @@ typedef struct vpx_codec_enc_cfg {
    * overshoot level (current rate vs target) beyond which more aggressive
    * corrective measures are taken.
    *
-   * Valid values in the range VP8:0-1000 VP9: 0-100.
+   * Valid values in the range VP8:0-100 VP9: 0-100.
    */
   unsigned int rc_overshoot_pct;

From c15555c62fc35872a4ff803c4d57ef5d2f4f81d9 Mon Sep 17 00:00:00 2001
From: James Zern
Date: Fri, 23 Apr 2021 16:50:48 -0700
Subject: [PATCH 081/926] sync CONTRIBUTING.md w/libwebm

Change-Id: I63ffea52d079b0d50002526e209ae3fb64811bac
---
 CONTRIBUTING.md | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 577c96a6b5..7a73a30317 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -19,10 +19,9 @@ again.

 All submissions, including submissions by project members, require review. We
 use a [Gerrit](https://www.gerritcodereview.com) instance hosted at
-https://chromium-review.googlesource.com for this purpose.
-
-See https://www.webmproject.org/code/contribute/submitting-patches for an
-example of a typical gerrit workflow.
+https://chromium-review.googlesource.com for this purpose. See the
+[WebM Project page](https://www.webmproject.org/code/contribute/submitting-patches/)
+for additional details.

 ## Community Guidelines

From 9f57bc4d6c7a577538042a49ede8ee98dc8cc300 Mon Sep 17 00:00:00 2001
From: Paul Wilkins
Date: Mon, 26 Apr 2021 15:06:54 +0100
Subject: [PATCH 082/926] Add limits to Vizier input parameters.

Imposed provisional upper and lower limits to each parameter that can be
adjusted in the Vizier ML experiment.

Also in some cases applied secondary limits on the range of the final
"used" values.

Defaults and limits may well require further tuning after subsequent
rounds of experimentation.

Re-factor get_sr_decay_rate().
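All of the secondary limits added in vp9_cx_iface.c below follow one pattern; expressed as a hypothetical helper (the patch writes the comparisons out inline instead):

  /* Hypothetical helper, equivalent to the inline clamps below. */
  static double clamp_factor(double v, double lo, double hi) {
    if (v < lo) return lo;
    if (v > hi) return hi;
    return v;
  }

  /* e.g. cpi->twopass.active_wq_factor =
   *    clamp_factor(cpi->twopass.active_wq_factor, 0.25, 16.0); */

Most factors are clamped to [0.25, 4.0]; active_wq_factor allows up to 16.0 and zm_factor up to 2.0.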
Change-Id: I28e804ce3d3710f30cd51a203348e4ab23ef06c0
---
 vp9/encoder/vp9_firstpass.c | 41 +++++++++++--------
 vp9/vp9_cx_iface.c          | 79 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 103 insertions(+), 17 deletions(-)

diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index a4717ad036..ce7590fe4c 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -1832,33 +1832,35 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
 }

 /* This function considers how the quality of prediction may be deteriorating
- * with distance. It comapres the coded error for the last frame and the
+ * with distance. It compares the coded error for the last frame and the
  * second reference frame (usually two frames old) and also applies a factor
  * based on the extent of INTRA coding.
  *
  * The decay factor is then used to reduce the contribution of frames further
- * from the alt-ref or golden frame, to the bitframe boost calculation for that
+ * from the alt-ref or golden frame, to the bitrate boost calculation for that
  * alt-ref or golden frame.
  */
 static double get_sr_decay_rate(const TWO_PASS *const twopass,
                                 const FIRSTPASS_STATS *frame) {
   double sr_diff = (frame->sr_coded_error - frame->coded_error);
   double sr_decay = 1.0;
-  double modified_pct_inter;
-  double modified_pcnt_intra;
-
-  modified_pct_inter = frame->pcnt_inter;
-  if ((frame->coded_error > LOW_CODED_ERR_PER_MB) &&
-      ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
-       (double)NCOUNT_FRAME_II_THRESH)) {
-    modified_pct_inter =
-        frame->pcnt_inter + frame->pcnt_intra_low - frame->pcnt_neutral;
-  }
-  modified_pcnt_intra = 100 * (1.0 - modified_pct_inter);

+  // Do nothing if the second ref to last frame error difference is
+  // very small or even negative.
   if ((sr_diff > LOW_SR_DIFF_TRHESH)) {
-    double sr_diff_part =
+    const double sr_diff_part =
        twopass->sr_diff_factor * ((sr_diff * 0.25) / frame->intra_error);
+    double modified_pct_inter = frame->pcnt_inter;
+    double modified_pcnt_intra;
+
+    if ((frame->coded_error > LOW_CODED_ERR_PER_MB) &&
+        ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
+         (double)NCOUNT_FRAME_II_THRESH)) {
+      modified_pct_inter =
+          frame->pcnt_inter + frame->pcnt_intra_low - frame->pcnt_neutral;
+    }
+    modified_pcnt_intra = 100 * (1.0 - modified_pct_inter);
+
     sr_decay = 1.0 - sr_diff_part - (INTRA_PART * modified_pcnt_intra);
   }
   return VPXMAX(sr_decay, twopass->sr_default_decay_limit);
@@ -1979,7 +1981,7 @@ static double calc_frame_boost(const FRAME_INFO *frame_info,
   const double boost_q_correction = VPXMIN((0.5 + (lq * 0.015)), 1.5);
   const double active_area = calculate_active_area(frame_info, this_frame);

-  // Underlying boost factor is based on inter error ratio.
+  // Frame boost is based on inter error.
   frame_boost = (twopass->err_per_mb * active_area) /
                 DOUBLE_DIVIDE_CHECK(this_frame->coded_error);

@@ -2007,7 +2009,7 @@ static double calc_kf_frame_boost(VP9_COMP *cpi,
       calculate_active_area(&cpi->frame_info, this_frame);
   double max_boost;

-  // Underlying boost factor is based on inter error ratio.
+  // Frame boost is based on inter error.
frame_boost = (twopass->kf_err_per_mb * active_area) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator); @@ -3499,14 +3501,21 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->active_wq_factor *= AV_WQ_FACTOR; twopass->err_per_mb *= BASELINE_ERR_PER_MB; twopass->sr_default_decay_limit *= DEFAULT_DECAY_LIMIT; + if (twopass->sr_default_decay_limit > 1.0) // > 1.0 here makes no sense + twopass->sr_default_decay_limit = 1.0; twopass->sr_diff_factor *= 1.0; twopass->gf_frame_max_boost *= GF_MAX_FRAME_BOOST; twopass->gf_max_total_boost *= MAX_GF_BOOST; + // NOTE: In use max boost has precedence over min boost. So even if min is + // somehow set higher than max the final boost value will be clamped to the + // appropriate maximum. twopass->kf_frame_min_boost *= KF_MIN_FRAME_BOOST; twopass->kf_frame_max_boost_first *= KF_MAX_FRAME_BOOST; twopass->kf_frame_max_boost_subs *= KF_MAX_FRAME_BOOST; twopass->kf_max_total_boost *= MAX_KF_TOT_BOOST; twopass->zm_factor *= DEFAULT_ZM_FACTOR; + if (twopass->zm_factor > 1.0) // > 1.0 here makes no sense + twopass->zm_factor = 1.0; // Correction for the fact that the kf_err_per_mb_factor default is // already different for different video formats and ensures that a passed diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index c700620ef3..438f9b5ed9 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -664,41 +664,118 @@ static vpx_codec_err_t set_twopass_params_from_config( cpi->twopass.use_vizier_rc_params = cfg->use_vizier_rc_params; // The values set here are factors that will be applied to default values - // to get the final value used in the two pass code. 1.0 will hence + // to get the final value used in the two pass code. Hence 1.0 will // match the default behaviour when not using passed in values. + // We also apply limits here to prevent the user from applying settings + // that make no sense. cpi->twopass.active_wq_factor = (double)cfg->active_wq_factor.num / (double)cfg->active_wq_factor.den; + if (cpi->twopass.active_wq_factor < 0.25) + cpi->twopass.active_wq_factor = 0.25; + else if (cpi->twopass.active_wq_factor > 16.0) + cpi->twopass.active_wq_factor = 16.0; + cpi->twopass.err_per_mb = (double)cfg->err_per_mb_factor.num / (double)cfg->err_per_mb_factor.den; + if (cpi->twopass.err_per_mb < 0.25) + cpi->twopass.err_per_mb = 0.25; + else if (cpi->twopass.err_per_mb > 4.0) + cpi->twopass.err_per_mb = 4.0; + cpi->twopass.sr_default_decay_limit = (double)cfg->sr_default_decay_limit.num / (double)cfg->sr_default_decay_limit.den; + if (cpi->twopass.sr_default_decay_limit < 0.25) + cpi->twopass.sr_default_decay_limit = 0.25; + // If the default changes this will need to change. 
+ else if (cpi->twopass.sr_default_decay_limit > 1.33) + cpi->twopass.sr_default_decay_limit = 1.33; + cpi->twopass.sr_diff_factor = (double)cfg->sr_diff_factor.num / (double)cfg->sr_diff_factor.den; + if (cpi->twopass.sr_diff_factor < 0.25) + cpi->twopass.sr_diff_factor = 0.25; + else if (cpi->twopass.sr_diff_factor > 4.0) + cpi->twopass.sr_diff_factor = 4.0; + cpi->twopass.kf_err_per_mb = (double)cfg->kf_err_per_mb_factor.num / (double)cfg->kf_err_per_mb_factor.den; + if (cpi->twopass.kf_err_per_mb < 0.25) + cpi->twopass.kf_err_per_mb = 0.25; + else if (cpi->twopass.kf_err_per_mb > 4.0) + cpi->twopass.kf_err_per_mb = 4.0; + cpi->twopass.kf_frame_min_boost = (double)cfg->kf_frame_min_boost_factor.num / (double)cfg->kf_frame_min_boost_factor.den; + if (cpi->twopass.kf_frame_min_boost < 0.25) + cpi->twopass.kf_frame_min_boost = 0.25; + else if (cpi->twopass.kf_frame_min_boost > 4.0) + cpi->twopass.kf_frame_min_boost = 4.0; + cpi->twopass.kf_frame_max_boost_first = (double)cfg->kf_frame_max_boost_first_factor.num / (double)cfg->kf_frame_max_boost_first_factor.den; + if (cpi->twopass.kf_frame_max_boost_first < 0.25) + cpi->twopass.kf_frame_max_boost_first = 0.25; + else if (cpi->twopass.kf_frame_max_boost_first > 4.0) + cpi->twopass.kf_frame_max_boost_first = 4.0; + cpi->twopass.kf_frame_max_boost_subs = (double)cfg->kf_frame_max_boost_subs_factor.num / (double)cfg->kf_frame_max_boost_subs_factor.den; + if (cpi->twopass.kf_frame_max_boost_subs < 0.25) + cpi->twopass.kf_frame_max_boost_subs = 0.25; + else if (cpi->twopass.kf_frame_max_boost_subs > 4.0) + cpi->twopass.kf_frame_max_boost_subs = 4.0; + cpi->twopass.kf_max_total_boost = (double)cfg->kf_max_total_boost_factor.num / (double)cfg->kf_max_total_boost_factor.den; + if (cpi->twopass.kf_max_total_boost < 0.25) + cpi->twopass.kf_max_total_boost = 0.25; + else if (cpi->twopass.kf_max_total_boost > 4.0) + cpi->twopass.kf_max_total_boost = 4.0; + cpi->twopass.gf_max_total_boost = (double)cfg->gf_max_total_boost_factor.num / (double)cfg->gf_max_total_boost_factor.den; + if (cpi->twopass.gf_max_total_boost < 0.25) + cpi->twopass.gf_max_total_boost = 0.25; + else if (cpi->twopass.gf_max_total_boost > 4.0) + cpi->twopass.gf_max_total_boost = 4.0; + cpi->twopass.gf_frame_max_boost = (double)cfg->gf_frame_max_boost_factor.num / (double)cfg->gf_frame_max_boost_factor.den; + if (cpi->twopass.gf_frame_max_boost < 0.25) + cpi->twopass.gf_frame_max_boost = 0.25; + else if (cpi->twopass.gf_frame_max_boost > 4.0) + cpi->twopass.gf_frame_max_boost = 4.0; + cpi->twopass.zm_factor = (double)cfg->zm_factor.num / (double)cfg->zm_factor.den; + if (cpi->twopass.zm_factor < 0.25) + cpi->twopass.zm_factor = 0.25; + else if (cpi->twopass.zm_factor > 2.0) + cpi->twopass.zm_factor = 2.0; + cpi->rd_ctrl.rd_mult_inter_qp_fac = (double)cfg->rd_mult_inter_qp_fac.num / (double)cfg->rd_mult_inter_qp_fac.den; + if (cpi->rd_ctrl.rd_mult_inter_qp_fac < 0.25) + cpi->rd_ctrl.rd_mult_inter_qp_fac = 0.25; + else if (cpi->rd_ctrl.rd_mult_inter_qp_fac > 4.0) + cpi->rd_ctrl.rd_mult_inter_qp_fac = 4.0; + cpi->rd_ctrl.rd_mult_arf_qp_fac = (double)cfg->rd_mult_arf_qp_fac.num / (double)cfg->rd_mult_arf_qp_fac.den; + if (cpi->rd_ctrl.rd_mult_arf_qp_fac < 0.25) + cpi->rd_ctrl.rd_mult_arf_qp_fac = 0.25; + else if (cpi->rd_ctrl.rd_mult_arf_qp_fac > 4.0) + cpi->rd_ctrl.rd_mult_arf_qp_fac = 4.0; + cpi->rd_ctrl.rd_mult_key_qp_fac = (double)cfg->rd_mult_key_qp_fac.num / (double)cfg->rd_mult_key_qp_fac.den; + if (cpi->rd_ctrl.rd_mult_key_qp_fac < 0.25) + cpi->rd_ctrl.rd_mult_key_qp_fac = 0.25; + 
else if (cpi->rd_ctrl.rd_mult_key_qp_fac > 4.0) + cpi->rd_ctrl.rd_mult_key_qp_fac = 4.0; return VPX_CODEC_OK; } From 5ca8c5f31d2d1c5d9760577a651e0d710cef7663 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 27 Apr 2021 18:02:35 -0700 Subject: [PATCH 083/926] vpx_convolve_copy_neon: prefer != 0 to > 0 in tests this produces better assembly code Change-Id: I80ed1a165512e941b35a4965faa0c44403357e91 --- vpx_dsp/arm/vpx_convolve_copy_neon.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve_copy_neon.c b/vpx_dsp/arm/vpx_convolve_copy_neon.c index 7abed67a40..361ec8a806 100644 --- a/vpx_dsp/arm/vpx_convolve_copy_neon.c +++ b/vpx_dsp/arm/vpx_convolve_copy_neon.c @@ -33,7 +33,7 @@ void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, src += src_stride; dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w == 8) { // copy8 uint8x8_t s0, s1; do { @@ -47,7 +47,7 @@ void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, vst1_u8(dst, s1); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w < 32) { // copy16 uint8x16_t s0, s1; do { @@ -61,7 +61,7 @@ void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, vst1q_u8(dst, s1); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w == 32) { // copy32 uint8x16_t s0, s1, s2, s3; do { @@ -79,7 +79,7 @@ void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, vst1q_u8(dst + 16, s3); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else { // copy64 uint8x16_t s0, s1, s2, s3; do { From 07cf024d4d061feac503054d15d039c2cfbce35e Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 27 Apr 2021 18:02:35 -0700 Subject: [PATCH 084/926] vpx_convolve_avg_neon: prefer != 0 to > 0 in tests this produces better assembly code Change-Id: I174b67a595d7efeb60c921f066302043b1c7d84e --- vpx_dsp/arm/vpx_convolve_avg_neon.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve_avg_neon.c b/vpx_dsp/arm/vpx_convolve_avg_neon.c index 07349d03ae..8e3ee599f4 100644 --- a/vpx_dsp/arm/vpx_convolve_avg_neon.c +++ b/vpx_dsp/arm/vpx_convolve_avg_neon.c @@ -43,7 +43,7 @@ void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dd0), 1); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w == 8) { // avg8 uint8x8_t s0, s1, d0, d1; uint8x16_t s01, d01; @@ -64,7 +64,7 @@ void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, vst1_u8(dst, vget_high_u8(d01)); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w < 32) { // avg16 uint8x16_t s0, s1, d0, d1; do { @@ -83,7 +83,7 @@ void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, vst1q_u8(dst, d1); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w == 32) { // avg32 uint8x16_t s0, s1, s2, s3, d0, d1, d2, d3; do { @@ -110,7 +110,7 @@ void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, vst1q_u8(dst + 16, d3); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else { // avg64 uint8x16_t s0, s1, s2, s3, d0, d1, d2, d3; do { From ff67c848115ae1356f21d361342091140d176c1d Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 27 Apr 2021 18:02:35 -0700 Subject: [PATCH 085/926] vpx_convolve_neon: prefer != 0 to > 0 in tests this produces better assembly code; the horizontal convolve is called with an adjusted 
intermediate_height where it may over process some rows so the checks in those functions remain. Change-Id: Iebe9842f2a13a4960d9a5addde9489452f5ce33a --- vpx_dsp/arm/vpx_convolve8_neon.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index 08ae17dbab..c55c9fb568 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -145,7 +145,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src += 4; dst += 4; w -= 4; - } while (w > 0); + } while (w != 0); } else { const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); @@ -296,7 +296,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s += 8; d += 8; width -= 8; - } while (width > 0); + } while (width != 0); src += 8 * src_stride; dst += 8 * dst_stride; h -= 8; @@ -402,7 +402,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src += 4; dst += 4; w -= 4; - } while (w > 0); + } while (w != 0); } else { const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); @@ -586,7 +586,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s += 8; d += 8; width -= 8; - } while (width > 0); + } while (width != 0); src += 8 * src_stride; dst += 8 * dst_stride; h -= 8; @@ -679,7 +679,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s5 = s9; s6 = s10; h -= 4; - } while (h > 0); + } while (h != 0); } else { const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); @@ -759,11 +759,11 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s5 = s9; s6 = s10; height -= 4; - } while (height > 0); + } while (height != 0); src += 8; dst += 8; w -= 8; - } while (w > 0); + } while (w != 0); } } @@ -860,7 +860,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s5 = s9; s6 = s10; h -= 4; - } while (h > 0); + } while (h != 0); } else { const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); @@ -950,10 +950,10 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s5 = s9; s6 = s10; height -= 4; - } while (height > 0); + } while (height != 0); src += 8; dst += 8; w -= 8; - } while (w > 0); + } while (w != 0); } } From 4ec84326cc65c5b042bf06d222d51e51d7e5461d Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Wed, 28 Apr 2021 13:54:07 -0700 Subject: [PATCH 086/926] Bump ABI version Due to recent changes to command line options for rate control parameters. Change-Id: I1de7cb4ff2850a3ed19ec216dd9d07f64a118e92 --- vpx/vpx_encoder.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index c4589bea10..f8fdfc0307 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -58,7 +58,7 @@ extern "C" { * fields to structures */ #define VPX_ENCODER_ABI_VERSION \ - (14 + VPX_CODEC_ABI_VERSION + \ + (15 + VPX_CODEC_ABI_VERSION + \ VPX_EXT_RATECTRL_ABI_VERSION) /**<\hideinitializer*/ /*! 
\brief Encoder capabilities bitfield From e14026ac21a6b6c93fd0fc954055643ed30972e0 Mon Sep 17 00:00:00 2001 From: Paul Wilkins Date: Thu, 29 Apr 2021 11:06:16 +0100 Subject: [PATCH 087/926] Add assert for zero_motion_factor range Change clamp to an assert so we are warned if changes to input ranges or defaults in the future lead to an invalid value. Change-Id: Idb4e0729f477a519bfff3083cdce3891e2fc6faa --- vp9/encoder/vp9_firstpass.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index ce7590fe4c..b63d47a05e 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -1882,13 +1882,8 @@ static double get_prediction_decay_rate(const TWO_PASS *const twopass, double zero_motion_factor = twopass->zm_factor * (frame_stats->pcnt_inter - frame_stats->pcnt_motion); - // Clamp value to range 0.0 to 1.0 - // This should happen anyway if input values are sensibly clamped but checked - // here just in case. - if (zero_motion_factor > 1.0) - zero_motion_factor = 1.0; - else if (zero_motion_factor < 0.0) - zero_motion_factor = 0.0; + // Check that the zero motion factor is valid + assert(zero_motion_factor >= 0.0 && zero_motion_factor <= 1.0); return VPXMAX(zero_motion_factor, (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor))); From abc7105acdfbbeaeecf41c675148683a1cb8b4f7 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 4 May 2021 12:10:21 -0700 Subject: [PATCH 088/926] test.mk: enable vp9_denoiser_test w/NEON this file uses GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST so it's safe to enable unconditionally. the filter check fell out of sync with the code, there's a sse2 and neon implementation for the filter. Change-Id: I2a3336ccef3fb524ca5d9b8f88279240c9a276aa --- test/test.mk | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/test.mk b/test/test.mk index 04902382db..b0319fb0de 100644 --- a/test/test.mk +++ b/test/test.mk @@ -193,10 +193,8 @@ LIBVPX_TEST_SRCS-$(CONFIG_NON_GREEDY_MV) += non_greedy_mv_test.cc endif ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_TEMPORAL_DENOISING),yesyes) -ifneq (, $(filter yes, $(HAVE_SSE2) $(HAVE_AVX2))) LIBVPX_TEST_SRCS-yes += vp9_denoiser_test.cc endif -endif LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_arf_freq_test.cc ifeq ($(CONFIG_VP9_ENCODER),yes) From 12a14913947b510514746389319b49a188a53579 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 4 May 2021 12:13:17 -0700 Subject: [PATCH 089/926] vp9_denoiser_neon,horizontal_add_s8x16: use vaddlv w/aarch64 this reduces the number of instructions to compute the sum Change-Id: Icae4d4fb3e343d5b6e5a095c60ac6d171b3e7d54 --- vp9/encoder/arm/neon/vp9_denoiser_neon.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vp9/encoder/arm/neon/vp9_denoiser_neon.c b/vp9/encoder/arm/neon/vp9_denoiser_neon.c index 4152e7bb5d..53e8c7e498 100644 --- a/vp9/encoder/arm/neon/vp9_denoiser_neon.c +++ b/vp9/encoder/arm/neon/vp9_denoiser_neon.c @@ -21,6 +21,9 @@ // Compute the sum of all pixel differences of this MB. 
static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) { +#if defined(__aarch64__) + return vaddlvq_s8(v_sum_diff_total); +#else const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total); const int32x4_t fedc_ba98_7654_3210 = vpaddlq_s16(fe_dc_ba_98_76_54_32_10); const int64x2_t fedcba98_76543210 = vpaddlq_s32(fedc_ba98_7654_3210); @@ -28,6 +31,7 @@ static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) { vget_low_s64(fedcba98_76543210)); const int sum_diff = vget_lane_s32(vreinterpret_s32_s64(x), 0); return sum_diff; +#endif } // Denoise a 16x1 vector. From c1f77a3689a6cf5e95e1c1ae35d76f4f171f5ef3 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Sun, 11 Apr 2021 15:20:36 +0100 Subject: [PATCH 090/926] Implement horizontal convolution using Neon SDOT instruction Add an alternative AArch64 implementation of vpx_convolve8_horiz_neon for targets that implement the Armv8.4-A SDOT (signed dot product) instruction. The existing MLA-based implementation of vpx_convolve8_horiz_neon is retained and used on target CPUs that do not implement the SDOT instruction (or CPUs executing in AArch32 mode). The availability of the SDOT instruction is indicated by the feature macro __ARM_FEATURE_DOTPROD. Co-authored by: James Greenhalgh Change-Id: I5337286b0f5f2775ad7cdbc0174785ae694363cc --- vpx_dsp/arm/mem_neon.h | 18 +++++ vpx_dsp/arm/vpx_convolve8_neon.c | 109 +++++++++++++++++++++++++++++++ vpx_dsp/arm/vpx_convolve8_neon.h | 63 ++++++++++++++++++ 3 files changed, 190 insertions(+) diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index 943865b3c2..c89f92d1ad 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -19,6 +19,24 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" +// Support for these xN intrinsics is lacking in older versions of GCC. 
+#if defined(__GNUC__) && !defined(__clang__) +#if __GNUC__ < 8 || defined(__arm__) +static INLINE uint8x16x2_t vld1q_u8_x2(uint8_t const *ptr) { + uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } }; + return res; +} +#endif + +#if __GNUC__ < 9 || defined(__arm__) +static INLINE uint8x16x3_t vld1q_u8_x3(uint8_t const *ptr) { + uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16), + vld1q_u8(ptr + 2 * 16) } }; + return res; +} +#endif +#endif + static INLINE int16x4_t create_s16x4_neon(const int16_t c0, const int16_t c1, const int16_t c2, const int16_t c3) { return vcreate_s16((uint16_t)c0 | ((uint32_t)c1 << 16) | diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index c55c9fb568..a86adb4e72 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -14,6 +14,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/arm/vpx_convolve8_neon.h" #include "vpx_ports/mem.h" @@ -52,6 +53,112 @@ static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p, vst1_u8(s, s7); } +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ + (__ARM_FEATURE_DOTPROD == 1) + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x16_t range_limit = vdupq_n_u8(128); + uint8x16_t s0, s1, s2, s3; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int32x4_t t0, t1, t2, t3; + int16x8_t t01, t23; + uint8x8_t d01, d23; + + s0 = vld1q_u8(src); + src += src_stride; + s1 = vld1q_u8(src); + src += src_stride; + s2 = vld1q_u8(src); + src += src_stride; + s3 = vld1q_u8(src); + src += src_stride; + + t0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl); + t1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl); + t2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl); + t3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl); + + t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); + t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); + d01 = vqrshrun_n_s16(t01, 7); + d23 = vqrshrun_n_s16(t23, 7); + + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); + dst += dst_stride; + h -= 4; + } while (h > 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + 
+ do { + width = w; + s = src; + d = dst; + do { + s0 = vld1q_u8(s + 0 * src_stride); + s1 = vld1q_u8(s + 1 * src_stride); + s2 = vld1q_u8(s + 2 * src_stride); + s3 = vld1q_u8(s + 3 * src_stride); + + d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl); + d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl); + d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl); + d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl); + + vst1_u8(d + 0 * dst_stride, d0); + vst1_u8(d + 1 * dst_stride, d1); + vst1_u8(d + 2 * dst_stride, d2); + vst1_u8(d + 3 * dst_stride, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } +} + +#else + void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -305,6 +412,8 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } +#endif + void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index 4f27da9d2f..14e7488540 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -72,6 +72,69 @@ static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p, *s7 = vld1q_u8(s); } +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ + (__ARM_FEATURE_DOTPROD == 1) + +static INLINE int32x4_t convolve8_4_dot(uint8x16_t samples, + const int8x8_t filters, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x2_t permute_tbl) { + int8x16_t clamped_samples, permuted_samples[2]; + int32x4_t sum; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); + /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ + permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); + + /* Accumulate dot product into 'correction' to account for range clamp. */ + sum = vdotq_lane_s32(correction, permuted_samples[0], filters, 0); + sum = vdotq_lane_s32(sum, permuted_samples[1], filters, 1); + + /* Narrowing and packing is performed by the caller. */ + return sum; +} + +static INLINE uint8x8_t convolve8_8_dot(uint8x16_t samples, + const int8x8_t filters, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x3_t permute_tbl) { + int8x16_t clamped_samples, permuted_samples[3]; + int32x4_t sum0, sum1; + int16x8_t sum; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + /* Permute samples ready for dot product. 
*/ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); + /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ + permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); + /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */ + permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); + + /* Accumulate dot product into 'correction' to account for range clamp. */ + /* First 4 output values. */ + sum0 = vdotq_lane_s32(correction, permuted_samples[0], filters, 0); + sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filters, 1); + /* Second 4 output values. */ + sum1 = vdotq_lane_s32(correction, permuted_samples[1], filters, 0); + sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filters, 1); + + /* Narrow and re-pack. */ + sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); + return vqrshrun_n_s16(sum, 7); +} + +#endif + static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, From a28d43658e3347d55d70655e6ee3d87d0d3fba8a Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 6 May 2021 14:51:05 +0100 Subject: [PATCH 091/926] Optimize Neon SAD reductions using wider ADDP instruction Implement AArch64-only paths for each of the Neon SAD reduction functions, making use of a wider pairwise addition instruction only available on AArch64. This change removes the need for shuffling between high and low halves of Neon vectors - resulting in a faster reduction that requires fewer instructions. Bug: b/181236880 Change-Id: I1c48580b4aec27222538eeab44e38ecc1f2009dc --- vpx_dsp/arm/sad4d_neon.c | 53 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 06443c6995..34c0a7adef 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -34,7 +34,9 @@ static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride, uint32_t *const res) { int i; uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) }; +#if !defined(__aarch64__) uint16x4_t a[2]; +#endif uint32x4_t r; assert(!((intptr_t)src_ptr % sizeof(uint32_t))); @@ -51,9 +53,14 @@ static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride, abs[1] = vabal_u8(abs[1], s, ref23); } +#if defined(__aarch64__) + abs[0] = vpaddq_u16(abs[0], abs[1]); + r = vpaddlq_u16(abs[0]); +#else a[0] = vpadd_u16(vget_low_u16(abs[0]), vget_high_u16(abs[0])); a[1] = vpadd_u16(vget_low_u16(abs[1]), vget_high_u16(abs[1])); r = vpaddlq_u16(vcombine_u16(a[0], a[1])); +#endif vst1q_u32(res, r); } @@ -74,6 +81,12 @@ void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, // Can handle 512 pixels' sad sum (such as 16x32 or 32x16) static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/, uint32_t *const res) { +#if defined(__aarch64__) + const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); + const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); + const uint16x8_t b0 = vpaddq_u16(a0, a1); + const uint32x4_t r = vpaddlq_u16(b0); +#else const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); @@ -81,12 +94,21 @@ static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/, const uint16x4_t b0 = vpadd_u16(a0, a1); const uint16x4_t 
b1 = vpadd_u16(a2, a3); const uint32x4_t r = vpaddlq_u16(vcombine_u16(b0, b1)); +#endif vst1q_u32(res, r); } // Can handle 1024 pixels' sad sum (such as 32x32) static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/, uint32_t *const res) { +#if defined(__aarch64__) + const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); + const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); + const uint32x4_t b0 = vpaddlq_u16(a0); + const uint32x4_t b1 = vpaddlq_u16(a1); + const uint32x4_t r = vpaddq_u32(b0, b1); + vst1q_u32(res, r); +#else const uint16x4_t a0 = vpadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); const uint16x4_t a1 = vpadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); const uint16x4_t a2 = vpadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); @@ -96,11 +118,22 @@ static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/, const uint32x2_t c0 = vpadd_u32(vget_low_u32(b0), vget_high_u32(b0)); const uint32x2_t c1 = vpadd_u32(vget_low_u32(b1), vget_high_u32(b1)); vst1q_u32(res, vcombine_u32(c0, c1)); +#endif } // Can handle 2048 pixels' sad sum (such as 32x64 or 64x32) static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/, uint32_t *const res) { +#if defined(__aarch64__) + const uint32x4_t a0 = vpaddlq_u16(sum[0]); + const uint32x4_t a1 = vpaddlq_u16(sum[1]); + const uint32x4_t a2 = vpaddlq_u16(sum[2]); + const uint32x4_t a3 = vpaddlq_u16(sum[3]); + const uint32x4_t b0 = vpaddq_u32(a0, a1); + const uint32x4_t b1 = vpaddq_u32(a2, a3); + const uint32x4_t r = vpaddq_u32(b0, b1); + vst1q_u32(res, r); +#else const uint32x4_t a0 = vpaddlq_u16(sum[0]); const uint32x4_t a1 = vpaddlq_u16(sum[1]); const uint32x4_t a2 = vpaddlq_u16(sum[2]); @@ -112,11 +145,30 @@ static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/, const uint32x2_t c0 = vpadd_u32(b0, b1); const uint32x2_t c1 = vpadd_u32(b2, b3); vst1q_u32(res, vcombine_u32(c0, c1)); +#endif } // Can handle 4096 pixels' sad sum (such as 64x64) static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/, uint32_t *const res) { +#if defined(__aarch64__) + const uint32x4_t a0 = vpaddlq_u16(sum[0]); + const uint32x4_t a1 = vpaddlq_u16(sum[1]); + const uint32x4_t a2 = vpaddlq_u16(sum[2]); + const uint32x4_t a3 = vpaddlq_u16(sum[3]); + const uint32x4_t a4 = vpaddlq_u16(sum[4]); + const uint32x4_t a5 = vpaddlq_u16(sum[5]); + const uint32x4_t a6 = vpaddlq_u16(sum[6]); + const uint32x4_t a7 = vpaddlq_u16(sum[7]); + const uint32x4_t b0 = vaddq_u32(a0, a1); + const uint32x4_t b1 = vaddq_u32(a2, a3); + const uint32x4_t b2 = vaddq_u32(a4, a5); + const uint32x4_t b3 = vaddq_u32(a6, a7); + const uint32x4_t c0 = vpaddq_u32(b0, b1); + const uint32x4_t c1 = vpaddq_u32(b2, b3); + const uint32x4_t r = vpaddq_u32(c0, c1); + vst1q_u32(res, r); +#else const uint32x4_t a0 = vpaddlq_u16(sum[0]); const uint32x4_t a1 = vpaddlq_u16(sum[1]); const uint32x4_t a2 = vpaddlq_u16(sum[2]); @@ -136,6 +188,7 @@ static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/, const uint32x2_t d0 = vpadd_u32(c0, c1); const uint32x2_t d1 = vpadd_u32(c2, c3); vst1q_u32(res, vcombine_u32(d0, d1)); +#endif } static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride, From f7364c05748b70a1e0fd57849665a9d9f0990803 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 6 May 2021 15:11:52 +0100 Subject: [PATCH 092/926] Manually unroll the inner loop of Neon sad16x_4d() Manually unrolling the inner loop is sufficient to stop the compiler getting confused and emitting inefficient code. 
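One plausible reading of the problem (not stated in the commit): passing &sum[j] with a loop-variable index keeps the accumulator array addressable, so the four uint16x8_t values may bounce through the stack on every iteration, whereas constant indices let the compiler promote each one to its own SIMD register:

  /* Indexed: sum[] must stay addressable across iterations. */
  sad16_neon(ref_loop[j], s, &sum[j]);

  /* Unrolled: sum[0]..sum[3] can each live in a register. */
  sad16_neon(ref_loop[0], s, &sum[0]);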
Co-authored by: James Greenhalgh Bug: b/181236880 Change-Id: I860768ce0e6c0e0b6286d3fc1b94f0eae95d0a1a --- vpx_dsp/arm/sad4d_neon.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 34c0a7adef..256bc41ce7 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -243,7 +243,7 @@ static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t *res, const int height) { - int i, j; + int i; const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], ref_array[3] }; uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), @@ -252,10 +252,15 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, for (i = 0; i < height; ++i) { const uint8x16_t s = vld1q_u8(src_ptr); src_ptr += src_stride; - for (j = 0; j < 4; ++j) { - sad16_neon(ref_loop[j], s, &sum[j]); - ref_loop[j] += ref_stride; - } + /* Manual unrolling here stops the compiler from getting confused. */ + sad16_neon(ref_loop[0], s, &sum[0]); + ref_loop[0] += ref_stride; + sad16_neon(ref_loop[1], s, &sum[1]); + ref_loop[1] += ref_stride; + sad16_neon(ref_loop[2], s, &sum[2]); + ref_loop[2] += ref_stride; + sad16_neon(ref_loop[3], s, &sum[3]); + ref_loop[3] += ref_stride; } sad_512_pel_final_neon(sum, res); From 43df64a9ac32491a25772ac9c678f45b2b7004d2 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 7 May 2021 19:35:25 -0700 Subject: [PATCH 093/926] img_alloc_helper: make align var unsigned quiets an integer sanitizer warning: vpx/src/vpx_image.c:101:25: runtime error: implicit conversion from type 'int' of value -2 (32-bit, signed) to type 'unsigned int' changed the value to 4294967294 (32-bit, unsigned) Change-Id: Ifeac31cc80811081c1ba10aadaa94dc36cd46efa --- vpx/src/vpx_image.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vpx/src/vpx_image.c b/vpx/src/vpx_image.c index 2b7411f94f..2a7afc00c2 100644 --- a/vpx/src/vpx_image.c +++ b/vpx/src/vpx_image.c @@ -22,7 +22,7 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned char *img_data) { unsigned int h, w, s, xcs, ycs, bps; unsigned int stride_in_bytes; - int align; + unsigned int align; if (img != NULL) memset(img, 0, sizeof(vpx_image_t)); From 0f563e5fadbccb10fabd6ac80c256a4321401e22 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Fri, 7 May 2021 13:25:51 +0100 Subject: [PATCH 094/926] Optimize Neon reductions in sum_neon.h using ADDV instruction Use the AArch64-only ADDV and ADDLV instructions to accelerate reductions that add across a Neon vector in sum_neon.h. This commit also refactors the inline functions to return a scalar instead of a vector - allowing for optimization of the surrounding code at each call site. 
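For clarity, each of these reductions computes a plain horizontal sum; a scalar model of the uint16x8 case (illustrative only, the AArch64 path does this with a single ADDLV instruction):

  static uint32_t horizontal_add_uint16x8_model(const uint16_t a[8]) {
    uint32_t sum = 0;
    int i;
    for (i = 0; i < 8; ++i) sum += a[i]; /* widened to 32 bits */
    return sum;
  }

Returning a scalar also lets callers finish in plain integer arithmetic, e.g. vpx_avg_8x8_neon can round with (sum + (1 << 5)) >> 6 instead of extracting a vector lane.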
Bug: b/181236880 Change-Id: Ieed2a2dd3c74f8a52957bf404141ffc044bd5d79 --- vpx_dsp/arm/avg_neon.c | 11 +++-------- vpx_dsp/arm/fdct_partial_neon.c | 32 +++++++++++--------------------- vpx_dsp/arm/sad_neon.c | 24 ++++++++++++------------ vpx_dsp/arm/sum_neon.h | 33 ++++++++++++++++++++++++--------- vpx_dsp/arm/variance_neon.c | 21 +++++++++------------ 5 files changed, 59 insertions(+), 62 deletions(-) diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c index fa7dd09600..8e57bdaa50 100644 --- a/vpx_dsp/arm/avg_neon.c +++ b/vpx_dsp/arm/avg_neon.c @@ -22,15 +22,13 @@ uint32_t vpx_avg_4x4_neon(const uint8_t *a, int a_stride) { const uint8x16_t b = load_unaligned_u8q(a, a_stride); const uint16x8_t c = vaddl_u8(vget_low_u8(b), vget_high_u8(b)); - const uint32x2_t d = horizontal_add_uint16x8(c); - return vget_lane_u32(vrshr_n_u32(d, 4), 0); + return (horizontal_add_uint16x8(c) + (1 << 3)) >> 4; } uint32_t vpx_avg_8x8_neon(const uint8_t *a, int a_stride) { int i; uint8x8_t b, c; uint16x8_t sum; - uint32x2_t d; b = vld1_u8(a); a += a_stride; c = vld1_u8(a); @@ -43,9 +41,7 @@ uint32_t vpx_avg_8x8_neon(const uint8_t *a, int a_stride) { sum = vaddw_u8(sum, d); } - d = horizontal_add_uint16x8(sum); - - return vget_lane_u32(vrshr_n_u32(d, 6), 0); + return (horizontal_add_uint16x8(sum) + (1 << 5)) >> 6; } // coeff: 16 bits, dynamic range [-32640, 32640]. @@ -139,8 +135,7 @@ int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) { ref += 16; } - return vget_lane_s16(vreinterpret_s16_u32(horizontal_add_uint16x8(vec_sum)), - 0); + return (int16_t)horizontal_add_uint16x8(vec_sum); } // ref, src = [0, 510] - max diff = 16-bits diff --git a/vpx_dsp/arm/fdct_partial_neon.c b/vpx_dsp/arm/fdct_partial_neon.c index e73de41d77..0a1cdca41d 100644 --- a/vpx_dsp/arm/fdct_partial_neon.c +++ b/vpx_dsp/arm/fdct_partial_neon.c @@ -15,19 +15,10 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" -static INLINE tran_low_t get_lane(const int32x2_t a) { -#if CONFIG_VP9_HIGHBITDEPTH - return vget_lane_s32(a, 0); -#else - return vget_lane_s16(vreinterpret_s16_s32(a), 0); -#endif // CONFIG_VP9_HIGHBITDETPH -} - void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) { int16x4_t a0, a1, a2, a3; int16x8_t b0, b1; int16x8_t c; - int32x2_t d; a0 = vld1_s16(input); input += stride; @@ -42,9 +33,7 @@ void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) { c = vaddq_s16(b0, b1); - d = horizontal_add_int16x8(c); - - output[0] = get_lane(vshl_n_s32(d, 1)); + output[0] = (tran_low_t)(horizontal_add_int16x8(c) << 1); output[1] = 0; } @@ -57,7 +46,7 @@ void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) { sum = vaddq_s16(sum, input_00); } - output[0] = get_lane(horizontal_add_int16x8(sum)); + output[0] = (tran_low_t)horizontal_add_int16x8(sum); output[1] = 0; } @@ -66,7 +55,7 @@ void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, int r; int16x8_t left = vld1q_s16(input); int16x8_t right = vld1q_s16(input + 8); - int32x2_t sum; + int32_t sum; input += stride; for (r = 1; r < 16; ++r) { @@ -77,9 +66,9 @@ void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, right = vaddq_s16(right, b); } - sum = vadd_s32(horizontal_add_int16x8(left), horizontal_add_int16x8(right)); + sum = horizontal_add_int16x8(left) + horizontal_add_int16x8(right); - output[0] = get_lane(vshr_n_s32(sum, 1)); + output[0] = (tran_low_t)(sum >> 1); output[1] = 0; } @@ -90,7 +79,7 @@ void vpx_fdct32x32_1_neon(const int16_t *input, 
tran_low_t *output, int16x8_t a1 = vld1q_s16(input + 8); int16x8_t a2 = vld1q_s16(input + 16); int16x8_t a3 = vld1q_s16(input + 24); - int32x2_t sum; + int32_t sum; input += stride; for (r = 1; r < 32; ++r) { @@ -105,9 +94,10 @@ void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, a3 = vaddq_s16(a3, b3); } - sum = vadd_s32(horizontal_add_int16x8(a0), horizontal_add_int16x8(a1)); - sum = vadd_s32(sum, horizontal_add_int16x8(a2)); - sum = vadd_s32(sum, horizontal_add_int16x8(a3)); - output[0] = get_lane(vshr_n_s32(sum, 3)); + sum = horizontal_add_int16x8(a0); + sum += horizontal_add_int16x8(a1); + sum += horizontal_add_int16x8(a2); + sum += horizontal_add_int16x8(a3); + output[0] = (tran_low_t)(sum >> 3); output[1] = 0; } diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index c4a49e366d..59567bda5b 100644 --- a/vpx_dsp/arm/sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -23,7 +23,7 @@ uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8)); abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); + return horizontal_add_uint16x8(abs); } uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, @@ -35,7 +35,7 @@ uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8); uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(avg)); abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg)); - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); + return horizontal_add_uint16x8(abs); } uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, @@ -51,7 +51,7 @@ uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); } - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); + return horizontal_add_uint16x8(abs); } uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, @@ -71,7 +71,7 @@ uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg)); } - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); + return horizontal_add_uint16x8(abs); } static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride, @@ -114,7 +114,7 @@ static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride, uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ const uint16x8_t abs = sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + return horizontal_add_uint16x8(abs); \ } \ \ uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ @@ -122,7 +122,7 @@ static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride, const uint8_t *second_pred) { \ const uint16x8_t abs = \ sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + return horizontal_add_uint16x8(abs); \ } sad8xN(4); @@ -172,7 +172,7 @@ static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { \ const uint16x8_t abs = \ sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + return horizontal_add_uint16x8(abs); \ } \ \ 
uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ @@ -180,7 +180,7 @@ static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride, const uint8_t *second_pred) { \ const uint16x8_t abs = \ sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + return horizontal_add_uint16x8(abs); \ } sad16xN(8); @@ -240,7 +240,7 @@ static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { \ const uint16x8_t abs = \ sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + return horizontal_add_uint16x8(abs); \ } \ \ uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ @@ -248,7 +248,7 @@ static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride, const uint8_t *second_pred) { \ const uint16x8_t abs = \ sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + return horizontal_add_uint16x8(abs); \ } sad32xN(16); @@ -338,7 +338,7 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { \ const uint32x4_t abs = \ sad64x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \ + return horizontal_add_uint32x4(abs); \ } \ \ uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ @@ -346,7 +346,7 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, const uint8_t *second_pred) { \ const uint32x4_t abs = \ sad64x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \ + return horizontal_add_uint32x4(abs); \ } sad64xN(32); diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index 9e6833aad3..630296237b 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -16,23 +16,38 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" -static INLINE int32x2_t horizontal_add_int16x8(const int16x8_t a) { +static INLINE int32_t horizontal_add_int16x8(const int16x8_t a) { +#if defined(__aarch64__) + return vaddlvq_s16(a); +#else const int32x4_t b = vpaddlq_s16(a); const int64x2_t c = vpaddlq_s32(b); - return vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)), - vreinterpret_s32_s64(vget_high_s64(c))); + const int32x2_t d = vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)), + vreinterpret_s32_s64(vget_high_s64(c))); + return vget_lane_s32(d, 0); +#endif } -static INLINE uint32x2_t horizontal_add_uint16x8(const uint16x8_t a) { +static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) { +#if defined(__aarch64__) + return vaddlvq_u16(a); +#else const uint32x4_t b = vpaddlq_u16(a); const uint64x2_t c = vpaddlq_u32(b); - return vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)), - vreinterpret_u32_u64(vget_high_u64(c))); + const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)), + vreinterpret_u32_u64(vget_high_u64(c))); + return vget_lane_u32(d, 0); +#endif } -static INLINE uint32x2_t horizontal_add_uint32x4(const uint32x4_t a) { +static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) { +#if defined(__aarch64__) + return vaddvq_u32(a); +#else const uint64x2_t b = vpaddlq_u32(a); - return vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); + const uint32x2_t c = 
vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +#endif } #endif // VPX_VPX_DSP_ARM_SUM_NEON_H_ diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index 77b1015b74..e08f31f2c5 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -66,10 +66,9 @@ static void variance_neon_w4x4(const uint8_t *src_ptr, int src_stride, ref_ptr += 4 * ref_stride; } - *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0); - *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32( - vaddq_s32(sse_lo_s32, sse_hi_s32))), - 0); + *sum = horizontal_add_int16x8(sum_s16); + *sse = horizontal_add_uint32x4( + vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32))); } // Process a block of any size where the width is divisible by 16. @@ -115,10 +114,9 @@ static void variance_neon_w16(const uint8_t *src_ptr, int src_stride, ref_ptr += ref_stride; } - *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0); - *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32( - vaddq_s32(sse_lo_s32, sse_hi_s32))), - 0); + *sum = horizontal_add_int16x8(sum_s16); + *sse = horizontal_add_uint32x4( + vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32))); } // Process a block of width 8 two rows at a time. @@ -157,10 +155,9 @@ static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, i += 2; } while (i < h); - *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0); - *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32( - vaddq_s32(sse_lo_s32, sse_hi_s32))), - 0); + *sum = horizontal_add_int16x8(sum_s16); + *sse = horizontal_add_uint32x4( + vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32))); } void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, From 2db85c269bc5479e48ea7cd4fde85236ee0bc347 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Mon, 10 May 2021 12:22:03 +0100 Subject: [PATCH 095/926] Use ABD and UDOT to implement Neon sad_4d functions Implementing sad16_neon using ABD, UDOT instead of ABAL, ABAL2 saves a cycle and removes resource contention for a single SIMD pipe on modern out-of-order Arm CPUs. The UDOT accumulation into 32-bit elements also allows for a faster reduction at the end of each SAD function. The existing implementation is retained for CPUs that do not implement the Armv8.4-A UDOT instruction, and CPUs executing in AArch32 mode. 
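For illustration, a minimal sketch of the ABD+UDOT idiom this patch applies
(a hypothetical standalone helper with made-up names, not code from the
patch itself):

  #include <arm_neon.h>

  /* Assumes __ARM_FEATURE_DOTPROD. Accumulates the SAD of one 16-byte
   * row into four 32-bit lanes: acc += sum_i |src[i] - ref[i]|. */
  static inline uint32x4_t row_sad16(uint32x4_t acc, const uint8_t *src,
                                     const uint8_t *ref) {
    const uint8x16_t abs_diff = vabdq_u8(vld1q_u8(src), vld1q_u8(ref));
    /* A dot product against a vector of ones sums the 16 byte-wide
     * absolute differences directly into the 32-bit accumulators. */
    return vdotq_u32(acc, abs_diff, vdupq_n_u8(1));
  }

Because the accumulator lanes are 32 bits wide, no intermediate 16-bit
widening steps are needed before the final horizontal add.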
Bug: b/181236880 Change-Id: Ibd0da46e86751d2f808c7b1e424f82b046a1aa6f --- vpx_dsp/arm/sad4d_neon.c | 214 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 214 insertions(+) diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 256bc41ce7..5c7a0fcaf0 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -98,6 +98,8 @@ static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/, vst1q_u32(res, r); } +#if defined(__arm__) || !defined(__ARM_FEATURE_DOTPROD) + // Can handle 1024 pixels' sad sum (such as 32x32) static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/, uint32_t *const res) { @@ -191,6 +193,8 @@ static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/, #endif } +#endif + static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t *res, const int height) { @@ -233,6 +237,41 @@ void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, //////////////////////////////////////////////////////////////////////////////// +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ + (__ARM_FEATURE_DOTPROD == 1) + +static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, + uint32x4_t *const sum) { + const uint8x16_t r = vld1q_u8(ref_ptr); + const uint8x16_t diff = vabdq_u8(src_ptr, r); + *sum = vdotq_u32(*sum, diff, vdupq_n_u8(1)); +} + +static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t *res, const int height) { + int i; + uint32x4_t r0, r1; + const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], + ref_array[3] }; + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + for (i = 0; i < height; ++i) { + const uint8x16_t s = vld1q_u8(src_ptr + i * src_stride); + sad16_neon(ref_loop[0] + i * ref_stride, s, &sum[0]); + sad16_neon(ref_loop[1] + i * ref_stride, s, &sum[1]); + sad16_neon(ref_loop[2] + i * ref_stride, s, &sum[2]); + sad16_neon(ref_loop[3] + i * ref_stride, s, &sum[3]); + } + + r0 = vpaddq_u32(sum[0], sum[1]); + r1 = vpaddq_u32(sum[2], sum[3]); + vst1q_u32(res, vpaddq_u32(r0, r1)); +} + +#else + static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, uint16x8_t *const sum) { const uint8x16_t r = vld1q_u8(ref_ptr); @@ -266,6 +305,8 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, sad_512_pel_final_neon(sum, res); } +#endif + void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { @@ -286,6 +327,67 @@ void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, //////////////////////////////////////////////////////////////////////////////// +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ + (__ARM_FEATURE_DOTPROD == 1) + +static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t *res, const int height) { + int i; + uint32x4_t r0, r1; + const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], + ref_array[3] }; + + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + for (i = 0; i < height; ++i) { + uint8x16_t s; + + s = vld1q_u8(src_ptr + 0 * 16); + sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); + 
sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); + + s = vld1q_u8(src_ptr + 1 * 16); + sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); + + src_ptr += src_stride; + ref_loop[0] += ref_stride; + ref_loop[1] += ref_stride; + ref_loop[2] += ref_stride; + ref_loop[3] += ref_stride; + } + + r0 = vpaddq_u32(sum[0], sum[1]); + r1 = vpaddq_u32(sum[2], sum[3]); + vst1q_u32(res, vpaddq_u32(r0, r1)); +} + +void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t *res) { + sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16); +} + +void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t *res) { + sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 32); +} + +void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t *res) { + sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 64); +} + +#else + static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, const int height, uint16x8_t *const sum) { @@ -342,8 +444,118 @@ void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, sad_2048_pel_final_neon(sum, res); } +#endif + //////////////////////////////////////////////////////////////////////////////// +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ + (__ARM_FEATURE_DOTPROD == 1) + +void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t *res) { + int i; + uint32x4_t r0, r1; + const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], + ref_array[3] }; + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + for (i = 0; i < 32; ++i) { + uint8x16_t s; + + s = vld1q_u8(src_ptr + 0 * 16); + sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); + + s = vld1q_u8(src_ptr + 1 * 16); + sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); + + s = vld1q_u8(src_ptr + 2 * 16); + sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]); + + s = vld1q_u8(src_ptr + 3 * 16); + sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]); + + src_ptr += src_stride; + ref_loop[0] += ref_stride; + ref_loop[1] += ref_stride; + ref_loop[2] += ref_stride; + ref_loop[3] += ref_stride; + } + + r0 = vpaddq_u32(sum[0], sum[1]); + r1 = vpaddq_u32(sum[2], sum[3]); + vst1q_u32(res, vpaddq_u32(r0, r1)); +} + +void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t *res) { + int i; + uint32x4_t r0, r1, r2, r3; + const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], + ref_array[3] }; + uint32x4_t sum[8] = { vdupq_n_u32(0), vdupq_n_u32(0), 
vdupq_n_u32(0), + vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0), vdupq_n_u32(0) }; + + for (i = 0; i < 64; ++i) { + uint8x16_t s; + + s = vld1q_u8(src_ptr + 0 * 16); + sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]); + sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]); + sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]); + + s = vld1q_u8(src_ptr + 1 * 16); + sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]); + sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]); + sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]); + + s = vld1q_u8(src_ptr + 2 * 16); + sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]); + sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]); + sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]); + sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]); + + s = vld1q_u8(src_ptr + 3 * 16); + sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]); + sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]); + sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]); + sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]); + + src_ptr += src_stride; + ref_loop[0] += ref_stride; + ref_loop[1] += ref_stride; + ref_loop[2] += ref_stride; + ref_loop[3] += ref_stride; + } + + r0 = vpaddq_u32(sum[0], sum[1]); + r1 = vpaddq_u32(sum[2], sum[3]); + r2 = vpaddq_u32(sum[4], sum[5]); + r3 = vpaddq_u32(sum[6], sum[7]); + r0 = vpaddq_u32(r0, r1); + r1 = vpaddq_u32(r2, r3); + vst1q_u32(res, vpaddq_u32(r0, r1)); +} + +#else + void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { @@ -436,3 +648,5 @@ void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, sad_4096_pel_final_neon(sum, res); } + +#endif From c8b0432505d32820af0c42a94b219aa83eed5db9 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Tue, 11 May 2021 13:17:44 +0100 Subject: [PATCH 096/926] Implement Neon variance functions using UDOT instruction Accelerate Neon variance functions by implementing the sum of squares calculation using the Armv8.4-A UDOT instruction instead of 4 MLAs. The previous implementation is retained for use on CPUs that do not implement the Armv8.4-A dot product instructions. 
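A hedged sketch of the core idea (an illustrative helper, not code from the
patch): since |a - b|^2 equals (a - b)^2, the sum of squared differences can
be formed with one ABD followed by a UDOT of the absolute difference with
itself:

  #include <arm_neon.h>

  /* Assumes __ARM_FEATURE_DOTPROD. Accumulates the squared error of one
   * 16-byte row into four 32-bit lanes. */
  static inline uint32x4_t row_sse16(uint32x4_t acc, const uint8_t *src,
                                     const uint8_t *ref) {
    const uint8x16_t abs_diff = vabdq_u8(vld1q_u8(src), vld1q_u8(ref));
    return vdotq_u32(acc, abs_diff, abs_diff); /* |d| * |d| == d * d */
  }

This replaces the widening subtract plus four MLAs with two arithmetic
instructions per 16 pixels and keeps the accumulation in 32-bit lanes.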
Bug: b/181236880 Change-Id: I9ab3d52634278b9b6f0011f39390a1195210bc75 --- vpx_dsp/arm/sum_neon.h | 27 +++++++++++ vpx_dsp/arm/variance_neon.c | 96 +++++++++++++++++++++++++++++++++++++ 2 files changed, 123 insertions(+) diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index 630296237b..9a7c424e8e 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -40,6 +40,33 @@ static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) { #endif } +static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) { +#if defined(__aarch64__) + return vaddv_s32(a); +#else + return vget_lane_s32(a, 0) + vget_lane_s32(a, 1); +#endif +} + +static INLINE uint32_t horizontal_add_uint32x2(const uint32x2_t a) { +#if defined(__aarch64__) + return vaddv_u32(a); +#else + return vget_lane_u32(a, 0) + vget_lane_u32(a, 1); +#endif +} + +static INLINE int32_t horizontal_add_int32x4(const int32x4_t a) { +#if defined(__aarch64__) + return vaddvq_s32(a); +#else + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +#endif +} + static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) { #if defined(__aarch64__) return vaddvq_u32(a); diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index e08f31f2c5..19aaac7b69 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -19,6 +19,100 @@ #include "vpx_dsp/arm/sum_neon.h" #include "vpx_ports/mem.h" +#if defined(__ARM_FEATURE_DOTPROD) && (__ARM_FEATURE_DOTPROD == 1) + +// Process a block of width 4 four rows at a time. +static void variance_neon_w4x4(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, int h, + uint32_t *sse, int *sum) { + int i; + uint32x4_t sum_a = vdupq_n_u32(0); + uint32x4_t sum_b = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + for (i = 0; i < h; i += 4) { + const uint8x16_t a = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t b = load_unaligned_u8q(ref_ptr, ref_stride); + + const uint8x16_t abs_diff = vabdq_u8(a, b); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + sum_a = vdotq_u32(sum_a, a, vdupq_n_u8(1)); + sum_b = vdotq_u32(sum_b, b, vdupq_n_u8(1)); + + src_ptr += 4 * src_stride; + ref_ptr += 4 * ref_stride; + } + + *sum = horizontal_add_int32x4(vreinterpretq_s32_u32(vsubq_u32(sum_a, sum_b))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +// Process a block of any size where the width is divisible by 16. +static void variance_neon_w16(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, int w, + int h, uint32_t *sse, int *sum) { + int i, j; + uint32x4_t sum_a = vdupq_n_u32(0); + uint32x4_t sum_b = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 16) { + const uint8x16_t a = vld1q_u8(src_ptr + j); + const uint8x16_t b = vld1q_u8(ref_ptr + j); + + const uint8x16_t abs_diff = vabdq_u8(a, b); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + sum_a = vdotq_u32(sum_a, a, vdupq_n_u8(1)); + sum_b = vdotq_u32(sum_b, b, vdupq_n_u8(1)); + } + src_ptr += src_stride; + ref_ptr += ref_stride; + } + + *sum = horizontal_add_int32x4(vreinterpretq_s32_u32(vsubq_u32(sum_a, sum_b))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +// Process a block of width 8 two rows at a time. 
+static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, int h, + uint32_t *sse, int *sum) { + int i = 0; + uint32x2_t sum_a = vdup_n_u32(0); + uint32x2_t sum_b = vdup_n_u32(0); + uint32x2_t sse_lo_u32 = vdup_n_u32(0); + uint32x2_t sse_hi_u32 = vdup_n_u32(0); + + do { + const uint8x8_t a_0 = vld1_u8(src_ptr); + const uint8x8_t a_1 = vld1_u8(src_ptr + src_stride); + const uint8x8_t b_0 = vld1_u8(ref_ptr); + const uint8x8_t b_1 = vld1_u8(ref_ptr + ref_stride); + + const uint8x8_t abs_diff_0 = vabd_u8(a_0, b_0); + const uint8x8_t abs_diff_1 = vabd_u8(a_1, b_1); + sse_lo_u32 = vdot_u32(sse_lo_u32, abs_diff_0, abs_diff_0); + sse_hi_u32 = vdot_u32(sse_hi_u32, abs_diff_1, abs_diff_1); + + sum_a = vdot_u32(sum_a, a_0, vdup_n_u8(1)); + sum_b = vdot_u32(sum_b, b_0, vdup_n_u8(1)); + sum_a = vdot_u32(sum_a, a_1, vdup_n_u8(1)); + sum_b = vdot_u32(sum_b, b_1, vdup_n_u8(1)); + + src_ptr += src_stride + src_stride; + ref_ptr += ref_stride + ref_stride; + i += 2; + } while (i < h); + + *sum = horizontal_add_int32x2(vreinterpret_s32_u32(vsub_u32(sum_a, sum_b))); + *sse = horizontal_add_uint32x2(vadd_u32(sse_lo_u32, sse_hi_u32)); +} + +#else + // The variance helper functions use int16_t for sum. 8 values are accumulated // and then added (at which point they expand up to int32_t). To avoid overflow, // there can be no more than 32767 / 255 ~= 128 values accumulated in each @@ -160,6 +254,8 @@ static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32))); } +#endif + void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum) { From 231aa6ae32fca53efc45ffd39e14650346fcb030 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Tue, 20 Apr 2021 12:03:56 +0100 Subject: [PATCH 097/926] Implement vertical convolution using Neon SDOT instruction Add an alternative AArch64 implementation of vpx_convolve8_vert_neon for targets that implement the Armv8.4-A SDOT (signed dot product) instruction. The existing MLA-based implementation of vpx_convolve8_vert_neon is retained and used on target CPUs that do not implement the SDOT instruction (or CPUs executing in AArch32 mode). The availability of the SDOT instruction is indicated by the feature macro __ARM_FEATURE_DOTPROD. 
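One step worth spelling out (a sketch of the range-clamp trick visible in
the diff below; the algebra is standard): SDOT multiplies signed 8-bit
lanes, but pixel samples are unsigned values in [0, 255]. Each sample x is
therefore re-centred as (x - 128), which fits in int8, and the bias is
folded back in through a per-filter constant, since

  sum_k f[k] * x[k] = sum_k f[k] * (x[k] - 128) + 128 * sum_k f[k]

The constant term 128 * sum_k f[k] is computed once per filter (the
'correction' value built from vmulq_n_s16(filter, 128) and vaddvq_s16 in
the code below) and is used to seed the SDOT accumulator, so the clamp
costs no extra work in the inner loop.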
Bug: b/181236880 Change-Id: Iebb8c77aba1d45b553b5112f3d87071fef3076f0 --- vpx_dsp/arm/vpx_convolve8_neon.c | 184 +++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index a86adb4e72..7394968169 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -704,6 +704,188 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ + (__ARM_FEATURE_DOTPROD == 1) + +void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x16_t range_limit = vdupq_n_u8(128); + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, d01, d23; + uint8x16_t s0, s1, s2, s3; + int32x4_t d0, d1, d2, d3; + + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7); + src += 8 * src_stride; + + do { + load_u8_8x4(src, src_stride, &t8, &t9, &t10, &t11); + transpose_u8_8x4(&t8, &t9, &t10, &t11); + s0 = vcombine_u8(t0, t8); + s1 = vcombine_u8(t1, t9); + s2 = vcombine_u8(t2, t10); + s3 = vcombine_u8(t3, t11); + + d0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl); + d1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl); + d2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl); + d3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl); + + d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); + d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); + transpose_u8_4x4(&d01, &d23); + + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); + dst += dst_stride; + + t0 = vext_u8(t0, t8, 4); + t1 = vext_u8(t1, t9, 4); + t2 = vext_u8(t2, t10, 4); + t3 = vext_u8(t3, t11, 4); + src += 4 * src_stride; + h -= 4; + } while (h > 0); + } else if (h == 4) { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, d04, d15, d26, d37; + uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7; + int32x4_t d0, d1, d2, d3, d4, d5, d6, d7; + const uint8_t *s; + uint8_t *d; + + do { + s = src; + d = dst; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s += 8 * src_stride; + t8 = vld1_u8(s); + s += src_stride; + t9 = vld1_u8(s); + s += src_stride; + t10 = vld1_u8(s); + s += src_stride; + + transpose_u8_8x16(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, + vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0), + vdup_n_u8(0), &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + d0 = convolve8_4_dot(s0, filters, 
correction, range_limit, permute_tbl); + d1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl); + d2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl); + d3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl); + d4 = convolve8_4_dot(s4, filters, correction, range_limit, permute_tbl); + d5 = convolve8_4_dot(s5, filters, correction, range_limit, permute_tbl); + d6 = convolve8_4_dot(s6, filters, correction, range_limit, permute_tbl); + d7 = convolve8_4_dot(s7, filters, correction, range_limit, permute_tbl); + + d04 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d4)), 7); + d15 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d1), vqmovn_s32(d5)), 7); + d26 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d6)), 7); + d37 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d3), vqmovn_s32(d7)), 7); + + transpose_u8_8x4(&d04, &d15, &d26, &d37); + + vst1_u8(d, d04); + d += dst_stride; + vst1_u8(d, d15); + d += dst_stride; + vst1_u8(d, d26); + d += dst_stride; + vst1_u8(d, d37); + d += dst_stride; + + src += 8; + dst += 8; + w -= 8; + } while (w > 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, + t15, d0, d1, d2, d3, d4, d5, d6, d7; + uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s += 8 * src_stride; + + do { + load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14, + &t15); + transpose_u8_8x8(&t8, &t9, &t10, &t11, &t12, &t13, &t14, &t15); + s0 = vcombine_u8(t0, t8); + s1 = vcombine_u8(t1, t9); + s2 = vcombine_u8(t2, t10); + s3 = vcombine_u8(t3, t11); + s4 = vcombine_u8(t4, t12); + s5 = vcombine_u8(t5, t13); + s6 = vcombine_u8(t6, t14); + s7 = vcombine_u8(t7, t15); + + d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl); + d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl); + d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl); + d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl); + d4 = convolve8_8_dot(s4, filters, correction, range_limit, permute_tbl); + d5 = convolve8_8_dot(s5, filters, correction, range_limit, permute_tbl); + d6 = convolve8_8_dot(s6, filters, correction, range_limit, permute_tbl); + d7 = convolve8_8_dot(s7, filters, correction, range_limit, permute_tbl); + + transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + t0 = t8; + t1 = t9; + t2 = t10; + t3 = t11; + t4 = t12; + t5 = t13; + t6 = t14; + t7 = t15; + s += 8 * src_stride; + d += 8 * dst_stride; + height -= 8; + } while (height > 0); + src += 8; + dst += 8; + w -= 8; + } while (w > 0); + } +} + +#else + void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -876,6 +1058,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, } } +#endif + void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, From 4808d831dbc4e9ff83fa0efe11207bc135c6d6f5 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 12 May 2021 16:05:56 +0100 Subject: [PATCH 098/926] Optimize remaining mse and sse 
functions in variance_neon.c Implement sum of squared difference calculations in vpx_mse16x16_neon and vpx_get4x4sse_cs_neon using the ABD and UDOT instructions - instead of widening subtracts followed by a sequence of MLAs. The existing implementation is retained for use on CPUs that do not implement the Armv8.4-A UDOT instruction. This commit also updates the variable names used in the existing implementations to be more descriptive. Bug: b/181236880 Change-Id: Id4ad8ea7c808af1ac9bb5f1b63327ab487e4b1c7 --- vpx_dsp/arm/variance_neon.c | 218 ++++++++++++++++++++++-------------- 1 file changed, 133 insertions(+), 85 deletions(-) diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index 19aaac7b69..410ce7d9e6 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -357,117 +357,165 @@ unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int src_stride, return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12); } +#if defined(__ARM_FEATURE_DOTPROD) && (__ARM_FEATURE_DOTPROD == 1) + unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse) { int i; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - int64x1_t d0s64; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - int32x4_t q7s32, q8s32, q9s32, q10s32; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int64x2_t q1s64; - - q7s32 = vdupq_n_s32(0); - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 8; i++) { // mse16x16_neon_loop - q0u8 = vld1q_u8(src_ptr); + uint8x16_t a[2], b[2], abs_diff[2]; + uint32x4_t sse_vec[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + for (i = 0; i < 8; i++) { + a[0] = vld1q_u8(src_ptr); src_ptr += src_stride; - q1u8 = vld1q_u8(src_ptr); + a[1] = vld1q_u8(src_ptr); src_ptr += src_stride; - q2u8 = vld1q_u8(ref_ptr); + b[0] = vld1q_u8(ref_ptr); ref_ptr += ref_stride; - q3u8 = vld1q_u8(ref_ptr); + b[1] = vld1q_u8(ref_ptr); ref_ptr += ref_stride; - q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); - q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); - q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); - q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q7s32 = vmlal_s16(q7s32, d22s16, d22s16); - q8s32 = vmlal_s16(q8s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q7s32 = vmlal_s16(q7s32, d26s16, d26s16); - q8s32 = vmlal_s16(q8s32, d27s16, d27s16); - - d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - } - - q7s32 = vaddq_s32(q7s32, q8s32); - q9s32 = vaddq_s32(q9s32, q10s32); - q10s32 = vaddq_s32(q7s32, q9s32); + abs_diff[0] = vabdq_u8(a[0], b[0]); + abs_diff[1] = vabdq_u8(a[1], b[1]); - q1s64 = vpaddlq_s32(q10s32); - d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + sse_vec[0] = vdotq_u32(sse_vec[0], abs_diff[0], abs_diff[0]); + sse_vec[1] = vdotq_u32(sse_vec[1], abs_diff[1], abs_diff[1]); + } - vst1_lane_u32((uint32_t *)sse, 
vreinterpret_u32_s64(d0s64), 0); - return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); + *sse = horizontal_add_uint32x4(vaddq_u32(sse_vec[0], sse_vec[1])); + return horizontal_add_uint32x4(vaddq_u32(sse_vec[0], sse_vec[1])); } unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride) { - int16x4_t d22s16, d24s16, d26s16, d28s16; - int64x1_t d0s64; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - int32x4_t q7s32, q8s32, q9s32, q10s32; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int64x2_t q1s64; - - d0u8 = vld1_u8(src_ptr); - src_ptr += src_stride; - d4u8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - d1u8 = vld1_u8(src_ptr); + uint8x8_t a[4], b[4], abs_diff[4]; + uint32x2_t sse = vdup_n_u32(0); + + a[0] = vld1_u8(src_ptr); src_ptr += src_stride; - d5u8 = vld1_u8(ref_ptr); + b[0] = vld1_u8(ref_ptr); ref_ptr += ref_stride; - d2u8 = vld1_u8(src_ptr); + a[1] = vld1_u8(src_ptr); src_ptr += src_stride; - d6u8 = vld1_u8(ref_ptr); + b[1] = vld1_u8(ref_ptr); ref_ptr += ref_stride; - d3u8 = vld1_u8(src_ptr); + a[2] = vld1_u8(src_ptr); src_ptr += src_stride; - d7u8 = vld1_u8(ref_ptr); + b[2] = vld1_u8(ref_ptr); ref_ptr += ref_stride; + a[3] = vld1_u8(src_ptr); + b[3] = vld1_u8(ref_ptr); + + abs_diff[0] = vabd_u8(a[0], b[0]); + abs_diff[1] = vabd_u8(a[1], b[1]); + abs_diff[2] = vabd_u8(a[2], b[2]); + abs_diff[3] = vabd_u8(a[3], b[3]); + + sse = vdot_u32(sse, abs_diff[0], abs_diff[0]); + sse = vdot_u32(sse, abs_diff[1], abs_diff[1]); + sse = vdot_u32(sse, abs_diff[2], abs_diff[2]); + sse = vdot_u32(sse, abs_diff[3], abs_diff[3]); + + return vget_lane_u32(sse, 0); +} + +#else + +unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride, + const unsigned char *ref_ptr, int ref_stride, + unsigned int *sse) { + int i; + uint8x16_t a[2], b[2]; + int16x4_t diff_lo[4], diff_hi[4]; + uint16x8_t diff[4]; + int32x4_t sse_vec[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), + vdupq_n_s32(0) }; + + for (i = 0; i < 8; i++) { + a[0] = vld1q_u8(src_ptr); + src_ptr += src_stride; + a[1] = vld1q_u8(src_ptr); + src_ptr += src_stride; + b[0] = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + b[1] = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + + diff[0] = vsubl_u8(vget_low_u8(a[0]), vget_low_u8(b[0])); + diff[1] = vsubl_u8(vget_high_u8(a[0]), vget_high_u8(b[0])); + diff[2] = vsubl_u8(vget_low_u8(a[1]), vget_low_u8(b[1])); + diff[3] = vsubl_u8(vget_high_u8(a[1]), vget_high_u8(b[1])); + + diff_lo[0] = vreinterpret_s16_u16(vget_low_u16(diff[0])); + diff_lo[1] = vreinterpret_s16_u16(vget_low_u16(diff[1])); + sse_vec[0] = vmlal_s16(sse_vec[0], diff_lo[0], diff_lo[0]); + sse_vec[1] = vmlal_s16(sse_vec[1], diff_lo[1], diff_lo[1]); + + diff_lo[2] = vreinterpret_s16_u16(vget_low_u16(diff[2])); + diff_lo[3] = vreinterpret_s16_u16(vget_low_u16(diff[3])); + sse_vec[2] = vmlal_s16(sse_vec[2], diff_lo[2], diff_lo[2]); + sse_vec[3] = vmlal_s16(sse_vec[3], diff_lo[3], diff_lo[3]); + + diff_hi[0] = vreinterpret_s16_u16(vget_high_u16(diff[0])); + diff_hi[1] = vreinterpret_s16_u16(vget_high_u16(diff[1])); + sse_vec[0] = vmlal_s16(sse_vec[0], diff_hi[0], diff_hi[0]); + sse_vec[1] = vmlal_s16(sse_vec[1], diff_hi[1], diff_hi[1]); + + diff_hi[2] = vreinterpret_s16_u16(vget_high_u16(diff[2])); + diff_hi[3] = vreinterpret_s16_u16(vget_high_u16(diff[3])); + sse_vec[2] = vmlal_s16(sse_vec[2], diff_hi[2], diff_hi[2]); + sse_vec[3] = vmlal_s16(sse_vec[3], diff_hi[3], diff_hi[3]); + } + + sse_vec[0] = vaddq_s32(sse_vec[0], sse_vec[1]); + 
sse_vec[2] = vaddq_s32(sse_vec[2], sse_vec[3]); + sse_vec[0] = vaddq_s32(sse_vec[0], sse_vec[2]); - q11u16 = vsubl_u8(d0u8, d4u8); - q12u16 = vsubl_u8(d1u8, d5u8); - q13u16 = vsubl_u8(d2u8, d6u8); - q14u16 = vsubl_u8(d3u8, d7u8); + *sse = horizontal_add_uint32x4(vreinterpretq_u32_s32(sse_vec[0])); + return horizontal_add_uint32x4(vreinterpretq_u32_s32(sse_vec[0])); +} + +unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, + const unsigned char *ref_ptr, + int ref_stride) { + uint8x8_t a[4], b[4]; + int16x4_t diff_lo[4]; + uint16x8_t diff[4]; + int32x4_t sse; - d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16)); - d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16)); - d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16)); - d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16)); + a[0] = vld1_u8(src_ptr); + src_ptr += src_stride; + b[0] = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + a[1] = vld1_u8(src_ptr); + src_ptr += src_stride; + b[1] = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + a[2] = vld1_u8(src_ptr); + src_ptr += src_stride; + b[2] = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + a[3] = vld1_u8(src_ptr); + b[3] = vld1_u8(ref_ptr); - q7s32 = vmull_s16(d22s16, d22s16); - q8s32 = vmull_s16(d24s16, d24s16); - q9s32 = vmull_s16(d26s16, d26s16); - q10s32 = vmull_s16(d28s16, d28s16); + diff[0] = vsubl_u8(a[0], b[0]); + diff[1] = vsubl_u8(a[1], b[1]); + diff[2] = vsubl_u8(a[2], b[2]); + diff[3] = vsubl_u8(a[3], b[3]); - q7s32 = vaddq_s32(q7s32, q8s32); - q9s32 = vaddq_s32(q9s32, q10s32); - q9s32 = vaddq_s32(q7s32, q9s32); + diff_lo[0] = vget_low_s16(vreinterpretq_s16_u16(diff[0])); + diff_lo[1] = vget_low_s16(vreinterpretq_s16_u16(diff[1])); + diff_lo[2] = vget_low_s16(vreinterpretq_s16_u16(diff[2])); + diff_lo[3] = vget_low_s16(vreinterpretq_s16_u16(diff[3])); - q1s64 = vpaddlq_s32(q9s32); - d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + sse = vmull_s16(diff_lo[0], diff_lo[0]); + sse = vmlal_s16(sse, diff_lo[1], diff_lo[1]); + sse = vmlal_s16(sse, diff_lo[2], diff_lo[2]); + sse = vmlal_s16(sse, diff_lo[3], diff_lo[3]); - return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); + return horizontal_add_uint32x4(vreinterpretq_u32_s32(sse)); } + +#endif From 66c1ff6850fd53bcf5c17247569bea1d700d6247 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Mon, 17 May 2021 10:53:07 +0100 Subject: [PATCH 099/926] Implement vpx_convolve8_avg_horiz_neon using SDOT instruction Add an alternative AArch64 implementation of vpx_convolve8_avg_horiz_neon for targets that implement the Armv8.4-A SDOT (signed dot product) instruction. The existing MLA-based implementation of vpx_convolve8_avg_horiz_neon is retained and used on target CPUs that do not implement the SDOT instruction (or CPUs executing in AArch32 mode). The availability of the SDOT instruction is indicated by the feature macro __ARM_FEATURE_DOTPROD. 
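For clarity, a minimal sketch of the averaging step (an illustrative
helper, not code from the patch): the only difference from the plain
horizontal path is that each convolved result is combined with the pixels
already in dst using a rounding halving add before the store:

  #include <arm_neon.h>

  /* 'd' holds 8 convolved output pixels for one row of dst. */
  static inline void store_avg8(uint8_t *dst, uint8x8_t d) {
    const uint8x8_t dd = vld1_u8(dst); /* existing destination pixels */
    vst1_u8(dst, vrhadd_u8(d, dd));    /* (d + dd + 1) >> 1 per lane */
  }

This matches the vrhadd_u8 calls in the diff below; the dot-product
convolution core itself is unchanged from vpx_convolve8_horiz_neon.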
Bug: b/181236880 Change-Id: Ib435107c47c485f325248da87ba5618d68b0c8ed --- vpx_dsp/arm/vpx_convolve8_neon.c | 115 ++++++++++++++++++++++++++++++- 1 file changed, 113 insertions(+), 2 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index 7394968169..acb128c855 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -157,6 +157,117 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } +void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x16_t range_limit = vdupq_n_u8(128); + uint8x16_t s0, s1, s2, s3; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int32x4_t t0, t1, t2, t3; + int16x8_t t01, t23; + uint8x8_t d01, d23, dd01, dd23; + dd01 = vdup_n_u8(0); + dd23 = vdup_n_u8(0); + + s0 = vld1q_u8(src); + src += src_stride; + s1 = vld1q_u8(src); + src += src_stride; + s2 = vld1q_u8(src); + src += src_stride; + s3 = vld1q_u8(src); + src += src_stride; + + t0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl); + t1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl); + t2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl); + t3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl); + + t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); + t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); + d01 = vqrshrun_n_s16(t01, 7); + d23 = vqrshrun_n_s16(t23, 7); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); + dst += dst_stride; + h -= 4; + } while (h > 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + + do { + width = w; + s = src; + d = dst; + do { + s0 = vld1q_u8(s + 0 * src_stride); + s1 = vld1q_u8(s + 1 * src_stride); + s2 = vld1q_u8(s + 2 * src_stride); + s3 = vld1q_u8(s + 3 * src_stride); + + d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl); + d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl); + d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl); + d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl); + + dd0 = vld1_u8(d + 0 * dst_stride); + dd1 = vld1_u8(d + 1 * dst_stride); + dd2 = vld1_u8(d + 2 * dst_stride); + dd3 = vld1_u8(d + 3 * dst_stride); + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + vst1_u8(d + 0 * dst_stride, d0); + vst1_u8(d + 
1 * dst_stride, d1); + vst1_u8(d + 2 * dst_stride, d2); + vst1_u8(d + 3 * dst_stride, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } +} + #else void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, @@ -412,8 +523,6 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } -#endif - void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -704,6 +813,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } +#endif + #if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ (__ARM_FEATURE_DOTPROD == 1) From 10823f54681747b9f64deb3002531c95cc67d17f Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Sat, 22 May 2021 22:07:25 +0100 Subject: [PATCH 100/926] Merge transpose and permute in Neon SDOT vertical convolution The original dot-product implementation of vpx_convolve8_vert_neon used a separate transpose before and after the convolution operation. This patch merges the first transpose with the TBL permute (necessary before using SDOT to compute the convolution) to significantly reduce the amount of data re-arrangement. This new approach also allows for more effective data re-use between loop iterations. Co-authored by: James Greenhalgh Bug: b/181236880 Change-Id: I87fe4dadd312c3ad6216943b71a5410ddf4a1b5b --- vpx_dsp/arm/vpx_convolve8_neon.c | 385 ++++++++++++++++++++----------- vpx_dsp/arm/vpx_convolve8_neon.h | 38 +++ 2 files changed, 282 insertions(+), 141 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index acb128c855..25a59a2da9 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -31,37 +31,72 @@ // instructions. This optimization is much faster in speed unit test, but slowed // down the whole decoder by 5%. -static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p, - const uint8x8_t s0, const uint8x8_t s1, - const uint8x8_t s2, const uint8x8_t s3, - const uint8x8_t s4, const uint8x8_t s5, - const uint8x8_t s6, const uint8x8_t s7) { - vst1_u8(s, s0); - s += p; - vst1_u8(s, s1); - s += p; - vst1_u8(s, s2); - s += p; - vst1_u8(s, s3); - s += p; - vst1_u8(s, s4); - s += p; - vst1_u8(s, s5); - s += p; - vst1_u8(s, s6); - s += p; - vst1_u8(s, s7); -} - #if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ (__ARM_FEATURE_DOTPROD == 1) - DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = { + 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, + 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 +}; + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { + /* Shift left and insert new last column in transposed 4x4 block. */ + 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, + /* Shift left and insert two new columns in transposed 4x4 block. */ + 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, + /* Shift left and insert three new columns in transposed 4x4 block. 
*/ + 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 +}; + +static INLINE void transpose_concat_4x4(int8x8_t *a0, int8x8_t *a1, + int8x8_t *a2, int8x8_t *a3, + int8x16_t *b, + const uint8x16_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, XX, XX, XX, XX + * a1: 10, 11, 12, 13, XX, XX, XX, XX + * a2: 20, 21, 22, 23, XX, XX, XX, XX + * a3: 30, 31, 32, 33, XX, XX, XX, XX + * + * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + int8x16x2_t samples = { { vcombine_s8(*a0, *a1), vcombine_s8(*a2, *a3) } }; + *b = vqtbl2q_s8(samples, permute_tbl); +} + +static INLINE void transpose_concat_8x4(int8x8_t *a0, int8x8_t *a1, + int8x8_t *a2, int8x8_t *a3, + int8x16_t *b0, int8x16_t *b1, + const uint8x16x2_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, 04, 05, 06, 07 + * a1: 10, 11, 12, 13, 14, 15, 16, 17 + * a2: 20, 21, 22, 23, 24, 25, 26, 27 + * a3: 30, 31, 32, 33, 34, 35, 36, 37 + * + * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + int8x16x2_t samples = { { vcombine_s8(*a0, *a1), vcombine_s8(*a2, *a3) } }; + *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]); + *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); +} + void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -270,6 +305,28 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, #else +static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p, + const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2, const uint8x8_t s3, + const uint8x8_t s4, const uint8x8_t s5, + const uint8x8_t s6, const uint8x8_t s7) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); + s += p; + vst1_u8(s, s3); + s += p; + vst1_u8(s, s4); + s += p; + vst1_u8(s, s5); + s += p; + vst1_u8(s, s6); + s += p; + vst1_u8(s, s7); +} + void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -826,7 +883,11 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x16_t range_limit = vdupq_n_u8(128); + const uint8x8_t range_limit = vdup_n_u8(128); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int8x16x2_t samples_LUT; assert(!((intptr_t)dst & 3)); assert(!(dst_stride & 3)); @@ -839,154 +900,196 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3 * src_stride; if (w == 4) { - const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - 
uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, d01, d23; - uint8x16_t s0, s1, s2, s3; + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; int32x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; - load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7); - src += 8 * src_stride; + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + src += 4 * src_stride; + t4 = vld1_u8(src); + src += src_stride; + t5 = vld1_u8(src); + src += src_stride; + t6 = vld1_u8(src); + src += src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl); do { - load_u8_8x4(src, src_stride, &t8, &t9, &t10, &t11); - transpose_u8_8x4(&t8, &t9, &t10, &t11); - s0 = vcombine_u8(t0, t8); - s1 = vcombine_u8(t1, t9); - s2 = vcombine_u8(t2, t10); - s3 = vcombine_u8(t3, t11); - - d0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl); - d1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl); - d2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl); - d3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl); + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. 
*/ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters); + d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters); + d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters); + d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters); d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); - transpose_u8_4x4(&d01, &d23); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); - dst += dst_stride; vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); + dst += dst_stride; vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); dst += dst_stride; - t0 = vext_u8(t0, t8, 4); - t1 = vext_u8(t1, t9, 4); - t2 = vext_u8(t2, t10, 4); - t3 = vext_u8(t3, t11, 4); + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + src += 4 * src_stride; h -= 4; } while (h > 0); - } else if (h == 4) { - const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, d04, d15, d26, d37; - uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7; - int32x4_t d0, d1, d2, d3, d4, d5, d6, d7; + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3; const uint8_t *s; uint8_t *d; + int height; do { + height = h; s = src; d = dst; - load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s += 8 * src_stride; - t8 = vld1_u8(s); + + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + s += 4 * src_stride; + t4 = vld1_u8(s); s += src_stride; - t9 = vld1_u8(s); + t5 = vld1_u8(s); s += src_stride; - t10 = vld1_u8(s); + t6 = vld1_u8(s); s += src_stride; - transpose_u8_8x16(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, - vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0), - vdup_n_u8(0), &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); - - d0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl); - d1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl); - d2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl); - d3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl); - d4 = convolve8_4_dot(s4, filters, correction, range_limit, permute_tbl); - d5 = convolve8_4_dot(s5, filters, correction, range_limit, permute_tbl); - d6 = convolve8_4_dot(s6, filters, correction, range_limit, permute_tbl); - d7 = convolve8_4_dot(s7, filters, correction, range_limit, permute_tbl); - - d04 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d4)), 7); - d15 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d1), vqmovn_s32(d5)), 7); - d26 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d6)), 7); - d37 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d3), vqmovn_s32(d7)), 7); - - transpose_u8_8x4(&d04, &d15, &d26, &d37); - - vst1_u8(d, d04); - d 
+= dst_stride; - vst1_u8(d, d15); - d += dst_stride; - vst1_u8(d, d26); - d += dst_stride; - vst1_u8(d, d37); - d += dst_stride; - - src += 8; - dst += 8; - w -= 8; - } while (w > 0); - } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, - t15, d0, d1, d2, d3, d4, d5, d6, d7; - uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7; - const uint8_t *s; - uint8_t *d; - int height; - - do { - height = h; - s = src; - d = dst; - load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s += 8 * src_stride; + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); do { - load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14, - &t15); - transpose_u8_8x8(&t8, &t9, &t10, &t11, &t12, &t13, &t14, &t15); - s0 = vcombine_u8(t0, t8); - s1 = vcombine_u8(t1, t9); - s2 = vcombine_u8(t2, t10); - s3 = vcombine_u8(t3, t11); - s4 = vcombine_u8(t4, t12); - s5 = vcombine_u8(t5, t13); - s6 = vcombine_u8(t6, t14); - s7 = vcombine_u8(t7, t15); + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. 
*/ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + correction, filters); + d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + correction, filters); + d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + correction, filters); + d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + correction, filters); + vst1_u8(d + 0 * dst_stride, d0); + vst1_u8(d + 1 * dst_stride, d1); + vst1_u8(d + 2 * dst_stride, d2); + vst1_u8(d + 3 * dst_stride, d3); - d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl); - d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl); - d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl); - d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl); - d4 = convolve8_8_dot(s4, filters, correction, range_limit, permute_tbl); - d5 = convolve8_8_dot(s5, filters, correction, range_limit, permute_tbl); - d6 = convolve8_8_dot(s6, filters, correction, range_limit, permute_tbl); - d7 = convolve8_8_dot(s7, filters, correction, range_limit, permute_tbl); - - transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); - store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); - - t0 = t8; - t1 = t9; - t2 = t10; - t3 = t11; - t4 = t12; - t5 = t13; - t6 = t14; - t7 = t15; - s += 8 * src_stride; - d += 8 * dst_stride; - height -= 8; + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; } while (height > 0); src += 8; dst += 8; diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index 14e7488540..857b6d54e2 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -75,6 +75,21 @@ static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p, #if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ (__ARM_FEATURE_DOTPROD == 1) +static INLINE int32x4_t convolve8_4_dot_partial(const int8x16_t samples_lo, + const int8x16_t samples_hi, + const int32x4_t correction, + const int8x8_t filters) { + /* Sample range-clamping and permutation are performed by the caller. */ + int32x4_t sum; + + /* Accumulate dot product into 'correction' to account for range clamp. */ + sum = vdotq_lane_s32(correction, samples_lo, filters, 0); + sum = vdotq_lane_s32(sum, samples_hi, filters, 1); + + /* Narrowing and packing is performed by the caller. 
*/ + return sum; +} + static INLINE int32x4_t convolve8_4_dot(uint8x16_t samples, const int8x8_t filters, const int32x4_t correction, @@ -100,6 +115,29 @@ static INLINE int32x4_t convolve8_4_dot(uint8x16_t samples, return sum; } +static INLINE uint8x8_t convolve8_8_dot_partial(const int8x16_t samples0_lo, + const int8x16_t samples0_hi, + const int8x16_t samples1_lo, + const int8x16_t samples1_hi, + const int32x4_t correction, + const int8x8_t filters) { + /* Sample range-clamping and permutation are performed by the caller. */ + int32x4_t sum0, sum1; + int16x8_t sum; + + /* Accumulate dot product into 'correction' to account for range clamp. */ + /* First 4 output values. */ + sum0 = vdotq_lane_s32(correction, samples0_lo, filters, 0); + sum0 = vdotq_lane_s32(sum0, samples0_hi, filters, 1); + /* Second 4 output values. */ + sum1 = vdotq_lane_s32(correction, samples1_lo, filters, 0); + sum1 = vdotq_lane_s32(sum1, samples1_hi, filters, 1); + + /* Narrow and re-pack. */ + sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); + return vqrshrun_n_s16(sum, 7); +} + static INLINE uint8x8_t convolve8_8_dot(uint8x16_t samples, const int8x8_t filters, const int32x4_t correction, From 35bce9389ea875b57b352a0f5f532b96aa47bff6 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Sun, 23 May 2021 13:35:15 +0100 Subject: [PATCH 101/926] Implement vpx_convolve8_avg_vert_neon using SDOT instruction Add an alternative AArch64 implementation of vpx_convolve8_avg_vert_neon for targets that implement the Armv8.4-A SDOT (signed dot product) instruction. The existing MLA-based implementation of vpx_convolve8_avg_vert_neon is retained and used on target CPUs that do not implement the SDOT instruction (or CPUs executing in AArch32 mode). The availability of the SDOT instruction is indicated by the feature macro __ARM_FEATURE_DOTPROD. 
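To make the approach concrete, here is a stand-alone scalar sketch (an
illustration only, not part of this change) of the bias-and-correct trick
the SDOT paths rely on: each unsigned sample is shifted into signed range
before the dot product, and the accumulator is seeded with a pre-computed
correction of 128 * sum(filter taps), which exactly cancels the bias:

#include <stdint.h>

/* Scalar model of one 8-tap output sum as computed by the SDOT paths.
 * Since sum((src[k] - 128) * f[k]) == sum(src[k] * f[k]) - 128 * sum(f),
 * seeding the accumulator with 128 * sum(f) recovers the unsigned result. */
static int convolve8_scalar_model(const uint8_t *src, const int8_t *f) {
  int correction = 0;
  int sum, k;
  for (k = 0; k < 8; ++k) correction += 128 * f[k]; /* per-kernel constant */
  sum = correction;
  for (k = 0; k < 8; ++k) sum += (src[k] - 128) * f[k];
  return sum; /* equals the plain unsigned 8-tap convolution */
}

In the vector code the same constant is broadcast once per kernel, so the
per-pixel work reduces to two vdotq_lane_s32 accumulations.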
Bug: b/181236880 Change-Id: I971c626116155e1384bff4c76fd3420312c7a15b --- vpx_dsp/arm/vpx_convolve8_neon.c | 695 ++++++++++++++++++++----------- 1 file changed, 463 insertions(+), 232 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index 25a59a2da9..06b58c438f 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -303,6 +303,467 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } +void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x8_t range_limit = vdup_n_u8(128); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int8x16x2_t samples_LUT; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int32x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + src += 4 * src_stride; + t4 = vld1_u8(src); + src += src_stride; + t5 = vld1_u8(src); + src += src_stride; + t6 = vld1_u8(src); + src += src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. 
*/ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters); + d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters); + d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters); + d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters); + + d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); + d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); + + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); + dst += dst_stride; + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + h -= 4; + } while (h > 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + s += 4 * src_stride; + t4 = vld1_u8(s); + s += src_stride; + t5 = vld1_u8(s); + s += src_stride; + t6 = vld1_u8(s); + s += src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. 
*/ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + correction, filters); + d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + correction, filters); + d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + correction, filters); + d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + correction, filters); + vst1_u8(d + 0 * dst_stride, d0); + vst1_u8(d + 1 * dst_stride, d1); + vst1_u8(d + 2 * dst_stride, d2); + vst1_u8(d + 3 * dst_stride, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height > 0); + src += 8; + dst += 8; + w -= 8; + } while (w > 0); + } +} + +void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x8_t range_limit = vdup_n_u8(128); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int8x16x2_t samples_LUT; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int32x4_t d0, d1, d2, d3; + uint8x8_t d01, d23, dd01, dd23; + + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + src += 4 * src_stride; + t4 = vld1_u8(src); + src += src_stride; + t5 = vld1_u8(src); + src += src_stride; + t6 = vld1_u8(src); + src += src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. 
+ */ + transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters); + d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters); + d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters); + d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters); + + d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); + d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); + dst += dst_stride; + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + h -= 4; + } while (h > 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + s += 4 * src_stride; + t4 = vld1_u8(s); + s += src_stride; + t5 = vld1_u8(s); + s += src_stride; + t6 = vld1_u8(s); + s += src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. 
*/ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + correction, filters); + d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + correction, filters); + d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + correction, filters); + d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + correction, filters); + + dd0 = vld1_u8(d + 0 * dst_stride); + dd1 = vld1_u8(d + 1 * dst_stride); + dd2 = vld1_u8(d + 2 * dst_stride); + dd3 = vld1_u8(d + 3 * dst_stride); + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + vst1_u8(d + 0 * dst_stride, d0); + vst1_u8(d + 1 * dst_stride, d1); + vst1_u8(d + 2 * dst_stride, d2); + vst1_u8(d + 3 * dst_stride, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. 
*/ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height > 0); + src += 8; + dst += 8; + w -= 8; + } while (w > 0); + } +} + #else static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p, @@ -870,236 +1331,6 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } -#endif - -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) - -void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x8_t range_limit = vdup_n_u8(128); - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t t0, t1, t2, t3, t4, t5, t6; - int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int8x16x2_t samples_LUT; - - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - assert(y_step_q4 == 16); - - (void)x0_q4; - (void)x_step_q4; - (void)y_step_q4; - - src -= 3 * src_stride; - - if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int32x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - - load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); - src += 4 * src_stride; - t4 = vld1_u8(src); - src += src_stride; - t5 = vld1_u8(src); - src += src_stride; - t6 = vld1_u8(src); - src += src_stride; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl); - - do { - uint8x8_t t7, t8, t9, t10; - - load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); - - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - - transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl); - - /* Merge new data into block from previous iteration. 
*/ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters); - d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters); - d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters); - d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters); - - d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); - d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); - - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); - dst += dst_stride; - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ - s0123 = s4567; - s1234 = s5678; - s2345 = s6789; - s3456 = s78910; - - src += 4 * src_stride; - h -= 4; - } while (h > 0); - } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3; - const uint8_t *s; - uint8_t *d; - int height; - - do { - height = h; - s = src; - d = dst; - - load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); - s += 4 * src_stride; - t4 = vld1_u8(s); - s += src_stride; - t5 = vld1_u8(s); - s += src_stride; - t6 = vld1_u8(s); - s += src_stride; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); - transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); - - do { - uint8x8_t t7, t8, t9, t10; - - load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); - - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - - transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); - - /* Merge new data into block from previous iteration. 
*/ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - samples_LUT.val[0] = s3456_hi; - samples_LUT.val[1] = s78910_hi; - s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - correction, filters); - d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - correction, filters); - d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - correction, filters); - d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - correction, filters); - vst1_u8(d + 0 * dst_stride, d0); - vst1_u8(d + 1 * dst_stride, d1); - vst1_u8(d + 2 * dst_stride, d2); - vst1_u8(d + 3 * dst_stride, d3); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ - s0123_lo = s4567_lo; - s0123_hi = s4567_hi; - s1234_lo = s5678_lo; - s1234_hi = s5678_hi; - s2345_lo = s6789_lo; - s2345_hi = s6789_hi; - s3456_lo = s78910_lo; - s3456_hi = s78910_hi; - - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height > 0); - src += 8; - dst += 8; - w -= 8; - } while (w > 0); - } -} - -#else - void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -1272,8 +1503,6 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, } } -#endif - void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -1464,3 +1693,5 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, } while (w != 0); } } + +#endif From dbda032fcfb323bfa74af52f86b26f337b0dc6be Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Mon, 24 May 2021 11:42:09 +0100 Subject: [PATCH 102/926] Use 'ptrdiff_t' instead of 'int' for pointer offset parameters A number of the load/store functions in mem_neon.h use type 'int' for the 'stride' pointer offset parameter. This causes Clang to generate the following warning every time these functions are called with a wider type passed in for 'stride': warning: implicit conversion loses integer precision: 'ptrdiff_t' (aka 'long') to 'int' [-Wshorten-64-to-32] This patch changes all such instances of 'int' to 'ptrdiff_t'. Bug: b/181236880 Change-Id: I2e86b005219e1fbb54f7cf2465e918b7c077f7ee --- vpx_dsp/arm/mem_neon.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index c89f92d1ad..50aaa94fe0 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -113,7 +113,8 @@ static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) { } // Load 2 sets of 4 bytes when alignment is not guaranteed. -static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) { +static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, + ptrdiff_t stride) { uint32_t a; uint32x2_t a_u32 = vdup_n_u32(0); if (stride == 4) return vld1_u8(buf); @@ -126,7 +127,7 @@ static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) { } // Store 2 sets of 4 bytes when alignment is not guaranteed. 
-static INLINE void store_unaligned_u8(uint8_t *buf, int stride, +static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) { const uint32x2_t a_u32 = vreinterpret_u32_u8(a); if (stride == 4) { @@ -139,7 +140,8 @@ static INLINE void store_unaligned_u8(uint8_t *buf, int stride, } // Load 4 sets of 4 bytes when alignment is not guaranteed. -static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) { +static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, + ptrdiff_t stride) { uint32_t a; uint32x4_t a_u32 = vdupq_n_u32(0); if (stride == 4) return vld1q_u8(buf); @@ -159,7 +161,7 @@ static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) { } // Store 4 sets of 4 bytes when alignment is not guaranteed. -static INLINE void store_unaligned_u8q(uint8_t *buf, int stride, +static INLINE void store_unaligned_u8q(uint8_t *buf, ptrdiff_t stride, const uint8x16_t a) { const uint32x4_t a_u32 = vreinterpretq_u32_u8(a); if (stride == 4) { @@ -176,7 +178,7 @@ static INLINE void store_unaligned_u8q(uint8_t *buf, int stride, } // Load 2 sets of 4 bytes when alignment is guaranteed. -static INLINE uint8x8_t load_u8(const uint8_t *buf, int stride) { +static INLINE uint8x8_t load_u8(const uint8_t *buf, ptrdiff_t stride) { uint32x2_t a = vdup_n_u32(0); assert(!((intptr_t)buf % sizeof(uint32_t))); @@ -189,7 +191,7 @@ static INLINE uint8x8_t load_u8(const uint8_t *buf, int stride) { } // Store 2 sets of 4 bytes when alignment is guaranteed. -static INLINE void store_u8(uint8_t *buf, int stride, const uint8x8_t a) { +static INLINE void store_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) { uint32x2_t a_u32 = vreinterpret_u32_u8(a); assert(!((intptr_t)buf % sizeof(uint32_t))); From d42b93a15f182b0cd2bcc639e6951c40da6da8ce Mon Sep 17 00:00:00 2001 From: Chunbo Hua Date: Wed, 26 May 2021 02:02:07 -0700 Subject: [PATCH 103/926] Fixed redundant wording for decoder algorithm interface Change-Id: Id56e03dc9cf6d4e70c4681896f29893a9b4c76f2 --- vpx/internal/vpx_codec_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h index 4ef93057f5..961b0bfe4c 100644 --- a/vpx/internal/vpx_codec_internal.h +++ b/vpx/internal/vpx_codec_internal.h @@ -283,7 +283,7 @@ typedef const struct vpx_codec_enc_cfg_map { vpx_codec_enc_cfg_t cfg; } vpx_codec_enc_cfg_map_t; -/*!\brief Decoder algorithm interface interface +/*!\brief Decoder algorithm interface * * All decoders \ref MUST expose a variable of this type. */ From b8273e8ae5c14bccefde96170507336a4f15c98c Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Mon, 24 May 2021 15:53:06 -0700 Subject: [PATCH 104/926] Fix simple encode Properly init and delete cpi struct in simple encode functions. Change-Id: I6e66bcac852cbb3dec9b754ba3fb01a348ac98b8 --- vp9/encoder/vp9_firstpass.c | 6 ++--- vp9/encoder/vp9_firstpass.h | 5 ++-- vp9/simple_encode.cc | 48 +++++++++++++++++++------------------ 3 files changed, 30 insertions(+), 29 deletions(-) diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index b63d47a05e..aeb17aa6fa 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -3878,9 +3878,9 @@ int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf, // coding frames (including show frame and alt ref) can be determined. 
int vp9_get_coding_frame_num(const VP9EncoderConfig *oxcf, const TWO_PASS *const twopass, - const FRAME_INFO *frame_info, - const FIRST_PASS_INFO *first_pass_info, - int multi_layer_arf, int allow_alt_ref) { + const FRAME_INFO *frame_info, int multi_layer_arf, + int allow_alt_ref) { + const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; int coding_frame_num = 0; RATE_CONTROL rc; int gop_coding_frame_count; diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index cdbcb52412..e504528f15 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -305,9 +305,8 @@ int vp9_get_gop_coding_frame_count(const struct VP9EncoderConfig *oxcf, int vp9_get_coding_frame_num(const struct VP9EncoderConfig *oxcf, const TWO_PASS *const twopass, - const FRAME_INFO *frame_info, - const FIRST_PASS_INFO *first_pass_info, - int multi_layer_arf, int allow_alt_ref); + const FRAME_INFO *frame_info, int multi_layer_arf, + int allow_alt_ref); /*!\brief Compute a key frame binary map indicates whether key frames appear * in the corresponding positions. The passed in key_frame_map must point to an diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index efdc71eb98..48551ef72f 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -872,14 +872,14 @@ void SimpleEncode::ComputeFirstPassStats() { const VP9EncoderConfig oxcf = GetEncodeConfig( frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_, VPX_RC_FIRST_PASS, impl_ptr_->encode_config_list); - VP9_COMP *cpi = init_encoder(&oxcf, impl_ptr_->img_fmt); - struct lookahead_ctx *lookahead = cpi->lookahead; + impl_ptr_->cpi = init_encoder(&oxcf, impl_ptr_->img_fmt); + struct lookahead_ctx *lookahead = impl_ptr_->cpi->lookahead; int i; int use_highbitdepth = 0; const int num_rows_16x16 = get_num_unit_16x16(frame_height_); const int num_cols_16x16 = get_num_unit_16x16(frame_width_); #if CONFIG_VP9_HIGHBITDEPTH - use_highbitdepth = cpi->common.use_highbitdepth; + use_highbitdepth = impl_ptr_->cpi->common.use_highbitdepth; #endif vpx_image_t img; vpx_img_alloc(&img, impl_ptr_->img_fmt, frame_width_, frame_height_, 1); @@ -905,30 +905,38 @@ void SimpleEncode::ComputeFirstPassStats() { ENCODE_FRAME_RESULT encode_frame_info; vp9_init_encode_frame_result(&encode_frame_info); // TODO(angiebird): Call vp9_first_pass directly - vp9_get_compressed_data(cpi, &frame_flags, &size, nullptr, &time_stamp, - &time_end, flush, &encode_frame_info); + vp9_get_compressed_data(impl_ptr_->cpi, &frame_flags, &size, nullptr, + &time_stamp, &time_end, flush, + &encode_frame_info); // vp9_get_compressed_data only generates first pass stats not // compresses data assert(size == 0); // Get vp9 first pass motion vector info. 
std::vector mv_info(num_rows_16x16 * num_cols_16x16);
- update_motion_vector_info(cpi->fp_motion_vector_info, num_rows_16x16,
- num_cols_16x16, mv_info.data(),
- kMotionVectorFullPixelPrecision);
+ update_motion_vector_info(
+ impl_ptr_->cpi->fp_motion_vector_info, num_rows_16x16,
+ num_cols_16x16, mv_info.data(), kMotionVectorFullPixelPrecision);
fp_motion_vector_info_.push_back(mv_info);
}
- impl_ptr_->first_pass_stats.push_back(vp9_get_frame_stats(&cpi->twopass));
+ impl_ptr_->first_pass_stats.push_back(
+ vp9_get_frame_stats(&impl_ptr_->cpi->twopass));
}
}
- vp9_end_first_pass(cpi);
// TODO(angiebird): Store the total_stats apart form first_pass_stats
- impl_ptr_->first_pass_stats.push_back(vp9_get_total_stats(&cpi->twopass));
- free_encoder(cpi);
- rewind(in_file_);
- vpx_img_free(&img);
+ impl_ptr_->first_pass_stats.push_back(
+ vp9_get_total_stats(&impl_ptr_->cpi->twopass));
+ vp9_end_first_pass(impl_ptr_->cpi);
+ fps_init_first_pass_info(&cpi->twopass.first_pass_info,
+ GetVectorData(impl_ptr_->first_pass_stats),
+ num_frames_);
// Generate key_frame_map based on impl_ptr_->first_pass_stats.
key_frame_map_ = ComputeKeyFrameMap();
+
+ free_encoder(impl_ptr_->cpi);
+ impl_ptr_->cpi = nullptr;
+ rewind(in_file_);
+ vpx_img_free(&img);
}

std::vector> SimpleEncode::ObserveFirstPassStats() {
@@ -1249,7 +1257,7 @@ int SimpleEncode::GetCodingFrameNum() const {
}

// These are the default settings for now.
- const VP9_COMP *cpi = impl_ptr_->cpi;
+ VP9_COMP *cpi = impl_ptr_->cpi;
const int multi_layer_arf = 0;
const int allow_alt_ref = 1;
vpx_rational_t frame_rate =
@@ -1258,13 +1266,11 @@ int SimpleEncode::GetCodingFrameNum() const {
frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_,
VPX_RC_LAST_PASS, impl_ptr_->encode_config_list);
FRAME_INFO frame_info = vp9_get_frame_info(&oxcf);
- FIRST_PASS_INFO first_pass_info;
- fps_init_first_pass_info(&first_pass_info,
+ fps_init_first_pass_info(&cpi->twopass.first_pass_info,
GetVectorData(impl_ptr_->first_pass_stats),
num_frames_);
return vp9_get_coding_frame_num(&oxcf, &cpi->twopass, &frame_info,
- &first_pass_info, multi_layer_arf,
- allow_alt_ref);
+ multi_layer_arf, allow_alt_ref);
}

std::vector SimpleEncode::ComputeKeyFrameMap() const {
@@ -1276,10 +1282,6 @@ std::vector SimpleEncode::ComputeKeyFrameMap() const {
const VP9EncoderConfig oxcf = GetEncodeConfig(
frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_,
VPX_RC_LAST_PASS, impl_ptr_->encode_config_list);
- FIRST_PASS_INFO first_pass_info;
- fps_init_first_pass_info(&first_pass_info,
- GetVectorData(impl_ptr_->first_pass_stats),
- num_frames_);
std::vector key_frame_map(num_frames_, 0);
vp9_get_key_frame_map(&oxcf, &cpi->twopass, GetVectorData(key_frame_map));
return key_frame_map;

From 463d33145de28770f815466db0ffc85d14442043 Mon Sep 17 00:00:00 2001
From: Cheng Chen
Date: Thu, 27 May 2021 15:38:28 -0700
Subject: [PATCH 105/926] L2E: properly init two pass rc parameters

Two pass rc parameters are only initialized in the second pass in vp9
normal two pass encoding. However, the simple_encode API queries the
keyframe group, arf group, and number of coding frames without going
through the two pass route. Since recent libvpx rc changes, parameters
in the TWO_PASS struct have a great influence on the determination of
the above information.

We therefore need to properly init two pass rc parameters in the
simple_encode related environment.
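In outline, the query path after this change seeds a local TWO_PASS
struct directly before any rate-control query. The condensed sketch
below is not a verbatim excerpt; it assumes the libvpx-internal headers
that declare TWO_PASS, VP9EncoderConfig and FRAME_INFO, and the locals
stats, num_frames, frame_width, frame_height, oxcf and frame_info are
hypothetical stand-ins for values the caller already has:

/* Sketch of the simple_encode query path; see the diff below for the
 * actual code. */
TWO_PASS twopass;
fps_init_first_pass_info(&twopass.first_pass_info, stats, num_frames);
/* Newly required: derive the vizier rc parameters from the frame area
 * before the TWO_PASS struct feeds any rate-control decision. */
vp9_init_vizier_params(&twopass, frame_width * frame_height);
const int coding_frames = vp9_get_coding_frame_num(
    &oxcf, &twopass, &frame_info, /*multi_layer_arf=*/0,
    /*allow_alt_ref=*/1);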
Change-Id: Ie14b86d6e7ebf171b638d2da24a7fdcf5a15c3d9 --- vp9/encoder/vp9_firstpass.c | 6 +++--- vp9/encoder/vp9_firstpass.h | 3 ++- vp9/simple_encode.cc | 25 ++++++++++++++++--------- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index aeb17aa6fa..28a22ded6d 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -3487,7 +3487,7 @@ static int is_skippable_frame(const VP9_COMP *cpi) { // Configure image size specific vizier parameters. // Later these will be set via additional command line options -static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { +void vp9_init_vizier_params(TWO_PASS *const twopass, int screen_area) { // When |use_vizier_rc_params| is 1, we expect the rc parameters below to // have been initialised on the command line as adjustment factors such // that a factor of 1.0 will match the default behavior when @@ -3561,7 +3561,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { if (cm->current_video_frame == 0) { unsigned int screen_area = (cm->width * cm->height); - init_vizier_params(twopass, screen_area); + vp9_init_vizier_params(twopass, screen_area); } // If this is an arf frame then we dont want to read the stats file or @@ -3877,7 +3877,7 @@ int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf, // Under CONFIG_RATE_CTRL, once the first_pass_info is ready, the number of // coding frames (including show frame and alt ref) can be determined. int vp9_get_coding_frame_num(const VP9EncoderConfig *oxcf, - const TWO_PASS *const twopass, + TWO_PASS *const twopass, const FRAME_INFO *frame_info, int multi_layer_arf, int allow_alt_ref) { const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index e504528f15..ff0eb40c87 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -257,6 +257,7 @@ void vp9_first_pass_encode_tile_mb_row(struct VP9_COMP *cpi, void vp9_init_second_pass(struct VP9_COMP *cpi); void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi); +void vp9_init_vizier_params(TWO_PASS *const twopass, int screen_area); // Post encode update of the rate control parameters for 2-pass void vp9_twopass_postencode_update(struct VP9_COMP *cpi); @@ -304,7 +305,7 @@ int vp9_get_gop_coding_frame_count(const struct VP9EncoderConfig *oxcf, int last_gop_use_alt_ref, int *use_alt_ref); int vp9_get_coding_frame_num(const struct VP9EncoderConfig *oxcf, - const TWO_PASS *const twopass, + TWO_PASS *const twopass, const FRAME_INFO *frame_info, int multi_layer_arf, int allow_alt_ref); diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index 48551ef72f..7371eee8b9 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -926,9 +926,6 @@ void SimpleEncode::ComputeFirstPassStats() { impl_ptr_->first_pass_stats.push_back( vp9_get_total_stats(&impl_ptr_->cpi->twopass)); vp9_end_first_pass(impl_ptr_->cpi); - fps_init_first_pass_info(&cpi->twopass.first_pass_info, - GetVectorData(impl_ptr_->first_pass_stats), - num_frames_); // Generate key_frame_map based on impl_ptr_->first_pass_stats. 
key_frame_map_ = ComputeKeyFrameMap(); @@ -1057,6 +1054,11 @@ void SimpleEncode::StartEncode() { frame_coding_index_ = 0; show_frame_count_ = 0; + assert(impl_ptr_->cpi != nullptr); + FRAME_INFO frame_info = vp9_get_frame_info(&oxcf); + unsigned int screen_area = frame_info.frame_width * frame_info.frame_height; + vp9_init_vizier_params(&impl_ptr_->cpi->twopass, screen_area); + UpdateKeyFrameGroup(show_frame_count_); const GOP_COMMAND gop_command = GetGopCommand(gop_map_, show_frame_count_); @@ -1257,7 +1259,7 @@ int SimpleEncode::GetCodingFrameNum() const { } // These are the default settings for now. - VP9_COMP *cpi = impl_ptr_->cpi; + TWO_PASS twopass; const int multi_layer_arf = 0; const int allow_alt_ref = 1; vpx_rational_t frame_rate = @@ -1266,15 +1268,16 @@ int SimpleEncode::GetCodingFrameNum() const { frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_, VPX_RC_LAST_PASS, impl_ptr_->encode_config_list); FRAME_INFO frame_info = vp9_get_frame_info(&oxcf); - fps_init_first_pass_info(&cpi->twopass.first_pass_info, + fps_init_first_pass_info(&twopass.first_pass_info, GetVectorData(impl_ptr_->first_pass_stats), num_frames_); - return vp9_get_coding_frame_num(&oxcf, &cpi->twopass, &frame_info, - multi_layer_arf, allow_alt_ref); + unsigned int screen_area = frame_info.frame_width * frame_info.frame_height; + vp9_init_vizier_params(&twopass, screen_area); + return vp9_get_coding_frame_num(&oxcf, &twopass, &frame_info, multi_layer_arf, + allow_alt_ref); } std::vector SimpleEncode::ComputeKeyFrameMap() const { - const VP9_COMP *cpi = impl_ptr_->cpi; // The last entry of first_pass_stats is the overall stats. assert(impl_ptr_->first_pass_stats.size() == num_frames_ + 1); vpx_rational_t frame_rate = @@ -1282,8 +1285,12 @@ std::vector SimpleEncode::ComputeKeyFrameMap() const { const VP9EncoderConfig oxcf = GetEncodeConfig( frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_, VPX_RC_LAST_PASS, impl_ptr_->encode_config_list); + TWO_PASS twopass; + fps_init_first_pass_info(&twopass.first_pass_info, + GetVectorData(impl_ptr_->first_pass_stats), + num_frames_); std::vector key_frame_map(num_frames_, 0); - vp9_get_key_frame_map(&oxcf, &cpi->twopass, GetVectorData(key_frame_map)); + vp9_get_key_frame_map(&oxcf, &twopass, GetVectorData(key_frame_map)); return key_frame_map; } From 71d09c34fff6f51a153b8732eef6bfb4e381fcbf Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 5 Jun 2021 19:30:04 -0700 Subject: [PATCH 106/926] simple_encode_test: fix input file path this allows the file to be located in LIBVPX_TEST_DATA_PATH similar to other test sources. 
Bug: webm:1731 Change-Id: I51606635d91871e7c179aa8d20d4841b0d60b6ad --- test/simple_encode_test.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/simple_encode_test.cc b/test/simple_encode_test.cc index ab893045d8..03e28e3387 100644 --- a/test/simple_encode_test.cc +++ b/test/simple_encode_test.cc @@ -13,6 +13,7 @@ #include #include #include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/video_source.h" #include "vp9/simple_encode.h" namespace vp9 { @@ -36,7 +37,8 @@ class SimpleEncodeTest : public ::testing::Test { const int frame_rate_den_ = 1; const int target_bitrate_ = 1000; const int num_frames_ = 17; - const std::string in_file_path_str_ = "bus_352x288_420_f20_b8.yuv"; + const std::string in_file_path_str_ = + libvpx_test::GetDataPath() + "/bus_352x288_420_f20_b8.yuv"; }; TEST_F(SimpleEncodeTest, ComputeFirstPassStats) { From 5d678fe78a5a0ece72cddbf7d7071ef8dc3598dc Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 9 Jun 2021 15:07:15 -0700 Subject: [PATCH 107/926] simple_encode: fix some -Wsign-compare warnings Bug: webm:1731 Change-Id: I1db777c0c3a8784fb3dcf7cd39f78ebf833ab915 --- vp9/simple_encode.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index 7371eee8b9..8ff5ad3c98 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -1248,7 +1248,7 @@ static int GetCodingFrameNumFromGopMap(const std::vector &gop_map) { start_show_index += gop_command.show_frame_count; coding_frame_count += gop_command_coding_frame_count(&gop_command); } - assert(start_show_index == gop_map.size()); + assert(static_cast(start_show_index) == gop_map.size()); return coding_frame_count; } @@ -1279,7 +1279,8 @@ int SimpleEncode::GetCodingFrameNum() const { std::vector SimpleEncode::ComputeKeyFrameMap() const { // The last entry of first_pass_stats is the overall stats. - assert(impl_ptr_->first_pass_stats.size() == num_frames_ + 1); + assert(impl_ptr_->first_pass_stats.size() == + static_cast(num_frames_) + 1); vpx_rational_t frame_rate = make_vpx_rational(frame_rate_num_, frame_rate_den_); const VP9EncoderConfig oxcf = GetEncodeConfig( From d85c54d4e870e979062e275a1a58a3a44f64e601 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 11 Jun 2021 16:34:41 -0700 Subject: [PATCH 108/926] Update some comments for rc_target_bitrate this mirrors the change from libaom: 5b150b150 Update some comments for rc_target_bitrate Change-Id: Iaabee5924e0320609a29dc8ab71327923fb4c5d2 --- vp8/vp8_cx_iface.c | 2 +- vpx/vpx_encoder.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 8d3044f6a4..78631e7976 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -1276,7 +1276,7 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = { VPX_VBR, /* rc_end_usage */ { NULL, 0 }, /* rc_twopass_stats_in */ { NULL, 0 }, /* rc_firstpass_mb_stats_in */ - 256, /* rc_target_bandwidth */ + 256, /* rc_target_bitrate */ 4, /* rc_min_quantizer */ 63, /* rc_max_quantizer */ 100, /* rc_undershoot_pct */ diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index f8fdfc0307..21254bb547 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -457,7 +457,7 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Target data rate * - * Target bandwidth to use for this stream, in kilobits per second. + * Target bitrate to use for this stream, in kilobits per second. 
*/ unsigned int rc_target_bitrate; From 9a25e3169b59ca822558024423c5675790ffcf5b Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Mon, 14 Jun 2021 15:02:52 -0700 Subject: [PATCH 109/926] vp9-rtc: Refactor 1 pass vbr rate control This refactoring is needed to allow the RC_rtc library to support VBR. Change-Id: I863a4a65096fed06b02307098febf7976360e0f3 --- vp9/encoder/vp9_ratectrl.c | 63 ++++++++++++++++++++++---------------- vp9/encoder/vp9_ratectrl.h | 3 ++ 2 files changed, 39 insertions(+), 27 deletions(-) diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 4b87ff2f0c..b89166466c 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -2009,7 +2009,7 @@ void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { } } -static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { +int vp9_calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { const RATE_CONTROL *const rc = &cpi->rc; const int af_ratio = rc->af_ratio_onepass_vbr; int64_t target = @@ -2024,7 +2024,7 @@ static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { return vp9_rc_clamp_pframe_target_size(cpi, (int)target); } -static int calc_iframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { +int vp9_calc_iframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { static const int kf_ratio = 25; const RATE_CONTROL *rc = &cpi->rc; const int target = rc->avg_frame_bandwidth * kf_ratio; @@ -2050,22 +2050,9 @@ static void adjust_gfint_frame_constraint(VP9_COMP *cpi, int frame_constraint) { } } -void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; +void vp9_set_gf_update_one_pass_vbr(VP9_COMP *const cpi) { RATE_CONTROL *const rc = &cpi->rc; - int target; - if (!cpi->refresh_alt_ref_frame && - (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) || - rc->frames_to_key == 0)) { - cm->frame_type = KEY_FRAME; - rc->this_key_frame_forced = - cm->current_video_frame != 0 && rc->frames_to_key == 0; - rc->frames_to_key = cpi->oxcf.key_freq; - rc->kf_boost = DEFAULT_KF_BOOST; - rc->source_alt_ref_active = 0; - } else { - cm->frame_type = INTER_FRAME; - } + VP9_COMMON *const cm = &cpi->common; if (rc->frames_till_gf_update_due == 0) { double rate_err = 1.0; rc->gfu_boost = DEFAULT_GF_BOOST; @@ -2084,15 +2071,18 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { rate_err > 3.5) { rc->baseline_gf_interval = VPXMIN(15, (3 * rc->baseline_gf_interval) >> 1); - } else if (rc->avg_frame_low_motion < 20) { + } else if (rc->avg_frame_low_motion > 0 && + rc->avg_frame_low_motion < 20) { // Decrease gf interval for high motion case. rc->baseline_gf_interval = VPXMAX(6, rc->baseline_gf_interval >> 1); } - // Adjust boost and af_ratio based on avg_frame_low_motion, which varies - // between 0 and 100 (stationary, 100% zero/small motion). - rc->gfu_boost = - VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) / - (rc->avg_frame_low_motion + 100)); + if (rc->avg_frame_low_motion > 0) { + // Adjust boost and af_ratio based on avg_frame_low_motion, which + // varies between 0 and 100 (stationary, 100% zero/small motion). 
+ rc->gfu_boost = + VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) / + (rc->avg_frame_low_motion + 100)); + } rc->af_ratio_onepass_vbr = VPXMIN(15, VPXMAX(5, 3 * rc->gfu_boost / 400)); } adjust_gfint_frame_constraint(cpi, rc->frames_to_key); @@ -2105,10 +2095,29 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { rc->alt_ref_gf_group = 1; } } +} + +void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + int target; + if (!cpi->refresh_alt_ref_frame && + (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) || + rc->frames_to_key == 0)) { + cm->frame_type = KEY_FRAME; + rc->this_key_frame_forced = + cm->current_video_frame != 0 && rc->frames_to_key == 0; + rc->frames_to_key = cpi->oxcf.key_freq; + rc->kf_boost = DEFAULT_KF_BOOST; + rc->source_alt_ref_active = 0; + } else { + cm->frame_type = INTER_FRAME; + } + vp9_set_gf_update_one_pass_vbr(cpi); if (cm->frame_type == KEY_FRAME) - target = calc_iframe_target_size_one_pass_vbr(cpi); + target = vp9_calc_iframe_target_size_one_pass_vbr(cpi); else - target = calc_pframe_target_size_one_pass_vbr(cpi); + target = vp9_calc_pframe_target_size_one_pass_vbr(cpi); vp9_rc_set_frame_target(cpi, target); if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.pass == 0) vp9_cyclic_refresh_update_parameters(cpi); @@ -2953,7 +2962,7 @@ static void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, } } } - target = calc_pframe_target_size_one_pass_vbr(cpi); + target = vp9_calc_pframe_target_size_one_pass_vbr(cpi); vp9_rc_set_frame_target(cpi, target); } rc->prev_avg_source_sad_lag = avg_source_sad_lag; @@ -3163,7 +3172,7 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) { VPXMIN(20, VPXMAX(10, rc->baseline_gf_interval)); adjust_gfint_frame_constraint(cpi, rc->frames_to_key); rc->frames_till_gf_update_due = rc->baseline_gf_interval; - target = calc_pframe_target_size_one_pass_vbr(cpi); + target = vp9_calc_pframe_target_size_one_pass_vbr(cpi); vp9_rc_set_frame_target(cpi, target); rc->count_last_scene_change = 0; } else { diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index 0120f90a01..8ef10c94a3 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -255,6 +255,9 @@ void vp9_rc_get_one_pass_vbr_params(struct VP9_COMP *cpi); void vp9_rc_get_one_pass_cbr_params(struct VP9_COMP *cpi); int vp9_calc_pframe_target_size_one_pass_cbr(const struct VP9_COMP *cpi); int vp9_calc_iframe_target_size_one_pass_cbr(const struct VP9_COMP *cpi); +int vp9_calc_pframe_target_size_one_pass_vbr(const struct VP9_COMP *cpi); +int vp9_calc_iframe_target_size_one_pass_vbr(const struct VP9_COMP *cpi); +void vp9_set_gf_update_one_pass_vbr(struct VP9_COMP *const cpi); void vp9_update_buffer_level_preencode(struct VP9_COMP *cpi); void vp9_rc_get_svc_params(struct VP9_COMP *cpi); From a945f344e04d2851cd675cca48182cca2e7d8a4e Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 15 Jun 2021 14:55:29 -0700 Subject: [PATCH 110/926] Change the data path in svc rate control test Change-Id: Iba58e2aa2578964b5c8b48ab0acbee9b44bcdada --- test/ratectrl_rtc_test.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/ratectrl_rtc_test.cc b/test/ratectrl_rtc_test.cc index 3a1c8469ee..58bfac3a7a 100644 --- a/test/ratectrl_rtc_test.cc +++ b/test/ratectrl_rtc_test.cc @@ -115,8 +115,7 @@ class RcInterfaceTest : public ::testing::Test { libvpx::VP9FrameParamsQpRTC frame_params; frame_params.frame_type = KEY_FRAME; std::ifstream 
svc_file; - svc_file.open(std::string(std::getenv("LIBVPX_TEST_DATA_PATH")) + - "/rc_interface_test_svc"); + svc_file.open(libvpx_test::GetDataPath() + "/rc_interface_test_svc"); ASSERT_TRUE(svc_file.good()); for (size_t i = 0; i < kNumFrame * rc_cfg_.ss_number_layers; i++) { svc_file >> frame_info; From 364f0e31fed78be436d00c177574dac00c0d85a4 Mon Sep 17 00:00:00 2001 From: Chunbo Hua Date: Wed, 16 Jun 2021 01:51:44 -0700 Subject: [PATCH 111/926] Initialize VP9EncoderConfig profile and bit depth Change-Id: I5c42013a08677cdef8d47f348458118338ff0138 --- vp9/ratectrl_rtc.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index 47f9f3ba33..8f77fc842d 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -39,6 +39,8 @@ void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) { cm->bit_depth = VPX_BITS_8; cm->show_frame = 1; oxcf->rc_mode = VPX_CBR; + oxcf->profile = cm->profile; + oxcf->bit_depth = cm->bit_depth; oxcf->pass = 0; oxcf->aq_mode = NO_AQ; oxcf->content = VP9E_CONTENT_DEFAULT; From 338013712e516d07388651437918e6328ea909f5 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Thu, 17 Jun 2021 12:00:33 -0700 Subject: [PATCH 112/926] vp9: Adjust logic for gf update in 1 pass vbr This reduces some regression when external RC is used, for which avg_frame_low_motion is not set/updated (=0). Change-Id: I2408e62bd97592e892cefa0f183357c641aa5eea --- vp9/encoder/vp9_ratectrl.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index b89166466c..3775d22361 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -2076,13 +2076,14 @@ void vp9_set_gf_update_one_pass_vbr(VP9_COMP *const cpi) { // Decrease gf interval for high motion case. rc->baseline_gf_interval = VPXMAX(6, rc->baseline_gf_interval >> 1); } - if (rc->avg_frame_low_motion > 0) { - // Adjust boost and af_ratio based on avg_frame_low_motion, which - // varies between 0 and 100 (stationary, 100% zero/small motion). + // Adjust boost and af_ratio based on avg_frame_low_motion, which + // varies between 0 and 100 (stationary, 100% zero/small motion). 
+ if (rc->avg_frame_low_motion > 0) rc->gfu_boost = VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) / (rc->avg_frame_low_motion + 100)); - } + else if (rc->avg_frame_low_motion == 0 && rate_err > 1.0) + rc->gfu_boost = DEFAULT_GF_BOOST >> 1; rc->af_ratio_onepass_vbr = VPXMIN(15, VPXMAX(5, 3 * rc->gfu_boost / 400)); } adjust_gfint_frame_constraint(cpi, rc->frames_to_key); From 2380e13da8a5bba3e8afdb14e0aa61fd980a49c9 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 18 Jun 2021 11:56:27 -0700 Subject: [PATCH 113/926] normalize vp9_calc_[ip]frame declarations and definitions fixes warnings under visual studio: vp9\encoder\vp9_ratectrl.c(2012): warning C4028: formal parameter 1 different from declaration vp9\encoder\vp9_ratectrl.c(2027): warning C4028: formal parameter 1 different from declaration Change-Id: Ia0740db597fb7a259f90d362b483f58662f9f584 --- vp9/encoder/vp9_ratectrl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 3775d22361..51fb2aab8f 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -2009,7 +2009,7 @@ void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { } } -int vp9_calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { +int vp9_calc_pframe_target_size_one_pass_vbr(const VP9_COMP *cpi) { const RATE_CONTROL *const rc = &cpi->rc; const int af_ratio = rc->af_ratio_onepass_vbr; int64_t target = @@ -2024,7 +2024,7 @@ int vp9_calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { return vp9_rc_clamp_pframe_target_size(cpi, (int)target); } -int vp9_calc_iframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { +int vp9_calc_iframe_target_size_one_pass_vbr(const VP9_COMP *cpi) { static const int kf_ratio = 25; const RATE_CONTROL *rc = &cpi->rc; const int target = rc->avg_frame_bandwidth * kf_ratio; From 1f45e7b07ec839dae7a90455e00c3b2d553ea772 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 15 Jun 2021 12:54:13 -0700 Subject: [PATCH 114/926] vp9 rc: add vbr to rtc rate control library Change-Id: I3d2565572c2b905966d60bcaa6e5e6f057b1bd51 --- test/ratectrl_rtc_test.cc | 104 +++++++++++++++++++++++++++++++++---- test/test-data.mk | 2 + vp9/encoder/vp9_ratectrl.c | 3 -- vp9/encoder/vp9_ratectrl.h | 3 ++ vp9/ratectrl_rtc.cc | 31 ++++++++--- vp9/ratectrl_rtc.h | 4 ++ 6 files changed, 126 insertions(+), 21 deletions(-) diff --git a/test/ratectrl_rtc_test.cc b/test/ratectrl_rtc_test.cc index 58bfac3a7a..e9a9f15e9f 100644 --- a/test/ratectrl_rtc_test.cc +++ b/test/ratectrl_rtc_test.cc @@ -32,6 +32,7 @@ struct FrameInfo { info.bytes_used; return is; } + int frame_id; int spatial_id; int temporal_id; @@ -48,24 +49,32 @@ struct FrameInfo { // This test runs the rate control interface and compare against ground truth // generated by encoders. 
// Settings for the encoder: -// For 1 layer: +// For 1 layer CBR: +// - AQ_Mode 0 +// - Disable golden refresh +// - Bitrate x 2 at frame/superframe 200 +// - Bitrate / 4 at frame/superframe 400 +// examples/vpx_temporal_svc_encoder gipsrec_motion1.1280_720.yuv out vp9 +// 1280 720 1 30 7 0 0 1 0 1000 // +// For 1 layer VBR: +// - Set rc_end_usage to VPX_VBR +// - AQ Mode 0 +// - Disable vp9_compute_frame_low_motion in vp9_encoder.c // examples/vpx_temporal_svc_encoder gipsrec_motion1.1280_720.yuv out vp9 // 1280 720 1 30 7 0 0 1 0 1000 // // For SVC (3 temporal layers, 3 spatial layers): -// +// - AQ_Mode 0 +// - Disable golden refresh +// - Bitrate x 2 at frame/superframe 200 +// - Bitrate / 4 at frame/superframe 400 // examples/vp9_spatial_svc_encoder -f 10000 -w 1280 -h 720 -t 1/30 -sl 3 // -k 10000 -bl 100,140,200,250,350,500,450,630,900 -b 1600 --rc-end-usage=1 // --lag-in-frames=0 --passes=1 --speed=7 --threads=1 // --temporal-layering-mode=3 -aq 1 -rcstat 1 // gipsrec_motion1.1280_720.yuv -o out.webm // -// - AQ_Mode 0 -// - Disable golden refresh -// - Bitrate x 2 at frame/superframe 200 -// - Bitrate / 4 at frame/superframe 400 -// // The generated file includes: // frame number, spatial layer ID, temporal layer ID, base QP, target // bandwidth, buffer level, loopfilter level, encoded frame size @@ -77,8 +86,8 @@ class RcInterfaceTest : public ::testing::Test { virtual ~RcInterfaceTest() {} protected: - void RunOneLayer() { - SetConfigOneLayer(); + void RunOneLayerCBR() { + SetConfigOneLayerCBR(); rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); FrameInfo frame_info; libvpx::VP9FrameParamsQpRTC frame_params; @@ -144,8 +153,58 @@ class RcInterfaceTest : public ::testing::Test { } } + void RunOneLayerVBR() { + SetConfigOneLayerVBR(); + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + FrameInfo frame_info; + libvpx::VP9FrameParamsQpRTC frame_params; + frame_params.frame_type = KEY_FRAME; + frame_params.spatial_layer_id = 0; + frame_params.temporal_layer_id = 0; + std::ifstream one_layer_file; + one_layer_file.open(libvpx_test::GetDataPath() + + "/rc_interface_test_one_layer_vbr"); + ASSERT_TRUE(one_layer_file.good()); + for (size_t i = 0; i < kNumFrame; i++) { + one_layer_file >> frame_info; + if (frame_info.frame_id > 0) frame_params.frame_type = INTER_FRAME; + ASSERT_EQ(frame_info.spatial_id, 0); + ASSERT_EQ(frame_info.temporal_id, 0); + rc_api_->ComputeQP(frame_params); + ASSERT_EQ(rc_api_->GetQP(), frame_info.base_q); + ASSERT_EQ(rc_api_->GetLoopfilterLevel(), frame_info.filter_level_); + rc_api_->PostEncodeUpdate(frame_info.bytes_used); + } + } + + void RunOneLayerVBRPeriodicKey() { + SetConfigOneLayerVBRPeriodicKey(); + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + FrameInfo frame_info; + libvpx::VP9FrameParamsQpRTC frame_params; + frame_params.frame_type = KEY_FRAME; + frame_params.spatial_layer_id = 0; + frame_params.temporal_layer_id = 0; + std::ifstream one_layer_file; + one_layer_file.open(libvpx_test::GetDataPath() + + "/rc_interface_test_one_layer_vbr_periodic_key"); + ASSERT_TRUE(one_layer_file.good()); + for (size_t i = 0; i < kNumFrame; i++) { + one_layer_file >> frame_info; + if (frame_info.frame_id > 0) frame_params.frame_type = INTER_FRAME; + if (frame_info.frame_id % rc_cfg_.key_freq == 0) + frame_params.frame_type = KEY_FRAME; + ASSERT_EQ(frame_info.spatial_id, 0); + ASSERT_EQ(frame_info.temporal_id, 0); + rc_api_->ComputeQP(frame_params); + ASSERT_EQ(rc_api_->GetQP(), frame_info.base_q); + ASSERT_EQ(rc_api_->GetLoopfilterLevel(), 
frame_info.filter_level_); + rc_api_->PostEncodeUpdate(frame_info.bytes_used); + } + } + private: - void SetConfigOneLayer() { + void SetConfig() { rc_cfg_.width = 1280; rc_cfg_.height = 720; rc_cfg_.max_quantizer = 52; @@ -167,6 +226,24 @@ class RcInterfaceTest : public ::testing::Test { rc_cfg_.min_quantizers[0] = 2; } + void SetConfigOneLayerCBR() { + SetConfig(); + rc_cfg_.rc_mode = VPX_CBR; + rc_cfg_.key_freq = 3000; + } + + void SetConfigOneLayerVBR() { + SetConfig(); + rc_cfg_.rc_mode = VPX_VBR; + rc_cfg_.key_freq = 3000; + } + + void SetConfigOneLayerVBRPeriodicKey() { + SetConfig(); + rc_cfg_.rc_mode = VPX_VBR; + rc_cfg_.key_freq = 300; + } + void SetConfigSVC() { rc_cfg_.width = 1280; rc_cfg_.height = 720; @@ -182,6 +259,7 @@ class RcInterfaceTest : public ::testing::Test { rc_cfg_.framerate = 30.0; rc_cfg_.ss_number_layers = 3; rc_cfg_.ts_number_layers = 3; + rc_cfg_.rc_mode = VPX_CBR; rc_cfg_.scaling_factor_num[0] = 1; rc_cfg_.scaling_factor_den[0] = 4; @@ -217,7 +295,11 @@ class RcInterfaceTest : public ::testing::Test { libvpx::VP9RateControlRtcConfig rc_cfg_; }; -TEST_F(RcInterfaceTest, OneLayer) { RunOneLayer(); } +TEST_F(RcInterfaceTest, OneLayerCBR) { RunOneLayerCBR(); } + +TEST_F(RcInterfaceTest, OneLayerVBR) { RunOneLayerVBR(); } + +TEST_F(RcInterfaceTest, OneLayerVBRPeriodicKey) { RunOneLayerVBRPeriodicKey(); } TEST_F(RcInterfaceTest, SVC) { RunSVC(); } } // namespace diff --git a/test/test-data.mk b/test/test-data.mk index 744901b115..379fc6e7a9 100644 --- a/test/test-data.mk +++ b/test/test-data.mk @@ -28,6 +28,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rc_interface_test_one_layer +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rc_interface_test_one_layer_vbr +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rc_interface_test_one_layer_vbr_periodic_key LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rc_interface_test_svc LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += bus_352x288_420_f20_b8.yuv diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 3775d22361..dbbd458c96 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -39,9 +39,6 @@ #define MAX_MB_RATE 250 #define MAXRATE_1080P 4000000 -#define DEFAULT_KF_BOOST 2000 -#define DEFAULT_GF_BOOST 2000 - #define LIMIT_QRANGE_FOR_ALTREF_AND_KEY 1 #define MIN_BPB_FACTOR 0.005 diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index 8ef10c94a3..bdddd2df8b 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -27,6 +27,9 @@ extern "C" { // Bits Per MB at different Q (Multiplied by 512) #define BPER_MB_NORMBITS 9 +#define DEFAULT_KF_BOOST 2000 +#define DEFAULT_GF_BOOST 2000 + #define MIN_GF_INTERVAL 4 #define MAX_GF_INTERVAL 16 #define FIXED_GF_INTERVAL 8 // Used in some testing modes only diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index 8f77fc842d..8455ca9a3d 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -38,13 +38,16 @@ void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) { cm->profile = PROFILE_0; cm->bit_depth = VPX_BITS_8; cm->show_frame = 1; - oxcf->rc_mode = VPX_CBR; oxcf->profile = cm->profile; oxcf->bit_depth = cm->bit_depth; + oxcf->rc_mode = rc_cfg.rc_mode; oxcf->pass = 0; oxcf->aq_mode = NO_AQ; oxcf->content = VP9E_CONTENT_DEFAULT; oxcf->drop_frames_water_mark = 0; + cm->current_video_frame = 0; + oxcf->key_freq = 
rc_cfg.key_freq; + rc->kf_boost = DEFAULT_KF_BOOST; UpdateRateControl(rc_cfg); @@ -57,8 +60,8 @@ void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) { rc->rc_2_frame = 0; vp9_rc_init_minq_luts(); vp9_rc_init(oxcf, 0, rc); + rc->frames_to_key = oxcf->key_freq; cpi_->sf.use_nonrd_pick_mode = 1; - cm->current_video_frame = 0; } void VP9RateControlRTC::UpdateRateControl( @@ -75,6 +78,7 @@ void VP9RateControlRTC::UpdateRateControl( oxcf->best_allowed_q = vp9_quantizer_to_qindex(rc_cfg.min_quantizer); rc->worst_quality = oxcf->worst_allowed_q; rc->best_quality = oxcf->best_allowed_q; + oxcf->init_framerate = rc_cfg.framerate; oxcf->target_bandwidth = 1000 * rc_cfg.target_bandwidth; oxcf->starting_buffer_level_ms = rc_cfg.buf_initial_sz; oxcf->optimal_buffer_level_ms = rc_cfg.buf_optimal_sz; @@ -140,11 +144,24 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { cpi_->sf.use_nonrd_pick_mode = 1; if (cpi_->svc.number_spatial_layers == 1 && cpi_->svc.number_temporal_layers == 1) { - int target; - if (frame_is_intra_only(cm)) - target = vp9_calc_iframe_target_size_one_pass_cbr(cpi_); - else - target = vp9_calc_pframe_target_size_one_pass_cbr(cpi_); + int target = 0; + if (cpi_->oxcf.rc_mode == VPX_CBR) { + if (frame_is_intra_only(cm)) + target = vp9_calc_iframe_target_size_one_pass_cbr(cpi_); + else + target = vp9_calc_pframe_target_size_one_pass_cbr(cpi_); + } else if (cpi_->oxcf.rc_mode == VPX_VBR) { + if (cm->frame_type == KEY_FRAME) { + cpi_->rc.this_key_frame_forced = + cm->current_video_frame != 0 && cpi_->rc.frames_to_key == 0; + cpi_->rc.frames_to_key = cpi_->oxcf.key_freq; + } + vp9_set_gf_update_one_pass_vbr(cpi_); + if (frame_is_intra_only(cm)) + target = vp9_calc_iframe_target_size_one_pass_vbr(cpi_); + else + target = vp9_calc_pframe_target_size_one_pass_vbr(cpi_); + } vp9_rc_set_frame_target(cpi_, target); vp9_update_buffer_level_preencode(cpi_); } else { diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index 72ea40fd68..a1f2767126 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -49,6 +49,10 @@ struct VP9RateControlRtcConfig { int scaling_factor_den[VPX_SS_MAX_LAYERS]; int layer_target_bitrate[VPX_MAX_LAYERS]; int ts_rate_decimator[VPX_TS_MAX_LAYERS]; + // vbr, cbr + enum vpx_rc_mode rc_mode; + // key frame frequency + int key_freq; }; struct VP9FrameParamsQpRTC { From 9873d61b252c4bada8515a4fd8df8377cda012f1 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 21 Jun 2021 13:33:44 -0700 Subject: [PATCH 115/926] test-data.sha1: add missing sha sums for rc_interface_test_one_layer_vbr and rc_interface_test_one_layer_vbr_periodic_key added in: 1f45e7b07 vp9 rc: add vbr to rtc rate control library Change-Id: I8bfa3698284c8ff289e830f7b8fa1ca42b752563 --- test/test-data.sha1 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test-data.sha1 b/test/test-data.sha1 index 6f9021554d..bcf9612fba 100644 --- a/test/test-data.sha1 +++ b/test/test-data.sha1 @@ -871,3 +871,5 @@ d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-bug-148271109.ivf.res ad18ca16f0a249fb3b7c38de0d9b327fed273f96 *hantro_collage_w352h288_nv12.yuv 03f827c0e36ff9a6e23c5cc11936924e4f1827ab *rc_interface_test_one_layer 99e4f4c2961d46dc286db230090a39d78460b25d *rc_interface_test_svc +9dcaafd91bc61ed360c23616b4788437b9f9b96b *rc_interface_test_one_layer_vbr +babd17cca2e93cc74753c6ed80de87457bc3a5f3 *rc_interface_test_one_layer_vbr_periodic_key From a1fdfbb174487e5efb76e6e77119d2e50840086e Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Fri, 18 Jun 2021 
16:09:41 -0700 Subject: [PATCH 116/926] Fix flaky assertions in SimpleEncode Bug: webm:1731 Change-Id: Ieecb98a7ac19e6291acd5d51432dc6a3789e9552 --- vp9/encoder/vp9_firstpass.c | 2 +- vp9/simple_encode.cc | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 28a22ded6d..375438839b 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -3836,7 +3836,7 @@ void vp9_get_next_group_of_picture(const VP9_COMP *cpi, int *first_is_key_frame, if (gop_command->use) { *coding_frame_count = gop_command_coding_frame_count(gop_command); *use_alt_ref = gop_command->use_alt_ref; - assert(*coding_frame_count < rc.frames_to_key); + assert(gop_command->show_frame_count <= rc.frames_to_key); } else { *coding_frame_count = vp9_get_gop_coding_frame_count( &cpi->oxcf, &cpi->twopass, &cpi->frame_info, &rc, *first_show_idx, diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index 8ff5ad3c98..87727cb12a 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -1009,8 +1009,7 @@ T *GetVectorData(const std::vector &v) { static GOP_COMMAND GetGopCommand(const std::vector &gop_map, int start_show_index) { GOP_COMMAND gop_command; - if (gop_map.size() > 0) { - assert(static_cast(start_show_index) < gop_map.size()); + if (static_cast(start_show_index) < gop_map.size()) { assert((gop_map[start_show_index] & kGopMapFlagStart) != 0); int end_show_index = start_show_index + 1; // gop_map[end_show_index] & kGopMapFlagStart == 0 means this is From 0bb7bb6df8c2dcce22d0151676183f589bcf27a1 Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Thu, 17 Jun 2021 20:23:30 -0700 Subject: [PATCH 117/926] Add use_simple_encode_api to oxcf Use this flag to change the encoder behavior when SimpleEncode APIs are used BUG=webm:1733 Change-Id: I9f0852a03ff99faa01cdd8eee8ab71718cc58632 --- vp9/encoder/vp9_encodeframe.c | 37 ++-- vp9/encoder/vp9_encoder.c | 394 ++++++++++++---------------------- vp9/encoder/vp9_encoder.h | 1 + vp9/encoder/vp9_firstpass.c | 106 +++++---- vp9/encoder/vp9_firstpass.h | 2 +- vp9/encoder/vp9_ratectrl.c | 29 +-- vp9/simple_encode.cc | 1 + vp9/vp9_cx_iface.c | 3 + 8 files changed, 248 insertions(+), 325 deletions(-) diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index dcd6476581..00855319d6 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -4603,15 +4603,18 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize, pc_tree); #if CONFIG_RATE_CTRL - // Store partition, motion vector of the superblock. - if (output_enabled) { - const int num_unit_rows = get_num_unit_4x4(cpi->frame_info.frame_height); - const int num_unit_cols = get_num_unit_4x4(cpi->frame_info.frame_width); - store_superblock_info(pc_tree, cm->mi_grid_visible, cm->mi_stride, - num_4x4_blocks_wide_lookup[BLOCK_64X64], - num_unit_rows, num_unit_cols, mi_row << 1, - mi_col << 1, cpi->partition_info, - cpi->motion_vector_info); + if (oxcf->use_simple_encode_api) { + // Store partition, motion vector of the superblock. 
+ if (output_enabled) { + const int num_unit_rows = + get_num_unit_4x4(cpi->frame_info.frame_height); + const int num_unit_cols = get_num_unit_4x4(cpi->frame_info.frame_width); + store_superblock_info(pc_tree, cm->mi_grid_visible, cm->mi_stride, + num_4x4_blocks_wide_lookup[BLOCK_64X64], + num_unit_rows, num_unit_cols, mi_row << 1, + mi_col << 1, cpi->partition_info, + cpi->motion_vector_info); + } } #endif // CONFIG_RATE_CTRL } @@ -5981,9 +5984,14 @@ void vp9_init_tile_data(VP9_COMP *cpi) { for (i = 0; i < BLOCK_SIZES; ++i) { for (j = 0; j < MAX_MODES; ++j) { tile_data->thresh_freq_fact[i][j] = RD_THRESH_INIT_FACT; -#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL +#if CONFIG_RATE_CTRL + if (cpi->oxcf.use_simple_encode_api) { + tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT; + } +#endif // CONFIG_RATE_CTRL +#if CONFIG_CONSISTENT_RECODE tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT; -#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL +#endif // CONFIG_CONSISTENT_RECODE tile_data->mode_map[i][j] = j; } } @@ -6406,7 +6414,12 @@ static void restore_encode_params(VP9_COMP *cpi) { void vp9_encode_frame(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; -#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL +#if CONFIG_RATE_CTRL + if (cpi->oxcf.use_simple_encode_api) { + restore_encode_params(cpi); + } +#endif // CONFIG_RATE_CTRL +#if CONFIG_CONSISTENT_RECODE restore_encode_params(cpi); #endif diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 34646465a6..bbd6dd030d 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1022,10 +1022,12 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { cpi->mi_ssim_rdmult_scaling_factors = NULL; #if CONFIG_RATE_CTRL - free_partition_info(cpi); - free_motion_vector_info(cpi); - free_fp_motion_vector_info(cpi); - free_tpl_stats_info(cpi); + if (cpi->oxcf.use_simple_encode_api) { + free_partition_info(cpi); + free_motion_vector_info(cpi); + free_fp_motion_vector_info(cpi); + free_tpl_stats_info(cpi); + } #endif vp9_free_ref_frame_buffers(cm->buffer_pool); @@ -2669,10 +2671,12 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, #if CONFIG_RATE_CTRL encode_command_init(&cpi->encode_command); - partition_info_init(cpi); - motion_vector_info_init(cpi); - fp_motion_vector_info_init(cpi); - tpl_stats_info_init(cpi); + if (oxcf->use_simple_encode_api) { + partition_info_init(cpi); + motion_vector_info_init(cpi); + fp_motion_vector_info_init(cpi); + tpl_stats_info_init(cpi); + } #endif return cpi; @@ -4470,11 +4474,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest loop_at_this_size = 0; } -#if CONFIG_RATE_CTRL - if (cpi->encode_command.use_external_target_frame_bits) { - q = rq_model_predict_q_index(rq_model, rq_history, rc->this_frame_target); - } -#endif // CONFIG_RATE_CTRL // Decide frame size bounds first time through. if (loop_count == 0) { vp9_rc_compute_frame_size_bounds(cpi, rc->this_frame_target, @@ -4517,10 +4516,16 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest #if CONFIG_RATE_CTRL // TODO(angiebird): This is a hack for making sure the encoder use the // external_quantize_index exactly. Avoid this kind of hack later. 
- if (cpi->encode_command.use_external_quantize_index) { - q = cpi->encode_command.external_quantize_index; + if (cpi->oxcf.use_simple_encode_api) { + if (cpi->encode_command.use_external_target_frame_bits) { + q = rq_model_predict_q_index(rq_model, rq_history, + rc->this_frame_target); + } + if (cpi->encode_command.use_external_quantize_index) { + q = cpi->encode_command.external_quantize_index; + } } -#endif +#endif // CONFIG_RATE_CTRL if (cpi->ext_ratectrl.ready && !ext_rc_recode) { vpx_codec_err_t codec_status; const GF_GROUP *gf_group = &cpi->twopass.gf_group; @@ -4607,33 +4612,36 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest ext_rc_recode = 1; } #if CONFIG_RATE_CTRL - // This part needs to be after save_coding_context() because - // restore_coding_context will be called in the end of this function. - // TODO(angiebird): This is a hack for making sure the encoder use the - // external_quantize_index exactly. Avoid this kind of hack later. - if (cpi->encode_command.use_external_quantize_index) { - break; - } + if (cpi->oxcf.use_simple_encode_api) { + // This part needs to be after save_coding_context() because + // restore_coding_context will be called in the end of this function. + // TODO(angiebird): This is a hack for making sure the encoder use the + // external_quantize_index exactly. Avoid this kind of hack later. + if (cpi->encode_command.use_external_quantize_index) { + break; + } - if (cpi->encode_command.use_external_target_frame_bits) { - const double percent_diff = get_bits_percent_diff( - rc->this_frame_target, rc->projected_frame_size); - update_rq_history(rq_history, rc->this_frame_target, - rc->projected_frame_size, q); - loop_count += 1; + if (cpi->encode_command.use_external_target_frame_bits) { + const double percent_diff = get_bits_percent_diff( + rc->this_frame_target, rc->projected_frame_size); + update_rq_history(rq_history, rc->this_frame_target, + rc->projected_frame_size, q); + loop_count += 1; - rq_model_update(rq_history, rc->this_frame_target, rq_model); + rq_model_update(rq_history, rc->this_frame_target, rq_model); - // Check if we hit the target bitrate. - if (percent_diff <= cpi->encode_command.target_frame_bits_error_percent || - rq_history->recode_count >= RATE_CTRL_MAX_RECODE_NUM || - rq_history->q_index_low >= rq_history->q_index_high) { - break; - } + // Check if we hit the target bitrate. 
+ if (percent_diff <= + cpi->encode_command.target_frame_bits_error_percent || + rq_history->recode_count >= RATE_CTRL_MAX_RECODE_NUM || + rq_history->q_index_low >= rq_history->q_index_high) { + break; + } - loop = 1; - restore_coding_context(cpi); - continue; + loop = 1; + restore_coding_context(cpi); + continue; + } } #endif // CONFIG_RATE_CTRL @@ -5368,17 +5376,81 @@ static void set_mb_wiener_variance(VP9_COMP *cpi) { } #if !CONFIG_REALTIME_ONLY -static void update_encode_frame_result( +static void update_encode_frame_result_basic( + FRAME_UPDATE_TYPE update_type, int show_idx, int quantize_index, + ENCODE_FRAME_RESULT *encode_frame_result) { + encode_frame_result->show_idx = show_idx; + encode_frame_result->update_type = update_type; + encode_frame_result->quantize_index = quantize_index; +} + +#if CONFIG_RATE_CTRL +static void yv12_buffer_to_image_buffer(const YV12_BUFFER_CONFIG *yv12_buffer, + IMAGE_BUFFER *image_buffer) { + const uint8_t *src_buf_ls[3] = { yv12_buffer->y_buffer, yv12_buffer->u_buffer, + yv12_buffer->v_buffer }; + const int src_stride_ls[3] = { yv12_buffer->y_stride, yv12_buffer->uv_stride, + yv12_buffer->uv_stride }; + const int w_ls[3] = { yv12_buffer->y_crop_width, yv12_buffer->uv_crop_width, + yv12_buffer->uv_crop_width }; + const int h_ls[3] = { yv12_buffer->y_crop_height, yv12_buffer->uv_crop_height, + yv12_buffer->uv_crop_height }; + int plane; + for (plane = 0; plane < 3; ++plane) { + const int src_stride = src_stride_ls[plane]; + const int w = w_ls[plane]; + const int h = h_ls[plane]; + const uint8_t *src_buf = src_buf_ls[plane]; + uint8_t *dst_buf = image_buffer->plane_buffer[plane]; + int r; + assert(image_buffer->plane_width[plane] == w); + assert(image_buffer->plane_height[plane] == h); + for (r = 0; r < h; ++r) { + memcpy(dst_buf, src_buf, sizeof(*src_buf) * w); + src_buf += src_stride; + dst_buf += w; + } + } +} +// This function will update extra information specific for simple_encode APIs +static void update_encode_frame_result_simple_encode( int ref_frame_flags, FRAME_UPDATE_TYPE update_type, const YV12_BUFFER_CONFIG *source_frame, const RefCntBuffer *coded_frame_buf, - RefCntBuffer *ref_frame_buf[MAX_INTER_REF_FRAMES], int quantize_index, + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int quantize_index, uint32_t bit_depth, uint32_t input_bit_depth, const FRAME_COUNTS *counts, -#if CONFIG_RATE_CTRL const PARTITION_INFO *partition_info, const MOTION_VECTOR_INFO *motion_vector_info, const TplDepStats *tpl_stats_info, + ENCODE_FRAME_RESULT *encode_frame_result) { + PSNR_STATS psnr; + update_encode_frame_result_basic(update_type, coded_frame_buf->frame_index, + quantize_index, encode_frame_result); +#if CONFIG_VP9_HIGHBITDEPTH + vpx_calc_highbd_psnr(source_frame, &coded_frame_buf->buf, &psnr, bit_depth, + input_bit_depth); +#else // CONFIG_VP9_HIGHBITDEPTH + (void)bit_depth; + (void)input_bit_depth; + vpx_calc_psnr(source_frame, &coded_frame_buf->buf, &psnr); +#endif // CONFIG_VP9_HIGHBITDEPTH + encode_frame_result->frame_coding_index = coded_frame_buf->frame_coding_index; + + vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs, + encode_frame_result->ref_frame_coding_indexes, + encode_frame_result->ref_frame_valid_list); + + encode_frame_result->psnr = psnr.psnr[0]; + encode_frame_result->sse = psnr.sse[0]; + encode_frame_result->frame_counts = *counts; + encode_frame_result->partition_info = partition_info; + encode_frame_result->motion_vector_info = motion_vector_info; + encode_frame_result->tpl_stats_info = tpl_stats_info; + 
if (encode_frame_result->coded_frame.allocated) { + yv12_buffer_to_image_buffer(&coded_frame_buf->buf, + &encode_frame_result->coded_frame); + } +} #endif // CONFIG_RATE_CTRL - ENCODE_FRAME_RESULT *encode_frame_result); #endif // !CONFIG_REALTIME_ONLY static void encode_frame_to_data_rate( @@ -5473,10 +5545,14 @@ static void encode_frame_to_data_rate( memset(cpi->mode_chosen_counts, 0, MAX_MODES * sizeof(*cpi->mode_chosen_counts)); #endif -#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL +#if CONFIG_CONSISTENT_RECODE // Backup to ensure consistency between recodes save_encode_params(cpi); -#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL +#elif CONFIG_RATE_CTRL + if (cpi->oxcf.use_simple_encode_api) { + save_encode_params(cpi); + } +#endif if (cpi->sf.recode_loop == DISALLOW_RECODE) { if (!encode_without_recode_loop(cpi, size, dest)) return; @@ -5568,10 +5644,12 @@ static void encode_frame_to_data_rate( assert(encode_frame_result == NULL); #else // CONFIG_REALTIME_ONLY if (encode_frame_result != NULL) { - const int ref_frame_flags = get_ref_frame_flags(cpi); const RefCntBuffer *coded_frame_buf = get_ref_cnt_buffer(cm, cm->new_fb_idx); RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES]; + FRAME_UPDATE_TYPE update_type = + cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index]; + int quantize_index = vp9_get_quantizer(cpi); get_ref_frame_bufs(cpi, ref_frame_bufs); // update_encode_frame_result() depends on twopass.gf_group.index and // cm->new_fb_idx, cpi->Source, cpi->lst_fb_idx, cpi->gld_fb_idx and @@ -5589,15 +5667,21 @@ static void encode_frame_to_data_rate( // This function needs to be called before vp9_update_reference_frames(). // TODO(angiebird): Improve the codebase to make the update of frame // dependent variables more robust. 
- update_encode_frame_result( - ref_frame_flags, - cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index], - cpi->Source, coded_frame_buf, ref_frame_bufs, vp9_get_quantizer(cpi), - cm->bit_depth, cpi->oxcf.input_bit_depth, cpi->td.counts, + + update_encode_frame_result_basic(update_type, coded_frame_buf->frame_index, + quantize_index, encode_frame_result); #if CONFIG_RATE_CTRL - cpi->partition_info, cpi->motion_vector_info, cpi->tpl_stats_info, + if (cpi->oxcf.use_simple_encode_api) { + const int ref_frame_flags = get_ref_frame_flags(cpi); + update_encode_frame_result_simple_encode( + ref_frame_flags, + cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index], + cpi->Source, coded_frame_buf, ref_frame_bufs, quantize_index, + cm->bit_depth, cpi->oxcf.input_bit_depth, cpi->td.counts, + cpi->partition_info, cpi->motion_vector_info, cpi->tpl_stats_info, + encode_frame_result); + } #endif // CONFIG_RATE_CTRL - encode_frame_result); } #endif // CONFIG_REALTIME_ONLY @@ -7517,7 +7601,9 @@ static void setup_tpl_stats(VP9_COMP *cpi) { #endif // CONFIG_NON_GREEDY_MV #if CONFIG_RATE_CTRL - accumulate_frame_tpl_stats(cpi); + if (cpi->oxcf.use_simple_encode_api) { + accumulate_frame_tpl_stats(cpi); + } #endif // CONFIG_RATE_CTRL } @@ -7545,206 +7631,6 @@ void vp9_get_ref_frame_info(FRAME_UPDATE_TYPE update_type, int ref_frame_flags, } } -#if !CONFIG_REALTIME_ONLY -#if CONFIG_RATE_CTRL -static void copy_frame_counts(const FRAME_COUNTS *input_counts, - FRAME_COUNTS *output_counts) { - int i, j, k, l, m, n; - for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) { - for (j = 0; j < INTRA_MODES; ++j) { - output_counts->y_mode[i][j] = input_counts->y_mode[i][j]; - } - } - for (i = 0; i < INTRA_MODES; ++i) { - for (j = 0; j < INTRA_MODES; ++j) { - output_counts->uv_mode[i][j] = input_counts->uv_mode[i][j]; - } - } - for (i = 0; i < PARTITION_CONTEXTS; ++i) { - for (j = 0; j < PARTITION_TYPES; ++j) { - output_counts->partition[i][j] = input_counts->partition[i][j]; - } - } - for (i = 0; i < TX_SIZES; ++i) { - for (j = 0; j < PLANE_TYPES; ++j) { - for (k = 0; k < REF_TYPES; ++k) { - for (l = 0; l < COEF_BANDS; ++l) { - for (m = 0; m < COEFF_CONTEXTS; ++m) { - output_counts->eob_branch[i][j][k][l][m] = - input_counts->eob_branch[i][j][k][l][m]; - for (n = 0; n < UNCONSTRAINED_NODES + 1; ++n) { - output_counts->coef[i][j][k][l][m][n] = - input_counts->coef[i][j][k][l][m][n]; - } - } - } - } - } - } - for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) { - for (j = 0; j < SWITCHABLE_FILTERS; ++j) { - output_counts->switchable_interp[i][j] = - input_counts->switchable_interp[i][j]; - } - } - for (i = 0; i < INTER_MODE_CONTEXTS; ++i) { - for (j = 0; j < INTER_MODES; ++j) { - output_counts->inter_mode[i][j] = input_counts->inter_mode[i][j]; - } - } - for (i = 0; i < INTRA_INTER_CONTEXTS; ++i) { - for (j = 0; j < 2; ++j) { - output_counts->intra_inter[i][j] = input_counts->intra_inter[i][j]; - } - } - for (i = 0; i < COMP_INTER_CONTEXTS; ++i) { - for (j = 0; j < 2; ++j) { - output_counts->comp_inter[i][j] = input_counts->comp_inter[i][j]; - } - } - for (i = 0; i < REF_CONTEXTS; ++i) { - for (j = 0; j < 2; ++j) { - for (k = 0; k < 2; ++k) { - output_counts->single_ref[i][j][k] = input_counts->single_ref[i][j][k]; - } - } - } - for (i = 0; i < REF_CONTEXTS; ++i) { - for (j = 0; j < 2; ++j) { - output_counts->comp_ref[i][j] = input_counts->comp_ref[i][j]; - } - } - for (i = 0; i < SKIP_CONTEXTS; ++i) { - for (j = 0; j < 2; ++j) { - output_counts->skip[i][j] = input_counts->skip[i][j]; - } - } - for (i = 0; i < 
TX_SIZE_CONTEXTS; i++) { - for (j = 0; j < TX_SIZES; j++) { - output_counts->tx.p32x32[i][j] = input_counts->tx.p32x32[i][j]; - } - for (j = 0; j < TX_SIZES - 1; j++) { - output_counts->tx.p16x16[i][j] = input_counts->tx.p16x16[i][j]; - } - for (j = 0; j < TX_SIZES - 2; j++) { - output_counts->tx.p8x8[i][j] = input_counts->tx.p8x8[i][j]; - } - } - for (i = 0; i < TX_SIZES; i++) { - output_counts->tx.tx_totals[i] = input_counts->tx.tx_totals[i]; - } - for (i = 0; i < MV_JOINTS; i++) { - output_counts->mv.joints[i] = input_counts->mv.joints[i]; - } - for (k = 0; k < 2; k++) { - nmv_component_counts *const comps = &output_counts->mv.comps[k]; - const nmv_component_counts *const comps_t = &input_counts->mv.comps[k]; - for (i = 0; i < 2; i++) { - comps->sign[i] = comps_t->sign[i]; - comps->class0_hp[i] = comps_t->class0_hp[i]; - comps->hp[i] = comps_t->hp[i]; - } - for (i = 0; i < MV_CLASSES; i++) { - comps->classes[i] = comps_t->classes[i]; - } - for (i = 0; i < CLASS0_SIZE; i++) { - comps->class0[i] = comps_t->class0[i]; - for (j = 0; j < MV_FP_SIZE; j++) { - comps->class0_fp[i][j] = comps_t->class0_fp[i][j]; - } - } - for (i = 0; i < MV_OFFSET_BITS; i++) { - for (j = 0; j < 2; j++) { - comps->bits[i][j] = comps_t->bits[i][j]; - } - } - for (i = 0; i < MV_FP_SIZE; i++) { - comps->fp[i] = comps_t->fp[i]; - } - } -} - -static void yv12_buffer_to_image_buffer(const YV12_BUFFER_CONFIG *yv12_buffer, - IMAGE_BUFFER *image_buffer) { - const uint8_t *src_buf_ls[3] = { yv12_buffer->y_buffer, yv12_buffer->u_buffer, - yv12_buffer->v_buffer }; - const int src_stride_ls[3] = { yv12_buffer->y_stride, yv12_buffer->uv_stride, - yv12_buffer->uv_stride }; - const int w_ls[3] = { yv12_buffer->y_crop_width, yv12_buffer->uv_crop_width, - yv12_buffer->uv_crop_width }; - const int h_ls[3] = { yv12_buffer->y_crop_height, yv12_buffer->uv_crop_height, - yv12_buffer->uv_crop_height }; - int plane; - for (plane = 0; plane < 3; ++plane) { - const int src_stride = src_stride_ls[plane]; - const int w = w_ls[plane]; - const int h = h_ls[plane]; - const uint8_t *src_buf = src_buf_ls[plane]; - uint8_t *dst_buf = image_buffer->plane_buffer[plane]; - int r; - assert(image_buffer->plane_width[plane] == w); - assert(image_buffer->plane_height[plane] == h); - for (r = 0; r < h; ++r) { - memcpy(dst_buf, src_buf, sizeof(*src_buf) * w); - src_buf += src_stride; - dst_buf += w; - } - } -} -#endif // CONFIG_RATE_CTRL - -static void update_encode_frame_result( - int ref_frame_flags, FRAME_UPDATE_TYPE update_type, - const YV12_BUFFER_CONFIG *source_frame, const RefCntBuffer *coded_frame_buf, - RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int quantize_index, - uint32_t bit_depth, uint32_t input_bit_depth, const FRAME_COUNTS *counts, -#if CONFIG_RATE_CTRL - const PARTITION_INFO *partition_info, - const MOTION_VECTOR_INFO *motion_vector_info, - const TplDepStats *tpl_stats_info, -#endif // CONFIG_RATE_CTRL - ENCODE_FRAME_RESULT *encode_frame_result) { -#if CONFIG_RATE_CTRL - PSNR_STATS psnr; -#if CONFIG_VP9_HIGHBITDEPTH - vpx_calc_highbd_psnr(source_frame, &coded_frame_buf->buf, &psnr, bit_depth, - input_bit_depth); -#else // CONFIG_VP9_HIGHBITDEPTH - (void)bit_depth; - (void)input_bit_depth; - vpx_calc_psnr(source_frame, &coded_frame_buf->buf, &psnr); -#endif // CONFIG_VP9_HIGHBITDEPTH - encode_frame_result->frame_coding_index = coded_frame_buf->frame_coding_index; - - vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs, - encode_frame_result->ref_frame_coding_indexes, - encode_frame_result->ref_frame_valid_list); - 
- encode_frame_result->psnr = psnr.psnr[0]; - encode_frame_result->sse = psnr.sse[0]; - copy_frame_counts(counts, &encode_frame_result->frame_counts); - encode_frame_result->partition_info = partition_info; - encode_frame_result->motion_vector_info = motion_vector_info; - encode_frame_result->tpl_stats_info = tpl_stats_info; - if (encode_frame_result->coded_frame.allocated) { - yv12_buffer_to_image_buffer(&coded_frame_buf->buf, - &encode_frame_result->coded_frame); - } -#else // CONFIG_RATE_CTRL - (void)ref_frame_flags; - (void)bit_depth; - (void)input_bit_depth; - (void)source_frame; - (void)coded_frame_buf; - (void)ref_frame_bufs; - (void)counts; -#endif // CONFIG_RATE_CTRL - encode_frame_result->show_idx = coded_frame_buf->frame_index; - encode_frame_result->update_type = update_type; - encode_frame_result->quantize_index = quantize_index; -} -#endif // !CONFIG_REALTIME_ONLY - void vp9_init_encode_frame_result(ENCODE_FRAME_RESULT *encode_frame_result) { encode_frame_result->show_idx = -1; // Actual encoding doesn't happen. #if CONFIG_RATE_CTRL diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 12520fb82a..65a1d33286 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -291,6 +291,7 @@ typedef struct VP9EncoderConfig { int row_mt; unsigned int motion_vector_unit_test; int delta_q_uv; + int use_simple_encode_api; // Use SimpleEncode APIs or not } VP9EncoderConfig; static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) { diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 375438839b..7343d1bc66 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -1114,8 +1114,10 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize]; #if CONFIG_RATE_CTRL - // Store zero mv as default - store_fp_motion_vector(cpi, &mv, mb_row, mb_col, LAST_FRAME, 0); + if (cpi->oxcf.use_simple_encode_api) { + // Store zero mv as default + store_fp_motion_vector(cpi, &mv, mb_row, mb_col, LAST_FRAME, 0); + } #endif // CONFIG_RAGE_CTRL xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; @@ -1183,7 +1185,9 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, } } #if CONFIG_RATE_CTRL - store_fp_motion_vector(cpi, &mv, mb_row, mb_col, LAST_FRAME, 0); + if (cpi->oxcf.use_simple_encode_api) { + store_fp_motion_vector(cpi, &mv, mb_row, mb_col, LAST_FRAME, 0); + } #endif // CONFIG_RAGE_CTRL // Search in an older reference frame. 
@@ -1207,7 +1211,10 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &gf_motion_error); #if CONFIG_RATE_CTRL - store_fp_motion_vector(cpi, &tmp_mv, mb_row, mb_col, GOLDEN_FRAME, 1); + if (cpi->oxcf.use_simple_encode_api) { + store_fp_motion_vector(cpi, &tmp_mv, mb_row, mb_col, GOLDEN_FRAME, + 1); + } #endif // CONFIG_RAGE_CTRL if (gf_motion_error < motion_error && gf_motion_error < this_error) @@ -1383,7 +1390,9 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, } else { fp_acc_data->sr_coded_error += (int64_t)this_error; #if CONFIG_RATE_CTRL - store_fp_motion_vector(cpi, NULL, mb_row, mb_col, INTRA_FRAME, 0); + if (cpi->oxcf.use_simple_encode_api) { + store_fp_motion_vector(cpi, NULL, mb_row, mb_col, INTRA_FRAME, 0); + } #endif // CONFIG_RAGE_CTRL } fp_acc_data->coded_error += (int64_t)this_error; @@ -1412,9 +1421,11 @@ static void first_pass_encode(VP9_COMP *cpi, FIRSTPASS_DATA *fp_acc_data) { vp9_tile_init(tile, cm, 0, 0); #if CONFIG_RATE_CTRL - fp_motion_vector_info_reset(cpi->frame_info.frame_width, - cpi->frame_info.frame_height, - cpi->fp_motion_vector_info); + if (cpi->oxcf.use_simple_encode_api) { + fp_motion_vector_info_reset(cpi->frame_info.frame_width, + cpi->frame_info.frame_height, + cpi->fp_motion_vector_info); + } #endif for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { @@ -2677,25 +2688,25 @@ static int get_gop_coding_frame_num( return gop_coding_frames; } -static RANGE get_active_gf_inverval_range( - const FRAME_INFO *frame_info, const RATE_CONTROL *rc, int arf_active_or_kf, - int gf_start_show_idx, int active_worst_quality, int last_boosted_qindex) { +static RANGE get_active_gf_inverval_range_simple(int min_gf_interval, + int arf_active_or_kf, + int frames_to_key) { RANGE active_gf_interval; -#if CONFIG_RATE_CTRL - (void)frame_info; - (void)gf_start_show_idx; - (void)active_worst_quality; - (void)last_boosted_qindex; - active_gf_interval.min = rc->min_gf_interval + arf_active_or_kf + 2; - + active_gf_interval.min = min_gf_interval + arf_active_or_kf + 2; active_gf_interval.max = 16 + arf_active_or_kf; - if ((active_gf_interval.max <= rc->frames_to_key) && - (active_gf_interval.max >= (rc->frames_to_key - rc->min_gf_interval))) { - active_gf_interval.min = rc->frames_to_key / 2; - active_gf_interval.max = rc->frames_to_key / 2; + if ((active_gf_interval.max <= frames_to_key) && + (active_gf_interval.max >= (frames_to_key - min_gf_interval))) { + active_gf_interval.min = frames_to_key / 2; + active_gf_interval.max = frames_to_key / 2; } -#else + return active_gf_interval; +} + +static RANGE get_active_gf_inverval_range( + const FRAME_INFO *frame_info, const RATE_CONTROL *rc, int arf_active_or_kf, + int gf_start_show_idx, int active_worst_quality, int last_boosted_qindex) { + RANGE active_gf_interval; int int_max_q = (int)(vp9_convert_qindex_to_q(active_worst_quality, frame_info->bit_depth)); int q_term = (gf_start_show_idx == 0) @@ -2733,7 +2744,6 @@ static RANGE get_active_gf_inverval_range( } active_gf_interval.max = VPXMAX(active_gf_interval.max, active_gf_interval.min); -#endif return active_gf_interval; } @@ -2794,9 +2804,14 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { vpx_clear_system_state(); - active_gf_interval = get_active_gf_inverval_range( - frame_info, rc, arf_active_or_kf, gf_start_show_idx, - twopass->active_worst_quality, rc->last_boosted_qindex); + if (oxcf->use_simple_encode_api) { + active_gf_interval = 
get_active_gf_inverval_range_simple( + rc->min_gf_interval, arf_active_or_kf, rc->frames_to_key); + } else { + active_gf_interval = get_active_gf_inverval_range( + frame_info, rc, arf_active_or_kf, gf_start_show_idx, + twopass->active_worst_quality, rc->last_boosted_qindex); + } if (cpi->multi_layer_arf) { int arf_layers = get_arf_layers(cpi->multi_layer_arf, oxcf->enable_auto_arf, @@ -2806,25 +2821,21 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { gop_intra_factor = 1.0; } + gop_coding_frames = get_gop_coding_frame_num( + &use_alt_ref, frame_info, twopass, rc, gf_start_show_idx, + &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames); + use_alt_ref &= allow_alt_ref; #if CONFIG_RATE_CTRL - { + // If the external gop_command is on, we will override the decisions + // of gop_coding_frames and use_alt_ref. + if (cpi->oxcf.use_simple_encode_api) { const GOP_COMMAND *gop_command = &cpi->encode_command.gop_command; assert(allow_alt_ref == 1); if (gop_command->use) { gop_coding_frames = gop_command_coding_frame_count(gop_command); use_alt_ref = gop_command->use_alt_ref; - } else { - gop_coding_frames = get_gop_coding_frame_num( - &use_alt_ref, frame_info, twopass, rc, gf_start_show_idx, - &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames); - use_alt_ref &= allow_alt_ref; } } -#else - gop_coding_frames = get_gop_coding_frame_num( - &use_alt_ref, frame_info, twopass, rc, gf_start_show_idx, - &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames); - use_alt_ref &= allow_alt_ref; #endif // Was the group length constrained by the requirement for a new KF? @@ -3855,12 +3866,19 @@ int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf, int frame_count; double gop_intra_factor; const int arf_active_or_kf = last_gop_use_alt_ref || first_is_key_frame; - RANGE active_gf_interval = get_active_gf_inverval_range( - frame_info, rc, arf_active_or_kf, show_idx, /*active_worst_quality=*/0, - /*last_boosted_qindex=*/0); + RANGE active_gf_interval; + int arf_layers; + if (oxcf->use_simple_encode_api) { + active_gf_interval = get_active_gf_inverval_range_simple( + rc->min_gf_interval, arf_active_or_kf, rc->frames_to_key); + } else { + active_gf_interval = get_active_gf_inverval_range( + frame_info, rc, arf_active_or_kf, show_idx, /*active_worst_quality=*/0, + /*last_boosted_qindex=*/0); + } - const int arf_layers = get_arf_layers(multi_layer_arf, oxcf->enable_auto_arf, - active_gf_interval.max); + arf_layers = get_arf_layers(multi_layer_arf, oxcf->enable_auto_arf, + active_gf_interval.max); if (multi_layer_arf) { gop_intra_factor = 1.0 + 0.25 * arf_layers; } else { @@ -3877,7 +3895,7 @@ int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf, // Under CONFIG_RATE_CTRL, once the first_pass_info is ready, the number of // coding frames (including show frame and alt ref) can be determined. 
int vp9_get_coding_frame_num(const VP9EncoderConfig *oxcf, - TWO_PASS *const twopass, + const TWO_PASS *const twopass, const FRAME_INFO *frame_info, int multi_layer_arf, int allow_alt_ref) { const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index ff0eb40c87..ddfc87d894 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -305,7 +305,7 @@ int vp9_get_gop_coding_frame_count(const struct VP9EncoderConfig *oxcf, int last_gop_use_alt_ref, int *use_alt_ref); int vp9_get_coding_frame_num(const struct VP9EncoderConfig *oxcf, - TWO_PASS *const twopass, + const TWO_PASS *const twopass, const FRAME_INFO *frame_info, int multi_layer_arf, int allow_alt_ref); diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 3775d22361..043036721a 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -1720,10 +1720,12 @@ void vp9_rc_set_frame_target(VP9_COMP *cpi, int target) { } #if CONFIG_RATE_CTRL - if (cpi->encode_command.use_external_target_frame_bits) { - rc->this_frame_target = cpi->encode_command.target_frame_bits; + if (cpi->oxcf.use_simple_encode_api) { + if (cpi->encode_command.use_external_target_frame_bits) { + rc->this_frame_target = cpi->encode_command.target_frame_bits; + } } -#endif +#endif // CONFIG_RATE_CTRL // Target rate per SB64 (including partial SB64s. rc->sb64_target_rate = (int)(((int64_t)rc->this_frame_target * 64 * 64) / @@ -2536,26 +2538,25 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi, rc->min_gf_interval = FIXED_GF_INTERVAL; rc->static_scene_max_gf_interval = FIXED_GF_INTERVAL; } else { + double framerate = cpi->framerate; // Set Maximum gf/arf interval rc->max_gf_interval = oxcf->max_gf_interval; rc->min_gf_interval = oxcf->min_gf_interval; #if CONFIG_RATE_CTRL + if (oxcf->use_simple_encode_api) { + // In this experiment, we avoid framerate being changed dynamically during + // encoding. + framerate = oxcf->init_framerate; + } +#endif // CONFIG_RATE_CTRL if (rc->min_gf_interval == 0) { rc->min_gf_interval = vp9_rc_get_default_min_gf_interval( - oxcf->width, oxcf->height, oxcf->init_framerate); + oxcf->width, oxcf->height, framerate); } if (rc->max_gf_interval == 0) { - rc->max_gf_interval = vp9_rc_get_default_max_gf_interval( - oxcf->init_framerate, rc->min_gf_interval); + rc->max_gf_interval = + vp9_rc_get_default_max_gf_interval(framerate, rc->min_gf_interval); } -#else - if (rc->min_gf_interval == 0) - rc->min_gf_interval = vp9_rc_get_default_min_gf_interval( - oxcf->width, oxcf->height, cpi->framerate); - if (rc->max_gf_interval == 0) - rc->max_gf_interval = vp9_rc_get_default_max_gf_interval( - cpi->framerate, rc->min_gf_interval); -#endif // Extended max interval for genuinely static scenes like slide shows. 
rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH;
diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc
index 87727cb12a..6ba37a321c 100644
--- a/vp9/simple_encode.cc
+++ b/vp9/simple_encode.cc
@@ -793,6 +793,7 @@ static VP9EncoderConfig GetEncodeConfig(
 if (enc_pass == VPX_RC_FIRST_PASS) {
 oxcf.lag_in_frames = 0;
 }
+ oxcf.use_simple_encode_api = 1;
 return oxcf;
 }
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 438f9b5ed9..7697806ce0 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -652,6 +652,7 @@ static vpx_codec_err_t set_encoder_config(
 }
 if (get_level_index(oxcf->target_level) >= 0) config_target_level(oxcf);
+ oxcf->use_simple_encode_api = 0;
 // vp9_dump_encoder_config(oxcf, stderr);
 return VPX_CODEC_OK;
 }
@@ -2288,6 +2289,8 @@ void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf, FILE *fp) {
 DUMP_STRUCT_VALUE(fp, oxcf, row_mt);
 DUMP_STRUCT_VALUE(fp, oxcf, motion_vector_unit_test);
+ DUMP_STRUCT_VALUE(fp, oxcf, delta_q_uv);
+ DUMP_STRUCT_VALUE(fp, oxcf, use_simple_encode_api);
 }

 FRAME_INFO vp9_get_frame_info(const VP9EncoderConfig *oxcf) {
From a00c56373ea7d12bf6b1ab8060ccdb4af03c1794 Mon Sep 17 00:00:00 2001
From: Jerome Jiang
Date: Mon, 21 Jun 2021 17:22:51 -0700
Subject: [PATCH 118/926] rc: turn off gf constraint for external RC

Added a new flag in rate control which turns off the gf interval
constraint on key frame frequency for external RC. It remains on for
libvpx.

Change-Id: I18bb0d8247a421193f023619f906d0362b873b31
---
 test/ratectrl_rtc_test.cc | 7 ++-----
 vp9/encoder/vp9_ratectrl.c | 4 +++-
 vp9/encoder/vp9_ratectrl.h | 4 ++++
 vp9/ratectrl_rtc.cc | 6 ++----
 vp9/ratectrl_rtc.h | 2 --
 5 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/test/ratectrl_rtc_test.cc b/test/ratectrl_rtc_test.cc
index e9a9f15e9f..5e5a179b29 100644
--- a/test/ratectrl_rtc_test.cc
+++ b/test/ratectrl_rtc_test.cc
@@ -61,6 +61,7 @@ struct FrameInfo {
 // - Set rc_end_usage to VPX_VBR
 // - AQ Mode 0
 // - Disable vp9_compute_frame_low_motion in vp9_encoder.c
+// - Set rc->constrain_gf_key_freq_onepass_vbr = 0 in vp9_rc_init
 // examples/vpx_temporal_svc_encoder gipsrec_motion1.1280_720.yuv out vp9
 // 1280 720 1 30 7 0 0 1 0 1000
 //
@@ -192,8 +193,7 @@ class RcInterfaceTest : public ::testing::Test {
 for (size_t i = 0; i < kNumFrame; i++) {
 one_layer_file >> frame_info;
 if (frame_info.frame_id > 0) frame_params.frame_type = INTER_FRAME;
- if (frame_info.frame_id % rc_cfg_.key_freq == 0)
- frame_params.frame_type = KEY_FRAME;
+ if (frame_info.frame_id % 300 == 0) frame_params.frame_type = KEY_FRAME;
 ASSERT_EQ(frame_info.spatial_id, 0);
 ASSERT_EQ(frame_info.temporal_id, 0);
 rc_api_->ComputeQP(frame_params);
@@ -229,19 +229,16 @@ class RcInterfaceTest : public ::testing::Test {
 void SetConfigOneLayerCBR() {
 SetConfig();
 rc_cfg_.rc_mode = VPX_CBR;
- rc_cfg_.key_freq = 3000;
 }

 void SetConfigOneLayerVBR() {
 SetConfig();
 rc_cfg_.rc_mode = VPX_VBR;
- rc_cfg_.key_freq = 3000;
 }

 void SetConfigOneLayerVBRPeriodicKey() {
 SetConfig();
 rc_cfg_.rc_mode = VPX_VBR;
- rc_cfg_.key_freq = 300;
 }

 void SetConfigSVC() {
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index dbbd458c96..26e1d5381f 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -407,6 +407,7 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
 rc->source_alt_ref_active = 0;
 rc->frames_till_gf_update_due = 0;
+ rc->constrain_gf_key_freq_onepass_vbr = 1;
 rc->ni_av_qi = oxcf->worst_allowed_q;
 rc->ni_tot_qi = 0;
 rc->ni_frames = 0;
@@ -2083,7 +2084,8
@@ void vp9_set_gf_update_one_pass_vbr(VP9_COMP *const cpi) { rc->gfu_boost = DEFAULT_GF_BOOST >> 1; rc->af_ratio_onepass_vbr = VPXMIN(15, VPXMAX(5, 3 * rc->gfu_boost / 400)); } - adjust_gfint_frame_constraint(cpi, rc->frames_to_key); + if (rc->constrain_gf_key_freq_onepass_vbr) + adjust_gfint_frame_constraint(cpi, rc->frames_to_key); rc->frames_till_gf_update_due = rc->baseline_gf_interval; cpi->refresh_golden_frame = 1; rc->source_alt_ref_pending = 0; diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index bdddd2df8b..83a12cde73 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -207,6 +207,10 @@ typedef struct { int preserve_arf_as_gld; int preserve_next_arf_as_gld; int show_arf_as_gld; + + // Flag to constrain golden frame interval on key frame frequency for 1 pass + // VBR. + int constrain_gf_key_freq_onepass_vbr; } RATE_CONTROL; struct VP9_COMP; diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index 8455ca9a3d..2595a2bc07 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -46,7 +46,6 @@ void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) { oxcf->content = VP9E_CONTENT_DEFAULT; oxcf->drop_frames_water_mark = 0; cm->current_video_frame = 0; - oxcf->key_freq = rc_cfg.key_freq; rc->kf_boost = DEFAULT_KF_BOOST; UpdateRateControl(rc_cfg); @@ -60,7 +59,7 @@ void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) { rc->rc_2_frame = 0; vp9_rc_init_minq_luts(); vp9_rc_init(oxcf, 0, rc); - rc->frames_to_key = oxcf->key_freq; + rc->constrain_gf_key_freq_onepass_vbr = 0; cpi_->sf.use_nonrd_pick_mode = 1; } @@ -152,8 +151,7 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { target = vp9_calc_pframe_target_size_one_pass_cbr(cpi_); } else if (cpi_->oxcf.rc_mode == VPX_VBR) { if (cm->frame_type == KEY_FRAME) { - cpi_->rc.this_key_frame_forced = - cm->current_video_frame != 0 && cpi_->rc.frames_to_key == 0; + cpi_->rc.this_key_frame_forced = cm->current_video_frame != 0; cpi_->rc.frames_to_key = cpi_->oxcf.key_freq; } vp9_set_gf_update_one_pass_vbr(cpi_); diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index a1f2767126..c7c0505e33 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -51,8 +51,6 @@ struct VP9RateControlRtcConfig { int ts_rate_decimator[VPX_TS_MAX_LAYERS]; // vbr, cbr enum vpx_rc_mode rc_mode; - // key frame frequency - int key_freq; }; struct VP9FrameParamsQpRTC { From bd53f0cc9faefbca2dcb6b21b6849d5e24141c9c Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 24 Jun 2021 13:13:50 -0700 Subject: [PATCH 119/926] Add constructor to VP9RateControlRtcConfig Also add max_inter_bitrate_pct Change-Id: Ie2c0e7f1397ca0bb55214251906412cdf24e42e2 --- vp9/ratectrl_rtc.cc | 1 + vp9/ratectrl_rtc.h | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index 2595a2bc07..b38a0db9c0 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -90,6 +90,7 @@ void VP9RateControlRTC::UpdateRateControl( (rc_cfg.ts_number_layers > 1) ? 
rc_cfg.ts_number_layers : 0);
 cpi_->oxcf.rc_max_intra_bitrate_pct = rc_cfg.max_intra_bitrate_pct;
+ cpi_->oxcf.rc_max_inter_bitrate_pct = rc_cfg.max_inter_bitrate_pct;
 cpi_->framerate = rc_cfg.framerate;
 cpi_->svc.number_spatial_layers = rc_cfg.ss_number_layers;
 cpi_->svc.number_temporal_layers = rc_cfg.ts_number_layers;
diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h
index c7c0505e33..a30ec1da23 100644
--- a/vp9/ratectrl_rtc.h
+++ b/vp9/ratectrl_rtc.h
@@ -26,6 +26,36 @@ namespace libvpx {

 struct VP9RateControlRtcConfig {
+ public:
+ VP9RateControlRtcConfig() {
+ width = 1280;
+ height = 720;
+ max_quantizer = 63;
+ min_quantizer = 2;
+ target_bandwidth = 1000;
+ buf_initial_sz = 600;
+ buf_optimal_sz = 600;
+ buf_sz = 1000;
+ undershoot_pct = overshoot_pct = 50;
+ max_intra_bitrate_pct = 50;
+ max_inter_bitrate_pct = 0;
+ framerate = 30.0;
+ ss_number_layers = ts_number_layers = 1;
+ rc_mode = VPX_CBR;
+ vp9_zero(max_quantizers);
+ vp9_zero(min_quantizers);
+ vp9_zero(scaling_factor_den);
+ vp9_zero(scaling_factor_num);
+ vp9_zero(layer_target_bitrate);
+ vp9_zero(ts_rate_decimator);
+ scaling_factor_num[0] = 1;
+ scaling_factor_den[0] = 1;
+ layer_target_bitrate[0] = target_bandwidth;
+ max_quantizers[0] = max_quantizer;
+ min_quantizers[0] = min_quantizer;
+ ts_rate_decimator[0] = 1;
+ }
+
 int width;
 int height;
 // 0-63
@@ -38,6 +68,7 @@ struct VP9RateControlRtcConfig {
 int undershoot_pct;
 int overshoot_pct;
 int max_intra_bitrate_pct;
+ int max_inter_bitrate_pct;
 double framerate;
 // Number of spatial layers
 int ss_number_layers;
From 67bfbcfbf706766c841ac900b6cd2165c651983c Mon Sep 17 00:00:00 2001
From: Marco Paniconi
Date: Thu, 24 Jun 2021 23:34:36 -0700
Subject: [PATCH 120/926] vp9-rtc: Extract content dependency in cyclic refresh

For usage in the external RC. When content_mode = 0, the cyclic refresh
has no dependency on the content (motion, spatial variance, motion
vectors, etc.).

Compared to content_mode = 1, content_mode = 0 on the rtc set at speed 7
has some regression on some clips (~3-5%), but the overall/average
bdrate loss is about ~1-2%.

Comparing aq_mode=3 with content_mode = 0 vs aq_mode=3: about ~14%
avg/overall bdrate gain, but with ~3-7% regression on some hard motion
clips (e.g. m street).

Change-Id: I93117fabb8f7f89032c15baf1292b201e8c07362
---
 vp9/encoder/vp9_aq_cyclicrefresh.c | 23 +++++++++++++++++------
 vp9/encoder/vp9_aq_cyclicrefresh.h | 1 +
 vp9/encoder/vp9_encodeframe.c | 9 ++++++---
 vp9/encoder/vp9_encoder.c | 2 +-
 4 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index 858a416546..e6edf5a925 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -48,6 +48,7 @@ CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
 assert(MAXQ <= 255);
 memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size);
 cr->counter_encode_maxq_scene_change = 0;
+ cr->content_mode = 1;
 return cr;
 }
@@ -326,7 +327,8 @@ void vp9_cyclic_refresh_set_golden_update(VP9_COMP *const cpi) {
 else
 rc->baseline_gf_interval = 40;
 if (cpi->oxcf.rc_mode == VPX_VBR) rc->baseline_gf_interval = 20;
- if (rc->avg_frame_low_motion < 50 && rc->frames_since_key > 40)
+ if (rc->avg_frame_low_motion < 50 && rc->frames_since_key > 40 &&
+ cr->content_mode)
 rc->baseline_gf_interval = 10;
 }
@@ -388,7 +390,8 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
 ?
vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex) : vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex); // More aggressive settings for noisy content. - if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium) { + if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium && + cr->content_mode) { consec_zero_mv_thresh = 60; qindex_thresh = VPXMAX(vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex), @@ -409,7 +412,7 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { #if CONFIG_VP9_HIGHBITDEPTH if (cpi->common.use_highbitdepth) compute_content = 0; #endif - if (cpi->Last_Source == NULL || + if (cr->content_mode == 0 || cpi->Last_Source == NULL || cpi->Last_Source->y_width != cpi->Source->y_width || cpi->Last_Source->y_height != cpi->Source->y_height) compute_content = 0; @@ -430,7 +433,8 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { // reset to 0 later depending on the coding mode. if (cr->map[bl_index2] == 0) { count_tot++; - if (cr->last_coded_q_map[bl_index2] > qindex_thresh || + if (cr->content_mode == 0 || + cr->last_coded_q_map[bl_index2] > qindex_thresh || cpi->consec_zero_mv[bl_index2] < consec_zero_mv_thresh_block) { sum_map++; count_sel++; @@ -489,7 +493,8 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { rc->avg_frame_qindex[INTER_FRAME] < qp_thresh || (cpi->use_svc && cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) || - (!cpi->use_svc && rc->avg_frame_low_motion < thresh_low_motion && + (!cpi->use_svc && cr->content_mode && + rc->avg_frame_low_motion < thresh_low_motion && rc->frames_since_key > 40) || (!cpi->use_svc && rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh && rc->frames_since_key > 20)) { @@ -528,7 +533,7 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { cr->percent_refresh = (cr->skip_flat_static_blocks) ? 5 : 10; // Increase the amount of refresh on scene change that is encoded at max Q, // increase for a few cycles of the refresh period (~100 / percent_refresh). - if (cr->counter_encode_maxq_scene_change < 30) + if (cr->content_mode && cr->counter_encode_maxq_scene_change < 30) cr->percent_refresh = (cr->skip_flat_static_blocks) ? 10 : 15; cr->rate_ratio_qdelta = 2.0; cr->rate_boost_fac = 10; @@ -575,6 +580,12 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { (double)(cr->actual_num_seg1_blocks + cr->actual_num_seg2_blocks) / num8x8bl; cr->weight_segment = weight_segment; + if (cr->content_mode == 0) { + cr->actual_num_seg1_blocks = + cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100; + cr->actual_num_seg2_blocks = 0; + cr->weight_segment = (double)(cr->actual_num_seg1_blocks) / num8x8bl; + } } // Setup cyclic background refresh: set delta q and segmentation map. 
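In outline, the change threads one flag through each content-dependent
heuristic in the file above. A minimal self-contained sketch of the
recurring pattern (not libvpx code; CyclicRefreshSketch and
BaselineGfInterval are illustrative stand-ins, modeled on the
vp9_cyclic_refresh_set_golden_update() hunk):

    struct CyclicRefreshSketch { int content_mode; };  // stand-in for CYCLIC_REFRESH
    static int BaselineGfInterval(const CyclicRefreshSketch *cr,
                                  int avg_frame_low_motion, int frames_since_key) {
      int interval = 40;  // content-independent default
      // The content-based shortening now applies only when content_mode is on.
      if (avg_frame_low_motion < 50 && frames_since_key > 40 && cr->content_mode)
        interval = 10;
      return interval;
    }

The inverse form (cr->content_mode == 0 || ...) is used where a content
computation must be bypassed entirely, as in cyclic_refresh_update_map()
above.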
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.h b/vp9/encoder/vp9_aq_cyclicrefresh.h
index b6d7fdeae7..c74cee4743 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.h
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.h
@@ -70,6 +70,7 @@ struct CYCLIC_REFRESH {
   int apply_cyclic_refresh;
   int counter_encode_maxq_scene_change;
   int skip_flat_static_blocks;
+  int content_mode;
 };

 struct VP9_COMP;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 00855319d6..969fad59b1 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1842,7 +1842,8 @@ static void update_state(VP9_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx,
   }
   // Else for cyclic refresh mode update the segment map, set the segment id
   // and then update the quantizer.
-  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+      cpi->cyclic_refresh->content_mode) {
     vp9_cyclic_refresh_update_segment(cpi, xd->mi[0], mi_row, mi_col, bsize,
                                       ctx->rate, ctx->dist, x->skip, p);
   }
@@ -2539,7 +2540,8 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td,

   if (seg->enabled && (cpi->oxcf.aq_mode != NO_AQ || cpi->roi.enabled)) {
     // Setting segmentation map for cyclic_refresh.
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+        cpi->cyclic_refresh->content_mode) {
       vp9_cyclic_refresh_update_segment(cpi, mi, mi_row, mi_col, bsize,
                                         ctx->rate, ctx->dist, x->skip, p);
     } else {
@@ -6716,7 +6718,8 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
   ++td->counts->tx.tx_totals[mi->tx_size];
   ++td->counts->tx.tx_totals[get_uv_tx_size(mi, &xd->plane[1])];
-  if (cm->seg.enabled && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+  if (cm->seg.enabled && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+      cpi->cyclic_refresh->content_mode)
     vp9_cyclic_refresh_update_sb_postencode(cpi, mi, mi_row, mi_col, bsize);
   if (cpi->oxcf.pass == 0 && cpi->svc.temporal_layer_id == 0 &&
       (!cpi->use_svc ||
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index bbd6dd030d..1af83e405a 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -4208,7 +4208,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,

   // Update some stats from cyclic refresh, and check for golden frame update.
   if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
-      !frame_is_intra_only(cm))
+      !frame_is_intra_only(cm) && cpi->cyclic_refresh->content_mode)
     vp9_cyclic_refresh_postencode(cpi);

   // Update the skip mb flag probabilities based on the distribution

From fe1c7d2d8cce13d9cd1edfe11a6703e5521ae561 Mon Sep 17 00:00:00 2001
From: Cheng Chen
Date: Thu, 17 Jun 2021 15:36:18 -0700
Subject: [PATCH 121/926] Disallow skipping transform and quantization

The encoder has a feature to skip transform and quantization based on
model rd analysis. It can happen that the model-based analysis lets the
encoder skip transform and quantization while the prediction is bad,
leading to badly reconstructed blocks that are visually intrusive and
read as obvious coding errors.

We add a speed feature to guard the skipping feature. Due to the risk
of bad perceptual quality, we disallow such skipping by default.

On the hdres test set at speed 2, the coding performance difference is
0.025% and the speed difference is 1.2%, both of which can be
considered insignificant.
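For reference, a rough self-contained sketch of how the new flag gates
the model's skip decision (not the actual encoder code; the enum values
mirror vp9/encoder/vp9_block.h and the logic mirrors block_rd_txfm() in
the diff below):

    // SKIP_TXFM_NONE:    run the full forward transform and quantization.
    // SKIP_TXFM_AC_DC:   skip the transform entirely and take the SSE as the
    //                    distortion (the risky case removed by this change).
    // SKIP_TXFM_AC_ONLY: model the AC cost, with a DC correction.
    enum SkipTxfm { SKIP_TXFM_NONE = 0, SKIP_TXFM_AC_DC = 1, SKIP_TXFM_AC_ONLY = 2 };

    static SkipTxfm GateSkipDecision(SkipTxfm model_decision,
                                     int allow_skip_txfm_ac_dc) {
      // Default (allow_skip_txfm_ac_dc == 0): never trust the model enough to
      // skip the transform entirely; fall back to the full txfm + quant path.
      if (!allow_skip_txfm_ac_dc && model_decision == SKIP_TXFM_AC_DC)
        return SKIP_TXFM_NONE;
      return model_decision;
    }

With the flag left at its default of 0, SKIP_TXFM_AC_DC becomes
unreachable, which is why the old handling of that case is replaced by
an assert in the diff.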
BUG=webm:1729 Change-Id: I48af01ae8dcc7a76c05c695f3f3e68b866c89574 --- vp9/encoder/vp9_block.h | 3 +++ vp9/encoder/vp9_rdopt.c | 23 ++++++++++------------- vp9/encoder/vp9_speed_features.c | 1 + vp9/encoder/vp9_speed_features.h | 6 ++++++ 4 files changed, 20 insertions(+), 13 deletions(-) diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 37a4605ad8..20294b4b94 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -157,6 +157,9 @@ struct macroblock { // skip forward transform and quantization uint8_t skip_txfm[MAX_MB_PLANE << 2]; #define SKIP_TXFM_NONE 0 +// TODO(chengchen): consider remove SKIP_TXFM_AC_DC from vp9 completely +// since it increases risks of bad perceptual quality. +// https://crbug.com/webm/1729 #define SKIP_TXFM_AC_DC 1 #define SKIP_TXFM_AC_ONLY 2 diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 37de4e4839..a1687dcf46 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -745,8 +745,8 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, MODE_INFO *const mi = xd->mi[0]; int64_t rd1, rd2, rd; int rate; - int64_t dist; - int64_t sse; + int64_t dist = INT64_MAX; + int64_t sse = INT64_MAX; const int coeff_ctx = combine_entropy_contexts(args->t_left[blk_row], args->t_above[blk_col]); struct buf_2d *recon = args->this_recon; @@ -799,6 +799,13 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, if (max_txsize_lookup[plane_bsize] == tx_size) skip_txfm_flag = x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))]; + // This reduces the risk of bad perceptual quality due to bad prediction. + // We always force the encoder to perform transform and quantization. + if (!args->cpi->sf.allow_skip_txfm_ac_dc && + skip_txfm_flag == SKIP_TXFM_AC_DC) { + skip_txfm_flag = SKIP_TXFM_NONE; + } + if (skip_txfm_flag == SKIP_TXFM_NONE || (recon && skip_txfm_flag == SKIP_TXFM_AC_ONLY)) { // full forward transform and quantization @@ -827,17 +834,7 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, dist = VPXMAX(0, sse - dc_correct); } } else { - // SKIP_TXFM_AC_DC - // skip forward transform. Because this is handled here, the quantization - // does not need to do it. - x->plane[plane].eobs[block] = 0; - sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4; - dist = sse; - if (recon) { - uint8_t *rec_ptr = &recon->buf[4 * (blk_row * recon->stride + blk_col)]; - copy_block_visible(xd, pd, dst, dst_stride, rec_ptr, recon->stride, - blk_row, blk_col, plane_bsize, tx_bsize); - } + assert(0 && "allow_skip_txfm_ac_dc does not allow SKIP_TXFM_AC_DC."); } } diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 585c9604c6..fc7a67c9f1 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -940,6 +940,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) { sf->enable_tpl_model = oxcf->enable_tpl_model; sf->prune_ref_frame_for_rect_partitions = 0; sf->temporal_filter_search_method = MESH; + sf->allow_skip_txfm_ac_dc = 0; for (i = 0; i < TX_SIZES; i++) { sf->intra_y_mode_mask[i] = INTRA_ALL; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index ca284ded82..5ea04709ec 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -612,6 +612,12 @@ typedef struct SPEED_FEATURES { // For real-time mode: force DC only under intra search when content // does not have high souce SAD. 
   int rt_intra_dc_only_low_content;
+
+  // The encoder has a feature that skips forward transform and quantization
+  // based on a model rd estimation to reduce encoding time.
+  // However, this feature is dangerous since it could lead to bad perceptual
+  // quality. This flag is added to guard the feature.
+  int allow_skip_txfm_ac_dc;
 } SPEED_FEATURES;

 struct VP9_COMP;

From 5f345a9246b71374cbedeb28b0e0b0101701732a Mon Sep 17 00:00:00 2001
From: "Jorge E. Moreira"
Date: Wed, 30 Jun 2021 11:33:51 -0700
Subject: [PATCH 122/926] Avoid overflow in calc_iframe_target_size

The product in the changed code was observed to multiply 1800 by
2500000, which overflows an unsigned 32-bit integer. Converting to
unsigned 64 bits first and checking whether the final result fits in
32 bits solves the problem.

BUG=b:179686142

Change-Id: I5d27317bf14b0311b739144c451d8e172db01945
---
 vp8/encoder/ratectrl.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index ba124c359e..d2b8dff06a 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -349,8 +349,12 @@ static void calc_iframe_target_size(VP8_COMP *cpi) {
   }

   if (cpi->oxcf.rc_max_intra_bitrate_pct) {
-    unsigned int max_rate =
-        cpi->per_frame_bandwidth * cpi->oxcf.rc_max_intra_bitrate_pct / 100;
+    unsigned int max_rate;
+    // This product may overflow unsigned int
+    uint64_t product = cpi->per_frame_bandwidth;
+    product *= cpi->oxcf.rc_max_intra_bitrate_pct;
+    product /= 100;
+    max_rate = (unsigned int)VPXMIN(INT_MAX, product);

     if (target > max_rate) target = max_rate;
   }

From 350b0b47f2b126ae33607002590d58aca18033bc Mon Sep 17 00:00:00 2001
From: James Zern
Date: Thu, 1 Jul 2021 22:16:42 -0700
Subject: [PATCH 123/926] ratectrl_rtc.h: quiet MSVC int64_t->int conv warning

target_bandwidth is int64_t, but layer_target_bitrate[0] is an int.
This is safe in the only place it's set because target_bandwidth
defaults to 1000.

target_bandwidth is later used to populate the cpi's target, which is
an unsigned int, so there may be further fixes/cleanups that can be
done.
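As a minimal illustration of the pattern (a hypothetical snippet, not
libvpx code; the exact MSVC warning number is an assumption):

    #include <cstdint>
    // MSVC emits a C4244-style "conversion ... possible loss of data"
    // warning on the implicit 64-bit to 32-bit narrowing; the explicit
    // static_cast documents the (here safe) narrowing and silences it.
    static int FirstLayerBitrate() {
      int64_t target_bandwidth = 1000;  // default set in the constructor
      return static_cast<int>(target_bandwidth);
    }

The cast is only safe because of the 1000 default noted above; a config
that set target_bandwidth above INT_MAX would still truncate.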
Change-Id: I35dbaa2e55a0fca22e0e2680dcac9ea4c6b2815a --- vp9/ratectrl_rtc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index a30ec1da23..f219f24500 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -50,7 +50,7 @@ struct VP9RateControlRtcConfig { vp9_zero(ts_rate_decimator); scaling_factor_num[0] = 1; scaling_factor_den[0] = 1; - layer_target_bitrate[0] = target_bandwidth; + layer_target_bitrate[0] = static_cast(target_bandwidth); max_quantizers[0] = max_quantizer; min_quantizers[0] = min_quantizer; ts_rate_decimator[0] = 1; From c64022fa3c3019da117fbafbe80535a9ffbd8163 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 2 Jul 2021 11:28:48 -0700 Subject: [PATCH 124/926] Add codec control to get loopfilter level Change-Id: I70d417da900082160e7ba53315af98eceede257c --- vp9/vp9_cx_iface.c | 9 +++++++++ vpx/vp8cx.h | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 7697806ce0..2c09a3992d 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -834,6 +834,14 @@ static vpx_codec_err_t ctrl_get_quantizer64(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_get_loopfilter_level(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return VPX_CODEC_INVALID_PARAM; + *arg = ctx->cpi->common.lf.filter_level; + return VPX_CODEC_OK; +} + static vpx_codec_err_t update_extra_cfg(vpx_codec_alg_priv_t *ctx, const struct vp9_extracfg *extra_cfg) { const vpx_codec_err_t res = validate_config(ctx, &ctx->cfg, extra_cfg); @@ -1967,6 +1975,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { // Getters { VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer }, { VP8E_GET_LAST_QUANTIZER_64, ctrl_get_quantizer64 }, + { VP9E_GET_LOOPFILTER_LEVEL, ctrl_get_loopfilter_level }, { VP9_GET_REFERENCE, ctrl_get_reference }, { VP9E_GET_SVC_LAYER_ID, ctrl_get_svc_layer_id }, { VP9E_GET_ACTIVEMAP, ctrl_get_active_map }, diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 37ad07d33d..a5dd324b70 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -712,6 +712,12 @@ enum vp8e_enc_control_id { * Supported in codecs: VP9 */ VP9E_SET_EXTERNAL_RATE_CONTROL, + + /*!\brief Codec control function to get loopfilter level in the encoder. + * + * Supported in codecs: VP9 + */ + VP9E_GET_LOOPFILTER_LEVEL, }; /*!\brief vpx 1-D scaling mode @@ -1037,6 +1043,9 @@ VPX_CTRL_USE_TYPE(VP9E_SET_ROW_MT, unsigned int) VPX_CTRL_USE_TYPE(VP9E_GET_LEVEL, int *) #define VPX_CTRL_VP9E_GET_LEVEL +VPX_CTRL_USE_TYPE(VP9E_GET_LOOPFILTER_LEVEL, int *) +#define VPX_CTRL_VP9E_GET_LOOPFILTER_LEVEL + VPX_CTRL_USE_TYPE(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, unsigned int) #define VPX_CTRL_VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST From df7dc31cdfaa81e20fd0f4aed4c5eff037f484c4 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 8 Jul 2021 15:08:05 -0700 Subject: [PATCH 125/926] Document vpx_img_set_rect() more precisely Document the side effects and return value of vpx_img_set_rect() more precisely. 
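A short usage sketch of the documented behavior (assumes an image
allocated with vpx_img_alloc(); error handling is condensed and the
function name is illustrative):

    #include "vpx/vpx_image.h"
    static void CropExample(void) {
      vpx_image_t img;
      if (!vpx_img_alloc(&img, VPX_IMG_FMT_I420, 640, 480, 16)) return;
      // On success this rewrites img.d_w, img.d_h and the img.planes[]
      // pointers to describe the 320x240 viewport anchored at (0, 0).
      if (vpx_img_set_rect(&img, 0, 0, 320, 240) != 0) {
        // Rectangle is out of bounds; the previous viewport is kept.
      }
      vpx_img_free(&img);
    }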
Change-Id: Id1120bc478ff090a70b4ddd23c4798026bbefe10
---
 vpx/vpx_image.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vpx/vpx_image.h b/vpx/vpx_image.h
index bc23be50c5..1adc9b9d9e 100644
--- a/vpx/vpx_image.h
+++ b/vpx/vpx_image.h
@@ -171,7 +171,8 @@ vpx_image_t *vpx_img_wrap(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w,
 /*!\brief Set the rectangle identifying the displayed portion of the image
  *
  * Updates the displayed rectangle (aka viewport) on the image surface to
- * match the specified coordinates and size.
+ * match the specified coordinates and size. Specifically, sets img->d_w,
+ * img->d_h, and elements of the img->planes[] array.
  *
  * \param[in] img Image descriptor
  * \param[in] x leftmost column
@@ -179,7 +180,7 @@ vpx_image_t *vpx_img_wrap(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w,
  * \param[in] w width
  * \param[in] h height
  *
- * \return 0 if the requested rectangle is valid, nonzero otherwise.
+ * \return 0 if the requested rectangle is valid, nonzero (-1) otherwise.
  */
 int vpx_img_set_rect(vpx_image_t *img, unsigned int x, unsigned int y,
                      unsigned int w, unsigned int h);

From 69fc604636f740a57482f3898c2527d29663ee6d Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang
Date: Thu, 8 Jul 2021 15:17:48 -0700
Subject: [PATCH 126/926] Check for addition overflows in vpx_img_set_rect()

Check for x + w and y + h overflows in vpx_img_set_rect(). Move the
declaration of the local variable 'data' to the block it is used in.

Change-Id: I6bda875e1853c03135ec6ce29015bcc78bb8b7ba
---
 vpx/src/vpx_image.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/vpx/src/vpx_image.c b/vpx/src/vpx_image.c
index 2a7afc00c2..f9f0dd6025 100644
--- a/vpx/src/vpx_image.c
+++ b/vpx/src/vpx_image.c
@@ -8,6 +8,7 @@
  * be found in the AUTHORS file in the root of the source tree.
  */

+#include <limits.h>
 #include <stdlib.h>
 #include <string.h>

@@ -152,9 +153,8 @@ vpx_image_t *vpx_img_wrap(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w,

 int vpx_img_set_rect(vpx_image_t *img, unsigned int x, unsigned int y,
                      unsigned int w, unsigned int h) {
-  unsigned char *data;
-
-  if (x + w <= img->w && y + h <= img->h) {
+  if (x <= UINT_MAX - w && x + w <= img->w && y <= UINT_MAX - h &&
+      y + h <= img->h) {
     img->d_w = w;
     img->d_h = h;

@@ -165,7 +165,7 @@ int vpx_img_set_rect(vpx_image_t *img, unsigned int x, unsigned int y,
     } else {
       const int bytes_per_sample =
           (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
-      data = img->img_data;
+      unsigned char *data = img->img_data;

       if (img->fmt & VPX_IMG_FMT_HAS_ALPHA) {
         img->planes[VPX_PLANE_ALPHA] =

From 76ad30b6fb85f1462b28323220960d165d167e78 Mon Sep 17 00:00:00 2001
From: Jerome Jiang
Date: Tue, 13 Jul 2021 11:54:34 -0700
Subject: [PATCH 127/926] Add codec control for rtc external ratectrl lib

This will do 3 things:
- Turn off low motion computation
- Turn off gf update constraint on key frame frequency
- Turn off content mode for cyclic refresh

Those are used to verify the external ratectrl lib works as expected.
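For example, an application driving the external rate controller could
apply the control right after encoder init (a sketch against the public
API; the encoder context setup is assumed, not shown):

    #include "vpx/vp8cx.h"
    #include "vpx/vpx_encoder.h"
    static void EnableExternalRc(vpx_codec_ctx_t *ctx) {
      // Assumes vpx_codec_enc_init() on ctx succeeded with the VP9 encoder.
      // Disable the three internal features so the encoder's rate control
      // matches the standalone ratectrl_rtc implementation frame for frame.
      if (vpx_codec_control(ctx, VP9E_SET_RTC_EXTERNAL_RATECTRL, 1) !=
          VPX_CODEC_OK) {
        // handle/report the error
      }
    }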
Change-Id: Ic6e61498de82d6b3973e58df246cf5e05f838680 --- vp9/encoder/vp9_encoder.c | 4 +++- vp9/encoder/vp9_encoder.h | 2 ++ vp9/vp9_cx_iface.c | 13 +++++++++++++ vpx/vp8cx.h | 17 +++++++++++++++++ 4 files changed, 35 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 1af83e405a..f50b979979 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2304,6 +2304,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, cm, cm->frame_contexts, (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts))); + cpi->compute_frame_low_motion_onepass = 1; cpi->use_svc = 0; cpi->resize_state = ORIG; cpi->external_resize = 0; @@ -5747,7 +5748,8 @@ static void encode_frame_to_data_rate( vp9_rc_postencode_update(cpi, *size); - if (oxcf->pass == 0 && !frame_is_intra_only(cm) && + if (cpi->compute_frame_low_motion_onepass && oxcf->pass == 0 && + !frame_is_intra_only(cm) && (!cpi->use_svc || (cpi->use_svc && !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 65a1d33286..ea2d59e1b5 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -961,6 +961,8 @@ typedef struct VP9_COMP { int compute_source_sad_onepass; + int compute_frame_low_motion_onepass; + LevelConstraint level_constraint; uint8_t *count_arf_frame_usage; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 2c09a3992d..906f2b0b85 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1039,6 +1039,18 @@ static vpx_codec_err_t ctrl_set_row_mt(vpx_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static vpx_codec_err_t ctrl_set_rtc_external_ratectrl(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + const unsigned int data = va_arg(args, unsigned int); + if (data) { + cpi->compute_frame_low_motion_onepass = 0; + cpi->rc.constrain_gf_key_freq_onepass_vbr = 0; + cpi->cyclic_refresh->content_mode = 0; + } + return VPX_CODEC_OK; +} + static vpx_codec_err_t ctrl_enable_motion_vector_unit_test( vpx_codec_alg_priv_t *ctx, va_list args) { struct vp9_extracfg extra_cfg = ctx->extra_cfg; @@ -1970,6 +1982,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP9E_SET_SVC_SPATIAL_LAYER_SYNC, ctrl_set_svc_spatial_layer_sync }, { VP9E_SET_DELTA_Q_UV, ctrl_set_delta_q_uv }, { VP9E_SET_DISABLE_LOOPFILTER, ctrl_set_disable_loopfilter }, + { VP9E_SET_RTC_EXTERNAL_RATECTRL, ctrl_set_rtc_external_ratectrl }, { VP9E_SET_EXTERNAL_RATE_CONTROL, ctrl_set_external_rate_control }, // Getters diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index a5dd324b70..011dfcba52 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -713,6 +713,20 @@ enum vp8e_enc_control_id { */ VP9E_SET_EXTERNAL_RATE_CONTROL, + /*!\brief Codec control to disable internal features in rate control. + * + * This will do 3 things, only for 1 pass: + * - Turn off low motion computation + * - Turn off gf update constraint on key frame frequency + * - Turn off content mode for cyclic refresh + * + * With those, the rate control is expected to work exactly the same as the + * interface provided in ratectrl_rtc.cc/h + * + * Supported in codecs: VP9 + */ + VP9E_SET_RTC_EXTERNAL_RATECTRL, + /*!\brief Codec control function to get loopfilter level in the encoder. 
* * Supported in codecs: VP9 @@ -1077,6 +1091,9 @@ VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, int) VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_LOOPFILTER, int) #define VPX_CTRL_VP9E_SET_DISABLE_LOOPFILTER +VPX_CTRL_USE_TYPE(VP9E_SET_RTC_EXTERNAL_RATECTRL, int) +#define VPX_CTRL_VP9E_SET_RTC_EXTERNAL_RATECTRL + VPX_CTRL_USE_TYPE(VP9E_SET_EXTERNAL_RATE_CONTROL, vpx_rc_funcs_t *) #define VPX_CTRL_VP9E_SET_EXTERNAL_RATE_CONTROL From b1f2532b4d670b2286f50d240992e58402a70aea Mon Sep 17 00:00:00 2001 From: Bohan Li Date: Thu, 15 Jul 2021 13:21:35 -0700 Subject: [PATCH 128/926] Avoid chroma resampling for 420mpeg2 input BUG=aomedia:3080 Change-Id: I4ed81abf4b799224085485560f675c10c318cde6 --- y4minput.c | 31 ++----------------------------- 1 file changed, 2 insertions(+), 29 deletions(-) diff --git a/y4minput.c b/y4minput.c index f923eda34a..9a4bdbd7b5 100644 --- a/y4minput.c +++ b/y4minput.c @@ -285,26 +285,6 @@ static void y4m_42xmpeg2_42xjpeg_helper(unsigned char *_dst, } } -/*Handles both 422 and 420mpeg2 to 422jpeg and 420jpeg, respectively.*/ -static void y4m_convert_42xmpeg2_42xjpeg(y4m_input *_y4m, unsigned char *_dst, - unsigned char *_aux) { - int c_w; - int c_h; - int c_sz; - int pli; - /*Skip past the luma data.*/ - _dst += _y4m->pic_w * _y4m->pic_h; - /*Compute the size of each chroma plane.*/ - c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h; - c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v; - c_sz = c_w * c_h; - for (pli = 1; pli < 3; pli++) { - y4m_42xmpeg2_42xjpeg_helper(_dst, _aux, c_w, c_h); - _dst += c_sz; - _aux += c_sz; - } -} - /*This format is only used for interlaced content, but is included for completeness. @@ -889,7 +869,8 @@ int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer, y4m_ctx->aux_buf = NULL; y4m_ctx->dst_buf = NULL; if (strcmp(y4m_ctx->chroma_type, "420") == 0 || - strcmp(y4m_ctx->chroma_type, "420jpeg") == 0) { + strcmp(y4m_ctx->chroma_type, "420jpeg") == 0 || + strcmp(y4m_ctx->chroma_type, "420mpeg2") == 0) { y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v = y4m_ctx->dst_c_dec_v = 2; y4m_ctx->dst_buf_read_sz = @@ -934,14 +915,6 @@ int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer, fprintf(stderr, "Unsupported conversion from 420p12 to 420jpeg\n"); return -1; } - } else if (strcmp(y4m_ctx->chroma_type, "420mpeg2") == 0) { - y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v = - y4m_ctx->dst_c_dec_v = 2; - y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; - /*Chroma filter required: read into the aux buf first.*/ - y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = - 2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2); - y4m_ctx->convert = y4m_convert_42xmpeg2_42xjpeg; } else if (strcmp(y4m_ctx->chroma_type, "420paldv") == 0) { y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v = y4m_ctx->dst_c_dec_v = 2; From f9b565f7ecebb5f76c8d406e35dba1bd25a6398d Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 12 Jul 2021 14:04:12 -0700 Subject: [PATCH 129/926] Refactor rtc rate control test Remove golden files. Run actual encoding as the ground truth. 
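In outline, each test now encodes the clip for real and asserts that the
standalone rate controller reproduces the encoder's own decisions;
condensed from PostEncodeFrameHook() in the diff below:

    rc_api_->ComputeQP(frame_params_);
    int qp, loopfilter_level;
    encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp);
    encoder->Control(VP9E_GET_LOOPFILTER_LEVEL, &loopfilter_level);
    // The external rate controller must match the encoder exactly.
    ASSERT_EQ(rc_api_->GetQP(), qp);
    ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level);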
Change-Id: I1cea001278c1e9409bb02d33823cf69192c790a4 --- libs.mk | 6 + test/ratectrl_rtc_test.cc | 418 ++++++++++++++++++++++---------------- test/test-data.mk | 4 - test/test-data.sha1 | 4 - test/test.mk | 5 + vp9/ratectrl_rtc.h | 17 +- 6 files changed, 264 insertions(+), 190 deletions(-) diff --git a/libs.mk b/libs.mk index d05eee966d..f5b43abadc 100644 --- a/libs.mk +++ b/libs.mk @@ -493,10 +493,12 @@ TEST_INTRA_PRED_SPEED_SRCS=$(call addprefix_clean,test/,\ $(call enabled,TEST_INTRA_PRED_SPEED_SRCS)) TEST_INTRA_PRED_SPEED_OBJS := $(sort $(call objs,$(TEST_INTRA_PRED_SPEED_SRCS))) +ifeq ($(CONFIG_VP9_ENCODER),yes) RC_INTERFACE_TEST_BIN=./test_rc_interface$(EXE_SFX) RC_INTERFACE_TEST_SRCS=$(call addprefix_clean,test/,\ $(call enabled,RC_INTERFACE_TEST_SRCS)) RC_INTERFACE_TEST_OBJS := $(sort $(call objs,$(RC_INTERFACE_TEST_SRCS))) +endif SIMPLE_ENCODE_TEST_BIN=./test_simple_encode$(EXE_SFX) SIMPLE_ENCODE_TEST_SRCS=$(call addprefix_clean,test/,\ @@ -597,6 +599,7 @@ test_intra_pred_speed.$(VCPROJ_SFX): $(TEST_INTRA_PRED_SPEED_SRCS) vpx.$(VCPROJ_ -L. -l$(CODEC_LIB) -l$(GTEST_LIB) $^ endif # TEST_INTRA_PRED_SPEED +ifeq ($(CONFIG_VP9_ENCODER),yes) ifneq ($(strip $(RC_INTERFACE_TEST_OBJS)),) PROJECTS-$(CONFIG_MSVS) += test_rc_interface.$(VCPROJ_SFX) test_rc_interface.$(VCPROJ_SFX): $(RC_INTERFACE_TEST_SRCS) vpx.$(VCPROJ_SFX) \ @@ -616,6 +619,7 @@ test_rc_interface.$(VCPROJ_SFX): $(RC_INTERFACE_TEST_SRCS) vpx.$(VCPROJ_SFX) \ -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \ -L. -l$(CODEC_LIB) -l$(RC_RTC_LIB) -l$(GTEST_LIB) $^ endif # RC_INTERFACE_TEST +endif # CONFIG_VP9_ENCODER endif else @@ -657,6 +661,7 @@ $(eval $(call linkerxx_template,$(TEST_INTRA_PRED_SPEED_BIN), \ -L. -lvpx -lgtest $(extralibs) -lm)) endif # TEST_INTRA_PRED_SPEED +ifeq ($(CONFIG_VP9_ENCODER),yes) ifneq ($(strip $(RC_INTERFACE_TEST_OBJS)),) $(RC_INTERFACE_TEST_OBJS) $(RC_INTERFACE_TEST_OBJS:.o=.d): \ CXXFLAGS += $(GTEST_INCLUDES) @@ -668,6 +673,7 @@ $(eval $(call linkerxx_template,$(RC_INTERFACE_TEST_BIN), \ $(RC_INTERFACE_TEST_OBJS) \ -L. -lvpx -lgtest -lvp9rc $(extralibs) -lm)) endif # RC_INTERFACE_TEST +endif # CONFIG_VP9_ENCODER ifneq ($(strip $(SIMPLE_ENCODE_TEST_OBJS)),) $(SIMPLE_ENCODE_TEST_OBJS) $(SIMPLE_ENCODE_TEST_OBJS:.o=.d): \ diff --git a/test/ratectrl_rtc_test.cc b/test/ratectrl_rtc_test.cc index 5e5a179b29..1414377082 100644 --- a/test/ratectrl_rtc_test.cc +++ b/test/ratectrl_rtc_test.cc @@ -16,6 +16,7 @@ #include "third_party/googletest/src/include/gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" +#include "test/i420_video_source.h" #include "test/util.h" #include "test/video_source.h" #include "vpx/vpx_codec.h" @@ -23,188 +24,89 @@ namespace { -const size_t kNumFrame = 850; +const size_t kNumFrames = 300; -struct FrameInfo { - friend std::istream &operator>>(std::istream &is, FrameInfo &info) { - is >> info.frame_id >> info.spatial_id >> info.temporal_id >> info.base_q >> - info.target_bandwidth >> info.buffer_level >> info.filter_level_ >> - info.bytes_used; - return is; - } - - int frame_id; - int spatial_id; - int temporal_id; - // Base QP - int base_q; - size_t target_bandwidth; - size_t buffer_level; - // Loopfilter level - int filter_level_; - // Frame size for current frame, used for pose encode update - size_t bytes_used; -}; +const int kTemporalId[4] = { 0, 2, 1, 2 }; -// This test runs the rate control interface and compare against ground truth -// generated by encoders. 
-// Settings for the encoder: -// For 1 layer CBR: -// - AQ_Mode 0 -// - Disable golden refresh -// - Bitrate x 2 at frame/superframe 200 -// - Bitrate / 4 at frame/superframe 400 -// examples/vpx_temporal_svc_encoder gipsrec_motion1.1280_720.yuv out vp9 -// 1280 720 1 30 7 0 0 1 0 1000 -// -// For 1 layer VBR: -// - Set rc_end_usage to VPX_VBR -// - AQ Mode 0 -// - Disable vp9_compute_frame_low_motion in vp9_encoder.c -// - Set rc->constrain_gf_key_freq_onepass_vbr = 0 in vp9_rc_init -// examples/vpx_temporal_svc_encoder gipsrec_motion1.1280_720.yuv out vp9 -// 1280 720 1 30 7 0 0 1 0 1000 -// -// For SVC (3 temporal layers, 3 spatial layers): -// - AQ_Mode 0 -// - Disable golden refresh -// - Bitrate x 2 at frame/superframe 200 -// - Bitrate / 4 at frame/superframe 400 -// examples/vp9_spatial_svc_encoder -f 10000 -w 1280 -h 720 -t 1/30 -sl 3 -// -k 10000 -bl 100,140,200,250,350,500,450,630,900 -b 1600 --rc-end-usage=1 -// --lag-in-frames=0 --passes=1 --speed=7 --threads=1 -// --temporal-layering-mode=3 -aq 1 -rcstat 1 -// gipsrec_motion1.1280_720.yuv -o out.webm -// -// The generated file includes: -// frame number, spatial layer ID, temporal layer ID, base QP, target -// bandwidth, buffer level, loopfilter level, encoded frame size -// TODO(jianj): Remove golden files, and run actual encoding in this test. -class RcInterfaceTest : public ::testing::Test { +class RcInterfaceTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params { public: - explicit RcInterfaceTest() {} + RcInterfaceTest() + : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000), + encoder_exit_(false) {} virtual ~RcInterfaceTest() {} protected: - void RunOneLayerCBR() { - SetConfigOneLayerCBR(); - rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); - FrameInfo frame_info; - libvpx::VP9FrameParamsQpRTC frame_params; - frame_params.frame_type = KEY_FRAME; - frame_params.spatial_layer_id = 0; - frame_params.temporal_layer_id = 0; - std::ifstream one_layer_file; - one_layer_file.open(libvpx_test::GetDataPath() + - "/rc_interface_test_one_layer"); - ASSERT_TRUE(one_layer_file.good()); - for (size_t i = 0; i < kNumFrame; i++) { - one_layer_file >> frame_info; - if (frame_info.frame_id > 0) frame_params.frame_type = INTER_FRAME; - if (frame_info.frame_id == 200) { - rc_cfg_.target_bandwidth = rc_cfg_.target_bandwidth * 2; - rc_api_->UpdateRateControl(rc_cfg_); - } else if (frame_info.frame_id == 400) { - rc_cfg_.target_bandwidth = rc_cfg_.target_bandwidth / 4; - rc_api_->UpdateRateControl(rc_cfg_); - } - ASSERT_EQ(frame_info.spatial_id, 0); - ASSERT_EQ(frame_info.temporal_id, 0); - rc_api_->ComputeQP(frame_params); - ASSERT_EQ(rc_api_->GetQP(), frame_info.base_q); - ASSERT_EQ(rc_api_->GetLoopfilterLevel(), frame_info.filter_level_); - rc_api_->PostEncodeUpdate(frame_info.bytes_used); + virtual void SetUp() { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + } + + virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, 7); + encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); + encoder->Control(VP9E_SET_TUNE_CONTENT, 0); + encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000); + encoder->Control(VP9E_SET_RTC_EXTERNAL_RATECTRL, 1); } + frame_params_.frame_type = + video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME; + if (rc_cfg_.rc_mode == VPX_CBR && frame_params_.frame_type == INTER_FRAME) { + // Disable golden frame update. 
+ frame_flags_ |= VP8_EFLAG_NO_UPD_GF; + frame_flags_ |= VP8_EFLAG_NO_UPD_ARF; + } + encoder_exit_ = video->frame() == kNumFrames; } - void RunSVC() { - SetConfigSVC(); - rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); - FrameInfo frame_info; - libvpx::VP9FrameParamsQpRTC frame_params; - frame_params.frame_type = KEY_FRAME; - std::ifstream svc_file; - svc_file.open(libvpx_test::GetDataPath() + "/rc_interface_test_svc"); - ASSERT_TRUE(svc_file.good()); - for (size_t i = 0; i < kNumFrame * rc_cfg_.ss_number_layers; i++) { - svc_file >> frame_info; - if (frame_info.frame_id > 0) frame_params.frame_type = INTER_FRAME; - if (frame_info.frame_id == 200 * rc_cfg_.ss_number_layers) { - for (int layer = 0; - layer < rc_cfg_.ss_number_layers * rc_cfg_.ts_number_layers; - layer++) - rc_cfg_.layer_target_bitrate[layer] *= 2; - rc_cfg_.target_bandwidth *= 2; - rc_api_->UpdateRateControl(rc_cfg_); - } else if (frame_info.frame_id == 400 * rc_cfg_.ss_number_layers) { - for (int layer = 0; - layer < rc_cfg_.ss_number_layers * rc_cfg_.ts_number_layers; - layer++) - rc_cfg_.layer_target_bitrate[layer] /= 4; - rc_cfg_.target_bandwidth /= 4; - rc_api_->UpdateRateControl(rc_cfg_); - } - frame_params.spatial_layer_id = frame_info.spatial_id; - frame_params.temporal_layer_id = frame_info.temporal_id; - rc_api_->ComputeQP(frame_params); - ASSERT_EQ(rc_api_->GetQP(), frame_info.base_q); - ASSERT_EQ(rc_api_->GetLoopfilterLevel(), frame_info.filter_level_); - rc_api_->PostEncodeUpdate(frame_info.bytes_used); + virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + if (encoder_exit_) { + return; } + int loopfilter_level, qp; + encoder->Control(VP9E_GET_LOOPFILTER_LEVEL, &loopfilter_level); + encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp); + rc_api_->ComputeQP(frame_params_); + ASSERT_EQ(rc_api_->GetQP(), qp); + ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level); } - void RunOneLayerVBR() { - SetConfigOneLayerVBR(); + virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + rc_api_->PostEncodeUpdate(pkt->data.frame.sz); + } + + void RunOneLayer() { + SetConfig(GET_PARAM(2)); rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); - FrameInfo frame_info; - libvpx::VP9FrameParamsQpRTC frame_params; - frame_params.frame_type = KEY_FRAME; - frame_params.spatial_layer_id = 0; - frame_params.temporal_layer_id = 0; - std::ifstream one_layer_file; - one_layer_file.open(libvpx_test::GetDataPath() + - "/rc_interface_test_one_layer_vbr"); - ASSERT_TRUE(one_layer_file.good()); - for (size_t i = 0; i < kNumFrame; i++) { - one_layer_file >> frame_info; - if (frame_info.frame_id > 0) frame_params.frame_type = INTER_FRAME; - ASSERT_EQ(frame_info.spatial_id, 0); - ASSERT_EQ(frame_info.temporal_id, 0); - rc_api_->ComputeQP(frame_params); - ASSERT_EQ(rc_api_->GetQP(), frame_info.base_q); - ASSERT_EQ(rc_api_->GetLoopfilterLevel(), frame_info.filter_level_); - rc_api_->PostEncodeUpdate(frame_info.bytes_used); - } + frame_params_.spatial_layer_id = 0; + frame_params_.temporal_layer_id = 0; + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } void RunOneLayerVBRPeriodicKey() { - SetConfigOneLayerVBRPeriodicKey(); + if (GET_PARAM(2) != VPX_VBR) return; + key_interval_ = 100; + SetConfig(VPX_VBR); rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); - FrameInfo frame_info; - libvpx::VP9FrameParamsQpRTC frame_params; - frame_params.frame_type = KEY_FRAME; - frame_params.spatial_layer_id = 0; - 
frame_params.temporal_layer_id = 0; - std::ifstream one_layer_file; - one_layer_file.open(libvpx_test::GetDataPath() + - "/rc_interface_test_one_layer_vbr_periodic_key"); - ASSERT_TRUE(one_layer_file.good()); - for (size_t i = 0; i < kNumFrame; i++) { - one_layer_file >> frame_info; - if (frame_info.frame_id > 0) frame_params.frame_type = INTER_FRAME; - if (frame_info.frame_id % 300 == 0) frame_params.frame_type = KEY_FRAME; - ASSERT_EQ(frame_info.spatial_id, 0); - ASSERT_EQ(frame_info.temporal_id, 0); - rc_api_->ComputeQP(frame_params); - ASSERT_EQ(rc_api_->GetQP(), frame_info.base_q); - ASSERT_EQ(rc_api_->GetLoopfilterLevel(), frame_info.filter_level_); - rc_api_->PostEncodeUpdate(frame_info.bytes_used); - } + frame_params_.spatial_layer_id = 0; + frame_params_.temporal_layer_id = 0; + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } private: - void SetConfig() { + void SetConfig(vpx_rc_mode rc_mode) { rc_cfg_.width = 1280; rc_cfg_.height = 720; rc_cfg_.max_quantizer = 52; @@ -224,24 +126,182 @@ class RcInterfaceTest : public ::testing::Test { rc_cfg_.layer_target_bitrate[0] = 1000; rc_cfg_.max_quantizers[0] = 52; rc_cfg_.min_quantizers[0] = 2; + rc_cfg_.rc_mode = rc_mode; + + // Encoder settings for ground truth. + cfg_.g_w = 1280; + cfg_.g_h = 720; + cfg_.rc_undershoot_pct = 50; + cfg_.rc_overshoot_pct = 50; + cfg_.rc_buf_initial_sz = 600; + cfg_.rc_buf_optimal_sz = 600; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 52; + cfg_.rc_end_usage = rc_mode; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 0; + cfg_.rc_target_bitrate = 1000; + cfg_.kf_min_dist = key_interval_; + cfg_.kf_max_dist = key_interval_; } - void SetConfigOneLayerCBR() { - SetConfig(); - rc_cfg_.rc_mode = VPX_CBR; + std::unique_ptr rc_api_; + libvpx::VP9RateControlRtcConfig rc_cfg_; + int aq_mode_; + int key_interval_; + libvpx::VP9FrameParamsQpRTC frame_params_; + bool encoder_exit_; +}; + +class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam { + public: + RcInterfaceSvcTest() : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)) {} + virtual ~RcInterfaceSvcTest() {} + + protected: + virtual void SetUp() { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); } - void SetConfigOneLayerVBR() { - SetConfig(); - rc_cfg_.rc_mode = VPX_VBR; + virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, 7); + encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); + encoder->Control(VP9E_SET_TUNE_CONTENT, 0); + encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 900); + encoder->Control(VP9E_SET_RTC_EXTERNAL_RATECTRL, 1); + encoder->Control(VP9E_SET_SVC, 1); + encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); + } + + frame_params_.frame_type = video->frame() == 0 ? KEY_FRAME : INTER_FRAME; + if (rc_cfg_.rc_mode == VPX_CBR && frame_params_.frame_type == INTER_FRAME) { + // Disable golden frame update. 
+ frame_flags_ |= VP8_EFLAG_NO_UPD_GF; + frame_flags_ |= VP8_EFLAG_NO_UPD_ARF; + } + encoder_exit_ = video->frame() == kNumFrames; + current_superframe_ = video->frame(); } - void SetConfigOneLayerVBRPeriodicKey() { - SetConfig(); - rc_cfg_.rc_mode = VPX_VBR; + virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + ::libvpx_test::CxDataIterator iter = encoder->GetCxData(); + while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { + ParseSuperframeSizes(static_cast(pkt->data.frame.buf), + pkt->data.frame.sz); + for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { + frame_params_.spatial_layer_id = sl; + frame_params_.temporal_layer_id = kTemporalId[current_superframe_ % 4]; + rc_api_->ComputeQP(frame_params_); + frame_params_.frame_type = INTER_FRAME; + rc_api_->PostEncodeUpdate(sizes_[sl]); + } + } + if (!encoder_exit_) { + int loopfilter_level, qp; + encoder->Control(VP9E_GET_LOOPFILTER_LEVEL, &loopfilter_level); + encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp); + ASSERT_EQ(rc_api_->GetQP(), qp); + ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level); + } + } + // This method needs to be overridden because non-reference frames are + // expected to be mismatched frames as the encoder will avoid loopfilter on + // these frames. + virtual void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) {} + + void RunSvc() { + SetConfigSvc(); + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + SetEncoderSvc(); + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + private: + vpx_codec_err_t ParseSuperframeSizes(const uint8_t *data, size_t data_sz) { + uint8_t marker = *(data + data_sz - 1); + if ((marker & 0xe0) == 0xc0) { + const uint32_t frames = (marker & 0x7) + 1; + const uint32_t mag = ((marker >> 3) & 0x3) + 1; + const size_t index_sz = 2 + mag * frames; + // This chunk is marked as having a superframe index but doesn't have + // enough data for it, thus it's an invalid superframe index. + if (data_sz < index_sz) return VPX_CODEC_CORRUPT_FRAME; + { + const uint8_t marker2 = *(data + data_sz - index_sz); + // This chunk is marked as having a superframe index but doesn't have + // the matching marker byte at the front of the index therefore it's an + // invalid chunk. 
+ if (marker != marker2) return VPX_CODEC_CORRUPT_FRAME; + } + const uint8_t *x = &data[data_sz - index_sz + 1]; + for (uint32_t i = 0; i < frames; ++i) { + uint32_t this_sz = 0; + + for (uint32_t j = 0; j < mag; ++j) this_sz |= (*x++) << (j * 8); + sizes_[i] = this_sz; + } + } + return VPX_CODEC_OK; + } + + void SetEncoderSvc() { + cfg_.ss_number_layers = 3; + cfg_.ts_number_layers = 3; + cfg_.g_timebase.num = 1; + cfg_.g_timebase.den = 30; + svc_params_.scaling_factor_num[0] = 72; + svc_params_.scaling_factor_den[0] = 288; + svc_params_.scaling_factor_num[1] = 144; + svc_params_.scaling_factor_den[1] = 288; + svc_params_.scaling_factor_num[2] = 288; + svc_params_.scaling_factor_den[2] = 288; + for (int i = 0; i < VPX_MAX_LAYERS; ++i) { + svc_params_.max_quantizers[i] = 56; + svc_params_.min_quantizers[i] = 2; + } + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 0; + // 3 temporal layers + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.temporal_layering_mode = 3; + + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.g_threads = 1; + cfg_.kf_max_dist = 9999; + cfg_.rc_target_bitrate = 1600; + cfg_.rc_overshoot_pct = 50; + cfg_.rc_undershoot_pct = 50; + + cfg_.layer_target_bitrate[0] = 100; + cfg_.layer_target_bitrate[1] = 140; + cfg_.layer_target_bitrate[2] = 200; + cfg_.layer_target_bitrate[3] = 250; + cfg_.layer_target_bitrate[4] = 350; + cfg_.layer_target_bitrate[5] = 500; + cfg_.layer_target_bitrate[6] = 450; + cfg_.layer_target_bitrate[7] = 630; + cfg_.layer_target_bitrate[8] = 900; } - void SetConfigSVC() { + void SetConfigSvc() { rc_cfg_.width = 1280; rc_cfg_.height = 720; rc_cfg_.max_quantizer = 56; @@ -288,17 +348,25 @@ class RcInterfaceTest : public ::testing::Test { } } + int aq_mode_; std::unique_ptr rc_api_; libvpx::VP9RateControlRtcConfig rc_cfg_; + vpx_svc_extra_cfg_t svc_params_; + libvpx::VP9FrameParamsQpRTC frame_params_; + bool encoder_exit_; + int current_superframe_; + uint32_t sizes_[8]; }; -TEST_F(RcInterfaceTest, OneLayerCBR) { RunOneLayerCBR(); } +TEST_P(RcInterfaceTest, OneLayer) { RunOneLayer(); } -TEST_F(RcInterfaceTest, OneLayerVBR) { RunOneLayerVBR(); } +TEST_P(RcInterfaceTest, OneLayerVBRPeriodicKey) { RunOneLayerVBRPeriodicKey(); } -TEST_F(RcInterfaceTest, OneLayerVBRPeriodicKey) { RunOneLayerVBRPeriodicKey(); } +TEST_P(RcInterfaceSvcTest, Svc) { RunSvc(); } -TEST_F(RcInterfaceTest, SVC) { RunSVC(); } +VP9_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0), + ::testing::Values(VPX_CBR, VPX_VBR)); +VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0)); } // namespace int main(int argc, char **argv) { diff --git a/test/test-data.mk b/test/test-data.mk index 379fc6e7a9..46fe359898 100644 --- a/test/test-data.mk +++ b/test/test-data.mk @@ -27,10 +27,6 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += noisy_clip_640_360.y4m LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv -LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rc_interface_test_one_layer -LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rc_interface_test_one_layer_vbr -LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rc_interface_test_one_layer_vbr_periodic_key -LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rc_interface_test_svc LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += 
bus_352x288_420_f20_b8.yuv # Test vectors diff --git a/test/test-data.sha1 b/test/test-data.sha1 index bcf9612fba..668992fba2 100644 --- a/test/test-data.sha1 +++ b/test/test-data.sha1 @@ -869,7 +869,3 @@ bac455906360b45338a16dd626ac5f19bc36a307 *desktop_office1.1280_720-020.yuv 518a0be998afece76d3df76047d51e256c591ff2 *invalid-bug-148271109.ivf d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-bug-148271109.ivf.res ad18ca16f0a249fb3b7c38de0d9b327fed273f96 *hantro_collage_w352h288_nv12.yuv -03f827c0e36ff9a6e23c5cc11936924e4f1827ab *rc_interface_test_one_layer -99e4f4c2961d46dc286db230090a39d78460b25d *rc_interface_test_svc -9dcaafd91bc61ed360c23616b4788437b9f9b96b *rc_interface_test_one_layer_vbr -babd17cca2e93cc74753c6ed80de87457bc3a5f3 *rc_interface_test_one_layer_vbr_periodic_key diff --git a/test/test.mk b/test/test.mk index b0319fb0de..11228ecdda 100644 --- a/test/test.mk +++ b/test/test.mk @@ -214,6 +214,11 @@ TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c RC_INTERFACE_TEST_SRCS-$(CONFIG_VP9_ENCODER) := ratectrl_rtc_test.cc +RC_INTERFACE_TEST_SRCS-$(CONFIG_VP9_ENCODER) += encode_test_driver.cc +RC_INTERFACE_TEST_SRCS-$(CONFIG_VP9_ENCODER) += encode_test_driver.h +RC_INTERFACE_TEST_SRCS-yes += decode_test_driver.cc +RC_INTERFACE_TEST_SRCS-yes += decode_test_driver.h +RC_INTERFACE_TEST_SRCS-yes += codec_factory.h endif # CONFIG_SHARED diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index f219f24500..4e0cb8b4c0 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -117,13 +117,16 @@ class VP9RateControlRTC { const VP9RateControlRtcConfig &cfg); ~VP9RateControlRTC() { if (cpi_) { - for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) { - for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) { - int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->oxcf.ts_number_layers); - LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer]; - vpx_free(lc->map); - vpx_free(lc->last_coded_q_map); - vpx_free(lc->consec_zero_mv); + if (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) { + for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) { + for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) { + int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->oxcf.ts_number_layers); + LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer]; + vpx_free(lc->map); + vpx_free(lc->last_coded_q_map); + vpx_free(lc->consec_zero_mv); + } } } vpx_free(cpi_); From 6b4b82fd7a47720d608f6349bdb2cb2b81adb6a1 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 15 Jul 2021 16:05:16 -0700 Subject: [PATCH 130/926] Use round to be more accurate casting float to int Change-Id: Ifd5961917831752b176dd75d39d6b2cba6ce72fa --- vp8/encoder/onyx_if.c | 6 +++--- vp9/encoder/vp9_svc_layercontext.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index aeed719d1f..71ef057a4a 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -301,9 +301,9 @@ static void init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf, /* Work out the average size of a frame within this layer */ if (layer > 0) { lc->avg_frame_size_for_layer = - (int)((cpi->oxcf.target_bitrate[layer] - - cpi->oxcf.target_bitrate[layer - 1]) * - 1000 / (lc->framerate - prev_layer_framerate)); + (int)round((cpi->oxcf.target_bitrate[layer] - + cpi->oxcf.target_bitrate[layer - 1]) * + 1000 / (lc->framerate - prev_layer_framerate)); } lc->active_worst_quality = 
cpi->oxcf.worst_allowed_q; diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index f9a0de62a0..ad3a8f7afa 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -322,8 +322,8 @@ void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) { const int prev_layer_target_bandwidth = oxcf->layer_target_bitrate[st_idx - 1]; lc->avg_frame_size = - (int)((lc->target_bandwidth - prev_layer_target_bandwidth) / - (lc->framerate - prev_layer_framerate)); + (int)round((lc->target_bandwidth - prev_layer_target_bandwidth) / + (lc->framerate - prev_layer_framerate)); } } From cd260eba10b9155f6e6086c999bd4c9d3ca6c706 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 29 Jun 2021 14:48:35 -0700 Subject: [PATCH 131/926] Add cyclic refresh to vp9 rtc external ratecontrol Change-Id: Ia2a881399aa31ca0f34481b975362ddd4ad87f1c --- test/ratectrl_rtc_test.cc | 3 ++- vp9/encoder/vp9_aq_cyclicrefresh.c | 3 ++- vp9/ratectrl_rtc.cc | 27 ++++++++++++++++++++++++++- vp9/ratectrl_rtc.h | 10 ++++++++++ 4 files changed, 40 insertions(+), 3 deletions(-) diff --git a/test/ratectrl_rtc_test.cc b/test/ratectrl_rtc_test.cc index 1414377082..8136bd8b93 100644 --- a/test/ratectrl_rtc_test.cc +++ b/test/ratectrl_rtc_test.cc @@ -127,6 +127,7 @@ class RcInterfaceTest rc_cfg_.max_quantizers[0] = 52; rc_cfg_.min_quantizers[0] = 2; rc_cfg_.rc_mode = rc_mode; + rc_cfg_.aq_mode = aq_mode_; // Encoder settings for ground truth. cfg_.g_w = 1280; @@ -364,7 +365,7 @@ TEST_P(RcInterfaceTest, OneLayerVBRPeriodicKey) { RunOneLayerVBRPeriodicKey(); } TEST_P(RcInterfaceSvcTest, Svc) { RunSvc(); } -VP9_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0), +VP9_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0, 3), ::testing::Values(VPX_CBR, VPX_VBR)); VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0)); } // namespace diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c index e6edf5a925..f06fe47268 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -516,7 +516,8 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { cr->rate_ratio_qdelta = 3.0; } else { cr->rate_ratio_qdelta = 2.0; - if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium) { + if (cr->content_mode && cpi->noise_estimate.enabled && + cpi->noise_estimate.level >= kMedium) { // Reduce the delta-qp if the estimated source noise is above threshold. 
cr->rate_ratio_qdelta = 1.7; cr->rate_boost_fac = 13; diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index b38a0db9c0..0f56e67e80 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -11,6 +11,7 @@ #include +#include "vp9/common/vp9_common.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_picklpf.h" #include "vpx/vp8cx.h" @@ -28,6 +29,15 @@ std::unique_ptr VP9RateControlRTC::Create( return nullptr; } rc_api->InitRateControl(cfg); + if (cfg.aq_mode) { + VP9_COMP *const cpi = rc_api->cpi_; + cpi->segmentation_map = static_cast( + vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, + sizeof(*cpi->segmentation_map))); + cpi->cyclic_refresh = + vp9_cyclic_refresh_alloc(cpi->common.mi_rows, cpi->common.mi_cols); + cpi->cyclic_refresh->content_mode = 0; + } return rc_api; } @@ -42,13 +52,14 @@ void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) { oxcf->bit_depth = cm->bit_depth; oxcf->rc_mode = rc_cfg.rc_mode; oxcf->pass = 0; - oxcf->aq_mode = NO_AQ; + oxcf->aq_mode = rc_cfg.aq_mode ? CYCLIC_REFRESH_AQ : NO_AQ; oxcf->content = VP9E_CONTENT_DEFAULT; oxcf->drop_frames_water_mark = 0; cm->current_video_frame = 0; rc->kf_boost = DEFAULT_KF_BOOST; UpdateRateControl(rc_cfg); + vp9_set_mb_mi(cm, cm->width, cm->height); cpi_->use_svc = (cpi_->svc.number_spatial_layers > 1 || cpi_->svc.number_temporal_layers > 1) @@ -146,6 +157,8 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { cpi_->svc.number_temporal_layers == 1) { int target = 0; if (cpi_->oxcf.rc_mode == VPX_CBR) { + if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + vp9_cyclic_refresh_update_parameters(cpi_); if (frame_is_intra_only(cm)) target = vp9_calc_iframe_target_size_one_pass_cbr(cpi_); else @@ -156,6 +169,8 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { cpi_->rc.frames_to_key = cpi_->oxcf.key_freq; } vp9_set_gf_update_one_pass_vbr(cpi_); + if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + vp9_cyclic_refresh_update_parameters(cpi_); if (frame_is_intra_only(cm)) target = vp9_calc_iframe_target_size_one_pass_vbr(cpi_); else @@ -171,6 +186,8 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { int bottom_index, top_index; cpi_->common.base_qindex = vp9_rc_pick_q_and_bounds(cpi_, &bottom_index, &top_index); + + if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_setup(cpi_); } int VP9RateControlRTC::GetQP() const { return cpi_->common.base_qindex; } @@ -181,6 +198,14 @@ int VP9RateControlRTC::GetLoopfilterLevel() const { return lf->filter_level; } +signed char *VP9RateControlRTC::GetCyclicRefreshMap() const { + return cpi_->cyclic_refresh->map; +} + +int *VP9RateControlRTC::GetDeltaQ() const { + return cpi_->cyclic_refresh->qindex_delta; +} + void VP9RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) { vp9_rc_postencode_update(cpi_, encoded_frame_size); if (cpi_->svc.number_spatial_layers > 1 || diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index 4e0cb8b4c0..5cc7ec9457 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -18,6 +18,7 @@ #include "vp9/common/vp9_enums.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/vp9_iface_common.h" +#include "vp9/encoder/vp9_aq_cyclicrefresh.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/vp9_cx_iface.h" @@ -42,6 +43,7 @@ struct VP9RateControlRtcConfig { framerate = 30.0; ss_number_layers = ts_number_layers = 1; rc_mode = VPX_CBR; + aq_mode = 0; vp9_zero(max_quantizers); 
vp9_zero(min_quantizers); vp9_zero(scaling_factor_den); @@ -82,6 +84,7 @@ struct VP9RateControlRtcConfig { int ts_rate_decimator[VPX_TS_MAX_LAYERS]; // vbr, cbr enum vpx_rc_mode rc_mode; + int aq_mode; }; struct VP9FrameParamsQpRTC { @@ -129,6 +132,11 @@ class VP9RateControlRTC { } } } + if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + vpx_free(cpi_->segmentation_map); + cpi_->segmentation_map = NULL; + vp9_cyclic_refresh_free(cpi_->cyclic_refresh); + } vpx_free(cpi_); } } @@ -137,6 +145,8 @@ class VP9RateControlRTC { // GetQP() needs to be called after ComputeQP() to get the latest QP int GetQP() const; int GetLoopfilterLevel() const; + signed char *GetCyclicRefreshMap() const; + int *GetDeltaQ() const; void ComputeQP(const VP9FrameParamsQpRTC &frame_params); // Feedback to rate control with the size of current encoded frame void PostEncodeUpdate(uint64_t encoded_frame_size); From 4a4ea28a3826d6a9843369bb8880cf49dc7c5dc0 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 21 Jul 2021 14:32:27 -0700 Subject: [PATCH 132/926] Add control to get QP for all spatial layers Change-Id: I77a9884351e71649c8f8632293d9515c60f6adbc --- vp9/encoder/vp9_encoder.c | 4 ++++ vp9/encoder/vp9_svc_layercontext.h | 2 ++ vp9/vp9_cx_iface.c | 12 ++++++++++++ vpx/vp8cx.h | 13 +++++++++++++ 4 files changed, 31 insertions(+) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index f50b979979..c964eb68bb 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -3709,6 +3709,10 @@ static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index, cpi->rc.force_max_q = 0; } + if (cpi->use_svc) { + cpi->svc.base_qindex[cpi->svc.spatial_layer_id] = *q; + } + if (!frame_is_intra_only(cm)) { vp9_set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH); } diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h index b12e7e01a7..b2d1d1b98f 100644 --- a/vp9/encoder/vp9_svc_layercontext.h +++ b/vp9/encoder/vp9_svc_layercontext.h @@ -173,6 +173,8 @@ typedef struct SVC { uint8_t fb_idx_temporal_layer_id[REF_FRAMES]; int spatial_layer_sync[VPX_SS_MAX_LAYERS]; + // Quantizer for each spatial layer. 
+  int base_qindex[VPX_SS_MAX_LAYERS];
   uint8_t set_intra_only_frame;
   uint8_t previous_frame_is_intra_only;
   uint8_t superframe_has_layer_sync;
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 906f2b0b85..0da54d2d0d 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -834,6 +834,17 @@ static vpx_codec_err_t ctrl_get_quantizer64(vpx_codec_alg_priv_t *ctx,
   return VPX_CODEC_OK;
 }

+static vpx_codec_err_t ctrl_get_quantizer_svc_layers(vpx_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  int *const arg = va_arg(args, int *);
+  int i;
+  if (arg == NULL) return VPX_CODEC_INVALID_PARAM;
+  for (i = 0; i < VPX_SS_MAX_LAYERS; i++) {
+    arg[i] = ctx->cpi->svc.base_qindex[i];
+  }
+  return VPX_CODEC_OK;
+}
+
 static vpx_codec_err_t ctrl_get_loopfilter_level(vpx_codec_alg_priv_t *ctx,
                                                  va_list args) {
   int *const arg = va_arg(args, int *);
@@ -1988,6 +1999,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   // Getters
   { VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer },
   { VP8E_GET_LAST_QUANTIZER_64, ctrl_get_quantizer64 },
+  { VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, ctrl_get_quantizer_svc_layers },
   { VP9E_GET_LOOPFILTER_LEVEL, ctrl_get_loopfilter_level },
   { VP9_GET_REFERENCE, ctrl_get_reference },
   { VP9E_GET_SVC_LAYER_ID, ctrl_get_svc_layer_id },
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index 011dfcba52..7d0dee0b78 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -732,6 +732,16 @@ enum vp8e_enc_control_id {
    * Supported in codecs: VP9
    */
   VP9E_GET_LOOPFILTER_LEVEL,
+
+  /*!\brief Codec control to get last quantizers for all spatial layers.
+   *
+   * The return value uses the codec's internal quantizer scale, with one
+   * entry per spatial layer.
+   * The size of the array passed in should be #VPX_SS_MAX_LAYERS.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_GET_LAST_QUANTIZER_SVC_LAYERS,
 };

 /*!\brief vpx 1-D scaling mode
@@ -989,6 +999,9 @@ VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER, int *)
 #define VPX_CTRL_VP8E_GET_LAST_QUANTIZER
 VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64, int *)
 #define VPX_CTRL_VP8E_GET_LAST_QUANTIZER_64
+VPX_CTRL_USE_TYPE(VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, int *)
+#define VPX_CTRL_VP9E_GET_LAST_QUANTIZER_SVC_LAYERS
+
 VPX_CTRL_USE_TYPE(VP9E_GET_SVC_LAYER_ID, vpx_svc_layer_id_t *)
 #define VPX_CTRL_VP9E_GET_SVC_LAYER_ID

From cf64eb2805c73f4feaac3fec2b890d2fb3eef7da Mon Sep 17 00:00:00 2001
From: Yunqing Wang
Date: Fri, 23 Jul 2021 10:55:10 -0700
Subject: [PATCH 133/926] Disable allow_partition_search_skip feature

This feature was added to help speed up still images and slideshows. It
no longer works, so it is disabled here; code cleanup will follow.

The change had negligible impact on regular test sets.

Borg test result on the ugc360p set at speed 3:
avg_psnr: ovr_psnr: ssim: speed: -0.244 -0.278 -0.153 -0.973 Change-Id: If74edabce0c93be1361e645ffd2eec063c2db76b --- vp9/encoder/vp9_speed_features.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index fc7a67c9f1..1431446d9e 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -345,7 +345,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; sf->adaptive_interp_filter_search = 1; - sf->allow_partition_search_skip = 1; + sf->allow_partition_search_skip = 0; if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { for (i = 0; i < MAX_MESH_STEP; ++i) { From 7c00f0ce18811dbe6d538ebda9fec7339fed3a90 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Fri, 23 Jul 2021 22:34:01 -0700 Subject: [PATCH 134/926] Clean up allow_partition_search_skip code Change-Id: Ia05157fc3e613d93f10df5abddd77a740a0005ca --- vp9/encoder/vp9_encodeframe.c | 22 ++------------------- vp9/encoder/vp9_encoder.c | 1 - vp9/encoder/vp9_encoder.h | 3 --- vp9/encoder/vp9_firstpass.c | 33 -------------------------------- vp9/encoder/vp9_speed_features.c | 2 -- vp9/encoder/vp9_speed_features.h | 3 --- 6 files changed, 2 insertions(+), 62 deletions(-) diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 969fad59b1..3c54aa548d 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -160,6 +160,7 @@ unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi, #endif // CONFIG_VP9_HIGHBITDEPTH #if !CONFIG_REALTIME_ONLY +#if CONFIG_FP_MB_STATS static unsigned int get_sby_perpixel_diff_variance(VP9_COMP *cpi, const struct buf_2d *ref, int mi_row, int mi_col, @@ -174,20 +175,8 @@ static unsigned int get_sby_perpixel_diff_variance(VP9_COMP *cpi, var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, last_y, last->y_stride, &sse); return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); } +#endif -static BLOCK_SIZE get_rd_var_based_fixed_partition(VP9_COMP *cpi, MACROBLOCK *x, - int mi_row, int mi_col) { - unsigned int var = get_sby_perpixel_diff_variance( - cpi, &x->plane[0].src, mi_row, mi_col, BLOCK_64X64); - if (var < 8) - return BLOCK_64X64; - else if (var < 128) - return BLOCK_32X32; - else if (var < 2048) - return BLOCK_16X16; - else - return BLOCK_8X8; -} #endif // !CONFIG_REALTIME_ONLY static void set_segment_index(VP9_COMP *cpi, MACROBLOCK *const x, int mi_row, @@ -4705,13 +4694,6 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root); - } else if (cpi->partition_search_skippable_frame) { - BLOCK_SIZE bsize; - set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64); - bsize = get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col); - set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); - rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rate, &dummy_dist, 1, td->pc_root); } else if (sf->partition_search_type == VAR_BASED_PARTITION && cm->frame_type != KEY_FRAME) { choose_partitioning(cpi, tile_info, x, mi_row, mi_col); diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index f50b979979..f1a96e8b01 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2323,7 +2323,6 @@ 
VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, vp9_init_rd_parameters(cpi); init_frame_indexes(cm); - cpi->partition_search_skippable_frame = 0; cpi->tile_data = NULL; realloc_segmentation_maps(cpi); diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index ea2d59e1b5..7a3f354bcd 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -711,9 +711,6 @@ typedef struct VP9_COMP { TileDataEnc *tile_data; int allocated_tiles; // Keep track of memory allocated for tiles. - // For a still frame, this flag is set to 1 to skip partition search. - int partition_search_skippable_frame; - int scaled_ref_idx[REFS_PER_FRAME]; int lst_fb_idx; int gld_fb_idx; diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 7343d1bc66..e30e2bed01 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -3477,25 +3477,6 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) { } } -static int is_skippable_frame(const VP9_COMP *cpi) { - // If the current frame does not have non-zero motion vector detected in the - // first pass, and so do its previous and forward frames, then this frame - // can be skipped for partition check, and the partition size is assigned - // according to the variance - const TWO_PASS *const twopass = &cpi->twopass; - - return (!frame_is_intra_only(&cpi->common) && - twopass->stats_in - 2 > twopass->stats_in_start && - twopass->stats_in < twopass->stats_in_end && - (twopass->stats_in - 1)->pcnt_inter - - (twopass->stats_in - 1)->pcnt_motion == - 1 && - (twopass->stats_in - 2)->pcnt_inter - - (twopass->stats_in - 2)->pcnt_motion == - 1 && - twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1); -} - // Configure image size specific vizier parameters. // Later these will be set via additional command line options void vp9_init_vizier_params(TWO_PASS *const twopass, int screen_area) { @@ -3593,13 +3574,6 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { cm->frame_type = INTER_FRAME; - // Do the firstpass stats indicate that this frame is skippable for the - // partition search? - if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2 && - !cpi->use_svc) { - cpi->partition_search_skippable_frame = is_skippable_frame(cpi); - } - // The multiplication by 256 reverses a scaling factor of (>> 8) // applied when combining MB error values for the frame. twopass->mb_av_energy = log((this_frame.intra_error * 256.0) + 1.0); @@ -3682,13 +3656,6 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { vp9_configure_buffer_updates(cpi, gf_group->index); - // Do the firstpass stats indicate that this frame is skippable for the - // partition search? 
- if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2 && - !cpi->use_svc) { - cpi->partition_search_skippable_frame = is_skippable_frame(cpi); - } - rc->base_frame_target = gf_group->bit_allocation[gf_group->index]; // The multiplication by 256 reverses a scaling factor of (>> 8) diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 1431446d9e..81695e9156 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -345,7 +345,6 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; sf->adaptive_interp_filter_search = 1; - sf->allow_partition_search_skip = 0; if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { for (i = 0; i < MAX_MESH_STEP; ++i) { @@ -931,7 +930,6 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) { sf->max_delta_qindex = 0; sf->disable_filter_search_var_thresh = 0; sf->adaptive_interp_filter_search = 0; - sf->allow_partition_search_skip = 0; sf->allow_txfm_domain_distortion = 0; sf->tx_domain_thresh = 99.0; sf->allow_quant_coeff_opt = sf->optimize_coefficients; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index 5ea04709ec..c2ae970b77 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -525,9 +525,6 @@ typedef struct SPEED_FEATURES { int prune_rect_thresh[4]; } rd_ml_partition; - // Allow skipping partition search for still image frame - int allow_partition_search_skip; - // Fast approximation of vp9_model_rd_from_var_lapndz int simple_model_rd_from_var; From 0973ac05bae529e54d9e04e0f69fa4e58869d923 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Fri, 23 Jul 2021 22:45:45 -0700 Subject: [PATCH 135/926] Remove unused old FP_MB_STATS code Change-Id: I78ac1f8ce1598de295efd2ac1fe8244072d9b501 --- vp9/encoder/vp9_encodeframe.c | 200 ---------------------------------- vp9/encoder/vp9_encoder.c | 30 ----- vp9/encoder/vp9_encoder.h | 8 -- vp9/encoder/vp9_firstpass.c | 86 --------------- vp9/encoder/vp9_firstpass.h | 27 ----- vp9/vp9_cx_iface.c | 9 -- vpxenc.c | 51 --------- 7 files changed, 411 deletions(-) diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 3c54aa548d..f08300976e 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -159,26 +159,6 @@ unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi, } #endif // CONFIG_VP9_HIGHBITDEPTH -#if !CONFIG_REALTIME_ONLY -#if CONFIG_FP_MB_STATS -static unsigned int get_sby_perpixel_diff_variance(VP9_COMP *cpi, - const struct buf_2d *ref, - int mi_row, int mi_col, - BLOCK_SIZE bs) { - unsigned int sse, var; - uint8_t *last_y; - const YV12_BUFFER_CONFIG *last = get_ref_frame_buffer(cpi, LAST_FRAME); - - assert(last != NULL); - last_y = - &last->y_buffer[mi_row * MI_SIZE * last->y_stride + mi_col * MI_SIZE]; - var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, last_y, last->y_stride, &sse); - return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); -} -#endif - -#endif // !CONFIG_REALTIME_ONLY - static void set_segment_index(VP9_COMP *cpi, MACROBLOCK *const x, int mi_row, int mi_col, BLOCK_SIZE bsize, int segment_index) { VP9_COMMON *const cm = &cpi->common; @@ -3110,54 +3090,6 @@ static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv)); } -#if CONFIG_FP_MB_STATS -const int num_16x16_blocks_wide_lookup[BLOCK_SIZES] = { 1, 
1, 1, 1, 1, 1, 1, - 1, 2, 2, 2, 4, 4 }; -const int num_16x16_blocks_high_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 1, 1, 1, - 2, 1, 2, 4, 2, 4 }; -const int qindex_skip_threshold_lookup[BLOCK_SIZES] = { 0, 10, 10, 30, 40, - 40, 60, 80, 80, 90, - 100, 100, 120 }; -const int qindex_split_threshold_lookup[BLOCK_SIZES] = { 0, 3, 3, 7, 15, - 15, 30, 40, 40, 60, - 80, 80, 120 }; -const int complexity_16x16_blocks_threshold[BLOCK_SIZES] = { 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, - 4, 4, 6 }; - -typedef enum { - MV_ZERO = 0, - MV_LEFT = 1, - MV_UP = 2, - MV_RIGHT = 3, - MV_DOWN = 4, - MV_INVALID -} MOTION_DIRECTION; - -static INLINE MOTION_DIRECTION get_motion_direction_fp(uint8_t fp_byte) { - if (fp_byte & FPMB_MOTION_ZERO_MASK) { - return MV_ZERO; - } else if (fp_byte & FPMB_MOTION_LEFT_MASK) { - return MV_LEFT; - } else if (fp_byte & FPMB_MOTION_RIGHT_MASK) { - return MV_RIGHT; - } else if (fp_byte & FPMB_MOTION_UP_MASK) { - return MV_UP; - } else { - return MV_DOWN; - } -} - -static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv, - MOTION_DIRECTION that_mv) { - if (this_mv == that_mv) { - return 0; - } else { - return abs(this_mv - that_mv) == 2 ? 2 : 1; - } -} -#endif - // Calculate prediction based on the given input features and neural net config. // Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden // layer. @@ -4055,11 +3987,6 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, BLOCK_SIZE min_size = x->min_partition_size; BLOCK_SIZE max_size = x->max_partition_size; -#if CONFIG_FP_MB_STATS - unsigned int src_diff_var = UINT_MAX; - int none_complexity = 0; -#endif - int partition_none_allowed = !force_horz_split && !force_vert_split; int partition_horz_allowed = !force_vert_split && yss <= xss && bsize >= BLOCK_8X8; @@ -4146,65 +4073,6 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, save_context(x, mi_row, mi_col, a, l, sa, sl, bsize); -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); - src_diff_var = get_sby_perpixel_diff_variance(cpi, &x->plane[0].src, mi_row, - mi_col, bsize); - } -#endif - -#if CONFIG_FP_MB_STATS - // Decide whether we shall split directly and skip searching NONE by using - // the first pass block statistics - if (cpi->use_fp_mb_stats && bsize >= BLOCK_32X32 && do_split && - partition_none_allowed && src_diff_var > 4 && - cm->base_qindex < qindex_split_threshold_lookup[bsize]) { - int mb_row = mi_row >> 1; - int mb_col = mi_col >> 1; - int mb_row_end = - VPXMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows); - int mb_col_end = - VPXMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols); - int r, c; - - // compute a complexity measure, basically measure inconsistency of motion - // vectors obtained from the first pass in the current block - for (r = mb_row; r < mb_row_end; r++) { - for (c = mb_col; c < mb_col_end; c++) { - const int mb_index = r * cm->mb_cols + c; - - MOTION_DIRECTION this_mv; - MOTION_DIRECTION right_mv; - MOTION_DIRECTION bottom_mv; - - this_mv = - get_motion_direction_fp(cpi->twopass.this_frame_mb_stats[mb_index]); - - // to its right - if (c != mb_col_end - 1) { - right_mv = get_motion_direction_fp( - cpi->twopass.this_frame_mb_stats[mb_index + 1]); - none_complexity += get_motion_inconsistency(this_mv, right_mv); - } - - // to its bottom - if (r != mb_row_end - 1) { - bottom_mv = get_motion_direction_fp( - cpi->twopass.this_frame_mb_stats[mb_index + cm->mb_cols]); - none_complexity += get_motion_inconsistency(this_mv, 
bottom_mv); - } - - // do not count its left and top neighbors to avoid double counting - } - } - - if (none_complexity > complexity_16x16_blocks_threshold[bsize]) { - partition_none_allowed = 0; - } - } -#endif - pc_tree->partitioning = PARTITION_NONE; if (cpi->sf.rd_ml_partition.var_pruning && !frame_is_intra_only(cm)) { @@ -4282,53 +4150,6 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, } } } - -#if CONFIG_FP_MB_STATS - // Check if every 16x16 first pass block statistics has zero - // motion and the corresponding first pass residue is small enough. - // If that is the case, check the difference variance between the - // current frame and the last frame. If the variance is small enough, - // stop further splitting in RD optimization - if (cpi->use_fp_mb_stats && do_split != 0 && - cm->base_qindex > qindex_skip_threshold_lookup[bsize]) { - int mb_row = mi_row >> 1; - int mb_col = mi_col >> 1; - int mb_row_end = - VPXMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows); - int mb_col_end = - VPXMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols); - int r, c; - - int skip = 1; - for (r = mb_row; r < mb_row_end; r++) { - for (c = mb_col; c < mb_col_end; c++) { - const int mb_index = r * cm->mb_cols + c; - if (!(cpi->twopass.this_frame_mb_stats[mb_index] & - FPMB_MOTION_ZERO_MASK) || - !(cpi->twopass.this_frame_mb_stats[mb_index] & - FPMB_ERROR_SMALL_MASK)) { - skip = 0; - break; - } - } - if (skip == 0) { - break; - } - } - - if (skip) { - if (src_diff_var == UINT_MAX) { - set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); - src_diff_var = get_sby_perpixel_diff_variance( - cpi, &x->plane[0].src, mi_row, mi_col, bsize); - } - if (src_diff_var < 8) { - do_split = 0; - do_rect = 0; - } - } - } -#endif } } restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); @@ -6064,20 +5885,6 @@ static void encode_tiles(VP9_COMP *cpi) { vp9_encode_tile(cpi, &cpi->td, tile_row, tile_col); } -#if CONFIG_FP_MB_STATS -static int input_fpmb_stats(FIRSTPASS_MB_STATS *firstpass_mb_stats, - VP9_COMMON *cm, uint8_t **this_frame_mb_stats) { - uint8_t *mb_stats_in = firstpass_mb_stats->mb_stats_start + - cm->current_video_frame * cm->MBs * sizeof(uint8_t); - - if (mb_stats_in > firstpass_mb_stats->mb_stats_end) return EOF; - - *this_frame_mb_stats = mb_stats_in; - - return 1; -} -#endif - static int compare_kmeans_data(const void *a, const void *b) { if (((const KMEANS_DATA *)a)->value > ((const KMEANS_DATA *)b)->value) { return 1; @@ -6284,13 +6091,6 @@ static void encode_frame_internal(VP9_COMP *cpi) { struct vpx_usec_timer emr_timer; vpx_usec_timer_start(&emr_timer); -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - input_fpmb_stats(&cpi->twopass.firstpass_mb_stats, cm, - &cpi->twopass.this_frame_mb_stats); - } -#endif - if (!cpi->row_mt) { cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read_dummy; cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write_dummy; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index f1a96e8b01..9001d018f3 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2363,17 +2363,6 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, vpx_calloc(cm->MBs * sizeof(*cpi->mbgraph_stats[i].mb_stats), 1)); } -#if CONFIG_FP_MB_STATS - cpi->use_fp_mb_stats = 0; - if (cpi->use_fp_mb_stats) { - // a place holder used to store the first pass mb stats in the first pass - CHECK_MEM_ERROR(cm, cpi->twopass.frame_mb_stats_buf, - vpx_calloc(cm->MBs * sizeof(uint8_t), 1)); - } else { - cpi->twopass.frame_mb_stats_buf = 
NULL; - } -#endif - cpi->refresh_alt_ref_frame = 0; cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; @@ -2526,18 +2515,6 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, vp9_init_second_pass_spatial_svc(cpi); } else { int num_frames; -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - const size_t psz = cpi->common.MBs * sizeof(uint8_t); - const int ps = (int)(oxcf->firstpass_mb_stats_in.sz / psz); - - cpi->twopass.firstpass_mb_stats.mb_stats_start = - oxcf->firstpass_mb_stats_in.buf; - cpi->twopass.firstpass_mb_stats.mb_stats_end = - cpi->twopass.firstpass_mb_stats.mb_stats_start + - (ps - 1) * cpi->common.MBs * sizeof(uint8_t); - } -#endif cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf; cpi->twopass.stats_in = cpi->twopass.stats_in_start; @@ -2841,13 +2818,6 @@ void vp9_remove_compressor(VP9_COMP *cpi) { vpx_free(cpi->mbgraph_stats[i].mb_stats); } -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - vpx_free(cpi->twopass.frame_mb_stats_buf); - cpi->twopass.frame_mb_stats_buf = NULL; - } -#endif - vp9_extrc_delete(&cpi->ext_ratectrl); vp9_remove_common(cm); diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 7a3f354bcd..9774a64ccf 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -273,10 +273,6 @@ typedef struct VP9EncoderConfig { vpx_fixed_buf_t two_pass_stats_in; -#if CONFIG_FP_MB_STATS - vpx_fixed_buf_t firstpass_mb_stats_in; -#endif - vp8e_tuning tuning; vp9e_tune_content content; #if CONFIG_VP9_HIGHBITDEPTH @@ -803,10 +799,6 @@ typedef struct VP9_COMP { uint64_t time_pick_lpf; uint64_t time_encode_sb_row; -#if CONFIG_FP_MB_STATS - int use_fp_mb_stats; -#endif - TWO_PASS twopass; // Force recalculation of segment_ids for each mode info diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index e30e2bed01..67302ed035 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -135,17 +135,6 @@ static void output_stats(FIRSTPASS_STATS *stats) { #endif } -#if CONFIG_FP_MB_STATS -static void output_fpmb_stats(uint8_t *this_frame_mb_stats, VP9_COMMON *cm, - struct vpx_codec_pkt_list *pktlist) { - struct vpx_codec_cx_pkt pkt; - pkt.kind = VPX_CODEC_FPMB_STATS_PKT; - pkt.data.firstpass_mb_stats.buf = this_frame_mb_stats; - pkt.data.firstpass_mb_stats.sz = cm->initial_mbs * sizeof(uint8_t); - vpx_codec_pkt_list_add(pktlist, &pkt); -} -#endif - static void zero_stats(FIRSTPASS_STATS *section) { section->frame = 0.0; section->weight = 0.0; @@ -953,10 +942,6 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, int level_sample; const int mb_index = mb_row * cm->mb_cols + mb_col; -#if CONFIG_FP_MB_STATS - const int mb_index = mb_row * cm->mb_cols + mb_col; -#endif - (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, mb_row, c); // Adjust to the next column of MBs. @@ -1092,13 +1077,6 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, // Accumulate the intra error. fp_acc_data->intra_error += (int64_t)this_error; -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - // initialization - cpi->twopass.frame_mb_stats_buf[mb_index] = 0; - } -#endif - // Set up limit values for motion vectors to prevent them extending // outside the UMV borders. 
x->mv_limits.col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16); @@ -1244,20 +1222,6 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, best_ref_mv->row = 0; best_ref_mv->col = 0; -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - // intra prediction statistics - cpi->twopass.frame_mb_stats_buf[mb_index] = 0; - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK; - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK; - if (this_error > FPMB_ERROR_LARGE_TH) { - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK; - } else if (this_error < FPMB_ERROR_SMALL_TH) { - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK; - } - } -#endif - if (motion_error <= this_error) { vpx_clear_system_state(); @@ -1302,47 +1266,9 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, *best_ref_mv = mv; -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - // inter prediction statistics - cpi->twopass.frame_mb_stats_buf[mb_index] = 0; - cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK; - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK; - if (this_error > FPMB_ERROR_LARGE_TH) { - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK; - } else if (this_error < FPMB_ERROR_SMALL_TH) { - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK; - } - } -#endif - if (!is_zero_mv(&mv)) { ++(fp_acc_data->mvcount); -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_MOTION_ZERO_MASK; - // check estimated motion direction - if (mv.as_mv.col > 0 && mv.as_mv.col >= abs(mv.as_mv.row)) { - // right direction - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_MOTION_RIGHT_MASK; - } else if (mv.as_mv.row < 0 && - abs(mv.as_mv.row) >= abs(mv.as_mv.col)) { - // up direction - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_UP_MASK; - } else if (mv.as_mv.col < 0 && - abs(mv.as_mv.col) >= abs(mv.as_mv.row)) { - // left direction - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_MOTION_LEFT_MASK; - } else { - // down direction - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_MOTION_DOWN_MASK; - } - } -#endif // Does the row vector point inwards or outwards? 
if (mb_row < cm->mb_rows / 2) { if (mv.row > 0) @@ -1459,12 +1385,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { assert(new_yv12 != NULL); assert(frame_is_intra_only(cm) || (lst_yv12 != NULL)); -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - vp9_zero_array(cpi->twopass.frame_mb_stats_buf, cm->initial_mbs); - } -#endif - set_first_pass_params(cpi); vp9_set_quantizer(cpi, find_fp_qindex(cm->bit_depth)); @@ -1525,12 +1445,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { twopass->this_frame_stats = fps; output_stats(&twopass->this_frame_stats); accumulate_stats(&twopass->total_stats, &fps); - -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - output_fpmb_stats(twopass->frame_mb_stats_buf, cm, cpi->output_pkt_list); - } -#endif } // Copy the previous Last Frame back into gf and and arf buffers if diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index ddfc87d894..cdcf568723 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -21,27 +21,6 @@ extern "C" { #endif -#if CONFIG_FP_MB_STATS - -#define FPMB_DCINTRA_MASK 0x01 - -#define FPMB_MOTION_ZERO_MASK 0x02 -#define FPMB_MOTION_LEFT_MASK 0x04 -#define FPMB_MOTION_RIGHT_MASK 0x08 -#define FPMB_MOTION_UP_MASK 0x10 -#define FPMB_MOTION_DOWN_MASK 0x20 - -#define FPMB_ERROR_SMALL_MASK 0x40 -#define FPMB_ERROR_LARGE_MASK 0x80 -#define FPMB_ERROR_SMALL_TH 2000 -#define FPMB_ERROR_LARGE_TH 48000 - -typedef struct { - uint8_t *mb_stats_start; - uint8_t *mb_stats_end; -} FIRSTPASS_MB_STATS; -#endif - #define INVALID_ROW (-1) #define MAX_ARF_LAYERS 6 @@ -188,12 +167,6 @@ typedef struct { double mb_av_energy; double mb_smooth_pct; -#if CONFIG_FP_MB_STATS - uint8_t *frame_mb_stats_buf; - uint8_t *this_frame_mb_stats; - FIRSTPASS_MB_STATS firstpass_mb_stats; -#endif - FP_MB_FLOAT_STATS *fp_mb_float_stats; // An indication of the content type of the current frame diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 906f2b0b85..7d927ba80b 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -583,10 +583,6 @@ static vpx_codec_err_t set_encoder_config( vp9_set_first_pass_stats(oxcf, &cfg->rc_twopass_stats_in); -#if CONFIG_FP_MB_STATS - oxcf->firstpass_mb_stats_in = cfg->rc_firstpass_mb_stats_in; -#endif - oxcf->color_space = extra_cfg->color_space; oxcf->color_range = extra_cfg->color_range; oxcf->render_width = extra_cfg->render_width; @@ -2293,11 +2289,6 @@ void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf, FILE *fp) { DUMP_STRUCT_VALUE(fp, oxcf, target_level); // TODO(angiebird): dump two_pass_stats_in - -#if CONFIG_FP_MB_STATS - // TODO(angiebird): dump firstpass_mb_stats_in -#endif - DUMP_STRUCT_VALUE(fp, oxcf, tuning); DUMP_STRUCT_VALUE(fp, oxcf, content); #if CONFIG_VP9_HIGHBITDEPTH diff --git a/vpxenc.c b/vpxenc.c index 276ee9b902..a0122ef804 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -114,10 +114,6 @@ static const arg_def_t pass_arg = ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2)"); static const arg_def_t fpf_name = ARG_DEF(NULL, "fpf", 1, "First pass statistics file name"); -#if CONFIG_FP_MB_STATS -static const arg_def_t fpmbf_name = - ARG_DEF(NULL, "fpmbf", 1, "First pass block statistics file name"); -#endif static const arg_def_t limit = ARG_DEF(NULL, "limit", 1, "Stop encoding after n input frames"); static const arg_def_t skip = @@ -674,9 +670,6 @@ struct stream_config { struct vpx_codec_enc_cfg cfg; const char *out_fn; const char *stats_fn; -#if CONFIG_FP_MB_STATS - const char *fpmb_stats_fn; -#endif 
stereo_format_t stereo_fmt; int arg_ctrls[ARG_CTRL_CNT_MAX][2]; int arg_ctrl_cnt; @@ -704,9 +697,6 @@ struct stream_state { uint64_t cx_time; size_t nbytes; stats_io_t stats; -#if CONFIG_FP_MB_STATS - stats_io_t fpmb_stats; -#endif struct vpx_image *img; vpx_codec_ctx_t decoder; int mismatch_seen; @@ -943,10 +933,6 @@ static int parse_stream_params(struct VpxEncoderConfig *global, config->out_fn = arg.val; } else if (arg_match(&arg, &fpf_name, argi)) { config->stats_fn = arg.val; -#if CONFIG_FP_MB_STATS - } else if (arg_match(&arg, &fpmbf_name, argi)) { - config->fpmb_stats_fn = arg.val; -#endif } else if (arg_match(&arg, &use_webm, argi)) { #if CONFIG_WEBM_IO config->write_webm = 1; @@ -1169,17 +1155,6 @@ static void validate_stream_config(const struct stream_state *stream, fatal("Stream %d: duplicate stats file (from stream %d)", streami->index, stream->index); } - -#if CONFIG_FP_MB_STATS - /* Check for two streams sharing a mb stats file. */ - if (streami != stream) { - const char *a = stream->config.fpmb_stats_fn; - const char *b = streami->config.fpmb_stats_fn; - if (a && b && !strcmp(a, b)) - fatal("Stream %d: duplicate mb stats file (from stream %d)", - streami->index, stream->index); - } -#endif } } @@ -1338,26 +1313,11 @@ static void setup_pass(struct stream_state *stream, fatal("Failed to open statistics store"); } -#if CONFIG_FP_MB_STATS - if (stream->config.fpmb_stats_fn) { - if (!stats_open_file(&stream->fpmb_stats, stream->config.fpmb_stats_fn, - pass)) - fatal("Failed to open mb statistics store"); - } else { - if (!stats_open_mem(&stream->fpmb_stats, pass)) - fatal("Failed to open mb statistics store"); - } -#endif - stream->config.cfg.g_pass = global->passes == 2 ? pass ? VPX_RC_LAST_PASS : VPX_RC_FIRST_PASS : VPX_RC_ONE_PASS; if (pass) { stream->config.cfg.rc_twopass_stats_in = stats_get(&stream->stats); -#if CONFIG_FP_MB_STATS - stream->config.cfg.rc_firstpass_mb_stats_in = - stats_get(&stream->fpmb_stats); -#endif } stream->cx_time = 0; @@ -1569,13 +1529,6 @@ static void get_cx_data(struct stream_state *stream, pkt->data.twopass_stats.sz); stream->nbytes += pkt->data.raw.sz; break; -#if CONFIG_FP_MB_STATS - case VPX_CODEC_FPMB_STATS_PKT: - stats_write(&stream->fpmb_stats, pkt->data.firstpass_mb_stats.buf, - pkt->data.firstpass_mb_stats.sz); - stream->nbytes += pkt->data.raw.sz; - break; -#endif case VPX_CODEC_PSNR_PKT: if (global->show_psnr) { @@ -2069,10 +2022,6 @@ int main(int argc, const char **argv_) { FOREACH_STREAM(stats_close(&stream->stats, global.passes - 1)); -#if CONFIG_FP_MB_STATS - FOREACH_STREAM(stats_close(&stream->fpmb_stats, global.passes - 1)); -#endif - if (global.pass) break; } From fc04a9491ebaaa8e2b1c7c8e0587c8a1873531d6 Mon Sep 17 00:00:00 2001 From: Peter Kasting Date: Mon, 26 Jul 2021 03:57:55 -0700 Subject: [PATCH 136/926] Fix some instances of -Wunused-but-set-variable. 
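The warning fires on variables that are written but never read: the dead
'savings' accumulator in vp8_update_coef_probs(), the unused source-plane
pointers in skin_sb_split(), and the cpuid() output registers in the x86
timestamp helpers. The cpuid() calls have to stay (they keep out-of-order
execution from skewing the timestamp reads), so their outputs are cast to
void instead. A minimal sketch of the idiom applied below:

  unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx;
  cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
  (void)reg_eax;  // intentionally unread; likewise for the other registers
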
Bug: chromium:1203071 Change-Id: Ieb628f95d676ba3814b5caf8a02a884330928c77 --- vp8/encoder/bitstream.c | 3 --- vp9/encoder/vp9_encodeframe.c | 18 +++--------------- vpx_ports/x86.h | 12 ++++++++++++ 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c index 80cbb882fd..87825fa6fe 100644 --- a/vp8/encoder/bitstream.c +++ b/vp8/encoder/bitstream.c @@ -866,7 +866,6 @@ void vp8_update_coef_probs(VP8_COMP *cpi) { #if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) vp8_writer *const w = cpi->bc; #endif - int savings = 0; vpx_clear_system_state(); @@ -940,8 +939,6 @@ void vp8_update_coef_probs(VP8_COMP *cpi) { #if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) vp8_write_literal(w, newp, 8); #endif - - savings += s; } } while (++t < ENTROPY_NODES); diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index f08300976e..131c4887f2 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -784,8 +784,8 @@ static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d, // Check if most of the superblock is skin content, and if so, force split to // 32x32, and set x->sb_is_skin for use in mode selection. -static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res, - int mi_row, int mi_col, int *force_split) { +static int skin_sb_split(VP9_COMP *cpi, const int low_res, int mi_row, + int mi_col, int *force_split) { VP9_COMMON *const cm = &cpi->common; #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) return 0; @@ -797,11 +797,6 @@ static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res, mi_row + 8 < cm->mi_rows)) { int num_16x16_skin = 0; int num_16x16_nonskin = 0; - uint8_t *ysignal = x->plane[0].src.buf; - uint8_t *usignal = x->plane[1].src.buf; - uint8_t *vsignal = x->plane[2].src.buf; - int sp = x->plane[0].src.stride; - int spuv = x->plane[1].src.stride; const int block_index = mi_row * cm->mi_cols + mi_col; const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64]; const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64]; @@ -820,13 +815,7 @@ static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res, i = ymis; break; } - ysignal += 16; - usignal += 8; - vsignal += 8; } - ysignal += (sp << 4) - 64; - usignal += (spuv << 3) - 32; - vsignal += (spuv << 3) - 32; } if (num_16x16_skin > 12) { *force_split = 1; @@ -1503,8 +1492,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64); if (cpi->use_skin_detection) - x->sb_is_skin = - skin_sb_split(cpi, x, low_res, mi_row, mi_col, force_split); + x->sb_is_skin = skin_sb_split(cpi, low_res, mi_row, mi_col, force_split); d = xd->plane[0].dst.buf; dp = xd->plane[0].dst.stride; diff --git a/vpx_ports/x86.h b/vpx_ports/x86.h index 14f4344495..ad3da84aca 100644 --- a/vpx_ports/x86.h +++ b/vpx_ports/x86.h @@ -223,6 +223,8 @@ static INLINE int x86_simd_caps(void) { } } + (void)reg_eax; // Avoid compiler warning on unused-but-set variable. + return flags & mask; } @@ -307,6 +309,11 @@ static INLINE unsigned int x86_readtscp(void) { static INLINE unsigned int x86_tsc_start(void) { unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); + // Avoid compiler warnings on unused-but-set variables. 
+ (void)reg_eax; + (void)reg_ebx; + (void)reg_ecx; + (void)reg_edx; return x86_readtsc(); } @@ -314,6 +321,11 @@ static INLINE unsigned int x86_tsc_end(void) { uint32_t v = x86_readtscp(); unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); + // Avoid compiler warnings on unused-but-set variables. + (void)reg_eax; + (void)reg_ebx; + (void)reg_ecx; + (void)reg_edx; return v; } From 0d1aec7373b6e43825281f0a4f9d40df77323e0a Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 26 Jul 2021 16:52:56 -0700 Subject: [PATCH 137/926] vpx_ports/x86.h: sync with aom_ports/x86.h adds a few comments and makes the file ascii: 854b2766a Replace non-ASCII characters Change-Id: I6c2d76b293158bcad9f1ded7a91a81bda1e700fb --- vpx_ports/x86.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vpx_ports/x86.h b/vpx_ports/x86.h index ad3da84aca..4d5391b78d 100644 --- a/vpx_ports/x86.h +++ b/vpx_ports/x86.h @@ -242,7 +242,7 @@ static INLINE int x86_simd_caps(void) { // x86_readtsc directly, but prevent the CPU's out-of-order execution from // affecting the measurement (by having earlier/later instructions be evaluated // in the time interval). See the white paper, "How to Benchmark Code -// Execution Times on Intel® IA-32 and IA-64 Instruction Set Architectures" by +// Execution Times on Intel(R) IA-32 and IA-64 Instruction Set Architectures" by // Gabriele Paoloni for more information. // // If you are timing a large function (CPU time > a couple of seconds), use @@ -308,6 +308,7 @@ static INLINE unsigned int x86_readtscp(void) { static INLINE unsigned int x86_tsc_start(void) { unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; + // This call should not be removed. See function notes above. cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); // Avoid compiler warnings on unused-but-set variables. (void)reg_eax; @@ -320,6 +321,7 @@ static INLINE unsigned int x86_tsc_start(void) { static INLINE unsigned int x86_tsc_end(void) { uint32_t v = x86_readtscp(); unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; + // This call should not be removed. See function notes above. cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); // Avoid compiler warnings on unused-but-set variables. (void)reg_eax; From f685d508da549b0eccfc455c04e2b6fbc3eeb251 Mon Sep 17 00:00:00 2001 From: Hirokazu Honda Date: Fri, 30 Jul 2021 02:42:35 +0900 Subject: [PATCH 138/926] vp9 rc: Fills VP9_COMP zero at initialization Change-Id: Ib1a544ce87e8fdbe23c0e54b6426ee228011b126 --- vp9/ratectrl_rtc.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index 0f56e67e80..6446120f5b 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -25,9 +25,9 @@ std::unique_ptr VP9RateControlRTC::Create( VP9RateControlRTC()); if (!rc_api) return nullptr; rc_api->cpi_ = static_cast(vpx_memalign(32, sizeof(*cpi_))); - if (rc_api->cpi_ == nullptr) { - return nullptr; - } + if (!rc_api->cpi_) return nullptr; + vp9_zero(*rc_api->cpi_); + rc_api->InitRateControl(cfg); if (cfg.aq_mode) { VP9_COMP *const cpi = rc_api->cpi_; From 59c9e1d87ef33bc82fca82cfcf5202d4b86c92e7 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 24 Aug 2021 14:30:54 -0700 Subject: [PATCH 139/926] vp9 rc lib: Allow aq 3 to work for SVC with unit test Also use round to cast float to int with more accurate calculation to avoid error accumulation which causes qp to be different after ~290 frames. 
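
For illustration (hypothetical numbers): at lc->target_bandwidth /
lc->framerate = 6666.7 bits per frame, the old (int) cast credits
bits_off_target with 6666 bits every frame, silently dropping 0.7 bit;
after ~290 frames roughly 200 bits have gone missing, which is enough to
tip a buffer-level comparison and make the encoder's qp diverge from the
rate-control library's. Rounding keeps the long-run average correct to
within half a bit per frame.
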
Change-Id: Iff65a8fdc67401814fd253dbf148afe9887df97f --- test/ratectrl_rtc_test.cc | 4 +++- vp9/encoder/vp9_ratectrl.c | 4 ++-- vp9/ratectrl_rtc.cc | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/test/ratectrl_rtc_test.cc b/test/ratectrl_rtc_test.cc index 8136bd8b93..22bc5ecf74 100644 --- a/test/ratectrl_rtc_test.cc +++ b/test/ratectrl_rtc_test.cc @@ -270,6 +270,7 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, for (int i = 0; i < VPX_MAX_LAYERS; ++i) { svc_params_.max_quantizers[i] = 56; svc_params_.min_quantizers[i] = 2; + svc_params_.speed_per_layer[i] = 7; } cfg_.rc_end_usage = VPX_CBR; cfg_.g_lag_in_frames = 0; @@ -318,6 +319,7 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, rc_cfg_.ss_number_layers = 3; rc_cfg_.ts_number_layers = 3; rc_cfg_.rc_mode = VPX_CBR; + rc_cfg_.aq_mode = aq_mode_; rc_cfg_.scaling_factor_num[0] = 1; rc_cfg_.scaling_factor_den[0] = 4; @@ -367,7 +369,7 @@ TEST_P(RcInterfaceSvcTest, Svc) { RunSvc(); } VP9_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0, 3), ::testing::Values(VPX_CBR, VPX_VBR)); -VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0)); +VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0, 3)); } // namespace int main(int argc, char **argv) { diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index d0d83a8342..e38464c72c 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -277,9 +277,9 @@ static void update_buffer_level_svc_preencode(VP9_COMP *cpi) { svc->current_superframe > 0) { // TODO(marpan): This may need to be modified for temporal layers. const double framerate_pts = 10000000.0 / ts_delta; - lrc->bits_off_target += (int)(lc->target_bandwidth / framerate_pts); + lrc->bits_off_target += (int)round(lc->target_bandwidth / framerate_pts); } else { - lrc->bits_off_target += (int)(lc->target_bandwidth / lc->framerate); + lrc->bits_off_target += (int)round(lc->target_bandwidth / lc->framerate); } // Clip buffer level to maximum buffer size for the layer. 
lrc->bits_off_target = diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index 6446120f5b..76ff367c06 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -105,7 +105,7 @@ void VP9RateControlRTC::UpdateRateControl( cpi_->framerate = rc_cfg.framerate; cpi_->svc.number_spatial_layers = rc_cfg.ss_number_layers; cpi_->svc.number_temporal_layers = rc_cfg.ts_number_layers; - + vp9_set_mb_mi(cm, cm->width, cm->height); for (int sl = 0; sl < cpi_->svc.number_spatial_layers; ++sl) { for (int tl = 0; tl < cpi_->svc.number_temporal_layers; ++tl) { const int layer = From ee73384f0304c7e8a84a214ddc8863d40fe716ad Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 2 Sep 2021 16:15:13 -0700 Subject: [PATCH 140/926] Add codec control for vp8 external rc disable cyclic refresh Change-Id: I7905602919d5780831fad840577e97730ce0afc2 --- vp8/vp8_cx_iface.c | 11 +++++++++++ vpx/vp8cx.h | 14 ++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 78631e7976..20e18aee78 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -605,6 +605,16 @@ static vpx_codec_err_t set_screen_content_mode(vpx_codec_alg_priv_t *ctx, return update_extracfg(ctx, &extra_cfg); } +static vpx_codec_err_t ctrl_set_rtc_external_ratectrl(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP8_COMP *cpi = ctx->cpi; + const unsigned int data = CAST(VP8E_SET_GF_CBR_BOOST_PCT, args); + if (data) { + cpi->cyclic_refresh_mode_enabled = 0; + } + return VPX_CODEC_OK; +} + static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg, void **mem_loc) { vpx_codec_err_t res = VPX_CODEC_OK; @@ -1243,6 +1253,7 @@ static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = { { VP8E_SET_MAX_INTRA_BITRATE_PCT, set_rc_max_intra_bitrate_pct }, { VP8E_SET_SCREEN_CONTENT_MODE, set_screen_content_mode }, { VP8E_SET_GF_CBR_BOOST_PCT, ctrl_set_rc_gf_cbr_boost_pct }, + { VP8E_SET_RTC_EXTERNAL_RATECTRL, ctrl_set_rtc_external_ratectrl }, { -1, NULL }, }; diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 7d0dee0b78..17afac754b 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -742,6 +742,17 @@ enum vp8e_enc_control_id { * Supported in codecs: VP9 */ VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, + + /*!\brief Codec control to disable internal features in rate control. + * + * This will turn off cyclic refresh for vp8. 
+ * + * With those, the rate control is expected to work exactly the same as the + * interface provided in vp8_ratectrl_rtc.cc/h + * + * Supported in codecs: VP8 + */ + VP8E_SET_RTC_EXTERNAL_RATECTRL, }; /*!\brief vpx 1-D scaling mode @@ -1107,6 +1118,9 @@ VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_LOOPFILTER, int) VPX_CTRL_USE_TYPE(VP9E_SET_RTC_EXTERNAL_RATECTRL, int) #define VPX_CTRL_VP9E_SET_RTC_EXTERNAL_RATECTRL +VPX_CTRL_USE_TYPE(VP8E_SET_RTC_EXTERNAL_RATECTRL, int) +#define VPX_CTRL_VP8E_SET_RTC_EXTERNAL_RATECTRL + VPX_CTRL_USE_TYPE(VP9E_SET_EXTERNAL_RATE_CONTROL, vpx_rc_funcs_t *) #define VPX_CTRL_VP9E_SET_EXTERNAL_RATE_CONTROL From ca40ca9bed87687eb0b534bf3974c95182dd29a1 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 8 Sep 2021 16:52:51 -0700 Subject: [PATCH 141/926] vp8 rc: always update correction factor Change-Id: Id40b9cb5a85a15fb313a2a93f14f6768259f7c15 --- vp8/encoder/onyx_if.c | 4 +++- vp8/encoder/onyx_int.h | 4 ++++ vp8/vp8_cx_iface.c | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 71ef057a4a..57c94071b0 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -1910,6 +1910,7 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->force_maxqp = 0; cpi->frames_since_last_drop_overshoot = 0; + cpi->rt_always_update_correction_factor = 0; cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; #if CONFIG_INTERNAL_STATS @@ -4445,7 +4446,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } } - if (!active_worst_qchanged) vp8_update_rate_correction_factors(cpi, 2); + if (cpi->rt_always_update_correction_factor || !active_worst_qchanged) + vp8_update_rate_correction_factors(cpi, 2); cpi->last_q[cm->frame_type] = cm->base_qindex; diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index b96f9b1dc5..a29994a135 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -702,6 +702,10 @@ typedef struct VP8_COMP { int use_roi_static_threshold; int ext_refresh_frame_flags_pending; + + // Always update correction factor used for rate control after each frame for + // realtime encoding. + int rt_always_update_correction_factor; } VP8_COMP; void vp8_initialize_enc(void); diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 20e18aee78..893b7a5132 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -611,6 +611,7 @@ static vpx_codec_err_t ctrl_set_rtc_external_ratectrl(vpx_codec_alg_priv_t *ctx, const unsigned int data = CAST(VP8E_SET_GF_CBR_BOOST_PCT, args); if (data) { cpi->cyclic_refresh_mode_enabled = 0; + cpi->rt_always_update_correction_factor = 1; } return VPX_CODEC_OK; } From 65a1751e5b98bf7f1d21bcbfdef352af34fb205d Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 31 Aug 2021 10:22:22 -0700 Subject: [PATCH 142/926] Add vp8 support to rc lib For 1 layer CBR only. Support for temporal layers comes later. 
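
A minimal sketch of how a caller is expected to drive the new interface,
mirroring the flow in the added vp8_ratectrl_rtc_test.cc (most config
fields and all error handling elided; encoded_frame_size stands in for
the size reported back by the encoder):

  libvpx::VP8RateControlRtcConfig cfg;
  cfg.width = 640;
  cfg.height = 480;
  cfg.target_bandwidth = 400;  // kbps
  cfg.framerate = 30.0;
  std::unique_ptr<libvpx::VP8RateControlRTC> rc =
      libvpx::VP8RateControlRTC::Create(cfg);

  libvpx::VP8FrameParamsQpRTC params;
  params.frame_type = KEY_FRAME;
  rc->ComputeQP(params);
  const int qp = rc->GetQP();
  // Encode the frame at this qp with an encoder that has
  // VP8E_SET_RTC_EXTERNAL_RATECTRL enabled, then feed back the size.
  rc->PostEncodeUpdate(encoded_frame_size);
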
Rename the library to libvpxrc Bug: b/188853141 Change-Id: Ib7f977b64c05b1a0596870cb7f8e6768cb483850 --- libs.mk | 54 ++-- test/test.mk | 8 +- test/test_rc_interface.cc | 6 + test/vp8_ratectrl_rtc_test.cc | 180 +++++++++++ ...l_rtc_test.cc => vp9_ratectrl_rtc_test.cc} | 5 - vp8/encoder/onyx_if.c | 6 +- vp8/encoder/ratectrl.c | 3 +- vp8/vp8_ratectrl_rtc.cc | 288 ++++++++++++++++++ vp8/vp8_ratectrl_rtc.h | 62 ++++ vp9/ratectrl_rtc.h | 41 +-- vpx/internal/vpx_ratectrl_rtc.h | 62 ++++ vpx/vp8cx.h | 2 +- vpx/vpx_codec.mk | 1 + 13 files changed, 646 insertions(+), 72 deletions(-) create mode 100644 test/test_rc_interface.cc create mode 100644 test/vp8_ratectrl_rtc_test.cc rename test/{ratectrl_rtc_test.cc => vp9_ratectrl_rtc_test.cc} (99%) create mode 100644 vp8/vp8_ratectrl_rtc.cc create mode 100644 vp8/vp8_ratectrl_rtc.h create mode 100644 vpx/internal/vpx_ratectrl_rtc.h diff --git a/libs.mk b/libs.mk index f5b43abadc..d4763efca0 100644 --- a/libs.mk +++ b/libs.mk @@ -94,15 +94,28 @@ ifeq ($(CONFIG_VP9_ENCODER),yes) INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/% CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h vpx/vpx_ext_ratectrl.h CODEC_DOC_SECTIONS += vp9 vp9_encoder +endif - RC_RTC_SRCS := $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS)) - RC_RTC_SRCS += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h - RC_RTC_SRCS += vpx/vpx_ext_ratectrl.h +RC_RTC_SRCS := vpx/vp8.h vpx/vp8cx.h +RC_RTC_SRCS += vpx/vpx_ext_ratectrl.h +RC_RTC_SRCS += vpx/internal/vpx_ratectrl_rtc.h +ifeq ($(CONFIG_VP9_ENCODER),yes) + VP9_PREFIX=vp9/ + RC_RTC_SRCS += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS)) + RC_RTC_SRCS += $(VP9_PREFIX)vp9cx.mk RC_RTC_SRCS += $(VP9_PREFIX)ratectrl_rtc.cc RC_RTC_SRCS += $(VP9_PREFIX)ratectrl_rtc.h INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP9_PREFIX)ratectrl_rtc.cc INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP9_PREFIX)ratectrl_rtc.h endif +ifeq ($(CONFIG_VP8_ENCODER),yes) + VP8_PREFIX=vp8/ + RC_RTC_SRCS += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_CX_SRCS)) + RC_RTC_SRCS += $(VP8_PREFIX)vp8_ratectrl_rtc.cc + RC_RTC_SRCS += $(VP8_PREFIX)vp8_ratectrl_rtc.h + INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP8_PREFIX)vp8_ratectrl_rtc.cc + INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP8_PREFIX)vp8_ratectrl_rtc.h +endif ifeq ($(CONFIG_VP9_DECODER),yes) VP9_PREFIX=vp9/ @@ -126,7 +139,7 @@ endif ifeq ($(CONFIG_MSVS),yes) CODEC_LIB=$(if $(CONFIG_STATIC_MSVCRT),vpxmt,vpxmd) GTEST_LIB=$(if $(CONFIG_STATIC_MSVCRT),gtestmt,gtestmd) -RC_RTC_LIB=$(if $(CONFIG_STATIC_MSVCRT),vp9rcmt,vp9rcmd) +RC_RTC_LIB=$(if $(CONFIG_STATIC_MSVCRT),vpxrcmt,vpxrcmd) # This variable uses deferred expansion intentionally, since the results of # $(wildcard) may change during the course of the Make. 
VS_PLATFORMS = $(foreach d,$(wildcard */Release/$(CODEC_LIB).lib),$(word 1,$(subst /, ,$(d)))) @@ -249,16 +262,16 @@ PROJECTS-yes += vpx.$(VCPROJ_SFX) vpx.$(VCPROJ_SFX): vpx_config.asm vpx.$(VCPROJ_SFX): $(RTCD) -vp9rc.$(VCPROJ_SFX): \ +vpxrc.$(VCPROJ_SFX): \ VCPROJ_SRCS=$(filter-out $(addprefix %, $(ASM_INCLUDES)), $^) -vp9rc.$(VCPROJ_SFX): $(RC_RTC_SRCS) +vpxrc.$(VCPROJ_SFX): $(RC_RTC_SRCS) @echo " [CREATE] $@" $(qexec)$(GEN_VCPROJ) \ $(if $(CONFIG_SHARED),--dll,--lib) \ --target=$(TOOLCHAIN) \ $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \ - --name=vp9rc \ + --name=vpxrc \ --proj-guid=C26FF952-9494-4838-9A3F-7F3D4F613385 \ --ver=$(CONFIG_VS_VERSION) \ --src-path-bare="$(SRC_PATH_BARE)" \ @@ -275,10 +288,10 @@ vp9rc.$(VCPROJ_SFX): $(RC_RTC_SRCS) $(VCPROJ_SRCS)) \ --src-path-bare="$(SRC_PATH_BARE)" \ -PROJECTS-yes += vp9rc.$(VCPROJ_SFX) +PROJECTS-yes += vpxrc.$(VCPROJ_SFX) -vp9rc.$(VCPROJ_SFX): vpx_config.asm -vp9rc.$(VCPROJ_SFX): $(RTCD) +vpxrc.$(VCPROJ_SFX): vpx_config.asm +vpxrc.$(VCPROJ_SFX): $(RTCD) endif # ifeq ($(CONFIG_MSVS),yes) else # ifeq ($(CONFIG_EXTERNAL_BUILD),yes) @@ -398,12 +411,11 @@ INSTALL-LIBS-yes += $(LIBSUBDIR)/pkgconfig/vpx.pc INSTALL_MAPS += $(LIBSUBDIR)/pkgconfig/%.pc %.pc CLEAN-OBJS += vpx.pc -ifeq ($(CONFIG_VP9_ENCODER),yes) - RC_RTC_OBJS=$(call objs,$(RC_RTC_SRCS)) +ifeq ($(CONFIG_ENCODERS),yes) RC_RTC_OBJS=$(call objs,$(RC_RTC_SRCS)) OBJS-yes += $(RC_RTC_OBJS) - LIBS-yes += $(BUILD_PFX)libvp9rc.a $(BUILD_PFX)libvp9rc_g.a - $(BUILD_PFX)libvp9rc_g.a: $(RC_RTC_OBJS) + LIBS-yes += $(BUILD_PFX)libvpxrc.a $(BUILD_PFX)libvpxrc_g.a + $(BUILD_PFX)libvpxrc_g.a: $(RC_RTC_OBJS) endif ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_RATE_CTRL),yesyes) @@ -493,7 +505,7 @@ TEST_INTRA_PRED_SPEED_SRCS=$(call addprefix_clean,test/,\ $(call enabled,TEST_INTRA_PRED_SPEED_SRCS)) TEST_INTRA_PRED_SPEED_OBJS := $(sort $(call objs,$(TEST_INTRA_PRED_SPEED_SRCS))) -ifeq ($(CONFIG_VP9_ENCODER),yes) +ifeq ($(CONFIG_ENCODERS),yes) RC_INTERFACE_TEST_BIN=./test_rc_interface$(EXE_SFX) RC_INTERFACE_TEST_SRCS=$(call addprefix_clean,test/,\ $(call enabled,RC_INTERFACE_TEST_SRCS)) @@ -599,11 +611,11 @@ test_intra_pred_speed.$(VCPROJ_SFX): $(TEST_INTRA_PRED_SPEED_SRCS) vpx.$(VCPROJ_ -L. -l$(CODEC_LIB) -l$(GTEST_LIB) $^ endif # TEST_INTRA_PRED_SPEED -ifeq ($(CONFIG_VP9_ENCODER),yes) +ifeq ($(CONFIG_ENCODERS),yes) ifneq ($(strip $(RC_INTERFACE_TEST_OBJS)),) PROJECTS-$(CONFIG_MSVS) += test_rc_interface.$(VCPROJ_SFX) test_rc_interface.$(VCPROJ_SFX): $(RC_INTERFACE_TEST_SRCS) vpx.$(VCPROJ_SFX) \ - vp9rc.$(VCPROJ_SFX) gtest.$(VCPROJ_SFX) + vpxrc.$(VCPROJ_SFX) gtest.$(VCPROJ_SFX) @echo " [CREATE] $@" $(qexec)$(GEN_VCPROJ) \ --exe \ @@ -661,19 +673,19 @@ $(eval $(call linkerxx_template,$(TEST_INTRA_PRED_SPEED_BIN), \ -L. -lvpx -lgtest $(extralibs) -lm)) endif # TEST_INTRA_PRED_SPEED -ifeq ($(CONFIG_VP9_ENCODER),yes) +ifeq ($(CONFIG_ENCODERS),yes) ifneq ($(strip $(RC_INTERFACE_TEST_OBJS)),) $(RC_INTERFACE_TEST_OBJS) $(RC_INTERFACE_TEST_OBJS:.o=.d): \ CXXFLAGS += $(GTEST_INCLUDES) OBJS-yes += $(RC_INTERFACE_TEST_OBJS) BINS-yes += $(RC_INTERFACE_TEST_BIN) -$(RC_INTERFACE_TEST_BIN): $(TEST_LIBS) libvp9rc.a +$(RC_INTERFACE_TEST_BIN): $(TEST_LIBS) libvpxrc.a $(eval $(call linkerxx_template,$(RC_INTERFACE_TEST_BIN), \ $(RC_INTERFACE_TEST_OBJS) \ - -L. -lvpx -lgtest -lvp9rc $(extralibs) -lm)) + -L. 
-lvpx -lgtest -lvpxrc $(extralibs) -lm)) endif # RC_INTERFACE_TEST -endif # CONFIG_VP9_ENCODER +endif # CONFIG_ENCODERS ifneq ($(strip $(SIMPLE_ENCODE_TEST_OBJS)),) $(SIMPLE_ENCODE_TEST_OBJS) $(SIMPLE_ENCODE_TEST_OBJS:.o=.d): \ diff --git a/test/test.mk b/test/test.mk index 11228ecdda..41dfd5d835 100644 --- a/test/test.mk +++ b/test/test.mk @@ -213,9 +213,11 @@ endif TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c -RC_INTERFACE_TEST_SRCS-$(CONFIG_VP9_ENCODER) := ratectrl_rtc_test.cc -RC_INTERFACE_TEST_SRCS-$(CONFIG_VP9_ENCODER) += encode_test_driver.cc -RC_INTERFACE_TEST_SRCS-$(CONFIG_VP9_ENCODER) += encode_test_driver.h +RC_INTERFACE_TEST_SRCS-yes := test_rc_interface.cc +RC_INTERFACE_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ratectrl_rtc_test.cc +RC_INTERFACE_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_ratectrl_rtc_test.cc +RC_INTERFACE_TEST_SRCS-$(CONFIG_ENCODERS) += encode_test_driver.cc +RC_INTERFACE_TEST_SRCS-$(CONFIG_ENCODERS) += encode_test_driver.h RC_INTERFACE_TEST_SRCS-yes += decode_test_driver.cc RC_INTERFACE_TEST_SRCS-yes += decode_test_driver.h RC_INTERFACE_TEST_SRCS-yes += codec_factory.h diff --git a/test/test_rc_interface.cc b/test/test_rc_interface.cc new file mode 100644 index 0000000000..ec75700f73 --- /dev/null +++ b/test/test_rc_interface.cc @@ -0,0 +1,6 @@ +#include "third_party/googletest/src/include/gtest/gtest.h" + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/test/vp8_ratectrl_rtc_test.cc b/test/vp8_ratectrl_rtc_test.cc new file mode 100644 index 0000000000..d5032b38e7 --- /dev/null +++ b/test/vp8_ratectrl_rtc_test.cc @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include // NOLINT +#include + +#include "./vpx_config.h" +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/video_source.h" +#include "vp8/vp8_ratectrl_rtc.h" +#include "vpx/vpx_codec.h" +#include "vpx_ports/bitops.h" + +namespace { + +struct Vp8RCTestVideo { + Vp8RCTestVideo() {} + Vp8RCTestVideo(const char *name_, int width_, int height_, + unsigned int frames_) + : name(name_), width(width_), height(height_), frames(frames_) {} + + friend std::ostream &operator<<(std::ostream &os, + const Vp8RCTestVideo &video) { + os << video.name << " " << video.width << " " << video.height << " " + << video.frames; + return os; + } + const char *name; + int width; + int height; + unsigned int frames; +}; + +const Vp8RCTestVideo kVp8RCTestVectors[] = { + Vp8RCTestVideo("niklas_640_480_30.yuv", 640, 480, 470), + Vp8RCTestVideo("desktop_office1.1280_720-020.yuv", 1280, 720, 300), +}; + +class Vp8RcInterfaceTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params { + public: + Vp8RcInterfaceTest() + : EncoderTest(GET_PARAM(0)), key_interval_(3000), encoder_exit_(false) {} + virtual ~Vp8RcInterfaceTest() {} + + protected: + virtual void SetUp() { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + } + + virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, -6); + encoder->Control(VP8E_SET_RTC_EXTERNAL_RATECTRL, 1); + encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000); + } + frame_params_.frame_type = + video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME; + if (frame_params_.frame_type == INTER_FRAME) { + // Disable golden frame update. 
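+      // Also disable alt-ref update: with both flags set, inter frames
+      // refresh only the last-frame buffer.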
+ frame_flags_ |= VP8_EFLAG_NO_UPD_GF; + frame_flags_ |= VP8_EFLAG_NO_UPD_ARF; + } + encoder_exit_ = video->frame() == test_video_.frames; + } + + virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + if (encoder_exit_) { + return; + } + int qp; + encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp); + rc_api_->ComputeQP(frame_params_); + ASSERT_EQ(rc_api_->GetQP(), qp); + } + + virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + rc_api_->PostEncodeUpdate(pkt->data.frame.sz); + } + + void RunOneLayer() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + if (test_video_.width == 1280 && target_bitrate_ == 200) return; + if (test_video_.width == 640 && target_bitrate_ == 1000) return; + SetConfig(); + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + rc_api_->UpdateRateControl(rc_cfg_); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunPeriodicKey() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + if (test_video_.width == 1280 && target_bitrate_ == 200) return; + if (test_video_.width == 640 && target_bitrate_ == 1000) return; + key_interval_ = 100; + SetConfig(); + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + rc_api_->UpdateRateControl(rc_cfg_); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + private: + void SetConfig() { + rc_cfg_.width = test_video_.width; + rc_cfg_.height = test_video_.height; + rc_cfg_.max_quantizer = 60; + rc_cfg_.min_quantizer = 2; + rc_cfg_.target_bandwidth = target_bitrate_; + rc_cfg_.buf_initial_sz = 600; + rc_cfg_.buf_optimal_sz = 600; + rc_cfg_.buf_sz = target_bitrate_; + rc_cfg_.undershoot_pct = 50; + rc_cfg_.overshoot_pct = 50; + rc_cfg_.max_intra_bitrate_pct = 1000; + rc_cfg_.framerate = 30.0; + rc_cfg_.layer_target_bitrate[0] = target_bitrate_; + + // Encoder settings for ground truth. 
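+    // These mirror the rc_cfg_ fields above so the encoder's internal rate
+    // control and the standalone controller operate under identical
+    // constraints; any mismatch would surface as a QP difference.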
+ cfg_.g_w = test_video_.width; + cfg_.g_h = test_video_.height; + cfg_.rc_undershoot_pct = 50; + cfg_.rc_overshoot_pct = 50; + cfg_.rc_buf_initial_sz = 600; + cfg_.rc_buf_optimal_sz = 600; + cfg_.rc_buf_sz = target_bitrate_; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 60; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + cfg_.rc_target_bitrate = target_bitrate_; + cfg_.kf_min_dist = key_interval_; + cfg_.kf_max_dist = key_interval_; + } + + std::unique_ptr rc_api_; + libvpx::VP8RateControlRtcConfig rc_cfg_; + int key_interval_; + int target_bitrate_; + Vp8RCTestVideo test_video_; + libvpx::VP8FrameParamsQpRTC frame_params_; + bool encoder_exit_; +}; + +TEST_P(Vp8RcInterfaceTest, OneLayer) { RunOneLayer(); } + +TEST_P(Vp8RcInterfaceTest, OneLayerPeriodicKey) { RunPeriodicKey(); } + +VP8_INSTANTIATE_TEST_SUITE(Vp8RcInterfaceTest, + ::testing::Values(200, 400, 1000), + ::testing::ValuesIn(kVp8RCTestVectors)); + +} // namespace diff --git a/test/ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc similarity index 99% rename from test/ratectrl_rtc_test.cc rename to test/vp9_ratectrl_rtc_test.cc index 22bc5ecf74..b09a45bb76 100644 --- a/test/ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -371,8 +371,3 @@ VP9_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0, 3), ::testing::Values(VPX_CBR, VPX_VBR)); VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0, 3)); } // namespace - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 57c94071b0..fc154afd14 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -4017,7 +4017,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1; /* Are we are overshooting and up against the limit of active max Q. */ - if (((cpi->pass != 2) || + if (!cpi->rt_always_update_correction_factor && + ((cpi->pass != 2) || (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) && (Q == cpi->active_worst_quality) && (cpi->active_worst_quality < cpi->worst_quality) && @@ -4446,8 +4447,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } } - if (cpi->rt_always_update_correction_factor || !active_worst_qchanged) - vp8_update_rate_correction_factors(cpi, 2); + if (!active_worst_qchanged) vp8_update_rate_correction_factors(cpi, 2); cpi->last_q[cm->frame_type] = cm->base_qindex; diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c index d2b8dff06a..4b76cc6429 100644 --- a/vp8/encoder/ratectrl.c +++ b/vp8/encoder/ratectrl.c @@ -327,7 +327,8 @@ static void calc_iframe_target_size(VP8_COMP *cpi) { int initial_boost = 32; /* |3.0 * per_frame_bandwidth| */ /* Boost depends somewhat on frame rate: only used for 1 layer case. */ if (cpi->oxcf.number_of_layers == 1) { - kf_boost = VPXMAX(initial_boost, (int)(2 * cpi->output_framerate - 16)); + kf_boost = + VPXMAX(initial_boost, (int)round(2 * cpi->output_framerate - 16)); } else { /* Initial factor: set target size to: |3.0 * per_frame_bandwidth|. */ kf_boost = initial_boost; diff --git a/vp8/vp8_ratectrl_rtc.cc b/vp8/vp8_ratectrl_rtc.cc new file mode 100644 index 0000000000..c42ab971e2 --- /dev/null +++ b/vp8/vp8_ratectrl_rtc.cc @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. 
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <new>
+#include "vp8/vp8_ratectrl_rtc.h"
+#include "vp8/encoder/ratectrl.h"
+
+namespace libvpx {
+/* Quant MOD */
+static const int kQTrans[] = {
+  0,   1,   2,   3,   4,   5,   7,   8,   9,   10,  12,  13,  15,  17,  18,  19,
+  20,  21,  23,  24,  25,  26,  27,  28,  29,  30,  31,  33,  35,  37,  39,  41,
+  43,  45,  47,  49,  51,  53,  55,  57,  59,  61,  64,  67,  70,  73,  76,  79,
+  82,  85,  88,  91,  94,  97,  100, 103, 106, 109, 112, 115, 118, 121, 124, 127,
+};
+
+static const unsigned char kf_high_motion_minq[QINDEX_RANGE] = {
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,
+  1,  1,  2,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,  5,
+  5,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,  8,  8,  8,  8,  9,  9,  10, 10,
+  10, 10, 11, 11, 11, 11, 12, 12, 13, 13, 13, 13, 14, 14, 15, 15, 15, 15, 16,
+  16, 16, 16, 17, 17, 18, 18, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21,
+  22, 22, 23, 23, 24, 25, 25, 26, 26, 27, 28, 28, 29, 30
+};
+
+static const unsigned char inter_minq[QINDEX_RANGE] = {
+  0,  0,  1,  1,  2,  3,  3,  4,  4,  5,  6,  6,  7,  8,  8,  9,  9,  10, 11,
+  11, 12, 13, 13, 14, 15, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 22, 23, 24,
+  24, 25, 26, 27, 27, 28, 29, 30, 30, 31, 32, 33, 33, 34, 35, 36, 36, 37, 38,
+  39, 39, 40, 41, 42, 42, 43, 44, 45, 46, 46, 47, 48, 49, 50, 50, 51, 52, 53,
+  54, 55, 55, 56, 57, 58, 59, 60, 60, 61, 62, 63, 64, 65, 66, 67, 67, 68, 69,
+  70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 86,
+  87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100
+};
+
+static int rescale(int val, int num, int denom) {
+  int64_t llnum = num;
+  int64_t llden = denom;
+  int64_t llval = val;
+
+  return (int)(llval * llnum / llden);
+}
+
+std::unique_ptr<VP8RateControlRTC> VP8RateControlRTC::Create(
+    const VP8RateControlRtcConfig &cfg) {
+  std::unique_ptr<VP8RateControlRTC> rc_api(new (std::nothrow)
+                                                VP8RateControlRTC());
+  if (!rc_api) return nullptr;
+  rc_api->cpi_ = static_cast<VP8_COMP *>(vpx_memalign(32, sizeof(*cpi_)));
+  if (!rc_api->cpi_) return nullptr;
+  vp8_zero(*rc_api->cpi_);
+
+  rc_api->InitRateControl(cfg);
+
+  return rc_api;
+}
+
+void VP8RateControlRTC::InitRateControl(const VP8RateControlRtcConfig &rc_cfg) {
+  VP8_COMMON *cm = &cpi_->common;
+  VP8_CONFIG *oxcf = &cpi_->oxcf;
+  oxcf->end_usage = USAGE_STREAM_FROM_SERVER;
+  cpi_->pass = 0;
+  cm->show_frame = 1;
+  oxcf->drop_frames_water_mark = 0;
+  cm->current_video_frame = 0;
+  cpi_->auto_gold = 1;
+  cpi_->key_frame_count = 1;
+  cpi_->rate_correction_factor = 1.0;
+  cpi_->key_frame_rate_correction_factor = 1.0;
+  cpi_->cyclic_refresh_mode_enabled = 0;
+  cpi_->auto_worst_q = 1;
+  cpi_->kf_overspend_bits = 0;
+  cpi_->kf_bitrate_adjustment = 0;
+  cpi_->gf_overspend_bits = 0;
+  cpi_->non_gf_bitrate_adjustment = 0;
+  UpdateRateControl(rc_cfg);
+  cpi_->buffer_level = oxcf->starting_buffer_level;
+  cpi_->bits_off_target = oxcf->starting_buffer_level;
+}
+
+void VP8RateControlRTC::UpdateRateControl(
+    const VP8RateControlRtcConfig &rc_cfg) {
+  VP8_COMMON *cm = &cpi_->common;
+  VP8_CONFIG *oxcf = &cpi_->oxcf;
+
+  cm->Width = rc_cfg.width;
+  cm->Height = rc_cfg.height;
+  oxcf->Width = rc_cfg.width;
+  oxcf->Height = rc_cfg.height;
+  oxcf->worst_allowed_q = kQTrans[rc_cfg.max_quantizer];
+  oxcf->best_allowed_q = kQTrans[rc_cfg.min_quantizer];
+  cpi_->worst_quality = oxcf->worst_allowed_q;
+  cpi_->best_quality = oxcf->best_allowed_q;
+  cpi_->output_framerate = rc_cfg.framerate;
+  oxcf->target_bandwidth = 1000 * rc_cfg.target_bandwidth;
+  oxcf->fixed_q = -1;
+  oxcf->error_resilient_mode = 1;
+  oxcf->starting_buffer_level_in_ms = rc_cfg.buf_initial_sz;
+  oxcf->optimal_buffer_level_in_ms = rc_cfg.buf_optimal_sz;
+  oxcf->maximum_buffer_size_in_ms = rc_cfg.buf_sz;
+  oxcf->starting_buffer_level = rc_cfg.buf_initial_sz;
+  oxcf->optimal_buffer_level = rc_cfg.buf_optimal_sz;
+  oxcf->maximum_buffer_size = rc_cfg.buf_sz;
+  oxcf->number_of_layers = 1;
+  cpi_->buffered_mode = oxcf->optimal_buffer_level > 0;
+  oxcf->under_shoot_pct = rc_cfg.undershoot_pct;
+  oxcf->over_shoot_pct = rc_cfg.overshoot_pct;
+  cpi_->oxcf.rc_max_intra_bitrate_pct = rc_cfg.max_intra_bitrate_pct;
+  cpi_->framerate = rc_cfg.framerate;
+  for (int i = 0; i < KEY_FRAME_CONTEXT; ++i) {
+    cpi_->prior_key_frame_distance[i] =
+        static_cast<int>(cpi_->output_framerate);
+  }
+
+  cpi_->total_actual_bits = 0;
+  cpi_->total_target_vs_actual = 0;
+
+  cm->mb_rows = cm->Height >> 4;
+  cm->mb_cols = cm->Width >> 4;
+  cm->MBs = cm->mb_rows * cm->mb_cols;
+  cm->mode_info_stride = cm->mb_cols + 1;
+
+  oxcf->starting_buffer_level =
+      rescale((int)oxcf->starting_buffer_level, oxcf->target_bandwidth, 1000);
+  /* Set or reset optimal and maximum buffer levels. */
+  if (oxcf->optimal_buffer_level == 0) {
+    oxcf->optimal_buffer_level = oxcf->target_bandwidth / 8;
+  } else {
+    oxcf->optimal_buffer_level =
+        rescale((int)oxcf->optimal_buffer_level, oxcf->target_bandwidth, 1000);
+  }
+  if (oxcf->maximum_buffer_size == 0) {
+    oxcf->maximum_buffer_size = oxcf->target_bandwidth / 8;
+  } else {
+    oxcf->maximum_buffer_size =
+        rescale((int)oxcf->maximum_buffer_size, oxcf->target_bandwidth, 1000);
+  }
+
+  if (cpi_->bits_off_target > oxcf->maximum_buffer_size) {
+    cpi_->bits_off_target = oxcf->maximum_buffer_size;
+    cpi_->buffer_level = cpi_->bits_off_target;
+  }
+
+  vp8_new_framerate(cpi_, cpi_->framerate);
+}
+
+void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) {
+  VP8_COMMON *const cm = &cpi_->common;
+  cm->frame_type = frame_params.frame_type;
+  cm->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0;
+  cm->refresh_alt_ref_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0;
+  if (cm->frame_type == KEY_FRAME && cpi_->common.current_video_frame > 0) {
+    cpi_->common.frame_flags |= FRAMEFLAGS_KEY;
+  }
+
+  vp8_pick_frame_size(cpi_);
+
+  if (cpi_->buffer_level >= cpi_->oxcf.optimal_buffer_level &&
+      cpi_->buffered_mode) {
+    /* Max adjustment is 1/4 */
+    int Adjustment = cpi_->active_worst_quality / 4;
+    if (Adjustment) {
+      int buff_lvl_step;
+      if (cpi_->buffer_level < cpi_->oxcf.maximum_buffer_size) {
+        buff_lvl_step = (int)((cpi_->oxcf.maximum_buffer_size -
+                               cpi_->oxcf.optimal_buffer_level) /
+                              Adjustment);
+        if (buff_lvl_step) {
+          Adjustment =
+              (int)((cpi_->buffer_level - cpi_->oxcf.optimal_buffer_level) /
+                    buff_lvl_step);
+        } else {
+          Adjustment = 0;
+        }
+      }
+      cpi_->active_worst_quality -= Adjustment;
+      if (cpi_->active_worst_quality < cpi_->active_best_quality) {
+        cpi_->active_worst_quality = cpi_->active_best_quality;
+      }
+    }
+  }
+
+  if (cpi_->ni_frames > 150) {
+    int q = cpi_->active_worst_quality;
+    if (cm->frame_type == KEY_FRAME) {
+      cpi_->active_best_quality = kf_high_motion_minq[q];
+    } else {
+      cpi_->active_best_quality = inter_minq[q];
+    }
+
+    if (cpi_->buffer_level >= cpi_->oxcf.maximum_buffer_size) {
+      cpi_->active_best_quality = cpi_->best_quality;
+
+    } else if (cpi_->buffer_level > cpi_->oxcf.optimal_buffer_level) {
+      int Fraction =
+          (int)(((cpi_->buffer_level - cpi_->oxcf.optimal_buffer_level) * 128) /
+                (cpi_->oxcf.maximum_buffer_size -
+                 cpi_->oxcf.optimal_buffer_level));
+      int min_qadjustment =
+          ((cpi_->active_best_quality - cpi_->best_quality) * Fraction) / 128;
+
+      cpi_->active_best_quality -= min_qadjustment;
+    }
+  }
+
+  /* Clip the active best and worst quality values to limits */
+  if (cpi_->active_worst_quality > cpi_->worst_quality) {
+    cpi_->active_worst_quality = cpi_->worst_quality;
+  }
+  if (cpi_->active_best_quality < cpi_->best_quality) {
+    cpi_->active_best_quality = cpi_->best_quality;
+  }
+  if (cpi_->active_worst_quality < cpi_->active_best_quality) {
+    cpi_->active_worst_quality = cpi_->active_best_quality;
+  }
+
+  q_ = vp8_regulate_q(cpi_, cpi_->this_frame_target);
+  vp8_set_quantizer(cpi_, q_);
+}
+
+int VP8RateControlRTC::GetQP() const { return q_; }
+
+void VP8RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) {
+  VP8_COMMON *const cm = &cpi_->common;
+
+  cpi_->total_byte_count += encoded_frame_size;
+  cpi_->projected_frame_size = static_cast<int>(encoded_frame_size << 3);
+
+  vp8_update_rate_correction_factors(cpi_, 2);
+
+  cpi_->last_q[cm->frame_type] = cm->base_qindex;
+
+  if (cm->frame_type == KEY_FRAME) {
+    vp8_adjust_key_frame_context(cpi_);
+  }
+
+  /* Keep a record of ambient average Q. */
+  if (cm->frame_type != KEY_FRAME) {
+    cpi_->avg_frame_qindex =
+        (2 + 3 * cpi_->avg_frame_qindex + cm->base_qindex) >> 2;
+  }
+  /* Keep a record from which we can calculate the average Q excluding
+   * key frames.
+   */
+  if (cm->frame_type != KEY_FRAME) {
+    cpi_->ni_frames++;
+    /* Damp value for first few frames */
+    if (cpi_->ni_frames > 150) {
+      cpi_->ni_tot_qi += q_;
+      cpi_->ni_av_qi = (cpi_->ni_tot_qi / cpi_->ni_frames);
+    } else {
+      cpi_->ni_tot_qi += q_;
+      cpi_->ni_av_qi =
+          ((cpi_->ni_tot_qi / cpi_->ni_frames) + cpi_->worst_quality + 1) / 2;
+    }
+
+    /* If the average Q is higher than what was used in the last
+     * frame (after going through the recode loop to keep the frame
+     * size within range) then use the last frame value - 1. The -1
+     * is designed to stop Q and hence the data rate, from
+     * progressively falling away during difficult sections, but at
+     * the same time reduce the number of iterations around the
+     * recode loop.
+     */
+    if (q_ > cpi_->ni_av_qi) cpi_->ni_av_qi = q_ - 1;
+  }
+
+  cpi_->bits_off_target +=
+      cpi_->av_per_frame_bandwidth - cpi_->projected_frame_size;
+  if (cpi_->bits_off_target > cpi_->oxcf.maximum_buffer_size) {
+    cpi_->bits_off_target = cpi_->oxcf.maximum_buffer_size;
+  }
+
+  cpi_->total_actual_bits += cpi_->projected_frame_size;
+  cpi_->buffer_level = cpi_->bits_off_target;
+
+  cpi_->common.current_video_frame++;
+  cpi_->frames_since_key++;
+}
+}  // namespace libvpx
diff --git a/vp8/vp8_ratectrl_rtc.h b/vp8/vp8_ratectrl_rtc.h
new file mode 100644
index 0000000000..a1cd52b051
--- /dev/null
+++ b/vp8/vp8_ratectrl_rtc.h
@@ -0,0 +1,62 @@
+/*
+ *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_RATECTRL_RTC_H_
+#define VPX_VP8_RATECTRL_RTC_H_
+
+#include <cstdint>
+#include <memory>
+
+#include "vp8/encoder/onyx_int.h"
+#include "vp8/common/common.h"
+#include "vpx/internal/vpx_ratectrl_rtc.h"
+
+namespace libvpx {
+struct VP8RateControlRtcConfig : public VpxRateControlRtcConfig {
+ public:
+  VP8RateControlRtcConfig() {
+    vp8_zero(layer_target_bitrate);
+    vp8_zero(ts_rate_decimator);
+  }
+};
+
+struct VP8FrameParamsQpRTC {
+  FRAME_TYPE frame_type;
+};
+
+class VP8RateControlRTC {
+ public:
+  static std::unique_ptr<VP8RateControlRTC> Create(
+      const VP8RateControlRtcConfig &cfg);
+  ~VP8RateControlRTC() {
+    if (cpi_) {
+      vpx_free(cpi_->gf_active_flags);
+      vpx_free(cpi_);
+    }
+  }
+
+  void UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg);
+  // GetQP() needs to be called after ComputeQP() to get the latest QP
+  int GetQP() const;
+  // int GetLoopfilterLevel() const;
+  void ComputeQP(const VP8FrameParamsQpRTC &frame_params);
+  // Feedback to rate control with the size of current encoded frame
+  void PostEncodeUpdate(uint64_t encoded_frame_size);
+
+ private:
+  VP8RateControlRTC() {}
+  void InitRateControl(const VP8RateControlRtcConfig &cfg);
+  VP8_COMP *cpi_;
+  int q_;
+};
+
+}  // namespace libvpx
+
+#endif  // VPX_VP8_RATECTRL_RTC_H_
diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h
index 5cc7ec9457..d2b9417aef 100644
--- a/vp9/ratectrl_rtc.h
+++ b/vp9/ratectrl_rtc.h
@@ -22,28 +22,14 @@
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/vp9_cx_iface.h"
+#include "vpx/internal/vpx_ratectrl_rtc.h"
 #include "vpx_mem/vpx_mem.h"
 
 namespace libvpx {
 
-struct VP9RateControlRtcConfig {
+struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig {
  public:
   VP9RateControlRtcConfig() {
-    width = 1280;
-    height = 720;
-    max_quantizer = 63;
-    min_quantizer = 2;
-    target_bandwidth = 1000;
-    buf_initial_sz = 600;
-    buf_optimal_sz = 600;
-    buf_sz = 1000;
-    undershoot_pct = overshoot_pct = 50;
-    max_intra_bitrate_pct = 50;
-    max_inter_bitrate_pct = 0;
-    framerate = 30.0;
-    ss_number_layers = ts_number_layers = 1;
-    rc_mode = VPX_CBR;
-    aq_mode = 0;
     vp9_zero(max_quantizers);
     vp9_zero(min_quantizers);
     vp9_zero(scaling_factor_den);
@@ -52,26 +38,10 @@ struct VP9RateControlRtcConfig {
     vp9_zero(ts_rate_decimator);
     scaling_factor_num[0] = 1;
     scaling_factor_den[0] = 1;
-    layer_target_bitrate[0] = static_cast<int>(target_bandwidth);
     max_quantizers[0] = max_quantizer;
     min_quantizers[0] = min_quantizer;
-    ts_rate_decimator[0] = 1;
   }
 
-  int width;
-  int height;
-  // 0-63
-  int max_quantizer;
-  int min_quantizer;
-  int64_t target_bandwidth;
-  int64_t buf_initial_sz;
-  int64_t buf_optimal_sz;
-  int64_t buf_sz;
-  int undershoot_pct;
-  int overshoot_pct;
-  int max_intra_bitrate_pct;
-  int max_inter_bitrate_pct;
-  double framerate;
   // Number of spatial layers
   int ss_number_layers;
   // Number of temporal layers
@@ -80,11 +50,6 @@ struct VP9RateControlRtcConfig {
   int min_quantizers[VPX_MAX_LAYERS];
   int scaling_factor_num[VPX_SS_MAX_LAYERS];
   int scaling_factor_den[VPX_SS_MAX_LAYERS];
-  int layer_target_bitrate[VPX_MAX_LAYERS];
-  int ts_rate_decimator[VPX_TS_MAX_LAYERS];
-  // vbr, cbr
-  enum vpx_rc_mode rc_mode;
-  int aq_mode;
 };
 
 struct VP9FrameParamsQpRTC {
@@ -94,7 +59,7 @@ struct VP9FrameParamsQpRTC {
 };
 
 // This interface allows using VP9 real-time rate control without initializing
-// the encoder. To use this interface, you need to link with libvp9rc.a.
+// the encoder. To use this interface, you need to link with libvpxrc.a.
 //
 // #include "vp9/ratectrl_rtc.h"
 // VP9RateControlRTC rc_api;
diff --git a/vpx/internal/vpx_ratectrl_rtc.h b/vpx/internal/vpx_ratectrl_rtc.h
new file mode 100644
index 0000000000..0474e0a85b
--- /dev/null
+++ b/vpx/internal/vpx_ratectrl_rtc.h
@@ -0,0 +1,62 @@
+/*
+ *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_RATECTRL_RTC_H_
+#define VPX_VPX_RATECTRL_RTC_H_
+
+#include "vpx/vpx_encoder.h"
+
+namespace libvpx {
+struct VpxRateControlRtcConfig {
+ public:
+  VpxRateControlRtcConfig() {
+    width = 1280;
+    height = 720;
+    max_quantizer = 63;
+    min_quantizer = 2;
+    target_bandwidth = 1000;
+    buf_initial_sz = 600;
+    buf_optimal_sz = 600;
+    buf_sz = 1000;
+    undershoot_pct = overshoot_pct = 50;
+    max_intra_bitrate_pct = 50;
+    max_inter_bitrate_pct = 0;
+    framerate = 30.0;
+    ts_number_layers = 1;
+    rc_mode = VPX_CBR;
+    aq_mode = 0;
+    layer_target_bitrate[0] = static_cast<int>(target_bandwidth);
+    ts_rate_decimator[0] = 1;
+  }
+
+  int width;
+  int height;
+  // 0-63
+  int max_quantizer;
+  int min_quantizer;
+  int64_t target_bandwidth;
+  int64_t buf_initial_sz;
+  int64_t buf_optimal_sz;
+  int64_t buf_sz;
+  int undershoot_pct;
+  int overshoot_pct;
+  int max_intra_bitrate_pct;
+  int max_inter_bitrate_pct;
+  double framerate;
+  // Number of temporal layers
+  int ts_number_layers;
+  int layer_target_bitrate[VPX_MAX_LAYERS];
+  int ts_rate_decimator[VPX_TS_MAX_LAYERS];
+  // vbr, cbr
+  enum vpx_rc_mode rc_mode;
+  int aq_mode;
+};
+}  // namespace libvpx
+#endif
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index 17afac754b..28bd861747 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -747,7 +747,7 @@ enum vp8e_enc_control_id {
  *
  * This will turn off cyclic refresh for vp8.
* - * With those, the rate control is expected to work exactly the same as the + * With this, the rate control is expected to work exactly the same as the * interface provided in vp8_ratectrl_rtc.cc/h * * Supported in codecs: VP8 diff --git a/vpx/vpx_codec.mk b/vpx/vpx_codec.mk index 350dc247bc..de86579d58 100644 --- a/vpx/vpx_codec.mk +++ b/vpx/vpx_codec.mk @@ -33,6 +33,7 @@ API_SRCS-yes += vpx_decoder.h API_SRCS-yes += src/vpx_encoder.c API_SRCS-yes += vpx_encoder.h API_SRCS-yes += internal/vpx_codec_internal.h +API_SRCS-yes += internal/vpx_ratectrl_rtc.h API_SRCS-yes += src/vpx_codec.c API_SRCS-yes += src/vpx_image.c API_SRCS-yes += vpx_codec.h From 8a6fbc0b4eb8538e213782bcdc3969a08b44e73b Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Fri, 10 Sep 2021 15:54:51 -0700 Subject: [PATCH 143/926] Define the VPX_NO_RETURN macro for MSVC Define VPX_NO_RETURN as __declspec(noreturn) for MSVC. See https://docs.microsoft.com/en-us/cpp/cpp/noreturn?view=msvc-160 This requires moving VPX_NO_RETURN before function declarations because __declspec(noreturn) must be placed there. Fortunately GCC's __attribute__((noreturn)) can be placed either before or after function declarations. Change-Id: Id9bb0077e2a4f16ec2ca9c913dd93673a0e385cf --- args.c | 6 ++++-- tools_common.h | 10 ++++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/args.c b/args.c index a87b138b9d..17b615584e 100644 --- a/args.c +++ b/args.c @@ -16,8 +16,10 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/msvc.h" -#if defined(__GNUC__) && __GNUC__ -extern void die(const char *fmt, ...) __attribute__((noreturn)); +#if defined(__GNUC__) +__attribute__((noreturn)) extern void die(const char *fmt, ...); +#elif defined(_MSC_VER) +__declspec(noreturn) extern void die(const char *fmt, ...); #else extern void die(const char *fmt, ...); #endif diff --git a/tools_common.h b/tools_common.h index 4526d9f165..4e8851fc15 100644 --- a/tools_common.h +++ b/tools_common.h @@ -110,6 +110,8 @@ extern "C" { #if defined(__GNUC__) #define VPX_NO_RETURN __attribute__((noreturn)) +#elif defined(_MSC_VER) +#define VPX_NO_RETURN __declspec(noreturn) #else #define VPX_NO_RETURN #endif @@ -117,14 +119,14 @@ extern "C" { /* Sets a stdio stream into binary mode */ FILE *set_binary_mode(FILE *stream); -void die(const char *fmt, ...) VPX_NO_RETURN; -void fatal(const char *fmt, ...) 
VPX_NO_RETURN;
+VPX_NO_RETURN void die(const char *fmt, ...);
+VPX_NO_RETURN void fatal(const char *fmt, ...);
 void warn(const char *fmt, ...);
 
-void die_codec(vpx_codec_ctx_t *ctx, const char *s) VPX_NO_RETURN;
+VPX_NO_RETURN void die_codec(vpx_codec_ctx_t *ctx, const char *s);
 
 /* The tool including this file must define usage_exit() */
-void usage_exit(void) VPX_NO_RETURN;
+VPX_NO_RETURN void usage_exit(void);
 
 #undef VPX_NO_RETURN
 
From 7366195e5a7098de0b7c131f40dd5238b9065a56 Mon Sep 17 00:00:00 2001
From: Jerome Jiang
Date: Thu, 16 Sep 2021 10:19:09 -0700
Subject: [PATCH 144/926] vp8 rc: explicit cast to avoid VS build failure

Change-Id: I6a4daca12b79cf996964661e1af85aa6e258b446
---
 vp8/vp8_ratectrl_rtc.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vp8/vp8_ratectrl_rtc.cc b/vp8/vp8_ratectrl_rtc.cc
index c42ab971e2..b489940cb7 100644
--- a/vp8/vp8_ratectrl_rtc.cc
+++ b/vp8/vp8_ratectrl_rtc.cc
@@ -100,7 +100,8 @@ void VP8RateControlRTC::UpdateRateControl(
   cpi_->worst_quality = oxcf->worst_allowed_q;
   cpi_->best_quality = oxcf->best_allowed_q;
   cpi_->output_framerate = rc_cfg.framerate;
-  oxcf->target_bandwidth = 1000 * rc_cfg.target_bandwidth;
+  oxcf->target_bandwidth =
+      static_cast<unsigned int>(1000 * rc_cfg.target_bandwidth);
   oxcf->fixed_q = -1;
   oxcf->error_resilient_mode = 1;
   oxcf->starting_buffer_level_in_ms = rc_cfg.buf_initial_sz;

From 09775194ffdb84b4979f3988e7ef301575b661df Mon Sep 17 00:00:00 2001
From: Jerome Jiang
Date: Mon, 20 Sep 2021 13:37:43 -0700
Subject: [PATCH 145/926] Cap duration to avoid overflow

Bug: webm:1728
Change-Id: Id13475660fa921e8ddcc89847e978da4c8d85886
---
 vp8/encoder/onyx_if.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index fc154afd14..cdcb0a09f7 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -4921,6 +4921,8 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
       this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen;
       last_duration =
           cpi->last_end_time_stamp_seen - cpi->last_time_stamp_seen;
+      // Cap this to avoid overflow of (this_duration - last_duration) * 10
+      this_duration = VPXMIN(this_duration, INT64_MAX / 10);
       /* do a step update if the duration changes by 10% */
       if (last_duration) {
         step = (int)(((this_duration - last_duration) * 10 / last_duration));

From 0de415cf6a945457115783807a702a5249f44a9d Mon Sep 17 00:00:00 2001
From: Jerome Jiang
Date: Thu, 16 Sep 2021 10:16:44 -0700
Subject: [PATCH 146/926] vp8 rc: support temporal layers

Change-Id: I2c7d5de0e17b072cb763f1659b1badce4fe0b82b
---
 test/vp8_ratectrl_rtc_test.cc | 181 ++++++++++++++++++++++++++++++++--
 vp8/encoder/onyx_if.c         |  40 ++++----
 vp8/encoder/onyx_int.h        |   6 ++
 vp8/vp8_ratectrl_rtc.cc       |  61 +++++++++++-
 vp8/vp8_ratectrl_rtc.h        |   1 +
 5 files changed, 257 insertions(+), 32 deletions(-)

diff --git a/test/vp8_ratectrl_rtc_test.cc b/test/vp8_ratectrl_rtc_test.cc
index d5032b38e7..ad310666e7 100644
--- a/test/vp8_ratectrl_rtc_test.cc
+++ b/test/vp8_ratectrl_rtc_test.cc
@@ -61,20 +61,81 @@ class Vp8RcInterfaceTest
     SetMode(::libvpx_test::kRealTime);
   }
 
+  // From error_resilience_test.cc
+  int SetFrameFlags(int frame_num, int num_temp_layers) {
+    int frame_flags = 0;
+    if (num_temp_layers == 2) {
+      if (frame_num % 2 == 0) {
+        // Layer 0: predict from L and ARF, update L.
+        frame_flags =
+            VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
+      } else {
+        // Layer 1: predict from L, G and ARF, and update G.
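+        // Only the golden frame is updated here; NO_UPD_ENTROPY additionally
+        // leaves the entropy context unchanged, so these layer-1 frames stay
+        // droppable.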
+        frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST |
+                      VP8_EFLAG_NO_UPD_ENTROPY;
+      }
+    } else if (num_temp_layers == 3) {
+      if (frame_num % 4 == 0) {
+        // Layer 0: predict from L, update L.
+        frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+                      VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
+      } else if ((frame_num - 2) % 4 == 0) {
+        // Layer 1: predict from L, G, update G.
+        frame_flags =
+            VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_REF_ARF;
+      } else if ((frame_num - 1) % 2 == 0) {
+        // Layer 2: predict from L, G, ARF; update ARF.
+        frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST;
+      }
+    }
+    return frame_flags;
+  }
+
+  int SetLayerId(int frame_num, int num_temp_layers) {
+    int layer_id = 0;
+    if (num_temp_layers == 2) {
+      if (frame_num % 2 == 0) {
+        layer_id = 0;
+      } else {
+        layer_id = 1;
+      }
+    } else if (num_temp_layers == 3) {
+      if (frame_num % 4 == 0) {
+        layer_id = 0;
+      } else if ((frame_num - 2) % 4 == 0) {
+        layer_id = 1;
+      } else if ((frame_num - 1) % 2 == 0) {
+        layer_id = 2;
+      }
+    }
+    return layer_id;
+  }
+
   virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                   ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 0) {
-      encoder->Control(VP8E_SET_CPUUSED, -6);
-      encoder->Control(VP8E_SET_RTC_EXTERNAL_RATECTRL, 1);
-      encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000);
+    if (rc_cfg_.ts_number_layers > 1) {
+      const int layer_id = SetLayerId(video->frame(), cfg_.ts_number_layers);
+      const int frame_flags =
+          SetFrameFlags(video->frame(), cfg_.ts_number_layers);
+      frame_params_.temporal_layer_id = layer_id;
+      if (video->frame() > 0) {
+        encoder->Control(VP8E_SET_TEMPORAL_LAYER_ID, layer_id);
+        encoder->Control(VP8E_SET_FRAME_FLAGS, frame_flags);
+      }
+    } else {
+      if (video->frame() == 0) {
+        encoder->Control(VP8E_SET_CPUUSED, -6);
+        encoder->Control(VP8E_SET_RTC_EXTERNAL_RATECTRL, 1);
+        encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000);
+      }
+      if (frame_params_.frame_type == INTER_FRAME) {
+        // Disable golden frame update.
+        frame_flags_ |= VP8_EFLAG_NO_UPD_GF;
+        frame_flags_ |= VP8_EFLAG_NO_UPD_ARF;
+      }
     }
     frame_params_.frame_type =
         video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME;
-    if (frame_params_.frame_type == INTER_FRAME) {
-      // Disable golden frame update.
-      frame_flags_ |= VP8_EFLAG_NO_UPD_GF;
-      frame_flags_ |= VP8_EFLAG_NO_UPD_ARF;
-    }
     encoder_exit_ = video->frame() == test_video_.frames;
   }
 
@@ -125,6 +186,38 @@ class Vp8RcInterfaceTest
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
   }
 
+  void RunTemporalLayers2TL() {
+    test_video_ = GET_PARAM(2);
+    target_bitrate_ = GET_PARAM(1);
+    if (test_video_.width == 1280 && target_bitrate_ == 200) return;
+    if (test_video_.width == 640 && target_bitrate_ == 1000) return;
+    SetConfigTemporalLayers(2);
+    rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_);
+    rc_api_->UpdateRateControl(rc_cfg_);
+
+    ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width,
+                                         test_video_.height, 30, 1, 0,
+                                         test_video_.frames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunTemporalLayers3TL() {
+    test_video_ = GET_PARAM(2);
+    target_bitrate_ = GET_PARAM(1);
+    if (test_video_.width == 1280 && target_bitrate_ == 200) return;
+    if (test_video_.width == 640 && target_bitrate_ == 1000) return;
+    SetConfigTemporalLayers(3);
+    rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_);
+    rc_api_->UpdateRateControl(rc_cfg_);
+
+    ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width,
+                                         test_video_.height, 30, 1, 0,
+                                         test_video_.frames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
  private:
  void SetConfig() {
    rc_cfg_.width = test_video_.width;
@@ -160,6 +253,72 @@ class Vp8RcInterfaceTest
     cfg_.kf_max_dist = key_interval_;
   }
 
+  void SetConfigTemporalLayers(int temporal_layers) {
+    rc_cfg_.width = test_video_.width;
+    rc_cfg_.height = test_video_.height;
+    rc_cfg_.max_quantizer = 60;
+    rc_cfg_.min_quantizer = 2;
+    rc_cfg_.target_bandwidth = target_bitrate_;
+    rc_cfg_.buf_initial_sz = 600;
+    rc_cfg_.buf_optimal_sz = 600;
+    rc_cfg_.buf_sz = target_bitrate_;
+    rc_cfg_.undershoot_pct = 50;
+    rc_cfg_.overshoot_pct = 50;
+    rc_cfg_.max_intra_bitrate_pct = 1000;
+    rc_cfg_.framerate = 30.0;
+    if (temporal_layers == 2) {
+      rc_cfg_.layer_target_bitrate[0] = 60 * target_bitrate_ / 100;
+      rc_cfg_.layer_target_bitrate[1] = target_bitrate_;
+      rc_cfg_.ts_rate_decimator[0] = 2;
+      rc_cfg_.ts_rate_decimator[1] = 1;
+    } else if (temporal_layers == 3) {
+      rc_cfg_.layer_target_bitrate[0] = 40 * target_bitrate_ / 100;
+      rc_cfg_.layer_target_bitrate[1] = 60 * target_bitrate_ / 100;
+      rc_cfg_.layer_target_bitrate[2] = target_bitrate_;
+      rc_cfg_.ts_rate_decimator[0] = 4;
+      rc_cfg_.ts_rate_decimator[1] = 2;
+      rc_cfg_.ts_rate_decimator[2] = 1;
+    }
+
+    rc_cfg_.ts_number_layers = temporal_layers;
+
+    // Encoder settings for ground truth.
+    cfg_.g_w = test_video_.width;
+    cfg_.g_h = test_video_.height;
+    cfg_.rc_undershoot_pct = 50;
+    cfg_.rc_overshoot_pct = 50;
+    cfg_.rc_buf_initial_sz = 600;
+    cfg_.rc_buf_optimal_sz = 600;
+    cfg_.rc_buf_sz = target_bitrate_;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 60;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 1;
+    cfg_.rc_target_bitrate = target_bitrate_;
+    cfg_.kf_min_dist = key_interval_;
+    cfg_.kf_max_dist = key_interval_;
+    // Temporal layers (2 or 3), no spatial layers, CBR mode.
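+    // Note the per-layer target bitrates are cumulative: each entry is the
+    // total rate at and below that layer, not a per-layer increment.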
+ cfg_.ss_number_layers = 1; + cfg_.ts_number_layers = temporal_layers; + if (temporal_layers == 2) { + cfg_.ts_rate_decimator[0] = 2; + cfg_.ts_rate_decimator[1] = 1; + cfg_.ts_periodicity = 2; + cfg_.ts_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.ts_target_bitrate[1] = cfg_.rc_target_bitrate; + } else if (temporal_layers == 3) { + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.ts_periodicity = 4; + cfg_.ts_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100; + cfg_.ts_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.ts_target_bitrate[2] = cfg_.rc_target_bitrate; + } + } + std::unique_ptr rc_api_; libvpx::VP8RateControlRtcConfig rc_cfg_; int key_interval_; @@ -173,6 +332,10 @@ TEST_P(Vp8RcInterfaceTest, OneLayer) { RunOneLayer(); } TEST_P(Vp8RcInterfaceTest, OneLayerPeriodicKey) { RunPeriodicKey(); } +TEST_P(Vp8RcInterfaceTest, TemporalLayers2TL) { RunTemporalLayers2TL(); } + +TEST_P(Vp8RcInterfaceTest, TemporalLayers3TL) { RunTemporalLayers3TL(); } + VP8_INSTANTIATE_TEST_SUITE(Vp8RcInterfaceTest, ::testing::Values(200, 400, 1000), ::testing::ValuesIn(kVp8RCTestVectors)); diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index fc154afd14..5e00732786 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -183,7 +183,7 @@ static const unsigned char inter_minq[QINDEX_RANGE] = { extern FILE *vpxlogc; #endif -static void save_layer_context(VP8_COMP *cpi) { +void vp8_save_layer_context(VP8_COMP *cpi) { LAYER_CONTEXT *lc = &cpi->layer_context[cpi->current_layer]; /* Save layer dependent coding state */ @@ -222,7 +222,7 @@ static void save_layer_context(VP8_COMP *cpi) { sizeof(cpi->mb.count_mb_ref_frame_usage)); } -static void restore_layer_context(VP8_COMP *cpi, const int layer) { +void vp8_restore_layer_context(VP8_COMP *cpi, const int layer) { LAYER_CONTEXT *lc = &cpi->layer_context[layer]; /* Restore layer dependent coding state */ @@ -269,9 +269,9 @@ static int rescale(int val, int num, int denom) { return (int)(llval * llnum / llden); } -static void init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf, - const int layer, - double prev_layer_framerate) { +void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf, + const int layer, + double prev_layer_framerate) { LAYER_CONTEXT *lc = &cpi->layer_context[layer]; lc->framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[layer]; @@ -336,12 +336,12 @@ static void reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf, // We need this to set the layer context for the new layers below. if (prev_num_layers == 1) { cpi->current_layer = 0; - save_layer_context(cpi); + vp8_save_layer_context(cpi); } for (i = 0; i < curr_num_layers; ++i) { LAYER_CONTEXT *lc = &cpi->layer_context[i]; if (i >= prev_num_layers) { - init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate); + vp8_init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate); } // The initial buffer levels are set based on their starting levels. // We could set the buffer levels based on the previous state (normalized @@ -356,7 +356,7 @@ static void reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf, // state (to smooth-out quality dips/rate fluctuation at transition)? 
// We need to treat the 1 layer case separately: oxcf.target_bitrate[i] - // is not set for 1 layer, and the restore_layer_context/save_context() + // is not set for 1 layer, and the vp8_restore_layer_context/save_context() // are not called in the encoding loop, so we need to call it here to // pass the layer context state to |cpi|. if (curr_num_layers == 1) { @@ -364,7 +364,7 @@ static void reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf, lc->buffer_level = cpi->oxcf.starting_buffer_level_in_ms * lc->target_bandwidth / 1000; lc->bits_off_target = lc->buffer_level; - restore_layer_context(cpi, 0); + vp8_restore_layer_context(cpi, 0); } prev_layer_framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[i]; } @@ -1274,7 +1274,7 @@ void vp8_new_framerate(VP8_COMP *cpi, double framerate) { cpi->framerate = framerate; cpi->output_framerate = framerate; cpi->per_frame_bandwidth = - (int)(cpi->oxcf.target_bandwidth / cpi->output_framerate); + (int)round(cpi->oxcf.target_bandwidth / cpi->output_framerate); cpi->av_per_frame_bandwidth = cpi->per_frame_bandwidth; cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100); @@ -1365,7 +1365,7 @@ static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { double prev_layer_framerate = 0; for (i = 0; i < cpi->oxcf.number_of_layers; ++i) { - init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate); + vp8_init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate); prev_layer_framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[i]; } @@ -1382,7 +1382,7 @@ static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { #endif } -static void update_layer_contexts(VP8_COMP *cpi) { +void vp8_update_layer_contexts(VP8_COMP *cpi) { VP8_CONFIG *oxcf = &cpi->oxcf; /* Update snapshots of the layer contexts to reflect new parameters */ @@ -1417,8 +1417,8 @@ static void update_layer_contexts(VP8_COMP *cpi) { /* Work out the average size of a frame within this layer */ if (i > 0) { lc->avg_frame_size_for_layer = - (int)((oxcf->target_bitrate[i] - oxcf->target_bitrate[i - 1]) * - 1000 / (lc->framerate - prev_layer_framerate)); + (int)round((oxcf->target_bitrate[i] - oxcf->target_bitrate[i - 1]) * + 1000 / (lc->framerate - prev_layer_framerate)); } prev_layer_framerate = lc->framerate; @@ -3261,7 +3261,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, #endif // !CONFIG_REALTIME_ONLY default: cpi->per_frame_bandwidth = - (int)(cpi->target_bandwidth / cpi->output_framerate); + (int)round(cpi->target_bandwidth / cpi->output_framerate); break; } @@ -4554,8 +4554,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, for (i = cpi->current_layer + 1; i < cpi->oxcf.number_of_layers; ++i) { LAYER_CONTEXT *lc = &cpi->layer_context[i]; - int bits_off_for_this_layer = (int)(lc->target_bandwidth / lc->framerate - - cpi->projected_frame_size); + int bits_off_for_this_layer = (int)round( + lc->target_bandwidth / lc->framerate - cpi->projected_frame_size); lc->bits_off_target += bits_off_for_this_layer; @@ -4990,7 +4990,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, if (cpi->oxcf.number_of_layers > 1) { int layer; - update_layer_contexts(cpi); + vp8_update_layer_contexts(cpi); /* Restore layer specific context & set frame rate */ if (cpi->temporal_layer_id >= 0) { @@ -5000,7 +5000,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, cpi->oxcf .layer_id[cpi->temporal_pattern_counter % cpi->oxcf.periodicity]; } - 
restore_layer_context(cpi, layer);
+      vp8_restore_layer_context(cpi, layer);
       vp8_new_framerate(cpi, cpi->layer_context[layer].framerate);
     }
 
@@ -5131,7 +5131,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
   }
 
   /* Save layer specific state */
-  if (cpi->oxcf.number_of_layers > 1) save_layer_context(cpi);
+  if (cpi->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi);
 
   vpx_usec_timer_mark(&cmptimer);
   cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index a29994a135..7f8298e44a 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -712,6 +712,12 @@ void vp8_initialize_enc(void);
 void vp8_alloc_compressor_data(VP8_COMP *cpi);
 int vp8_reverse_trans(int x);
+void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf,
+                                     const int layer,
+                                     double prev_layer_framerate);
+void vp8_update_layer_contexts(VP8_COMP *cpi);
+void vp8_save_layer_context(VP8_COMP *cpi);
+void vp8_restore_layer_context(VP8_COMP *cpi, const int layer);
 void vp8_new_framerate(VP8_COMP *cpi, double framerate);
 void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm);
 
diff --git a/vp8/vp8_ratectrl_rtc.cc b/vp8/vp8_ratectrl_rtc.cc
index b489940cb7..2098edaf97 100644
--- a/vp8/vp8_ratectrl_rtc.cc
+++ b/vp8/vp8_ratectrl_rtc.cc
@@ -8,9 +8,11 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <math.h>
 #include <new>
 #include "vp8/vp8_ratectrl_rtc.h"
 #include "vp8/encoder/ratectrl.h"
+#include "vpx_ports/system_state.h"
 
 namespace libvpx {
 /* Quant MOD */
@@ -90,7 +92,7 @@ void VP8RateControlRTC::UpdateRateControl(
     const VP8RateControlRtcConfig &rc_cfg) {
   VP8_COMMON *cm = &cpi_->common;
   VP8_CONFIG *oxcf = &cpi_->oxcf;
-
+  vpx_clear_system_state();
   cm->Width = rc_cfg.width;
   cm->Height = rc_cfg.height;
   oxcf->Width = rc_cfg.width;
@@ -102,6 +104,7 @@ void VP8RateControlRTC::UpdateRateControl(
   cpi_->output_framerate = rc_cfg.framerate;
   oxcf->target_bandwidth =
       static_cast<unsigned int>(1000 * rc_cfg.target_bandwidth);
+  cpi_->ref_framerate = cpi_->output_framerate;
   oxcf->fixed_q = -1;
   oxcf->error_resilient_mode = 1;
   oxcf->starting_buffer_level_in_ms = rc_cfg.buf_initial_sz;
@@ -110,7 +113,7 @@ void VP8RateControlRTC::UpdateRateControl(
   oxcf->starting_buffer_level = rc_cfg.buf_initial_sz;
   oxcf->optimal_buffer_level = rc_cfg.buf_optimal_sz;
   oxcf->maximum_buffer_size = rc_cfg.buf_sz;
-  oxcf->number_of_layers = 1;
+  oxcf->number_of_layers = rc_cfg.ts_number_layers;
   cpi_->buffered_mode = oxcf->optimal_buffer_level > 0;
   oxcf->under_shoot_pct = rc_cfg.undershoot_pct;
   oxcf->over_shoot_pct = rc_cfg.overshoot_pct;
@@ -121,6 +124,20 @@ void VP8RateControlRTC::UpdateRateControl(
         static_cast<int>(cpi_->output_framerate);
   }
 
+  if (oxcf->number_of_layers > 1) {
+    memcpy(oxcf->target_bitrate, rc_cfg.layer_target_bitrate,
+           sizeof(rc_cfg.layer_target_bitrate));
+    memcpy(oxcf->rate_decimator, rc_cfg.ts_rate_decimator,
+           sizeof(rc_cfg.ts_rate_decimator));
+    oxcf->periodicity = 2;
+
+    double prev_layer_framerate = 0;
+    for (unsigned int i = 0; i < oxcf->number_of_layers; ++i) {
+      vp8_init_temporal_layer_context(cpi_, oxcf, i, prev_layer_framerate);
+      prev_layer_framerate = cpi_->output_framerate / oxcf->rate_decimator[i];
+    }
+  }
+
   cpi_->total_actual_bits = 0;
   cpi_->total_target_vs_actual = 0;
 
@@ -155,6 +172,15 @@ void VP8RateControlRTC::UpdateRateControl(
 
 void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) {
   VP8_COMMON *const cm = &cpi_->common;
+  vpx_clear_system_state();
+  if (cpi_->oxcf.number_of_layers > 1) {
+    cpi_->temporal_layer_id = frame_params.temporal_layer_id;
+    const int layer = frame_params.temporal_layer_id;
+    vp8_update_layer_contexts(cpi_);
+    /* Restore layer specific context & set frame rate */
+    vp8_restore_layer_context(cpi_, layer);
+    vp8_new_framerate(cpi_, cpi_->layer_context[layer].framerate);
+  }
   cm->frame_type = frame_params.frame_type;
   cm->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0;
   cm->refresh_alt_ref_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0;
@@ -231,9 +257,15 @@ int VP8RateControlRTC::GetQP() const { return q_; }
 
 void VP8RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) {
   VP8_COMMON *const cm = &cpi_->common;
-
+  vpx_clear_system_state();
   cpi_->total_byte_count += encoded_frame_size;
   cpi_->projected_frame_size = static_cast<int>(encoded_frame_size << 3);
+  if (cpi_->oxcf.number_of_layers > 1) {
+    for (unsigned int i = cpi_->current_layer + 1;
+         i < cpi_->oxcf.number_of_layers; ++i) {
+      cpi_->layer_context[i].total_byte_count += encoded_frame_size;
+    }
+  }
 
   vp8_update_rate_correction_factors(cpi_, 2);
 
@@ -283,7 +315,30 @@ void VP8RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) {
   cpi_->total_actual_bits += cpi_->projected_frame_size;
   cpi_->buffer_level = cpi_->bits_off_target;
 
+  /* Propagate values to higher temporal layers */
+  if (cpi_->oxcf.number_of_layers > 1) {
+    for (unsigned int i = cpi_->current_layer + 1;
+         i < cpi_->oxcf.number_of_layers; ++i) {
+      LAYER_CONTEXT *lc = &cpi_->layer_context[i];
+      int bits_off_for_this_layer = (int)round(
+          lc->target_bandwidth / lc->framerate - cpi_->projected_frame_size);
+
+      lc->bits_off_target += bits_off_for_this_layer;
+
+      /* Clip buffer level to maximum buffer size for the layer */
+      if (lc->bits_off_target > lc->maximum_buffer_size) {
+        lc->bits_off_target = lc->maximum_buffer_size;
+      }
+
+      lc->total_actual_bits += cpi_->projected_frame_size;
+      lc->total_target_vs_actual += bits_off_for_this_layer;
+      lc->buffer_level = lc->bits_off_target;
+    }
+  }
+
   cpi_->common.current_video_frame++;
   cpi_->frames_since_key++;
+
+  if (cpi_->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi_);
 }
 }  // namespace libvpx
diff --git a/vp8/vp8_ratectrl_rtc.h b/vp8/vp8_ratectrl_rtc.h
index a1cd52b051..def7dd8f9e 100644
--- a/vp8/vp8_ratectrl_rtc.h
+++ b/vp8/vp8_ratectrl_rtc.h
@@ -29,6 +29,7 @@ struct VP8RateControlRtcConfig : public VpxRateControlRtcConfig {
 
 struct VP8FrameParamsQpRTC {
   FRAME_TYPE frame_type;
+  int temporal_layer_id;
 };
 
 class VP8RateControlRTC {
From b68877a7ebfe764714f8ce7aeb2a7f6d12b77989 Mon Sep 17 00:00:00 2001
From: Jerome Jiang
Date: Fri, 24 Sep 2021 14:56:00 -0700
Subject: [PATCH 147/926] vp8 rc: Clear system state at the end of calls

Clear system state at the end of rc calls to make sure the state is
consistent before and after.

Change-Id: I59fe9c99485b1a8603c20db37961339b7575455f
---
 vp8/vp8_ratectrl_rtc.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vp8/vp8_ratectrl_rtc.cc b/vp8/vp8_ratectrl_rtc.cc
index 2098edaf97..2f23c5b1d9 100644
--- a/vp8/vp8_ratectrl_rtc.cc
+++ b/vp8/vp8_ratectrl_rtc.cc
@@ -168,6 +168,7 @@ void VP8RateControlRTC::UpdateRateControl(
   }
 
   vp8_new_framerate(cpi_, cpi_->framerate);
+  vpx_clear_system_state();
 }
 
 void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) {
@@ -251,6 +252,7 @@ void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) {
 
   q_ = vp8_regulate_q(cpi_, cpi_->this_frame_target);
   vp8_set_quantizer(cpi_, q_);
+  vpx_clear_system_state();
 }
 
 int VP8RateControlRTC::GetQP() const { return q_; }
 
@@ -340,5
+342,6 @@ void VP8RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) { cpi_->frames_since_key++; if (cpi_->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi_); + vpx_clear_system_state(); } } // namespace libvpx From 5df4195b43e5b69572cdb1903d67d6f6c2917285 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Fri, 10 Sep 2021 15:54:51 -0700 Subject: [PATCH 148/926] Define the VPX_NO_RETURN macro for MSVC Define VPX_NO_RETURN as __declspec(noreturn) for MSVC. See https://docs.microsoft.com/en-us/cpp/cpp/noreturn?view=msvc-160 This requires moving VPX_NO_RETURN before function declarations because __declspec(noreturn) must be placed there. Fortunately GCC's __attribute__((noreturn)) can be placed either before or after function declarations. Change-Id: Id9bb0077e2a4f16ec2ca9c913dd93673a0e385cf (cherry picked from commit 8a6fbc0b4eb8538e213782bcdc3969a08b44e73b) --- args.c | 6 ++++-- tools_common.h | 10 ++++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/args.c b/args.c index a87b138b9d..17b615584e 100644 --- a/args.c +++ b/args.c @@ -16,8 +16,10 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/msvc.h" -#if defined(__GNUC__) && __GNUC__ -extern void die(const char *fmt, ...) __attribute__((noreturn)); +#if defined(__GNUC__) +__attribute__((noreturn)) extern void die(const char *fmt, ...); +#elif defined(_MSC_VER) +__declspec(noreturn) extern void die(const char *fmt, ...); #else extern void die(const char *fmt, ...); #endif diff --git a/tools_common.h b/tools_common.h index 4526d9f165..4e8851fc15 100644 --- a/tools_common.h +++ b/tools_common.h @@ -110,6 +110,8 @@ extern "C" { #if defined(__GNUC__) #define VPX_NO_RETURN __attribute__((noreturn)) +#elif defined(_MSC_VER) +#define VPX_NO_RETURN __declspec(noreturn) #else #define VPX_NO_RETURN #endif @@ -117,14 +119,14 @@ extern "C" { /* Sets a stdio stream into binary mode */ FILE *set_binary_mode(FILE *stream); -void die(const char *fmt, ...) VPX_NO_RETURN; -void fatal(const char *fmt, ...) 
VPX_NO_RETURN; +VPX_NO_RETURN void die(const char *fmt, ...); +VPX_NO_RETURN void fatal(const char *fmt, ...); void warn(const char *fmt, ...); -void die_codec(vpx_codec_ctx_t *ctx, const char *s) VPX_NO_RETURN; +VPX_NO_RETURN void die_codec(vpx_codec_ctx_t *ctx, const char *s); /* The tool including this file must define usage_exit() */ -void usage_exit(void) VPX_NO_RETURN; +VPX_NO_RETURN void usage_exit(void); #undef VPX_NO_RETURN From d00e68ad8789dc8bb210961532e20f0e9f6d55ae Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 20 Sep 2021 13:37:43 -0700 Subject: [PATCH 149/926] Cap duration to avoid overflow Bug: webm:1728 Change-Id: Id13475660fa921e8ddcc89847e978da4c8d85886 (cherry picked from commit 09775194ffdb84b4979f3988e7ef301575b661df) --- vp8/encoder/onyx_if.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 71ef057a4a..c6c162347f 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -4919,6 +4919,8 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen; last_duration = cpi->last_end_time_stamp_seen - cpi->last_time_stamp_seen; + // Cap this to avoid overflow of (this_duration - last_duration) * 10 + this_duration = VPXMIN(this_duration, INT64_MAX / 10); /* do a step update if the duration changes by 10% */ if (last_duration) { step = (int)(((this_duration - last_duration) * 10 / last_duration)); From 16837ae1680bbc73381570cc783439b0ea121ba6 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 27 Sep 2021 15:52:53 -0700 Subject: [PATCH 150/926] CHANGELOG for Smew v1.11.0 Bug: webm:1732 Change-Id: I6038f401cf1dfdcaca85b81d0b8b2c04967b44dd --- CHANGELOG | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index 6338caa380..ea2fc9d81c 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,29 @@ +2021-09-27 v1.11.0 "Smew Duck" + This maintenance release adds support for VBR mode in VP9 rate control + interface, new codec controls to get quantization parameters and loop filter + levels, and includes several improvements to NEON and numerous bug fixes. + + - Upgrading: + New codec control is added to get quantization parameters and loop filter + levels. + + VBR mode is supported in VP9 rate control library. + + - Enhancement: + Numerous improvements for Neon optimizations. + Code clean-up and refactoring. + Calculation of rd multiplier is changed with BDRATE gains. + + - Bug fixes: + Fix to overflow on duration. + Fix to several instances of -Wunused-but-set-variable. + Fix to avoid chroma resampling for 420mpeg2 input. + Fix to overflow in calc_iframe_target_size. + Fix to disallow skipping transform and quantization. + Fix some -Wsign-compare warnings in simple_encode. + Fix input file path in simple_encode_test. + Fix valid range for under/over_shoot pct. + 2021-03-09 v1.10.0 "Ruddy Duck" This maintenance release adds support for darwin20 and new codec controls, as well as numerous bug fixes. From f8733b3fb7eb3cf1154a9e693351097ec42005a2 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Fri, 1 Oct 2021 11:54:53 -0700 Subject: [PATCH 151/926] vp8: For screen mode: clip buffer from below Condition already existed for screen content mode, but only when frame-dropper was off. Remove the frame drop condition. 
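With the condition removed, the clamp for screen content (shown below as it
reads after this change) applies whether or not frame dropping is enabled:

  if (cpi->oxcf.screen_content_mode &&
      cpi->bits_off_target < -cpi->oxcf.maximum_buffer_size) {
    cpi->bits_off_target = -cpi->oxcf.maximum_buffer_size;
  }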
Change-Id: Ie7357041f5ca05b01e78b4bd3b40da060382591b
---
 vp8/encoder/onyx_if.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index c57c746469..6890a470a9 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -4516,10 +4516,10 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
     cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
   }
 
-  // If the frame dropper is not enabled, don't let the buffer level go below
-  // some threshold, given here by -|maximum_buffer_size|. For now we only do
-  // this for screen content input.
-  if (cpi->drop_frames_allowed == 0 && cpi->oxcf.screen_content_mode &&
+  // Don't let the buffer level go below some threshold, given here
+  // by -|maximum_buffer_size|. For now we only do this for
+  // screen content input.
+  if (cpi->oxcf.screen_content_mode &&
       cpi->bits_off_target < -cpi->oxcf.maximum_buffer_size) {
     cpi->bits_off_target = -cpi->oxcf.maximum_buffer_size;
   }

From 167de33ca8b24f072a88fcf51fbe782763717d00 Mon Sep 17 00:00:00 2001
From: Marco Paniconi
Date: Fri, 1 Oct 2021 13:16:56 -0700
Subject: [PATCH 152/926] vp8: Condition decimation drop logic on
 drop_frames_allowed

This allows the user to make sure a frame will be encoded when
drop_frames is set off (on the fly), no matter the state of the buffer.

Change-Id: Ia7b39b93fe3721dd586bdbede72c525db87b6890
---
 vp8/encoder/onyx_if.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index c57c746469..8466bba7c7 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -3481,7 +3481,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
    * Note that dropping a key frame can be problematic if spatial
    * resampling is also active
    */
-  if (cpi->decimation_factor > 0) {
+  if (cpi->decimation_factor > 0 && cpi->drop_frames_allowed) {
     switch (cpi->decimation_factor) {
       case 1:
         cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 3 / 2;

From fccaa5fa7a3e134949f5ea9fe3d4f3c388d4243b Mon Sep 17 00:00:00 2001
From: James Zern
Date: Fri, 1 Oct 2021 13:46:02 -0700
Subject: [PATCH 153/926] {vp8,vp9}_set_roi_map: fix validation with INT_MIN

previously ranges were checked with abs() whose behavior is undefined
with INT_MIN. this fixes a crash when the original value is returned
and it is later used as an offset into a table.

Bug: webm:1742
Change-Id: I345970b75c46699587a4fbc4a059e59277f4c2c8
---
 test/encode_api_test.cc   | 122 ++++++++++++++++++++++++++++++++++++--
 vp8/encoder/onyx_if.c     |  18 +++---
 vp9/encoder/vp9_encoder.c |  13 ++--
 3 files changed, 134 insertions(+), 19 deletions(-)

diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc
index 6bd7e593da..dec19b2268 100644
--- a/test/encode_api_test.cc
+++ b/test/encode_api_test.cc
@@ -8,6 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <climits>
+#include <cstring>
+
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 #include "./vpx_config.h"
@@ -18,6 +21,12 @@ namespace {
 
 #define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))
 
+bool IsVP9(const vpx_codec_iface_t *iface) {
+  static const char kVP9Name[] = "WebM Project VP9";
+  return strncmp(kVP9Name, vpx_codec_iface_name(iface), sizeof(kVP9Name) - 1) ==
+         0;
+}
+
 TEST(EncodeAPI, InvalidParams) {
   static const vpx_codec_iface_t *kCodecs[] = {
 #if CONFIG_VP8_ENCODER
@@ -184,10 +193,7 @@ TEST(EncodeAPI, MultiResEncode) {
   }
 
   // VP9 should report incapable, VP8 invalid for all configurations.
- const char kVP9Name[] = "WebM Project VP9"; - const bool is_vp9 = strncmp(kVP9Name, vpx_codec_iface_name(iface), - sizeof(kVP9Name) - 1) == 0; - EXPECT_EQ(is_vp9 ? VPX_CODEC_INCAPABLE : VPX_CODEC_INVALID_PARAM, + EXPECT_EQ(IsVP9(iface) ? VPX_CODEC_INCAPABLE : VPX_CODEC_INVALID_PARAM, vpx_codec_enc_init_multi(&enc[0], iface, &cfg[0], 2, 0, &dsf[0])); for (int i = 0; i < 2; i++) { @@ -196,4 +202,112 @@ TEST(EncodeAPI, MultiResEncode) { } } +TEST(EncodeAPI, SetRoi) { + static struct { + const vpx_codec_iface_t *iface; + int ctrl_id; + } kCodecs[] = { +#if CONFIG_VP8_ENCODER + { &vpx_codec_vp8_cx_algo, VP8E_SET_ROI_MAP }, +#endif +#if CONFIG_VP9_ENCODER + { &vpx_codec_vp9_cx_algo, VP9E_SET_ROI_MAP }, +#endif + }; + constexpr int kWidth = 64; + constexpr int kHeight = 64; + + for (const auto &codec : kCodecs) { + SCOPED_TRACE(vpx_codec_iface_name(codec.iface)); + vpx_codec_ctx_t enc; + vpx_codec_enc_cfg_t cfg; + + EXPECT_EQ(vpx_codec_enc_config_default(codec.iface, &cfg, 0), VPX_CODEC_OK); + cfg.g_w = kWidth; + cfg.g_h = kHeight; + EXPECT_EQ(vpx_codec_enc_init(&enc, codec.iface, &cfg, 0), VPX_CODEC_OK); + + vpx_roi_map_t roi = {}; + uint8_t roi_map[kWidth * kHeight] = {}; + if (IsVP9(codec.iface)) { + roi.rows = (cfg.g_w + 7) >> 3; + roi.cols = (cfg.g_h + 7) >> 3; + } else { + roi.rows = (cfg.g_w + 15) >> 4; + roi.cols = (cfg.g_h + 15) >> 4; + } + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), VPX_CODEC_OK); + + roi.roi_map = roi_map; + // VP8 only. This value isn't range checked. + roi.static_threshold[1] = 1000; + roi.static_threshold[2] = INT_MIN; + roi.static_threshold[3] = INT_MAX; + + for (const auto delta : { -63, -1, 0, 1, 63 }) { + for (int i = 0; i < 8; ++i) { + roi.delta_q[i] = delta; + roi.delta_lf[i] = delta; + // VP9 only. + roi.skip[i] ^= 1; + roi.ref_frame[i] = (roi.ref_frame[i] + 1) % 4; + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), VPX_CODEC_OK); + } + } + + vpx_codec_err_t expected_error; + for (const auto delta : { -64, 64, INT_MIN, INT_MAX }) { + expected_error = VPX_CODEC_INVALID_PARAM; + for (int i = 0; i < 8; ++i) { + roi.delta_q[i] = delta; + // The max segment count for VP8 is 4, the remainder of the entries are + // ignored. + if (i >= 4 && !IsVP9(codec.iface)) expected_error = VPX_CODEC_OK; + + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error) + << "delta_q[" << i << "]: " << delta; + roi.delta_q[i] = 0; + + roi.delta_lf[i] = delta; + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error) + << "delta_lf[" << i << "]: " << delta; + roi.delta_lf[i] = 0; + } + } + + // VP8 should ignore skip[] and ref_frame[] values. + expected_error = + IsVP9(codec.iface) ? VPX_CODEC_INVALID_PARAM : VPX_CODEC_OK; + for (const auto skip : { -2, 2, INT_MIN, INT_MAX }) { + for (int i = 0; i < 8; ++i) { + roi.skip[i] = skip; + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error) + << "skip[" << i << "]: " << skip; + roi.skip[i] = 0; + } + } + + // VP9 allows negative values to be used to disable segmentation. 
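+        // (VP8 is expected to ignore skip[] and ref_frame[], so toggling
+        // them must still yield VPX_CODEC_OK for both codecs here.)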
+ for (int ref_frame = -3; ref_frame < 0; ++ref_frame) { + for (int i = 0; i < 8; ++i) { + roi.ref_frame[i] = ref_frame; + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), VPX_CODEC_OK) + << "ref_frame[" << i << "]: " << ref_frame; + roi.ref_frame[i] = 0; + } + } + + for (const auto ref_frame : { 4, INT_MIN, INT_MAX }) { + for (int i = 0; i < 8; ++i) { + roi.ref_frame[i] = ref_frame; + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error) + << "ref_frame[" << i << "]: " << ref_frame; + roi.ref_frame[i] = 0; + } + } + + EXPECT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); + } +} + } // namespace diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index c57c746469..1b181cebe4 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -5320,17 +5320,13 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, return -1; } - // Range check the delta Q values and convert the external Q range values - // to internal ones. - if ((abs(delta_q[0]) > range) || (abs(delta_q[1]) > range) || - (abs(delta_q[2]) > range) || (abs(delta_q[3]) > range)) { - return -1; - } - - // Range check the delta lf values - if ((abs(delta_lf[0]) > range) || (abs(delta_lf[1]) > range) || - (abs(delta_lf[2]) > range) || (abs(delta_lf[3]) > range)) { - return -1; + for (i = 0; i < MAX_MB_SEGMENTS; ++i) { + // Note abs() alone can't be used as the behavior of abs(INT_MIN) is + // undefined. + if (delta_q[i] > range || delta_q[i] < -range || delta_lf[i] > range || + delta_lf[i] < -range) { + return -1; + } } // Also disable segmentation if no deltas are specified. diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 6cd8cb80e8..7e80835f6c 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -654,10 +654,15 @@ static void init_level_info(Vp9LevelInfo *level_info) { } static int check_seg_range(int seg_data[8], int range) { - return !(abs(seg_data[0]) > range || abs(seg_data[1]) > range || - abs(seg_data[2]) > range || abs(seg_data[3]) > range || - abs(seg_data[4]) > range || abs(seg_data[5]) > range || - abs(seg_data[6]) > range || abs(seg_data[7]) > range); + int i; + for (i = 0; i < 8; ++i) { + // Note abs() alone can't be used as the behavior of abs(INT_MIN) is + // undefined. + if (seg_data[i] > range || seg_data[i] < -range) { + return 0; + } + } + return 1; } VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) { From fe3b58cffa250aaa8c20ee2e53ce3e92cac3f440 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 1 Oct 2021 15:42:50 -0700 Subject: [PATCH 154/926] vpx_roi_map: add delta range info Change-Id: If2ef4400562075b4e7abadc01638a46c0c7f1859 --- vpx/vp8cx.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 28bd861747..47c38d3b5e 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -808,8 +808,8 @@ typedef struct vpx_roi_map { unsigned int rows; /**< Number of rows. */ unsigned int cols; /**< Number of columns. */ /*! VP8 only uses the first 4 segments. VP9 uses 8 segments. */ - int delta_q[8]; /**< Quantizer deltas. */ - int delta_lf[8]; /**< Loop filter deltas. */ + int delta_q[8]; /**< Quantizer deltas. Valid range: [-63, 63].*/ + int delta_lf[8]; /**< Loop filter deltas. Valid range: [-63, 63].*/ /*! skip and ref frame segment is only used in VP9. */ int skip[8]; /**< Skip this block. */ int ref_frame[8]; /**< Reference frame for this block. 
 */

From 2ea1b908d87b29bcc6214efd3073b92392d495ff Mon Sep 17 00:00:00 2001
From: James Zern
Date: Fri, 1 Oct 2021 13:46:02 -0700
Subject: [PATCH 155/926] {vp8,vp9}_set_roi_map: fix validation with INT_MIN

previously ranges were checked with abs() whose behavior is undefined
with INT_MIN. this fixes a crash when the original value is returned
and it is later used as an offset into a table.

Bug: webm:1742
Change-Id: I345970b75c46699587a4fbc4a059e59277f4c2c8
---
 test/encode_api_test.cc   | 122 ++++++++++++++++++++++++++++++++++++--
 vp8/encoder/onyx_if.c     |  18 +++---
 vp9/encoder/vp9_encoder.c |  13 ++--
 3 files changed, 134 insertions(+), 19 deletions(-)

diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc
index 6bd7e593da..dec19b2268 100644
--- a/test/encode_api_test.cc
+++ b/test/encode_api_test.cc
@@ -8,6 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <climits>
+#include <cstring>
+
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 #include "./vpx_config.h"
@@ -18,6 +21,12 @@ namespace {
 
 #define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))
 
+bool IsVP9(const vpx_codec_iface_t *iface) {
+  static const char kVP9Name[] = "WebM Project VP9";
+  return strncmp(kVP9Name, vpx_codec_iface_name(iface), sizeof(kVP9Name) - 1) ==
+         0;
+}
+
 TEST(EncodeAPI, InvalidParams) {
   static const vpx_codec_iface_t *kCodecs[] = {
 #if CONFIG_VP8_ENCODER
@@ -184,10 +193,7 @@ TEST(EncodeAPI, MultiResEncode) {
   }
 
   // VP9 should report incapable, VP8 invalid for all configurations.
-  const char kVP9Name[] = "WebM Project VP9";
-  const bool is_vp9 = strncmp(kVP9Name, vpx_codec_iface_name(iface),
-                              sizeof(kVP9Name) - 1) == 0;
-  EXPECT_EQ(is_vp9 ? VPX_CODEC_INCAPABLE : VPX_CODEC_INVALID_PARAM,
+  EXPECT_EQ(IsVP9(iface) ? VPX_CODEC_INCAPABLE : VPX_CODEC_INVALID_PARAM,
             vpx_codec_enc_init_multi(&enc[0], iface, &cfg[0], 2, 0, &dsf[0]));
 
   for (int i = 0; i < 2; i++) {
@@ -196,4 +202,112 @@ TEST(EncodeAPI, MultiResEncode) {
   }
 }
 
+TEST(EncodeAPI, SetRoi) {
+  static struct {
+    const vpx_codec_iface_t *iface;
+    int ctrl_id;
+  } kCodecs[] = {
+#if CONFIG_VP8_ENCODER
+    { &vpx_codec_vp8_cx_algo, VP8E_SET_ROI_MAP },
+#endif
+#if CONFIG_VP9_ENCODER
+    { &vpx_codec_vp9_cx_algo, VP9E_SET_ROI_MAP },
+#endif
+  };
+  constexpr int kWidth = 64;
+  constexpr int kHeight = 64;
+
+  for (const auto &codec : kCodecs) {
+    SCOPED_TRACE(vpx_codec_iface_name(codec.iface));
+    vpx_codec_ctx_t enc;
+    vpx_codec_enc_cfg_t cfg;
+
+    EXPECT_EQ(vpx_codec_enc_config_default(codec.iface, &cfg, 0), VPX_CODEC_OK);
+    cfg.g_w = kWidth;
+    cfg.g_h = kHeight;
+    EXPECT_EQ(vpx_codec_enc_init(&enc, codec.iface, &cfg, 0), VPX_CODEC_OK);
+
+    vpx_roi_map_t roi = {};
+    uint8_t roi_map[kWidth * kHeight] = {};
+    if (IsVP9(codec.iface)) {
+      roi.rows = (cfg.g_w + 7) >> 3;
+      roi.cols = (cfg.g_h + 7) >> 3;
+    } else {
+      roi.rows = (cfg.g_w + 15) >> 4;
+      roi.cols = (cfg.g_h + 15) >> 4;
+    }
+    EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), VPX_CODEC_OK);
+
+    roi.roi_map = roi_map;
+    // VP8 only. This value isn't range checked.
+    roi.static_threshold[1] = 1000;
+    roi.static_threshold[2] = INT_MIN;
+    roi.static_threshold[3] = INT_MAX;
+
+    for (const auto delta : { -63, -1, 0, 1, 63 }) {
+      for (int i = 0; i < 8; ++i) {
+        roi.delta_q[i] = delta;
+        roi.delta_lf[i] = delta;
+        // VP9 only.
+        roi.skip[i] ^= 1;
+        roi.ref_frame[i] = (roi.ref_frame[i] + 1) % 4;
+        EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), VPX_CODEC_OK);
+      }
+    }
+
+    vpx_codec_err_t expected_error;
+    for (const auto delta : { -64, 64, INT_MIN, INT_MAX }) {
+      expected_error = VPX_CODEC_INVALID_PARAM;
+      for (int i = 0; i < 8; ++i) {
+        roi.delta_q[i] = delta;
+        // The max segment count for VP8 is 4, the remainder of the entries are
+        // ignored.
+        if (i >= 4 && !IsVP9(codec.iface)) expected_error = VPX_CODEC_OK;
+
+        EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error)
+            << "delta_q[" << i << "]: " << delta;
+        roi.delta_q[i] = 0;
+
+        roi.delta_lf[i] = delta;
+        EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error)
+            << "delta_lf[" << i << "]: " << delta;
+        roi.delta_lf[i] = 0;
+      }
+    }
+
+    // VP8 should ignore skip[] and ref_frame[] values.
+    expected_error =
+        IsVP9(codec.iface) ? VPX_CODEC_INVALID_PARAM : VPX_CODEC_OK;
+    for (const auto skip : { -2, 2, INT_MIN, INT_MAX }) {
+      for (int i = 0; i < 8; ++i) {
+        roi.skip[i] = skip;
+        EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error)
+            << "skip[" << i << "]: " << skip;
+        roi.skip[i] = 0;
+      }
+    }
+
+    // VP9 allows negative values to be used to disable segmentation.
+    for (int ref_frame = -3; ref_frame < 0; ++ref_frame) {
+      for (int i = 0; i < 8; ++i) {
+        roi.ref_frame[i] = ref_frame;
+        EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), VPX_CODEC_OK)
+            << "ref_frame[" << i << "]: " << ref_frame;
+        roi.ref_frame[i] = 0;
+      }
+    }
+
+    for (const auto ref_frame : { 4, INT_MIN, INT_MAX }) {
+      for (int i = 0; i < 8; ++i) {
+        roi.ref_frame[i] = ref_frame;
+        EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error)
+            << "ref_frame[" << i << "]: " << ref_frame;
+        roi.ref_frame[i] = 0;
+      }
+    }
+
+    EXPECT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+  }
+}
+
 }  // namespace
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index c6c162347f..2b059a1e44 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -5318,17 +5318,13 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows,
     return -1;
   }

-  // Range check the delta Q values and convert the external Q range values
-  // to internal ones.
-  if ((abs(delta_q[0]) > range) || (abs(delta_q[1]) > range) ||
-      (abs(delta_q[2]) > range) || (abs(delta_q[3]) > range)) {
-    return -1;
-  }
-
-  // Range check the delta lf values
-  if ((abs(delta_lf[0]) > range) || (abs(delta_lf[1]) > range) ||
-      (abs(delta_lf[2]) > range) || (abs(delta_lf[3]) > range)) {
-    return -1;
+  for (i = 0; i < MAX_MB_SEGMENTS; ++i) {
+    // Note abs() alone can't be used as the behavior of abs(INT_MIN) is
+    // undefined.
+    if (delta_q[i] > range || delta_q[i] < -range || delta_lf[i] > range ||
+        delta_lf[i] < -range) {
+      return -1;
+    }
   }

   // Also disable segmentation if no deltas are specified.
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 6cd8cb80e8..7e80835f6c 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -654,10 +654,15 @@ static void init_level_info(Vp9LevelInfo *level_info) {
 }

 static int check_seg_range(int seg_data[8], int range) {
-  return !(abs(seg_data[0]) > range || abs(seg_data[1]) > range ||
-           abs(seg_data[2]) > range || abs(seg_data[3]) > range ||
-           abs(seg_data[4]) > range || abs(seg_data[5]) > range ||
-           abs(seg_data[6]) > range || abs(seg_data[7]) > range);
+  int i;
+  for (i = 0; i < 8; ++i) {
+    // Note abs() alone can't be used as the behavior of abs(INT_MIN) is
+    // undefined.
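+    // Comparing against both bounds directly avoids calling abs() at all.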
+    if (seg_data[i] > range || seg_data[i] < -range) {
+      return 0;
+    }
+  }
+  return 1;
 }

 VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) {

From 626ff35955c2c35b806b3e0ecf551a1a8611cdbf Mon Sep 17 00:00:00 2001
From: Jerome Jiang
Date: Tue, 5 Oct 2021 15:57:34 -0700
Subject: [PATCH 156/926] Update AUTHORS and version info in libs.mk

Bug: webm:1732
Change-Id: I29ce77c7d02bd2f5cb0ef8412333df032744b668
---
 AUTHORS | 6 ++++++
 libs.mk | 4 ++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index e804842f78..174cc59ee7 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -3,6 +3,7 @@ Aaron Watry
 Abo Talib Mahfoodh
+Adam B. Goode
 Adrian Grange
 Ahmad Sharif
 Aidan Welch
@@ -25,6 +26,7 @@ Angie Chiang
 Aron Rosenberg
 Attila Nagy
 Birk Magnussen
+Bohan Li
 Brian Foley
 Brion Vibber
 changjun.yang
@@ -34,6 +36,7 @@ Chi Yo Tsai
 chm
 Chris Cunningham
 Christian Duvivier
+Chunbo Hua
 Clement Courbet
 Daniele Castagna
 Daniel Kang
@@ -68,6 +71,7 @@ Han Shen
 Harish Mahendrakar
 Henrik Lundin
 Hien Ho
+Hirokazu Honda
 Hui Su
 Ivan Krasin
 Ivan Maltz
@@ -97,6 +101,7 @@ Johann Koenig
 John Koleszar
 Johnny Klonaris
 John Stark
+Jonathan Wright
 Jon Kunkee
 Jorge E. Moreira
 Joshua Bleecher Snyder
@@ -146,6 +151,7 @@ Pengchong Jin
 Peter Boström
 Peter Collingbourne
 Peter de Rivaz
+Peter Kasting
 Philip Jägenstedt
 Priit Laes
 Rafael Ávila de Espíndola
diff --git a/libs.mk b/libs.mk
index f5b43abadc..7cd973bd18 100644
--- a/libs.mk
+++ b/libs.mk
@@ -299,8 +299,8 @@ $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
 # To determine SO_VERSION_{MAJOR,MINOR,PATCH}, calculate c,a,r with current
 # SO_VERSION_* then follow the rules in the link to detemine the new version
 # (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1
-SO_VERSION_MAJOR := 6
-SO_VERSION_MINOR := 4
+SO_VERSION_MAJOR := 7
+SO_VERSION_MINOR := 0
 SO_VERSION_PATCH := 0
 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
 LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib

From 27b8a778bdbaaf803c2a15eb4d96837757480106 Mon Sep 17 00:00:00 2001
From: James Zern
Date: Fri, 8 Oct 2021 16:24:23 -0700
Subject: [PATCH 157/926] vp8_yv12_realloc_frame_buffer: move allocation check

to before the memset used under msan to avoid any spurious reports in
OOM conditions

Change-Id: I0c4ee92829bbcb356e94f503a4615caf891bb49d
---
 vpx_scale/generic/yv12config.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c
index eee291c30d..c52dab0588 100644
--- a/vpx_scale/generic/yv12config.c
+++ b/vpx_scale/generic/yv12config.c
@@ -64,6 +64,10 @@ int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width,

   if (!ybf->buffer_alloc) {
     ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, frame_size);
+    if (!ybf->buffer_alloc) {
+      ybf->buffer_alloc_sz = 0;
+      return -1;
+    }
 #if defined(__has_feature)
 #if __has_feature(memory_sanitizer)
     // This memset is needed for fixing the issue of using uninitialized
@@ -75,7 +79,7 @@ int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width,
     ybf->buffer_alloc_sz = frame_size;
   }

-  if (!ybf->buffer_alloc || ybf->buffer_alloc_sz < frame_size) return -1;
+  if (ybf->buffer_alloc_sz < frame_size) return -1;

   /* Only support allocating buffers that have a border that's a multiple
    * of 32. The border restriction is required to get 16-byte alignment of

From 9039995e94ef5c70e6f3cbf5fe43367da18a0dcc Mon Sep 17 00:00:00 2001
From: James Zern
Date: Sat, 9 Oct 2021 10:33:37 -0700
Subject: [PATCH 158/926] Android.mk: import LICENSE indicators from AOSP

https://android-review.googlesource.com/c/platform/external/libvpx/+/1588942
https://android.googlesource.com/platform/external/libvpx/+/099828b5c770ef8630741721be4b6c25a8394204

Change-Id: Ieca1c882f82bcbc7546944b43af7fab358f925d2
---
 build/make/Android.mk | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/build/make/Android.mk b/build/make/Android.mk
index 6cb3af027b..b8032e67aa 100644
--- a/build/make/Android.mk
+++ b/build/make/Android.mk
@@ -166,6 +166,9 @@ LOCAL_CFLAGS += \
     -I$(ASM_CNV_PATH)/libvpx

 LOCAL_MODULE := libvpx
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_NOTICE_FILE := $(LOCAL_PATH)/../../LICENSE $(LOCAL_PATH)/../../PATENTS

 ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
   LOCAL_STATIC_LIBRARIES := cpufeatures

From e259e6951d794ca6a6f2f3c9c40c5c99818613d3 Mon Sep 17 00:00:00 2001
From: James Zern
Date: Tue, 12 Oct 2021 11:57:39 -0700
Subject: [PATCH 159/926] test/Android.mk: import LICENSE indicators from AOSP

https://android-review.googlesource.com/c/platform/external/libvpx/+/1853628
https://android.googlesource.com/platform/external/libvpx/+/e40f8afb1e51d3bd13d662c1881e3cfb616fa2b8

Change-Id: I15f185ab7c7661f4456c4ad7296fdda01dfb8d53
---
 test/android/Android.mk | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/test/android/Android.mk b/test/android/Android.mk
index 7318de2fc4..87155fcb58 100644
--- a/test/android/Android.mk
+++ b/test/android/Android.mk
@@ -34,6 +34,9 @@ LOCAL_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/
 LOCAL_C_INCLUDES += $(LOCAL_PATH)/third_party/googletest/src/include/
 LOCAL_EXPORT_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/include/
 LOCAL_SRC_FILES := ./third_party/googletest/src/src/gtest-all.cc
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_NOTICE_FILE := $(LOCAL_PATH)/../../LICENSE $(LOCAL_PATH)/../../PATENTS
 include $(BUILD_STATIC_LIBRARY)

 #libvpx_test
@@ -48,6 +51,9 @@ else
   LOCAL_STATIC_LIBRARIES += vpx
 endif

+LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_NOTICE_FILE := $(LOCAL_PATH)/../../LICENSE $(LOCAL_PATH)/../../PATENTS
 include $(LOCAL_PATH)/test/test.mk
 LOCAL_C_INCLUDES := $(BINDINGS_DIR)
 FILTERED_SRC := $(sort $(filter %.cc %.c, $(LIBVPX_TEST_SRCS-yes)))

From 340f60524ffa35c7324c54fe404d84cc1a1ac402 Mon Sep 17 00:00:00 2001
From: James Zern
Date: Tue, 2 Nov 2021 16:29:52 -0700
Subject: [PATCH 160/926] vpx_codec_internal.h: add LIBVPX_FORMAT_PRINTF

and use it to set the format attribute for the printf like function
vpx_internal_error().

this allows the main library to be built with -Wformat-nonliteral
without producing warnings; the examples will be handled in a followup.
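
as a sketch of the checking this enables (hypothetical caller, not part
of this change):

  #include "vpx/internal/vpx_codec_internal.h"

  static void report(struct vpx_internal_error_info *info, int frame) {
    // OK: %d matches the int argument.
    vpx_internal_error(info, VPX_CODEC_ERROR, "corrupt frame %d", frame);
    // with the attribute in place the compiler now warns here: %s does
    // not match the int argument.
    vpx_internal_error(info, VPX_CODEC_ERROR, "corrupt frame %s", frame);
  }
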
Bug: webm:1744 Change-Id: Iebc322e24db35d902c5a2b1ed767d2e10e9c91b9 --- vpx/internal/vpx_codec_internal.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h index 961b0bfe4c..670fe380ed 100644 --- a/vpx/internal/vpx_codec_internal.h +++ b/vpx/internal/vpx_codec_internal.h @@ -435,9 +435,21 @@ struct vpx_internal_error_info { #endif #endif +// Tells the compiler to perform `printf` format string checking if the +// compiler supports it; see the 'format' attribute in +// . +#define LIBVPX_FORMAT_PRINTF(string_index, first_to_check) +#if defined(__has_attribute) +#if __has_attribute(format) +#undef LIBVPX_FORMAT_PRINTF +#define LIBVPX_FORMAT_PRINTF(string_index, first_to_check) \ + __attribute__((__format__(__printf__, string_index, first_to_check))) +#endif +#endif + void vpx_internal_error(struct vpx_internal_error_info *info, - vpx_codec_err_t error, const char *fmt, - ...) CLANG_ANALYZER_NORETURN; + vpx_codec_err_t error, const char *fmt, ...) + LIBVPX_FORMAT_PRINTF(3, 4) CLANG_ANALYZER_NORETURN; #ifdef __cplusplus } // extern "C" From dd10ac8f69c1bc77fc69cd10de51092d07fbebb5 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 2 Nov 2021 17:19:10 -0700 Subject: [PATCH 161/926] tools_common.h: add VPX_TOOLS_FORMAT_PRINTF and use it to set the format attribute for printf like functions. this allows the examples to be built with -Wformat-nonliteral without producing warnings. Bug: webm:1744 Change-Id: I26b4c41c9a42790053b1ae0e4a678af8f2cd1d82 Fixed: webm:1744 --- configure | 1 + examples/svc_encodeframe.c | 17 +++++++----- examples/twopass_encoder.c | 2 +- examples/vp8_multi_resolution_encoder.c | 6 ++--- examples/vpx_temporal_svc_encoder.c | 4 +-- rate_hist.c | 35 +++++++++++++------------ tools_common.h | 18 ++++++++++--- vpxenc.c | 12 +++++---- vpxstats.c | 2 +- warnings.c | 2 +- 10 files changed, 59 insertions(+), 40 deletions(-) diff --git a/configure b/configure index da631a45e1..e3babbe824 100755 --- a/configure +++ b/configure @@ -621,6 +621,7 @@ process_toolchain() { check_add_cflags -Wdeclaration-after-statement check_add_cflags -Wdisabled-optimization check_add_cflags -Wfloat-conversion + check_add_cflags -Wformat=2 check_add_cflags -Wparentheses-equality check_add_cflags -Wpointer-arith check_add_cflags -Wtype-limits diff --git a/examples/svc_encodeframe.c b/examples/svc_encodeframe.c index a73ee8ed66..08bda0e5c9 100644 --- a/examples/svc_encodeframe.c +++ b/examples/svc_encodeframe.c @@ -21,6 +21,7 @@ #include #include #define VPX_DISABLE_CTRL_TYPECHECKS 1 +#include "../tools_common.h" #include "./vpx_config.h" #include "./svc_context.h" #include "vpx/vp8cx.h" @@ -95,8 +96,9 @@ static const SvcInternal_t *get_const_svc_internal(const SvcContext *svc_ctx) { return (const SvcInternal_t *)svc_ctx->internal; } -static int svc_log(SvcContext *svc_ctx, SVC_LOG_LEVEL level, const char *fmt, - ...) { +static VPX_TOOLS_FORMAT_PRINTF(3, 4) int svc_log(SvcContext *svc_ctx, + SVC_LOG_LEVEL level, + const char *fmt, ...) 
{ char buf[512]; int retval = 0; va_list ap; @@ -264,7 +266,7 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) { if (alt_ref_enabled > REF_FRAMES - svc_ctx->spatial_layers) { svc_log(svc_ctx, SVC_LOG_ERROR, "svc: auto alt ref: Maxinum %d(REF_FRAMES - layers) layers could" - "enabled auto alt reference frame, but % layers are enabled\n", + "enabled auto alt reference frame, but %d layers are enabled\n", REF_FRAMES - svc_ctx->spatial_layers, alt_ref_enabled); res = VPX_CODEC_INVALID_PARAM; } @@ -456,10 +458,11 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, svc_ctx->temporal_layers = VPX_TS_MAX_LAYERS; if (svc_ctx->temporal_layers * svc_ctx->spatial_layers > VPX_MAX_LAYERS) { - svc_log(svc_ctx, SVC_LOG_ERROR, - "spatial layers * temporal layers exceeds the maximum number of " - "allowed layers of %d\n", - svc_ctx->spatial_layers * svc_ctx->temporal_layers, VPX_MAX_LAYERS); + svc_log( + svc_ctx, SVC_LOG_ERROR, + "spatial layers * temporal layers (%d) exceeds the maximum number of " + "allowed layers of %d\n", + svc_ctx->spatial_layers * svc_ctx->temporal_layers, VPX_MAX_LAYERS); return VPX_CODEC_INVALID_PARAM; } res = assign_layer_bitrates(svc_ctx, enc_cfg); diff --git a/examples/twopass_encoder.c b/examples/twopass_encoder.c index 3d950b2c4b..07ba37dfd0 100644 --- a/examples/twopass_encoder.c +++ b/examples/twopass_encoder.c @@ -221,7 +221,7 @@ int main(int argc, char **argv) { die("Invalid frame size: %dx%d", w, h); if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, w, h, 1)) - die("Failed to allocate image", w, h); + die("Failed to allocate image (%dx%d)", w, h); printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface())); diff --git a/examples/vp8_multi_resolution_encoder.c b/examples/vp8_multi_resolution_encoder.c index e72f8a0197..62d96de557 100644 --- a/examples/vp8_multi_resolution_encoder.c +++ b/examples/vp8_multi_resolution_encoder.c @@ -352,7 +352,7 @@ int main(int argc, char **argv) { framerate = (int)strtol(argv[3], NULL, 0); if (width < 16 || width % 2 || height < 16 || height % 2) - die("Invalid resolution: %ldx%ld", width, height); + die("Invalid resolution: %dx%d", width, height); /* Open input video file for encoding */ if (!(infile = fopen(argv[4], "rb"))) @@ -380,7 +380,7 @@ int main(int argc, char **argv) { (int)strtol(argv[2 * NUM_ENCODERS + 5 + i], NULL, 0); if (num_temporal_layers[i] < 1 || num_temporal_layers[i] > 3) die("Invalid temporal layers: %d, Must be 1, 2, or 3. \n", - num_temporal_layers); + num_temporal_layers[i]); } /* Open file to write out each spatially downsampled input stream. */ @@ -468,7 +468,7 @@ int main(int argc, char **argv) { /* Allocate image for each encoder */ for (i = 0; i < NUM_ENCODERS; i++) if (!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32)) - die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h); + die("Failed to allocate image (%dx%d)", cfg[i].g_w, cfg[i].g_h); if (raw[0].stride[VPX_PLANE_Y] == (int)raw[0].d_w) read_frame_p = mulres_read_frame; diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c index ad3e79c713..47f30751eb 100644 --- a/examples/vpx_temporal_svc_encoder.c +++ b/examples/vpx_temporal_svc_encoder.c @@ -687,14 +687,14 @@ int main(int argc, char **argv) { &raw, bit_depth == VPX_BITS_8 ? 
VPX_IMG_FMT_I420 : VPX_IMG_FMT_I42016, width, height, 32)) { - die("Failed to allocate image", width, height); + die("Failed to allocate image (%dx%d)", width, height); } } #else // Y4M reader has its own allocation. if (input_ctx.file_type != FILE_TYPE_Y4M) { if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, width, height, 32)) { - die("Failed to allocate image", width, height); + die("Failed to allocate image (%dx%d)", width, height); } } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/rate_hist.c b/rate_hist.c index 6cf8ce7bb0..d10e754fee 100644 --- a/rate_hist.c +++ b/rate_hist.c @@ -193,7 +193,7 @@ static int merge_hist_buckets(struct hist_bucket *bucket, int max_buckets, static void show_histogram(const struct hist_bucket *bucket, int buckets, int total, int scale) { - const char *pat1, *pat2; + int width1, width2; int i; assert(bucket != NULL); @@ -201,32 +201,32 @@ static void show_histogram(const struct hist_bucket *bucket, int buckets, switch ((int)(log(bucket[buckets - 1].high) / log(10)) + 1) { case 1: case 2: - pat1 = "%4d %2s: "; - pat2 = "%4d-%2d: "; + width1 = 4; + width2 = 2; break; case 3: - pat1 = "%5d %3s: "; - pat2 = "%5d-%3d: "; + width1 = 5; + width2 = 3; break; case 4: - pat1 = "%6d %4s: "; - pat2 = "%6d-%4d: "; + width1 = 6; + width2 = 4; break; case 5: - pat1 = "%7d %5s: "; - pat2 = "%7d-%5d: "; + width1 = 7; + width2 = 5; break; case 6: - pat1 = "%8d %6s: "; - pat2 = "%8d-%6d: "; + width1 = 8; + width2 = 6; break; case 7: - pat1 = "%9d %7s: "; - pat2 = "%9d-%7d: "; + width1 = 9; + width2 = 7; break; default: - pat1 = "%12d %10s: "; - pat2 = "%12d-%10d: "; + width1 = 12; + width2 = 10; break; } @@ -241,9 +241,10 @@ static void show_histogram(const struct hist_bucket *bucket, int buckets, assert(len <= HIST_BAR_MAX); if (bucket[i].low == bucket[i].high) - fprintf(stderr, pat1, bucket[i].low, ""); + fprintf(stderr, "%*d %*s: ", width1, bucket[i].low, width2, ""); else - fprintf(stderr, pat2, bucket[i].low, bucket[i].high); + fprintf(stderr, "%*d-%*d: ", width1, bucket[i].low, width2, + bucket[i].high); for (j = 0; j < HIST_BAR_MAX; j++) fprintf(stderr, j < len ? "=" : " "); fprintf(stderr, "\t%5d (%6.2f%%)\n", bucket[i].count, pct); diff --git a/tools_common.h b/tools_common.h index 4e8851fc15..b9cfb9cc85 100644 --- a/tools_common.h +++ b/tools_common.h @@ -116,12 +116,24 @@ extern "C" { #define VPX_NO_RETURN #endif +// Tells the compiler to perform `printf` format string checking if the +// compiler supports it; see the 'format' attribute in +// . +#define VPX_TOOLS_FORMAT_PRINTF(string_index, first_to_check) +#if defined(__has_attribute) +#if __has_attribute(format) +#undef VPX_TOOLS_FORMAT_PRINTF +#define VPX_TOOLS_FORMAT_PRINTF(string_index, first_to_check) \ + __attribute__((__format__(__printf__, string_index, first_to_check))) +#endif +#endif + /* Sets a stdio stream into binary mode */ FILE *set_binary_mode(FILE *stream); -VPX_NO_RETURN void die(const char *fmt, ...); -VPX_NO_RETURN void fatal(const char *fmt, ...); -void warn(const char *fmt, ...); +VPX_NO_RETURN void die(const char *fmt, ...) VPX_TOOLS_FORMAT_PRINTF(1, 2); +VPX_NO_RETURN void fatal(const char *fmt, ...) VPX_TOOLS_FORMAT_PRINTF(1, 2); +void warn(const char *fmt, ...) 
VPX_TOOLS_FORMAT_PRINTF(1, 2); VPX_NO_RETURN void die_codec(vpx_codec_ctx_t *ctx, const char *s); diff --git a/vpxenc.c b/vpxenc.c index a0122ef804..b64b6cf441 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -58,8 +58,8 @@ static size_t wrap_fwrite(const void *ptr, size_t size, size_t nmemb, static const char *exec_name; -static void warn_or_exit_on_errorv(vpx_codec_ctx_t *ctx, int fatal, - const char *s, va_list ap) { +static VPX_TOOLS_FORMAT_PRINTF(3, 0) void warn_or_exit_on_errorv( + vpx_codec_ctx_t *ctx, int fatal, const char *s, va_list ap) { if (ctx->err) { const char *detail = vpx_codec_error_detail(ctx); @@ -72,7 +72,9 @@ static void warn_or_exit_on_errorv(vpx_codec_ctx_t *ctx, int fatal, } } -static void ctx_exit_on_error(vpx_codec_ctx_t *ctx, const char *s, ...) { +static VPX_TOOLS_FORMAT_PRINTF(2, + 3) void ctx_exit_on_error(vpx_codec_ctx_t *ctx, + const char *s, ...) { va_list ap; va_start(ap, s); @@ -80,8 +82,8 @@ static void ctx_exit_on_error(vpx_codec_ctx_t *ctx, const char *s, ...) { va_end(ap); } -static void warn_or_exit_on_error(vpx_codec_ctx_t *ctx, int fatal, - const char *s, ...) { +static VPX_TOOLS_FORMAT_PRINTF(3, 4) void warn_or_exit_on_error( + vpx_codec_ctx_t *ctx, int fatal, const char *s, ...) { va_list ap; va_start(ap, s); diff --git a/vpxstats.c b/vpxstats.c index 142e367bb4..c0dd14e450 100644 --- a/vpxstats.c +++ b/vpxstats.c @@ -41,7 +41,7 @@ int stats_open_file(stats_io_t *stats, const char *fpf, int pass) { stats->buf.buf = malloc(stats->buf_alloc_sz); if (!stats->buf.buf) - fatal("Failed to allocate first-pass stats buffer (%lu bytes)", + fatal("Failed to allocate first-pass stats buffer (%u bytes)", (unsigned int)stats->buf_alloc_sz); nbytes = fread(stats->buf.buf, 1, stats->buf.sz, stats->file); diff --git a/warnings.c b/warnings.c index a80da527f7..3e6e702536 100644 --- a/warnings.c +++ b/warnings.c @@ -98,7 +98,7 @@ void check_encoder_config(int disable_prompt, /* Count and print warnings. */ for (warning = warning_list.warning_node; warning != NULL; warning = warning->next_warning, ++num_warnings) { - warn(warning->warning_string); + warn("%s", warning->warning_string); } free_warning_list(&warning_list); From f3b95b1f56ed0b8d3fce8b998f431341f5d8c680 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 3 Nov 2021 16:23:06 -0700 Subject: [PATCH 162/926] update tools/cpplint.py https://github.com/google/styleguide.git 100755 blob 4a82bde4f95cef8103520bc2c019483397ec51f4 cpplint/cpplint.py Bug: aomedia:3178 Change-Id: I9e11d647096fc2082b18d74731026dabb52639bb --- tools/cpplint.py | 3442 +++++++++++++++++++++++++++++++++------------- 1 file changed, 2465 insertions(+), 977 deletions(-) diff --git a/tools/cpplint.py b/tools/cpplint.py index 25fbef73d8..e3ebde2f5a 100755 --- a/tools/cpplint.py +++ b/tools/cpplint.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # # Copyright (c) 2009 Google Inc. All rights reserved. # @@ -51,16 +51,23 @@ import string import sys import unicodedata +import sysconfig + +try: + xrange # Python 2 +except NameError: + xrange = range # Python 3 _USAGE = """ Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...] [--counting=total|toplevel|detailed] [--root=subdir] - [--linelength=digits] + [--linelength=digits] [--headers=x,y,...] + [--quiet] [file] ... 
The style guidelines this tries to follow are those in - http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml + https://google-styleguide.googlecode.com/svn/trunk/cppguide.xml Every problem is given a confidence score from 1-5, with 5 meaning we are certain of the problem, and 1 meaning it could be a legitimate construct. @@ -83,6 +90,9 @@ verbose=# Specify a number 0-5 to restrict errors to certain verbosity levels. + quiet + Don't print anything if no errors are found. + filter=-x,+y,... Specify a comma-separated list of category-filters to apply: only error messages whose category names pass the filters will be printed. @@ -114,12 +124,13 @@ ignored. Examples: - Assuing that src/.git exists, the header guard CPP variables for - src/chrome/browser/ui/browser.h are: + Assuming that top/src/.git exists (and cwd=top/src), the header guard + CPP variables for top/src/chrome/browser/ui/browser.h are: No flag => CHROME_BROWSER_UI_BROWSER_H_ --root=chrome => BROWSER_UI_BROWSER_H_ --root=chrome/browser => UI_BROWSER_H_ + --root=.. => SRC_CHROME_BROWSER_UI_BROWSER_H_ linelength=digits This is the allowed line length for the project. The default value is @@ -133,6 +144,57 @@ Examples: --extensions=hpp,cpp + + headers=x,y,... + The header extensions that cpplint will treat as .h in checks. Values are + automatically added to --extensions list. + + Examples: + --headers=hpp,hxx + --headers=hpp + + cpplint.py supports per-directory configurations specified in CPPLINT.cfg + files. CPPLINT.cfg file can contain a number of key=value pairs. + Currently the following options are supported: + + set noparent + filter=+filter1,-filter2,... + exclude_files=regex + linelength=80 + root=subdir + headers=x,y,... + + "set noparent" option prevents cpplint from traversing directory tree + upwards looking for more .cfg files in parent directories. This option + is usually placed in the top-level project directory. + + The "filter" option is similar in function to --filter flag. It specifies + message filters in addition to the |_DEFAULT_FILTERS| and those specified + through --filter command-line flag. + + "exclude_files" allows to specify a regular expression to be matched against + a file name. If the expression matches, the file is skipped and not run + through liner. + + "linelength" allows to specify the allowed line length for the project. + + The "root" option is similar in function to the --root flag (see example + above). Paths are relative to the directory of the CPPLINT.cfg. + + The "headers" option is similar in function to the --headers flag + (see example above). + + CPPLINT.cfg has an effect on files in the same directory and all + sub-directories, unless overridden by a nested configuration file. + + Example file: + filter=-build/include_order,+build/include_alpha + exclude_files=.*\.cc + + The above example disables build/include_order warning and enables + build/include_alpha as well as excludes all .cc from being + processed by linter, in the current directory (where the .cfg + file is located) and all sub-directories. """ # We categorize each error message we print. Here are the categories. @@ -140,81 +202,101 @@ # If you add a new error message with a new category, add it to the list # here! cpplint_unittest.py should tell you if you forget to do this. 
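# Each category is a 'topic/subtopic' string; the --filter flag and NOLINT
# comments are matched against these names.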
_ERROR_CATEGORIES = [ - 'build/class', - 'build/deprecated', - 'build/endif_comment', - 'build/explicit_make_pair', - 'build/forward_decl', - 'build/header_guard', - 'build/include', - 'build/include_alpha', - 'build/include_order', - 'build/include_what_you_use', - 'build/namespaces', - 'build/printf_format', - 'build/storage_class', - 'legal/copyright', - 'readability/alt_tokens', - 'readability/braces', - 'readability/casting', - 'readability/check', - 'readability/constructors', - 'readability/fn_size', - 'readability/function', - 'readability/multiline_comment', - 'readability/multiline_string', - 'readability/namespace', - 'readability/nolint', - 'readability/nul', - 'readability/streams', - 'readability/todo', - 'readability/utf8', - 'runtime/arrays', - 'runtime/casting', - 'runtime/explicit', - 'runtime/int', - 'runtime/init', - 'runtime/invalid_increment', - 'runtime/member_string_references', - 'runtime/memset', - 'runtime/operator', - 'runtime/printf', - 'runtime/printf_format', - 'runtime/references', - 'runtime/sizeof', - 'runtime/string', - 'runtime/threadsafe_fn', - 'runtime/vlog', - 'whitespace/blank_line', - 'whitespace/braces', - 'whitespace/comma', - 'whitespace/comments', - 'whitespace/empty_conditional_body', - 'whitespace/empty_loop_body', - 'whitespace/end_of_line', - 'whitespace/ending_newline', - 'whitespace/forcolon', - 'whitespace/indent', - 'whitespace/line_length', - 'whitespace/newline', - 'whitespace/operators', - 'whitespace/parens', - 'whitespace/semicolon', - 'whitespace/tab', - 'whitespace/todo' - ] - -# The default state of the category filter. This is overrided by the --filter= + 'build/class', + 'build/c++11', + 'build/c++14', + 'build/c++tr1', + 'build/deprecated', + 'build/endif_comment', + 'build/explicit_make_pair', + 'build/forward_decl', + 'build/header_guard', + 'build/include', + 'build/include_alpha', + 'build/include_order', + 'build/include_what_you_use', + 'build/namespaces', + 'build/printf_format', + 'build/storage_class', + 'legal/copyright', + 'readability/alt_tokens', + 'readability/braces', + 'readability/casting', + 'readability/check', + 'readability/constructors', + 'readability/fn_size', + 'readability/inheritance', + 'readability/multiline_comment', + 'readability/multiline_string', + 'readability/namespace', + 'readability/nolint', + 'readability/nul', + 'readability/strings', + 'readability/todo', + 'readability/utf8', + 'runtime/arrays', + 'runtime/casting', + 'runtime/explicit', + 'runtime/int', + 'runtime/init', + 'runtime/invalid_increment', + 'runtime/member_string_references', + 'runtime/memset', + 'runtime/indentation_namespace', + 'runtime/operator', + 'runtime/printf', + 'runtime/printf_format', + 'runtime/references', + 'runtime/string', + 'runtime/threadsafe_fn', + 'runtime/vlog', + 'whitespace/blank_line', + 'whitespace/braces', + 'whitespace/comma', + 'whitespace/comments', + 'whitespace/empty_conditional_body', + 'whitespace/empty_if_body', + 'whitespace/empty_loop_body', + 'whitespace/end_of_line', + 'whitespace/ending_newline', + 'whitespace/forcolon', + 'whitespace/indent', + 'whitespace/line_length', + 'whitespace/newline', + 'whitespace/operators', + 'whitespace/parens', + 'whitespace/semicolon', + 'whitespace/tab', + 'whitespace/todo', + ] + +# These error categories are no longer enforced by cpplint, but for backwards- +# compatibility they may still appear in NOLINT comments. +_LEGACY_ERROR_CATEGORIES = [ + 'readability/streams', + 'readability/function', + ] + +# The default state of the category filter. 
This is overridden by the --filter= # flag. By default all errors are on, so only add here categories that should be # off by default (i.e., categories that must be enabled by the --filter= flags). # All entries here should start with a '-' or '+', as in the --filter= flag. _DEFAULT_FILTERS = ['-build/include_alpha'] +# The default list of categories suppressed for C (not C++) files. +_DEFAULT_C_SUPPRESSED_CATEGORIES = [ + 'readability/casting', + ] + +# The default list of categories suppressed for Linux Kernel files. +_DEFAULT_KERNEL_SUPPRESSED_CATEGORIES = [ + 'whitespace/tab', + ] + # We used to check for high-bit characters, but after much discussion we # decided those were OK, as long as they were in UTF-8 and didn't represent # hard-coded international strings, which belong in a separate i18n file. - # C++ headers _CPP_HEADERS = frozenset([ # Legacy @@ -304,6 +386,7 @@ 'random', 'ratio', 'regex', + 'scoped_allocator', 'set', 'sstream', 'stack', @@ -351,15 +434,40 @@ 'cwctype', ]) +# Type names +_TYPES = re.compile( + r'^(?:' + # [dcl.type.simple] + r'(char(16_t|32_t)?)|wchar_t|' + r'bool|short|int|long|signed|unsigned|float|double|' + # [support.types] + r'(ptrdiff_t|size_t|max_align_t|nullptr_t)|' + # [cstdint.syn] + r'(u?int(_fast|_least)?(8|16|32|64)_t)|' + r'(u?int(max|ptr)_t)|' + r')$') + + +# These headers are excluded from [build/include] and [build/include_order] +# checks: +# - Anything not following google file name conventions (containing an +# uppercase character, such as Python.h or nsStringAPI.h, for example). +# - Lua headers. +_THIRD_PARTY_HEADERS_PATTERN = re.compile( + r'^(?:[^/]*[A-Z][^/]*\.h|lua\.h|lauxlib\.h|lualib\.h)$') + +# Pattern for matching FileInfo.BaseName() against test file name +_TEST_FILE_SUFFIX = r'(_test|_unittest|_regtest)$' + +# Pattern that matches only complete whitespace, possibly across multiple lines. +_EMPTY_CONDITIONAL_BODY_PATTERN = re.compile(r'^\s*$', re.DOTALL) + # Assertion macros. These are defined in base/logging.h and -# testing/base/gunit.h. Note that the _M versions need to come first -# for substring matching to work. +# testing/base/public/gunit.h. _CHECK_MACROS = [ 'DCHECK', 'CHECK', - 'EXPECT_TRUE_M', 'EXPECT_TRUE', - 'ASSERT_TRUE_M', 'ASSERT_TRUE', - 'EXPECT_FALSE_M', 'EXPECT_FALSE', - 'ASSERT_FALSE_M', 'ASSERT_FALSE', + 'EXPECT_TRUE', 'ASSERT_TRUE', + 'EXPECT_FALSE', 'ASSERT_FALSE', ] # Replacement macros for CHECK/DCHECK/EXPECT_TRUE/EXPECT_FALSE @@ -372,16 +480,12 @@ _CHECK_REPLACEMENT['CHECK'][op] = 'CHECK_%s' % replacement _CHECK_REPLACEMENT['EXPECT_TRUE'][op] = 'EXPECT_%s' % replacement _CHECK_REPLACEMENT['ASSERT_TRUE'][op] = 'ASSERT_%s' % replacement - _CHECK_REPLACEMENT['EXPECT_TRUE_M'][op] = 'EXPECT_%s_M' % replacement - _CHECK_REPLACEMENT['ASSERT_TRUE_M'][op] = 'ASSERT_%s_M' % replacement for op, inv_replacement in [('==', 'NE'), ('!=', 'EQ'), ('>=', 'LT'), ('>', 'LE'), ('<=', 'GT'), ('<', 'GE')]: _CHECK_REPLACEMENT['EXPECT_FALSE'][op] = 'EXPECT_%s' % inv_replacement _CHECK_REPLACEMENT['ASSERT_FALSE'][op] = 'ASSERT_%s' % inv_replacement - _CHECK_REPLACEMENT['EXPECT_FALSE_M'][op] = 'EXPECT_%s_M' % inv_replacement - _CHECK_REPLACEMENT['ASSERT_FALSE_M'][op] = 'ASSERT_%s_M' % inv_replacement # Alternative tokens and their replacements. For full list, see section 2.5 # Alternative tokens [lex.digraph] in the C++ standard. @@ -430,11 +534,14 @@ r'(?:\s+(volatile|__volatile__))?' r'\s*[{(]') +# Match strings that indicate we're working on a C (not C++) file. 
+_SEARCH_C_FILE = re.compile(r'\b(?:LINT_C_FILE|' + r'vim?:\s*.*(\s*|:)filetype=c(\s*|:|$))') -_regexp_compile_cache = {} +# Match string that indicates we're working on a Linux Kernel file. +_SEARCH_KERNEL_FILE = re.compile(r'\b(?:LINT_KERNEL_FILE)') -# Finds occurrences of NOLINT or NOLINT(...). -_RE_SUPPRESSION = re.compile(r'\bNOLINT\b(\([^)]*\))?') +_regexp_compile_cache = {} # {str, set(int)}: a map from error categories to sets of linenumbers # on which those errors are expected and should be suppressed. @@ -443,6 +550,7 @@ # The root directory used for deriving header guard CPP variable. # This is set by --root flag. _root = None +_root_debug = False # The allowed line length of files. # This is set by --linelength flag. @@ -452,8 +560,28 @@ # This is set by --extensions flag. _valid_extensions = set(['cc', 'h', 'cpp', 'cu', 'cuh']) +# Treat all headers starting with 'h' equally: .h, .hpp, .hxx etc. +# This is set by --headers flag. +_hpp_headers = set(['h']) + +# {str, bool}: a map from error categories to booleans which indicate if the +# category should be suppressed for every line. +_global_error_suppressions = {} + +def ProcessHppHeadersOption(val): + global _hpp_headers + try: + _hpp_headers = set(val.split(',')) + # Automatically append to extensions list so it does not have to be set 2 times + _valid_extensions.update(_hpp_headers) + except ValueError: + PrintUsage('Header extensions must be comma separated list.') + +def IsHeaderExtension(file_extension): + return file_extension in _hpp_headers + def ParseNolintSuppressions(filename, raw_line, linenum, error): - """Updates the global list of error-suppressions. + """Updates the global list of line error-suppressions. Parses any NOLINT comments on the current line, updating the global error_suppressions store. Reports an error if the NOLINT comment @@ -465,42 +593,67 @@ def ParseNolintSuppressions(filename, raw_line, linenum, error): linenum: int, the number of the current line. error: function, an error handler. """ - # FIXME(adonovan): "NOLINT(" is misparsed as NOLINT(*). - matched = _RE_SUPPRESSION.search(raw_line) + matched = Search(r'\bNOLINT(NEXTLINE)?\b(\([^)]+\))?', raw_line) if matched: - category = matched.group(1) + if matched.group(1): + suppressed_line = linenum + 1 + else: + suppressed_line = linenum + category = matched.group(2) if category in (None, '(*)'): # => "suppress all" - _error_suppressions.setdefault(None, set()).add(linenum) + _error_suppressions.setdefault(None, set()).add(suppressed_line) else: if category.startswith('(') and category.endswith(')'): category = category[1:-1] if category in _ERROR_CATEGORIES: - _error_suppressions.setdefault(category, set()).add(linenum) - else: + _error_suppressions.setdefault(category, set()).add(suppressed_line) + elif category not in _LEGACY_ERROR_CATEGORIES: error(filename, linenum, 'readability/nolint', 5, 'Unknown NOLINT error category: %s' % category) +def ProcessGlobalSuppresions(lines): + """Updates the list of global error suppressions. + + Parses any lint directives in the file that have global effect. + + Args: + lines: An array of strings, each representing a line of the file, with the + last element being empty if the file is terminated with a newline. 
+ """ + for line in lines: + if _SEARCH_C_FILE.search(line): + for category in _DEFAULT_C_SUPPRESSED_CATEGORIES: + _global_error_suppressions[category] = True + if _SEARCH_KERNEL_FILE.search(line): + for category in _DEFAULT_KERNEL_SUPPRESSED_CATEGORIES: + _global_error_suppressions[category] = True + + def ResetNolintSuppressions(): - "Resets the set of NOLINT suppressions to empty." + """Resets the set of NOLINT suppressions to empty.""" _error_suppressions.clear() + _global_error_suppressions.clear() def IsErrorSuppressedByNolint(category, linenum): """Returns true if the specified error category is suppressed on this line. Consults the global error_suppressions map populated by - ParseNolintSuppressions/ResetNolintSuppressions. + ParseNolintSuppressions/ProcessGlobalSuppresions/ResetNolintSuppressions. Args: category: str, the category of the error. linenum: int, the current line number. Returns: - bool, True iff the error should be suppressed due to a NOLINT comment. + bool, True iff the error should be suppressed due to a NOLINT comment or + global suppression. """ - return (linenum in _error_suppressions.get(category, set()) or + return (_global_error_suppressions.get(category, False) or + linenum in _error_suppressions.get(category, set()) or linenum in _error_suppressions.get(None, set())) + def Match(pattern, s): """Matches the string with the pattern, caching the compiled regexp.""" # The regexp compilation caching is inlined in both Match and Search for @@ -536,11 +689,17 @@ def Search(pattern, s): return _regexp_compile_cache[pattern].search(s) -class _IncludeState(dict): +def _IsSourceExtension(s): + """File extension (excluding dot) matches a source file extension.""" + return s in ('c', 'cc', 'cpp', 'cxx') + + +class _IncludeState(object): """Tracks line numbers for includes, and the order in which includes appear. - As a dict, an _IncludeState object serves as a mapping between include - filename and line number on which that file was included. + include_list contains list of lists of (header, line number) pairs. + It's a lists of lists rather than just one flat list to make it + easier to update across preprocessor boundaries. Call CheckNextIncludeOrder() once for each header in the file, passing in the type constants defined above. Calls in an illegal order will @@ -571,15 +730,42 @@ class _IncludeState(dict): } def __init__(self): - dict.__init__(self) - self.ResetSection() + self.include_list = [[]] + self.ResetSection('') + + def FindHeader(self, header): + """Check if a header has already been included. + + Args: + header: header to check. + Returns: + Line number of previous occurrence, or -1 if the header has not + been seen before. + """ + for section_list in self.include_list: + for f in section_list: + if f[0] == header: + return f[1] + return -1 + + def ResetSection(self, directive): + """Reset section checking for preprocessor directive. - def ResetSection(self): + Args: + directive: preprocessor directive (e.g. "if", "else"). + """ # The name of the current section. self._section = self._INITIAL_SECTION # The path of last found header. self._last_header = '' + # Update list of includes. Note that we never pop from the + # include list. 
+ if directive in ('if', 'ifdef', 'ifndef'): + self.include_list.append([]) + elif directive in ('else', 'elif'): + self.include_list[-1] = [] + def SetLastHeader(self, header_path): self._last_header = header_path @@ -615,7 +801,7 @@ def IsInAlphabeticalOrder(self, clean_lines, linenum, header_path): # If previous line was a blank line, assume that the headers are # intentionally sorted the way they are. if (self._last_header > header_path and - not Match(r'^\s*$', clean_lines.elided[linenum - 1])): + Match(r'^\s*#\s*include\b', clean_lines.elided[linenum - 1])): return False return True @@ -681,8 +867,11 @@ def __init__(self): self.error_count = 0 # global count of reported errors # filters to apply when emitting error messages self.filters = _DEFAULT_FILTERS[:] + # backup of filter list. Used to restore the state after each file. + self._filters_backup = self.filters[:] self.counting = 'total' # In what way are we counting errors? self.errors_by_category = {} # string to int dict storing error counts + self.quiet = False # Suppress non-error messagess? # output format: # "emacs" - format that emacs can parse (default) @@ -693,6 +882,12 @@ def SetOutputFormat(self, output_format): """Sets the output format for errors.""" self.output_format = output_format + def SetQuiet(self, quiet): + """Sets the module's quiet settings, and returns the previous setting.""" + last_quiet = self.quiet + self.quiet = quiet + return last_quiet + def SetVerboseLevel(self, level): """Sets the module's verbosity, and returns the previous setting.""" last_verbose_level = self.verbose_level @@ -719,6 +914,10 @@ def SetFilters(self, filters): """ # Default filters always have less priority than the flag ones. self.filters = _DEFAULT_FILTERS[:] + self.AddFilters(filters) + + def AddFilters(self, filters): + """ Adds more filters to the existing list of error-message filters. """ for filt in filters.split(','): clean_filt = filt.strip() if clean_filt: @@ -728,6 +927,14 @@ def SetFilters(self, filters): raise ValueError('Every filter in --filters must start with + or -' ' (%s does not)' % filt) + def BackupFilters(self): + """ Saves the current filter list to backup storage.""" + self._filters_backup = self.filters[:] + + def RestoreFilters(self): + """ Restores filters previously backed up.""" + self.filters = self._filters_backup[:] + def ResetErrorCounts(self): """Sets the module's error statistic back to zero.""" self.error_count = 0 @@ -748,7 +955,7 @@ def PrintErrorCounts(self): for category, count in self.errors_by_category.iteritems(): sys.stderr.write('Category \'%s\' errors found: %d\n' % (category, count)) - sys.stderr.write('Total errors found: %d\n' % self.error_count) + sys.stdout.write('Total errors found: %d\n' % self.error_count) _cpplint_state = _CppLintState() @@ -762,6 +969,14 @@ def _SetOutputFormat(output_format): """Sets the module's output format.""" _cpplint_state.SetOutputFormat(output_format) +def _Quiet(): + """Return's the module's quiet setting.""" + return _cpplint_state.quiet + +def _SetQuiet(quiet): + """Set the module's quiet status, and return previous setting.""" + return _cpplint_state.SetQuiet(quiet) + def _VerboseLevel(): """Returns the module's verbosity setting.""" @@ -795,6 +1010,25 @@ def _SetFilters(filters): """ _cpplint_state.SetFilters(filters) +def _AddFilters(filters): + """Adds more filter overrides. + + Unlike _SetFilters, this function does not reset the current list of filters + available. 
+ + Args: + filters: A string of comma-separated filters (eg "whitespace/indent"). + Each filter should start with + or -; else we die. + """ + _cpplint_state.AddFilters(filters) + +def _BackupFilters(): + """ Saves the current filter list to backup storage.""" + _cpplint_state.BackupFilters() + +def _RestoreFilters(): + """ Restores filters previously backed up.""" + _cpplint_state.RestoreFilters() class _FunctionState(object): """Tracks current function name and the number of lines in its body.""" @@ -830,6 +1064,9 @@ def Check(self, error, filename, linenum): filename: The name of the current file. linenum: The number of the line to check. """ + if not self.in_a_function: + return + if Match(r'T(EST|est)', self.current_function): base_trigger = self._TEST_TRIGGER else: @@ -857,7 +1094,7 @@ class _IncludeError(Exception): pass -class FileInfo: +class FileInfo(object): """Provides utility functions for filenames. FileInfo provides easy access to the components of a file's path @@ -900,12 +1137,13 @@ def RepositoryName(self): # Not SVN <= 1.6? Try to find a git, hg, or svn top level directory by # searching up from the current path. - root_dir = os.path.dirname(fullname) - while (root_dir != os.path.dirname(root_dir) and - not os.path.exists(os.path.join(root_dir, ".git")) and - not os.path.exists(os.path.join(root_dir, ".hg")) and - not os.path.exists(os.path.join(root_dir, ".svn"))): - root_dir = os.path.dirname(root_dir) + root_dir = current_dir = os.path.dirname(fullname) + while current_dir != os.path.dirname(current_dir): + if (os.path.exists(os.path.join(current_dir, ".git")) or + os.path.exists(os.path.join(current_dir, ".hg")) or + os.path.exists(os.path.join(current_dir, ".svn"))): + root_dir = current_dir + current_dir = os.path.dirname(current_dir) if (os.path.exists(os.path.join(root_dir, ".git")) or os.path.exists(os.path.join(root_dir, ".hg")) or @@ -944,7 +1182,7 @@ def NoExtension(self): def IsSource(self): """File has a source file extension.""" - return self.Extension()[1:] in ('c', 'cc', 'cpp', 'cxx') + return _IsSourceExtension(self.Extension()[1:]) def _ShouldPrintError(category, confidence, linenum): @@ -955,6 +1193,7 @@ def _ShouldPrintError(category, confidence, linenum): # the verbosity level isn't high enough, or the filters filter it out. if IsErrorSuppressedByNolint(category, linenum): return False + if confidence < _cpplint_state.verbose_level: return False @@ -999,8 +1238,8 @@ def Error(filename, linenum, category, confidence, message): if _ShouldPrintError(category, confidence, linenum): _cpplint_state.IncrementErrorCount(category) if _cpplint_state.output_format == 'vs7': - sys.stderr.write('%s(%s): %s [%s] [%d]\n' % ( - filename, linenum, message, category, confidence)) + sys.stderr.write('%s(%s): error cpplint: [%s] %s [%d]\n' % ( + filename, linenum, category, message, confidence)) elif _cpplint_state.output_format == 'eclipse': sys.stderr.write('%s:%s: warning: %s [%s] [%d]\n' % ( filename, linenum, message, category, confidence)) @@ -1012,11 +1251,9 @@ def Error(filename, linenum, category, confidence, message): # Matches standard C++ escape sequences per 2.13.2.3 of the C++ standard. _RE_PATTERN_CLEANSE_LINE_ESCAPES = re.compile( r'\\([abfnrtv?"\\\']|\d+|x[0-9a-fA-F]+)') -# Matches strings. Escape codes should already be removed by ESCAPES. -_RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES = re.compile(r'"[^"]*"') -# Matches characters. Escape codes should already be removed by ESCAPES. 
-_RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES = re.compile(r"'.'") -# Matches multi-line C++ comments. +# Match a single C style comment on the same line. +_RE_PATTERN_C_COMMENTS = r'/\*(?:[^*]|\*(?!/))*\*/' +# Matches multi-line C style comments. # This RE is a little bit more complicated than one might expect, because we # have to take care of space removals tools so we can handle comments inside # statements better. @@ -1025,10 +1262,10 @@ def Error(filename, linenum, category, confidence, message): # if this doesn't work we try on left side but only if there's a non-character # on the right. _RE_PATTERN_CLEANSE_LINE_C_COMMENTS = re.compile( - r"""(\s*/\*.*\*/\s*$| - /\*.*\*/\s+| - \s+/\*.*\*/(?=\W)| - /\*.*\*/)""", re.VERBOSE) + r'(\s*' + _RE_PATTERN_C_COMMENTS + r'\s*$|' + + _RE_PATTERN_C_COMMENTS + r'\s+|' + + r'\s+' + _RE_PATTERN_C_COMMENTS + r'(?=\W)|' + + _RE_PATTERN_C_COMMENTS + r')') def IsCppString(line): @@ -1083,13 +1320,26 @@ def CleanseRawStrings(raw_lines): delimiter = None else: # Haven't found the end yet, append a blank line. - line = '' + line = '""' - else: + # Look for beginning of a raw string, and replace them with + # empty strings. This is done in a loop to handle multiple raw + # strings on the same line. + while delimiter is None: # Look for beginning of a raw string. # See 2.14.15 [lex.string] for syntax. - matched = Match(r'^(.*)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', line) - if matched: + # + # Once we have matched a raw string, we check the prefix of the + # line to make sure that the line is not part of a single line + # comment. It's done this way because we remove raw strings + # before removing comments as opposed to removing comments + # before removing raw strings. This is because there are some + # cpplint checks that requires the comments to be preserved, but + # we don't want to check comments that are inside raw strings. + matched = Match(r'^(.*?)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', line) + if (matched and + not Match(r'^([^\'"]|\'(\\.|[^\'])*\'|"(\\.|[^"])*")*//', + matched.group(1))): delimiter = ')' + matched.group(2) + '"' end = matched.group(3).find(delimiter) @@ -1101,6 +1351,8 @@ def CleanseRawStrings(raw_lines): else: # Start of a multi-line raw string line = matched.group(1) + '""' + else: + break lines_without_raw_strings.append(line) @@ -1131,10 +1383,10 @@ def FindNextMultiLineCommentEnd(lines, lineix): def RemoveMultiLineCommentsFromRange(lines, begin, end): """Clears a range of lines for multi-line comments.""" - # Having // dummy comments makes the lines non-empty, so we will not get + # Having // comments makes the lines non-empty, so we will not get # unnecessary blank line warnings later in the code. for i in range(begin, end): - lines[i] = '// dummy' + lines[i] = '/**/' def RemoveMultiLineComments(filename, lines, error): @@ -1170,12 +1422,14 @@ def CleanseComments(line): class CleansedLines(object): - """Holds 3 copies of all lines with different preprocessing applied to them. + """Holds 4 copies of all lines with different preprocessing applied to them. - 1) elided member contains lines without strings and comments, - 2) lines member contains lines without comments, and + 1) elided member contains lines without strings and comments. + 2) lines member contains lines without comments. 3) raw_lines member contains all the lines without processing. - All these three members are of , and of the same length. + 4) lines_without_raw_strings member is same as raw_lines, but with C++11 raw + strings removed. 
+ All these members are of , and of the same length. """ def __init__(self, lines): @@ -1206,38 +1460,138 @@ def _CollapseStrings(elided): Returns: The line with collapsed strings. """ - if not _RE_PATTERN_INCLUDE.match(elided): - # Remove escaped characters first to make quote/single quote collapsing - # basic. Things that look like escaped characters shouldn't occur - # outside of strings and chars. - elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided) - elided = _RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES.sub("''", elided) - elided = _RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES.sub('""', elided) - return elided + if _RE_PATTERN_INCLUDE.match(elided): + return elided + + # Remove escaped characters first to make quote/single quote collapsing + # basic. Things that look like escaped characters shouldn't occur + # outside of strings and chars. + elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided) + + # Replace quoted strings and digit separators. Both single quotes + # and double quotes are processed in the same loop, otherwise + # nested quotes wouldn't work. + collapsed = '' + while True: + # Find the first quote character + match = Match(r'^([^\'"]*)([\'"])(.*)$', elided) + if not match: + collapsed += elided + break + head, quote, tail = match.groups() + + if quote == '"': + # Collapse double quoted strings + second_quote = tail.find('"') + if second_quote >= 0: + collapsed += head + '""' + elided = tail[second_quote + 1:] + else: + # Unmatched double quote, don't bother processing the rest + # of the line since this is probably a multiline string. + collapsed += elided + break + else: + # Found single quote, check nearby text to eliminate digit separators. + # + # There is no special handling for floating point here, because + # the integer/fractional/exponent parts would all be parsed + # correctly as long as there are digits on both sides of the + # separator. So we are fine as long as we don't see something + # like "0.'3" (gcc 4.9.0 will not allow this literal). + if Search(r'\b(?:0[bBxX]?|[1-9])[0-9a-fA-F]*$', head): + match_literal = Match(r'^((?:\'?[0-9a-zA-Z_])*)(.*)$', "'" + tail) + collapsed += head + match_literal.group(1).replace("'", '') + elided = match_literal.group(2) + else: + second_quote = tail.find('\'') + if second_quote >= 0: + collapsed += head + "''" + elided = tail[second_quote + 1:] + else: + # Unmatched single quote + collapsed += elided + break + return collapsed -def FindEndOfExpressionInLine(line, startpos, depth, startchar, endchar): - """Find the position just after the matching endchar. + +def FindEndOfExpressionInLine(line, startpos, stack): + """Find the position just after the end of current parenthesized expression. Args: line: a CleansedLines line. startpos: start searching at this position. - depth: nesting level at startpos. - startchar: expression opening character. - endchar: expression closing character. + stack: nesting stack at startpos. 
Returns: - On finding matching endchar: (index just after matching endchar, 0) - Otherwise: (-1, new depth at end of this line) + On finding matching end: (index just after matching end, None) + On finding an unclosed expression: (-1, None) + Otherwise: (-1, new stack at end of this line) """ for i in xrange(startpos, len(line)): - if line[i] == startchar: - depth += 1 - elif line[i] == endchar: - depth -= 1 - if depth == 0: - return (i + 1, 0) - return (-1, depth) + char = line[i] + if char in '([{': + # Found start of parenthesized expression, push to expression stack + stack.append(char) + elif char == '<': + # Found potential start of template argument list + if i > 0 and line[i - 1] == '<': + # Left shift operator + if stack and stack[-1] == '<': + stack.pop() + if not stack: + return (-1, None) + elif i > 0 and Search(r'\boperator\s*$', line[0:i]): + # operator<, don't add to stack + continue + else: + # Tentative start of template argument list + stack.append('<') + elif char in ')]}': + # Found end of parenthesized expression. + # + # If we are currently expecting a matching '>', the pending '<' + # must have been an operator. Remove them from expression stack. + while stack and stack[-1] == '<': + stack.pop() + if not stack: + return (-1, None) + if ((stack[-1] == '(' and char == ')') or + (stack[-1] == '[' and char == ']') or + (stack[-1] == '{' and char == '}')): + stack.pop() + if not stack: + return (i + 1, None) + else: + # Mismatched parentheses + return (-1, None) + elif char == '>': + # Found potential end of template argument list. + + # Ignore "->" and operator functions + if (i > 0 and + (line[i - 1] == '-' or Search(r'\boperator\s*$', line[0:i - 1]))): + continue + + # Pop the stack if there is a matching '<'. Otherwise, ignore + # this '>' since it must be an operator. + if stack: + if stack[-1] == '<': + stack.pop() + if not stack: + return (i + 1, None) + elif char == ';': + # Found something that look like end of statements. If we are currently + # expecting a '>', the matching '<' must have been an operator, since + # template argument list should not contain statements. + while stack and stack[-1] == '<': + stack.pop() + if not stack: + return (-1, None) + + # Did not find end of expression or unbalanced parentheses on this line + return (-1, stack) def CloseExpression(clean_lines, linenum, pos): @@ -1246,6 +1600,11 @@ def CloseExpression(clean_lines, linenum, pos): If lines[linenum][pos] points to a '(' or '{' or '[' or '<', finds the linenum/pos that correspond to the closing of the expression. + TODO(unknown): cpplint spends a fair bit of time matching parentheses. + Ideally we would want to index all opening and closing parentheses once + and have CloseExpression be just a simple lookup, but due to preprocessor + tricks, this is not so easy. + Args: clean_lines: A CleansedLines instance containing the file. linenum: The number of the line to check. 
@@ -1259,35 +1618,28 @@ def CloseExpression(clean_lines, linenum, pos): """ line = clean_lines.elided[linenum] - startchar = line[pos] - if startchar not in '({[<': + if (line[pos] not in '({[<') or Match(r'<[<=]', line[pos:]): return (line, clean_lines.NumLines(), -1) - if startchar == '(': endchar = ')' - if startchar == '[': endchar = ']' - if startchar == '{': endchar = '}' - if startchar == '<': endchar = '>' # Check first line - (end_pos, num_open) = FindEndOfExpressionInLine( - line, pos, 0, startchar, endchar) + (end_pos, stack) = FindEndOfExpressionInLine(line, pos, []) if end_pos > -1: return (line, linenum, end_pos) # Continue scanning forward - while linenum < clean_lines.NumLines() - 1: + while stack and linenum < clean_lines.NumLines() - 1: linenum += 1 line = clean_lines.elided[linenum] - (end_pos, num_open) = FindEndOfExpressionInLine( - line, 0, num_open, startchar, endchar) + (end_pos, stack) = FindEndOfExpressionInLine(line, 0, stack) if end_pos > -1: return (line, linenum, end_pos) - # Did not find endchar before end of file, give up + # Did not find end of expression before end of file, give up return (line, clean_lines.NumLines(), -1) -def FindStartOfExpressionInLine(line, endpos, depth, startchar, endchar): - """Find position at the matching startchar. +def FindStartOfExpressionInLine(line, endpos, stack): + """Find position at the matching start of current expression. This is almost the reverse of FindEndOfExpressionInLine, but note that the input position and returned position differs by 1. @@ -1295,22 +1647,72 @@ def FindStartOfExpressionInLine(line, endpos, depth, startchar, endchar): Args: line: a CleansedLines line. endpos: start searching at this position. - depth: nesting level at endpos. - startchar: expression opening character. - endchar: expression closing character. + stack: nesting stack at endpos. Returns: - On finding matching startchar: (index at matching startchar, 0) - Otherwise: (-1, new depth at beginning of this line) + On finding matching start: (index at matching start, None) + On finding an unclosed expression: (-1, None) + Otherwise: (-1, new stack at beginning of this line) """ - for i in xrange(endpos, -1, -1): - if line[i] == endchar: - depth += 1 - elif line[i] == startchar: - depth -= 1 - if depth == 0: - return (i, 0) - return (-1, depth) + i = endpos + while i >= 0: + char = line[i] + if char in ')]}': + # Found end of expression, push to expression stack + stack.append(char) + elif char == '>': + # Found potential end of template argument list. + # + # Ignore it if it's a "->" or ">=" or "operator>" + if (i > 0 and + (line[i - 1] == '-' or + Match(r'\s>=\s', line[i - 1:]) or + Search(r'\boperator\s*$', line[0:i]))): + i -= 1 + else: + stack.append('>') + elif char == '<': + # Found potential start of template argument list + if i > 0 and line[i - 1] == '<': + # Left shift operator + i -= 1 + else: + # If there is a matching '>', we can pop the expression stack. + # Otherwise, ignore this '<' since it must be an operator. + if stack and stack[-1] == '>': + stack.pop() + if not stack: + return (i, None) + elif char in '([{': + # Found start of expression. + # + # If there are any unmatched '>' on the stack, they must be + # operators. Remove those. 
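+ # (Scanning right to left, any '>' still pending when an opening bracket
+ # appears cannot close a template argument list.)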
+      while stack and stack[-1] == '>':
+        stack.pop()
+      if not stack:
+        return (-1, None)
+      if ((char == '(' and stack[-1] == ')') or
+          (char == '[' and stack[-1] == ']') or
+          (char == '{' and stack[-1] == '}')):
+        stack.pop()
+        if not stack:
+          return (i, None)
+      else:
+        # Mismatched parentheses
+        return (-1, None)
+    elif char == ';':
+      # Found something that looks like the end of a statement.  If we are
+      # currently expecting a '<', the matching '>' must have been an operator,
+      # since a template argument list should not contain statements.
+      while stack and stack[-1] == '>':
+        stack.pop()
+      if not stack:
+        return (-1, None)
+
+    i -= 1
+
+  return (-1, stack)
 
 
 def ReverseCloseExpression(clean_lines, linenum, pos):
@@ -1331,30 +1733,23 @@ def ReverseCloseExpression(clean_lines, linenum, pos):
     return is the 'cleansed' line at linenum.
   """
   line = clean_lines.elided[linenum]
-  endchar = line[pos]
-  if endchar not in ')}]>':
+  if line[pos] not in ')}]>':
     return (line, 0, -1)
-  if endchar == ')': startchar = '('
-  if endchar == ']': startchar = '['
-  if endchar == '}': startchar = '{'
-  if endchar == '>': startchar = '<'
 
   # Check last line
-  (start_pos, num_open) = FindStartOfExpressionInLine(
-      line, pos, 0, startchar, endchar)
+  (start_pos, stack) = FindStartOfExpressionInLine(line, pos, [])
   if start_pos > -1:
     return (line, linenum, start_pos)
 
   # Continue scanning backward
-  while linenum > 0:
+  while stack and linenum > 0:
     linenum -= 1
     line = clean_lines.elided[linenum]
-    (start_pos, num_open) = FindStartOfExpressionInLine(
-        line, len(line) - 1, num_open, startchar, endchar)
+    (start_pos, stack) = FindStartOfExpressionInLine(line, len(line) - 1, stack)
     if start_pos > -1:
       return (line, linenum, start_pos)
 
-  # Did not find startchar before beginning of file, give up
+  # Did not find start of expression before beginning of file, give up
   return (line, 0, -1)
 
 
@@ -1362,7 +1757,7 @@ def CheckForCopyright(filename, lines, error):
   """Logs an error if no Copyright message appears at the top of the file."""
 
   # We'll say it should occur by line 10.  Don't forget there's a
-  # dummy line at the front.
+  # placeholder line at the front.
   for line in xrange(1, min(len(lines), 11)):
     if re.search(r'Copyright', lines[line], re.I): break
   else:                       # means no copyright line was found
@@ -1371,6 +1766,46 @@ def CheckForCopyright(filename, lines, error):
           'You should have a line: "Copyright [year] <Copyright Owner>"')
 
 
+def GetIndentLevel(line):
+  """Return the number of leading spaces in line.
+
+  Args:
+    line: A string to check.
+
+  Returns:
+    An integer count of leading spaces, possibly zero.
+  """
+  indent = Match(r'^( *)\S', line)
+  if indent:
+    return len(indent.group(1))
+  else:
+    return 0
+
+def PathSplitToList(path):
+  """Returns the path split into a list by the separator.
+
+  Args:
+    path: An absolute or relative path (e.g. '/a/b/c/' or '../a')
+
+  Returns:
+    A list of path components (e.g. ['a', 'b', 'c']).
+  """
+  lst = []
+  while True:
+    (head, tail) = os.path.split(path)
+    if head == path:  # absolute paths end
+      lst.append(head)
+      break
+    if tail == path:  # relative paths end
+      lst.append(tail)
+      break
+
+    path = head
+    lst.append(tail)
+
+  lst.reverse()
+  return lst
+
 def GetHeaderGuardCPPVariable(filename):
   """Returns the CPP variable that should be used as a header guard.
 
@@ -1387,15 +1822,67 @@ def GetHeaderGuardCPPVariable(filename):
   # flymake.
   filename = re.sub(r'_flymake\.h$', '.h', filename)
   filename = re.sub(r'/\.flymake/([^/]*)$', r'/\1', filename)
+  # Replace 'c++' with 'cpp'.
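PathSplitToList, added above, feeds the --root handling that follows; it peels components off with os.path.split until the path is exhausted. A quick check of its contract under a local, illustrative name (POSIX separators assumed):

import os.path

def path_split_to_list(path):
  # Same loop as PathSplitToList above.
  lst = []
  while True:
    (head, tail) = os.path.split(path)
    if head == path:   # absolute paths end, e.g. '/'
      lst.append(head)
      break
    if tail == path:   # relative paths end, e.g. '..'
      lst.append(tail)
      break
    path = head
    lst.append(tail)
  lst.reverse()
  return lst

assert path_split_to_list('a/b/c') == ['a', 'b', 'c']
assert path_split_to_list('../a') == ['..', 'a']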
+ filename = filename.replace('C++', 'cpp').replace('c++', 'cpp') fileinfo = FileInfo(filename) file_path_from_root = fileinfo.RepositoryName() - if _root: - file_path_from_root = re.sub('^' + _root + os.sep, '', file_path_from_root) - return re.sub(r'[-./\s]', '_', file_path_from_root).upper() + '_' + def FixupPathFromRoot(): + if _root_debug: + sys.stderr.write("\n_root fixup, _root = '%s', repository name = '%s'\n" + %(_root, fileinfo.RepositoryName())) + + # Process the file path with the --root flag if it was set. + if not _root: + if _root_debug: + sys.stderr.write("_root unspecified\n") + return file_path_from_root + + def StripListPrefix(lst, prefix): + # f(['x', 'y'], ['w, z']) -> None (not a valid prefix) + if lst[:len(prefix)] != prefix: + return None + # f(['a, 'b', 'c', 'd'], ['a', 'b']) -> ['c', 'd'] + return lst[(len(prefix)):] + + # root behavior: + # --root=subdir , lstrips subdir from the header guard + maybe_path = StripListPrefix(PathSplitToList(file_path_from_root), + PathSplitToList(_root)) + + if _root_debug: + sys.stderr.write(("_root lstrip (maybe_path=%s, file_path_from_root=%s," + + " _root=%s)\n") %(maybe_path, file_path_from_root, _root)) + + if maybe_path: + return os.path.join(*maybe_path) + + # --root=.. , will prepend the outer directory to the header guard + full_path = fileinfo.FullName() + root_abspath = os.path.abspath(_root) + + maybe_path = StripListPrefix(PathSplitToList(full_path), + PathSplitToList(root_abspath)) + + if _root_debug: + sys.stderr.write(("_root prepend (maybe_path=%s, full_path=%s, " + + "root_abspath=%s)\n") %(maybe_path, full_path, root_abspath)) + + if maybe_path: + return os.path.join(*maybe_path) -def CheckForHeaderGuard(filename, lines, error): + if _root_debug: + sys.stderr.write("_root ignore, returning %s\n" %(file_path_from_root)) + + # --root=FAKE_DIR is ignored + return file_path_from_root + + file_path_from_root = FixupPathFromRoot() + return re.sub(r'[^a-zA-Z0-9]', '_', file_path_from_root).upper() + '_' + + +def CheckForHeaderGuard(filename, clean_lines, error): """Checks that the file contains a header guard. Logs an error if no #ifndef header guard is present. For other @@ -1403,18 +1890,29 @@ def CheckForHeaderGuard(filename, lines, error): Args: filename: The name of the C++ header file. - lines: An array of strings, each representing a line of the file. + clean_lines: A CleansedLines instance containing the file. error: The function to call with any errors found. """ + # Don't check for header guards if there are error suppression + # comments somewhere in this file. + # + # Because this is silencing a warning for a nonexistent line, we + # only support the very specific NOLINT(build/header_guard) syntax, + # and not the general NOLINT or NOLINT(*) syntax. 
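Once the --root fixup settles on a repository-relative path, the guard name is simply that path with every non-alphanumeric character mapped to '_' and upper-cased, per the return statement above. For example (the path is illustrative):

import re

def guard_variable(file_path_from_root):
  # Mirrors the final return of GetHeaderGuardCPPVariable.
  return re.sub(r'[^a-zA-Z0-9]', '_', file_path_from_root).upper() + '_'

assert guard_variable('vpx/vpx_encoder.h') == 'VPX_VPX_ENCODER_H_'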
+ raw_lines = clean_lines.lines_without_raw_strings + for i in raw_lines: + if Search(r'//\s*NOLINT\(build/header_guard\)', i): + return + cppvar = GetHeaderGuardCPPVariable(filename) - ifndef = None + ifndef = '' ifndef_linenum = 0 - define = None - endif = None + define = '' + endif = '' endif_linenum = 0 - for linenum, line in enumerate(lines): + for linenum, line in enumerate(raw_lines): linesplit = line.split() if len(linesplit) >= 2: # find the first occurrence of #ifndef and #define, save arg @@ -1429,18 +1927,12 @@ def CheckForHeaderGuard(filename, lines, error): endif = line endif_linenum = linenum - if not ifndef: + if not ifndef or not define or ifndef != define: error(filename, 0, 'build/header_guard', 5, 'No #ifndef header guard found, suggested CPP variable is: %s' % cppvar) return - if not define: - error(filename, 0, 'build/header_guard', 5, - 'No #define header guard found, suggested CPP variable is: %s' % - cppvar) - return - # The guard should be PATH_FILE_H_, but we also allow PATH_FILE_H__ # for backward compatibility. if ifndef != cppvar: @@ -1448,26 +1940,69 @@ def CheckForHeaderGuard(filename, lines, error): if ifndef != cppvar + '_': error_level = 5 - ParseNolintSuppressions(filename, lines[ifndef_linenum], ifndef_linenum, + ParseNolintSuppressions(filename, raw_lines[ifndef_linenum], ifndef_linenum, error) error(filename, ifndef_linenum, 'build/header_guard', error_level, '#ifndef header guard has wrong style, please use: %s' % cppvar) - if define != ifndef: - error(filename, 0, 'build/header_guard', 5, - '#ifndef and #define don\'t match, suggested CPP variable is: %s' % - cppvar) + # Check for "//" comments on endif line. + ParseNolintSuppressions(filename, raw_lines[endif_linenum], endif_linenum, + error) + match = Match(r'#endif\s*//\s*' + cppvar + r'(_)?\b', endif) + if match: + if match.group(1) == '_': + # Issue low severity warning for deprecated double trailing underscore + error(filename, endif_linenum, 'build/header_guard', 0, + '#endif line should be "#endif // %s"' % cppvar) return - if endif != ('#endif // %s' % cppvar): - error_level = 0 - if endif != ('#endif // %s' % (cppvar + '_')): - error_level = 5 + # Didn't find the corresponding "//" comment. If this file does not + # contain any "//" comments at all, it could be that the compiler + # only wants "/**/" comments, look for those instead. 
+ no_single_line_comments = True + for i in xrange(1, len(raw_lines) - 1): + line = raw_lines[i] + if Match(r'^(?:(?:\'(?:\.|[^\'])*\')|(?:"(?:\.|[^"])*")|[^\'"])*//', line): + no_single_line_comments = False + break - ParseNolintSuppressions(filename, lines[endif_linenum], endif_linenum, - error) - error(filename, endif_linenum, 'build/header_guard', error_level, - '#endif line should be "#endif // %s"' % cppvar) + if no_single_line_comments: + match = Match(r'#endif\s*/\*\s*' + cppvar + r'(_)?\s*\*/', endif) + if match: + if match.group(1) == '_': + # Low severity warning for double trailing underscore + error(filename, endif_linenum, 'build/header_guard', 0, + '#endif line should be "#endif /* %s */"' % cppvar) + return + + # Didn't find anything + error(filename, endif_linenum, 'build/header_guard', 5, + '#endif line should be "#endif // %s"' % cppvar) + + +def CheckHeaderFileIncluded(filename, include_state, error): + """Logs an error if a .cc file does not include its header.""" + + # Do not check test files + fileinfo = FileInfo(filename) + if Search(_TEST_FILE_SUFFIX, fileinfo.BaseName()): + return + + headerfile = filename[0:len(filename) - len(fileinfo.Extension())] + '.h' + if not os.path.exists(headerfile): + return + headername = FileInfo(headerfile).RepositoryName() + first_include = 0 + for section_list in include_state.include_list: + for f in section_list: + if headername in f[0] or f[0] in headername: + return + if not first_include: + first_include = f[1] + + error(filename, first_include, 'build/include', 5, + '%s should include its header file %s' % (fileinfo.RepositoryName(), + headername)) def CheckForBadCharacters(filename, lines, error): @@ -1551,19 +2086,33 @@ def CheckForMultilineCommentsAndStrings(filename, clean_lines, linenum, error): 'Use C++11 raw strings or concatenation instead.') -threading_list = ( - ('asctime(', 'asctime_r('), - ('ctime(', 'ctime_r('), - ('getgrgid(', 'getgrgid_r('), - ('getgrnam(', 'getgrnam_r('), - ('getlogin(', 'getlogin_r('), - ('getpwnam(', 'getpwnam_r('), - ('getpwuid(', 'getpwuid_r('), - ('gmtime(', 'gmtime_r('), - ('localtime(', 'localtime_r('), - ('rand(', 'rand_r('), - ('strtok(', 'strtok_r('), - ('ttyname(', 'ttyname_r('), +# (non-threadsafe name, thread-safe alternative, validation pattern) +# +# The validation pattern is used to eliminate false positives such as: +# _rand(); // false positive due to substring match. +# ->rand(); // some member function rand(). +# ACMRandom rand(seed); // some variable named rand. +# ISAACRandom rand(); // another variable named rand. +# +# Basically we require the return value of these functions to be used +# in some expression context on the same line by matching on some +# operator before the function name. This eliminates constructors and +# member function calls. 
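To see the effect of the validation patterns described above (and defined just below), here is the rand() entry exercised on the motivating cases; a standalone check, not cpplint's own harness:

import re

_UNSAFE_FUNC_PREFIX = r'(?:[-+*/=%^&|(<]\s*|>\s+)'
rand_pattern = re.compile(_UNSAFE_FUNC_PREFIX + r'rand\(\)')

assert rand_pattern.search('int x = rand();')            # value used: flagged
assert not rand_pattern.search('ACMRandom rand(seed);')  # constructor: ignored
assert not rand_pattern.search('ptr->rand();')           # member call: ignored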
+_UNSAFE_FUNC_PREFIX = r'(?:[-+*/=%^&|(<]\s*|>\s+)' +_THREADING_LIST = ( + ('asctime(', 'asctime_r(', _UNSAFE_FUNC_PREFIX + r'asctime\([^)]+\)'), + ('ctime(', 'ctime_r(', _UNSAFE_FUNC_PREFIX + r'ctime\([^)]+\)'), + ('getgrgid(', 'getgrgid_r(', _UNSAFE_FUNC_PREFIX + r'getgrgid\([^)]+\)'), + ('getgrnam(', 'getgrnam_r(', _UNSAFE_FUNC_PREFIX + r'getgrnam\([^)]+\)'), + ('getlogin(', 'getlogin_r(', _UNSAFE_FUNC_PREFIX + r'getlogin\(\)'), + ('getpwnam(', 'getpwnam_r(', _UNSAFE_FUNC_PREFIX + r'getpwnam\([^)]+\)'), + ('getpwuid(', 'getpwuid_r(', _UNSAFE_FUNC_PREFIX + r'getpwuid\([^)]+\)'), + ('gmtime(', 'gmtime_r(', _UNSAFE_FUNC_PREFIX + r'gmtime\([^)]+\)'), + ('localtime(', 'localtime_r(', _UNSAFE_FUNC_PREFIX + r'localtime\([^)]+\)'), + ('rand(', 'rand_r(', _UNSAFE_FUNC_PREFIX + r'rand\(\)'), + ('strtok(', 'strtok_r(', + _UNSAFE_FUNC_PREFIX + r'strtok\([^)]+\)'), + ('ttyname(', 'ttyname_r(', _UNSAFE_FUNC_PREFIX + r'ttyname\([^)]+\)'), ) @@ -1583,14 +2132,13 @@ def CheckPosixThreading(filename, clean_lines, linenum, error): error: The function to call with any errors found. """ line = clean_lines.elided[linenum] - for single_thread_function, multithread_safe_function in threading_list: - ix = line.find(single_thread_function) - # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison - if ix >= 0 and (ix == 0 or (not line[ix - 1].isalnum() and - line[ix - 1] not in ('_', '.', '>'))): + for single_thread_func, multithread_safe_func, pattern in _THREADING_LIST: + # Additional pattern matching check to confirm that this is the + # function we are looking for + if Search(pattern, line): error(filename, linenum, 'runtime/threadsafe_fn', 2, - 'Consider using ' + multithread_safe_function + - '...) instead of ' + single_thread_function + + 'Consider using ' + multithread_safe_func + + '...) instead of ' + single_thread_func + '...) for improved thread safety.') @@ -1612,7 +2160,6 @@ def CheckVlogArguments(filename, clean_lines, linenum, error): 'VLOG() should be used with numeric verbosity level. ' 'Use LOG() if you want symbolic severity levels.') - # Matches invalid increment: *count++, which moves pointer instead of # incrementing a value. _RE_PATTERN_INVALID_INCREMENT = re.compile( @@ -1641,13 +2188,29 @@ def CheckInvalidIncrement(filename, clean_lines, linenum, error): 'Changing pointer instead of value (or unused value of operator*).') +def IsMacroDefinition(clean_lines, linenum): + if Search(r'^#define', clean_lines[linenum]): + return True + + if linenum > 0 and Search(r'\\$', clean_lines[linenum - 1]): + return True + + return False + + +def IsForwardClassDeclaration(clean_lines, linenum): + return Match(r'^\s*(\btemplate\b)*.*class\s+\w+;\s*$', clean_lines[linenum]) + + class _BlockInfo(object): """Stores information about a generic block of code.""" - def __init__(self, seen_open_brace): + def __init__(self, linenum, seen_open_brace): + self.starting_linenum = linenum self.seen_open_brace = seen_open_brace self.open_parentheses = 0 self.inline_asm = _NO_ASM + self.check_namespace_indentation = False def CheckBegin(self, filename, clean_lines, linenum, error): """Run checks that applies to text up to the opening brace. @@ -1677,15 +2240,33 @@ def CheckEnd(self, filename, clean_lines, linenum, error): """ pass + def IsBlockInfo(self): + """Returns true if this block is a _BlockInfo. + + This is convenient for verifying that an object is an instance of + a _BlockInfo, but not an instance of any of the derived classes. 
+ + Returns: + True for this class, False for derived classes. + """ + return self.__class__ == _BlockInfo + + +class _ExternCInfo(_BlockInfo): + """Stores information about an 'extern "C"' block.""" + + def __init__(self, linenum): + _BlockInfo.__init__(self, linenum, True) + class _ClassInfo(_BlockInfo): """Stores information about a class.""" def __init__(self, name, class_or_struct, clean_lines, linenum): - _BlockInfo.__init__(self, False) + _BlockInfo.__init__(self, linenum, False) self.name = name - self.starting_linenum = linenum self.is_derived = False + self.check_namespace_indentation = True if class_or_struct == 'struct': self.access = 'public' self.is_struct = True @@ -1695,11 +2276,7 @@ def __init__(self, name, class_or_struct, clean_lines, linenum): # Remember initial indentation level for this class. Using raw_lines here # instead of elided to account for leading comments. - initial_indent = Match(r'^( *)\S', clean_lines.raw_lines[linenum]) - if initial_indent: - self.class_indent = len(initial_indent.group(1)) - else: - self.class_indent = 0 + self.class_indent = GetIndentLevel(clean_lines.raw_lines[linenum]) # Try to find the end of the class. This will be confused by things like: # class A { @@ -1721,6 +2298,23 @@ def CheckBegin(self, filename, clean_lines, linenum, error): self.is_derived = True def CheckEnd(self, filename, clean_lines, linenum, error): + # If there is a DISALLOW macro, it should appear near the end of + # the class. + seen_last_thing_in_class = False + for i in xrange(linenum - 1, self.starting_linenum, -1): + match = Search( + r'\b(DISALLOW_COPY_AND_ASSIGN|DISALLOW_IMPLICIT_CONSTRUCTORS)\(' + + self.name + r'\)', + clean_lines.elided[i]) + if match: + if seen_last_thing_in_class: + error(filename, i, 'readability/constructors', 3, + match.group(1) + ' should be the last thing in the class') + break + + if not Match(r'^\s*$', clean_lines.elided[i]): + seen_last_thing_in_class = True + # Check that closing brace is aligned with beginning of the class. # Only do this if the closing brace is indented by only whitespaces. # This means we will not check single-line class definitions. @@ -1738,9 +2332,9 @@ class _NamespaceInfo(_BlockInfo): """Stores information about a namespace.""" def __init__(self, name, linenum): - _BlockInfo.__init__(self, False) + _BlockInfo.__init__(self, linenum, False) self.name = name or '' - self.starting_linenum = linenum + self.check_namespace_indentation = True def CheckEnd(self, filename, clean_lines, linenum, error): """Check end of namespace comments.""" @@ -1758,7 +2352,7 @@ def CheckEnd(self, filename, clean_lines, linenum, error): # deciding what these nontrivial things are, so this check is # triggered by namespace size only, which works most of the time. if (linenum - self.starting_linenum < 10 - and not Match(r'};*\s*(//|/\*).*\bnamespace\b', line)): + and not Match(r'^\s*};*\s*(//|/\*).*\bnamespace\b', line)): return # Look for matching comment at end of namespace. @@ -1775,17 +2369,24 @@ def CheckEnd(self, filename, clean_lines, linenum, error): # expected namespace. 
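The DISALLOW check added to _ClassInfo.CheckEnd above walks upward from the closing brace and only accepts the macro if nothing non-blank appears below it. A compact sketch of that scan, with the class body as a plain list of lines (names are illustrative):

import re

def disallow_is_last(body_lines, class_name):
  # Walk upward from the closing brace, as _ClassInfo.CheckEnd does.
  seen_last_thing_in_class = False
  for line in reversed(body_lines):
    if re.search(r'\bDISALLOW_COPY_AND_ASSIGN\(' + class_name + r'\)', line):
      return not seen_last_thing_in_class
    if line.strip():
      seen_last_thing_in_class = True
  return True  # no macro at all, nothing to flag

assert disallow_is_last(['  DISALLOW_COPY_AND_ASSIGN(Foo);'], 'Foo')
assert not disallow_is_last(['  DISALLOW_COPY_AND_ASSIGN(Foo);',
                             '  void Bar();'], 'Foo')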
if self.name: # Named namespace - if not Match((r'};*\s*(//|/\*).*\bnamespace\s+' + re.escape(self.name) + - r'[\*/\.\\\s]*$'), + if not Match((r'^\s*};*\s*(//|/\*).*\bnamespace\s+' + + re.escape(self.name) + r'[\*/\.\\\s]*$'), line): error(filename, linenum, 'readability/namespace', 5, 'Namespace should be terminated with "// namespace %s"' % self.name) else: # Anonymous namespace - if not Match(r'};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line): - error(filename, linenum, 'readability/namespace', 5, - 'Namespace should be terminated with "// namespace"') + if not Match(r'^\s*};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line): + # If "// namespace anonymous" or "// anonymous namespace (more text)", + # mention "// anonymous namespace" as an acceptable form + if Match(r'^\s*}.*\b(namespace anonymous|anonymous namespace)\b', line): + error(filename, linenum, 'readability/namespace', 5, + 'Anonymous namespace should be terminated with "// namespace"' + ' or "// anonymous namespace"') + else: + error(filename, linenum, 'readability/namespace', 5, + 'Anonymous namespace should be terminated with "// namespace"') class _PreprocessorInfo(object): @@ -1802,7 +2403,7 @@ def __init__(self, stack_before_if): self.seen_else = False -class _NestingState(object): +class NestingState(object): """Holds states related to parsing braces.""" def __init__(self): @@ -1814,6 +2415,17 @@ def __init__(self): # - _BlockInfo: some other type of block. self.stack = [] + # Top of the previous stack before each Update(). + # + # Because the nesting_stack is updated at the end of each line, we + # had to do some convoluted checks to find out what is the current + # scope at the beginning of the line. This check is simplified by + # saving the previous top of nesting stack. + # + # We could save the full stack, but we only need the top. Copying + # the full nesting stack would slow down cpplint by ~10%. + self.previous_stack_top = [] + # Stack of _PreprocessorInfo objects. self.pp_stack = [] @@ -1834,6 +2446,82 @@ def InNamespaceBody(self): """ return self.stack and isinstance(self.stack[-1], _NamespaceInfo) + def InExternC(self): + """Check if we are currently one level inside an 'extern "C"' block. + + Returns: + True if top of the stack is an extern block, False otherwise. + """ + return self.stack and isinstance(self.stack[-1], _ExternCInfo) + + def InClassDeclaration(self): + """Check if we are currently one level inside a class or struct declaration. + + Returns: + True if top of the stack is a class/struct, False otherwise. + """ + return self.stack and isinstance(self.stack[-1], _ClassInfo) + + def InAsmBlock(self): + """Check if we are currently one level inside an inline ASM block. + + Returns: + True if the top of the stack is a block containing inline ASM. + """ + return self.stack and self.stack[-1].inline_asm != _NO_ASM + + def InTemplateArgumentList(self, clean_lines, linenum, pos): + """Check if current position is inside template argument list. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + pos: position just after the suspected template argument. + Returns: + True if (linenum, pos) is inside template arguments. 
+    """
+    while linenum < clean_lines.NumLines():
+      # Find the earliest character that might indicate a template argument
+      line = clean_lines.elided[linenum]
+      match = Match(r'^[^{};=\[\]\.<>]*(.)', line[pos:])
+      if not match:
+        linenum += 1
+        pos = 0
+        continue
+      token = match.group(1)
+      pos += len(match.group(0))
+
+      # These things do not look like template argument list:
+      #   class Suspect {
+      #   class Suspect x; }
+      if token in ('{', '}', ';'): return False
+
+      # These things look like template argument list:
+      #   template <class Suspect>
+      #   template <class Suspect = default_value>
+      #   template <class Suspect[]>
+      #   template <class Suspect...>
+      if token in ('>', '=', '[', ']', '.'): return True
+
+      # Check if token is an unmatched '<'.
+      # If not, move on to the next character.
+      if token != '<':
+        pos += 1
+        if pos >= len(line):
+          linenum += 1
+          pos = 0
+        continue
+
+      # We can't be sure if we just find a single '<', and need to
+      # find the matching '>'.
+      (_, end_line, end_pos) = CloseExpression(clean_lines, linenum, pos - 1)
+      if end_pos < 0:
+        # Not sure if template argument list or syntax error in file
+        return False
+      linenum = end_line
+      pos = end_pos
+    return False
+
   def UpdatePreprocessor(self, line):
     """Update preprocessor stack.
@@ -1890,6 +2578,7 @@ def UpdatePreprocessor(self, line):
       # TODO(unknown): unexpected #endif, issue warning?
       pass
 
+  # TODO(unknown): Update() is too long, but we will refactor later.
   def Update(self, filename, clean_lines, linenum, error):
     """Update nesting state with current line.
 
@@ -1901,7 +2590,17 @@ def Update(self, filename, clean_lines, linenum, error):
     """
     line = clean_lines.elided[linenum]
 
-    # Update pp_stack first
+    # Remember top of the previous nesting stack.
+    #
+    # The stack is always pushed/popped and not modified in place, so
+    # we can just do a shallow copy instead of copy.deepcopy.  Using
+    # deepcopy would slow down cpplint by ~28%.
+    if self.stack:
+      self.previous_stack_top = self.stack[-1]
+    else:
+      self.previous_stack_top = None
+
+    # Update pp_stack
     self.UpdatePreprocessor(line)
 
     # Count parentheses.  This is to avoid adding struct arguments to
@@ -1952,32 +2651,27 @@ def Update(self, filename, clean_lines, linenum, error):
     # such as in:
     #   class LOCKABLE API Object {
     #   };
-    #
-    # Templates with class arguments may confuse the parser, for example:
-    #   template <class Comparator = less<string>,
-    #             class Vector = vector<string> >
-    #   class HeapQueue {
-    #
-    # Because this parser has no nesting state about templates, by the
-    # time it saw "class Comparator", it may think that it's a new class.
-    # Nested templates have a similar problem:
-    #   template <
-    #       typename ExportedType,
-    #       typename TupleType,
-    #       template <typename> class ImplTemplate>
-    #
-    # To avoid these cases, we ignore classes that are followed by '=' or '>'
     class_decl_match = Match(
-        r'\s*(template\s*<[\w\s<>,:]*>\s*)?'
-        r'(class|struct)\s+([A-Z_]+\s+)*(\w+(?:::\w+)*)'
-        r'(([^=>]|<[^<>]*>|<[^<>]*<[^<>]*>\s*>)*)$', line)
+        r'^(\s*(?:template\s*<[\w\s<>,:]*>\s*)?'
+        r'(class|struct)\s+(?:[A-Z_]+\s+)*(\w+(?:::\w+)*))'
+        r'(.*)$', line)
     if (class_decl_match and
         (not self.stack or self.stack[-1].open_parentheses == 0)):
-      self.stack.append(_ClassInfo(
-          class_decl_match.group(4), class_decl_match.group(2),
-          clean_lines, linenum))
-      line = class_decl_match.group(5)
+      # We do not want to accept classes that are actually template arguments:
+      #   template <class Ignore1,
+      #             class Ignore2 = Default<Args>,
+      #             template <typename> class Ignore3>
+      #   void Function() {};
+      #
+      # To avoid template argument cases, we scan forward and look for
+      # an unmatched '>'.  If we see one, assume we are inside a
+      # template argument list.
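InTemplateArgumentList boils down to classifying the first 'interesting' token after the declaration: statement-like tokens rule a template argument list out, declarator-like tokens confirm it, and a lone '<' forces the CloseExpression recursion. The decision table, distilled (helper name is illustrative):

def looks_like_template_argument(token):
  # Statement context rules it out, e.g. "class Suspect {".
  if token in ('{', '}', ';'):
    return False
  # Declarator-like tokens confirm it, e.g. "template <class Suspect>".
  if token in ('>', '=', '[', ']', '.'):
    return True
  return None  # '<' or an ordinary character: undecided, keep scanning

assert looks_like_template_argument('{') is False
assert looks_like_template_argument('>') is True
assert looks_like_template_argument('<') is None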
+ end_declaration = len(class_decl_match.group(1)) + if not self.InTemplateArgumentList(clean_lines, linenum, end_declaration): + self.stack.append(_ClassInfo( + class_decl_match.group(3), class_decl_match.group(2), + clean_lines, linenum)) + line = class_decl_match.group(4) # If we have not yet seen the opening brace for the innermost block, # run checks here. @@ -2024,10 +2718,13 @@ def Update(self, filename, clean_lines, linenum, error): # stack otherwise. if not self.SeenOpenBrace(): self.stack[-1].seen_open_brace = True + elif Match(r'^extern\s*"[^"]*"\s*\{', line): + self.stack.append(_ExternCInfo(linenum)) else: - self.stack.append(_BlockInfo(True)) + self.stack.append(_BlockInfo(linenum, True)) if _MATCH_ASM.match(line): self.stack[-1].inline_asm = _BLOCK_ASM + elif token == ';' or token == ')': # If we haven't seen an opening brace yet, but we already saw # a semicolon, this is probably a forward declaration. Pop @@ -2103,7 +2800,7 @@ def CheckForNonStandardConstructs(filename, clean_lines, linenum, filename: The name of the current file. clean_lines: A CleansedLines instance containing the file. linenum: The number of the line to check. - nesting_state: A _NestingState instance which maintains information about + nesting_state: A NestingState instance which maintains information about the current stack of nested blocks being parsed. error: A callable to which errors are reported, which takes 4 arguments: filename, line number, error level, and message @@ -2136,7 +2833,8 @@ def CheckForNonStandardConstructs(filename, clean_lines, linenum, r'\s+(register|static|extern|typedef)\b', line): error(filename, linenum, 'build/storage_class', 5, - 'Storage class (static, extern, typedef, etc) should be first.') + 'Storage-class specifier (static, extern, typedef, etc) should be ' + 'at the beginning of the declaration.') if Match(r'\s*#\s*endif\s*[^/\s]+', line): error(filename, linenum, 'build/endif_comment', 5, @@ -2176,26 +2874,79 @@ def CheckForNonStandardConstructs(filename, clean_lines, linenum, # Look for single-argument constructors that aren't marked explicit. # Technically a valid construct, but against style. - args = Match(r'\s+(?:inline\s+)?%s\s*\(([^,()]+)\)' - % re.escape(base_classname), - line) - if (args and - args.group(1) != 'void' and - not Match(r'(const\s+)?%s(\s+const)?\s*(?:<\w+>\s*)?&' - % re.escape(base_classname), args.group(1).strip())): - error(filename, linenum, 'runtime/explicit', 5, - 'Single-argument constructors should be marked explicit.') - - -def CheckSpacingForFunctionCall(filename, line, linenum, error): + explicit_constructor_match = Match( + r'\s+(?:(?:inline|constexpr)\s+)*(explicit\s+)?' 
+      r'(?:(?:inline|constexpr)\s+)*%s\s*'
+      r'\(((?:[^()]|\([^()]*\))*)\)'
+      % re.escape(base_classname),
+      line)
+
+  if explicit_constructor_match:
+    is_marked_explicit = explicit_constructor_match.group(1)
+
+    if not explicit_constructor_match.group(2):
+      constructor_args = []
+    else:
+      constructor_args = explicit_constructor_match.group(2).split(',')
+
+    # collapse arguments so that commas in template parameter lists and function
+    # argument parameter lists don't split arguments in two
+    i = 0
+    while i < len(constructor_args):
+      constructor_arg = constructor_args[i]
+      while (constructor_arg.count('<') > constructor_arg.count('>') or
+             constructor_arg.count('(') > constructor_arg.count(')')):
+        constructor_arg += ',' + constructor_args[i + 1]
+        del constructor_args[i + 1]
+      constructor_args[i] = constructor_arg
+      i += 1
+
+    defaulted_args = [arg for arg in constructor_args if '=' in arg]
+    noarg_constructor = (not constructor_args or  # empty arg list
+                         # 'void' arg specifier
+                         (len(constructor_args) == 1 and
+                          constructor_args[0].strip() == 'void'))
+    onearg_constructor = ((len(constructor_args) == 1 and  # exactly one arg
+                           not noarg_constructor) or
+                          # all but at most one arg defaulted
+                          (len(constructor_args) >= 1 and
+                           not noarg_constructor and
+                           len(defaulted_args) >= len(constructor_args) - 1))
+    initializer_list_constructor = bool(
+        onearg_constructor and
+        Search(r'\bstd\s*::\s*initializer_list\b', constructor_args[0]))
+    copy_constructor = bool(
+        onearg_constructor and
+        Match(r'(const\s+)?%s(\s*<[^>]*>)?(\s+const)?\s*(?:<\w+>\s*)?&'
+              % re.escape(base_classname), constructor_args[0].strip()))
+
+    if (not is_marked_explicit and
+        onearg_constructor and
+        not initializer_list_constructor and
+        not copy_constructor):
+      if defaulted_args:
+        error(filename, linenum, 'runtime/explicit', 5,
+              'Constructors callable with one argument '
+              'should be marked explicit.')
+      else:
+        error(filename, linenum, 'runtime/explicit', 5,
+              'Single-parameter constructors should be marked explicit.')
+    elif is_marked_explicit and not onearg_constructor:
+      if noarg_constructor:
+        error(filename, linenum, 'runtime/explicit', 5,
+              'Zero-parameter constructors should not be marked explicit.')
+
+
+def CheckSpacingForFunctionCall(filename, clean_lines, linenum, error):
   """Checks for the correctness of various spacing around function calls.
 
   Args:
     filename: The name of the current file.
-    line: The text of the line to check.
+    clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
     error: The function to call with any errors found.
   """
+  line = clean_lines.elided[linenum]
 
   # Since function calls often occur inside if/for/while/switch
   # expressions - which have their own, more liberal conventions - we
@@ -2238,10 +2989,18 @@ def CheckSpacingForFunctionCall(filename, line, linenum, error):
     error(filename, linenum, 'whitespace/parens', 2,
           'Extra space after (')
   if (Search(r'\w\s+\(', fncall) and
-      not Search(r'#\s*define|typedef', fncall) and
-      not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall)):
-    error(filename, linenum, 'whitespace/parens', 4,
-          'Extra space before ( in function call')
+      not Search(r'_{0,2}asm_{0,2}\s+_{0,2}volatile_{0,2}\s+\(', fncall) and
+      not Search(r'#\s*define|typedef|using\s+\w+\s*=', fncall) and
+      not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall) and
+      not Search(r'\bcase\s+\(', fncall)):
+    # TODO(unknown): Space after an operator function seems to be a common
+    # error, silence those for now by restricting them to highest verbosity.
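The argument-collapsing loop in the explicit-constructor check re-joins fragments that a naive split(',') produced inside unbalanced '<>' or '()'. Its behavior on a toy argument list (inputs are illustrative):

def collapse_args(constructor_args):
  # Merge fragments until '<'/'>' and '('/')' counts balance, as above.
  i = 0
  while i < len(constructor_args):
    arg = constructor_args[i]
    while (arg.count('<') > arg.count('>') or
           arg.count('(') > arg.count(')')):
      arg += ',' + constructor_args[i + 1]
      del constructor_args[i + 1]
    constructor_args[i] = arg
    i += 1
  return constructor_args

args = 'std::map<int, int> m'.split(',')   # ['std::map<int', ' int> m']
assert collapse_args(args) == ['std::map<int, int> m']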
+ if Search(r'\boperator_*\b', line): + error(filename, linenum, 'whitespace/parens', 0, + 'Extra space before ( in function call') + else: + error(filename, linenum, 'whitespace/parens', 4, + 'Extra space before ( in function call') # If the ) is followed only by a newline or a { + newline, assume it's # part of a control statement (if/while/etc), and don't complain if Search(r'[^)]\s+\)\s*[^{\s]', fncall): @@ -2270,12 +3029,26 @@ def IsBlankLine(line): return not line or line.isspace() +def CheckForNamespaceIndentation(filename, nesting_state, clean_lines, line, + error): + is_namespace_indent_item = ( + len(nesting_state.stack) > 1 and + nesting_state.stack[-1].check_namespace_indentation and + isinstance(nesting_state.previous_stack_top, _NamespaceInfo) and + nesting_state.previous_stack_top == nesting_state.stack[-2]) + + if ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item, + clean_lines.elided, line): + CheckItemIndentationInNamespace(filename, clean_lines.elided, + line, error) + + def CheckForFunctionLengths(filename, clean_lines, linenum, function_state, error): """Reports for long function bodies. For an overview why this is done, see: - http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions + https://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions Uses a simplistic algorithm assuming other style guidelines (especially spacing) are followed. @@ -2295,8 +3068,6 @@ def CheckForFunctionLengths(filename, clean_lines, linenum, """ lines = clean_lines.lines line = lines[linenum] - raw = clean_lines.raw_lines - raw_line = raw[linenum] joined_line = '' starting_func = False @@ -2343,190 +3114,58 @@ def CheckForFunctionLengths(filename, clean_lines, linenum, _RE_PATTERN_TODO = re.compile(r'^//(\s*)TODO(\(.+?\))?:?(\s|$)?') -def CheckComment(comment, filename, linenum, error): - """Checks for common mistakes in TODO comments. +def CheckComment(line, filename, linenum, next_line_start, error): + """Checks for common mistakes in comments. Args: - comment: The text of the comment from the line in question. + line: The line in question. filename: The name of the current file. linenum: The number of the line to check. + next_line_start: The first non-whitespace column of the next line. error: The function to call with any errors found. """ - match = _RE_PATTERN_TODO.match(comment) - if match: - # One whitespace is correct; zero whitespace is handled elsewhere. - leading_whitespace = match.group(1) - if len(leading_whitespace) > 1: - error(filename, linenum, 'whitespace/todo', 2, - 'Too many spaces before TODO') - - username = match.group(2) - if not username: - error(filename, linenum, 'readability/todo', 2, - 'Missing username in TODO; it should look like ' - '"// TODO(my_username): Stuff."') - - middle_whitespace = match.group(3) - # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison - if middle_whitespace != ' ' and middle_whitespace != '': - error(filename, linenum, 'whitespace/todo', 2, - 'TODO(my_username) should be followed by a space') - -def CheckAccess(filename, clean_lines, linenum, nesting_state, error): - """Checks for improper use of DISALLOW* macros. + commentpos = line.find('//') + if commentpos != -1: + # Check if the // may be in quotes. 
If so, ignore it
+    if re.sub(r'\\.', '', line[0:commentpos]).count('"') % 2 == 0:
+      # Allow one space for new scopes, two spaces otherwise:
+      if (not (Match(r'^.*{ *//', line) and next_line_start == commentpos) and
+          ((commentpos >= 1 and
+            line[commentpos-1] not in string.whitespace) or
+           (commentpos >= 2 and
+            line[commentpos-2] not in string.whitespace))):
+        error(filename, linenum, 'whitespace/comments', 2,
+              'At least two spaces is best between code and comments')
 
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    nesting_state: A _NestingState instance which maintains information about
-      the current stack of nested blocks being parsed.
-    error: The function to call with any errors found.
-  """
-  line = clean_lines.elided[linenum]  # get rid of comments and strings
-
-  matched = Match((r'\s*(DISALLOW_COPY_AND_ASSIGN|'
-                   r'DISALLOW_EVIL_CONSTRUCTORS|'
-                   r'DISALLOW_IMPLICIT_CONSTRUCTORS)'), line)
-  if not matched:
-    return
-  if nesting_state.stack and isinstance(nesting_state.stack[-1], _ClassInfo):
-    if nesting_state.stack[-1].access != 'private':
-      error(filename, linenum, 'readability/constructors', 3,
-            '%s must be in the private: section' % matched.group(1))
-
-  else:
-    # Found DISALLOW* macro outside a class declaration, or perhaps it
-    # was used inside a function when it should have been part of the
-    # class declaration.  We could issue a warning here, but it
-    # probably resulted in a compiler error already.
-    pass
-
-
-def FindNextMatchingAngleBracket(clean_lines, linenum, init_suffix):
-  """Find the corresponding > to close a template.
-
-  Args:
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: Current line number.
-    init_suffix: Remainder of the current line after the initial <.
-
-  Returns:
-    True if a matching bracket exists.
-  """
-  line = init_suffix
-  nesting_stack = ['<']
-  while True:
-    # Find the next operator that can tell us whether < is used as an
-    # opening bracket or as a less-than operator.  We only want to
-    # warn on the latter case.
-    #
-    # We could also check all other operators and terminate the search
-    # early, e.g. if we got something like this "a<b+c", the "<" is
-    # most likely a less-than operator.
-    match = Search(r'^[^<>(),;\[\]]*([<>(),;\[\]])(.*)$', line)
-    if match:
-      # Found an operator, update nesting stack
-      operator = match.group(1)
-      line = match.group(2)
-
-      if nesting_stack[-1] == '<':
-        # Expecting closing angle bracket
-        if operator in ('<', '(', '['):
-          nesting_stack.append(operator)
-        elif operator == '>':
-          nesting_stack.pop()
-          if not nesting_stack:
-            # Found matching angle bracket
-            return True
-        elif operator == ',':
-          # Got a comma after a bracket, this is most likely a template
-          # argument.  We have not seen a closing angle bracket yet, but
-          # it's probably a few lines later if we look for it, so just
-          # return early here.
-          return True
-        else:
-          # Got some other operator.
-          return False
-
-      else:
-        # Expecting closing parenthesis or closing bracket
-        if operator in ('<', '(', '['):
-          nesting_stack.append(operator)
-        elif operator in (')', ']'):
-          # We don't bother checking for matching () or [].  If we got
-          # something like (] or [), it would have been a syntax error.
-          nesting_stack.pop()
-
-    else:
-      # Scan the next line
-      linenum += 1
-      if linenum >= len(clean_lines.elided):
-        break
-      line = clean_lines.elided[linenum]
-
-  # Exhausted all remaining lines and still no matching angle bracket.
- # Most likely the input was incomplete, otherwise we should have - # seen a semicolon and returned early. - return True - - -def FindPreviousMatchingAngleBracket(clean_lines, linenum, init_prefix): - """Find the corresponding < that started a template. - - Args: - clean_lines: A CleansedLines instance containing the file. - linenum: Current line number. - init_prefix: Part of the current line before the initial >. - - Returns: - True if a matching bracket exists. - """ - line = init_prefix - nesting_stack = ['>'] - while True: - # Find the previous operator - match = Search(r'^(.*)([<>(),;\[\]])[^<>(),;\[\]]*$', line) - if match: - # Found an operator, update nesting stack - operator = match.group(2) - line = match.group(1) - - if nesting_stack[-1] == '>': - # Expecting opening angle bracket - if operator in ('>', ')', ']'): - nesting_stack.append(operator) - elif operator == '<': - nesting_stack.pop() - if not nesting_stack: - # Found matching angle bracket - return True - elif operator == ',': - # Got a comma before a bracket, this is most likely a - # template argument. The opening angle bracket is probably - # there if we look for it, so just return early here. - return True - else: - # Got some other operator. - return False - - else: - # Expecting opening parenthesis or opening bracket - if operator in ('>', ')', ']'): - nesting_stack.append(operator) - elif operator in ('(', '['): - nesting_stack.pop() - - else: - # Scan the previous line - linenum -= 1 - if linenum < 0: - break - line = clean_lines.elided[linenum] - - # Exhausted all earlier lines and still no matching angle bracket. - return False + # Checks for common mistakes in TODO comments. + comment = line[commentpos:] + match = _RE_PATTERN_TODO.match(comment) + if match: + # One whitespace is correct; zero whitespace is handled elsewhere. + leading_whitespace = match.group(1) + if len(leading_whitespace) > 1: + error(filename, linenum, 'whitespace/todo', 2, + 'Too many spaces before TODO') + + username = match.group(2) + if not username: + error(filename, linenum, 'readability/todo', 2, + 'Missing username in TODO; it should look like ' + '"// TODO(my_username): Stuff."') + + middle_whitespace = match.group(3) + # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison + if middle_whitespace != ' ' and middle_whitespace != '': + error(filename, linenum, 'whitespace/todo', 2, + 'TODO(my_username) should be followed by a space') + + # If the comment contains an alphanumeric character, there + # should be a space somewhere between it and the // unless + # it's a /// or //! Doxygen comment. + if (Match(r'//[^ ]*\w', comment) and + not Match(r'(///|//\!)(\s+|$)', comment)): + error(filename, linenum, 'whitespace/comments', 4, + 'Should have a space between // and comment') def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): @@ -2542,7 +3181,7 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): filename: The name of the current file. clean_lines: A CleansedLines instance containing the file. linenum: The number of the line to check. - nesting_state: A _NestingState instance which maintains information about + nesting_state: A NestingState instance which maintains information about the current stack of nested blocks being parsed. error: The function to call with any errors found. 
""" @@ -2565,7 +3204,12 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): # } # # A warning about missing end of namespace comments will be issued instead. - if IsBlankLine(line) and not nesting_state.InNamespaceBody(): + # + # Also skip blank line checks for 'extern "C"' blocks, which are formatted + # like namespaces. + if (IsBlankLine(line) and + not nesting_state.InNamespaceBody() and + not nesting_state.InExternC()): elided = clean_lines.elided prev_line = elided[linenum - 1] prevbrace = prev_line.rfind('{') @@ -2628,54 +3272,64 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): error(filename, linenum, 'whitespace/blank_line', 3, 'Do not leave a blank line after "%s:"' % matched.group(1)) - # Next, we complain if there's a comment too near the text - commentpos = line.find('//') - if commentpos != -1: - # Check if the // may be in quotes. If so, ignore it - # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison - if (line.count('"', 0, commentpos) - - line.count('\\"', 0, commentpos)) % 2 == 0: # not in quotes - # Allow one space for new scopes, two spaces otherwise: - if (not Match(r'^\s*{ //', line) and - ((commentpos >= 1 and - line[commentpos-1] not in string.whitespace) or - (commentpos >= 2 and - line[commentpos-2] not in string.whitespace))): - error(filename, linenum, 'whitespace/comments', 2, - 'At least two spaces is best between code and comments') - # There should always be a space between the // and the comment - commentend = commentpos + 2 - if commentend < len(line) and not line[commentend] == ' ': - # but some lines are exceptions -- e.g. if they're big - # comment delimiters like: - # //---------------------------------------------------------- - # or are an empty C++ style Doxygen comment, like: - # /// - # or C++ style Doxygen comments placed after the variable: - # ///< Header comment - # //!< Header comment - # or they begin with multiple slashes followed by a space: - # //////// Header comment - match = (Search(r'[=/-]{4,}\s*$', line[commentend:]) or - Search(r'^/$', line[commentend:]) or - Search(r'^!< ', line[commentend:]) or - Search(r'^/< ', line[commentend:]) or - Search(r'^/+ ', line[commentend:])) - if not match: - error(filename, linenum, 'whitespace/comments', 4, - 'Should have a space between // and comment') - CheckComment(line[commentpos:], filename, linenum, error) - - line = clean_lines.elided[linenum] # get rid of comments and strings - - # Don't try to do spacing checks for operator methods - line = re.sub(r'operator(==|!=|<|<<|<=|>=|>>|>)\(', 'operator\(', line) + # Next, check comments + next_line_start = 0 + if linenum + 1 < clean_lines.NumLines(): + next_line = raw[linenum + 1] + next_line_start = len(next_line) - len(next_line.lstrip()) + CheckComment(line, filename, linenum, next_line_start, error) + + # get rid of comments and strings + line = clean_lines.elided[linenum] + + # You shouldn't have spaces before your brackets, except maybe after + # 'delete []', 'return []() {};', or 'auto [abc, ...] = ...;'. + if Search(r'\w\s+\[', line) and not Search(r'(?:auto&?|delete|return)\s+\[', line): + error(filename, linenum, 'whitespace/braces', 5, + 'Extra space before [') + + # In range-based for, we wanted spaces before and after the colon, but + # not around "::" tokens that might appear. 
+  if (Search(r'for *\(.*[^:]:[^: ]', line) or
+      Search(r'for *\(.*[^: ]:[^:]', line)):
+    error(filename, linenum, 'whitespace/forcolon', 2,
+          'Missing space around colon in range-based for loop')
+
+
+def CheckOperatorSpacing(filename, clean_lines, linenum, error):
+  """Checks for horizontal spacing around operators.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  # Don't try to do spacing checks for operator methods.  Do this by
+  # replacing the troublesome characters with something else,
+  # preserving column position for all other characters.
+  #
+  # The replacement is done repeatedly to avoid false positives from
+  # operators that call operators.
+  while True:
+    match = Match(r'^(.*\boperator\b)(\S+)(\s*\(.*)$', line)
+    if match:
+      line = match.group(1) + ('_' * len(match.group(2))) + match.group(3)
+    else:
+      break
 
   # We allow no-spaces around = within an if: "if ( (a=Foo()) == 0 )".
   # Otherwise not.  Note we only check for non-spaces on *both* sides;
   # sometimes people put non-spaces on one side when aligning ='s among
   # many lines (not that this is behavior that I approve of...)
-  if Search(r'[\w.]=[\w.]', line) and not Search(r'\b(if|while) ', line):
+  if ((Search(r'[\w.]=', line) or
+       Search(r'=[\w.]', line))
+      and not Search(r'\b(if|while|for) ', line)
+      # Operators taken from [lex.operators] in C++11 standard.
+      and not Search(r'(>=|<=|==|!=|&=|\^=|\|=|\+=|\*=|\/=|\%=)', line)
+      and not Search(r'operator=', line)):
     error(filename, linenum, 'whitespace/operators', 4,
           'Missing spaces around =')
 
@@ -2687,42 +3341,51 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
   #
   # Check <= and >= first to avoid false positives with < and >, then
   # check non-include lines for spacing around < and >.
-  match = Search(r'[^<>=!\s](==|!=|<=|>=)[^<>=!\s]', line)
+  #
+  # If the operator is followed by a comma, assume it's being used in a
+  # macro context and don't do any checks.  This avoids false
+  # positives.
+  #
+  # Note that && is not included here.  This is because there are too
+  # many false positives due to RValue references.
+  match = Search(r'[^<>=!\s](==|!=|<=|>=|\|\|)[^<>=!\s,;\)]', line)
   if match:
     error(filename, linenum, 'whitespace/operators', 3,
           'Missing spaces around %s' % match.group(1))
-  # We allow no-spaces around << when used like this: 10<<20, but
-  # not otherwise (particularly, not when used as streams)
-  # Also ignore using ns::operator<<;
-  match = Search(r'(operator|\S)(?:L|UL|ULL|l|ul|ull)?<<(\S)', line)
-  if (match and
-      not (match.group(1).isdigit() and match.group(2).isdigit()) and
-      not (match.group(1) == 'operator' and match.group(2) == ';')):
-    error(filename, linenum, 'whitespace/operators', 3,
-          'Missing spaces around <<')
   elif not Match(r'#.*include', line):
-    # Avoid false positives on ->
-    reduced_line = line.replace('->', '')
-
     # Look for < that is not surrounded by spaces.  This is only
     # triggered if both sides are missing spaces, even though
     # technically should should flag if at least one side is missing a
     # space.  This is done to avoid some false positives with shifts.
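The masking loop at the top of CheckOperatorSpacing blanks out an operator's symbol while preserving every column position, so the spacing checks that follow cannot trip over tokens like 'operator=='. Its effect, in isolation:

import re

def mask_operator_names(line):
  # Repeat until no operator symbol is left, as in the loop above.
  while True:
    match = re.match(r'^(.*\boperator\b)(\S+)(\s*\(.*)$', line)
    if not match:
      return line
    line = match.group(1) + ('_' * len(match.group(2))) + match.group(3)

assert (mask_operator_names('bool operator==(const A& a);') ==
        'bool operator__(const A& a);')

The loop terminates because the substituted underscores extend the 'operator' word, so \boperator\b no longer matches on the rewritten line.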
- match = Search(r'[^\s<]<([^\s=<].*)', reduced_line) - if (match and - not FindNextMatchingAngleBracket(clean_lines, linenum, match.group(1))): - error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around <') + match = Match(r'^(.*[^\s<])<[^\s=<,]', line) + if match: + (_, _, end_pos) = CloseExpression( + clean_lines, linenum, len(match.group(1))) + if end_pos <= -1: + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around <') # Look for > that is not surrounded by spaces. Similar to the # above, we only trigger if both sides are missing spaces to avoid # false positives with shifts. - match = Search(r'^(.*[^\s>])>[^\s=>]', reduced_line) - if (match and - not FindPreviousMatchingAngleBracket(clean_lines, linenum, - match.group(1))): - error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around >') + match = Match(r'^(.*[^-\s>])>[^\s=>,]', line) + if match: + (_, _, start_pos) = ReverseCloseExpression( + clean_lines, linenum, len(match.group(1))) + if start_pos <= -1: + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around >') + + # We allow no-spaces around << when used like this: 10<<20, but + # not otherwise (particularly, not when used as streams) + # + # We also allow operators following an opening parenthesis, since + # those tend to be macros that deal with operators. + match = Search(r'(operator|[^\s(<])(?:L|UL|LL|ULL|l|ul|ll|ull)?<<([^\s,=<])', line) + if (match and not (match.group(1).isdigit() and match.group(2).isdigit()) and + not (match.group(1) == 'operator' and match.group(2) == ';')): + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around <<') # We allow no-spaces around >> for almost anything. This is because # C++11 allows ">>" to close nested templates, which accounts for @@ -2747,7 +3410,19 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): error(filename, linenum, 'whitespace/operators', 4, 'Extra space for operator %s' % match.group(1)) - # A pet peeve of mine: no spaces after an if, while, switch, or for + +def CheckParenthesisSpacing(filename, clean_lines, linenum, error): + """Checks for horizontal spacing around parentheses. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # No spaces after an if, while, switch, or for match = Search(r' (if\(|for\(|while\(|switch\()', line) if match: error(filename, linenum, 'whitespace/parens', 5, @@ -2773,6 +3448,19 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): 'Should have zero or one spaces inside ( and ) in %s' % match.group(1)) + +def CheckCommaSpacing(filename, clean_lines, linenum, error): + """Checks for horizontal spacing near commas and semicolons. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. 
+ """ + raw = clean_lines.lines_without_raw_strings + line = clean_lines.elided[linenum] + # You should always have a space after a comma (either as fn arg or operator) # # This does not apply when the non-space character following the @@ -2783,7 +3471,8 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): # verify that lines contain missing whitespaces, second pass on raw # lines to confirm that those missing whitespaces are not due to # elided comments. - if Search(r',[^,\s]', line) and Search(r',[^,\s]', raw[linenum]): + if (Search(r',[^,\s]', ReplaceAll(r'\boperator\s*,\s*\(', 'F(', line)) and + Search(r',[^,\s]', raw[linenum])): error(filename, linenum, 'whitespace/comma', 3, 'Missing space after ,') @@ -2795,14 +3484,91 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): error(filename, linenum, 'whitespace/semicolon', 3, 'Missing space after ;') - # Next we will look for issues with function calls. - CheckSpacingForFunctionCall(filename, line, linenum, error) + +def _IsType(clean_lines, nesting_state, expr): + """Check if expression looks like a type name, returns true if so. + + Args: + clean_lines: A CleansedLines instance containing the file. + nesting_state: A NestingState instance which maintains information about + the current stack of nested blocks being parsed. + expr: The expression to check. + Returns: + True, if token looks like a type. + """ + # Keep only the last token in the expression + last_word = Match(r'^.*(\b\S+)$', expr) + if last_word: + token = last_word.group(1) + else: + token = expr + + # Match native types and stdint types + if _TYPES.match(token): + return True + + # Try a bit harder to match templated types. Walk up the nesting + # stack until we find something that resembles a typename + # declaration for what we are looking for. + typename_pattern = (r'\b(?:typename|class|struct)\s+' + re.escape(token) + + r'\b') + block_index = len(nesting_state.stack) - 1 + while block_index >= 0: + if isinstance(nesting_state.stack[block_index], _NamespaceInfo): + return False + + # Found where the opening brace is. We want to scan from this + # line up to the beginning of the function, minus a few lines. + # template + # class C + # : public ... { // start scanning here + last_line = nesting_state.stack[block_index].starting_linenum + + next_block_start = 0 + if block_index > 0: + next_block_start = nesting_state.stack[block_index - 1].starting_linenum + first_line = last_line + while first_line >= next_block_start: + if clean_lines.elided[first_line].find('template') >= 0: + break + first_line -= 1 + if first_line < next_block_start: + # Didn't find any "template" keyword before reaching the next block, + # there are probably no template things to check for this block + block_index -= 1 + continue + + # Look for typename in the specified range + for i in xrange(first_line, last_line + 1, 1): + if Search(typename_pattern, clean_lines.elided[i]): + return True + block_index -= 1 + + return False + + +def CheckBracesSpacing(filename, clean_lines, linenum, nesting_state, error): + """Checks for horizontal spacing near commas. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + nesting_state: A NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: The function to call with any errors found. 
+  """
+  line = clean_lines.elided[linenum]
 
   # Except after an opening paren, or after another opening brace (in case of
   # an initializer list, for instance), you should have spaces before your
-  # braces. And since you should never have braces at the beginning of a line,
-  # this is an easy test.
-  match = Match(r'^(.*[^ ({]){', line)
+  # braces when they are delimiting blocks, classes, namespaces etc.
+  # And since you should never have braces at the beginning of a line,
+  # this is an easy test.  Except that braces used for initialization don't
+  # follow the same rule; we often don't want spaces before those.
+  match = Match(r'^(.*[^ ({>]){', line)
+
   if match:
     # Try a bit harder to check for brace initialization.  This
     # happens in one of the following forms:
@@ -2813,10 +3579,12 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
     #   LastArgument(..., type{});
     #   LOG(INFO) << type{} << " ...";
     #   map_of_type[{...}] = ...;
+    #   ternary = expr ? new type{} : nullptr;
+    #   OuterTemplate<InnerTemplateConstructor<Type>{}>
     #
     # We check for the character following the closing brace, and
     # silence the warning if it's one of those listed above, i.e.
-    # "{.;,)<]".
+    # "{.;,)<>]:".
     #
     # To account for nested initializer list, we allow any number of
     # closing braces up to "{;,)<".  We can't simply silence the
@@ -2830,6 +3598,7 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
     # There is a false negative with this approach if people inserted
     # spurious semicolons, e.g. "if (cond){};", but we will catch the
     # spurious semicolon with a separate check.
+    leading_text = match.group(1)
     (endline, endlinenum, endpos) = CloseExpression(
         clean_lines, linenum, len(match.group(1)))
     trailing_text = ''
@@ -2838,7 +3607,11 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
     for offset in xrange(endlinenum + 1,
                          min(endlinenum + 3, clean_lines.NumLines() - 1)):
       trailing_text += clean_lines.elided[offset]
-    if not Match(r'^[\s}]*[{.;,)<\]]', trailing_text):
+    # We also suppress warnings for `uint64_t{expression}` etc., as the style
+    # guide recommends brace initialization for integral types to avoid
+    # overflow/truncation.
+    if (not Match(r'^[\s}]*[{.;,)<>\]:]', trailing_text)
+        and not _IsType(clean_lines, nesting_state, leading_text)):
       error(filename, linenum, 'whitespace/braces', 5,
             'Missing space before {')
 
@@ -2847,12 +3620,6 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
     error(filename, linenum, 'whitespace/braces', 5,
           'Missing space before else')
 
-  # You shouldn't have spaces before your brackets, except maybe after
-  # 'delete []' or 'new char * []'.
-  if Search(r'\w\s+\[', line) and not Search(r'delete\s+\[', line):
-    error(filename, linenum, 'whitespace/braces', 5,
-          'Extra space before [')
-
   # You shouldn't have a space before a semicolon at the end of the line.
   # There's a special case for "for" since the style guide allows space before
   # the semicolon there.
@@ -2869,12 +3636,23 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
           'Extra space before last semicolon. If this should be an empty '
           'statement, use {} instead.')
 
-  # In range-based for, we wanted spaces before and after the colon, but
-  # not around "::" tokens that might appear.
- if (Search('for *\(.*[^:]:[^: ]', line) or - Search('for *\(.*[^: ]:[^:]', line)): - error(filename, linenum, 'whitespace/forcolon', 2, - 'Missing space around colon in range-based for loop') + +def IsDecltype(clean_lines, linenum, column): + """Check if the token ending on (linenum, column) is decltype(). + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: the number of the line to check. + column: end column of the token to check. + Returns: + True if this token is decltype() expression, False otherwise. + """ + (text, _, start_col) = ReverseCloseExpression(clean_lines, linenum, column) + if start_col < 0: + return False + if Search(r'\bdecltype\s*$', text[0:start_col]): + return True + return False def CheckSectionSpacing(filename, clean_lines, class_info, linenum, error): @@ -2974,15 +3752,18 @@ def CheckBraces(filename, clean_lines, linenum, error): # used for brace initializers inside function calls. We don't detect this # perfectly: we just don't complain if the last non-whitespace character on # the previous non-blank line is ',', ';', ':', '(', '{', or '}', or if the - # previous line starts a preprocessor block. + # previous line starts a preprocessor block. We also allow a brace on the + # following line if it is part of an array initialization and would not fit + # within the 80 character limit of the preceding line. prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] if (not Search(r'[,;:}{(]\s*$', prevline) and - not Match(r'\s*#', prevline)): + not Match(r'\s*#', prevline) and + not (GetLineWidth(prevline) > _line_length - 2 and '[]' in prevline)): error(filename, linenum, 'whitespace/braces', 4, '{ should almost always be at the end of the previous line') # An else clause should be on the same line as the preceding closing brace. - if Match(r'\s*else\s*', line): + if Match(r'\s*else\b\s*(?:if\b|\{|$)', line): prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] if Match(r'\s*}\s*$', prevline): error(filename, linenum, 'whitespace/newline', 4, @@ -2990,19 +3771,20 @@ def CheckBraces(filename, clean_lines, linenum, error): # If braces come on one side of an else, they should be on both. # However, we have to worry about "else if" that spans multiple lines! 
- if Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line): - if Search(r'}\s*else if([^{]*)$', line): # could be multi-line if - # find the ( after the if - pos = line.find('else if') - pos = line.find('(', pos) - if pos > 0: - (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos) - if endline[endpos:].find('{') == -1: # must be brace after if - error(filename, linenum, 'readability/braces', 5, - 'If an else has a brace on one side, it should have it on both') - else: # common case: else not followed by a multi-line if - error(filename, linenum, 'readability/braces', 5, - 'If an else has a brace on one side, it should have it on both') + if Search(r'else if\s*\(', line): # could be multi-line if + brace_on_left = bool(Search(r'}\s*else if\s*\(', line)) + # find the ( after the if + pos = line.find('else if') + pos = line.find('(', pos) + if pos > 0: + (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos) + brace_on_right = endline[endpos:].find('{') != -1 + if brace_on_left != brace_on_right: # must be brace after if + error(filename, linenum, 'readability/braces', 5, + 'If an else has a brace on one side, it should have it on both') + elif Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line): + error(filename, linenum, 'readability/braces', 5, + 'If an else has a brace on one side, it should have it on both') # Likewise, an else should never have the else clause on the same line if Search(r'\belse [^\s{]', line) and not Search(r'\belse if\b', line): @@ -3014,11 +3796,79 @@ def CheckBraces(filename, clean_lines, linenum, error): error(filename, linenum, 'whitespace/newline', 4, 'do/while clauses should not be on a single line') + # Check single-line if/else bodies. The style guide says 'curly braces are not + # required for single-line statements'. We additionally allow multi-line, + # single statements, but we reject anything with more than one semicolon in + # it. This means that the first semicolon after the if should be at the end of + # its line, and the line after that should have an indent level equal to or + # lower than the if. We also check for ambiguous if/else nesting without + # braces. + if_else_match = Search(r'\b(if\s*\(|else\b)', line) + if if_else_match and not Match(r'\s*#', line): + if_indent = GetIndentLevel(line) + endline, endlinenum, endpos = line, linenum, if_else_match.end() + if_match = Search(r'\bif\s*\(', line) + if if_match: + # This could be a multiline if condition, so find the end first. + pos = if_match.end() - 1 + (endline, endlinenum, endpos) = CloseExpression(clean_lines, linenum, pos) + # Check for an opening brace, either directly after the if or on the next + # line. If found, this isn't a single-statement conditional. + if (not Match(r'\s*{', endline[endpos:]) + and not (Match(r'\s*$', endline[endpos:]) + and endlinenum < (len(clean_lines.elided) - 1) + and Match(r'\s*{', clean_lines.elided[endlinenum + 1]))): + while (endlinenum < len(clean_lines.elided) + and ';' not in clean_lines.elided[endlinenum][endpos:]): + endlinenum += 1 + endpos = 0 + if endlinenum < len(clean_lines.elided): + endline = clean_lines.elided[endlinenum] + # We allow a mix of whitespace and closing braces (e.g. for one-liner + # methods) and a single \ after the semicolon (for macros) + endpos = endline.find(';') + if not Match(r';[\s}]*(\\?)$', endline[endpos:]): + # Semicolon isn't the last character, there's something trailing. + # Output a warning if the semicolon is not contained inside + # a lambda expression. 
+ if not Match(r'^[^{};]*\[[^\[\]]*\][^{}]*\{[^{}]*\}\s*\)*[;,]\s*$', + endline): + error(filename, linenum, 'readability/braces', 4, + 'If/else bodies with multiple statements require braces') + elif endlinenum < len(clean_lines.elided) - 1: + # Make sure the next line is dedented + next_line = clean_lines.elided[endlinenum + 1] + next_indent = GetIndentLevel(next_line) + # With ambiguous nested if statements, this will error out on the + # if that *doesn't* match the else, regardless of whether it's the + # inner one or outer one. + if (if_match and Match(r'\s*else\b', next_line) + and next_indent != if_indent): + error(filename, linenum, 'readability/braces', 4, + 'Else clause should be indented at the same level as if. ' + 'Ambiguous nested if/else chains require braces.') + elif next_indent > if_indent: + error(filename, linenum, 'readability/braces', 4, + 'If/else bodies with multiple statements require braces') + + +def CheckTrailingSemicolon(filename, clean_lines, linenum, error): + """Looks for redundant trailing semicolon. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + + line = clean_lines.elided[linenum] + # Block bodies should not be followed by a semicolon. Due to C++11 # brace initialization, there are more places where semicolons are - # required than not, so we use a whitelist approach to check these - # rather than a blacklist. These are the places where "};" should - # be replaced by just "}": + # required than not, so we explicitly list the allowed rules rather + # than listing the disallowed ones. These are the places where "};" + # should be replaced by just "}": # 1. Some flavor of block following closing parenthesis: # for (;;) {}; # while (...) {}; @@ -3074,28 +3924,40 @@ def CheckBraces(filename, clean_lines, linenum, error): # - INTERFACE_DEF # - EXCLUSIVE_LOCKS_REQUIRED, SHARED_LOCKS_REQUIRED, LOCKS_EXCLUDED: # - # We implement a whitelist of safe macros instead of a blacklist of + # We implement a list of safe macros instead of a list of # unsafe macros, even though the latter appears less frequently in # google code and would have been easier to implement. This is because - # the downside for getting the whitelist wrong means some extra - # semicolons, while the downside for getting the blacklist wrong + # the downside for getting the allowed checks wrong means some extra + # semicolons, while the downside for getting disallowed checks wrong # would result in compile errors. # - # In addition to macros, we also don't want to warn on compound - # literals. 
+ # In addition to macros, we also don't want to warn on + # - Compound literals + # - Lambdas + # - alignas specifier with anonymous structs + # - decltype closing_brace_pos = match.group(1).rfind(')') opening_parenthesis = ReverseCloseExpression( clean_lines, linenum, closing_brace_pos) if opening_parenthesis[2] > -1: line_prefix = opening_parenthesis[0][0:opening_parenthesis[2]] - macro = Search(r'\b([A-Z_]+)\s*$', line_prefix) + macro = Search(r'\b([A-Z_][A-Z0-9_]*)\s*$', line_prefix) + func = Match(r'^(.*\])\s*$', line_prefix) if ((macro and macro.group(1) not in ( 'TEST', 'TEST_F', 'MATCHER', 'MATCHER_P', 'TYPED_TEST', 'EXCLUSIVE_LOCKS_REQUIRED', 'SHARED_LOCKS_REQUIRED', 'LOCKS_EXCLUDED', 'INTERFACE_DEF')) or + (func and not Search(r'\boperator\s*\[\s*\]', func.group(1))) or + Search(r'\b(?:struct|union)\s+alignas\s*$', line_prefix) or + Search(r'\bdecltype$', line_prefix) or Search(r'\s+=\s*$', line_prefix)): match = None + if (match and + opening_parenthesis[1] > 1 and + Search(r'\]\s*$', clean_lines.elided[opening_parenthesis[1] - 1])): + # Multi-line lambda-expression + match = None else: # Try matching cases 2-3. @@ -3125,6 +3987,14 @@ def CheckBraces(filename, clean_lines, linenum, error): # outputting warnings for the matching closing brace, if there are # nested blocks with trailing semicolons, we will get the error # messages in reversed order. + + # We need to check the line forward for NOLINT + raw_lines = clean_lines.raw_lines + ParseNolintSuppressions(filename, raw_lines[endlinenum-1], endlinenum-1, + error) + ParseNolintSuppressions(filename, raw_lines[endlinenum], endlinenum, + error) + error(filename, endlinenum, 'readability/braces', 4, "You don't need a ; after a }") @@ -3148,7 +4018,7 @@ def CheckEmptyBlockBody(filename, clean_lines, linenum, error): line = clean_lines.elided[linenum] matched = Match(r'\s*(for|while|if)\s*\(', line) if matched: - # Find the end of the conditional expression + # Find the end of the conditional expression. (end_line, end_linenum, end_pos) = CloseExpression( clean_lines, linenum, line.find('(')) @@ -3163,6 +4033,98 @@ def CheckEmptyBlockBody(filename, clean_lines, linenum, error): error(filename, end_linenum, 'whitespace/empty_loop_body', 5, 'Empty loop bodies should use {} or continue') + # Check for if statements that have completely empty bodies (no comments) + # and no else clauses. + if end_pos >= 0 and matched.group(1) == 'if': + # Find the position of the opening { for the if statement. + # Return without logging an error if it has no brackets. + opening_linenum = end_linenum + opening_line_fragment = end_line[end_pos:] + # Loop until EOF or find anything that's not whitespace or opening {. + while not Search(r'^\s*\{', opening_line_fragment): + if Search(r'^(?!\s*$)', opening_line_fragment): + # Conditional has no brackets. + return + opening_linenum += 1 + if opening_linenum == len(clean_lines.elided): + # Couldn't find conditional's opening { or any code before EOF. + return + opening_line_fragment = clean_lines.elided[opening_linenum] + # Set opening_line (opening_line_fragment may not be entire opening line). + opening_line = clean_lines.elided[opening_linenum] + + # Find the position of the closing }. + opening_pos = opening_line_fragment.find('{') + if opening_linenum == end_linenum: + # We need to make opening_pos relative to the start of the entire line. 
+ opening_pos += end_pos + (closing_line, closing_linenum, closing_pos) = CloseExpression( + clean_lines, opening_linenum, opening_pos) + if closing_pos < 0: + return + + # Now construct the body of the conditional. This consists of the portion + # of the opening line after the {, all lines until the closing line, + # and the portion of the closing line before the }. + if (clean_lines.raw_lines[opening_linenum] != + CleanseComments(clean_lines.raw_lines[opening_linenum])): + # Opening line ends with a comment, so conditional isn't empty. + return + if closing_linenum > opening_linenum: + # Opening line after the {. Ignore comments here since we checked above. + body = list(opening_line[opening_pos+1:]) + # All lines until closing line, excluding closing line, with comments. + body.extend(clean_lines.raw_lines[opening_linenum+1:closing_linenum]) + # Closing line before the }. Won't (and can't) have comments. + body.append(clean_lines.elided[closing_linenum][:closing_pos-1]) + body = '\n'.join(body) + else: + # If statement has brackets and fits on a single line. + body = opening_line[opening_pos+1:closing_pos-1] + + # Check if the body is empty + if not _EMPTY_CONDITIONAL_BODY_PATTERN.search(body): + return + # The body is empty. Now make sure there's not an else clause. + current_linenum = closing_linenum + current_line_fragment = closing_line[closing_pos:] + # Loop until EOF or find anything that's not whitespace or else clause. + while Search(r'^\s*$|^(?=\s*else)', current_line_fragment): + if Search(r'^(?=\s*else)', current_line_fragment): + # Found an else clause, so don't log an error. + return + current_linenum += 1 + if current_linenum == len(clean_lines.elided): + break + current_line_fragment = clean_lines.elided[current_linenum] + + # The body is empty and there's no else clause until EOF or other code. + error(filename, end_linenum, 'whitespace/empty_if_body', 4, + ('If statement had no body and no else clause')) + + +def FindCheckMacro(line): + """Find a replaceable CHECK-like macro. + + Args: + line: line to search on. + Returns: + (macro name, start position), or (None, -1) if no replaceable + macro is found. + """ + for macro in _CHECK_MACROS: + i = line.find(macro) + if i >= 0: + # Find opening parenthesis. Do a regular expression match here + # to make sure that we are matching the expected CHECK macro, as + # opposed to some other macro that happens to contain the CHECK + # substring. + matched = Match(r'^(.*\b' + macro + r'\s*)\(', line) + if not matched: + continue + return (macro, len(matched.group(1))) + return (None, -1) + def CheckCheck(filename, clean_lines, linenum, error): """Checks the use of CHECK and EXPECT macros. @@ -3176,24 +4138,8 @@ def CheckCheck(filename, clean_lines, linenum, error): # Decide the set of replacement macros that should be suggested lines = clean_lines.elided - check_macro = None - start_pos = -1 - for macro in _CHECK_MACROS: - i = lines[linenum].find(macro) - if i >= 0: - check_macro = macro - - # Find opening parenthesis. Do a regular expression match here - # to make sure that we are matching the expected CHECK macro, as - # opposed to some other macro that happens to contain the CHECK - # substring. 
- matched = Match(r'^(.*\b' + check_macro + r'\s*)\(', lines[linenum]) - if not matched: - continue - start_pos = len(matched.group(1)) - break - if not check_macro or start_pos < 0: - # Don't waste time here if line doesn't contain 'CHECK' or 'EXPECT' + (check_macro, start_pos) = FindCheckMacro(lines[linenum]) + if not check_macro: return # Find end of the boolean expression by matching parentheses @@ -3201,6 +4147,13 @@ def CheckCheck(filename, clean_lines, linenum, error): clean_lines, linenum, start_pos) if end_pos < 0: return + + # If the check macro is followed by something other than a + # semicolon, assume users will log their own custom error messages + # and don't suggest any replacements. + if not Match(r'\s*;', last_line[end_pos:]): + return + if linenum == end_line: expression = lines[linenum][start_pos + 1:end_pos - 1] else: @@ -3223,7 +4176,7 @@ def CheckCheck(filename, clean_lines, linenum, error): if token == '(': # Parenthesized operand expression = matched.group(2) - (end, _) = FindEndOfExpressionInLine(expression, 0, 1, '(', ')') + (end, _) = FindEndOfExpressionInLine(expression, 0, ['(']) if end < 0: return # Unmatched parenthesis lhs += '(' + expression[0:end] @@ -3339,6 +4292,16 @@ def GetLineWidth(line): if unicodedata.east_asian_width(uc) in ('W', 'F'): width += 2 elif not unicodedata.combining(uc): + # Issue 337 + # https://mail.python.org/pipermail/python-list/2012-August/628809.html + if (sys.version_info.major, sys.version_info.minor) <= (3, 2): + # https://github.com/python/cpython/blob/2.7/Include/unicodeobject.h#L81 + is_wide_build = sysconfig.get_config_var("Py_UNICODE_SIZE") >= 4 + # https://github.com/python/cpython/blob/2.7/Objects/unicodeobject.c#L564 + is_low_surrogate = 0xDC00 <= ord(uc) <= 0xDFFF + if not is_wide_build and is_low_surrogate: + width -= 1 + width += 1 return width else: @@ -3358,7 +4321,7 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, clean_lines: A CleansedLines instance containing the file. linenum: The number of the line to check. file_extension: The extension (without the dot) of the filename. - nesting_state: A _NestingState instance which maintains information about + nesting_state: A NestingState instance which maintains information about the current stack of nested blocks being parsed. error: The function to call with any errors found. """ @@ -3368,6 +4331,7 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, # raw strings, raw_lines = clean_lines.lines_without_raw_strings line = raw_lines[linenum] + prev = raw_lines[linenum - 1] if linenum > 0 else '' if line.find('\t') != -1: error(filename, linenum, 'whitespace/tab', 1, @@ -3385,23 +4349,33 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, # if(match($0, " <<")) complain = 0; # if(match(prev, " +for \\(")) complain = 0; # if(prevodd && match(prevprev, " +for \\(")) complain = 0; + scope_or_label_pattern = r'\s*\w+\s*:\s*\\?$' + classinfo = nesting_state.InnermostClass() initial_spaces = 0 cleansed_line = clean_lines.elided[linenum] while initial_spaces < len(line) and line[initial_spaces] == ' ': initial_spaces += 1 - if line and line[-1].isspace(): - error(filename, linenum, 'whitespace/end_of_line', 4, - 'Line ends in whitespace. 
Consider deleting these extra spaces.') - # There are certain situations we allow one space, notably for section labels - elif ((initial_spaces == 1 or initial_spaces == 3) and - not Match(r'\s*\w+\s*:\s*$', cleansed_line)): + # There are certain situations we allow one space, notably for + # section labels, and also lines containing multi-line raw strings. + # We also don't check for lines that look like continuation lines + # (of lines ending in double quotes, commas, equals, or angle brackets) + # because the rules for how to indent those are non-trivial. + if (not Search(r'[",=><] *$', prev) and + (initial_spaces == 1 or initial_spaces == 3) and + not Match(scope_or_label_pattern, cleansed_line) and + not (clean_lines.raw_lines[linenum] != line and + Match(r'^\s*""', line))): error(filename, linenum, 'whitespace/indent', 3, 'Weird number of spaces at line-start. ' 'Are you using a 2-space indent?') + if line and line[-1].isspace(): + error(filename, linenum, 'whitespace/end_of_line', 4, + 'Line ends in whitespace. Consider deleting these extra spaces.') + # Check if the line is a header guard. is_header_guard = False - if file_extension == 'h': + if IsHeaderExtension(file_extension): cppvar = GetHeaderGuardCPPVariable(filename) if (line.startswith('#ifndef %s' % cppvar) or line.startswith('#define %s' % cppvar) or @@ -3417,14 +4391,10 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, # developers fault. if (not line.startswith('#include') and not is_header_guard and not Match(r'^\s*//.*http(s?)://\S*$', line) and + not Match(r'^\s*//\s*[^\s]*$', line) and not Match(r'^// \$Id:.*#[0-9]+ \$$', line)): line_width = GetLineWidth(line) - extended_length = int((_line_length * 1.25)) - if line_width > extended_length: - error(filename, linenum, 'whitespace/line_length', 4, - 'Lines should very rarely be longer than %i characters' % - extended_length) - elif line_width > _line_length: + if line_width > _line_length: error(filename, linenum, 'whitespace/line_length', 2, 'Lines should be <= %i characters long' % _line_length) @@ -3442,9 +4412,14 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, # Some more style checks CheckBraces(filename, clean_lines, linenum, error) + CheckTrailingSemicolon(filename, clean_lines, linenum, error) CheckEmptyBlockBody(filename, clean_lines, linenum, error) - CheckAccess(filename, clean_lines, linenum, nesting_state, error) CheckSpacing(filename, clean_lines, linenum, nesting_state, error) + CheckOperatorSpacing(filename, clean_lines, linenum, error) + CheckParenthesisSpacing(filename, clean_lines, linenum, error) + CheckCommaSpacing(filename, clean_lines, linenum, error) + CheckBracesSpacing(filename, clean_lines, linenum, nesting_state, error) + CheckSpacingForFunctionCall(filename, clean_lines, linenum, error) CheckCheck(filename, clean_lines, linenum, error) CheckAltTokens(filename, clean_lines, linenum, error) classinfo = nesting_state.InnermostClass() @@ -3452,7 +4427,6 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, CheckSectionSpacing(filename, clean_lines, classinfo, linenum, error) -_RE_PATTERN_INCLUDE_NEW_STYLE = re.compile(r'#include +"[^/]+\.h"') _RE_PATTERN_INCLUDE = re.compile(r'^\s*#\s*include\s*([<"])([^>"]*)[>"].*$') # Matches the first component of a filename delimited by -s and _s. 
That is: # _RE_FIRST_COMPONENT.match('foo').group(0) == 'foo' @@ -3489,23 +4463,6 @@ def _DropCommonSuffixes(filename): return os.path.splitext(filename)[0] -def _IsTestFilename(filename): - """Determines if the given filename has a suffix that identifies it as a test. - - Args: - filename: The input filename. - - Returns: - True if 'filename' looks like a test, False otherwise. - """ - if (filename.endswith('_test.cc') or - filename.endswith('_unittest.cc') or - filename.endswith('_regtest.cc')): - return True - else: - return False - - def _ClassifyInclude(fileinfo, include, is_system): """Figures out what kind of header 'include' is. @@ -3581,11 +4538,17 @@ def CheckIncludeLine(filename, clean_lines, linenum, include_state, error): error: The function to call with any errors found. """ fileinfo = FileInfo(filename) - line = clean_lines.lines[linenum] # "include" should use the new style "foo/bar.h" instead of just "bar.h" - if _RE_PATTERN_INCLUDE_NEW_STYLE.search(line): + # Only do this check if the included header follows google naming + # conventions. If not, assume that it's a 3rd party API that + # requires special include conventions. + # + # We also make an exception for Lua headers, which follow google + # naming convention but not the include convention. + match = Match(r'#include\s*"([^/]+\.h)"', line) + if match and not _THIRD_PARTY_HEADERS_PATTERN.match(match.group(1)): error(filename, linenum, 'build/include', 4, 'Include the directory when naming .h files') @@ -3596,12 +4559,17 @@ def CheckIncludeLine(filename, clean_lines, linenum, include_state, error): if match: include = match.group(2) is_system = (match.group(1) == '<') - if include in include_state: + duplicate_line = include_state.FindHeader(include) + if duplicate_line >= 0: error(filename, linenum, 'build/include', 4, '"%s" already included at %s:%s' % - (include, filename, include_state[include])) - else: - include_state[include] = linenum + (include, filename, duplicate_line)) + elif (include.endswith('.cc') and + os.path.dirname(fileinfo.RepositoryName()) != os.path.dirname(include)): + error(filename, linenum, 'build/include', 4, + 'Do not include .cc files from other packages') + elif not _THIRD_PARTY_HEADERS_PATTERN.match(include): + include_state.include_list[-1].append((include, linenum)) # We want to ensure that headers appear in the right order: # 1) for foo.cc, foo.h (preferred location) @@ -3627,15 +4595,6 @@ def CheckIncludeLine(filename, clean_lines, linenum, include_state, error): 'Include "%s" not in alphabetical order' % include) include_state.SetLastHeader(canonical_include) - # Look for any of the stream classes that are part of standard C++. - match = _RE_PATTERN_INCLUDE.match(line) - if match: - include = match.group(2) - if Match(r'(f|ind|io|i|o|parse|pf|stdio|str|)?stream$', include): - # Many unit tests use cout, so we exempt them. - if not _IsTestFilename(filename): - error(filename, linenum, 'readability/streams', 3, - 'Streams are highly discouraged.') def _GetTextInside(text, start_pattern): @@ -3658,7 +4617,7 @@ def _GetTextInside(text, start_pattern): The extracted text. None if either the opening string or ending punctuation could not be found. """ - # TODO(sugawarayu): Audit cpplint.py to see what places could be profitably + # TODO(unknown): Audit cpplint.py to see what places could be profitably # rewritten to use _GetTextInside (and use inferior regexp matching today). # Give opening punctuations to get the matching close-punctuations. 
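For reference, the directory-in-include test added to CheckIncludeLine above reduces to a single regex. A standalone sketch, which deliberately ignores the _THIRD_PARTY_HEADERS_PATTERN exemption, behaves like this:

    import re

    for line in ('#include "bar.h"', '#include "foo/bar.h"'):
        # Same pattern as the Match() call above: flag headers named
        # without their directory component.
        flagged = bool(re.match(r'#include\s*"([^/]+\.h)"', line))
        print('%s -> flagged=%s' % (line, flagged))  # bar.h True, foo/bar.h False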
@@ -3718,6 +4677,9 @@ def _GetTextInside(text, start_pattern): _RE_PATTERN_CONST_REF_PARAM = ( r'(?:.*\s*\bconst\s*&\s*' + _RE_PATTERN_IDENT + r'|const\s+' + _RE_PATTERN_TYPE + r'\s*&\s*' + _RE_PATTERN_IDENT + r')') +# Stream types. +_RE_PATTERN_REF_STREAM_PARAM = ( + r'(?:.*stream\s*&\s*' + _RE_PATTERN_IDENT + r')') def CheckLanguage(filename, clean_lines, linenum, file_extension, @@ -3733,7 +4695,7 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension, linenum: The number of the line to check. file_extension: The extension (without the dot) of the filename. include_state: An _IncludeState instance in which the headers are inserted. - nesting_state: A _NestingState instance which maintains information about + nesting_state: A NestingState instance which maintains information about the current stack of nested blocks being parsed. error: The function to call with any errors found. """ @@ -3750,129 +4712,23 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension, # Reset include state across preprocessor directives. This is meant # to silence warnings for conditional includes. - if Match(r'^\s*#\s*(?:ifdef|elif|else|endif)\b', line): - include_state.ResetSection() + match = Match(r'^\s*#\s*(if|ifdef|ifndef|elif|else|endif)\b', line) + if match: + include_state.ResetSection(match.group(1)) # Make Windows paths like Unix. fullname = os.path.abspath(filename).replace('\\', '/') - # TODO(unknown): figure out if they're using default arguments in fn proto. + # Perform other checks now that we are sure that this is not an include line + CheckCasts(filename, clean_lines, linenum, error) + CheckGlobalStatic(filename, clean_lines, linenum, error) + CheckPrintf(filename, clean_lines, linenum, error) - # Check to see if they're using an conversion function cast. - # I just try to capture the most common basic types, though there are more. - # Parameterless conversion functions, such as bool(), are allowed as they are - # probably a member operator declaration or default constructor. - match = Search( - r'(\bnew\s+)?\b' # Grab 'new' operator, if it's there - r'(int|float|double|bool|char|int32|uint32|int64|uint64)' - r'(\([^)].*)', line) - if match: - matched_new = match.group(1) - matched_type = match.group(2) - matched_funcptr = match.group(3) - - # gMock methods are defined using some variant of MOCK_METHODx(name, type) - # where type may be float(), int(string), etc. Without context they are - # virtually indistinguishable from int(x) casts. Likewise, gMock's - # MockCallback takes a template parameter of the form return_type(arg_type), - # which looks much like the cast we're trying to detect. - # - # std::function<> wrapper has a similar problem. - # - # Return types for function pointers also look like casts if they - # don't have an extra space. - if (matched_new is None and # If new operator, then this isn't a cast - not (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or - Search(r'\bMockCallback<.*>', line) or - Search(r'\bstd::function<.*>', line)) and - not (matched_funcptr and - Match(r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(', - matched_funcptr))): - # Try a bit harder to catch gmock lines: the only place where - # something looks like an old-style cast is where we declare the - # return type of the mocked method, and the only time when we - # are missing context is if MOCK_METHOD was split across - # multiple lines. The missing MOCK_METHOD is usually one or two - # lines back, so scan back one or two lines. 
- # - # It's not possible for gmock macros to appear in the first 2 - # lines, since the class head + section name takes up 2 lines. - if (linenum < 2 or - not (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$', - clean_lines.elided[linenum - 1]) or - Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$', - clean_lines.elided[linenum - 2]))): - error(filename, linenum, 'readability/casting', 4, - 'Using deprecated casting style. ' - 'Use static_cast<%s>(...) instead' % - matched_type) - - CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum], - 'static_cast', - r'\((int|float|double|bool|char|u?int(16|32|64))\)', error) - - # This doesn't catch all cases. Consider (const char * const)"hello". - # - # (char *) "foo" should always be a const_cast (reinterpret_cast won't - # compile). - if CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum], - 'const_cast', r'\((char\s?\*+\s?)\)\s*"', error): - pass - else: - # Check pointer casts for other than string constants - CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum], - 'reinterpret_cast', r'\((\w+\s?\*+\s?)\)', error) - - # In addition, we look for people taking the address of a cast. This - # is dangerous -- casts can assign to temporaries, so the pointer doesn't - # point where you think. - match = Search( - r'(?:&\(([^)]+)\)[\w(])|' - r'(?:&(static|dynamic|down|reinterpret)_cast\b)', line) - if match and match.group(1) != '*': - error(filename, linenum, 'runtime/casting', 4, - ('Are you taking an address of a cast? ' - 'This is dangerous: could be a temp var. ' - 'Take the address before doing the cast, rather than after')) - - # Create an extended_line, which is the concatenation of the current and - # next lines, for more effective checking of code that may span more than one - # line. - if linenum + 1 < clean_lines.NumLines(): - extended_line = line + clean_lines.elided[linenum + 1] - else: - extended_line = line - - # Check for people declaring static/global STL strings at the top level. - # This is dangerous because the C++ language does not guarantee that - # globals with constructors are initialized before the first access. - match = Match( - r'((?:|static +)(?:|const +))string +([a-zA-Z0-9_:]+)\b(.*)', - line) - # Make sure it's not a function. - # Function template specialization looks like: "string foo(...". - # Class template definitions look like: "string Foo::Method(...". - # - # Also ignore things that look like operators. These are matched separately - # because operator names cross non-word boundaries. If we change the pattern - # above, we would decrease the accuracy of matching identifiers. - if (match and - not Search(r'\boperator\W', line) and - not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)?\s*\(([^"]|$)', match.group(3))): - error(filename, linenum, 'runtime/string', 4, - 'For a static/global string constant, use a C style string instead: ' - '"%schar %s[]".' % - (match.group(1), match.group(2))) - - if Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line): - error(filename, linenum, 'runtime/init', 4, - 'You seem to be initializing a member variable with itself.') - - if file_extension == 'h': + if IsHeaderExtension(file_extension): # TODO(unknown): check that 1-arg constructors are explicit. # How to tell it's a constructor? 
# (handled in CheckForNonStandardConstructs for now) - # TODO(unknown): check that classes have DISALLOW_EVIL_CONSTRUCTORS + # TODO(unknown): check that classes declare or disable copy/assign # (level 1 error) pass @@ -3888,23 +4744,6 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension, error(filename, linenum, 'runtime/int', 4, 'Use int16/int64/etc, rather than the C type %s' % match.group(1)) - # When snprintf is used, the second argument shouldn't be a literal. - match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line) - if match and match.group(2) != '0': - # If 2nd arg is zero, snprintf is used to calculate size. - error(filename, linenum, 'runtime/printf', 3, - 'If you can, use sizeof(%s) instead of %s as the 2nd arg ' - 'to snprintf.' % (match.group(1), match.group(2))) - - # Check if some verboten C functions are being used. - if Search(r'\bsprintf\b', line): - error(filename, linenum, 'runtime/printf', 5, - 'Never use sprintf. Use snprintf instead.') - match = Search(r'\b(strcpy|strcat)\b', line) - if match: - error(filename, linenum, 'runtime/printf', 4, - 'Almost always, snprintf is better than %s' % match.group(1)) - # Check if some verboten operator overloading is going on # TODO(unknown): catch out-of-line unary operator&: # class X {}; @@ -3924,7 +4763,7 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension, # Check for potential format string bugs like printf(foo). # We constrain the pattern not to pick things like DocidForPrintf(foo). # Not perfect but it can catch printf(foo.c_str()) and printf(foo->c_str()) - # TODO(sugawarayu): Catch the following case. Need to change the calling + # TODO(unknown): Catch the following case. Need to change the calling # convention of the whole function to process multiple line to handle it. # printf( # boy_this_is_a_really_long_variable_that_cannot_fit_on_the_prev_line); @@ -3989,37 +4828,188 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension, 'Do not use variable-length arrays. Use an appropriately named ' "('k' followed by CamelCase) compile-time constant for the size.") - # If DISALLOW_EVIL_CONSTRUCTORS, DISALLOW_COPY_AND_ASSIGN, or - # DISALLOW_IMPLICIT_CONSTRUCTORS is present, then it should be the last thing - # in the class declaration. - match = Match( - (r'\s*' - r'(DISALLOW_(EVIL_CONSTRUCTORS|COPY_AND_ASSIGN|IMPLICIT_CONSTRUCTORS))' - r'\(.*\);$'), - line) - if match and linenum + 1 < clean_lines.NumLines(): - next_line = clean_lines.elided[linenum + 1] - # We allow some, but not all, declarations of variables to be present - # in the statement that defines the class. The [\w\*,\s]* fragment of - # the regular expression below allows users to declare instances of - # the class or pointers to instances, but not less common types such - # as function pointers or arrays. It's a tradeoff between allowing - # reasonable code and avoiding trying to parse more C++ using regexps. - if not Search(r'^\s*}[\w\*,\s]*;', next_line): - error(filename, linenum, 'readability/constructors', 3, - match.group(1) + ' should be the last thing in the class') - # Check for use of unnamed namespaces in header files. Registration # macros are typically OK, so we allow use of "namespace {" on lines # that end with backslashes. - if (file_extension == 'h' + if (IsHeaderExtension(file_extension) and Search(r'\bnamespace\s*{', line) and line[-1] != '\\'): error(filename, linenum, 'build/namespaces', 4, 'Do not use unnamed namespaces in header files. 
See ' - 'http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces' + 'https://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces' ' for more information.') + +def CheckGlobalStatic(filename, clean_lines, linenum, error): + """Check for unsafe global or static objects. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # Match two lines at a time to support multiline declarations + if linenum + 1 < clean_lines.NumLines() and not Search(r'[;({]', line): + line += clean_lines.elided[linenum + 1].strip() + + # Check for people declaring static/global STL strings at the top level. + # This is dangerous because the C++ language does not guarantee that + # globals with constructors are initialized before the first access, and + # also because globals can be destroyed when some threads are still running. + # TODO(unknown): Generalize this to also find static unique_ptr instances. + # TODO(unknown): File bugs for clang-tidy to find these. + match = Match( + r'((?:|static +)(?:|const +))(?::*std::)?string( +const)? +' + r'([a-zA-Z0-9_:]+)\b(.*)', + line) + + # Remove false positives: + # - String pointers (as opposed to values). + # string *pointer + # const string *pointer + # string const *pointer + # string *const pointer + # + # - Functions and template specializations. + # string Function(... + # string Class::Method(... + # + # - Operators. These are matched separately because operator names + # cross non-word boundaries, and trying to match both operators + # and functions at the same time would decrease accuracy of + # matching identifiers. + # string Class::operator*() + if (match and + not Search(r'\bstring\b(\s+const)?\s*[\*\&]\s*(const\s+)?\w', line) and + not Search(r'\boperator\W', line) and + not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)*\s*\(([^"]|$)', match.group(4))): + if Search(r'\bconst\b', line): + error(filename, linenum, 'runtime/string', 4, + 'For a static/global string constant, use a C style string ' + 'instead: "%schar%s %s[]".' % + (match.group(1), match.group(2) or '', match.group(3))) + else: + error(filename, linenum, 'runtime/string', 4, + 'Static/global string variables are not permitted.') + + if (Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line) or + Search(r'\b([A-Za-z0-9_]*_)\(CHECK_NOTNULL\(\1\)\)', line)): + error(filename, linenum, 'runtime/init', 4, + 'You seem to be initializing a member variable with itself.') + + +def CheckPrintf(filename, clean_lines, linenum, error): + """Check for printf related issues. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # When snprintf is used, the second argument shouldn't be a literal. + match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line) + if match and match.group(2) != '0': + # If 2nd arg is zero, snprintf is used to calculate size. + error(filename, linenum, 'runtime/printf', 3, + 'If you can, use sizeof(%s) instead of %s as the 2nd arg ' + 'to snprintf.' % (match.group(1), match.group(2))) + + # Check if some verboten C functions are being used. + if Search(r'\bsprintf\s*\(', line): + error(filename, linenum, 'runtime/printf', 5, + 'Never use sprintf. 
Use snprintf instead.') + match = Search(r'\b(strcpy|strcat)\s*\(', line) + if match: + error(filename, linenum, 'runtime/printf', 4, + 'Almost always, snprintf is better than %s' % match.group(1)) + + +def IsDerivedFunction(clean_lines, linenum): + """Check if current line contains an inherited function. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + Returns: + True if current line contains a function with "override" + virt-specifier. + """ + # Scan back a few lines for start of current function + for i in xrange(linenum, max(-1, linenum - 10), -1): + match = Match(r'^([^()]*\w+)\(', clean_lines.elided[i]) + if match: + # Look for "override" after the matching closing parenthesis + line, _, closing_paren = CloseExpression( + clean_lines, i, len(match.group(1))) + return (closing_paren >= 0 and + Search(r'\boverride\b', line[closing_paren:])) + return False + + +def IsOutOfLineMethodDefinition(clean_lines, linenum): + """Check if current line contains an out-of-line method definition. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + Returns: + True if current line contains an out-of-line method definition. + """ + # Scan back a few lines for start of current function + for i in xrange(linenum, max(-1, linenum - 10), -1): + if Match(r'^([^()]*\w+)\(', clean_lines.elided[i]): + return Match(r'^[^()]*\w+::\w+\(', clean_lines.elided[i]) is not None + return False + + +def IsInitializerList(clean_lines, linenum): + """Check if current line is inside constructor initializer list. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + Returns: + True if current line appears to be inside constructor initializer + list, False otherwise. + """ + for i in xrange(linenum, 1, -1): + line = clean_lines.elided[i] + if i == linenum: + remove_function_body = Match(r'^(.*)\{\s*$', line) + if remove_function_body: + line = remove_function_body.group(1) + + if Search(r'\s:\s*\w+[({]', line): + # A lone colon tend to indicate the start of a constructor + # initializer list. It could also be a ternary operator, which + # also tend to appear in constructor initializer lists as + # opposed to parameter lists. + return True + if Search(r'\}\s*,\s*$', line): + # A closing brace followed by a comma is probably the end of a + # brace-initialized member in constructor initializer list. + return True + if Search(r'[{};]\s*$', line): + # Found one of the following: + # - A closing brace or semicolon, probably the end of the previous + # function. + # - An opening brace, probably the start of current class or namespace. + # + # Current line is probably not inside an initializer list since + # we saw one of those things without seeing the starting colon. + return False + + # Got to the beginning of the file without seeing the start of + # constructor initializer list. + return False + + def CheckForNonConstReference(filename, clean_lines, linenum, nesting_state, error): """Check for non-const references. @@ -4031,7 +5021,7 @@ def CheckForNonConstReference(filename, clean_lines, linenum, filename: The name of the current file. clean_lines: A CleansedLines instance containing the file. linenum: The number of the line to check. - nesting_state: A _NestingState instance which maintains information about + nesting_state: A NestingState instance which maintains information about the current stack of nested blocks being parsed. 
error: The function to call with any errors found. """ @@ -4040,6 +5030,17 @@ def CheckForNonConstReference(filename, clean_lines, linenum, if '&' not in line: return + # If a function is inherited, current function doesn't have much of + # a choice, so any non-const references should not be blamed on + # derived function. + if IsDerivedFunction(clean_lines, linenum): + return + + # Don't warn on out-of-line method definitions, as we would warn on the + # in-line declaration, if it isn't marked with 'override'. + if IsOutOfLineMethodDefinition(clean_lines, linenum): + return + # Long type names may be broken across multiple lines, usually in one # of these forms: # LongType @@ -4088,60 +5089,192 @@ def CheckForNonConstReference(filename, clean_lines, linenum, # inside declarators: reference parameter # We will exclude the first two cases by checking that we are not inside a # function body, including one that was just introduced by a trailing '{'. - # TODO(unknwon): Doesn't account for preprocessor directives. # TODO(unknown): Doesn't account for 'catch(Exception& e)' [rare]. - check_params = False - if not nesting_state.stack: - check_params = True # top level - elif (isinstance(nesting_state.stack[-1], _ClassInfo) or - isinstance(nesting_state.stack[-1], _NamespaceInfo)): - check_params = True # within class or namespace - elif Match(r'.*{\s*$', line): - if (len(nesting_state.stack) == 1 or - isinstance(nesting_state.stack[-2], _ClassInfo) or - isinstance(nesting_state.stack[-2], _NamespaceInfo)): - check_params = True # just opened global/class/namespace block + if (nesting_state.previous_stack_top and + not (isinstance(nesting_state.previous_stack_top, _ClassInfo) or + isinstance(nesting_state.previous_stack_top, _NamespaceInfo))): + # Not at toplevel, not within a class, and not within a namespace + return + + # Avoid initializer lists. We only need to scan back from the + # current line for something that starts with ':'. + # + # We don't need to check the current line, since the '&' would + # appear inside the second set of parentheses on the current line as + # opposed to the first set. + if linenum > 0: + for i in xrange(linenum - 1, max(0, linenum - 10), -1): + previous_line = clean_lines.elided[i] + if not Search(r'[),]\s*$', previous_line): + break + if Match(r'^\s*:\s+\S', previous_line): + return + + # Avoid preprocessors + if Search(r'\\\s*$', line): + return + + # Avoid constructor initializer lists + if IsInitializerList(clean_lines, linenum): + return + # We allow non-const references in a few standard places, like functions # called "swap()" or iostream operators like "<<" or ">>". Do not check # those function parameters. # # We also accept & in static_assert, which looks like a function but # it's actually a declaration expression. - whitelisted_functions = (r'(?:[sS]wap(?:<\w:+>)?|' + allowed_functions = (r'(?:[sS]wap(?:<\w:+>)?|' r'operator\s*[<>][<>]|' r'static_assert|COMPILE_ASSERT' r')\s*\(') - if Search(whitelisted_functions, line): - check_params = False + if Search(allowed_functions, line): + return elif not Search(r'\S+\([^)]*$', line): - # Don't see a whitelisted function on this line. Actually we + # Don't see an allowed function on this line. Actually we # didn't see any function name on this line, so this is likely a # multi-line parameter list. Try a bit harder to catch this case. 
    for i in xrange(2):
      if (linenum > i and
-          Search(whitelisted_functions, clean_lines.elided[linenum - i - 1])):
-        check_params = False
-        break
+          Search(allowed_functions, clean_lines.elided[linenum - i - 1])):
+        return
 
-  if check_params:
-    decls = ReplaceAll(r'{[^}]*}', ' ', line)  # exclude function body
-    for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls):
-      if not Match(_RE_PATTERN_CONST_REF_PARAM, parameter):
-        error(filename, linenum, 'runtime/references', 2,
-              'Is this a non-const reference? '
-              'If so, make const or use a pointer: ' +
-              ReplaceAll(' *<', '<', parameter))
+  decls = ReplaceAll(r'{[^}]*}', ' ', line)  # exclude function body
+  for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls):
+    if (not Match(_RE_PATTERN_CONST_REF_PARAM, parameter) and
+        not Match(_RE_PATTERN_REF_STREAM_PARAM, parameter)):
+      error(filename, linenum, 'runtime/references', 2,
+            'Is this a non-const reference? '
+            'If so, make const or use a pointer: ' +
+            ReplaceAll(' *<', '<', parameter))
 
 
-def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern,
-                    error):
+def CheckCasts(filename, clean_lines, linenum, error):
+  """Various cast related checks.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  # Check to see if they're using a conversion function cast.
+  # I just try to capture the most common basic types, though there are more.
+  # Parameterless conversion functions, such as bool(), are allowed as they are
+  # probably a member operator declaration or default constructor.
+  match = Search(
+      r'(\bnew\s+(?:const\s+)?|\S<\s*(?:const\s+)?)?\b'
+      r'(int|float|double|bool|char|int32|uint32|int64|uint64)'
+      r'(\([^)].*)', line)
+  expecting_function = ExpectingFunctionArgs(clean_lines, linenum)
+  if match and not expecting_function:
+    matched_type = match.group(2)
+
+    # matched_new_or_template is used to silence two false positives:
+    # - New operators
+    # - Template arguments with function types
+    #
+    # For template arguments, we match on types immediately following
+    # an opening bracket without any spaces.  This is a fast way to
+    # silence the common case where the function type is the first
+    # template argument.  False negative with less-than comparison is
+    # avoided because those operators are usually followed by a space.
+    #
+    #   function<double(double)>   // bracket + no space = false positive
+    #   value < double(42)         // bracket + space = true positive
+    matched_new_or_template = match.group(1)
+
+    # Avoid arrays by looking for brackets that come after the closing
+    # parenthesis.
+    if Match(r'\([^()]+\)\s*\[', match.group(3)):
+      return
+
+    # Other things to ignore:
+    # - Function pointers
+    # - Casts to pointer types
+    # - Placement new
+    # - Alias declarations
+    matched_funcptr = match.group(3)
+    if (matched_new_or_template is None and
+        not (matched_funcptr and
+             (Match(r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(',
+                    matched_funcptr) or
+              matched_funcptr.startswith('(*)'))) and
+        not Match(r'\s*using\s+\S+\s*=\s*' + matched_type, line) and
+        not Search(r'new\(\S+\)\s*' + matched_type, line)):
+      error(filename, linenum, 'readability/casting', 4,
+            'Using deprecated casting style. '
+            'Use static_cast<%s>(...) instead' %
+            matched_type)
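The conversion-cast matcher above is worth seeing on concrete input. A minimal sketch with the same pattern, run outside cpplint with plain re, shows how group(1) silences new-expressions and template arguments:

    import re

    CAST_RE = re.compile(
        r'(\bnew\s+(?:const\s+)?|\S<\s*(?:const\s+)?)?\b'
        r'(int|float|double|bool|char|int32|uint32|int64|uint64)'
        r'(\([^)].*)')

    for code in ('x = int(2.2);',                      # flagged: old-style cast
                 'p = new int(42);',                   # silenced: 'new ' prefix
                 'std::function<double(double)> f;'):  # silenced: '<' prefix
        m = CAST_RE.search(code)
        print('%r -> group(1)=%r' % (code, m.group(1)))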
 
+  if not expecting_function:
+    CheckCStyleCast(filename, clean_lines, linenum, 'static_cast',
+                    r'\((int|float|double|bool|char|u?int(16|32|64))\)', error)
+
+  # This doesn't catch all cases. Consider (const char * const)"hello".
+  #
+  # (char *) "foo" should always be a const_cast (reinterpret_cast won't
+  # compile).
+  if CheckCStyleCast(filename, clean_lines, linenum, 'const_cast',
+                     r'\((char\s?\*+\s?)\)\s*"', error):
+    pass
+  else:
+    # Check pointer casts for other than string constants
+    CheckCStyleCast(filename, clean_lines, linenum, 'reinterpret_cast',
+                    r'\((\w+\s?\*+\s?)\)', error)
+
+  # In addition, we look for people taking the address of a cast.  This
+  # is dangerous -- casts can assign to temporaries, so the pointer doesn't
+  # point where you think.
+  #
+  # Some non-identifier character is required before the '&' for the
+  # expression to be recognized as a cast.  These are casts:
+  #   expression = &static_cast<int&>(temporary());
+  #   function(&(int*)(temporary()));
+  #
+  # This is not a cast:
+  #   reference_type&(int* function_param);
+  match = Search(
+      r'(?:[^\w]&\(([^)*][^)]*)\)[\w(])|'
+      r'(?:[^\w]&(static|dynamic|down|reinterpret)_cast\b)', line)
+  if match:
+    # Try a better error message when the & is bound to something
+    # dereferenced by the casted pointer, as opposed to the casted
+    # pointer itself.
+    parenthesis_error = False
+    match = Match(r'^(.*&(?:static|dynamic|down|reinterpret)_cast\b)<', line)
+    if match:
+      _, y1, x1 = CloseExpression(clean_lines, linenum, len(match.group(1)))
+      if x1 >= 0 and clean_lines.elided[y1][x1] == '(':
+        _, y2, x2 = CloseExpression(clean_lines, y1, x1)
+        if x2 >= 0:
+          extended_line = clean_lines.elided[y2][x2:]
+          if y2 < clean_lines.NumLines() - 1:
+            extended_line += clean_lines.elided[y2 + 1]
+          if Match(r'\s*(?:->|\[)', extended_line):
+            parenthesis_error = True
+
+    if parenthesis_error:
+      error(filename, linenum, 'readability/casting', 4,
+            ('Are you taking an address of something dereferenced '
+             'from a cast?  Wrapping the dereferenced expression in '
+             'parentheses will make the binding more obvious'))
+    else:
+      error(filename, linenum, 'runtime/casting', 4,
+            ('Are you taking an address of a cast? '
+             'This is dangerous: could be a temp var. '
+             'Take the address before doing the cast, rather than after'))
+
+
+def CheckCStyleCast(filename, clean_lines, linenum, cast_type, pattern, error):
   """Checks for a C-style cast by looking for the pattern.
 
   Args:
     filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
-    line: The line of code to check.
-    raw_line: The raw line of code to check, with comments.
     cast_type: The string for the C++ cast to recommend.  This is either
       reinterpret_cast, static_cast, or const_cast, depending.
     pattern: The regular expression used to find C-style casts.
@@ -4151,75 +5284,34 @@ def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern,
     True if an error was emitted.
     False otherwise.
   """
+  line = clean_lines.elided[linenum]
   match = Search(pattern, line)
   if not match:
     return False
 
-  # e.g., sizeof(int)
-  sizeof_match = Match(r'.*sizeof\s*$', line[0:match.start(1) - 1])
-  if sizeof_match:
-    error(filename, linenum, 'runtime/sizeof', 1,
-          'Using sizeof(type).  Use sizeof(varname) instead if possible')
-    return True
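Likewise, the address-of-a-cast detector above can be exercised directly. This standalone sketch reuses the exact pattern from the hunk:

    import re

    ADDR_OF_CAST = re.compile(
        r'(?:[^\w]&\(([^)*][^)]*)\)[\w(])|'
        r'(?:[^\w]&(static|dynamic|down|reinterpret)_cast\b)')

    print(bool(ADDR_OF_CAST.search('p = &static_cast<B*>(f());')))  # True: flagged
    print(bool(ADDR_OF_CAST.search('p = static_cast<B*>(f());')))   # False: fine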
+
+  # Exclude lines with keywords that tend to look like casts
+  context = line[0:match.start(1) - 1]
+  if Match(r'.*\b(?:sizeof|alignof|alignas|[_A-Z][_A-Z0-9]*)\s*$', context):
+    return False
+
+  # Try expanding current context to see if we are one level of
+  # parentheses inside a macro.
+  if linenum > 0:
+    for i in xrange(linenum - 1, max(0, linenum - 5), -1):
+      context = clean_lines.elided[i] + context
+  if Match(r'.*\b[_A-Z][_A-Z0-9]*\s*\((?:\([^()]*\)|[^()])*$', context):
+    return False
 
   # operator++(int) and operator--(int)
-  if (line[0:match.start(1) - 1].endswith(' operator++') or
-      line[0:match.start(1) - 1].endswith(' operator--')):
+  if context.endswith(' operator++') or context.endswith(' operator--'):
     return False
 
-  # A single unnamed argument for a function tends to look like old
-  # style cast.  If we see those, don't issue warnings for deprecated
-  # casts, instead issue warnings for unnamed arguments where
-  # appropriate.
-  #
-  # These are things that we want warnings for, since the style guide
-  # explicitly require all parameters to be named:
-  #   Function(int);
-  #   Function(int) {
-  #   ConstMember(int) const;
-  #   ConstMember(int) const {
-  #   ExceptionMember(int) throw (...);
-  #   ExceptionMember(int) throw (...) {
-  #   PureVirtual(int) = 0;
-  #
-  # These are functions of some sort, where the compiler would be fine
-  # if they had named parameters, but people often omit those
-  # identifiers to reduce clutter:
-  #   (FunctionPointer)(int);
-  #   (FunctionPointer)(int) = value;
-  #   Function((function_pointer_arg)(int))
-  #   <TemplateArgument(int)>;
-  #   <(FunctionPointerTemplateArgument)(int)>;
+  # A single unnamed argument for a function tends to look like old style cast.
+  # If we see those, don't issue warnings for deprecated casts.
   remainder = line[match.end(0):]
-  if Match(r'^\s*(?:;|const\b|throw\b|=|>|\{|\))', remainder):
-    # Looks like an unnamed parameter.
-
-    # Don't warn on any kind of template arguments.
-    if Match(r'^\s*>', remainder):
-      return False
-
-    # Don't warn on assignments to function pointers, but keep warnings for
-    # unnamed parameters to pure virtual functions.  Note that this pattern
-    # will also pass on assignments of "0" to function pointers, but the
-    # preferred values for those would be "nullptr" or "NULL".
-    matched_zero = Match(r'^\s=\s*(\S+)\s*;', remainder)
-    if matched_zero and matched_zero.group(1) != '0':
-      return False
-
-    # Don't warn on function pointer declarations.  For this we need
-    # to check what came before the "(type)" string.
-    if Match(r'.*\)\s*$', line[0:match.start(0)]):
-      return False
-
-    # Don't warn if the parameter is named with block comments, e.g.:
-    #   Function(int /*unused_param*/);
-    if '/*' in raw_line:
-      return False
-
-    # Passed all filters, issue warning here.
-    error(filename, linenum, 'readability/function', 3,
-          'All parameters should be named in a function')
-    return True
+  if Match(r'^\s*(?:;|const\b|throw\b|final\b|override\b|[=>{),]|->)',
+           remainder):
    return False
 
   # At this point, all that should be left is actual casts.
   error(filename, linenum, 'readability/casting', 4,
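The rewritten unnamed-argument exclusion is the subtle part of the new CheckCStyleCast(). A simplified sketch, with a shorter type list than the real patterns, shows the intended split:

    import re

    def flags_cstyle_cast(line):
        # Simplified stand-in for the CheckCStyleCast() flow above.
        m = re.search(r'\((int|float|double|bool|char)\)', line)
        if not m:
            return False
        remainder = line[m.end():]
        # A single unnamed argument looks like a cast; skip it, as above.
        return not re.match(
            r'^\s*(?:;|const\b|throw\b|final\b|override\b|[=>{),]|->)',
            remainder)

    print(flags_cstyle_cast('x = (int)y;'))               # True: use static_cast
    print(flags_cstyle_cast('virtual void F(int) = 0;'))  # False: unnamed param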
@@ -4229,6 +5321,28 @@ def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern,
   return True
 
 
+def ExpectingFunctionArgs(clean_lines, linenum):
+  """Checks whether function type arguments are expected.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+
+  Returns:
+    True if the line at 'linenum' is inside something that expects arguments
+    of function types.
+  """
+  line = clean_lines.elided[linenum]
+  return (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or
+          (linenum >= 2 and
+           (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$',
+                  clean_lines.elided[linenum - 1]) or
+            Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$',
+                  clean_lines.elided[linenum - 2]) or
+            Search(r'\bstd::m?function\s*\<\s*$',
+                   clean_lines.elided[linenum - 1]))))
+
+
 _HEADERS_CONTAINING_TEMPLATES = (
     ('<deque>', ('deque',)),
     ('<functional>', ('unary_function', 'binary_function',
@@ -4251,11 +5365,15 @@ def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern,
     ('<limits>', ('numeric_limits',)),
     ('<list>', ('list',)),
     ('<map>', ('map', 'multimap',)),
-    ('<memory>', ('allocator',)),
+    ('<memory>', ('allocator', 'make_shared', 'make_unique', 'shared_ptr',
+                  'unique_ptr', 'weak_ptr')),
     ('<queue>', ('queue', 'priority_queue',)),
     ('<set>', ('set', 'multiset',)),
     ('<stack>', ('stack',)),
     ('<string>', ('char_traits', 'basic_string',)),
+    ('<tuple>', ('tuple',)),
+    ('<unordered_map>', ('unordered_map', 'unordered_multimap')),
+    ('<unordered_set>', ('unordered_set', 'unordered_multiset')),
     ('<utility>', ('pair',)),
     ('<vector>', ('vector',)),
 
@@ -4266,18 +5384,26 @@ def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern,
     ('<slist>', ('slist',)),
    )
 
-_RE_PATTERN_STRING = re.compile(r'\bstring\b')
+_HEADERS_MAYBE_TEMPLATES = (
+    ('<algorithm>', ('copy', 'max', 'min', 'min_element', 'sort',
+                     'transform',
+                    )),
+    ('<utility>', ('forward', 'make_pair', 'move', 'swap')),
+    )
 
-_re_pattern_algorithm_header = []
-for _template in ('copy', 'max', 'min', 'min_element', 'sort', 'swap',
-                  'transform'):
-  # Match max<type>(..., ...), max(..., ...), but not foo->max, foo.max or
-  # type::max().
-  _re_pattern_algorithm_header.append(
-      (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'),
-       _template,
-       '<algorithm>'))
+_RE_PATTERN_STRING = re.compile(r'\bstring\b')
 
+_re_pattern_headers_maybe_templates = []
+for _header, _templates in _HEADERS_MAYBE_TEMPLATES:
+  for _template in _templates:
+    # Match max<type>(..., ...), max(..., ...), but not foo->max, foo.max or
+    # type::max().
+    _re_pattern_headers_maybe_templates.append(
+        (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'),
+         _template,
+         _header))
+
+# Other scripts may reach in and modify this pattern.
 _re_pattern_templates = []
 for _header, _templates in _HEADERS_CONTAINING_TEMPLATES:
   for _template in _templates:
@@ -4317,13 +5443,13 @@ def FilesBelongToSameModule(filename_cc, filename_h):
     string: the additional prefix needed to open the header file.
   """
-  if not filename_cc.endswith('.cc'):
+  fileinfo = FileInfo(filename_cc)
+  if not fileinfo.IsSource():
     return (False, '')
-  filename_cc = filename_cc[:-len('.cc')]
-  if filename_cc.endswith('_unittest'):
-    filename_cc = filename_cc[:-len('_unittest')]
-  elif filename_cc.endswith('_test'):
-    filename_cc = filename_cc[:-len('_test')]
+  filename_cc = filename_cc[:-len(fileinfo.Extension())]
+  matched_test_suffix = Search(_TEST_FILE_SUFFIX, fileinfo.BaseName())
+  if matched_test_suffix:
+    filename_cc = filename_cc[:-len(matched_test_suffix.group(1))]
   filename_cc = filename_cc.replace('/public/', '/')
   filename_cc = filename_cc.replace('/internal/', '/')
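The FileInfo/suffix handling above is what pairs foo_test.cc with foo.h. A rough sketch of just the suffix trimming; the _TEST_FILE_SUFFIX value below is an assumption, since the real constant is defined elsewhere in this file:

    import re

    _TEST_FILE_SUFFIX = r'(_test|_unittest|_regtest)$'  # assumed value

    def module_stem(basename):
        # Mirrors the trimming done in FilesBelongToSameModule() above.
        match = re.search(_TEST_FILE_SUFFIX, basename)
        return basename[:match.start()] if match else basename

    print(module_stem('foo_unittest'))  # foo
    print(module_stem('foo'))           # foo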
+def UpdateIncludeState(filename, include_dict, io=codecs): + """Fill up the include_dict with new includes found from the file. Args: filename: the name of the header to read. - include_state: an _IncludeState instance in which the headers are inserted. + include_dict: a dictionary in which the headers are inserted. io: The io factory to use to read the file. Provided for testability. Returns: - True if a header was succesfully added. False otherwise. + True if a header was successfully added. False otherwise. """ headerfile = None try: @@ -4365,9 +5491,7 @@ def UpdateIncludeState(filename, include_state, io=codecs): match = _RE_PATTERN_INCLUDE.search(clean_line) if match: include = match.group(2) - # The value formatting is cute, but not really used right now. - # What matters here is that the key is in include_state. - include_state.setdefault(include, '%s:%d' % (filename, linenum)) + include_dict.setdefault(include, linenum) return True @@ -4406,7 +5530,7 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error, if prefix.endswith('std::') or not prefix.endswith('::'): required[''] = (linenum, 'string') - for pattern, template, header in _re_pattern_algorithm_header: + for pattern, template, header in _re_pattern_headers_maybe_templates: if pattern.search(line): required[header] = (linenum, template) @@ -4415,15 +5539,21 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error, continue for pattern, template, header in _re_pattern_templates: - if pattern.search(line): - required[header] = (linenum, template) + matched = pattern.search(line) + if matched: + # Don't warn about IWYU in non-STL namespaces: + # (We check only the first match per line; good enough.) + prefix = line[:matched.start()] + if prefix.endswith('std::') or not prefix.endswith('::'): + required[header] = (linenum, template) # The policy is that if you #include something in foo.h you don't need to # include it again in foo.cc. Here, we will look at possible includes. - # Let's copy the include_state so it is only messed up within this function. - include_state = include_state.copy() + # Let's flatten the include_state include_list and copy it into a dictionary. + include_dict = dict([item for sublist in include_state.include_list + for item in sublist]) - # Did we find the header for this file (if any) and succesfully load it? + # Did we find the header for this file (if any) and successfully load it? header_found = False # Use the absolute path so that matching works properly. @@ -4438,13 +5568,13 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error, # instead of 'foo_flymake.h' abs_filename = re.sub(r'_flymake\.cc$', '.cc', abs_filename) - # include_state is modified during iteration, so we iterate over a copy of + # include_dict is modified during iteration, so we iterate over a copy of # the keys. - header_keys = include_state.keys() + header_keys = include_dict.keys() for header in header_keys: (same_module, common_path) = FilesBelongToSameModule(abs_filename, header) fullpath = common_path + header - if same_module and UpdateIncludeState(fullpath, include_state, io): + if same_module and UpdateIncludeState(fullpath, include_dict, io): header_found = True # If we can't find the header file for a .cc, assume it's because we don't @@ -4458,7 +5588,7 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error, # All the lines have been processed, report the errors found. 
for required_header_unstripped in required: template = required[required_header_unstripped][1] - if required_header_unstripped.strip('<>"') not in include_state: + if required_header_unstripped.strip('<>"') not in include_dict: error(filename, required[required_header_unstripped][0], 'build/include_what_you_use', 4, 'Add #include ' + required_header_unstripped + ' for ' + template) @@ -4470,7 +5600,7 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error, def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error): """Check that make_pair's template arguments are deduced. - G++ 4.6 in C++0x mode fails badly if make_pair's template arguments are + G++ 4.6 in C++11 mode fails badly if make_pair's template arguments are specified explicitly, and such use isn't intended in any case. Args: @@ -4488,6 +5618,165 @@ def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error): ' OR use pair directly OR if appropriate, construct a pair directly') +def CheckRedundantVirtual(filename, clean_lines, linenum, error): + """Check if line contains a redundant "virtual" function-specifier. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + # Look for "virtual" on current line. + line = clean_lines.elided[linenum] + virtual = Match(r'^(.*)(\bvirtual\b)(.*)$', line) + if not virtual: return + + # Ignore "virtual" keywords that are near access-specifiers. These + # are only used in class base-specifier and do not apply to member + # functions. + if (Search(r'\b(public|protected|private)\s+$', virtual.group(1)) or + Match(r'^\s+(public|protected|private)\b', virtual.group(3))): + return + + # Ignore the "virtual" keyword from virtual base classes. Usually + # there is a column on the same line in these cases (virtual base + # classes are rare in google3 because multiple inheritance is rare). + if Match(r'^.*[^:]:[^:].*$', line): return + + # Look for the next opening parenthesis. This is the start of the + # parameter list (possibly on the next line shortly after virtual). + # TODO(unknown): doesn't work if there are virtual functions with + # decltype() or other things that use parentheses, but csearch suggests + # that this is rare. + end_col = -1 + end_line = -1 + start_col = len(virtual.group(2)) + for start_line in xrange(linenum, min(linenum + 3, clean_lines.NumLines())): + line = clean_lines.elided[start_line][start_col:] + parameter_list = Match(r'^([^(]*)\(', line) + if parameter_list: + # Match parentheses to find the end of the parameter list + (_, end_line, end_col) = CloseExpression( + clean_lines, start_line, start_col + len(parameter_list.group(1))) + break + start_col = 0 + + if end_col < 0: + return # Couldn't find end of parameter list, give up + + # Look for "override" or "final" after the parameter list + # (possibly on the next few lines). + for i in xrange(end_line, min(end_line + 3, clean_lines.NumLines())): + line = clean_lines.elided[i][end_col:] + match = Search(r'\b(override|final)\b', line) + if match: + error(filename, linenum, 'readability/inheritance', 4, + ('"virtual" is redundant since function is ' + 'already declared as "%s"' % match.group(1))) + + # Set end_col to check whole lines after we are done with the + # first line. 
+ end_col = 0 + if Search(r'[^\w]\s*$', line): + break + + +def CheckRedundantOverrideOrFinal(filename, clean_lines, linenum, error): + """Check if line contains a redundant "override" or "final" virt-specifier. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + # Look for closing parenthesis nearby. We need one to confirm where + # the declarator ends and where the virt-specifier starts to avoid + # false positives. + line = clean_lines.elided[linenum] + declarator_end = line.rfind(')') + if declarator_end >= 0: + fragment = line[declarator_end:] + else: + if linenum > 1 and clean_lines.elided[linenum - 1].rfind(')') >= 0: + fragment = line + else: + return + + # Check that at most one of "override" or "final" is present, not both + if Search(r'\boverride\b', fragment) and Search(r'\bfinal\b', fragment): + error(filename, linenum, 'readability/inheritance', 4, + ('"override" is redundant since function is ' + 'already declared as "final"')) + + + + +# Returns true if we are at a new block, and it is directly +# inside of a namespace. +def IsBlockInNameSpace(nesting_state, is_forward_declaration): + """Checks that the new block is directly in a namespace. + + Args: + nesting_state: The _NestingState object that contains info about our state. + is_forward_declaration: If the class is a forward declared class. + Returns: + Whether or not the new block is directly in a namespace. + """ + if is_forward_declaration: + if len(nesting_state.stack) >= 1 and ( + isinstance(nesting_state.stack[-1], _NamespaceInfo)): + return True + else: + return False + + return (len(nesting_state.stack) > 1 and + nesting_state.stack[-1].check_namespace_indentation and + isinstance(nesting_state.stack[-2], _NamespaceInfo)) + + +def ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item, + raw_lines_no_comments, linenum): + """This method determines if we should apply our namespace indentation check. + + Args: + nesting_state: The current nesting state. + is_namespace_indent_item: If we just put a new class on the stack, True. + If the top of the stack is not a class, or we did not recently + add the class, False. + raw_lines_no_comments: The lines without the comments. + linenum: The current line number we are processing. + + Returns: + True if we should apply our namespace indentation check. Currently, it + only works for classes and namespaces inside of a namespace. + """ + + is_forward_declaration = IsForwardClassDeclaration(raw_lines_no_comments, + linenum) + + if not (is_namespace_indent_item or is_forward_declaration): + return False + + # If we are in a macro, we do not want to check the namespace indentation. + if IsMacroDefinition(raw_lines_no_comments, linenum): + return False + + return IsBlockInNameSpace(nesting_state, is_forward_declaration) + + +# Call this method if the line is directly inside of a namespace. +# If the line above is blank (excluding comments) or the start of +# an inner namespace, it cannot be indented. 
+def CheckItemIndentationInNamespace(filename, raw_lines_no_comments, linenum, + error): + line = raw_lines_no_comments[linenum] + if Match(r'^\s+', line): + error(filename, linenum, 'runtime/indentation_namespace', 4, + 'Do not indent within a namespace') + + def ProcessLine(filename, file_extension, clean_lines, line, include_state, function_state, nesting_state, error, extra_check_functions=[]): @@ -4501,7 +5790,7 @@ def ProcessLine(filename, file_extension, clean_lines, line, line: Number of line being processed. include_state: An _IncludeState instance in which the headers are inserted. function_state: A _FunctionState instance which counts function lines, etc. - nesting_state: A _NestingState instance which maintains information about + nesting_state: A NestingState instance which maintains information about the current stack of nested blocks being parsed. error: A callable to which errors are reported, which takes 4 arguments: filename, line number, error level, and message @@ -4512,8 +5801,9 @@ def ProcessLine(filename, file_extension, clean_lines, line, raw_lines = clean_lines.raw_lines ParseNolintSuppressions(filename, raw_lines[line], line, error) nesting_state.Update(filename, clean_lines, line, error) - if nesting_state.stack and nesting_state.stack[-1].inline_asm != _NO_ASM: - return + CheckForNamespaceIndentation(filename, nesting_state, clean_lines, line, + error) + if nesting_state.InAsmBlock(): return CheckForFunctionLengths(filename, clean_lines, line, function_state, error) CheckForMultilineCommentsAndStrings(filename, clean_lines, line, error) CheckStyle(filename, clean_lines, line, file_extension, nesting_state, error) @@ -4526,9 +5816,82 @@ def ProcessLine(filename, file_extension, clean_lines, line, CheckPosixThreading(filename, clean_lines, line, error) CheckInvalidIncrement(filename, clean_lines, line, error) CheckMakePairUsesDeduction(filename, clean_lines, line, error) + CheckRedundantVirtual(filename, clean_lines, line, error) + CheckRedundantOverrideOrFinal(filename, clean_lines, line, error) for check_fn in extra_check_functions: check_fn(filename, clean_lines, line, error) +def FlagCxx11Features(filename, clean_lines, linenum, error): + """Flag those c++11 features that we only allow in certain places. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + include = Match(r'\s*#\s*include\s+[<"]([^<"]+)[">]', line) + + # Flag unapproved C++ TR1 headers. + if include and include.group(1).startswith('tr1/'): + error(filename, linenum, 'build/c++tr1', 5, + ('C++ TR1 headers such as <%s> are unapproved.') % include.group(1)) + + # Flag unapproved C++11 headers. + if include and include.group(1) in ('cfenv', + 'condition_variable', + 'fenv.h', + 'future', + 'mutex', + 'thread', + 'chrono', + 'ratio', + 'regex', + 'system_error', + ): + error(filename, linenum, 'build/c++11', 5, + ('<%s> is an unapproved C++11 header.') % include.group(1)) + + # The only place where we need to worry about C++11 keywords and library + # features in preprocessor directives is in macro definitions. + if Match(r'\s*#', line) and not Match(r'\s*#\s*define\b', line): return + + # These are classes and free functions. The classes are always + # mentioned as std::*, but we only catch the free functions if + # they're not found by ADL. They're alphabetical by header. 
+ for top_name in ( + # type_traits + 'alignment_of', + 'aligned_union', + ): + if Search(r'\bstd::%s\b' % top_name, line): + error(filename, linenum, 'build/c++11', 5, + ('std::%s is an unapproved C++11 class or function. Send c-style ' + 'an example of where it would make your code more readable, and ' + 'they may let you use it.') % top_name) + + +def FlagCxx14Features(filename, clean_lines, linenum, error): + """Flag those C++14 features that we restrict. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + include = Match(r'\s*#\s*include\s+[<"]([^<"]+)[">]', line) + + # Flag unapproved C++14 headers. + if include and include.group(1) in ('scoped_allocator', 'shared_mutex'): + error(filename, linenum, 'build/c++14', 5, + ('<%s> is an unapproved C++14 header.') % include.group(1)) + + def ProcessFileData(filename, file_extension, lines, error, extra_check_functions=[]): """Performs lint checks and reports any errors to the given error function. @@ -4549,31 +5912,122 @@ def ProcessFileData(filename, file_extension, lines, error, include_state = _IncludeState() function_state = _FunctionState() - nesting_state = _NestingState() + nesting_state = NestingState() ResetNolintSuppressions() CheckForCopyright(filename, lines, error) - - if file_extension == 'h': - CheckForHeaderGuard(filename, lines, error) - + ProcessGlobalSuppresions(lines) RemoveMultiLineComments(filename, lines, error) clean_lines = CleansedLines(lines) + + if IsHeaderExtension(file_extension): + CheckForHeaderGuard(filename, clean_lines, error) + for line in xrange(clean_lines.NumLines()): ProcessLine(filename, file_extension, clean_lines, line, include_state, function_state, nesting_state, error, extra_check_functions) + FlagCxx11Features(filename, clean_lines, line, error) nesting_state.CheckCompletedBlocks(filename, error) CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error) + # Check that the .cc file has included its header if it exists. + if _IsSourceExtension(file_extension): + CheckHeaderFileIncluded(filename, include_state, error) + # We check here rather than inside ProcessLine so that we see raw # lines rather than "cleaned" lines. CheckForBadCharacters(filename, lines, error) CheckForNewlineAtEOF(filename, lines, error) +def ProcessConfigOverrides(filename): + """ Loads the configuration files and processes the config overrides. + + Args: + filename: The name of the file being processed by the linter. + + Returns: + False if the current |filename| should not be processed further. + """ + + abs_filename = os.path.abspath(filename) + cfg_filters = [] + keep_looking = True + while keep_looking: + abs_path, base_name = os.path.split(abs_filename) + if not base_name: + break # Reached the root directory. + + cfg_file = os.path.join(abs_path, "CPPLINT.cfg") + abs_filename = abs_path + if not os.path.isfile(cfg_file): + continue + + try: + with open(cfg_file) as file_handle: + for line in file_handle: + line, _, _ = line.partition('#') # Remove comments. 
+ if not line.strip(): + continue + + name, _, val = line.partition('=') + name = name.strip() + val = val.strip() + if name == 'set noparent': + keep_looking = False + elif name == 'filter': + cfg_filters.append(val) + elif name == 'exclude_files': + # When matching exclude_files pattern, use the base_name of + # the current file name or the directory name we are processing. + # For example, if we are checking for lint errors in /foo/bar/baz.cc + # and we found the .cfg file at /foo/CPPLINT.cfg, then the config + # file's "exclude_files" filter is meant to be checked against "bar" + # and not "baz" nor "bar/baz.cc". + if base_name: + pattern = re.compile(val) + if pattern.match(base_name): + if _cpplint_state.quiet: + # Suppress "Ignoring file" warning when using --quiet. + return False + sys.stderr.write('Ignoring "%s": file excluded by "%s". ' + 'File path component "%s" matches ' + 'pattern "%s"\n' % + (filename, cfg_file, base_name, val)) + return False + elif name == 'linelength': + global _line_length + try: + _line_length = int(val) + except ValueError: + sys.stderr.write('Line length must be numeric.') + elif name == 'root': + global _root + # root directories are specified relative to CPPLINT.cfg dir. + _root = os.path.join(os.path.dirname(cfg_file), val) + elif name == 'headers': + ProcessHppHeadersOption(val) + else: + sys.stderr.write( + 'Invalid configuration option (%s) in file %s\n' % + (name, cfg_file)) + + except IOError: + sys.stderr.write( + "Skipping config file '%s': Can't open for reading\n" % cfg_file) + keep_looking = False + + # Apply all the accumulated filters in reverse order (top-level directory + # config options having the least priority). + for filter in reversed(cfg_filters): + _AddFilters(filter) + + return True + + def ProcessFile(filename, vlevel, extra_check_functions=[]): """Does google-lint on a single file. @@ -4589,7 +6043,15 @@ def ProcessFile(filename, vlevel, extra_check_functions=[]): """ _SetVerboseLevel(vlevel) + _BackupFilters() + old_errors = _cpplint_state.error_count + + if not ProcessConfigOverrides(filename): + _RestoreFilters() + return + lf_lines = [] + crlf_lines = [] try: # Support the UNIX convention of using "-" for stdin. Note that # we are not opening the file with universal newline support @@ -4597,10 +6059,7 @@ def ProcessFile(filename, vlevel, extra_check_functions=[]): # contain trailing '\r' characters if we are reading a file that # has CRLF endings. # If after the split a trailing '\r' is present, it is removed - # below. If it is not expected to be present (i.e. os.linesep != - # '\r\n' as in Windows), a warning is issued below if this file - # is processed. - + # below. if filename == '-': lines = codecs.StreamReaderWriter(sys.stdin, codecs.getreader('utf8'), @@ -4609,16 +6068,19 @@ def ProcessFile(filename, vlevel, extra_check_functions=[]): else: lines = codecs.open(filename, 'r', 'utf8', 'replace').read().split('\n') - carriage_return_found = False # Remove trailing '\r'. - for linenum in range(len(lines)): + # The -1 accounts for the extra trailing blank line we get from split() + for linenum in range(len(lines) - 1): if lines[linenum].endswith('\r'): lines[linenum] = lines[linenum].rstrip('\r') - carriage_return_found = True + crlf_lines.append(linenum + 1) + else: + lf_lines.append(linenum + 1) except IOError: sys.stderr.write( "Skipping input '%s': Can't open for reading\n" % filename) + _RestoreFilters() return # Note, if no dot is found, this will give the entire filename as the ext. 
@@ -4632,14 +6094,30 @@ def ProcessFile(filename, vlevel, extra_check_functions=[]): else: ProcessFileData(filename, file_extension, lines, Error, extra_check_functions) - if carriage_return_found and os.linesep != '\r\n': - # Use 0 for linenum since outputting only one error for potentially - # several lines. - Error(filename, 0, 'whitespace/newline', 1, - 'One or more unexpected \\r (^M) found;' - 'better to use only a \\n') - sys.stderr.write('Done processing %s\n' % filename) + # If end-of-line sequences are a mix of LF and CR-LF, issue + # warnings on the lines with CR. + # + # Don't issue any warnings if all lines are uniformly LF or CR-LF, + # since critique can handle these just fine, and the style guide + # doesn't dictate a particular end of line sequence. + # + # We can't depend on os.linesep to determine what the desired + # end-of-line sequence should be, since that will return the + # server-side end-of-line sequence. + if lf_lines and crlf_lines: + # Warn on every line with CR. An alternative approach might be to + # check whether the file is mostly CRLF or just LF, and warn on the + # minority, we bias toward LF here since most tools prefer LF. + for linenum in crlf_lines: + Error(filename, linenum, 'whitespace/newline', 1, + 'Unexpected \\r (^M) found; better to use only \\n') + + # Suppress printing anything if --quiet was passed unless the error + # count has increased after processing this file. + if not _cpplint_state.quiet or old_errors != _cpplint_state.error_count: + sys.stdout.write('Done processing %s\n' % filename) + _RestoreFilters() def PrintUsage(message): @@ -4681,13 +6159,16 @@ def ParseArguments(args): 'filter=', 'root=', 'linelength=', - 'extensions=']) + 'extensions=', + 'headers=', + 'quiet']) except getopt.GetoptError: PrintUsage('Invalid arguments.') verbosity = _VerboseLevel() output_format = _OutputFormat() filters = '' + quiet = _Quiet() counting_style = '' for (opt, val) in opts: @@ -4697,6 +6178,8 @@ def ParseArguments(args): if val not in ('emacs', 'vs7', 'eclipse'): PrintUsage('The only allowed output formats are emacs, vs7 and eclipse.') output_format = val + elif opt == '--quiet': + quiet = True elif opt == '--verbose': verbosity = int(val) elif opt == '--filter': @@ -4721,12 +6204,15 @@ def ParseArguments(args): try: _valid_extensions = set(val.split(',')) except ValueError: - PrintUsage('Extensions must be comma seperated list.') + PrintUsage('Extensions must be comma separated list.') + elif opt == '--headers': + ProcessHppHeadersOption(val) if not filenames: PrintUsage('No files were specified.') _SetOutputFormat(output_format) + _SetQuiet(quiet) _SetVerboseLevel(verbosity) _SetFilters(filters) _SetCountingStyle(counting_style) @@ -4747,7 +6233,9 @@ def main(): _cpplint_state.ResetErrorCounts() for filename in filenames: ProcessFile(filename, _cpplint_state.verbose_level) - _cpplint_state.PrintErrorCounts() + # If --quiet is passed, suppress printing error count unless there are errors. 
+ if not _cpplint_state.quiet or _cpplint_state.error_count > 0: + _cpplint_state.PrintErrorCounts() sys.exit(_cpplint_state.error_count > 0) From ca93fc740edcc84da5c61f0951b86cf98c0c4752 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 27 Oct 2021 10:05:19 -0700 Subject: [PATCH 163/926] update libwebm to libwebm-1.0.0.28-20-g206d268 picks up Android.mk license updates from AOSP and fixes as part of the 1.0.0.28 release changelog: https://chromium.googlesource.com/webm/libwebm/+log/37d9b86..206d268 Change-Id: I18d5238f7d1aff2678d903018929da952410fa0e --- third_party/libwebm/Android.mk | 3 + third_party/libwebm/README.libvpx | 2 +- third_party/libwebm/mkvmuxer/mkvmuxer.cc | 30 ++++--- third_party/libwebm/mkvmuxer/mkvmuxerutil.cc | 4 +- third_party/libwebm/mkvparser/mkvparser.cc | 87 +++++++++++++------- 5 files changed, 84 insertions(+), 42 deletions(-) diff --git a/third_party/libwebm/Android.mk b/third_party/libwebm/Android.mk index 3b3dd1d390..23f935f2db 100644 --- a/third_party/libwebm/Android.mk +++ b/third_party/libwebm/Android.mk @@ -14,4 +14,7 @@ LOCAL_SRC_FILES:= common/file_util.cc \ mkvmuxer/mkvmuxer.cc \ mkvmuxer/mkvmuxerutil.cc \ mkvmuxer/mkvwriter.cc +LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD +LOCAL_LICENSE_CONDITIONS := notice +LOCAL_NOTICE_FILE := $(LOCAL_PATH)/LICENSE.TXT $(LOCAL_PATH)/PATENTS.TXT include $(BUILD_STATIC_LIBRARY) diff --git a/third_party/libwebm/README.libvpx b/third_party/libwebm/README.libvpx index 1e87afd3d1..5cc0a83701 100644 --- a/third_party/libwebm/README.libvpx +++ b/third_party/libwebm/README.libvpx @@ -1,5 +1,5 @@ URL: https://chromium.googlesource.com/webm/libwebm -Version: 37d9b860ebbf40cb0f6dcb7a6fef452d798062da +Version: 206d268d4d8066e5a37c49025325b80c95c771dd License: BSD License File: LICENSE.txt diff --git a/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/third_party/libwebm/mkvmuxer/mkvmuxer.cc index 5120312119..24c288863f 100644 --- a/third_party/libwebm/mkvmuxer/mkvmuxer.cc +++ b/third_party/libwebm/mkvmuxer/mkvmuxer.cc @@ -774,7 +774,7 @@ bool Track::Write(IMkvWriter* writer) const { return false; // AV1 tracks require a CodecPrivate. See - // https://github.com/Matroska-Org/matroska-specification/blob/av1-mappin/codec/av1.md + // https://github.com/ietf-wg-cellar/matroska-specification/blob/HEAD/codec/av1.md // TODO(tomfinegan): Update the above link to the AV1 Matroska mappings to // point to a stable version once it is finalized, or our own WebM mappings // page on webmproject.org should we decide to release them. @@ -2622,7 +2622,8 @@ bool Cluster::Finalize(bool set_last_frame_duration, uint64_t duration) { uint64_t Cluster::Size() const { const uint64_t element_size = - EbmlMasterElementSize(libwebm::kMkvCluster, 0xFFFFFFFFFFFFFFFFULL) + + EbmlMasterElementSize(static_cast(libwebm::kMkvCluster), + uint64_t{0xFFFFFFFFFFFFFFFFU}) + payload_size_; return element_size; } @@ -3084,6 +3085,7 @@ Segment::Segment() accurate_cluster_duration_(false), fixed_size_cluster_timecode_(false), estimate_file_duration_(false), + ebml_header_size_(0), payload_pos_(0), size_position_(0), doc_type_version_(kDefaultDocTypeVersion), @@ -4105,12 +4107,16 @@ int Segment::WriteFramesAll() { // places where |doc_type_version_| needs to be updated. 
if (frame->discard_padding() != 0) doc_type_version_ = 4; - if (!cluster->AddFrame(frame)) - return -1; + if (!cluster->AddFrame(frame)) { + delete frame; + continue; + } if (new_cuepoint_ && cues_track_ == frame->track_number()) { - if (!AddCuePoint(frame->timestamp(), cues_track_)) - return -1; + if (!AddCuePoint(frame->timestamp(), cues_track_)) { + delete frame; + continue; + } } if (frame->timestamp() > last_timestamp_) { @@ -4153,12 +4159,16 @@ bool Segment::WriteFramesLessThan(uint64_t timestamp) { const Frame* const frame_prev = frames_[i - 1]; if (frame_prev->discard_padding() != 0) doc_type_version_ = 4; - if (!cluster->AddFrame(frame_prev)) - return false; + if (!cluster->AddFrame(frame_prev)) { + delete frame_prev; + continue; + } if (new_cuepoint_ && cues_track_ == frame_prev->track_number()) { - if (!AddCuePoint(frame_prev->timestamp(), cues_track_)) - return false; + if (!AddCuePoint(frame_prev->timestamp(), cues_track_)) { + delete frame_prev; + continue; + } } ++shift_left; diff --git a/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc index 6436817c9b..bd2f769138 100644 --- a/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc +++ b/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc @@ -606,8 +606,8 @@ uint64 WriteVoidElement(IMkvWriter* writer, uint64 size) { void GetVersion(int32* major, int32* minor, int32* build, int32* revision) { *major = 0; - *minor = 2; - *build = 1; + *minor = 3; + *build = 0; *revision = 0; } diff --git a/third_party/libwebm/mkvparser/mkvparser.cc b/third_party/libwebm/mkvparser/mkvparser.cc index ace65bd595..de8884b381 100644 --- a/third_party/libwebm/mkvparser/mkvparser.cc +++ b/third_party/libwebm/mkvparser/mkvparser.cc @@ -54,9 +54,9 @@ Type* SafeArrayAlloc(unsigned long long num_elements, void GetVersion(int& major, int& minor, int& build, int& revision) { major = 1; - minor = 0; + minor = 1; build = 0; - revision = 30; + revision = 0; } long long ReadUInt(IMkvReader* pReader, long long pos, long& len) { @@ -1502,8 +1502,8 @@ long SeekHead::Parse() { // first count the seek head entries - int entry_count = 0; - int void_element_count = 0; + long long entry_count = 0; + long long void_element_count = 0; while (pos < stop) { long long id, size; @@ -1513,10 +1513,15 @@ long SeekHead::Parse() { if (status < 0) // error return status; - if (id == libwebm::kMkvSeek) + if (id == libwebm::kMkvSeek) { ++entry_count; - else if (id == libwebm::kMkvVoid) + if (entry_count > INT_MAX) + return E_PARSE_FAILED; + } else if (id == libwebm::kMkvVoid) { ++void_element_count; + if (void_element_count > INT_MAX) + return E_PARSE_FAILED; + } pos += size; // consume payload @@ -1528,14 +1533,15 @@ long SeekHead::Parse() { return E_FILE_FORMAT_INVALID; if (entry_count > 0) { - m_entries = new (std::nothrow) Entry[entry_count]; + m_entries = new (std::nothrow) Entry[static_cast(entry_count)]; if (m_entries == NULL) return -1; } if (void_element_count > 0) { - m_void_elements = new (std::nothrow) VoidElement[void_element_count]; + m_void_elements = + new (std::nothrow) VoidElement[static_cast(void_element_count)]; if (m_void_elements == NULL) return -1; @@ -1582,13 +1588,13 @@ long SeekHead::Parse() { ptrdiff_t count_ = ptrdiff_t(pEntry - m_entries); assert(count_ >= 0); - assert(count_ <= entry_count); + assert(static_cast(count_) <= entry_count); m_entry_count = static_cast(count_); count_ = ptrdiff_t(pVoidElement - m_void_elements); assert(count_ >= 0); - assert(count_ <= void_element_count); + assert(static_cast(count_) <= 
void_element_count); m_void_element_count = static_cast(count_); @@ -2299,7 +2305,7 @@ bool CuePoint::Load(IMkvReader* pReader) { long long pos = pos_; // First count number of track positions - + unsigned long long track_positions_count = 0; while (pos < stop) { long len; @@ -2323,12 +2329,17 @@ bool CuePoint::Load(IMkvReader* pReader) { if (id == libwebm::kMkvCueTime) m_timecode = UnserializeUInt(pReader, pos, size); - else if (id == libwebm::kMkvCueTrackPositions) - ++m_track_positions_count; + else if (id == libwebm::kMkvCueTrackPositions) { + ++track_positions_count; + if (track_positions_count > UINT_MAX) + return E_PARSE_FAILED; + } pos += size; // consume payload } + m_track_positions_count = static_cast(track_positions_count); + if (m_timecode < 0 || m_track_positions_count <= 0) { return false; } @@ -4194,8 +4205,8 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size, const long long stop = start + size; // Count ContentCompression and ContentEncryption elements. - int compression_count = 0; - int encryption_count = 0; + long long compression_count = 0; + long long encryption_count = 0; while (pos < stop) { long long id, size; @@ -4203,11 +4214,17 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size, if (status < 0) // error return status; - if (id == libwebm::kMkvContentCompression) + if (id == libwebm::kMkvContentCompression) { ++compression_count; + if (compression_count > INT_MAX) + return E_PARSE_FAILED; + } - if (id == libwebm::kMkvContentEncryption) + if (id == libwebm::kMkvContentEncryption) { ++encryption_count; + if (encryption_count > INT_MAX) + return E_PARSE_FAILED; + } pos += size; // consume payload if (pos > stop) @@ -4218,16 +4235,16 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size, return -1; if (compression_count > 0) { - compression_entries_ = - new (std::nothrow) ContentCompression*[compression_count]; + compression_entries_ = new (std::nothrow) + ContentCompression*[static_cast(compression_count)]; if (!compression_entries_) return -1; compression_entries_end_ = compression_entries_; } if (encryption_count > 0) { - encryption_entries_ = - new (std::nothrow) ContentEncryption*[encryption_count]; + encryption_entries_ = new (std::nothrow) + ContentEncryption*[static_cast(encryption_count)]; if (!encryption_entries_) { delete[] compression_entries_; compression_entries_ = NULL; @@ -4918,7 +4935,7 @@ long Track::ParseContentEncodingsEntry(long long start, long long size) { const long long stop = start + size; // Count ContentEncoding elements. 
- int count = 0; + long long count = 0; while (pos < stop) { long long id, size; const long status = ParseElementHeader(pReader, pos, stop, id, size); @@ -4926,8 +4943,11 @@ long Track::ParseContentEncodingsEntry(long long start, long long size) { return status; // pos now designates start of element - if (id == libwebm::kMkvContentEncoding) + if (id == libwebm::kMkvContentEncoding) { ++count; + if (count > INT_MAX) + return E_PARSE_FAILED; + } pos += size; // consume payload if (pos > stop) @@ -4937,7 +4957,8 @@ long Track::ParseContentEncodingsEntry(long long start, long long size) { if (count <= 0) return -1; - content_encoding_entries_ = new (std::nothrow) ContentEncoding*[count]; + content_encoding_entries_ = + new (std::nothrow) ContentEncoding*[static_cast(count)]; if (!content_encoding_entries_) return -1; @@ -5229,6 +5250,8 @@ bool Projection::Parse(IMkvReader* reader, long long start, long long size, projection_ptr->type = static_cast(projection_type); } else if (child_id == libwebm::kMkvProjectionPrivate) { + if (projection_ptr->private_data != NULL) + return false; unsigned char* data = SafeArrayAlloc(1, child_size); if (data == NULL) @@ -5286,6 +5309,7 @@ VideoTrack::VideoTrack(Segment* pSegment, long long element_start, m_projection(NULL) {} VideoTrack::~VideoTrack() { + delete[] m_colour_space; delete m_colour; delete m_projection; } @@ -5307,7 +5331,7 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info, long long stereo_mode = 0; double rate = 0.0; - char* colour_space = NULL; + std::unique_ptr colour_space_ptr; IMkvReader* const pReader = pSegment->m_pReader; @@ -5384,9 +5408,11 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info, projection_ptr.reset(projection); } } else if (id == libwebm::kMkvColourSpace) { + char* colour_space = NULL; const long status = UnserializeString(pReader, pos, size, colour_space); if (status < 0) return status; + colour_space_ptr.reset(colour_space); } pos += size; // consume payload @@ -5418,7 +5444,7 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info, pTrack->m_stereo_mode = stereo_mode; pTrack->m_rate = rate; pTrack->m_colour = colour_ptr.release(); - pTrack->m_colour_space = colour_space; + pTrack->m_colour_space = colour_space_ptr.release(); pTrack->m_projection = projection_ptr.release(); pResult = pTrack; @@ -5648,7 +5674,7 @@ long Tracks::Parse() { const long long stop = m_start + m_size; IMkvReader* const pReader = m_pSegment->m_pReader; - int count = 0; + long long count = 0; long long pos = m_start; while (pos < stop) { @@ -5662,8 +5688,11 @@ long Tracks::Parse() { if (size == 0) // weird continue; - if (id == libwebm::kMkvTrackEntry) + if (id == libwebm::kMkvTrackEntry) { ++count; + if (count > INT_MAX) + return E_PARSE_FAILED; + } pos += size; // consume payload if (pos > stop) @@ -5676,7 +5705,7 @@ long Tracks::Parse() { if (count <= 0) return 0; // success - m_trackEntries = new (std::nothrow) Track*[count]; + m_trackEntries = new (std::nothrow) Track*[static_cast(count)]; if (m_trackEntries == NULL) return -1; From 69d08cb9d33af163ebef67d8c4dc45c897106905 Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 6 Nov 2021 10:42:46 -0700 Subject: [PATCH 164/926] vp8_update_rate_correction_factors: fix integer overflow the intermediate value in the correction_factor calculation may exceed integer bounds Bug: b/189602769 Change-Id: I75726b12f3095663911d78333f3ea26eb6dee21e --- vp8/encoder/ratectrl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vp8/encoder/ratectrl.c 
b/vp8/encoder/ratectrl.c index 4b76cc6429..d591680ce3 100644 --- a/vp8/encoder/ratectrl.c +++ b/vp8/encoder/ratectrl.c @@ -1079,8 +1079,8 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var) { /* Work out a size correction factor. */ if (projected_size_based_on_q > 0) { - correction_factor = - (100 * cpi->projected_frame_size) / projected_size_based_on_q; + correction_factor = (int)((100 * (int64_t)cpi->projected_frame_size) / + projected_size_based_on_q); } /* More heavily damped adjustment used if we have been oscillating From 23796337ce5b0a2f58eb7386c9aded5e6a4b84f6 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 8 Nov 2021 12:57:12 -0800 Subject: [PATCH 165/926] vp8,calc_pframe_target_size: fix integer overflow this is similar to the fix for calc_iframe_target_size: 5f345a924 Avoid overflow in calc_iframe_target_size Bug: chromium:1264506 Change-Id: I2f0e161cf9da59ca0724692d581f1594c8098ebb --- vp8/encoder/ratectrl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c index d591680ce3..3df34009ab 100644 --- a/vp8/encoder/ratectrl.c +++ b/vp8/encoder/ratectrl.c @@ -781,6 +781,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { } } else { int percent_high = 0; + int64_t target = cpi->this_frame_target; if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) && (cpi->buffer_level > cpi->oxcf.optimal_buffer_level)) { @@ -798,7 +799,9 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { percent_high = 0; } - cpi->this_frame_target += (cpi->this_frame_target * percent_high) / 200; + target += (target * percent_high) / 200; + target = VPXMIN(target, INT_MAX); + cpi->this_frame_target = (int)target; /* Are we allowing control of active_worst_allowed_q according * to buffer level. 
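
Both ratectrl.c fixes above apply the same defensive pattern: perform the
arithmetic in a 64-bit intermediate so the multiply cannot wrap, then clamp
before narrowing back to int. A minimal standalone sketch of the pattern is
below; the scale_target helper is hypothetical, written only to mirror the
calc_pframe_target_size change, and is not code from either patch.

#include <limits.h>
#include <stdint.h>

/* Hypothetical helper sketching the widen-then-clamp pattern used by the
 * two vp8/encoder/ratectrl.c fixes above. */
static int scale_target(int frame_target, int percent_high) {
  int64_t target = frame_target;            /* widen before multiplying */
  target += (target * percent_high) / 200;  /* cannot wrap in 64 bits */
  if (target > INT_MAX) target = INT_MAX;   /* clamp before narrowing */
  return (int)target;
}

The clamp matters as much as the widening: without it, narrowing the 64-bit
result back to int would just reintroduce the implementation-defined
truncation the patch is trying to avoid.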
From 40c21ff6fe32c12d7e0a7f66b0a2f7ca67a26695 Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 6 Nov 2021 10:42:37 -0700 Subject: [PATCH 166/926] video_source.h,ReallocImage: quiet implicit conv warning with -fsanitize=undefined test/video_source.h:194:33: runtime error: implicit conversion from type 'int' of value -32 (32-bit, signed) to type 'unsigned int' changed the value to 4294967264 (32-bit, unsigned) Change-Id: I92013086d517fecf01c9e4cdfe6737b8ce733a1f --- test/video_source.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/video_source.h b/test/video_source.h index e9340f21e9..2ba3f64211 100644 --- a/test/video_source.h +++ b/test/video_source.h @@ -191,7 +191,7 @@ class DummyVideoSource : public VideoSource { void ReallocImage() { vpx_img_free(img_); img_ = vpx_img_alloc(NULL, format_, width_, height_, 32); - raw_sz_ = ((img_->w + 31) & ~31) * img_->h * img_->bps / 8; + raw_sz_ = ((img_->w + 31) & ~31u) * img_->h * img_->bps / 8; } vpx_image_t *img_; From 1676cddaaa4feebc766c64767f035ca9b0e5739f Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 6 Nov 2021 16:33:00 -0700 Subject: [PATCH 167/926] vp8: fix some implicit signed -> unsigned conv warnings and vice-versa mostly when dealing with bitmasks w/clang-11 -fsanitize=undefined Change-Id: I6d8f676bf87679ba1dad9cb7f55eea172103d9d3 --- vp8/decoder/decodeframe.c | 4 ++-- vp8/encoder/encodeframe.c | 4 ++-- vp8/encoder/encodemv.c | 2 +- vp8/encoder/lookahead.c | 4 ++-- vp8/encoder/ratectrl.c | 2 +- vpx_ports/x86.h | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/vp8/decoder/decodeframe.c b/vp8/decoder/decodeframe.c index 67c254fa14..1c1566766b 100644 --- a/vp8/decoder/decodeframe.c +++ b/vp8/decoder/decodeframe.c @@ -872,8 +872,8 @@ static void init_frame(VP8D_COMP *pbi) { xd->mode_info_stride = pc->mode_info_stride; xd->corrupted = 0; /* init without corruption */ - xd->fullpixel_mask = 0xffffffff; - if (pc->full_pixel) xd->fullpixel_mask = 0xfffffff8; + xd->fullpixel_mask = ~0; + if (pc->full_pixel) xd->fullpixel_mask = ~7; } int vp8_decode_frame(VP8D_COMP *pbi) { diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index 2f84381d24..69271f1a73 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -634,8 +634,8 @@ static void init_encode_frame_mb_context(VP8_COMP *cpi) { cpi->prob_last_coded, cpi->prob_gf_coded); } - xd->fullpixel_mask = 0xffffffff; - if (cm->full_pixel) xd->fullpixel_mask = 0xfffffff8; + xd->fullpixel_mask = ~0; + if (cm->full_pixel) xd->fullpixel_mask = ~7; vp8_zero(x->coef_counts); vp8_zero(x->ymode_count); diff --git a/vp8/encoder/encodemv.c b/vp8/encoder/encodemv.c index 04adf105b9..ff38965393 100644 --- a/vp8/encoder/encodemv.c +++ b/vp8/encoder/encodemv.c @@ -160,7 +160,7 @@ static void calc_prob(vp8_prob *p, const unsigned int ct[2]) { const unsigned int tot = ct[0] + ct[1]; if (tot) { - const vp8_prob x = ((ct[0] * 255) / tot) & -2; + const vp8_prob x = ((ct[0] * 255) / tot) & ~1u; *p = x ? 
x : 1; } } diff --git a/vp8/encoder/lookahead.c b/vp8/encoder/lookahead.c index 37aa9eee84..49f851d019 100644 --- a/vp8/encoder/lookahead.c +++ b/vp8/encoder/lookahead.c @@ -66,8 +66,8 @@ struct lookahead_ctx *vp8_lookahead_init(unsigned int width, depth += 1; /* Align the buffer dimensions */ - width = (width + 15) & ~15; - height = (height + 15) & ~15; + width = (width + 15) & ~15u; + height = (height + 15) & ~15u; /* Allocate the lookahead structures */ ctx = calloc(1, sizeof(*ctx)); diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c index 4b76cc6429..0a346ccfbe 100644 --- a/vp8/encoder/ratectrl.c +++ b/vp8/encoder/ratectrl.c @@ -314,7 +314,7 @@ static void calc_iframe_target_size(VP8_COMP *cpi) { * bandwidth per second * fraction of the initial buffer * level */ - target = cpi->oxcf.starting_buffer_level / 2; + target = (uint64_t)cpi->oxcf.starting_buffer_level / 2; if (target > cpi->oxcf.target_bandwidth * 3 / 2) { target = cpi->oxcf.target_bandwidth * 3 / 2; diff --git a/vpx_ports/x86.h b/vpx_ports/x86.h index 4d5391b78d..651ff64606 100644 --- a/vpx_ports/x86.h +++ b/vpx_ports/x86.h @@ -391,7 +391,7 @@ static INLINE unsigned int x87_set_double_precision(void) { // Reserved 01B // Double Precision (53-Bits) 10B // Extended Precision (64-Bits) 11B - x87_set_control_word((mode & ~0x300) | 0x200); + x87_set_control_word((mode & ~0x300u) | 0x200u); return mode; } From 2e73da326a26ddc367317e860e24e274947c26d8 Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 6 Nov 2021 16:43:11 -0700 Subject: [PATCH 168/926] mem_sse2.h: storeu_uint32 -> storeu_int32 this changes the parameter to int32_t which matches the type with usage of this call using _mm_cvtsi128_si32() as a parameter. quiets an implicit conversion warning with clang-11 -fsanitize=undefined Change-Id: I1e9e9ffac5d2996962d29611458311221eca8ea0 --- vp8/common/x86/bilinear_filter_sse2.c | 4 ++-- vpx_dsp/x86/loopfilter_sse2.c | 16 ++++++++-------- vpx_dsp/x86/mem_sse2.h | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/vp8/common/x86/bilinear_filter_sse2.c b/vp8/common/x86/bilinear_filter_sse2.c index 9bf65d8045..ff6cbbd68c 100644 --- a/vp8/common/x86/bilinear_filter_sse2.c +++ b/vp8/common/x86/bilinear_filter_sse2.c @@ -313,10 +313,10 @@ static INLINE void vertical_4x4(uint16_t *src, uint8_t *dst, const int stride, const __m128i compensated = _mm_add_epi16(sum, round_factor); const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT); __m128i packed = _mm_packus_epi16(shifted, shifted); - storeu_uint32(dst, _mm_cvtsi128_si32(packed)); + storeu_int32(dst, _mm_cvtsi128_si32(packed)); packed = _mm_srli_si128(packed, 4); dst += stride; - storeu_uint32(dst, _mm_cvtsi128_si32(packed)); + storeu_int32(dst, _mm_cvtsi128_si32(packed)); dst += stride; src += 8; } diff --git a/vpx_dsp/x86/loopfilter_sse2.c b/vpx_dsp/x86/loopfilter_sse2.c index b6ff24834b..347c9fdbe9 100644 --- a/vpx_dsp/x86/loopfilter_sse2.c +++ b/vpx_dsp/x86/loopfilter_sse2.c @@ -211,21 +211,21 @@ void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0); - storeu_uint32(s + 0 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); + storeu_int32(s + 0 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); ps1ps0 = _mm_srli_si128(ps1ps0, 4); - storeu_uint32(s + 1 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); + storeu_int32(s + 1 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); ps1ps0 = _mm_srli_si128(ps1ps0, 4); - storeu_uint32(s + 2 * pitch - 2, 
_mm_cvtsi128_si32(ps1ps0)); + storeu_int32(s + 2 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); ps1ps0 = _mm_srli_si128(ps1ps0, 4); - storeu_uint32(s + 3 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); + storeu_int32(s + 3 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); - storeu_uint32(s + 4 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); + storeu_int32(s + 4 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); qs1qs0 = _mm_srli_si128(qs1qs0, 4); - storeu_uint32(s + 5 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); + storeu_int32(s + 5 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); qs1qs0 = _mm_srli_si128(qs1qs0, 4); - storeu_uint32(s + 6 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); + storeu_int32(s + 6 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); qs1qs0 = _mm_srli_si128(qs1qs0, 4); - storeu_uint32(s + 7 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); + storeu_int32(s + 7 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); } void vpx_lpf_horizontal_16_sse2(unsigned char *s, int pitch, diff --git a/vpx_dsp/x86/mem_sse2.h b/vpx_dsp/x86/mem_sse2.h index 258ab38e60..75fa2b0b72 100644 --- a/vpx_dsp/x86/mem_sse2.h +++ b/vpx_dsp/x86/mem_sse2.h @@ -16,7 +16,7 @@ #include "./vpx_config.h" -static INLINE void storeu_uint32(void *dst, uint32_t v) { +static INLINE void storeu_int32(void *dst, int32_t v) { memcpy(dst, &v, sizeof(v)); } From 16333de2890ea84fff5c2d788003f4eea2ce5600 Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 6 Nov 2021 16:48:13 -0700 Subject: [PATCH 169/926] mem_sse2.h: loadu_uint32 -> loadu_int32 this changes the return to int32_t which matches the type with usage of this call as input to _mm_cvtsi32_si128(), _mm_set_epi32(), etc. fixes implicit conversion warning with clang-11 -fsanitize=undefined Change-Id: I1425f12d4f79155dd5d7af0eb00fbdb9f1940544 --- vpx_dsp/x86/avg_pred_sse2.c | 6 +++--- vpx_dsp/x86/mem_sse2.h | 4 ++-- vpx_dsp/x86/variance_sse2.c | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/vpx_dsp/x86/avg_pred_sse2.c b/vpx_dsp/x86/avg_pred_sse2.c index e4e1e0e7a2..c6e70f744e 100644 --- a/vpx_dsp/x86/avg_pred_sse2.c +++ b/vpx_dsp/x86/avg_pred_sse2.c @@ -46,9 +46,9 @@ void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, r = _mm_loadu_si128((const __m128i *)ref); ref += 16; } else if (width == 4) { - r = _mm_set_epi32(loadu_uint32(ref + 3 * ref_stride), - loadu_uint32(ref + 2 * ref_stride), - loadu_uint32(ref + ref_stride), loadu_uint32(ref)); + r = _mm_set_epi32(loadu_int32(ref + 3 * ref_stride), + loadu_int32(ref + 2 * ref_stride), + loadu_int32(ref + ref_stride), loadu_int32(ref)); ref += 4 * ref_stride; } else { diff --git a/vpx_dsp/x86/mem_sse2.h b/vpx_dsp/x86/mem_sse2.h index 75fa2b0b72..8b6d4d1dd4 100644 --- a/vpx_dsp/x86/mem_sse2.h +++ b/vpx_dsp/x86/mem_sse2.h @@ -20,8 +20,8 @@ static INLINE void storeu_int32(void *dst, int32_t v) { memcpy(dst, &v, sizeof(v)); } -static INLINE uint32_t loadu_uint32(const void *src) { - uint32_t v; +static INLINE int32_t loadu_int32(const void *src) { + int32_t v; memcpy(&v, src, sizeof(v)); return v; } diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c index 37ef64ecaa..67645c57ac 100644 --- a/vpx_dsp/x86/variance_sse2.c +++ b/vpx_dsp/x86/variance_sse2.c @@ -36,8 +36,8 @@ unsigned int vpx_get_mb_ss_sse2(const int16_t *src_ptr) { } static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) { - const __m128i p0 = _mm_cvtsi32_si128(loadu_uint32(p + 0 * stride)); - const __m128i p1 = _mm_cvtsi32_si128(loadu_uint32(p + 1 * stride)); + const __m128i p0 = _mm_cvtsi32_si128(loadu_int32(p + 0 * stride)); + const __m128i p1 = 
_mm_cvtsi32_si128(loadu_int32(p + 1 * stride)); const __m128i p01 = _mm_unpacklo_epi32(p0, p1); return _mm_unpacklo_epi8(p01, _mm_setzero_si128()); } From 888bafc78d8bddb5cfc4262c93f456c812763571 Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 6 Nov 2021 10:42:46 -0700 Subject: [PATCH 170/926] vp8 encoder: fix some integer overflows cap the bitrate to 1000Mbps to avoid many instances of bitrate * 3 / 2 overflowing. this adds coverage for 2048x2048 in the default test for VP8 with TODOs for issues at that resolution for VP9 and at max resolution for both. Bug: b/189602769 Bug: chromium:1264506 Bug: webm:1748 Bug: webm:1749 Bug: webm:1750 Bug: webm:1751 Change-Id: Iedee4dd8d3609c2504271f94d22433dfcd828429 --- test/realtime_test.cc | 43 ++++++++++++++++++++++++++++++++++++++----- vp8/vp8_cx_iface.c | 4 +++- 2 files changed, 41 insertions(+), 6 deletions(-) diff --git a/test/realtime_test.cc b/test/realtime_test.cc index 63a5347d99..b32a35513c 100644 --- a/test/realtime_test.cc +++ b/test/realtime_test.cc @@ -7,6 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#include + #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/util.h" @@ -52,6 +54,22 @@ class RealtimeTest frame_packets_++; } + bool IsVP9() const { +#if CONFIG_VP9_ENCODER + return codec_ == &libvpx_test::kVP9; +#else + return false; +#endif + } + + void TestIntegerOverflow(unsigned int width, unsigned int height) { + ::libvpx_test::RandomVideoSource video; + video.SetSize(width, height); + video.set_limit(20); + cfg_.rc_target_bitrate = UINT_MAX; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + int frame_packets_; }; @@ -64,11 +82,26 @@ TEST_P(RealtimeTest, RealtimeFirstPassProducesFrames) { } TEST_P(RealtimeTest, IntegerOverflow) { - ::libvpx_test::RandomVideoSource video; - video.SetSize(800, 480); - video.set_limit(20); - cfg_.rc_target_bitrate = 140000000; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + if (IsVP9()) { + // TODO(https://crbug.com/webm/1749): This should match VP8. + TestIntegerOverflow(800, 480); + } else { + TestIntegerOverflow(2048, 2048); + } +} + +TEST_P(RealtimeTest, IntegerOverflowLarge) { + if (IsVP9()) { + GTEST_SKIP() << "TODO(https://crbug.com/webm/1750): Enable this test after " + "undefined sanitizer warnings are fixed."; + // TestIntegerOverflow(16384, 16384); + } else { + GTEST_SKIP() + << "TODO(https://crbug.com/webm/1748,https://crbug.com/webm/1751):" + << " Enable this test after bitstream errors & undefined sanitizer " + "warnings are fixed."; + // TestIntegerOverflow(16383, 16383); + } } VP8_INSTANTIATE_TEST_SUITE(RealtimeTest, diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 893b7a5132..ab954c46f2 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -339,7 +339,9 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf, oxcf->end_usage = USAGE_CONSTANT_QUALITY; } - oxcf->target_bandwidth = cfg.rc_target_bitrate; + // Cap the target rate to 1000 Mbps to avoid some integer overflows in + // target bandwidth calculations. 
+ oxcf->target_bandwidth = VPXMIN(cfg.rc_target_bitrate, 1000000);
 oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct;
 oxcf->gf_cbr_boost_pct = vp8_cfg.gf_cbr_boost_pct;

From 7e4c6fed0c9b212dcee4787d461042c94dee4468 Mon Sep 17 00:00:00 2001
From: James Zern
Date: Thu, 11 Nov 2021 13:42:35 -0800
Subject: [PATCH 171/926] test/DummyVideoSource::ReallocImage: check img_ alloc

prevents a crash on the next line accessing img_ members

Bug: aomedia:3191
Change-Id: I430fb4ee662b0001629096eb8b554f8a2b30cce0
---
 test/video_source.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/video_source.h b/test/video_source.h
index 2ba3f64211..7a2dbe7ef7 100644
--- a/test/video_source.h
+++ b/test/video_source.h
@@ -191,6 +191,7 @@ class DummyVideoSource : public VideoSource {
 void ReallocImage() {
 vpx_img_free(img_);
 img_ = vpx_img_alloc(NULL, format_, width_, height_, 32);
+ ASSERT_NE(img_, nullptr);
 raw_sz_ = ((img_->w + 31) & ~31u) * img_->h * img_->bps / 8;
 }

 vpx_image_t *img_;

From 9fb780c5e75584fe198a89f0cf1898cebb542104 Mon Sep 17 00:00:00 2001
From: Mikko Koivisto
Date: Mon, 15 Nov 2021 18:47:05 +0000
Subject: [PATCH 172/926] vp9: Fix multiplication overflow

Fix UBSan error reported from aosp Cuttlefish device:

/vp9/encoder/vp9_ratectrl.c:238:33: unsigned integer overflow:
2500000 * 1800 cannot be represented in type 'unsigned int'

...by casting the operand and the result of the multiplication to a
64-bit integer.

Test: vp9 webrtc streaming with Cuttlefish
Change-Id: Id5bb3d4071a96179caffae0829d3cc4e48c7614b
---
 vp9/encoder/vp9_ratectrl.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index e38464c72c..ac346115fb 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -223,9 +223,10 @@ int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) {
 if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
 if (oxcf->rc_max_inter_bitrate_pct) {
- const int max_rate =
- rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
- target = VPXMIN(target, max_rate);
+ const int64_t max_rate =
+ (int64_t)rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
+ // target is of type int and VPXMIN cannot evaluate to larger than target
+ target = (int)VPXMIN(target, max_rate);
 }
 return target;
 }
@@ -234,9 +235,9 @@ int vp9_rc_clamp_iframe_target_size(const VP9_COMP *const cpi, int target) {
 const RATE_CONTROL *rc = &cpi->rc;
 const VP9EncoderConfig *oxcf = &cpi->oxcf;
 if (oxcf->rc_max_intra_bitrate_pct) {
- const int max_rate =
- rc->avg_frame_bandwidth * oxcf->rc_max_intra_bitrate_pct / 100;
- target = VPXMIN(target, max_rate);
+ const int64_t max_rate =
+ (int64_t)rc->avg_frame_bandwidth * oxcf->rc_max_intra_bitrate_pct / 100;
+ target = (int)VPXMIN(target, max_rate);
 }
 if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
 return target;

From bf93b61f789bc890b83b0fd34bb9665ec6137638 Mon Sep 17 00:00:00 2001
From: Mikko Koivisto
Date: Tue, 16 Nov 2021 12:29:21 +0000
Subject: [PATCH 173/926] vp9: fix ubsan sub-overflows

Fix errors reported by UBSan diagnostics:

1. /vp9/encoder/vp9_pickmode.c:308:29: unsigned integer overflow:
99 - 100 cannot be represented in type 'unsigned int'

2. /vp9/encoder/vp9_pickmode.c:330:27: unsigned integer overflow:
21976 - 21978 cannot be represented in type 'unsigned int'

3.
/vp9/encoder/vp9_pickmode.c:468:13: unsigned integer overflow: 18852144 - 18852149 cannot be represented in type 'unsigned int' (Notice that line numbers might vary a bit because fixes have been applied incrementally i.e. fix for error #1 affects line number reported in #2) Fix by calculating difference instead of wrapping around to a value near maximum. Test: Cuttlefish webrtc with VP9 codec Change-Id: I4f85712028647e915a4e2da31e4b0a266e9e2705 --- vp9/encoder/vp9_pickmode.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 695fd484fc..1a66c0a867 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -268,6 +268,7 @@ static void block_variance(const uint8_t *src, int src_stride, #endif uint32_t *sse8x8, int *sum8x8, uint32_t *var8x8) { int i, j, k = 0; + uint32_t k_sqr = 0; *sse = 0; *sum = 0; @@ -305,7 +306,8 @@ static void block_variance(const uint8_t *src, int src_stride, #endif *sse += sse8x8[k]; *sum += sum8x8[k]; - var8x8[k] = sse8x8[k] - (uint32_t)(((int64_t)sum8x8[k] * sum8x8[k]) >> 6); + k_sqr = (uint32_t)(((int64_t)sum8x8[k] * sum8x8[k]) >> 6); + var8x8[k] = sse8x8[k] > k_sqr ? sse8x8[k] - k_sqr : k_sqr - sse8x8[k]; k++; } } @@ -319,6 +321,7 @@ static void calculate_variance(int bw, int bh, TX_SIZE tx_size, const int nw = 1 << (bw - b_width_log2_lookup[unit_size]); const int nh = 1 << (bh - b_height_log2_lookup[unit_size]); int i, j, k = 0; + uint32_t k_sqr = 0; for (i = 0; i < nh; i += 2) { for (j = 0; j < nw; j += 2) { @@ -326,9 +329,10 @@ static void calculate_variance(int bw, int bh, TX_SIZE tx_size, sse_i[(i + 1) * nw + j] + sse_i[(i + 1) * nw + j + 1]; sum_o[k] = sum_i[i * nw + j] + sum_i[i * nw + j + 1] + sum_i[(i + 1) * nw + j] + sum_i[(i + 1) * nw + j + 1]; - var_o[k] = sse_o[k] - (uint32_t)(((int64_t)sum_o[k] * sum_o[k]) >> - (b_width_log2_lookup[unit_size] + - b_height_log2_lookup[unit_size] + 6)); + k_sqr = (uint32_t)(((int64_t)sum_o[k] * sum_o[k]) >> + (b_width_log2_lookup[unit_size] + + b_height_log2_lookup[unit_size] + 6)); + var_o[k] = sse_o[k] > k_sqr ? sse_o[k] - k_sqr : k_sqr - sse_o[k]; k++; } } @@ -452,6 +456,7 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, unsigned int var8x8[64] = { 0 }; TX_SIZE tx_size; int i, k; + uint32_t sum_sqr; #if CONFIG_VP9_HIGHBITDEPTH const vpx_bit_depth_t bd = cpi->common.bit_depth; #endif @@ -463,7 +468,8 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, cpi->common.use_highbitdepth, bd, #endif sse8x8, sum8x8, var8x8); - var = sse - (unsigned int)(((int64_t)sum * sum) >> (bw + bh + 4)); + sum_sqr = (uint32_t)((int64_t)sum * sum) >> (bw + bh + 4); + var = sse > sum_sqr ? sse - sum_sqr : sum_sqr - sse; *var_y = var; *sse_y = sse; From c59de7bc914d40305c1a7b066b9965d809d81533 Mon Sep 17 00:00:00 2001 From: Johann Date: Wed, 17 Nov 2021 06:02:15 +0900 Subject: [PATCH 174/926] MacOS 12 is darwin21 Remove -mmacosx-version-min. The library does not use any calls which are affected by the platform version. There is also no version 10.16 as it went from 10.15 to 11 and now to 12. At some point it may be good to clarify that the bare -darwin- target is for iOS and the -darwinN- targets are for macOS. 
Change-Id: I2fd5f7cae2637905acf3ab77bfddfbe367abbb68 --- build/make/configure.sh | 8 ++++---- configure | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/build/make/configure.sh b/build/make/configure.sh index 81d30a16c7..b24e79a0d2 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -774,7 +774,7 @@ process_common_toolchain() { tgt_isa=x86_64 tgt_os=`echo $gcctarget | sed 's/.*\(darwin1[0-9]\).*/\1/'` ;; - *darwin20*) + *darwin2[0-1]*) tgt_isa=`uname -m` tgt_os=`echo $gcctarget | sed 's/.*\(darwin2[0-9]\).*/\1/'` ;; @@ -918,9 +918,9 @@ process_common_toolchain() { add_cflags "-mmacosx-version-min=10.15" add_ldflags "-mmacosx-version-min=10.15" ;; - *-darwin20-*) - add_cflags "-mmacosx-version-min=10.16 -arch ${toolchain%%-*}" - add_ldflags "-mmacosx-version-min=10.16 -arch ${toolchain%%-*}" + *-darwin2[0-1]-*) + add_cflags "-arch ${toolchain%%-*}" + add_ldflags "-arch ${toolchain%%-*}" ;; *-iphonesimulator-*) add_cflags "-miphoneos-version-min=${IOS_VERSION_MIN}" diff --git a/configure b/configure index e3babbe824..d39db6cb08 100755 --- a/configure +++ b/configure @@ -100,6 +100,7 @@ EOF all_platforms="${all_platforms} arm64-android-gcc" all_platforms="${all_platforms} arm64-darwin-gcc" all_platforms="${all_platforms} arm64-darwin20-gcc" +all_platforms="${all_platforms} arm64-darwin21-gcc" all_platforms="${all_platforms} arm64-linux-gcc" all_platforms="${all_platforms} arm64-win64-gcc" all_platforms="${all_platforms} arm64-win64-vs15" @@ -152,6 +153,7 @@ all_platforms="${all_platforms} x86_64-darwin17-gcc" all_platforms="${all_platforms} x86_64-darwin18-gcc" all_platforms="${all_platforms} x86_64-darwin19-gcc" all_platforms="${all_platforms} x86_64-darwin20-gcc" +all_platforms="${all_platforms} x86_64-darwin21-gcc" all_platforms="${all_platforms} x86_64-iphonesimulator-gcc" all_platforms="${all_platforms} x86_64-linux-gcc" all_platforms="${all_platforms} x86_64-linux-icc" From c0ba429863bb9f85484430f1c62edf636e4fe6de Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 15 Nov 2021 19:20:36 -0800 Subject: [PATCH 175/926] encode_api_test.cc: unify kCodecs[] definitions and rename the table to kCodecIfaces[] to be a little more specific and avoid shadowing kCodecs[] in SetRoi() Change-Id: I64905f48d8bf76e812bdba8374b82e3f7654686f --- test/encode_api_test.cc | 42 ++++++++++++++++------------------------- 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index dec19b2268..6f940b629e 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -19,7 +19,14 @@ namespace { -#define NELEMENTS(x) static_cast(sizeof(x) / sizeof(x[0])) +const vpx_codec_iface_t *kCodecIfaces[] = { +#if CONFIG_VP8_ENCODER + &vpx_codec_vp8_cx_algo, +#endif +#if CONFIG_VP9_ENCODER + &vpx_codec_vp9_cx_algo, +#endif +}; bool IsVP9(const vpx_codec_iface_t *iface) { static const char kVP9Name[] = "WebM Project VP9"; @@ -28,14 +35,6 @@ bool IsVP9(const vpx_codec_iface_t *iface) { } TEST(EncodeAPI, InvalidParams) { - static const vpx_codec_iface_t *kCodecs[] = { -#if CONFIG_VP8_ENCODER - &vpx_codec_vp8_cx_algo, -#endif -#if CONFIG_VP9_ENCODER - &vpx_codec_vp9_cx_algo, -#endif - }; uint8_t buf[1] = { 0 }; vpx_image_t img; vpx_codec_ctx_t enc; @@ -58,17 +57,17 @@ TEST(EncodeAPI, InvalidParams) { vpx_codec_enc_config_default(nullptr, &cfg, 0)); EXPECT_NE(vpx_codec_error(nullptr), nullptr); - for (int i = 0; i < NELEMENTS(kCodecs); ++i) { - SCOPED_TRACE(vpx_codec_iface_name(kCodecs[i])); + for (const auto *iface : 
kCodecIfaces) { + SCOPED_TRACE(vpx_codec_iface_name(iface)); EXPECT_EQ(VPX_CODEC_INVALID_PARAM, - vpx_codec_enc_init(nullptr, kCodecs[i], nullptr, 0)); + vpx_codec_enc_init(nullptr, iface, nullptr, 0)); EXPECT_EQ(VPX_CODEC_INVALID_PARAM, - vpx_codec_enc_init(&enc, kCodecs[i], nullptr, 0)); + vpx_codec_enc_init(&enc, iface, nullptr, 0)); EXPECT_EQ(VPX_CODEC_INVALID_PARAM, - vpx_codec_enc_config_default(kCodecs[i], &cfg, 1)); + vpx_codec_enc_config_default(iface, &cfg, 1)); - EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(kCodecs[i], &cfg, 0)); - EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_init(&enc, kCodecs[i], &cfg, 0)); + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(iface, &cfg, 0)); + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_init(&enc, iface, &cfg, 0)); EXPECT_EQ(VPX_CODEC_OK, vpx_codec_encode(&enc, nullptr, 0, 0, 0, 0)); EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&enc)); @@ -124,14 +123,6 @@ TEST(EncodeAPI, ImageSizeSetting) { // (ts_target_bitrate[]) to 0 for both layers. This should fail independent of // CONFIG_MULTI_RES_ENCODING. TEST(EncodeAPI, MultiResEncode) { - static const vpx_codec_iface_t *kCodecs[] = { -#if CONFIG_VP8_ENCODER - &vpx_codec_vp8_cx_algo, -#endif -#if CONFIG_VP9_ENCODER - &vpx_codec_vp9_cx_algo, -#endif - }; const int width = 1280; const int height = 720; const int width_down = width / 2; @@ -139,8 +130,7 @@ TEST(EncodeAPI, MultiResEncode) { const int target_bitrate = 1000; const int framerate = 30; - for (int c = 0; c < NELEMENTS(kCodecs); ++c) { - const vpx_codec_iface_t *const iface = kCodecs[c]; + for (const auto *iface : kCodecIfaces) { vpx_codec_ctx_t enc[2]; vpx_codec_enc_cfg_t cfg[2]; vpx_rational_t dsf[2] = { { 2, 1 }, { 2, 1 } }; From 4a5a0a9a795782f33b8eb04461f9a5dfc9a146e1 Mon Sep 17 00:00:00 2001 From: Ilya Kurdyukov Date: Sat, 13 Nov 2021 18:22:14 +0700 Subject: [PATCH 176/926] faster vp8_regular_quantize_b_sse4_1 Gives 10% faster VP8 encoding in simple tests. This patch requires testing on wider datasets and encoder settings to see if this speedup is achieved on most data. 
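For context, a scalar model of the scan this patch vectorizes (an illustrative sketch only; the parameter names stand in for the BLOCK/BLOCKD fields and are not the encoder's actual interface). A coefficient is kept when it is nonzero and its zbin-adjusted magnitude clears a boost threshold that grows with the current run of skipped positions; the threshold index resets after every kept coefficient, and eob records the last kept position:

    /* Illustrative scalar model of the eob scan (names are assumptions). */
    static int model_eob_scan(const short x_minus_zbin[16], const short y[16],
                              const short *zbin_boost) {
      static const int zig_zag[16] = { 0, 1, 4, 8,  5,  2,  3,  6,
                                       9, 12, 13, 10, 7, 11, 14, 15 };
      int eob = -1, zrun = 0, i;
      for (i = 0; i < 16; ++i) {
        const int rc = zig_zag[i]; /* visit coefficients in zig-zag order */
        if (y[rc] != 0 && x_minus_zbin[rc] >= zbin_boost[zrun]) {
          eob = i;  /* last kept position so far */
          zrun = 0; /* the boost run restarts after a kept coefficient */
        } else {
          ++zrun; /* a longer zero run selects a larger boost threshold */
        }
      }
      return eob + 1; /* the patch stores this as *d->eob */
    }

The SSE4.1 loop below reaches the same eob without the per-coefficient serial dependency: it compares all 16 zig-zag-ordered values against the boost thresholds at once, finds the next kept position with _mm_movemask_epi8() plus get_lsb(), and reloads the boost vectors relative to the new eob.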
Change-Id: If8e04819623e78fff126c413db66c964c0b4c11a --- vp8/encoder/x86/quantize_sse4.c | 95 ++++++++++++++++++++------------- vpx_ports/bitops.h | 23 +++++++- 2 files changed, 78 insertions(+), 40 deletions(-) diff --git a/vp8/encoder/x86/quantize_sse4.c b/vp8/encoder/x86/quantize_sse4.c index 389c16705d..6d03365fcb 100644 --- a/vp8/encoder/x86/quantize_sse4.c +++ b/vp8/encoder/x86/quantize_sse4.c @@ -11,28 +11,14 @@ #include /* SSE4.1 */ #include "./vp8_rtcd.h" -#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */ #include "vp8/encoder/block.h" - -#define SELECT_EOB(i, z, x, y, q) \ - do { \ - short boost = *zbin_boost_ptr; \ - /* Technically _mm_extract_epi16() returns an int: */ \ - /* https://bugs.llvm.org/show_bug.cgi?id=41657 */ \ - short x_z = (short)_mm_extract_epi16(x, z); \ - short y_z = (short)_mm_extract_epi16(y, z); \ - int cmp = (x_z < boost) | (y_z == 0); \ - zbin_boost_ptr++; \ - if (cmp) break; \ - q = _mm_insert_epi16(q, y_z, z); \ - eob = i; \ - zbin_boost_ptr = b->zrun_zbin_boost; \ - } while (0) +#include "vpx_ports/bitops.h" /* get_lsb */ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) { - char eob = 0; + int eob = -1; short *zbin_boost_ptr = b->zrun_zbin_boost; - + __m128i zbin_boost0 = _mm_load_si128((__m128i *)(zbin_boost_ptr)); + __m128i zbin_boost1 = _mm_load_si128((__m128i *)(zbin_boost_ptr + 8)); __m128i x0, x1, y0, y1, x_minus_zbin0, x_minus_zbin1, dqcoeff0, dqcoeff1; __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift)); __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8)); @@ -47,8 +33,12 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) { __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8)); __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); - __m128i qcoeff0 = _mm_setzero_si128(); - __m128i qcoeff1 = _mm_setzero_si128(); + __m128i qcoeff0, qcoeff1, t0, t1, x_shuf0, x_shuf1; + uint32_t mask, ymask; + DECLARE_ALIGNED(16, static const uint8_t, + zig_zag_mask[16]) = { 0, 1, 4, 8, 5, 2, 3, 6, + 9, 12, 13, 10, 7, 11, 14, 15 }; + DECLARE_ALIGNED(16, uint16_t, qcoeff[16]) = { 0 }; /* Duplicate to all lanes. */ zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0); @@ -88,23 +78,52 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) { y0 = _mm_sign_epi16(y0, z0); y1 = _mm_sign_epi16(y1, z1); - /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */ - SELECT_EOB(1, 0, x_minus_zbin0, y0, qcoeff0); - SELECT_EOB(2, 1, x_minus_zbin0, y0, qcoeff0); - SELECT_EOB(3, 4, x_minus_zbin0, y0, qcoeff0); - SELECT_EOB(4, 0, x_minus_zbin1, y1, qcoeff1); - SELECT_EOB(5, 5, x_minus_zbin0, y0, qcoeff0); - SELECT_EOB(6, 2, x_minus_zbin0, y0, qcoeff0); - SELECT_EOB(7, 3, x_minus_zbin0, y0, qcoeff0); - SELECT_EOB(8, 6, x_minus_zbin0, y0, qcoeff0); - SELECT_EOB(9, 1, x_minus_zbin1, y1, qcoeff1); - SELECT_EOB(10, 4, x_minus_zbin1, y1, qcoeff1); - SELECT_EOB(11, 5, x_minus_zbin1, y1, qcoeff1); - SELECT_EOB(12, 2, x_minus_zbin1, y1, qcoeff1); - SELECT_EOB(13, 7, x_minus_zbin0, y0, qcoeff0); - SELECT_EOB(14, 3, x_minus_zbin1, y1, qcoeff1); - SELECT_EOB(15, 6, x_minus_zbin1, y1, qcoeff1); - SELECT_EOB(16, 7, x_minus_zbin1, y1, qcoeff1); + { + const __m128i zig_zag_i16_0 = + _mm_setr_epi8(0, 1, 2, 3, 8, 9, 14, 15, 10, 11, 4, 5, 6, 7, 12, 13); + const __m128i zig_zag_i16_1 = + _mm_setr_epi8(0, 1, 6, 7, 8, 9, 2, 3, 14, 15, 4, 5, 10, 11, 12, 13); + + /* The first part of the zig zag needs a value + * from x_minus_zbin1 and vice versa. 
*/ + t1 = _mm_alignr_epi8(x_minus_zbin1, x_minus_zbin1, 2); + t0 = _mm_blend_epi16(x_minus_zbin0, t1, 0x80); + t1 = _mm_blend_epi16(t1, x_minus_zbin0, 0x80); + x_shuf0 = _mm_shuffle_epi8(t0, zig_zag_i16_0); + x_shuf1 = _mm_shuffle_epi8(t1, zig_zag_i16_1); + } + + /* Check if y is nonzero and put it in zig zag order. */ + t0 = _mm_packs_epi16(y0, y1); + t0 = _mm_cmpeq_epi8(t0, _mm_setzero_si128()); + t0 = _mm_shuffle_epi8(t0, _mm_load_si128((const __m128i *)zig_zag_mask)); + ymask = _mm_movemask_epi8(t0) ^ 0xffff; + + for (;;) { + t0 = _mm_cmpgt_epi16(zbin_boost0, x_shuf0); + t1 = _mm_cmpgt_epi16(zbin_boost1, x_shuf1); + t0 = _mm_packs_epi16(t0, t1); + mask = _mm_movemask_epi8(t0); + mask = ~mask & ymask; + if (!mask) break; + /* |eob| will contain the index of the next found element where: + * boost[i - old_eob - 1] <= x[zigzag[i]] && y[zigzag[i]] != 0 */ + eob = get_lsb(mask); + /* Need to clear the mask from processed elements so that + * they are no longer counted in the next iteration. */ + ymask &= ~1U << eob; + /* It's safe to read ahead of this buffer if struct VP8_COMP has at + * least 32 bytes before the zrun_zbin_boost_* fields (it has 384). + * Any data read outside of the buffer is masked by the updated |ymask|. */ + zbin_boost0 = _mm_loadu_si128((__m128i *)(zbin_boost_ptr - eob - 1)); + zbin_boost1 = _mm_loadu_si128((__m128i *)(zbin_boost_ptr - eob + 7)); + qcoeff[zig_zag_mask[eob]] = 0xffff; + } + + qcoeff0 = _mm_load_si128((__m128i *)(qcoeff)); + qcoeff1 = _mm_load_si128((__m128i *)(qcoeff + 8)); + qcoeff0 = _mm_and_si128(qcoeff0, y0); + qcoeff1 = _mm_and_si128(qcoeff1, y1); _mm_store_si128((__m128i *)(d->qcoeff), qcoeff0); _mm_store_si128((__m128i *)(d->qcoeff + 8), qcoeff1); @@ -115,5 +134,5 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) { _mm_store_si128((__m128i *)(d->dqcoeff), dqcoeff0); _mm_store_si128((__m128i *)(d->dqcoeff + 8), dqcoeff1); - *d->eob = eob; + *d->eob = eob + 1; } diff --git a/vpx_ports/bitops.h b/vpx_ports/bitops.h index 5b2f31cd11..1b5cdaa6dd 100644 --- a/vpx_ports/bitops.h +++ b/vpx_ports/bitops.h @@ -26,20 +26,32 @@ extern "C" { #endif -// These versions of get_msb() are only valid when n != 0 because all -// of the optimized versions are undefined when n == 0: +// These versions of get_lsb() and get_msb() are only valid when n != 0 +// because all of the optimized versions are undefined when n == 0: // https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html // use GNU builtins where available. #if defined(__GNUC__) && \ ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4) +static INLINE int get_lsb(unsigned int n) { + assert(n != 0); + return __builtin_ctz(n); +} + static INLINE int get_msb(unsigned int n) { assert(n != 0); return 31 ^ __builtin_clz(n); } #elif defined(USE_MSC_INTRINSICS) +#pragma intrinsic(_BitScanForward) #pragma intrinsic(_BitScanReverse) +static INLINE int get_lsb(unsigned int n) { + unsigned long first_set_bit; // NOLINT(runtime/int) + _BitScanForward(&first_set_bit, n); + return first_set_bit; +} + static INLINE int get_msb(unsigned int n) { unsigned long first_set_bit; assert(n != 0); @@ -48,6 +60,13 @@ static INLINE int get_msb(unsigned int n) { } #undef USE_MSC_INTRINSICS #else +static INLINE int get_lsb(unsigned int n) { + int i; + assert(n != 0); + for (i = 0; i < 32 && !(n & 1); ++i) n >>= 1; + return i; +} + // Returns (int)floor(log2(n)). n must be > 0. 
static INLINE int get_msb(unsigned int n) { int log = 0; From 87ce2bc3e3266e670e0da71d7915c3c40e948c15 Mon Sep 17 00:00:00 2001 From: Ilya Kurdyukov Date: Wed, 17 Nov 2021 14:16:02 +0700 Subject: [PATCH 177/926] replaced bsr() with get_msb() from bitops.h The modified line should now compile into two instructions instead of four. Change-Id: Ie2eb6b13ff1e29b3107cb9e76f37ff9065504316 --- vp8/encoder/x86/vp8_quantize_ssse3.c | 33 +++++----------------- 1 file changed, 6 insertions(+), 27 deletions(-) diff --git a/vp8/encoder/x86/vp8_quantize_ssse3.c b/vp8/encoder/x86/vp8_quantize_ssse3.c index 147c30cc35..f6df146f08 100644 --- a/vp8/encoder/x86/vp8_quantize_ssse3.c +++ b/vp8/encoder/x86/vp8_quantize_ssse3.c @@ -12,31 +12,7 @@ #include "./vp8_rtcd.h" #include "vp8/encoder/block.h" - -/* bitscan reverse (bsr) */ -#if defined(_MSC_VER) -#include -#pragma intrinsic(_BitScanReverse) -static int bsr(int mask) { - unsigned long eob; - _BitScanReverse(&eob, mask); - eob++; - if (mask == 0) eob = 0; - return eob; -} -#else -static int bsr(int mask) { - int eob; -#if defined(__GNUC__) && __GNUC__ - __asm__ __volatile__("bsr %1, %0" : "=r"(eob) : "r"(mask) : "flags"); -#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) - asm volatile("bsr %1, %0" : "=r"(eob) : "r"(mask) : "flags"); -#endif - eob++; - if (mask == 0) eob = 0; - return eob; -} -#endif +#include "vpx_ports/bitops.h" /* get_msb */ void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) { int eob, mask; @@ -108,7 +84,10 @@ void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) { mask = _mm_movemask_epi8(x); - eob = bsr(mask); + /* x2 is needed to increase the result from non-zero masks by 1, + * +1 is needed to mask undefined behavior for a null argument, + * the result of get_msb(1) is 0 */ + eob = get_msb(mask * 2 + 1); - *d->eob = 0xFF & eob; + *d->eob = eob; } From 2d9e4d3c7a07aa50dc5e12c59d190f28b9e1bcb7 Mon Sep 17 00:00:00 2001 From: Fyodor Kyslov Date: Wed, 17 Nov 2021 13:15:00 -0800 Subject: [PATCH 178/926] vp9 encoder: fix some integer overflows cap bitrate to 1000Mbps, change bit-saving budget to int64_t; this makes test coverage for 2048x2048 the same as for vp8 Bug: webm:1749 Fixed: webm:1749 Change-Id: Ic58d73cb7529b0826d1f501ad09af8e80f706a6e --- test/realtime_test.cc | 9 +-------- vp9/encoder/vp9_bitstream.c | 2 +- vp9/vp9_cx_iface.c | 5 +++-- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/test/realtime_test.cc b/test/realtime_test.cc index b32a35513c..ab2080a85d 100644 --- a/test/realtime_test.cc +++ b/test/realtime_test.cc @@ -81,14 +81,7 @@ TEST_P(RealtimeTest, RealtimeFirstPassProducesFrames) { EXPECT_EQ(kFramesToEncode, frame_packets_); } -TEST_P(RealtimeTest, IntegerOverflow) { - if (IsVP9()) { - // TODO(https://crbug.com/webm/1749): This should match VP8.
- TestIntegerOverflow(800, 480); - } else { - TestIntegerOverflow(2048, 2048); - } -} +TEST_P(RealtimeTest, IntegerOverflow) { TestIntegerOverflow(2048, 2048); } TEST_P(RealtimeTest, IntegerOverflowLarge) { if (IsVP9()) { diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 3eff4ce830..c23e150a45 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -554,7 +554,7 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi, switch (cpi->sf.use_fast_coef_updates) { case TWO_LOOP: { /* dry run to see if there is any update at all needed */ - int savings = 0; + int64_t savings = 0; int update[2] = { 0, 0 }; for (i = 0; i < PLANE_TYPES; ++i) { for (j = 0; j < REF_TYPES; ++j) { diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 48d555532d..cc4081c4f5 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -523,8 +523,9 @@ static vpx_codec_err_t set_encoder_config( raw_target_rate = (unsigned int)((int64_t)oxcf->width * oxcf->height * oxcf->bit_depth * 3 * oxcf->init_framerate / 1000); - // Cap target bitrate to raw rate - cfg->rc_target_bitrate = VPXMIN(raw_target_rate, cfg->rc_target_bitrate); + // Cap target bitrate to raw rate or 1000Mbps, whichever is less + cfg->rc_target_bitrate = + VPXMIN(VPXMIN(raw_target_rate, cfg->rc_target_bitrate), 1000000); // Convert target bandwidth from Kbit/s to Bit/s oxcf->target_bandwidth = 1000 * (int64_t)cfg->rc_target_bitrate; From 1794f6db24d400ddc834b543510c547d777216cb Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 16 Nov 2021 18:21:34 -0800 Subject: [PATCH 179/926] vp9 encoder: fix row-mt crash w/thread config change previously row-mt would allocate thread data once, so increasing the number of threads with a config change would cause a heap overflow. Bug: chromium:1261415 Bug: chromium:1270689 Change-Id: I3c5ec8444ae91964fa34a19dd780bd2cbb0368bf --- test/encode_api_test.cc | 60 +++++++++++++++++++++ vp9/encoder/vp9_encoder.c | 25 ++------- vp9/encoder/vp9_ethread.c | 110 +++++++++++++++++++++++--------------- vp9/encoder/vp9_ethread.h | 5 ++ 4 files changed, 134 insertions(+), 66 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 6f940b629e..6f61c77502 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -10,10 +10,12 @@ #include #include +#include #include "third_party/googletest/src/include/gtest/gtest.h" #include "./vpx_config.h" +#include "test/video_source.h" #include "vpx/vp8cx.h" #include "vpx/vpx_encoder.h" @@ -300,4 +302,62 @@ TEST(EncodeAPI, SetRoi) { } } +void InitCodec(const vpx_codec_iface_t &iface, int width, int height, + vpx_codec_ctx_t *enc, vpx_codec_enc_cfg_t *cfg) { + ASSERT_EQ(vpx_codec_enc_config_default(&iface, cfg, 0), VPX_CODEC_OK); + cfg->g_w = width; + cfg->g_h = height; + cfg->g_lag_in_frames = 0; + cfg->g_pass = VPX_RC_ONE_PASS; + ASSERT_EQ(vpx_codec_enc_init(enc, &iface, cfg, 0), VPX_CODEC_OK); + + ASSERT_EQ(vpx_codec_control_(enc, VP8E_SET_CPUUSED, 2), VPX_CODEC_OK); +} + +// Encodes 1 frame of size |cfg.g_w| x |cfg.g_h| setting |enc|'s configuration +// to |cfg|. 
+void EncodeWithConfig(const vpx_codec_enc_cfg_t &cfg, vpx_codec_ctx_t *enc) { + libvpx_test::DummyVideoSource video; + video.SetSize(cfg.g_w, cfg.g_h); + video.Begin(); + EXPECT_EQ(vpx_codec_enc_config_set(enc, &cfg), VPX_CODEC_OK) + << vpx_codec_error_detail(enc); + + EXPECT_EQ(vpx_codec_encode(enc, video.img(), video.pts(), video.duration(), + /*flags=*/0, VPX_DL_GOOD_QUALITY), + VPX_CODEC_OK) + << vpx_codec_error_detail(enc); +} + +TEST(EncodeAPI, ConfigChangeThreadCount) { + constexpr int kWidth = 1920; + constexpr int kHeight = 1080; + + for (const auto *iface : kCodecIfaces) { + SCOPED_TRACE(vpx_codec_iface_name(iface)); + for (int i = 0; i < (IsVP9(iface) ? 2 : 1); ++i) { + vpx_codec_enc_cfg_t cfg; + struct Encoder { + ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); } + vpx_codec_ctx_t ctx = {}; + } enc; + + EXPECT_NO_FATAL_FAILURE( + InitCodec(*iface, kWidth, kHeight, &enc.ctx, &cfg)); + if (IsVP9(iface)) { + EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_TILE_COLUMNS, 6), + VPX_CODEC_OK); + EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_ROW_MT, i), + VPX_CODEC_OK); + } + + for (const auto threads : { 1, 4, 8, 6, 2, 1 }) { + cfg.g_threads = threads; + EXPECT_NO_FATAL_FAILURE(EncodeWithConfig(cfg, &enc.ctx)) + << "iteration: " << i << " threads: " << threads; + } + } + } +} + } // namespace diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 7e80835f6c..8fdd86916f 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2676,7 +2676,6 @@ static void free_tpl_buffer(VP9_COMP *cpi); void vp9_remove_compressor(VP9_COMP *cpi) { VP9_COMMON *cm; unsigned int i; - int t; if (!cpi) return; @@ -2789,28 +2788,10 @@ void vp9_remove_compressor(VP9_COMP *cpi) { free_tpl_buffer(cpi); - for (t = 0; t < cpi->num_workers; ++t) { - VPxWorker *const worker = &cpi->workers[t]; - EncWorkerData *const thread_data = &cpi->tile_thr_data[t]; - - // Deallocate allocated threads. - vpx_get_worker_interface()->end(worker); - - // Deallocate allocated thread data. - if (t < cpi->num_workers - 1) { - vpx_free(thread_data->td->counts); - vp9_free_pc_tree(thread_data->td); - vpx_free(thread_data->td); - } - } - vpx_free(cpi->tile_thr_data); - vpx_free(cpi->workers); + vp9_loop_filter_dealloc(&cpi->lf_row_sync); + vp9_bitstream_encode_tiles_buffer_dealloc(cpi); vp9_row_mt_mem_dealloc(cpi); - - if (cpi->num_workers > 1) { - vp9_loop_filter_dealloc(&cpi->lf_row_sync); - vp9_bitstream_encode_tiles_buffer_dealloc(cpi); - } + vp9_encode_free_mt_data(cpi); #if !CONFIG_REALTIME_ONLY vp9_alt_ref_aq_destroy(cpi->alt_ref_aq); diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c index e7f8a537d4..453fe2e0df 100644 --- a/vp9/encoder/vp9_ethread.c +++ b/vp9/encoder/vp9_ethread.c @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "vp9/common/vp9_thread_common.h" +#include "vp9/encoder/vp9_bitstream.h" #include "vp9/encoder/vp9_encodeframe.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_ethread.h" @@ -79,60 +81,59 @@ static void create_enc_workers(VP9_COMP *cpi, int num_workers) { VP9_COMMON *const cm = &cpi->common; const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); int i; + // While using SVC, we need to allocate threads according to the highest + // resolution. When row based multithreading is enabled, it is OK to + // allocate more threads than the number of max tile columns. 
+ if (cpi->use_svc && !cpi->row_mt) { + int max_tile_cols = get_max_tile_cols(cpi); + num_workers = VPXMIN(cpi->oxcf.max_threads, max_tile_cols); + } + assert(num_workers > 0); + if (num_workers == cpi->num_workers) return; + vp9_loop_filter_dealloc(&cpi->lf_row_sync); + vp9_bitstream_encode_tiles_buffer_dealloc(cpi); + vp9_encode_free_mt_data(cpi); - // Only run once to create threads and allocate thread data. - if (cpi->num_workers == 0) { - int allocated_workers = num_workers; - - // While using SVC, we need to allocate threads according to the highest - // resolution. When row based multithreading is enabled, it is OK to - // allocate more threads than the number of max tile columns. - if (cpi->use_svc && !cpi->row_mt) { - int max_tile_cols = get_max_tile_cols(cpi); - allocated_workers = VPXMIN(cpi->oxcf.max_threads, max_tile_cols); - } - - CHECK_MEM_ERROR(cm, cpi->workers, - vpx_malloc(allocated_workers * sizeof(*cpi->workers))); + CHECK_MEM_ERROR(cm, cpi->workers, + vpx_malloc(num_workers * sizeof(*cpi->workers))); - CHECK_MEM_ERROR(cm, cpi->tile_thr_data, - vpx_calloc(allocated_workers, sizeof(*cpi->tile_thr_data))); + CHECK_MEM_ERROR(cm, cpi->tile_thr_data, + vpx_calloc(num_workers, sizeof(*cpi->tile_thr_data))); - for (i = 0; i < allocated_workers; i++) { - VPxWorker *const worker = &cpi->workers[i]; - EncWorkerData *thread_data = &cpi->tile_thr_data[i]; + for (i = 0; i < num_workers; i++) { + VPxWorker *const worker = &cpi->workers[i]; + EncWorkerData *thread_data = &cpi->tile_thr_data[i]; - ++cpi->num_workers; - winterface->init(worker); + ++cpi->num_workers; + winterface->init(worker); - if (i < allocated_workers - 1) { - thread_data->cpi = cpi; + if (i < num_workers - 1) { + thread_data->cpi = cpi; - // Allocate thread data. - CHECK_MEM_ERROR(cm, thread_data->td, - vpx_memalign(32, sizeof(*thread_data->td))); - vp9_zero(*thread_data->td); + // Allocate thread data. + CHECK_MEM_ERROR(cm, thread_data->td, + vpx_memalign(32, sizeof(*thread_data->td))); + vp9_zero(*thread_data->td); - // Set up pc_tree. - thread_data->td->leaf_tree = NULL; - thread_data->td->pc_tree = NULL; - vp9_setup_pc_tree(cm, thread_data->td); + // Set up pc_tree. + thread_data->td->leaf_tree = NULL; + thread_data->td->pc_tree = NULL; + vp9_setup_pc_tree(cm, thread_data->td); - // Allocate frame counters in thread data. - CHECK_MEM_ERROR(cm, thread_data->td->counts, - vpx_calloc(1, sizeof(*thread_data->td->counts))); + // Allocate frame counters in thread data. + CHECK_MEM_ERROR(cm, thread_data->td->counts, + vpx_calloc(1, sizeof(*thread_data->td->counts))); - // Create threads - if (!winterface->reset(worker)) - vpx_internal_error(&cm->error, VPX_CODEC_ERROR, - "Tile encoder thread creation failed"); - } else { - // Main thread acts as a worker and uses the thread data in cpi. - thread_data->cpi = cpi; - thread_data->td = &cpi->td; - } - winterface->sync(worker); + // Create threads + if (!winterface->reset(worker)) + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Tile encoder thread creation failed"); + } else { + // Main thread acts as a worker and uses the thread data in cpi. 
+ thread_data->cpi = cpi; + thread_data->td = &cpi->td; } + winterface->sync(worker); } } @@ -169,6 +170,27 @@ static void launch_enc_workers(VP9_COMP *cpi, VPxWorkerHook hook, void *data2, } } +void vp9_encode_free_mt_data(struct VP9_COMP *cpi) { + int t; + for (t = 0; t < cpi->num_workers; ++t) { + VPxWorker *const worker = &cpi->workers[t]; + EncWorkerData *const thread_data = &cpi->tile_thr_data[t]; + + // Deallocate allocated threads. + vpx_get_worker_interface()->end(worker); + + // Deallocate allocated thread data. + if (t < cpi->num_workers - 1) { + vpx_free(thread_data->td->counts); + vp9_free_pc_tree(thread_data->td); + vpx_free(thread_data->td); + } + } + vpx_free(cpi->tile_thr_data); + vpx_free(cpi->workers); + cpi->num_workers = 0; +} + void vp9_encode_tiles_mt(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; const int tile_cols = 1 << cm->log2_tile_cols; diff --git a/vp9/encoder/vp9_ethread.h b/vp9/encoder/vp9_ethread.h index cda0293bcf..4c192da515 100644 --- a/vp9/encoder/vp9_ethread.h +++ b/vp9/encoder/vp9_ethread.h @@ -42,6 +42,11 @@ typedef struct VP9RowMTSyncData { int rows; } VP9RowMTSync; +// Frees EncWorkerData related allocations made by vp9_encode_*_mt(). +// row_mt specific data is freed with vp9_row_mt_mem_dealloc() and is not +// called by this function. +void vp9_encode_free_mt_data(struct VP9_COMP *cpi); + void vp9_encode_tiles_mt(struct VP9_COMP *cpi); void vp9_encode_tiles_row_mt(struct VP9_COMP *cpi); From 31b954debe62026f957e9a13354c8c75b12e537a Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 2 Dec 2021 13:11:56 -0800 Subject: [PATCH 180/926] clear -Wextra-semi/-Wextra-semi-stmt warnings Bug: chromium:1257449 Change-Id: Ia9aafccc09b611521d4a7aedfe3723393a840c62 --- configure | 2 + examples/postproc.c | 2 +- examples/vpx_temporal_svc_encoder.c | 2 +- test/pp_filter_test.cc | 4 +- test/set_roi.cc | 2 +- test/vp8_fdct4x4_test.cc | 4 +- test/yuv_temporal_filter_test.cc | 8 +- tools/tiny_ssim.c | 30 ++-- vp8/common/blockd.h | 2 +- vp8/common/common.h | 12 +- vp8/encoder/encodeframe.c | 13 +- vp8/encoder/encodemv.c | 7 +- vp8/encoder/mcomp.c | 54 +++--- vp8/encoder/onyx_if.c | 2 +- vp9/common/vp9_common.h | 4 +- vp9/encoder/vp9_mcomp.c | 194 +++++++++++---------- vpx/src/vpx_encoder.c | 2 +- vpx_dsp/x86/highbd_convolve_avx2.c | 12 +- vpx_dsp/x86/highbd_variance_sse2.c | 86 ++++----- vpx_dsp/x86/sad_avx2.c | 36 ++-- vpx_dsp/x86/variance_sse2.c | 60 +++---- vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c | 24 +-- vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 12 +- vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c | 12 +- vpx_ports/x86.h | 8 +- 25 files changed, 307 insertions(+), 287 deletions(-) diff --git a/configure b/configure index d39db6cb08..b68f9fd781 100755 --- a/configure +++ b/configure @@ -622,6 +622,8 @@ process_toolchain() { check_add_cflags -Wall check_add_cflags -Wdeclaration-after-statement check_add_cflags -Wdisabled-optimization + check_add_cflags -Wextra-semi + check_add_cflags -Wextra-semi-stmt check_add_cflags -Wfloat-conversion check_add_cflags -Wformat=2 check_add_cflags -Wparentheses-equality diff --git a/examples/postproc.c b/examples/postproc.c index be999b429e..b53c15ea15 100644 --- a/examples/postproc.c +++ b/examples/postproc.c @@ -109,7 +109,7 @@ int main(int argc, char **argv) { 0 }; if (vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp)) die_codec(&codec, "Failed to turn on postproc."); - }; + } // Decode the frame with 15ms deadline if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 15000)) diff --git 
a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c index 47f30751eb..bb761a4117 100644 --- a/examples/vpx_temporal_svc_encoder.c +++ b/examples/vpx_temporal_svc_encoder.c @@ -30,7 +30,7 @@ #define ROI_MAP 0 -#define zero(Dest) memset(&(Dest), 0, sizeof(Dest)); +#define zero(Dest) memset(&(Dest), 0, sizeof(Dest)) static const char *exec_name; diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc index a511ffbe98..775f7f36a3 100644 --- a/test/pp_filter_test.cc +++ b/test/pp_filter_test.cc @@ -115,7 +115,7 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) { } vpx_free(flimits_); -}; +} TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) { // Size of the underlying data block that will be filtered. @@ -214,7 +214,7 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, DISABLED_Speed) { PrintMedian("16x16"); vpx_free(flimits_); -}; +} class VpxMbPostProcAcrossIpTest : public AbstractBench, diff --git a/test/set_roi.cc b/test/set_roi.cc index f639547523..167cf908fd 100644 --- a/test/set_roi.cc +++ b/test/set_roi.cc @@ -161,6 +161,6 @@ TEST(VP8RoiMapTest, ParameterCheck) { // Free allocated memory if (cpi.segmentation_map) vpx_free(cpi.segmentation_map); if (roi_map) vpx_free(roi_map); -}; +} } // namespace diff --git a/test/vp8_fdct4x4_test.cc b/test/vp8_fdct4x4_test.cc index d5ac253003..3e4305be73 100644 --- a/test/vp8_fdct4x4_test.cc +++ b/test/vp8_fdct4x4_test.cc @@ -148,7 +148,7 @@ TEST_P(FdctTest, SignBiasCheck) { EXPECT_EQ(true, bias_acceptable) << "Error: 4x4 FDCT has a sign bias > 10% for input range [-15, 15]"; -}; +} TEST_P(FdctTest, RoundTripErrorCheck) { int max_error = 0; @@ -181,7 +181,7 @@ TEST_P(FdctTest, RoundTripErrorCheck) { EXPECT_GE(count_test_block, total_error) << "Error: FDCT/IDCT has average roundtrip error > 1 per block"; -}; +} INSTANTIATE_TEST_SUITE_P(C, FdctTest, ::testing::Values(vp8_short_fdct4x4_c)); diff --git a/test/yuv_temporal_filter_test.cc b/test/yuv_temporal_filter_test.cc index cfdc88d896..2bdcf4d86f 100644 --- a/test/yuv_temporal_filter_test.cc +++ b/test/yuv_temporal_filter_test.cc @@ -674,8 +674,8 @@ TEST_P(YUVTemporalFilterTest, DISABLED_Speed) { v_count); \ } -WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 10); -WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 12); +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 10) +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 12) INSTANTIATE_TEST_SUITE_P( C, YUVTemporalFilterTest, @@ -683,8 +683,8 @@ INSTANTIATE_TEST_SUITE_P( TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_c_10, 10), TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_c_12, 12))); #if HAVE_SSE4_1 -WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_sse4_1, 10); -WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_sse4_1, 12); +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_sse4_1, 10) +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_sse4_1, 12) INSTANTIATE_TEST_SUITE_P( SSE4_1, YUVTemporalFilterTest, diff --git a/tools/tiny_ssim.c b/tools/tiny_ssim.c index ff4634ade4..1577970488 100644 --- a/tools/tiny_ssim.c +++ b/tools/tiny_ssim.c @@ -425,20 +425,24 @@ int main(int argc, char *argv[]) { break; } #if CONFIG_VP9_HIGHBITDEPTH -#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \ - if (bit_depth < 9) { \ - ssim = ssim2(buf0, buf1, w, w, w, h); \ - psnr = calc_plane_error(buf0, w, buf1, w, w, h); \ - } else { \ - ssim = highbd_ssim2(CONVERT_TO_BYTEPTR(buf0), CONVERT_TO_BYTEPTR(buf1), w, \ - w, w, h, bit_depth); \ - psnr = 
calc_plane_error16(CAST_TO_SHORTPTR(buf0), w, \ - CAST_TO_SHORTPTR(buf1), w, w, h); \ - } +#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \ + do { \ + if (bit_depth < 9) { \ + ssim = ssim2(buf0, buf1, w, w, w, h); \ + psnr = calc_plane_error(buf0, w, buf1, w, w, h); \ + } else { \ + ssim = highbd_ssim2(CONVERT_TO_BYTEPTR(buf0), CONVERT_TO_BYTEPTR(buf1), \ + w, w, w, h, bit_depth); \ + psnr = calc_plane_error16(CAST_TO_SHORTPTR(buf0), w, \ + CAST_TO_SHORTPTR(buf1), w, w, h); \ + } \ + } while (0) #else -#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \ - ssim = ssim2(buf0, buf1, w, w, w, h); \ - psnr = calc_plane_error(buf0, w, buf1, w, w, h); +#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \ + do { \ + ssim = ssim2(buf0, buf1, w, w, w, h); \ + psnr = calc_plane_error(buf0, w, buf1, w, w, h); \ + } while (0) #endif // CONFIG_VP9_HIGHBITDEPTH if (n_frames == allocated_frames) { diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h index 02abe053cb..405443449d 100644 --- a/vp8/common/blockd.h +++ b/vp8/common/blockd.h @@ -58,7 +58,7 @@ typedef struct { extern const unsigned char vp8_block2left[25]; extern const unsigned char vp8_block2above[25]; -#define VP8_COMBINEENTROPYCONTEXTS(Dest, A, B) Dest = (A) + (B); +#define VP8_COMBINEENTROPYCONTEXTS(Dest, A, B) Dest = (A) + (B) typedef enum { KEY_FRAME = 0, INTER_FRAME = 1 } FRAME_TYPE; diff --git a/vp8/common/common.h b/vp8/common/common.h index 2c30e8d6c5..562569f9ab 100644 --- a/vp8/common/common.h +++ b/vp8/common/common.h @@ -24,22 +24,22 @@ extern "C" { /* Only need this for fixed-size arrays, for structs just assign. */ #define vp8_copy(Dest, Src) \ - { \ + do { \ assert(sizeof(Dest) == sizeof(Src)); \ memcpy(Dest, Src, sizeof(Src)); \ - } + } while (0) /* Use this for variably-sized arrays. 
*/ #define vp8_copy_array(Dest, Src, N) \ - { \ + do { \ assert(sizeof(*(Dest)) == sizeof(*(Src))); \ memcpy(Dest, Src, (N) * sizeof(*(Src))); \ - } + } while (0) -#define vp8_zero(Dest) memset(&(Dest), 0, sizeof(Dest)); +#define vp8_zero(Dest) memset(&(Dest), 0, sizeof(Dest)) -#define vp8_zero_array(Dest, N) memset(Dest, 0, (N) * sizeof(*(Dest))); +#define vp8_zero_array(Dest, N) memset(Dest, 0, (N) * sizeof(*(Dest))) #ifdef __cplusplus } // extern "C" diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index 69271f1a73..4df35f6edb 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -639,7 +639,8 @@ static void init_encode_frame_mb_context(VP8_COMP *cpi) { vp8_zero(x->coef_counts); vp8_zero(x->ymode_count); - vp8_zero(x->uv_mode_count) x->prediction_error = 0; + vp8_zero(x->uv_mode_count); + x->prediction_error = 0; x->intra_error = 0; vp8_zero(x->count_mb_ref_frame_usage); } @@ -766,12 +767,12 @@ void vp8_encode_frame(VP8_COMP *cpi) { for (mb_row = 0; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1)) { - vp8_zero(cm->left_context) + vp8_zero(cm->left_context); #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING - tp = cpi->tok; + tp = cpi->tok; #else - tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24); + tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24); #endif encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate); @@ -858,10 +859,10 @@ void vp8_encode_frame(VP8_COMP *cpi) { /* for each macroblock row in image */ for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { - vp8_zero(cm->left_context) + vp8_zero(cm->left_context); #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING - tp = cpi->tok; + tp = cpi->tok; #endif encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate); diff --git a/vp8/encoder/encodemv.c b/vp8/encoder/encodemv.c index ff38965393..c88ea1653e 100644 --- a/vp8/encoder/encodemv.c +++ b/vp8/encoder/encodemv.c @@ -205,8 +205,11 @@ static void write_component_probs(vp8_writer *const w, (void)rc; vp8_copy_array(Pnew, default_mvc, MVPcount); - vp8_zero(is_short_ct) vp8_zero(sign_ct) vp8_zero(bit_ct) vp8_zero(short_ct) - vp8_zero(short_bct) + vp8_zero(is_short_ct); + vp8_zero(sign_ct); + vp8_zero(bit_ct); + vp8_zero(short_ct); + vp8_zero(short_bct); /* j=0 */ { diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index 9e7f5c7ace..4ab6c7b3d0 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -204,19 +204,21 @@ void vp8_init3smotion_compensation(MACROBLOCK *x, int stride) { /* returns distortion + motion vector cost */ #define ERR(r, c) (MVC(r, c) + DIST(r, c)) /* checks if (r,c) has better score than previous best */ -#define CHECK_BETTER(v, r, c) \ - IFMVCV(r, c, \ - { \ - thismse = DIST(r, c); \ - if ((v = (MVC(r, c) + thismse)) < besterr) { \ - besterr = v; \ - br = r; \ - bc = c; \ - *distortion = thismse; \ - *sse1 = sse; \ - } \ - }, \ - v = UINT_MAX;) +#define CHECK_BETTER(v, r, c) \ + do { \ + IFMVCV(r, c, \ + { \ + thismse = DIST(r, c); \ + if ((v = (MVC(r, c) + thismse)) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + }, \ + v = UINT_MAX;) \ + } while (0) int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv, @@ -800,13 +802,13 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, } #define CHECK_BOUNDS(range) \ - { \ + do { \ all_in = 1; \ all_in &= ((br - range) >= x->mv_row_min); \ all_in &= ((br + range) <= x->mv_row_max); \ all_in &= 
((bc - range) >= x->mv_col_min); \ all_in &= ((bc + range) <= x->mv_col_max); \ - } + } while (0) #define CHECK_POINT \ { \ @@ -817,7 +819,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, } #define CHECK_BETTER \ - { \ + do { \ if (thissad < bestsad) { \ thissad += \ mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); \ @@ -826,7 +828,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, best_site = i; \ } \ } \ - } + } while (0) static const MV next_chkpts[6][3] = { { { -2, 0 }, { -1, -2 }, { 1, -2 } }, { { -1, -2 }, { 1, -2 }, { 2, 0 } }, @@ -901,7 +903,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, #endif /* hex search */ - CHECK_BOUNDS(2) + CHECK_BOUNDS(2); if (all_in) { for (i = 0; i < 6; ++i) { @@ -910,7 +912,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); - CHECK_BETTER + CHECK_BETTER; } } else { for (i = 0; i < 6; ++i) { @@ -920,7 +922,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); - CHECK_BETTER + CHECK_BETTER; } } @@ -934,7 +936,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, for (j = 1; j < hex_range; ++j) { best_site = -1; - CHECK_BOUNDS(2) + CHECK_BOUNDS(2); if (all_in) { for (i = 0; i < 3; ++i) { @@ -943,7 +945,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); - CHECK_BETTER + CHECK_BETTER; } } else { for (i = 0; i < 3; ++i) { @@ -953,7 +955,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); - CHECK_BETTER + CHECK_BETTER; } } @@ -975,7 +977,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, cal_neighbors: for (j = 0; j < dia_range; ++j) { best_site = -1; - CHECK_BOUNDS(1) + CHECK_BOUNDS(1); if (all_in) { for (i = 0; i < 4; ++i) { @@ -984,7 +986,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); - CHECK_BETTER + CHECK_BETTER; } } else { for (i = 0; i < 4; ++i) { @@ -994,7 +996,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); - CHECK_BETTER + CHECK_BETTER; } } diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 15c9d72f5d..59bce951e0 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -1023,7 +1023,7 @@ void vp8_set_speed_features(VP8_COMP *cpi) { memset(cpi->mb.error_bins, 0, sizeof(cpi->mb.error_bins)); - }; /* switch */ + } /* switch */ /* Slow quant, dct and trellis not worthwhile for first pass * so make sure they are always turned off. 
diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h index e3c5535ddb..3cec53bfd8 100644 --- a/vp9/common/vp9_common.h +++ b/vp9/common/vp9_common.h @@ -27,10 +27,10 @@ extern "C" { // Only need this for fixed-size arrays, for structs just assign. #define vp9_copy(dest, src) \ - { \ + do { \ assert(sizeof(dest) == sizeof(src)); \ memcpy(dest, src, sizeof(src)); \ - } + } while (0) // Use this for variably-sized arrays. #define vp9_copy_array(dest, src, n) \ diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index ac29f36ec1..cd67064203 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -159,59 +159,63 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { #if CONFIG_VP9_HIGHBITDEPTH /* checks if (r, c) has better score than previous best */ -#define CHECK_BETTER(v, r, c) \ - if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - int64_t tmpmse; \ - const MV mv = { r, c }; \ - const MV ref_mv = { rr, rc }; \ - if (second_pred == NULL) { \ - thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ - src_stride, &sse); \ - } else { \ - thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ - src_stride, &sse, second_pred); \ - } \ - tmpmse = thismse; \ - tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit); \ - if (tmpmse >= INT_MAX) { \ - v = INT_MAX; \ - } else if ((v = (uint32_t)tmpmse) < besterr) { \ - besterr = v; \ - br = r; \ - bc = c; \ - *distortion = thismse; \ - *sse1 = sse; \ - } \ - } else { \ - v = INT_MAX; \ - } +#define CHECK_BETTER(v, r, c) \ + do { \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + int64_t tmpmse; \ + const MV mv = { r, c }; \ + const MV ref_mv = { rr, rc }; \ + if (second_pred == NULL) { \ + thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ + src_stride, &sse); \ + } else { \ + thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ + src_stride, &sse, second_pred); \ + } \ + tmpmse = thismse; \ + tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit); \ + if (tmpmse >= INT_MAX) { \ + v = INT_MAX; \ + } else if ((v = (uint32_t)tmpmse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } \ + } while (0) #else /* checks if (r, c) has better score than previous best */ -#define CHECK_BETTER(v, r, c) \ - if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - const MV mv = { r, c }; \ - const MV ref_mv = { rr, rc }; \ - if (second_pred == NULL) \ - thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ - src_stride, &sse); \ - else \ - thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ - src_stride, &sse, second_pred); \ - if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) + \ - thismse) < besterr) { \ - besterr = v; \ - br = r; \ - bc = c; \ - *distortion = thismse; \ - *sse1 = sse; \ - } \ - } else { \ - v = INT_MAX; \ - } +#define CHECK_BETTER(v, r, c) \ + do { \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + const MV mv = { r, c }; \ + const MV ref_mv = { rr, rc }; \ + if (second_pred == NULL) \ + thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ + src_stride, &sse); \ + else \ + thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ + src_stride, &sse, second_pred); \ + if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) + \ + thismse) < besterr) { \ + besterr 
= v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } \ + } while (0) #endif #define FIRST_LEVEL_CHECKS \ - { \ + do { \ unsigned int left, right, up, down, diag; \ CHECK_BETTER(left, tr, tc - hstep); \ CHECK_BETTER(right, tr, tc + hstep); \ @@ -224,10 +228,10 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { case 2: CHECK_BETTER(diag, tr + hstep, tc - hstep); break; \ case 3: CHECK_BETTER(diag, tr + hstep, tc + hstep); break; \ } \ - } + } while (0) #define SECOND_LEVEL_CHECKS \ - { \ + do { \ int kr, kc; \ unsigned int second; \ if (tr != br && tc != bc) { \ @@ -256,7 +260,7 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { case 3: CHECK_BETTER(second, tr + kr, tc - hstep); break; \ } \ } \ - } + } while (0) #define SETUP_SUBPEL_SEARCH \ const uint8_t *const z = x->plane[0].src.buf; \ @@ -290,7 +294,7 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { maxr = subpel_mv_limits.row_max; \ \ bestmv->row *= 8; \ - bestmv->col *= 8; + bestmv->col *= 8 static unsigned int setup_center_error( const MACROBLOCKD *xd, const MV *bestmv, const MV *ref_mv, @@ -678,48 +682,52 @@ static int accurate_sub_pel_search( // TODO(yunqing): this part can be further refactored. #if CONFIG_VP9_HIGHBITDEPTH /* checks if (r, c) has better score than previous best */ -#define CHECK_BETTER1(v, r, c) \ - if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - int64_t tmpmse; \ - const MV mv = { r, c }; \ - const MV ref_mv = { rr, rc }; \ - thismse = \ - accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, src_stride, \ - y, y_stride, second_pred, w, h, &sse); \ - tmpmse = thismse; \ - tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit); \ - if (tmpmse >= INT_MAX) { \ - v = INT_MAX; \ - } else if ((v = (uint32_t)tmpmse) < besterr) { \ - besterr = v; \ - br = r; \ - bc = c; \ - *distortion = thismse; \ - *sse1 = sse; \ - } \ - } else { \ - v = INT_MAX; \ - } +#define CHECK_BETTER1(v, r, c) \ + do { \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + int64_t tmpmse; \ + const MV mv = { r, c }; \ + const MV ref_mv = { rr, rc }; \ + thismse = accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, \ + src_stride, y, y_stride, second_pred, \ + w, h, &sse); \ + tmpmse = thismse; \ + tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit); \ + if (tmpmse >= INT_MAX) { \ + v = INT_MAX; \ + } else if ((v = (uint32_t)tmpmse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } \ + } while (0) #else /* checks if (r, c) has better score than previous best */ -#define CHECK_BETTER1(v, r, c) \ - if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - const MV mv = { r, c }; \ - const MV ref_mv = { rr, rc }; \ - thismse = \ - accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, src_stride, \ - y, y_stride, second_pred, w, h, &sse); \ - if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) + \ - thismse) < besterr) { \ - besterr = v; \ - br = r; \ - bc = c; \ - *distortion = thismse; \ - *sse1 = sse; \ - } \ - } else { \ - v = INT_MAX; \ - } +#define CHECK_BETTER1(v, r, c) \ + do { \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + const MV mv = { r, c }; \ + const MV ref_mv = { rr, rc }; \ + thismse = accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, \ + src_stride, y, y_stride, second_pred, \ 
+ w, h, &sse); \ + if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) + \ + thismse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } \ + } while (0) #endif @@ -2962,7 +2970,7 @@ int vp9_full_pixel_search(const VP9_COMP *const cpi, const MACROBLOCK *const x, (void)sse; \ (void)thismse; \ (void)cost_list; \ - (void)use_accurate_subpel_search; + (void)use_accurate_subpel_search // Return the maximum MV. uint32_t vp9_return_max_sub_pixel_mv( diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c index f636b54a33..846638fe55 100644 --- a/vpx/src/vpx_encoder.c +++ b/vpx/src/vpx_encoder.c @@ -173,7 +173,7 @@ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, #include "vpx_ports/x86.h" #define FLOATING_POINT_INIT() \ do { \ - unsigned short x87_orig_mode = x87_set_double_precision(); + unsigned short x87_orig_mode = x87_set_double_precision() #define FLOATING_POINT_RESTORE() \ x87_set_control_word(x87_orig_mode); \ } \ diff --git a/vpx_dsp/x86/highbd_convolve_avx2.c b/vpx_dsp/x86/highbd_convolve_avx2.c index 3209625617..01a52ec8bf 100644 --- a/vpx_dsp/x86/highbd_convolve_avx2.c +++ b/vpx_dsp/x86/highbd_convolve_avx2.c @@ -1465,10 +1465,10 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2; #define vpx_highbd_filter_block1d4_h4_avg_avx2 \ vpx_highbd_filter_block1d4_h8_avg_avx2 -HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0); +HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0) HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, - src - src_stride * (num_taps / 2 - 1), , avx2, 0); -HIGH_FUN_CONV_2D(, avx2, 0); + src - src_stride * (num_taps / 2 - 1), , avx2, 0) +HIGH_FUN_CONV_2D(, avx2, 0) // From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm. highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2; @@ -1487,9 +1487,9 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2; #define vpx_highbd_filter_block1d4_v2_avg_avx2 \ vpx_highbd_filter_block1d4_v2_avg_sse2 -HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1); +HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1) HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, - src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1); -HIGH_FUN_CONV_2D(avg_, avx2, 1); + src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1) +HIGH_FUN_CONV_2D(avg_, avx2, 1) #undef HIGHBD_FUNC diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c index dd6cfbb2c4..7c8d79b09e 100644 --- a/vpx_dsp/x86/highbd_variance_sse2.c +++ b/vpx_dsp/x86/highbd_variance_sse2.c @@ -121,8 +121,8 @@ static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, *sse = ROUND_POWER_OF_TWO(*sse, 8); \ } -HIGH_GET_VAR(16); -HIGH_GET_VAR(8); +HIGH_GET_VAR(16) +HIGH_GET_VAR(8) #undef HIGH_GET_VAR @@ -167,16 +167,16 @@ HIGH_GET_VAR(8); return (var >= 0) ? 
(uint32_t)var : 0; \ } -VAR_FN(64, 64, 16, 12); -VAR_FN(64, 32, 16, 11); -VAR_FN(32, 64, 16, 11); -VAR_FN(32, 32, 16, 10); -VAR_FN(32, 16, 16, 9); -VAR_FN(16, 32, 16, 9); -VAR_FN(16, 16, 16, 8); -VAR_FN(16, 8, 8, 7); -VAR_FN(8, 16, 8, 7); -VAR_FN(8, 8, 8, 6); +VAR_FN(64, 64, 16, 12) +VAR_FN(64, 32, 16, 11) +VAR_FN(32, 64, 16, 11) +VAR_FN(32, 32, 16, 10) +VAR_FN(32, 16, 16, 9) +VAR_FN(16, 32, 16, 9) +VAR_FN(16, 16, 16, 8) +VAR_FN(16, 8, 8, 7) +VAR_FN(8, 16, 8, 7) +VAR_FN(8, 8, 8, 6) #undef VAR_FN @@ -255,10 +255,10 @@ unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride, const uint16_t *ref, ptrdiff_t ref_stride, int height, \ unsigned int *sse, void *unused0, void *unused); #define DECLS(opt) \ - DECL(8, opt); \ + DECL(8, opt) \ DECL(16, opt) -DECLS(sse2); +DECLS(sse2) #undef DECLS #undef DECL @@ -383,20 +383,20 @@ DECLS(sse2); return (var >= 0) ? (uint32_t)var : 0; \ } -#define FNS(opt) \ - FN(64, 64, 16, 6, 6, opt, (int64_t)); \ - FN(64, 32, 16, 6, 5, opt, (int64_t)); \ - FN(32, 64, 16, 5, 6, opt, (int64_t)); \ - FN(32, 32, 16, 5, 5, opt, (int64_t)); \ - FN(32, 16, 16, 5, 4, opt, (int64_t)); \ - FN(16, 32, 16, 4, 5, opt, (int64_t)); \ - FN(16, 16, 16, 4, 4, opt, (int64_t)); \ - FN(16, 8, 16, 4, 3, opt, (int64_t)); \ - FN(8, 16, 8, 3, 4, opt, (int64_t)); \ - FN(8, 8, 8, 3, 3, opt, (int64_t)); \ - FN(8, 4, 8, 3, 2, opt, (int64_t)); - -FNS(sse2); +#define FNS(opt) \ + FN(64, 64, 16, 6, 6, opt, (int64_t)) \ + FN(64, 32, 16, 6, 5, opt, (int64_t)) \ + FN(32, 64, 16, 5, 6, opt, (int64_t)) \ + FN(32, 32, 16, 5, 5, opt, (int64_t)) \ + FN(32, 16, 16, 5, 4, opt, (int64_t)) \ + FN(16, 32, 16, 4, 5, opt, (int64_t)) \ + FN(16, 16, 16, 4, 4, opt, (int64_t)) \ + FN(16, 8, 16, 4, 3, opt, (int64_t)) \ + FN(8, 16, 8, 3, 4, opt, (int64_t)) \ + FN(8, 8, 8, 3, 3, opt, (int64_t)) \ + FN(8, 4, 8, 3, 2, opt, (int64_t)) + +FNS(sse2) #undef FNS #undef FN @@ -412,7 +412,7 @@ FNS(sse2); DECL(16, opt1) \ DECL(8, opt1) -DECLS(sse2); +DECLS(sse2) #undef DECL #undef DECLS @@ -542,20 +542,20 @@ DECLS(sse2); return (var >= 0) ? 
(uint32_t)var : 0; \ } -#define FNS(opt1) \ - FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ - FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ - FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ - FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ - FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ - FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ - FN(16, 16, 16, 4, 4, opt1, (int64_t)); \ - FN(16, 8, 16, 4, 3, opt1, (int64_t)); \ - FN(8, 16, 8, 4, 3, opt1, (int64_t)); \ - FN(8, 8, 8, 3, 3, opt1, (int64_t)); \ - FN(8, 4, 8, 3, 2, opt1, (int64_t)); - -FNS(sse2); +#define FNS(opt1) \ + FN(64, 64, 16, 6, 6, opt1, (int64_t)) \ + FN(64, 32, 16, 6, 5, opt1, (int64_t)) \ + FN(32, 64, 16, 5, 6, opt1, (int64_t)) \ + FN(32, 32, 16, 5, 5, opt1, (int64_t)) \ + FN(32, 16, 16, 5, 4, opt1, (int64_t)) \ + FN(16, 32, 16, 4, 5, opt1, (int64_t)) \ + FN(16, 16, 16, 4, 4, opt1, (int64_t)) \ + FN(16, 8, 16, 4, 3, opt1, (int64_t)) \ + FN(8, 16, 8, 4, 3, opt1, (int64_t)) \ + FN(8, 8, 8, 3, 3, opt1, (int64_t)) \ + FN(8, 4, 8, 3, 2, opt1, (int64_t)) + +FNS(sse2) #undef FNS #undef FN diff --git a/vpx_dsp/x86/sad_avx2.c b/vpx_dsp/x86/sad_avx2.c index d944134305..3b48acd510 100644 --- a/vpx_dsp/x86/sad_avx2.c +++ b/vpx_dsp/x86/sad_avx2.c @@ -71,17 +71,17 @@ return res; \ } -#define FSAD64 \ - FSAD64_H(64); \ - FSAD64_H(32); +#define FSAD64 \ + FSAD64_H(64) \ + FSAD64_H(32) -#define FSAD32 \ - FSAD32_H(64); \ - FSAD32_H(32); \ - FSAD32_H(16); +#define FSAD32 \ + FSAD32_H(64) \ + FSAD32_H(32) \ + FSAD32_H(16) -FSAD64; -FSAD32; +FSAD64 +FSAD32 #undef FSAD64 #undef FSAD32 @@ -160,17 +160,17 @@ FSAD32; return res; \ } -#define FSADAVG64 \ - FSADAVG64_H(64); \ - FSADAVG64_H(32); +#define FSADAVG64 \ + FSADAVG64_H(64) \ + FSADAVG64_H(32) -#define FSADAVG32 \ - FSADAVG32_H(64); \ - FSADAVG32_H(32); \ - FSADAVG32_H(16); +#define FSADAVG32 \ + FSADAVG32_H(64) \ + FSADAVG32_H(32) \ + FSADAVG32_H(16) -FSADAVG64; -FSADAVG32; +FSADAVG64 +FSADAVG32 #undef FSADAVG64 #undef FSADAVG32 diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c index 67645c57ac..a67c92aadb 100644 --- a/vpx_dsp/x86/variance_sse2.c +++ b/vpx_dsp/x86/variance_sse2.c @@ -471,23 +471,23 @@ DECLS(ssse3, ssse3); (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ } -#define FNS(opt1, opt2) \ - FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \ - FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \ - FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \ - FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \ - FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \ - FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \ - FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \ - FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t)); \ - FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t)); \ - FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t)); \ - FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t)); \ - FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t)); \ +#define FNS(opt1, opt2) \ + FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)) \ + FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)) \ + FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)) \ + FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)) \ + FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)) \ + FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)) \ + FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \ + FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t)) \ + FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t)) \ + FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t)) \ + FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t)) \ + FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t)) \ FN(4, 4, 4, 2, 2, opt1, 
(int32_t), (int32_t)) -FNS(sse2, sse2); -FNS(ssse3, ssse3); +FNS(sse2, sse2) +FNS(ssse3, ssse3) #undef FNS #undef FN @@ -543,23 +543,23 @@ DECLS(ssse3, ssse3); (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ } -#define FNS(opt1, opt2) \ - FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \ - FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \ - FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \ - FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \ - FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \ - FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \ - FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \ - FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t)); \ - FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t)); \ - FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t)); \ - FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t)); \ - FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t)); \ +#define FNS(opt1, opt2) \ + FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)) \ + FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)) \ + FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)) \ + FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)) \ + FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)) \ + FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)) \ + FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \ + FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t)) \ + FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t)) \ + FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t)) \ + FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t)) \ + FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t)) \ FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t)) -FNS(sse2, sse); -FNS(ssse3, ssse3); +FNS(sse2, sse) +FNS(ssse3, ssse3) #undef FNS #undef FN diff --git a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c index 2391790284..0cbd151dc3 100644 --- a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c +++ b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c @@ -1040,12 +1040,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2; // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); -FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0) FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - (num_taps / 2 - 1) * src_stride, , - sse2, 0); -FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1); + sse2, 0) +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1) FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, - src - (num_taps / 2 - 1) * src_stride, avg_, sse2, 1); + src - (num_taps / 2 - 1) * src_stride, avg_, sse2, 1) // void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, @@ -1057,8 +1057,8 @@ FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); -FUN_CONV_2D(, sse2, 0); -FUN_CONV_2D(avg_, sse2, 1); +FUN_CONV_2D(, sse2, 0) +FUN_CONV_2D(avg_, sse2, 1) #if CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64 // From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm. 
@@ -1139,12 +1139,12 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; // const int16_t *filter_y, // int y_step_q4, // int w, int h, int bd); -HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0); +HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0) HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, - src - src_stride * (num_taps / 2 - 1), , sse2, 0); -HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1); + src - src_stride * (num_taps / 2 - 1), , sse2, 0) +HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1) HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, - src - src_stride * (num_taps / 2 - 1), avg_, sse2, 1); + src - src_stride * (num_taps / 2 - 1), avg_, sse2, 1) // void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, @@ -1156,6 +1156,6 @@ HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, // int y_step_q4, int w, int h, int bd); -HIGH_FUN_CONV_2D(, sse2, 0); -HIGH_FUN_CONV_2D(avg_, sse2, 1); +HIGH_FUN_CONV_2D(, sse2, 0) +HIGH_FUN_CONV_2D(avg_, sse2, 1) #endif // CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64 diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 1eaa19bfc5..6f2983a4b5 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -969,12 +969,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, // int y_step_q4, int w, int h); -FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0) FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), , - avx2, 0); -FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1); + avx2, 0) +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1) FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, - src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1); + src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1) // void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, @@ -986,6 +986,6 @@ FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); -FUN_CONV_2D(, avx2, 0); -FUN_CONV_2D(avg_, avx2, 1); +FUN_CONV_2D(, avx2, 0) +FUN_CONV_2D(avg_, avx2, 1) #endif // HAVE_AX2 && HAVE_SSSE3 diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c index 77355a2085..ed46d6245d 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c @@ -731,12 +731,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, // int y_step_q4, int w, int h); -FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3, 0); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3, 0) FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), , - ssse3, 0); -FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3, 1); + ssse3, 0) +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3, 1) FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, - src - src_stride * (num_taps / 2 - 1), avg_, ssse3, 1); + src - src_stride * (num_taps / 2 - 1), avg_, ssse3, 1) static void filter_horiz_w8_ssse3(const uint8_t *const src, const 
ptrdiff_t src_stride, @@ -1083,5 +1083,5 @@ void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); -FUN_CONV_2D(, ssse3, 0); -FUN_CONV_2D(avg_, ssse3, 1); +FUN_CONV_2D(, ssse3, 0) +FUN_CONV_2D(avg_, ssse3, 1) diff --git a/vpx_ports/x86.h b/vpx_ports/x86.h index 651ff64606..795fb2923f 100644 --- a/vpx_ports/x86.h +++ b/vpx_ports/x86.h @@ -47,7 +47,7 @@ typedef enum { #define cpuid(func, func2, ax, bx, cx, dx) \ __asm__ __volatile__("cpuid \n\t" \ : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \ - : "a"(func), "c"(func2)); + : "a"(func), "c"(func2)) #else #define cpuid(func, func2, ax, bx, cx, dx) \ __asm__ __volatile__( \ @@ -55,7 +55,7 @@ typedef enum { "cpuid \n\t" \ "xchg %%edi, %%ebx \n\t" \ : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \ - : "a"(func), "c"(func2)); + : "a"(func), "c"(func2)) #endif #elif defined(__SUNPRO_C) || \ defined(__SUNPRO_CC) /* end __GNUC__ or __ANDROID__*/ @@ -67,7 +67,7 @@ typedef enum { "movl %ebx, %edi \n\t" \ "xchg %rsi, %rbx \n\t" \ : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \ - : "a"(func), "c"(func2)); + : "a"(func), "c"(func2)) #else #define cpuid(func, func2, ax, bx, cx, dx) \ asm volatile( \ @@ -76,7 +76,7 @@ typedef enum { "movl %ebx, %edi \n\t" \ "popl %ebx \n\t" \ : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \ - : "a"(func), "c"(func2)); + : "a"(func), "c"(func2)) #endif #else /* end __SUNPRO__ */ #if VPX_ARCH_X86_64 From ab35ee100a38347433af24df05a5e1578172a2ae Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 7 Dec 2021 13:11:46 -0800 Subject: [PATCH 181/926] clear -Wextra-semi/-Wextra-semi-stmt warnings x2 some additional neon file updates after: 31b954deb clear -Wextra-semi/-Wextra-semi-stmt warnings Bug: chromium:1257449 Change-Id: I3e2664f2bd8f6f7328ec91bf6595ba5fc09862bd --- vpx_dsp/arm/fdct32x32_neon.c | 2 +- vpx_dsp/arm/sad_neon.c | 30 +++++++-------- vpx_dsp/arm/subpel_variance_neon.c | 56 ++++++++++++++-------------- vpx_dsp/arm/variance_neon.c | 22 +++++------ vpx_dsp/arm/vpx_convolve8_neon_asm.c | 8 ++-- vpx_dsp/arm/vpx_convolve8_neon_asm.h | 16 ++++---- 6 files changed, 67 insertions(+), 67 deletions(-) diff --git a/vpx_dsp/arm/fdct32x32_neon.c b/vpx_dsp/arm/fdct32x32_neon.c index e9cd34904b..de74e6630b 100644 --- a/vpx_dsp/arm/fdct32x32_neon.c +++ b/vpx_dsp/arm/fdct32x32_neon.c @@ -153,7 +153,7 @@ static INLINE void load(const int16_t *a, int stride, int16x8_t *b) { do { \ store_s16q_to_tran_low(dest, src[index]); \ dest += 8; \ - } while (0); + } while (0) // Store 32 16x8 values, assuming stride == 32. // Slight twist: store horizontally in blocks of 8. 
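The convention being applied throughout this patch is easier to see outside
the diff context. A minimal sketch, assuming hypothetical macro names
(DEFINE_TWICE and STORE_STEP are not from this tree): a macro that expands
to a complete function definition must not be invoked with a trailing ';',
because the leftover semicolon is an empty declaration that -Wextra-semi
flags, while a statement-like macro should end in `while (0)` without a
';' so that each call site supplies exactly one.

/* Hedged sketch with hypothetical names; not code from this tree. */
#define DEFINE_TWICE(name) \
  static int name(int x) { return 2 * x; }

DEFINE_TWICE(twice) /* no trailing ';': the '}' already ends the definition */

#define STORE_STEP(dst, v) \
  do {                     \
    *(dst)++ = (v);        \
  } while (0) /* no ';' here, so the caller writes STORE_STEP(p, 1); */

static int demo(void) {
  int buf[2], *p = buf;
  STORE_STEP(p, twice(1)); /* exactly one ';', supplied by the caller */
  STORE_STEP(p, twice(2));
  return buf[0] + buf[1];
}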
diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index 59567bda5b..b1509d883a 100644 --- a/vpx_dsp/arm/sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -110,7 +110,7 @@ static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride, return abs; } -#define sad8xN(n) \ +#define SAD8XN(n) \ uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ const uint16x8_t abs = sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ @@ -125,9 +125,9 @@ static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride, return horizontal_add_uint16x8(abs); \ } -sad8xN(4); -sad8xN(8); -sad8xN(16); +SAD8XN(4) +SAD8XN(8) +SAD8XN(16) static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, @@ -167,7 +167,7 @@ static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride, return abs; } -#define sad16xN(n) \ +#define SAD16XN(n) \ uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ const uint16x8_t abs = \ @@ -183,9 +183,9 @@ static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride, return horizontal_add_uint16x8(abs); \ } -sad16xN(8); -sad16xN(16); -sad16xN(32); +SAD16XN(8) +SAD16XN(16) +SAD16XN(32) static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, @@ -235,7 +235,7 @@ static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride, return abs; } -#define sad32xN(n) \ +#define SAD32XN(n) \ uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ const uint16x8_t abs = \ @@ -251,9 +251,9 @@ static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride, return horizontal_add_uint16x8(abs); \ } -sad32xN(16); -sad32xN(32); -sad32xN(64); +SAD32XN(16) +SAD32XN(32) +SAD32XN(64) static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, @@ -333,7 +333,7 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, } } -#define sad64xN(n) \ +#define SAD64XN(n) \ uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ const uint32x4_t abs = \ @@ -349,5 +349,5 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, return horizontal_add_uint32x4(abs); \ } -sad64xN(32); -sad64xN(64); +SAD64XN(32) +SAD64XN(64) diff --git a/vpx_dsp/arm/subpel_variance_neon.c b/vpx_dsp/arm/subpel_variance_neon.c index 37bfd1cd1f..a3befdc348 100644 --- a/vpx_dsp/arm/subpel_variance_neon.c +++ b/vpx_dsp/arm/subpel_variance_neon.c @@ -97,7 +97,7 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, // 4xM filter writes an extra row to fdata because it processes two rows at a // time. 
-#define sub_pixel_varianceNxM(n, m) \ +#define SUB_PIXEL_VARIANCENXM(n, m) \ uint32_t vpx_sub_pixel_variance##n##x##m##_neon( \ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ @@ -123,23 +123,23 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, return vpx_variance##n##x##m(temp1, n, ref_ptr, ref_stride, sse); \ } -sub_pixel_varianceNxM(4, 4); -sub_pixel_varianceNxM(4, 8); -sub_pixel_varianceNxM(8, 4); -sub_pixel_varianceNxM(8, 8); -sub_pixel_varianceNxM(8, 16); -sub_pixel_varianceNxM(16, 8); -sub_pixel_varianceNxM(16, 16); -sub_pixel_varianceNxM(16, 32); -sub_pixel_varianceNxM(32, 16); -sub_pixel_varianceNxM(32, 32); -sub_pixel_varianceNxM(32, 64); -sub_pixel_varianceNxM(64, 32); -sub_pixel_varianceNxM(64, 64); +SUB_PIXEL_VARIANCENXM(4, 4) +SUB_PIXEL_VARIANCENXM(4, 8) +SUB_PIXEL_VARIANCENXM(8, 4) +SUB_PIXEL_VARIANCENXM(8, 8) +SUB_PIXEL_VARIANCENXM(8, 16) +SUB_PIXEL_VARIANCENXM(16, 8) +SUB_PIXEL_VARIANCENXM(16, 16) +SUB_PIXEL_VARIANCENXM(16, 32) +SUB_PIXEL_VARIANCENXM(32, 16) +SUB_PIXEL_VARIANCENXM(32, 32) +SUB_PIXEL_VARIANCENXM(32, 64) +SUB_PIXEL_VARIANCENXM(64, 32) +SUB_PIXEL_VARIANCENXM(64, 64) // 4xM filter writes an extra row to fdata because it processes two rows at a // time. -#define sub_pixel_avg_varianceNxM(n, m) \ +#define SUB_PIXEL_AVG_VARIANCENXM(n, m) \ uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon( \ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ @@ -169,16 +169,16 @@ sub_pixel_varianceNxM(64, 64); return vpx_variance##n##x##m(temp0, n, ref_ptr, ref_stride, sse); \ } -sub_pixel_avg_varianceNxM(4, 4); -sub_pixel_avg_varianceNxM(4, 8); -sub_pixel_avg_varianceNxM(8, 4); -sub_pixel_avg_varianceNxM(8, 8); -sub_pixel_avg_varianceNxM(8, 16); -sub_pixel_avg_varianceNxM(16, 8); -sub_pixel_avg_varianceNxM(16, 16); -sub_pixel_avg_varianceNxM(16, 32); -sub_pixel_avg_varianceNxM(32, 16); -sub_pixel_avg_varianceNxM(32, 32); -sub_pixel_avg_varianceNxM(32, 64); -sub_pixel_avg_varianceNxM(64, 32); -sub_pixel_avg_varianceNxM(64, 64); +SUB_PIXEL_AVG_VARIANCENXM(4, 4) +SUB_PIXEL_AVG_VARIANCENXM(4, 8) +SUB_PIXEL_AVG_VARIANCENXM(8, 4) +SUB_PIXEL_AVG_VARIANCENXM(8, 8) +SUB_PIXEL_AVG_VARIANCENXM(8, 16) +SUB_PIXEL_AVG_VARIANCENXM(16, 8) +SUB_PIXEL_AVG_VARIANCENXM(16, 16) +SUB_PIXEL_AVG_VARIANCENXM(16, 32) +SUB_PIXEL_AVG_VARIANCENXM(32, 16) +SUB_PIXEL_AVG_VARIANCENXM(32, 32) +SUB_PIXEL_AVG_VARIANCENXM(32, 64) +SUB_PIXEL_AVG_VARIANCENXM(64, 32) +SUB_PIXEL_AVG_VARIANCENXM(64, 64) diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index 410ce7d9e6..7b93f142b1 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -268,7 +268,7 @@ void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride, variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16, sse, sum); } -#define varianceNxM(n, m, shift) \ +#define VARIANCENXM(n, m, shift) \ unsigned int vpx_variance##n##x##m##_neon( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, unsigned int *sse) { \ @@ -288,16 +288,16 @@ void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride, return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ } -varianceNxM(4, 4, 4); -varianceNxM(4, 8, 5); -varianceNxM(8, 4, 5); -varianceNxM(8, 8, 6); -varianceNxM(8, 16, 7); -varianceNxM(16, 8, 7); -varianceNxM(16, 16, 8); -varianceNxM(16, 32, 9); -varianceNxM(32, 16, 9); -varianceNxM(32, 32, 10); 
+VARIANCENXM(4, 4, 4) +VARIANCENXM(4, 8, 5) +VARIANCENXM(8, 4, 5) +VARIANCENXM(8, 8, 6) +VARIANCENXM(8, 16, 7) +VARIANCENXM(16, 8, 7) +VARIANCENXM(16, 16, 8) +VARIANCENXM(16, 32, 9) +VARIANCENXM(32, 16, 9) +VARIANCENXM(32, 32, 10) unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, diff --git a/vpx_dsp/arm/vpx_convolve8_neon_asm.c b/vpx_dsp/arm/vpx_convolve8_neon_asm.c index 4470b28b88..c4177c5385 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon_asm.c +++ b/vpx_dsp/arm/vpx_convolve8_neon_asm.c @@ -35,7 +35,7 @@ } \ } -DEFINE_FILTER(horiz); -DEFINE_FILTER(avg_horiz); -DEFINE_FILTER(vert); -DEFINE_FILTER(avg_vert); +DEFINE_FILTER(horiz) +DEFINE_FILTER(avg_horiz) +DEFINE_FILTER(vert) +DEFINE_FILTER(avg_vert) diff --git a/vpx_dsp/arm/vpx_convolve8_neon_asm.h b/vpx_dsp/arm/vpx_convolve8_neon_asm.h index b123d1cb08..f1c7d62ed0 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon_asm.h +++ b/vpx_dsp/arm/vpx_convolve8_neon_asm.h @@ -17,13 +17,13 @@ ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -DECLARE_FILTER(horiz, type1); -DECLARE_FILTER(avg_horiz, type1); -DECLARE_FILTER(horiz, type2); -DECLARE_FILTER(avg_horiz, type2); -DECLARE_FILTER(vert, type1); -DECLARE_FILTER(avg_vert, type1); -DECLARE_FILTER(vert, type2); -DECLARE_FILTER(avg_vert, type2); +DECLARE_FILTER(horiz, type1) +DECLARE_FILTER(avg_horiz, type1) +DECLARE_FILTER(horiz, type2) +DECLARE_FILTER(avg_horiz, type2) +DECLARE_FILTER(vert, type1) +DECLARE_FILTER(avg_vert, type1) +DECLARE_FILTER(vert, type2) +DECLARE_FILTER(avg_vert, type2) #endif // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_ From 69146697b51d3344e8dfe7608cd141699e1f6a59 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 8 Dec 2021 19:34:47 -0800 Subject: [PATCH 182/926] vp9_thread_test.cc: remove incorrect TODO the row-based loop filter is ok (and being used) in this case; since it's serialized the previous row will always be done Change-Id: I024a0c78e7488178956cc22a4c4680a00dc6eade --- test/vp9_thread_test.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc index 352ad71eca..5cac9ea0ee 100644 --- a/test/vp9_thread_test.cc +++ b/test/vp9_thread_test.cc @@ -216,10 +216,6 @@ TEST(VPxWorkerThreadTest, TestSerialInterface) { static const VPxWorkerInterface serial_interface = { impl::Init, impl::Reset, impl::Sync, impl::Launch, impl::Execute, impl::End }; - // TODO(jzern): Avoid using a file that will use the row-based thread - // loopfilter, with the simple serialized implementation it will hang. This is - // due to its expectation that rows will be run in parallel as they wait on - // progress in the row above before proceeding. 
static const char expected_md5[] = "b35a1b707b28e82be025d960aba039bc"; static const char filename[] = "vp90-2-03-size-226x226.webm"; VPxWorkerInterface default_interface = *vpx_get_worker_interface(); From 093a8c4824729be62e582b7d3f00e18830aee3b8 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 8 Dec 2021 21:35:26 -0800 Subject: [PATCH 183/926] test_intra_pred_speed: match above ext w/reconintra only 2 x block_size is needed + remove a related TODO; C & assembly rely on this extension Change-Id: Iea430267624251cccbbdaec8045eb81d01ae1db1 --- test/test_intra_pred_speed.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index 08100a146e..28b3484a03 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -48,11 +48,9 @@ struct IntraPredTestMem { for (int i = 0; i < kBPS; ++i) left[i] = rnd.Rand16() & mask; for (int i = -1; i < kBPS; ++i) above[i] = rnd.Rand16() & mask; - // some code assumes the top row has been extended: - // d45/d63 C-code, for instance, but not the assembly. - // TODO(jzern): this style of extension isn't strictly necessary. + // d45/d63 require the top row to be extended. ASSERT_LE(block_size, kBPS); - for (int i = block_size; i < 2 * kBPS; ++i) { + for (int i = block_size; i < 2 * block_size; ++i) { above[i] = above[block_size - 1]; } } From 7fbcee49da63a61feee00147746efa33e31087e8 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 8 Dec 2021 21:42:28 -0800 Subject: [PATCH 184/926] quiet -Warray-parameter warnings w/gcc-11 this matches the definition of the function with the declaration Change-Id: I757b731b9560cb0b0ceec4ec258ec5af5a183b3d --- vpx_dsp/avg.c | 2 +- vpx_dsp/x86/sad4d_avx2.c | 8 ++++---- vpx_dsp/x86/sad4d_avx512.c | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/vpx_dsp/avg.c b/vpx_dsp/avg.c index 1c45e8a73d..c87ab20ffe 100644 --- a/vpx_dsp/avg.c +++ b/vpx_dsp/avg.c @@ -340,7 +340,7 @@ int vpx_satd_c(const tran_low_t *coeff, int length) { // Integer projection onto row vectors. // height: value range {16, 32, 64}. 
-void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref,
+void vpx_int_pro_row_c(int16_t *hbuf /*[16]*/, const uint8_t *ref,
                        const int ref_stride, const int height) {
   int idx;
   const int norm_factor = height >> 1;
diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c
index a5c4f8c537..9dd0666918 100644
--- a/vpx_dsp/x86/sad4d_avx2.c
+++ b/vpx_dsp/x86/sad4d_avx2.c
@@ -22,8 +22,8 @@ static INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
 }
 
 void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
-                          const uint8_t *const ref_array[4], int ref_stride,
-                          uint32_t sad_array[4]) {
+                          const uint8_t *const ref_array[/*4*/], int ref_stride,
+                          uint32_t *sad_array /*[4]*/) {
   int i;
   const uint8_t *refs[4];
   __m256i sums[4];
@@ -127,8 +127,8 @@ void vpx_sad32x32x8_avx2(const uint8_t *src_ptr, int src_stride,
 }
 
 void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
-                          const uint8_t *const ref_array[4], int ref_stride,
-                          uint32_t sad_array[4]) {
+                          const uint8_t *const ref_array[/*4*/], int ref_stride,
+                          uint32_t *sad_array /*[4]*/) {
   __m256i sums[4];
   int i;
   const uint8_t *refs[4];
diff --git a/vpx_dsp/x86/sad4d_avx512.c b/vpx_dsp/x86/sad4d_avx512.c
index 4c5d70464d..2fa9108718 100644
--- a/vpx_dsp/x86/sad4d_avx512.c
+++ b/vpx_dsp/x86/sad4d_avx512.c
@@ -12,8 +12,8 @@
 #include "vpx/vpx_integer.h"
 
 void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride,
-                            const uint8_t *const ref_array[4], int ref_stride,
-                            uint32_t res[4]) {
+                            const uint8_t *const ref_array[/*4*/],
+                            int ref_stride, uint32_t *res /*[4]*/) {
   __m512i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
   __m512i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
   __m512i sum_mlow, sum_mhigh;

From f3e2a690cd474eae47376b431f5bddf6d73e377c Mon Sep 17 00:00:00 2001
From: James Zern
Date: Thu, 9 Dec 2021 17:32:30 -0800
Subject: [PATCH 185/926] vp9_bitstream.c: quiet -Wstringop-overflow warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

w/gcc-11. The size of interp_filter_selected[][]'s first dimension
varies between VP9_COMP and VP9BitstreamWorkerData, as noted in the
latter's definition:

// The size of interp_filter_selected in VP9_COMP is actually
// MAX_REFERENCE_FRAMES x SWITCHABLE. But when encoding tiles, all we ever do
// is increment the very first index (index 0) for the first dimension. Hence
// this is sufficient.
int interp_filter_selected[1][SWITCHABLE];

normalize the function signatures of write_modes*(), etc. to take this
into account.
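For illustration, a minimal sketch with hypothetical names (count_filter
and demo are not part of this patch): an array parameter's outermost
dimension is not part of its type, so only the inner dimension has to
match; gcc-11 warns when a stated outer bound disagrees with the caller's
declared array.

/* Minimal sketch, assuming hypothetical names; not code from this patch. */
#define SWITCHABLE 4

static void count_filter(int selected[][SWITCHABLE]) {
  ++selected[0][0]; /* tile encoding only ever touches row 0 */
}

static void demo(void) {
  int worker[1][SWITCHABLE] = { { 0 } }; /* VP9BitstreamWorkerData-style */
  int full[8][SWITCHABLE] = { { 0 } };   /* VP9_COMP-style */
  count_filter(worker); /* both calls are accepted without warnings */
  count_filter(full);
}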
vp9/encoder/vp9_bitstream.c|948 col 3| warning: ‘write_modes’ accessing 64 bytes in a region of size 16 [-Wstringop-overflow=] || 948 | write_modes(cpi, xd, &cpi->tile_data[data->tile_idx].tile_info, || | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ || 949 | &data->bit_writer, tile_row, data->tile_idx, || | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ || 950 | &data->max_mv_magnitude, data->interp_filter_selected); || | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ vp9/encoder/vp9_bitstream.c|948 col 3| note: referencing argument 8 of type ‘int (*)[4]’ vp9/encoder/vp9_bitstream.c|488 col 13| note: in a call to function ‘write_modes’ Change-Id: I0898cd7c3431633c382a0c3a1be2f0a0bea8d0f9 --- vp9/encoder/vp9_bitstream.c | 42 ++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index c23e150a45..d4ac4ffc79 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -236,11 +236,11 @@ static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *const xd, } } -static void pack_inter_mode_mvs( - VP9_COMP *cpi, const MACROBLOCKD *const xd, - const MB_MODE_INFO_EXT *const mbmi_ext, vpx_writer *w, - unsigned int *const max_mv_magnitude, - int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]) { +static void pack_inter_mode_mvs(VP9_COMP *cpi, const MACROBLOCKD *const xd, + const MB_MODE_INFO_EXT *const mbmi_ext, + vpx_writer *w, + unsigned int *const max_mv_magnitude, + int interp_filter_selected[][SWITCHABLE]) { VP9_COMMON *const cm = &cpi->common; const nmv_context *nmvc = &cm->fc->nmvc; const struct segmentation *const seg = &cm->seg; @@ -373,11 +373,12 @@ static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd, write_intra_mode(w, mi->uv_mode, vp9_kf_uv_mode_prob[mi->mode]); } -static void write_modes_b( - VP9_COMP *cpi, MACROBLOCKD *const xd, const TileInfo *const tile, - vpx_writer *w, TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, - int mi_row, int mi_col, unsigned int *const max_mv_magnitude, - int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]) { +static void write_modes_b(VP9_COMP *cpi, MACROBLOCKD *const xd, + const TileInfo *const tile, vpx_writer *w, + TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, + int mi_row, int mi_col, + unsigned int *const max_mv_magnitude, + int interp_filter_selected[][SWITCHABLE]) { const VP9_COMMON *const cm = &cpi->common; const MB_MODE_INFO_EXT *const mbmi_ext = cpi->td.mb.mbmi_ext_base + (mi_row * cm->mi_cols + mi_col); @@ -422,12 +423,12 @@ static void write_partition(const VP9_COMMON *const cm, } } -static void write_modes_sb( - VP9_COMP *cpi, MACROBLOCKD *const xd, const TileInfo *const tile, - vpx_writer *w, TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, - int mi_row, int mi_col, BLOCK_SIZE bsize, - unsigned int *const max_mv_magnitude, - int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]) { +static void write_modes_sb(VP9_COMP *cpi, MACROBLOCKD *const xd, + const TileInfo *const tile, vpx_writer *w, + TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, + int mi_row, int mi_col, BLOCK_SIZE bsize, + unsigned int *const max_mv_magnitude, + int interp_filter_selected[][SWITCHABLE]) { const VP9_COMMON *const cm = &cpi->common; const int bsl = b_width_log2_lookup[bsize]; const int bs = (1 << bsl) / 4; @@ -485,11 +486,10 @@ static void write_modes_sb( update_partition_context(xd, mi_row, mi_col, subsize, bsize); } -static void write_modes( - VP9_COMP *cpi, 
MACROBLOCKD *const xd, const TileInfo *const tile, - vpx_writer *w, int tile_row, int tile_col, - unsigned int *const max_mv_magnitude, - int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]) { +static void write_modes(VP9_COMP *cpi, MACROBLOCKD *const xd, + const TileInfo *const tile, vpx_writer *w, int tile_row, + int tile_col, unsigned int *const max_mv_magnitude, + int interp_filter_selected[][SWITCHABLE]) { const VP9_COMMON *const cm = &cpi->common; int mi_row, mi_col, tile_sb_row; TOKENEXTRA *tok = NULL; From 3cff8be3d8b662d96d49b01a53e4fa22278709b2 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 9 Dec 2021 18:02:30 -0800 Subject: [PATCH 186/926] vp9_diamond_search_sad_avx: quiet -Wmaybe-uninitialized warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit w/gcc-11 v_these_mv_w is always initialized in this block with _mm_add_epi16(); converting this to a _mm_storeu_si32(tmp) call also works, but introduces more stack usage || ../vp9/encoder/x86/vp9_diamond_search_sad_avx.c: In function ‘vp9_diamond_search_sad_avx’: vp9/encoder/x86/vp9_diamond_search_sad_avx.c|285 col 19| warning: ‘v_these_mv_w’ may be used uninitialized [-Wmaybe-uninitialized] || 285 | new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx]; || | ~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ vp9/encoder/x86/vp9_diamond_search_sad_avx.c|149 col 21| note: ‘v_these_mv_w’ declared here || 149 | const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w); || | ^~~~~~~~~~~~ Change-Id: I1cd2fcb41030db16f51c94f3a70eb8eb2a526401 --- vp9/encoder/x86/vp9_diamond_search_sad_avx.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c index 4be6a5ea02..fcf50eb2a7 100644 --- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c +++ b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c @@ -282,7 +282,14 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, // Update the global minimum if the local minimum is smaller if (LIKELY(local_best_sad < best_sad)) { +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx]; +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic pop +#endif new_best_address = ((const uint8_t **)v_blocka)[local_best_idx]; best_sad = local_best_sad; From 03a81068467076b4ce4a41dafaac9a9e5cc5f01c Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 9 Dec 2021 18:34:18 -0800 Subject: [PATCH 187/926] vp[89]_initalize_enc(): protect against multiple invocations this removes the burden from callers; the rtcd functions are left with a mostly redundant (outside of tests) once() as top-level functions should ensure their constraints are met Change-Id: I5bdbcfa4671c6a1492cfe9c7d886c361c26caaa9 --- vp8/encoder/onyx_if.c | 14 ++++++-------- vp8/vp8_cx_iface.c | 3 +-- vp9/common/vp9_rtcd.c | 6 +----- vp9/encoder/vp9_encoder.c | 26 ++++++++++++-------------- vp9/vp9_cx_iface.c | 3 +-- vpx_ports/vpx_once.h | 4 ++-- 6 files changed, 23 insertions(+), 33 deletions(-) diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 59bce951e0..f09177c7f5 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -36,6 +36,7 @@ #include "vp8/common/swapyv12buffer.h" #include "vp8/common/threading.h" #include "vpx_ports/system_state.h" +#include "vpx_ports/vpx_once.h" #include 
"vpx_ports/vpx_timer.h" #include "vpx_util/vpx_write_yuv_frame.h" #if VPX_ARCH_ARM @@ -394,16 +395,13 @@ static void setup_features(VP8_COMP *cpi) { static void dealloc_raw_frame_buffers(VP8_COMP *cpi); -void vp8_initialize_enc(void) { - static volatile int init_done = 0; - - if (!init_done) { - vpx_dsp_rtcd(); - vp8_init_intra_predictors(); - init_done = 1; - } +static void initialize_enc(void) { + vpx_dsp_rtcd(); + vp8_init_intra_predictors(); } +void vp8_initialize_enc(void) { once(initialize_enc); } + static void dealloc_compressor_data(VP8_COMP *cpi) { vpx_free(cpi->tplist); cpi->tplist = NULL; diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index ab954c46f2..21fed0e8ed 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -18,7 +18,6 @@ #include "vpx_mem/vpx_mem.h" #include "vpx_ports/static_assert.h" #include "vpx_ports/system_state.h" -#include "vpx_ports/vpx_once.h" #include "vpx_util/vpx_timestamp.h" #include "vp8/encoder/onyx_int.h" #include "vpx/vp8cx.h" @@ -694,7 +693,7 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx, ctx->priv->enc.total_encoders = 1; } - once(vp8_initialize_enc); + vp8_initialize_enc(); res = validate_config(priv, &priv->cfg, &priv->vp8_cfg, 0); diff --git a/vp9/common/vp9_rtcd.c b/vp9/common/vp9_rtcd.c index d8c870aa3f..37762ca15a 100644 --- a/vp9/common/vp9_rtcd.c +++ b/vp9/common/vp9_rtcd.c @@ -12,8 +12,4 @@ #include "./vp9_rtcd.h" #include "vpx_ports/vpx_once.h" -void vp9_rtcd() { - // TODO(JBB): Remove this once, by insuring that both the encoder and - // decoder setup functions are protected by once(); - once(setup_rtcd_internal); -} +void vp9_rtcd() { once(setup_rtcd_internal); } diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 8fdd86916f..8d5ec5a360 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -25,6 +25,7 @@ #endif #include "vpx_ports/mem.h" #include "vpx_ports/system_state.h" +#include "vpx_ports/vpx_once.h" #include "vpx_ports/vpx_timer.h" #if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG #include "vpx_util/vpx_debug_util.h" @@ -929,24 +930,21 @@ static void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) { cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1; } -void vp9_initialize_enc(void) { - static volatile int init_done = 0; - - if (!init_done) { - vp9_rtcd(); - vpx_dsp_rtcd(); - vpx_scale_rtcd(); - vp9_init_intra_predictors(); - vp9_init_me_luts(); - vp9_rc_init_minq_luts(); - vp9_entropy_mv_init(); +static void initialize_enc(void) { + vp9_rtcd(); + vpx_dsp_rtcd(); + vpx_scale_rtcd(); + vp9_init_intra_predictors(); + vp9_init_me_luts(); + vp9_rc_init_minq_luts(); + vp9_entropy_mv_init(); #if !CONFIG_REALTIME_ONLY - vp9_temporal_filter_init(); + vp9_temporal_filter_init(); #endif - init_done = 1; - } } +void vp9_initialize_enc(void) { once(initialize_enc); } + static void dealloc_compressor_data(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; int i; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index cc4081c4f5..9f03ed1728 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -15,7 +15,6 @@ #include "vpx/vpx_encoder.h" #include "vpx/vpx_ext_ratectrl.h" #include "vpx_dsp/psnr.h" -#include "vpx_ports/vpx_once.h" #include "vpx_ports/static_assert.h" #include "vpx_ports/system_state.h" #include "vpx_util/vpx_timestamp.h" @@ -1096,7 +1095,7 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, } priv->extra_cfg = default_extra_cfg; - once(vp9_initialize_enc); + vp9_initialize_enc(); res = validate_config(priv, &priv->cfg, 
&priv->extra_cfg);
diff --git a/vpx_ports/vpx_once.h b/vpx_ports/vpx_once.h
index 4eb592b87e..d8a8ed89fe 100644
--- a/vpx_ports/vpx_once.h
+++ b/vpx_ports/vpx_once.h
@@ -95,7 +95,7 @@ static void once(void (*func)(void)) {
 #define INCL_DOS
 #include <os2.h>
 static void once(void (*func)(void)) {
-  static int done;
+  static volatile int done;
 
   /* If the initialization is complete, return early. */
   if (done) return;
@@ -128,7 +128,7 @@ static void once(void (*func)(void)) {
  */
 static void once(void (*func)(void)) {
-  static int done;
+  static volatile int done;
 
   if (!done) {
     func();

From e7f33a53cf404bbb3688af9b13375b5c090daae4 Mon Sep 17 00:00:00 2001
From: Jianhui Dai
Date: Thu, 9 Dec 2021 13:38:22 +0800
Subject: [PATCH 188/926] Set unused reference frames to first ref

If a reference frame is not referenced, then set the index for that
reference to the first one used/referenced instead of to an unused
slot. An unused slot implies a key frame, since a key frame resets all
slots with itself.

This CL extracts `get_first_ref_frame()` from `reset_fb_idx_unused()`,
fixing a typo along the way, and sets all unused reference frames to
the first ref in the vp9 uncompressed header.

Bug: webrtc:13442
Change-Id: I99523bc2ceedf27efe376d1113851ff342982181
---
 vp9/encoder/vp9_bitstream.c        | 17 ++++++---
 vp9/encoder/vp9_encoder.h          | 22 ++++++++----
 vp9/encoder/vp9_svc_layercontext.c | 55 ++++++++++++------------------
 vpx/vp8cx.h                        | 17 +++++----
 4 files changed, 61 insertions(+), 50 deletions(-)

diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index c23e150a45..3c4bdc9914 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -1241,12 +1241,21 @@ static void write_uncompressed_header(VP9_COMP *cpi,
     vpx_wb_write_literal(wb, vp9_get_refresh_mask(cpi), REF_FRAMES);
     write_frame_size(cm, wb);
   } else {
+    static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+                                      VP9_ALT_FLAG };
+    const MV_REFERENCE_FRAME first_ref = get_first_ref_frame(cpi);
+    const int first_ref_map_idx = get_ref_frame_map_idx(cpi, first_ref);
     MV_REFERENCE_FRAME ref_frame;
     vpx_wb_write_literal(wb, vp9_get_refresh_mask(cpi), REF_FRAMES);
-    for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-      assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
-      vpx_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
-                           REF_FRAMES_LOG2);
+
+    // If a reference frame is not referenced, then set the index for that
+    // reference to the first one used/referenced.
+    for (ref_frame = LAST_FRAME; ref_frame < MAX_REF_FRAMES; ++ref_frame) {
+      const int referenced = cpi->ref_frame_flags & flag_list[ref_frame];
+      const int map_idx = referenced ?
get_ref_frame_map_idx(cpi, ref_frame) + : first_ref_map_idx; + assert(map_idx != INVALID_IDX); + vpx_wb_write_literal(wb, map_idx, REF_FRAMES_LOG2); vpx_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]); } diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 9774a64ccf..1bca7ded75 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1196,14 +1196,24 @@ static INLINE int frame_is_kf_gf_arf(const VP9_COMP *cpi) { (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref); } +static INLINE MV_REFERENCE_FRAME get_first_ref_frame(VP9_COMP *const cpi) { + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + MV_REFERENCE_FRAME ref_frame = LAST_FRAME; + while (ref_frame < MAX_REF_FRAMES) { + if (cpi->ref_frame_flags & flag_list[ref_frame]) break; + ref_frame++; + } + return ref_frame; +} + static INLINE int get_ref_frame_map_idx(const VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) { - if (ref_frame == LAST_FRAME) { - return cpi->lst_fb_idx; - } else if (ref_frame == GOLDEN_FRAME) { - return cpi->gld_fb_idx; - } else { - return cpi->alt_fb_idx; + switch (ref_frame) { + case LAST_FRAME: return cpi->lst_fb_idx; + case GOLDEN_FRAME: return cpi->gld_fb_idx; + case ALTREF_FRAME: return cpi->alt_fb_idx; + default: return INVALID_IDX; } } diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index ad3a8f7afa..f01cb17a2f 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -73,7 +73,7 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { svc->downsample_filter_type[sl] = BILINEAR; svc->downsample_filter_phase[sl] = 8; // Set to 8 for averaging filter. svc->framedrop_thresh[sl] = oxcf->drop_frames_water_mark; - svc->fb_idx_upd_tl0[sl] = -1; + svc->fb_idx_upd_tl0[sl] = INVALID_IDX; svc->drop_count[sl] = 0; svc->spatial_layer_sync[sl] = 0; svc->force_drop_constrained_from_above[sl] = 0; @@ -462,32 +462,21 @@ static void reset_fb_idx_unused(VP9_COMP *const cpi) { // fb_idx for that reference to the first one used/referenced. // This is to avoid setting fb_idx for a reference to a slot that is not // used/needed (i.e., since that reference is not referenced or refreshed). 
- static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; - MV_REFERENCE_FRAME ref_frame; - MV_REFERENCE_FRAME first_ref = 0; - int first_fb_idx = 0; - int fb_idx[3] = { cpi->lst_fb_idx, cpi->gld_fb_idx, cpi->alt_fb_idx }; - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { - if (cpi->ref_frame_flags & flag_list[ref_frame]) { - first_ref = ref_frame; - first_fb_idx = fb_idx[ref_frame - 1]; - break; + const MV_REFERENCE_FRAME first_ref = get_first_ref_frame(cpi); + const int map_idx = get_ref_frame_map_idx(cpi, first_ref); + if (map_idx != INVALID_IDX) { + if (!(cpi->ref_frame_flags & VP9_LAST_FLAG || + cpi->ext_refresh_last_frame)) { + cpi->lst_fb_idx = map_idx; + } + if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG || + cpi->ext_refresh_golden_frame)) { + cpi->gld_fb_idx = map_idx; + } + if (!(cpi->ref_frame_flags & VP9_ALT_FLAG || + cpi->ext_refresh_alt_ref_frame)) { + cpi->alt_fb_idx = map_idx; } - } - if (first_ref > 0) { - if (first_ref != LAST_FRAME && - !(cpi->ref_frame_flags & flag_list[LAST_FRAME]) && - !cpi->ext_refresh_last_frame) - cpi->lst_fb_idx = first_fb_idx; - else if (first_ref != GOLDEN_FRAME && - !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && - !cpi->ext_refresh_golden_frame) - cpi->gld_fb_idx = first_fb_idx; - else if (first_ref != ALTREF_FRAME && - !(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]) && - !cpi->ext_refresh_alt_ref_frame) - cpi->alt_fb_idx = first_fb_idx; } } @@ -716,9 +705,9 @@ static void set_flags_and_fb_idx_bypass_via_set_ref_frame_config( int sl = svc->spatial_layer_id = svc->spatial_layer_to_encode; cpi->svc.temporal_layer_id = cpi->svc.temporal_layer_id_per_spatial[sl]; cpi->ext_refresh_frame_flags_pending = 1; - cpi->lst_fb_idx = svc->lst_fb_idx[sl]; - cpi->gld_fb_idx = svc->gld_fb_idx[sl]; - cpi->alt_fb_idx = svc->alt_fb_idx[sl]; + if (svc->reference_last[sl]) cpi->lst_fb_idx = svc->lst_fb_idx[sl]; + if (svc->reference_golden[sl]) cpi->gld_fb_idx = svc->gld_fb_idx[sl]; + if (svc->reference_altref[sl]) cpi->alt_fb_idx = svc->alt_fb_idx[sl]; cpi->ext_refresh_last_frame = 0; cpi->ext_refresh_golden_frame = 0; cpi->ext_refresh_alt_ref_frame = 0; @@ -875,9 +864,9 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { // flags are passed via the encode call (bypass mode). Issue is that we're // resetting ext_refresh_frame_flags_pending to 0 on frame drops. if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { - memset(&svc->lst_fb_idx, -1, sizeof(svc->lst_fb_idx)); - memset(&svc->gld_fb_idx, -1, sizeof(svc->lst_fb_idx)); - memset(&svc->alt_fb_idx, -1, sizeof(svc->lst_fb_idx)); + memset(&svc->lst_fb_idx, INVALID_IDX, sizeof(svc->lst_fb_idx)); + memset(&svc->gld_fb_idx, INVALID_IDX, sizeof(svc->lst_fb_idx)); + memset(&svc->alt_fb_idx, INVALID_IDX, sizeof(svc->lst_fb_idx)); // These are set by API before the superframe is encoded and they are // passed to encoder layer by layer. Don't reset them on layer 0 in bypass // mode. 
@@ -970,7 +959,7 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && svc->last_layer_dropped[svc->spatial_layer_id] && - svc->fb_idx_upd_tl0[svc->spatial_layer_id] != -1 && + svc->fb_idx_upd_tl0[svc->spatial_layer_id] != INVALID_IDX && !svc->layer_context[svc->temporal_layer_id].is_key_frame) { // For fixed/non-flexible mode, if the previous frame (same spatial layer // from previous superframe) was dropped, make sure the lst_fb_idx diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 47c38d3b5e..b3c50a9b67 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -897,13 +897,16 @@ typedef struct vpx_svc_ref_frame_config { int alt_fb_idx[VPX_SS_MAX_LAYERS]; /**< Altref buffer index. */ int update_buffer_slot[VPX_SS_MAX_LAYERS]; /**< Update reference frames. */ // TODO(jianj): Remove update_last/golden/alt_ref, these are deprecated. - int update_last[VPX_SS_MAX_LAYERS]; /**< Update last. */ - int update_golden[VPX_SS_MAX_LAYERS]; /**< Update golden. */ - int update_alt_ref[VPX_SS_MAX_LAYERS]; /**< Update altref. */ - int reference_last[VPX_SS_MAX_LAYERS]; /**< Last as reference. */ - int reference_golden[VPX_SS_MAX_LAYERS]; /**< Golden as reference. */ - int reference_alt_ref[VPX_SS_MAX_LAYERS]; /**< Altref as reference. */ - int64_t duration[VPX_SS_MAX_LAYERS]; /**< Duration per spatial layer. */ + int update_last[VPX_SS_MAX_LAYERS]; /**< Update last. */ + int update_golden[VPX_SS_MAX_LAYERS]; /**< Update golden. */ + int update_alt_ref[VPX_SS_MAX_LAYERS]; /**< Update altref. */ + int reference_last[VPX_SS_MAX_LAYERS]; + /**< Last as reference. Use first referenced index if FALSE. */ + int reference_golden[VPX_SS_MAX_LAYERS]; + /**< Golden as reference. Use first referenced index if FALSE. */ + int reference_alt_ref[VPX_SS_MAX_LAYERS]; + /**< Altref as reference. Use first referenced index if FALSE. */ + int64_t duration[VPX_SS_MAX_LAYERS]; /**< Duration per spatial layer. */ } vpx_svc_ref_frame_config_t; /*!\brief VP9 svc frame dropping mode. 
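The fallback added above can be condensed into a standalone sketch; the
names and flag values below are simplified stand-ins, not libvpx's actual
constants:

#include <assert.h>

/* Hedged sketch of the first-referenced-frame fallback; simplified names. */
enum { LAST = 1, GOLDEN = 2, ALTREF = 3, MAX_REFS = 4 };

static int ref_to_flag(int ref) {
  static const int kFlags[MAX_REFS] = { 0, 1 << 0, 1 << 1, 1 << 2 };
  assert(ref >= LAST && ref <= ALTREF);
  return kFlags[ref];
}

static int first_ref(int ref_flags) {
  int ref = LAST;
  while (ref < MAX_REFS && !(ref_flags & ref_to_flag(ref))) ++ref;
  assert(ref < MAX_REFS); /* an inter frame references at least one slot */
  return ref;
}

/* Index written into the header: the real map index when the frame is
 * referenced, otherwise the first referenced frame's index; e.g. if only
 * GOLDEN is referenced, LAST and ALTREF are both written as GOLDEN's. */
static int header_map_idx(int ref, int ref_flags, const int map[MAX_REFS]) {
  return (ref_flags & ref_to_flag(ref)) ? map[ref]
                                        : map[first_ref(ref_flags)];
}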
From ea042a676ee09987dc5c8fccaef6ea941eaea258 Mon Sep 17 00:00:00 2001 From: Fyodor Kyslov Date: Tue, 14 Dec 2021 09:59:17 -0800 Subject: [PATCH 189/926] vp9 encoder: fix integer overflows fixing integer overflow with 16K content and enabling the test Bug: webm:1750 Fixed: webm:1750 Change-Id: I76eebd915bcae55bc755613251a98e1716dea4c0 --- test/realtime_test.cc | 4 +--- vp9/encoder/vp9_bitstream.c | 6 +++--- vp9/encoder/vp9_cost.h | 5 ++--- vp9/encoder/vp9_subexp.c | 35 ++++++++++++++++++----------------- vp9/encoder/vp9_subexp.h | 15 ++++++++------- 5 files changed, 32 insertions(+), 33 deletions(-) diff --git a/test/realtime_test.cc b/test/realtime_test.cc index ab2080a85d..107f2e224f 100644 --- a/test/realtime_test.cc +++ b/test/realtime_test.cc @@ -85,9 +85,7 @@ TEST_P(RealtimeTest, IntegerOverflow) { TestIntegerOverflow(2048, 2048); } TEST_P(RealtimeTest, IntegerOverflowLarge) { if (IsVP9()) { - GTEST_SKIP() << "TODO(https://crbug.com/webm/1750): Enable this test after " - "undefined sanitizer warnings are fixed."; - // TestIntegerOverflow(16384, 16384); + TestIntegerOverflow(16384, 16384); } else { GTEST_SKIP() << "TODO(https://crbug.com/webm/1748,https://crbug.com/webm/1751):" diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index c23e150a45..e23ca97730 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -563,7 +563,7 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi, for (t = 0; t < entropy_nodes_update; ++t) { vpx_prob newp = new_coef_probs[i][j][k][l][t]; const vpx_prob oldp = old_coef_probs[i][j][k][l][t]; - int s; + int64_t s; int u = 0; if (t == PIVOT_NODE) s = vp9_prob_diff_update_savings_search_model( @@ -600,7 +600,7 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi, vpx_prob newp = new_coef_probs[i][j][k][l][t]; vpx_prob *oldp = old_coef_probs[i][j][k][l] + t; const vpx_prob upd = DIFF_UPDATE_PROB; - int s; + int64_t s; int u = 0; if (t == PIVOT_NODE) s = vp9_prob_diff_update_savings_search_model( @@ -636,7 +636,7 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi, for (t = 0; t < entropy_nodes_update; ++t) { vpx_prob newp = new_coef_probs[i][j][k][l][t]; vpx_prob *oldp = old_coef_probs[i][j][k][l] + t; - int s; + int64_t s; int u = 0; if (t == PIVOT_NODE) { diff --git a/vp9/encoder/vp9_cost.h b/vp9/encoder/vp9_cost.h index 638d72a916..ee0033fa31 100644 --- a/vp9/encoder/vp9_cost.h +++ b/vp9/encoder/vp9_cost.h @@ -29,9 +29,8 @@ extern const uint16_t vp9_prob_cost[256]; #define vp9_cost_bit(prob, bit) vp9_cost_zero((bit) ? 
256 - (prob) : (prob)) -static INLINE unsigned int cost_branch256(const unsigned int ct[2], - vpx_prob p) { - return ct[0] * vp9_cost_zero(p) + ct[1] * vp9_cost_one(p); +static INLINE uint64_t cost_branch256(const unsigned int ct[2], vpx_prob p) { + return (uint64_t)ct[0] * vp9_cost_zero(p) + (uint64_t)ct[1] * vp9_cost_one(p); } static INLINE int treed_cost(vpx_tree tree, const vpx_prob *probs, int bits, diff --git a/vp9/encoder/vp9_subexp.c b/vp9/encoder/vp9_subexp.c index 19bbd5373f..661294ba04 100644 --- a/vp9/encoder/vp9_subexp.c +++ b/vp9/encoder/vp9_subexp.c @@ -114,19 +114,20 @@ void vp9_write_prob_diff_update(vpx_writer *w, vpx_prob newp, vpx_prob oldp) { encode_term_subexp(w, delp); } -int vp9_prob_diff_update_savings_search(const unsigned int *ct, vpx_prob oldp, - vpx_prob *bestp, vpx_prob upd) { - const int old_b = cost_branch256(ct, oldp); - int bestsavings = 0; +int64_t vp9_prob_diff_update_savings_search(const unsigned int *ct, + vpx_prob oldp, vpx_prob *bestp, + vpx_prob upd) { + const int64_t old_b = cost_branch256(ct, oldp); + int64_t bestsavings = 0; vpx_prob newp, bestnewp = oldp; const int step = *bestp > oldp ? -1 : 1; const int upd_cost = vp9_cost_one(upd) - vp9_cost_zero(upd); if (old_b > upd_cost + (MIN_DELP_BITS << VP9_PROB_COST_SHIFT)) { for (newp = *bestp; newp != oldp; newp += step) { - const int new_b = cost_branch256(ct, newp); - const int update_b = prob_diff_update_cost(newp, oldp) + upd_cost; - const int savings = old_b - new_b - update_b; + const int64_t new_b = cost_branch256(ct, newp); + const int64_t update_b = prob_diff_update_cost(newp, oldp) + upd_cost; + const int64_t savings = old_b - new_b - update_b; if (savings > bestsavings) { bestsavings = savings; bestnewp = newp; @@ -137,15 +138,15 @@ int vp9_prob_diff_update_savings_search(const unsigned int *ct, vpx_prob oldp, return bestsavings; } -int vp9_prob_diff_update_savings_search_model(const unsigned int *ct, - const vpx_prob oldp, - vpx_prob *bestp, vpx_prob upd, - int stepsize) { - int i, old_b, new_b, update_b, savings, bestsavings; - int newp; - const int step_sign = *bestp > oldp ? -1 : 1; - const int step = stepsize * step_sign; - const int upd_cost = vp9_cost_one(upd) - vp9_cost_zero(upd); +int64_t vp9_prob_diff_update_savings_search_model(const unsigned int *ct, + const vpx_prob oldp, + vpx_prob *bestp, vpx_prob upd, + int stepsize) { + int64_t i, old_b, new_b, update_b, savings, bestsavings; + int64_t newp; + const int64_t step_sign = *bestp > oldp ? 
-1 : 1;
+  const int64_t step = stepsize * step_sign;
+  const int64_t upd_cost = vp9_cost_one(upd) - vp9_cost_zero(upd);
   const vpx_prob *newplist, *oldplist;
   vpx_prob bestnewp;
   oldplist = vp9_pareto8_full[oldp - 1];
@@ -182,7 +183,7 @@ void vp9_cond_prob_diff_update(vpx_writer *w, vpx_prob *oldp,
                                const unsigned int ct[2]) {
   const vpx_prob upd = DIFF_UPDATE_PROB;
   vpx_prob newp = get_binary_prob(ct[0], ct[1]);
-  const int savings =
+  const int64_t savings =
       vp9_prob_diff_update_savings_search(ct, *oldp, &newp, upd);
   assert(newp >= 1);
   if (savings > 0) {
diff --git a/vp9/encoder/vp9_subexp.h b/vp9/encoder/vp9_subexp.h
index f0d544b527..2d016d24c5 100644
--- a/vp9/encoder/vp9_subexp.h
+++ b/vp9/encoder/vp9_subexp.h
@@ -25,13 +25,14 @@ void vp9_write_prob_diff_update(struct vpx_writer *w, vpx_prob newp,
 void vp9_cond_prob_diff_update(struct vpx_writer *w, vpx_prob *oldp,
                                const unsigned int ct[2]);
 
-int vp9_prob_diff_update_savings_search(const unsigned int *ct, vpx_prob oldp,
-                                        vpx_prob *bestp, vpx_prob upd);
-
-int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
-                                              const vpx_prob oldp,
-                                              vpx_prob *bestp, vpx_prob upd,
-                                              int stepsize);
+int64_t vp9_prob_diff_update_savings_search(const unsigned int *ct,
+                                            vpx_prob oldp, vpx_prob *bestp,
+                                            vpx_prob upd);
+
+int64_t vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
+                                                  const vpx_prob oldp,
+                                                  vpx_prob *bestp, vpx_prob upd,
+                                                  int stepsize);
 
 #ifdef __cplusplus
 }  // extern "C"

From 6bf761a7ef8e8532d8a88c95d255a2873077de5d Mon Sep 17 00:00:00 2001
From: Fyodor Kyslov
Date: Wed, 15 Dec 2021 23:11:15 -0800
Subject: [PATCH 190/926] vp9 encoder: fix test failure on 32 bit arch

The test fails with a memory error on 32-bit targets; reduce the
testing resolution there.

Bug: webm:1750
Change-Id: I75664088022aa660bdf6e69de2d11121db44716f
---
 test/realtime_test.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test/realtime_test.cc b/test/realtime_test.cc
index 107f2e224f..853b942824 100644
--- a/test/realtime_test.cc
+++ b/test/realtime_test.cc
@@ -85,7 +85,11 @@ TEST_P(RealtimeTest, IntegerOverflow) { TestIntegerOverflow(2048, 2048); }
 
 TEST_P(RealtimeTest, IntegerOverflowLarge) {
   if (IsVP9()) {
+#if VPX_ARCH_X86_64
     TestIntegerOverflow(16384, 16384);
+#else
+    TestIntegerOverflow(4096, 4096);
+#endif
   } else {
     GTEST_SKIP()
         << "TODO(https://crbug.com/webm/1748,https://crbug.com/webm/1751):"

From 94972ca7ea2a93806d0f902dd222ec164e81997d Mon Sep 17 00:00:00 2001
From: James Zern
Date: Mon, 20 Dec 2021 19:56:01 -0800
Subject: [PATCH 191/926] vpx_int_pro_row: normalize declaration w/aom

this is a followup to:
7fbcee49d quiet -Warray-parameter warnings
and conforms to aom in:
06e13e817 quiet -Warray-parameter warnings

the sad functions are more varied in libvpx and will require a separate
pass

Change-Id: I765fd6704df615e836ba0b184ff8266ce926c394
---
 vpx_dsp/avg.c                 | 2 +-
 vpx_dsp/vpx_dsp_rtcd_defs.pl  | 2 +-
 vpx_dsp/x86/avg_intrin_sse2.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/vpx_dsp/avg.c b/vpx_dsp/avg.c
index c87ab20ffe..1c45e8a73d 100644
--- a/vpx_dsp/avg.c
+++ b/vpx_dsp/avg.c
@@ -340,7 +340,7 @@ int vpx_satd_c(const tran_low_t *coeff, int length) {
 
 // Integer projection onto row vectors.
 // height: value range {16, 32, 64}.
-void vpx_int_pro_row_c(int16_t *hbuf /*[16]*/, const uint8_t *ref, +void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height) { int idx; const int norm_factor = height >> 1; diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index fd7eefdad0..0144b90c26 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -824,7 +824,7 @@ () specialize qw/vpx_satd avx2 sse2 neon msa/; } - add_proto qw/void vpx_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height"; + add_proto qw/void vpx_int_pro_row/, "int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height"; specialize qw/vpx_int_pro_row sse2 neon msa/; add_proto qw/int16_t vpx_int_pro_col/, "const uint8_t *ref, const int width"; diff --git a/vpx_dsp/x86/avg_intrin_sse2.c b/vpx_dsp/x86/avg_intrin_sse2.c index 3cba258f61..9da2f34c9b 100644 --- a/vpx_dsp/x86/avg_intrin_sse2.c +++ b/vpx_dsp/x86/avg_intrin_sse2.c @@ -464,7 +464,7 @@ int vpx_satd_sse2(const tran_low_t *coeff, int length) { return _mm_cvtsi128_si32(accum); } -void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, +void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height) { int idx; __m128i zero = _mm_setzero_si128(); From b685d6f02f1e3ccad9b1debd8dcf4c7a06bfaab6 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 21 Dec 2021 11:53:51 -0800 Subject: [PATCH 192/926] vp9_prob_diff_update_savings_search_model: quiet conv warnings under Visual Studio: Warning C4244 '=': conversion from 'int64_t' to 'vpx_prob', possible loss of data after: ea042a676 vp9 encoder: fix integer overflows 'newp' has already been range checked earlier in the loop so the cast won't have any unexpected results Change-Id: Ic10877db2c0633d53fffdf8852d5095403c23a02 --- vp9/encoder/vp9_subexp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vp9/encoder/vp9_subexp.c b/vp9/encoder/vp9_subexp.c index 661294ba04..3953253dbb 100644 --- a/vp9/encoder/vp9_subexp.c +++ b/vp9/encoder/vp9_subexp.c @@ -163,14 +163,14 @@ int64_t vp9_prob_diff_update_savings_search_model(const unsigned int *ct, for (newp = *bestp; (newp - oldp) * step_sign < 0; newp += step) { if (newp < 1 || newp > 255) continue; newplist = vp9_pareto8_full[newp - 1]; - new_b = cost_branch256(ct + 2 * PIVOT_NODE, newp); + new_b = cost_branch256(ct + 2 * PIVOT_NODE, (vpx_prob)newp); for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i) new_b += cost_branch256(ct + 2 * i, newplist[i - UNCONSTRAINED_NODES]); - update_b = prob_diff_update_cost(newp, oldp) + upd_cost; + update_b = prob_diff_update_cost((vpx_prob)newp, oldp) + upd_cost; savings = old_b - new_b - update_b; if (savings > bestsavings) { bestsavings = savings; - bestnewp = newp; + bestnewp = (vpx_prob)newp; } } } From 44e611482e13fdffa0acde780a20dd68ee153498 Mon Sep 17 00:00:00 2001 From: Jianhui Dai Date: Sat, 1 Jan 2022 08:01:48 +0800 Subject: [PATCH 193/926] Add vp9 ref frame to flag map function Change-Id: I371c2346b9e0153c0f8053cab399ce14cd286c56 --- vp9/encoder/vp9_bitstream.c | 7 +++--- vp9/encoder/vp9_encoder.c | 4 +-- vp9/encoder/vp9_encoder.h | 15 ++++++++---- vp9/encoder/vp9_pickmode.c | 39 ++++++++++++++---------------- vp9/encoder/vp9_rdopt.c | 16 ++++++------ vp9/encoder/vp9_speed_features.c | 5 ++-- vp9/encoder/vp9_svc_layercontext.c | 18 +++++--------- 7 files changed, 47 insertions(+), 57 deletions(-) diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 
7644930c1c..c5145acf08 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -1241,9 +1241,7 @@ static void write_uncompressed_header(VP9_COMP *cpi, vpx_wb_write_literal(wb, vp9_get_refresh_mask(cpi), REF_FRAMES); write_frame_size(cm, wb); } else { - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; - const MV_REFERENCE_FRAME first_ref = get_first_ref_frame(cpi); + const int first_ref = get_first_ref_frame(cpi); const int first_ref_map_idx = get_ref_frame_map_idx(cpi, first_ref); MV_REFERENCE_FRAME ref_frame; vpx_wb_write_literal(wb, vp9_get_refresh_mask(cpi), REF_FRAMES); @@ -1251,7 +1249,8 @@ static void write_uncompressed_header(VP9_COMP *cpi, // If a reference frame is not referenced, then set the index for that // reference to the first one used/referenced. for (ref_frame = LAST_FRAME; ref_frame < MAX_REF_FRAMES; ++ref_frame) { - const int referenced = cpi->ref_frame_flags & flag_list[ref_frame]; + const int referenced = + cpi->ref_frame_flags & ref_frame_to_flag(ref_frame); const int map_idx = referenced ? get_ref_frame_map_idx(cpi, ref_frame) : first_ref_map_idx; assert(map_idx != INVALID_IDX); diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 8d5ec5a360..1038bd9515 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -586,8 +586,6 @@ static void apply_roi_map(VP9_COMP *cpi) { int ref_frame[8]; int internal_delta_q[MAX_SEGMENTS]; int i; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; // TODO(jianj): Investigate why ROI not working in speed < 5 or in non // realtime mode. @@ -628,7 +626,7 @@ static void apply_roi_map(VP9_COMP *cpi) { valid_ref = 0; // If GOLDEN is selected, make sure it's set as reference. 
if (ref_frame[i] == GOLDEN_FRAME && - !(cpi->ref_frame_flags & flag_list[ref_frame[i]])) { + !(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame[i]))) { valid_ref = 0; } // GOLDEN was updated in previous encoded frame, so GOLDEN and LAST are diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 1bca7ded75..0059e881be 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1196,12 +1196,17 @@ static INLINE int frame_is_kf_gf_arf(const VP9_COMP *cpi) { (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref); } -static INLINE MV_REFERENCE_FRAME get_first_ref_frame(VP9_COMP *const cpi) { - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; - MV_REFERENCE_FRAME ref_frame = LAST_FRAME; +static INLINE int ref_frame_to_flag(int8_t ref_frame) { + static const int kVp9RefFlagList[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME); + return kVp9RefFlagList[ref_frame]; +} + +static INLINE int get_first_ref_frame(VP9_COMP *const cpi) { + int ref_frame = LAST_FRAME; while (ref_frame < MAX_REF_FRAMES) { - if (cpi->ref_frame_flags & flag_list[ref_frame]) break; + if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) break; ref_frame++; } return ref_frame; diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 695fd484fc..c8e167f25b 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -1247,7 +1247,7 @@ static INLINE void find_predictors( VP9_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int const_motion[MAX_REF_FRAMES], int *ref_frame_skip_mask, - const int flag_list[4], TileDataEnc *tile_data, int mi_row, int mi_col, + TileDataEnc *tile_data, int mi_row, int mi_col, struct buf_2d yv12_mb[4][MAX_MB_PLANE], BLOCK_SIZE bsize, int force_skip_low_temp_var, int comp_pred_allowed) { VP9_COMMON *const cm = &cpi->common; @@ -1259,7 +1259,7 @@ static INLINE void find_predictors( frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; frame_mv[ZEROMV][ref_frame].as_int = 0; // this needs various further optimizations. to be continued.. - if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) { + if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && (yv12 != NULL)) { int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame]; const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); @@ -1690,8 +1690,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; uint8_t mode_checked[MB_MODE_COUNT][MAX_REF_FRAMES]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; RD_COST this_rdc, best_rdc; // var_y and sse_y are saved to be used in skipping checking unsigned int var_y = UINT_MAX; @@ -1925,14 +1923,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // constrain the inter mode to only test zero motion. 
if (cpi->use_svc && svc->force_zero_mode_spatial_ref && svc->spatial_layer_id > 0 && !gf_temporal_ref) { - if (cpi->ref_frame_flags & flag_list[LAST_FRAME]) { + if (cpi->ref_frame_flags & VP9_LAST_FLAG) { struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf; if (vp9_is_scaled(sf)) { svc_force_zero_mode[LAST_FRAME - 1] = 1; inter_layer_ref = LAST_FRAME; } } - if (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) { + if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { struct scale_factors *const sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf; if (vp9_is_scaled(sf)) { svc_force_zero_mode[GOLDEN_FRAME - 1] = 1; @@ -1957,7 +1955,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, cpi->rc.avg_frame_low_motion < 60)) usable_ref_frame = LAST_FRAME; - if (!((cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && + if (!((cpi->ref_frame_flags & VP9_GOLD_FLAG) && !svc_force_zero_mode[GOLDEN_FRAME - 1] && !force_skip_low_temp_var)) use_golden_nonzeromv = 0; @@ -1985,12 +1983,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // Skip find_predictor if the reference frame is not in the // ref_frame_flags (i.e., not used as a reference for this frame). skip_ref_find_pred[ref_frame] = - !(cpi->ref_frame_flags & flag_list[ref_frame]); + !(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)); if (!skip_ref_find_pred[ref_frame]) { find_predictors(cpi, x, ref_frame, frame_mv, const_motion, - &ref_frame_skip_mask, flag_list, tile_data, mi_row, - mi_col, yv12_mb, bsize, force_skip_low_temp_var, - comp_modes > 0); + &ref_frame_skip_mask, tile_data, mi_row, mi_col, yv12_mb, + bsize, force_skip_low_temp_var, comp_modes > 0); } } @@ -2014,7 +2011,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // than current layer: force check of GF-ZEROMV before early exit // due to skip flag. if (svc->spatial_layer_id > 0 && no_scaling && - (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && + (cpi->ref_frame_flags & VP9_GOLD_FLAG) && cm->base_qindex > svc->lower_layer_qindex + 10) force_test_gf_zeromv = 1; @@ -2094,7 +2091,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (comp_pred) { if (!cpi->allow_comp_inter_inter) continue; // Skip compound inter modes if ARF is not available. - if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; + if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame))) + continue; // Do not allow compound prediction if the segment level reference frame // feature is in use as in this case there can only be one reference. if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) continue; @@ -2107,7 +2105,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, (!cpi->use_svc && sse_zeromv_normalized < thresh_skip_golden))) continue; - if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; + if (!(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) continue; // For screen content. If zero_temp_sad source is computed: skip // non-zero motion check for stationary blocks. If the superblock is @@ -2190,7 +2188,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (usable_ref_frame < ALTREF_FRAME) { if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) { i = (ref_frame == LAST_FRAME) ? 
GOLDEN_FRAME : LAST_FRAME; - if ((cpi->ref_frame_flags & flag_list[i])) + if ((cpi->ref_frame_flags & ref_frame_to_flag(i))) if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1)) ref_frame_skip_mask |= (1 << ref_frame); } @@ -2199,9 +2197,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, ref_frame == ALTREF_FRAME)) { int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME; int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME; - if (((cpi->ref_frame_flags & flag_list[ref1]) && + if (((cpi->ref_frame_flags & ref_frame_to_flag(ref1)) && (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) || - ((cpi->ref_frame_flags & flag_list[ref2]) && + ((cpi->ref_frame_flags & ref_frame_to_flag(ref2)) && (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1)))) ref_frame_skip_mask |= (1 << ref_frame); } @@ -2488,7 +2486,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, perform_intra_pred = svc->temporal_layer_id == 0 || svc->layer_context[svc->temporal_layer_id].is_key_frame || - !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) || + !(cpi->ref_frame_flags & VP9_GOLD_FLAG) || (!svc->layer_context[svc->temporal_layer_id].is_key_frame && svc_force_zero_mode[best_pickmode.best_ref_frame - 1]); inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh; @@ -2747,8 +2745,6 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, MV_REFERENCE_FRAME best_ref_frame = NONE; unsigned char segment_id = mi->segment_id; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; int64_t best_rd = INT64_MAX; b_mode_info bsi[MAX_REF_FRAMES][4]; int ref_frame_skip_mask = 0; @@ -2764,7 +2760,8 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, int_mv dummy_mv[2]; x->pred_mv_sad[ref_frame] = INT_MAX; - if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) { + if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && + (yv12 != NULL)) { int_mv *const candidates = mbmi_ext->ref_mvs[ref_frame]; const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index a1687dcf46..0171a05720 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -3315,8 +3315,6 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } }; INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES]; int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES]; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; int64_t best_rd = best_rd_so_far; int64_t best_pred_diff[REFERENCE_MODES]; int64_t best_pred_rd[REFERENCE_MODES]; @@ -3392,7 +3390,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { x->pred_mv_sad[ref_frame] = INT_MAX; - if ((cpi->ref_frame_flags & flag_list[ref_frame]) && + if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && !(is_rect_partition && (ctx->skip_ref_frame_mask & (1 << ref_frame)))) { assert(get_ref_frame_buffer(cpi, ref_frame) != NULL); setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, @@ -3403,7 +3401,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, } for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; 
++ref_frame) { - if (!(cpi->ref_frame_flags & flag_list[ref_frame])) { + if (!(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) { // Skip checking missing references in both single and compound reference // modes. Note that a mode will be skipped if both reference frames // are masked out. @@ -3609,7 +3607,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, continue; // Skip compound inter modes if ARF is not available. - if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; + if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame))) + continue; // Do not allow compound prediction if the segment level reference frame // feature is in use as in this case there can only be one reference. @@ -4140,8 +4139,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, int comp_pred, i; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; int64_t best_rd = best_rd_so_far; int64_t best_yrd = best_rd_so_far; // FIXME(rbultje) more precise int64_t best_pred_diff[REFERENCE_MODES]; @@ -4191,7 +4188,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, rd_cost->rate = INT_MAX; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { - if (cpi->ref_frame_flags & flag_list[ref_frame]) { + if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) { setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); } else { @@ -4276,7 +4273,8 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, cm->ref_frame_sign_bias[second_ref_frame]) continue; - if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; + if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame))) + continue; // Do not allow compound prediction if the segment level reference frame // feature is in use as in this case there can only be one reference. 
if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue; diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 81695e9156..7d7b2c3fb4 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -495,11 +495,10 @@ static void set_rt_speed_feature_framesize_independent( (cpi->external_resize == 1 || cpi->oxcf.resize_mode == RESIZE_DYNAMIC)) { MV_REFERENCE_FRAME ref_frame; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); - if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) { + if (yv12 != NULL && + (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) { const struct scale_factors *const scale_fac = &cm->frame_refs[ref_frame - 1].sf; if (vp9_is_scaled(scale_fac)) sf->reference_masking = 0; diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index f01cb17a2f..30c17fd8e6 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -719,8 +719,6 @@ static void set_flags_and_fb_idx_bypass_via_set_ref_frame_config( void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) { SVC *const svc = &cpi->svc; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; int sl = svc->spatial_layer_id; svc->lst_fb_idx[sl] = cpi->lst_fb_idx; svc->gld_fb_idx[sl] = cpi->gld_fb_idx; @@ -743,12 +741,9 @@ void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) { svc->update_golden[sl] = (uint8_t)cpi->refresh_golden_frame; svc->update_altref[sl] = (uint8_t)cpi->refresh_alt_ref_frame; - svc->reference_last[sl] = - (uint8_t)(cpi->ref_frame_flags & flag_list[LAST_FRAME]); - svc->reference_golden[sl] = - (uint8_t)(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]); - svc->reference_altref[sl] = - (uint8_t)(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]); + svc->reference_last[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_LAST_FLAG); + svc->reference_golden[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_GOLD_FLAG); + svc->reference_altref[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_ALT_FLAG); } int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { @@ -1069,15 +1064,14 @@ void vp9_svc_constrain_inter_layer_pred(VP9_COMP *const cpi) { svc->disable_inter_layer_pred == INTER_LAYER_PRED_OFF || svc->drop_spatial_layer[sl - 1]) { MV_REFERENCE_FRAME ref_frame; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); - if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) { + if (yv12 != NULL && + (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) { const struct scale_factors *const scale_fac = &cm->frame_refs[ref_frame - 1].sf; if (vp9_is_scaled(scale_fac)) { - cpi->ref_frame_flags &= (~flag_list[ref_frame]); + cpi->ref_frame_flags &= (~ref_frame_to_flag(ref_frame)); // Point golden/altref frame buffer index to last. if (!svc->simulcast_mode) { if (ref_frame == GOLDEN_FRAME) From 6982214de5cc62f1f4dc733f1bcc3ffbd74780b0 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 11 Jan 2022 08:46:59 -0800 Subject: [PATCH 194/926] Revert "Add vp9 ref frame to flag map function" This reverts commit 44e611482e13fdffa0acde780a20dd68ee153498. 
Change-Id: Ic900cc01be4de7983fab42178a488277efab77b3 --- vp9/encoder/vp9_bitstream.c | 7 +++--- vp9/encoder/vp9_encoder.c | 4 ++- vp9/encoder/vp9_encoder.h | 15 ++++-------- vp9/encoder/vp9_pickmode.c | 39 ++++++++++++++++-------------- vp9/encoder/vp9_rdopt.c | 16 ++++++------ vp9/encoder/vp9_speed_features.c | 5 ++-- vp9/encoder/vp9_svc_layercontext.c | 18 +++++++++----- 7 files changed, 57 insertions(+), 47 deletions(-) diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index c5145acf08..7644930c1c 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -1241,7 +1241,9 @@ static void write_uncompressed_header(VP9_COMP *cpi, vpx_wb_write_literal(wb, vp9_get_refresh_mask(cpi), REF_FRAMES); write_frame_size(cm, wb); } else { - const int first_ref = get_first_ref_frame(cpi); + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + const MV_REFERENCE_FRAME first_ref = get_first_ref_frame(cpi); const int first_ref_map_idx = get_ref_frame_map_idx(cpi, first_ref); MV_REFERENCE_FRAME ref_frame; vpx_wb_write_literal(wb, vp9_get_refresh_mask(cpi), REF_FRAMES); @@ -1249,8 +1251,7 @@ static void write_uncompressed_header(VP9_COMP *cpi, // If a reference frame is not referenced, then set the index for that // reference to the first one used/referenced. for (ref_frame = LAST_FRAME; ref_frame < MAX_REF_FRAMES; ++ref_frame) { - const int referenced = - cpi->ref_frame_flags & ref_frame_to_flag(ref_frame); + const int referenced = cpi->ref_frame_flags & flag_list[ref_frame]; const int map_idx = referenced ? get_ref_frame_map_idx(cpi, ref_frame) : first_ref_map_idx; assert(map_idx != INVALID_IDX); diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 1038bd9515..8d5ec5a360 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -586,6 +586,8 @@ static void apply_roi_map(VP9_COMP *cpi) { int ref_frame[8]; int internal_delta_q[MAX_SEGMENTS]; int i; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; // TODO(jianj): Investigate why ROI not working in speed < 5 or in non // realtime mode. @@ -626,7 +628,7 @@ static void apply_roi_map(VP9_COMP *cpi) { valid_ref = 0; // If GOLDEN is selected, make sure it's set as reference. 
if (ref_frame[i] == GOLDEN_FRAME && - !(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame[i]))) { + !(cpi->ref_frame_flags & flag_list[ref_frame[i]])) { valid_ref = 0; } // GOLDEN was updated in previous encoded frame, so GOLDEN and LAST are diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 0059e881be..1bca7ded75 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1196,17 +1196,12 @@ static INLINE int frame_is_kf_gf_arf(const VP9_COMP *cpi) { (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref); } -static INLINE int ref_frame_to_flag(int8_t ref_frame) { - static const int kVp9RefFlagList[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; - assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME); - return kVp9RefFlagList[ref_frame]; -} - -static INLINE int get_first_ref_frame(VP9_COMP *const cpi) { - int ref_frame = LAST_FRAME; +static INLINE MV_REFERENCE_FRAME get_first_ref_frame(VP9_COMP *const cpi) { + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + MV_REFERENCE_FRAME ref_frame = LAST_FRAME; while (ref_frame < MAX_REF_FRAMES) { - if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) break; + if (cpi->ref_frame_flags & flag_list[ref_frame]) break; ref_frame++; } return ref_frame; diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index c8e167f25b..695fd484fc 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -1247,7 +1247,7 @@ static INLINE void find_predictors( VP9_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int const_motion[MAX_REF_FRAMES], int *ref_frame_skip_mask, - TileDataEnc *tile_data, int mi_row, int mi_col, + const int flag_list[4], TileDataEnc *tile_data, int mi_row, int mi_col, struct buf_2d yv12_mb[4][MAX_MB_PLANE], BLOCK_SIZE bsize, int force_skip_low_temp_var, int comp_pred_allowed) { VP9_COMMON *const cm = &cpi->common; @@ -1259,7 +1259,7 @@ static INLINE void find_predictors( frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; frame_mv[ZEROMV][ref_frame].as_int = 0; // this needs various further optimizations. to be continued.. - if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && (yv12 != NULL)) { + if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) { int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame]; const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); @@ -1690,6 +1690,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; uint8_t mode_checked[MB_MODE_COUNT][MAX_REF_FRAMES]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; RD_COST this_rdc, best_rdc; // var_y and sse_y are saved to be used in skipping checking unsigned int var_y = UINT_MAX; @@ -1923,14 +1925,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // constrain the inter mode to only test zero motion. 
if (cpi->use_svc && svc->force_zero_mode_spatial_ref && svc->spatial_layer_id > 0 && !gf_temporal_ref) { - if (cpi->ref_frame_flags & VP9_LAST_FLAG) { + if (cpi->ref_frame_flags & flag_list[LAST_FRAME]) { struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf; if (vp9_is_scaled(sf)) { svc_force_zero_mode[LAST_FRAME - 1] = 1; inter_layer_ref = LAST_FRAME; } } - if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { + if (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) { struct scale_factors *const sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf; if (vp9_is_scaled(sf)) { svc_force_zero_mode[GOLDEN_FRAME - 1] = 1; @@ -1955,7 +1957,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, cpi->rc.avg_frame_low_motion < 60)) usable_ref_frame = LAST_FRAME; - if (!((cpi->ref_frame_flags & VP9_GOLD_FLAG) && + if (!((cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && !svc_force_zero_mode[GOLDEN_FRAME - 1] && !force_skip_low_temp_var)) use_golden_nonzeromv = 0; @@ -1983,11 +1985,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // Skip find_predictor if the reference frame is not in the // ref_frame_flags (i.e., not used as a reference for this frame). skip_ref_find_pred[ref_frame] = - !(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)); + !(cpi->ref_frame_flags & flag_list[ref_frame]); if (!skip_ref_find_pred[ref_frame]) { find_predictors(cpi, x, ref_frame, frame_mv, const_motion, - &ref_frame_skip_mask, tile_data, mi_row, mi_col, yv12_mb, - bsize, force_skip_low_temp_var, comp_modes > 0); + &ref_frame_skip_mask, flag_list, tile_data, mi_row, + mi_col, yv12_mb, bsize, force_skip_low_temp_var, + comp_modes > 0); } } @@ -2011,7 +2014,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // than current layer: force check of GF-ZEROMV before early exit // due to skip flag. if (svc->spatial_layer_id > 0 && no_scaling && - (cpi->ref_frame_flags & VP9_GOLD_FLAG) && + (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && cm->base_qindex > svc->lower_layer_qindex + 10) force_test_gf_zeromv = 1; @@ -2091,8 +2094,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (comp_pred) { if (!cpi->allow_comp_inter_inter) continue; // Skip compound inter modes if ARF is not available. - if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame))) - continue; + if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; // Do not allow compound prediction if the segment level reference frame // feature is in use as in this case there can only be one reference. if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) continue; @@ -2105,7 +2107,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, (!cpi->use_svc && sse_zeromv_normalized < thresh_skip_golden))) continue; - if (!(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) continue; + if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; // For screen content. If zero_temp_sad source is computed: skip // non-zero motion check for stationary blocks. If the superblock is @@ -2188,7 +2190,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (usable_ref_frame < ALTREF_FRAME) { if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) { i = (ref_frame == LAST_FRAME) ? 
GOLDEN_FRAME : LAST_FRAME; - if ((cpi->ref_frame_flags & ref_frame_to_flag(i))) + if ((cpi->ref_frame_flags & flag_list[i])) if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1)) ref_frame_skip_mask |= (1 << ref_frame); } @@ -2197,9 +2199,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, ref_frame == ALTREF_FRAME)) { int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME; int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME; - if (((cpi->ref_frame_flags & ref_frame_to_flag(ref1)) && + if (((cpi->ref_frame_flags & flag_list[ref1]) && (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) || - ((cpi->ref_frame_flags & ref_frame_to_flag(ref2)) && + ((cpi->ref_frame_flags & flag_list[ref2]) && (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1)))) ref_frame_skip_mask |= (1 << ref_frame); } @@ -2486,7 +2488,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, perform_intra_pred = svc->temporal_layer_id == 0 || svc->layer_context[svc->temporal_layer_id].is_key_frame || - !(cpi->ref_frame_flags & VP9_GOLD_FLAG) || + !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) || (!svc->layer_context[svc->temporal_layer_id].is_key_frame && svc_force_zero_mode[best_pickmode.best_ref_frame - 1]); inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh; @@ -2745,6 +2747,8 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, MV_REFERENCE_FRAME best_ref_frame = NONE; unsigned char segment_id = mi->segment_id; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; int64_t best_rd = INT64_MAX; b_mode_info bsi[MAX_REF_FRAMES][4]; int ref_frame_skip_mask = 0; @@ -2760,8 +2764,7 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, int_mv dummy_mv[2]; x->pred_mv_sad[ref_frame] = INT_MAX; - if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && - (yv12 != NULL)) { + if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) { int_mv *const candidates = mbmi_ext->ref_mvs[ref_frame]; const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 0171a05720..a1687dcf46 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -3315,6 +3315,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } }; INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES]; int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES]; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; int64_t best_rd = best_rd_so_far; int64_t best_pred_diff[REFERENCE_MODES]; int64_t best_pred_rd[REFERENCE_MODES]; @@ -3390,7 +3392,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { x->pred_mv_sad[ref_frame] = INT_MAX; - if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && + if ((cpi->ref_frame_flags & flag_list[ref_frame]) && !(is_rect_partition && (ctx->skip_ref_frame_mask & (1 << ref_frame)))) { assert(get_ref_frame_buffer(cpi, ref_frame) != NULL); setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, @@ -3401,7 +3403,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, } for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; 
++ref_frame) { - if (!(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) { + if (!(cpi->ref_frame_flags & flag_list[ref_frame])) { // Skip checking missing references in both single and compound reference // modes. Note that a mode will be skipped if both reference frames // are masked out. @@ -3607,8 +3609,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, continue; // Skip compound inter modes if ARF is not available. - if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame))) - continue; + if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; // Do not allow compound prediction if the segment level reference frame // feature is in use as in this case there can only be one reference. @@ -4139,6 +4140,8 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, int comp_pred, i; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; int64_t best_rd = best_rd_so_far; int64_t best_yrd = best_rd_so_far; // FIXME(rbultje) more precise int64_t best_pred_diff[REFERENCE_MODES]; @@ -4188,7 +4191,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, rd_cost->rate = INT_MAX; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { - if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) { + if (cpi->ref_frame_flags & flag_list[ref_frame]) { setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); } else { @@ -4273,8 +4276,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, cm->ref_frame_sign_bias[second_ref_frame]) continue; - if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame))) - continue; + if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; // Do not allow compound prediction if the segment level reference frame // feature is in use as in this case there can only be one reference. 
if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue; diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 7d7b2c3fb4..81695e9156 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -495,10 +495,11 @@ static void set_rt_speed_feature_framesize_independent( (cpi->external_resize == 1 || cpi->oxcf.resize_mode == RESIZE_DYNAMIC)) { MV_REFERENCE_FRAME ref_frame; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); - if (yv12 != NULL && - (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) { + if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) { const struct scale_factors *const scale_fac = &cm->frame_refs[ref_frame - 1].sf; if (vp9_is_scaled(scale_fac)) sf->reference_masking = 0; diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 30c17fd8e6..f01cb17a2f 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -719,6 +719,8 @@ static void set_flags_and_fb_idx_bypass_via_set_ref_frame_config( void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) { SVC *const svc = &cpi->svc; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; int sl = svc->spatial_layer_id; svc->lst_fb_idx[sl] = cpi->lst_fb_idx; svc->gld_fb_idx[sl] = cpi->gld_fb_idx; @@ -741,9 +743,12 @@ void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) { svc->update_golden[sl] = (uint8_t)cpi->refresh_golden_frame; svc->update_altref[sl] = (uint8_t)cpi->refresh_alt_ref_frame; - svc->reference_last[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_LAST_FLAG); - svc->reference_golden[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_GOLD_FLAG); - svc->reference_altref[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_ALT_FLAG); + svc->reference_last[sl] = + (uint8_t)(cpi->ref_frame_flags & flag_list[LAST_FRAME]); + svc->reference_golden[sl] = + (uint8_t)(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]); + svc->reference_altref[sl] = + (uint8_t)(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]); } int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { @@ -1064,14 +1069,15 @@ void vp9_svc_constrain_inter_layer_pred(VP9_COMP *const cpi) { svc->disable_inter_layer_pred == INTER_LAYER_PRED_OFF || svc->drop_spatial_layer[sl - 1]) { MV_REFERENCE_FRAME ref_frame; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); - if (yv12 != NULL && - (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) { + if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) { const struct scale_factors *const scale_fac = &cm->frame_refs[ref_frame - 1].sf; if (vp9_is_scaled(scale_fac)) { - cpi->ref_frame_flags &= (~ref_frame_to_flag(ref_frame)); + cpi->ref_frame_flags &= (~flag_list[ref_frame]); // Point golden/altref frame buffer index to last. if (!svc->simulcast_mode) { if (ref_frame == GOLDEN_FRAME) From 51415c4076578d3cbc32fcd0d683161c3e887814 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 11 Jan 2022 08:47:52 -0800 Subject: [PATCH 195/926] Revert "Set unused reference frames to first ref" This reverts commit e7f33a53cf404bbb3688af9b13375b5c090daae4. 
Change-Id: I54e807220885cb78af6f3c6e48b3eb2c9f1e70b4 --- vp9/encoder/vp9_bitstream.c | 17 +++------ vp9/encoder/vp9_encoder.h | 22 ++++-------- vp9/encoder/vp9_svc_layercontext.c | 55 ++++++++++++++++++------------ vpx/vp8cx.h | 17 ++++----- 4 files changed, 50 insertions(+), 61 deletions(-) diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 7644930c1c..99cc2ee831 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -1241,21 +1241,12 @@ static void write_uncompressed_header(VP9_COMP *cpi, vpx_wb_write_literal(wb, vp9_get_refresh_mask(cpi), REF_FRAMES); write_frame_size(cm, wb); } else { - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; - const MV_REFERENCE_FRAME first_ref = get_first_ref_frame(cpi); - const int first_ref_map_idx = get_ref_frame_map_idx(cpi, first_ref); MV_REFERENCE_FRAME ref_frame; vpx_wb_write_literal(wb, vp9_get_refresh_mask(cpi), REF_FRAMES); - - // If a reference frame is not referenced, then set the index for that - // reference to the first one used/referenced. - for (ref_frame = LAST_FRAME; ref_frame < MAX_REF_FRAMES; ++ref_frame) { - const int referenced = cpi->ref_frame_flags & flag_list[ref_frame]; - const int map_idx = referenced ? get_ref_frame_map_idx(cpi, ref_frame) - : first_ref_map_idx; - assert(map_idx != INVALID_IDX); - vpx_wb_write_literal(wb, map_idx, REF_FRAMES_LOG2); + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX); + vpx_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame), + REF_FRAMES_LOG2); vpx_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]); } diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 1bca7ded75..9774a64ccf 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1196,24 +1196,14 @@ static INLINE int frame_is_kf_gf_arf(const VP9_COMP *cpi) { (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref); } -static INLINE MV_REFERENCE_FRAME get_first_ref_frame(VP9_COMP *const cpi) { - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; - MV_REFERENCE_FRAME ref_frame = LAST_FRAME; - while (ref_frame < MAX_REF_FRAMES) { - if (cpi->ref_frame_flags & flag_list[ref_frame]) break; - ref_frame++; - } - return ref_frame; -} - static INLINE int get_ref_frame_map_idx(const VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) { - switch (ref_frame) { - case LAST_FRAME: return cpi->lst_fb_idx; - case GOLDEN_FRAME: return cpi->gld_fb_idx; - case ALTREF_FRAME: return cpi->alt_fb_idx; - default: return INVALID_IDX; + if (ref_frame == LAST_FRAME) { + return cpi->lst_fb_idx; + } else if (ref_frame == GOLDEN_FRAME) { + return cpi->gld_fb_idx; + } else { + return cpi->alt_fb_idx; } } diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index f01cb17a2f..ad3a8f7afa 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -73,7 +73,7 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { svc->downsample_filter_type[sl] = BILINEAR; svc->downsample_filter_phase[sl] = 8; // Set to 8 for averaging filter. 
svc->framedrop_thresh[sl] = oxcf->drop_frames_water_mark; - svc->fb_idx_upd_tl0[sl] = INVALID_IDX; + svc->fb_idx_upd_tl0[sl] = -1; svc->drop_count[sl] = 0; svc->spatial_layer_sync[sl] = 0; svc->force_drop_constrained_from_above[sl] = 0; @@ -462,22 +462,33 @@ static void reset_fb_idx_unused(VP9_COMP *const cpi) { // fb_idx for that reference to the first one used/referenced. // This is to avoid setting fb_idx for a reference to a slot that is not // used/needed (i.e., since that reference is not referenced or refreshed). - const MV_REFERENCE_FRAME first_ref = get_first_ref_frame(cpi); - const int map_idx = get_ref_frame_map_idx(cpi, first_ref); - if (map_idx != INVALID_IDX) { - if (!(cpi->ref_frame_flags & VP9_LAST_FLAG || - cpi->ext_refresh_last_frame)) { - cpi->lst_fb_idx = map_idx; - } - if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG || - cpi->ext_refresh_golden_frame)) { - cpi->gld_fb_idx = map_idx; - } - if (!(cpi->ref_frame_flags & VP9_ALT_FLAG || - cpi->ext_refresh_alt_ref_frame)) { - cpi->alt_fb_idx = map_idx; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + MV_REFERENCE_FRAME ref_frame; + MV_REFERENCE_FRAME first_ref = 0; + int first_fb_idx = 0; + int fb_idx[3] = { cpi->lst_fb_idx, cpi->gld_fb_idx, cpi->alt_fb_idx }; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + if (cpi->ref_frame_flags & flag_list[ref_frame]) { + first_ref = ref_frame; + first_fb_idx = fb_idx[ref_frame - 1]; + break; } } + if (first_ref > 0) { + if (first_ref != LAST_FRAME && + !(cpi->ref_frame_flags & flag_list[LAST_FRAME]) && + !cpi->ext_refresh_last_frame) + cpi->lst_fb_idx = first_fb_idx; + else if (first_ref != GOLDEN_FRAME && + !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && + !cpi->ext_refresh_golden_frame) + cpi->gld_fb_idx = first_fb_idx; + else if (first_ref != ALTREF_FRAME && + !(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]) && + !cpi->ext_refresh_alt_ref_frame) + cpi->alt_fb_idx = first_fb_idx; + } } // Never refresh any reference frame buffers on top temporal layers in @@ -705,9 +716,9 @@ static void set_flags_and_fb_idx_bypass_via_set_ref_frame_config( int sl = svc->spatial_layer_id = svc->spatial_layer_to_encode; cpi->svc.temporal_layer_id = cpi->svc.temporal_layer_id_per_spatial[sl]; cpi->ext_refresh_frame_flags_pending = 1; - if (svc->reference_last[sl]) cpi->lst_fb_idx = svc->lst_fb_idx[sl]; - if (svc->reference_golden[sl]) cpi->gld_fb_idx = svc->gld_fb_idx[sl]; - if (svc->reference_altref[sl]) cpi->alt_fb_idx = svc->alt_fb_idx[sl]; + cpi->lst_fb_idx = svc->lst_fb_idx[sl]; + cpi->gld_fb_idx = svc->gld_fb_idx[sl]; + cpi->alt_fb_idx = svc->alt_fb_idx[sl]; cpi->ext_refresh_last_frame = 0; cpi->ext_refresh_golden_frame = 0; cpi->ext_refresh_alt_ref_frame = 0; @@ -864,9 +875,9 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { // flags are passed via the encode call (bypass mode). Issue is that we're // resetting ext_refresh_frame_flags_pending to 0 on frame drops. if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { - memset(&svc->lst_fb_idx, INVALID_IDX, sizeof(svc->lst_fb_idx)); - memset(&svc->gld_fb_idx, INVALID_IDX, sizeof(svc->lst_fb_idx)); - memset(&svc->alt_fb_idx, INVALID_IDX, sizeof(svc->lst_fb_idx)); + memset(&svc->lst_fb_idx, -1, sizeof(svc->lst_fb_idx)); + memset(&svc->gld_fb_idx, -1, sizeof(svc->lst_fb_idx)); + memset(&svc->alt_fb_idx, -1, sizeof(svc->lst_fb_idx)); // These are set by API before the superframe is encoded and they are // passed to encoder layer by layer. 
Don't reset them on layer 0 in bypass // mode. @@ -959,7 +970,7 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && svc->last_layer_dropped[svc->spatial_layer_id] && - svc->fb_idx_upd_tl0[svc->spatial_layer_id] != INVALID_IDX && + svc->fb_idx_upd_tl0[svc->spatial_layer_id] != -1 && !svc->layer_context[svc->temporal_layer_id].is_key_frame) { // For fixed/non-flexible mode, if the previous frame (same spatial layer // from previous superframe) was dropped, make sure the lst_fb_idx diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index b3c50a9b67..47c38d3b5e 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -897,16 +897,13 @@ typedef struct vpx_svc_ref_frame_config { int alt_fb_idx[VPX_SS_MAX_LAYERS]; /**< Altref buffer index. */ int update_buffer_slot[VPX_SS_MAX_LAYERS]; /**< Update reference frames. */ // TODO(jianj): Remove update_last/golden/alt_ref, these are deprecated. - int update_last[VPX_SS_MAX_LAYERS]; /**< Update last. */ - int update_golden[VPX_SS_MAX_LAYERS]; /**< Update golden. */ - int update_alt_ref[VPX_SS_MAX_LAYERS]; /**< Update altref. */ - int reference_last[VPX_SS_MAX_LAYERS]; - /**< Last as reference. Use first referenced index if FALSE. */ - int reference_golden[VPX_SS_MAX_LAYERS]; - /**< Golden as reference. Use first referenced index if FALSE. */ - int reference_alt_ref[VPX_SS_MAX_LAYERS]; - /**< Altref as reference. Use first referenced index if FALSE. */ - int64_t duration[VPX_SS_MAX_LAYERS]; /**< Duration per spatial layer. */ + int update_last[VPX_SS_MAX_LAYERS]; /**< Update last. */ + int update_golden[VPX_SS_MAX_LAYERS]; /**< Update golden. */ + int update_alt_ref[VPX_SS_MAX_LAYERS]; /**< Update altref. */ + int reference_last[VPX_SS_MAX_LAYERS]; /**< Last as reference. */ + int reference_golden[VPX_SS_MAX_LAYERS]; /**< Golden as reference. */ + int reference_alt_ref[VPX_SS_MAX_LAYERS]; /**< Altref as reference. */ + int64_t duration[VPX_SS_MAX_LAYERS]; /**< Duration per spatial layer. */ } vpx_svc_ref_frame_config_t; /*!\brief VP9 svc frame dropping mode. From 82014b6675ef9acf20cb2bb42c83f95d9e33906b Mon Sep 17 00:00:00 2001 From: Jianhui Dai Date: Sat, 1 Jan 2022 08:01:48 +0800 Subject: [PATCH 196/926] Reland "Add vp9 ref frame to flag map function" Original change's description: > Add vp9 ref frame to flag map function > > Change-Id: I371c2346b9e0153c0f8053cab399ce14cd286c56 Change-Id: I04a407ee0ef66c01a0d224b4468e043213f8791f --- vp9/encoder/vp9_encoder.c | 4 +-- vp9/encoder/vp9_encoder.h | 7 ++++++ vp9/encoder/vp9_pickmode.c | 39 ++++++++++++++---------------- vp9/encoder/vp9_rdopt.c | 16 ++++++------ vp9/encoder/vp9_speed_features.c | 5 ++-- vp9/encoder/vp9_svc_layercontext.c | 29 ++++++++-------------- 6 files changed, 45 insertions(+), 55 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 8d5ec5a360..1038bd9515 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -586,8 +586,6 @@ static void apply_roi_map(VP9_COMP *cpi) { int ref_frame[8]; int internal_delta_q[MAX_SEGMENTS]; int i; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; // TODO(jianj): Investigate why ROI not working in speed < 5 or in non // realtime mode. @@ -628,7 +626,7 @@ static void apply_roi_map(VP9_COMP *cpi) { valid_ref = 0; // If GOLDEN is selected, make sure it's set as reference. 
if (ref_frame[i] == GOLDEN_FRAME && - !(cpi->ref_frame_flags & flag_list[ref_frame[i]])) { + !(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame[i]))) { valid_ref = 0; } // GOLDEN was updated in previous encoded frame, so GOLDEN and LAST are diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 9774a64ccf..1d58945250 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1196,6 +1196,13 @@ static INLINE int frame_is_kf_gf_arf(const VP9_COMP *cpi) { (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref); } +static INLINE int ref_frame_to_flag(int8_t ref_frame) { + static const int kVp9RefFlagList[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME); + return kVp9RefFlagList[ref_frame]; +} + static INLINE int get_ref_frame_map_idx(const VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) { if (ref_frame == LAST_FRAME) { diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 695fd484fc..c8e167f25b 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -1247,7 +1247,7 @@ static INLINE void find_predictors( VP9_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int const_motion[MAX_REF_FRAMES], int *ref_frame_skip_mask, - const int flag_list[4], TileDataEnc *tile_data, int mi_row, int mi_col, + TileDataEnc *tile_data, int mi_row, int mi_col, struct buf_2d yv12_mb[4][MAX_MB_PLANE], BLOCK_SIZE bsize, int force_skip_low_temp_var, int comp_pred_allowed) { VP9_COMMON *const cm = &cpi->common; @@ -1259,7 +1259,7 @@ static INLINE void find_predictors( frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; frame_mv[ZEROMV][ref_frame].as_int = 0; // this needs various further optimizations. to be continued.. - if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) { + if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && (yv12 != NULL)) { int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame]; const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); @@ -1690,8 +1690,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; uint8_t mode_checked[MB_MODE_COUNT][MAX_REF_FRAMES]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; RD_COST this_rdc, best_rdc; // var_y and sse_y are saved to be used in skipping checking unsigned int var_y = UINT_MAX; @@ -1925,14 +1923,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // constrain the inter mode to only test zero motion. 
if (cpi->use_svc && svc->force_zero_mode_spatial_ref && svc->spatial_layer_id > 0 && !gf_temporal_ref) { - if (cpi->ref_frame_flags & flag_list[LAST_FRAME]) { + if (cpi->ref_frame_flags & VP9_LAST_FLAG) { struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf; if (vp9_is_scaled(sf)) { svc_force_zero_mode[LAST_FRAME - 1] = 1; inter_layer_ref = LAST_FRAME; } } - if (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) { + if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { struct scale_factors *const sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf; if (vp9_is_scaled(sf)) { svc_force_zero_mode[GOLDEN_FRAME - 1] = 1; @@ -1957,7 +1955,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, cpi->rc.avg_frame_low_motion < 60)) usable_ref_frame = LAST_FRAME; - if (!((cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && + if (!((cpi->ref_frame_flags & VP9_GOLD_FLAG) && !svc_force_zero_mode[GOLDEN_FRAME - 1] && !force_skip_low_temp_var)) use_golden_nonzeromv = 0; @@ -1985,12 +1983,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // Skip find_predictor if the reference frame is not in the // ref_frame_flags (i.e., not used as a reference for this frame). skip_ref_find_pred[ref_frame] = - !(cpi->ref_frame_flags & flag_list[ref_frame]); + !(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)); if (!skip_ref_find_pred[ref_frame]) { find_predictors(cpi, x, ref_frame, frame_mv, const_motion, - &ref_frame_skip_mask, flag_list, tile_data, mi_row, - mi_col, yv12_mb, bsize, force_skip_low_temp_var, - comp_modes > 0); + &ref_frame_skip_mask, tile_data, mi_row, mi_col, yv12_mb, + bsize, force_skip_low_temp_var, comp_modes > 0); } } @@ -2014,7 +2011,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // than current layer: force check of GF-ZEROMV before early exit // due to skip flag. if (svc->spatial_layer_id > 0 && no_scaling && - (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && + (cpi->ref_frame_flags & VP9_GOLD_FLAG) && cm->base_qindex > svc->lower_layer_qindex + 10) force_test_gf_zeromv = 1; @@ -2094,7 +2091,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (comp_pred) { if (!cpi->allow_comp_inter_inter) continue; // Skip compound inter modes if ARF is not available. - if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; + if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame))) + continue; // Do not allow compound prediction if the segment level reference frame // feature is in use as in this case there can only be one reference. if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) continue; @@ -2107,7 +2105,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, (!cpi->use_svc && sse_zeromv_normalized < thresh_skip_golden))) continue; - if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; + if (!(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) continue; // For screen content. If zero_temp_sad source is computed: skip // non-zero motion check for stationary blocks. If the superblock is @@ -2190,7 +2188,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (usable_ref_frame < ALTREF_FRAME) { if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) { i = (ref_frame == LAST_FRAME) ? 
GOLDEN_FRAME : LAST_FRAME; - if ((cpi->ref_frame_flags & flag_list[i])) + if ((cpi->ref_frame_flags & ref_frame_to_flag(i))) if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1)) ref_frame_skip_mask |= (1 << ref_frame); } @@ -2199,9 +2197,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, ref_frame == ALTREF_FRAME)) { int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME; int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME; - if (((cpi->ref_frame_flags & flag_list[ref1]) && + if (((cpi->ref_frame_flags & ref_frame_to_flag(ref1)) && (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) || - ((cpi->ref_frame_flags & flag_list[ref2]) && + ((cpi->ref_frame_flags & ref_frame_to_flag(ref2)) && (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1)))) ref_frame_skip_mask |= (1 << ref_frame); } @@ -2488,7 +2486,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, perform_intra_pred = svc->temporal_layer_id == 0 || svc->layer_context[svc->temporal_layer_id].is_key_frame || - !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) || + !(cpi->ref_frame_flags & VP9_GOLD_FLAG) || (!svc->layer_context[svc->temporal_layer_id].is_key_frame && svc_force_zero_mode[best_pickmode.best_ref_frame - 1]); inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh; @@ -2747,8 +2745,6 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, MV_REFERENCE_FRAME best_ref_frame = NONE; unsigned char segment_id = mi->segment_id; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; int64_t best_rd = INT64_MAX; b_mode_info bsi[MAX_REF_FRAMES][4]; int ref_frame_skip_mask = 0; @@ -2764,7 +2760,8 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, int_mv dummy_mv[2]; x->pred_mv_sad[ref_frame] = INT_MAX; - if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) { + if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && + (yv12 != NULL)) { int_mv *const candidates = mbmi_ext->ref_mvs[ref_frame]; const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index a1687dcf46..0171a05720 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -3315,8 +3315,6 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } }; INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES]; int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES]; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; int64_t best_rd = best_rd_so_far; int64_t best_pred_diff[REFERENCE_MODES]; int64_t best_pred_rd[REFERENCE_MODES]; @@ -3392,7 +3390,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { x->pred_mv_sad[ref_frame] = INT_MAX; - if ((cpi->ref_frame_flags & flag_list[ref_frame]) && + if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && !(is_rect_partition && (ctx->skip_ref_frame_mask & (1 << ref_frame)))) { assert(get_ref_frame_buffer(cpi, ref_frame) != NULL); setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, @@ -3403,7 +3401,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, } for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; 
++ref_frame) { - if (!(cpi->ref_frame_flags & flag_list[ref_frame])) { + if (!(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) { // Skip checking missing references in both single and compound reference // modes. Note that a mode will be skipped if both reference frames // are masked out. @@ -3609,7 +3607,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, continue; // Skip compound inter modes if ARF is not available. - if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; + if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame))) + continue; // Do not allow compound prediction if the segment level reference frame // feature is in use as in this case there can only be one reference. @@ -4140,8 +4139,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, int comp_pred, i; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; int64_t best_rd = best_rd_so_far; int64_t best_yrd = best_rd_so_far; // FIXME(rbultje) more precise int64_t best_pred_diff[REFERENCE_MODES]; @@ -4191,7 +4188,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, rd_cost->rate = INT_MAX; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { - if (cpi->ref_frame_flags & flag_list[ref_frame]) { + if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) { setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); } else { @@ -4276,7 +4273,8 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, cm->ref_frame_sign_bias[second_ref_frame]) continue; - if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; + if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame))) + continue; // Do not allow compound prediction if the segment level reference frame // feature is in use as in this case there can only be one reference. if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue; diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 81695e9156..7d7b2c3fb4 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -495,11 +495,10 @@ static void set_rt_speed_feature_framesize_independent( (cpi->external_resize == 1 || cpi->oxcf.resize_mode == RESIZE_DYNAMIC)) { MV_REFERENCE_FRAME ref_frame; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); - if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) { + if (yv12 != NULL && + (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) { const struct scale_factors *const scale_fac = &cm->frame_refs[ref_frame - 1].sf; if (vp9_is_scaled(scale_fac)) sf->reference_masking = 0; diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index ad3a8f7afa..6655643654 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -462,30 +462,27 @@ static void reset_fb_idx_unused(VP9_COMP *const cpi) { // fb_idx for that reference to the first one used/referenced. // This is to avoid setting fb_idx for a reference to a slot that is not // used/needed (i.e., since that reference is not referenced or refreshed). 
- static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; MV_REFERENCE_FRAME ref_frame; MV_REFERENCE_FRAME first_ref = 0; int first_fb_idx = 0; int fb_idx[3] = { cpi->lst_fb_idx, cpi->gld_fb_idx, cpi->alt_fb_idx }; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { - if (cpi->ref_frame_flags & flag_list[ref_frame]) { + if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) { first_ref = ref_frame; first_fb_idx = fb_idx[ref_frame - 1]; break; } } if (first_ref > 0) { - if (first_ref != LAST_FRAME && - !(cpi->ref_frame_flags & flag_list[LAST_FRAME]) && + if (first_ref != LAST_FRAME && !(cpi->ref_frame_flags & VP9_LAST_FLAG) && !cpi->ext_refresh_last_frame) cpi->lst_fb_idx = first_fb_idx; else if (first_ref != GOLDEN_FRAME && - !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && + !(cpi->ref_frame_flags & VP9_GOLD_FLAG) && !cpi->ext_refresh_golden_frame) cpi->gld_fb_idx = first_fb_idx; else if (first_ref != ALTREF_FRAME && - !(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]) && + !(cpi->ref_frame_flags & VP9_ALT_FLAG) && !cpi->ext_refresh_alt_ref_frame) cpi->alt_fb_idx = first_fb_idx; } @@ -730,8 +727,6 @@ static void set_flags_and_fb_idx_bypass_via_set_ref_frame_config( void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) { SVC *const svc = &cpi->svc; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; int sl = svc->spatial_layer_id; svc->lst_fb_idx[sl] = cpi->lst_fb_idx; svc->gld_fb_idx[sl] = cpi->gld_fb_idx; @@ -754,12 +749,9 @@ void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) { svc->update_golden[sl] = (uint8_t)cpi->refresh_golden_frame; svc->update_altref[sl] = (uint8_t)cpi->refresh_alt_ref_frame; - svc->reference_last[sl] = - (uint8_t)(cpi->ref_frame_flags & flag_list[LAST_FRAME]); - svc->reference_golden[sl] = - (uint8_t)(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]); - svc->reference_altref[sl] = - (uint8_t)(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]); + svc->reference_last[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_LAST_FLAG); + svc->reference_golden[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_GOLD_FLAG); + svc->reference_altref[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_ALT_FLAG); } int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { @@ -1080,15 +1072,14 @@ void vp9_svc_constrain_inter_layer_pred(VP9_COMP *const cpi) { svc->disable_inter_layer_pred == INTER_LAYER_PRED_OFF || svc->drop_spatial_layer[sl - 1]) { MV_REFERENCE_FRAME ref_frame; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); - if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) { + if (yv12 != NULL && + (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) { const struct scale_factors *const scale_fac = &cm->frame_refs[ref_frame - 1].sf; if (vp9_is_scaled(scale_fac)) { - cpi->ref_frame_flags &= (~flag_list[ref_frame]); + cpi->ref_frame_flags &= (~ref_frame_to_flag(ref_frame)); // Point golden/altref frame buffer index to last. 
          if (!svc->simulcast_mode) {
            if (ref_frame == GOLDEN_FRAME)

From 395732f679e3f7842f5b2094e3a91de036f85708 Mon Sep 17 00:00:00 2001
From: James Zern
Date: Tue, 25 Jan 2022 20:06:59 -0800
Subject: [PATCH 197/926] libwebm: update to libwebm-1.0.0.28-28-gee0bab5

https://chromium.googlesource.com/webm/libwebm/+log/206d268d4d8066e5a37c49025325b80c95c771dd..ee0bab576c338c9807249b99588e352b7268cb62

only one commit affects this snapshot:
ee0bab5 Revert "mkvmuxer,Cluster::Size: make uint64 conversion explicit"

Change-Id: Ib1f21fc5589098af346d110ff88c94bb1ba0a027
---
 third_party/libwebm/README.libvpx        | 2 +-
 third_party/libwebm/mkvmuxer/mkvmuxer.cc | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/third_party/libwebm/README.libvpx b/third_party/libwebm/README.libvpx
index 5cc0a83701..325604cc66 100644
--- a/third_party/libwebm/README.libvpx
+++ b/third_party/libwebm/README.libvpx
@@ -1,5 +1,5 @@
 URL: https://chromium.googlesource.com/webm/libwebm
-Version: 206d268d4d8066e5a37c49025325b80c95c771dd
+Version: ee0bab576c338c9807249b99588e352b7268cb62
 License: BSD
 License File: LICENSE.txt

diff --git a/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/third_party/libwebm/mkvmuxer/mkvmuxer.cc
index 24c288863f..ae36531439 100644
--- a/third_party/libwebm/mkvmuxer/mkvmuxer.cc
+++ b/third_party/libwebm/mkvmuxer/mkvmuxer.cc
@@ -2622,8 +2622,7 @@ bool Cluster::Finalize(bool set_last_frame_duration, uint64_t duration) {

 uint64_t Cluster::Size() const {
   const uint64_t element_size =
-      EbmlMasterElementSize(static_cast<uint64>(libwebm::kMkvCluster),
-                            uint64_t{0xFFFFFFFFFFFFFFFFU}) +
+      EbmlMasterElementSize(libwebm::kMkvCluster, 0xFFFFFFFFFFFFFFFFULL) +
       payload_size_;
   return element_size;
 }

From ae5d16173d2af45dcb80d43635f53129a045b946 Mon Sep 17 00:00:00 2001
From: James Zern
Date: Wed, 26 Jan 2022 15:05:22 -0800
Subject: [PATCH 198/926] fix some include guards

Change-Id: I0233d352c134bdda3ca160d41b4671d1c45ab01c
---
 vpx/internal/vpx_ratectrl_rtc.h | 6 +++---
 vpx_ports/mips.h                | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/vpx/internal/vpx_ratectrl_rtc.h b/vpx/internal/vpx_ratectrl_rtc.h
index 0474e0a85b..65398c654d 100644
--- a/vpx/internal/vpx_ratectrl_rtc.h
+++ b/vpx/internal/vpx_ratectrl_rtc.h
@@ -8,8 +8,8 @@
  * be found in the AUTHORS file in the root of the source tree.
  */

-#ifndef VPX_VPX_RATECTRL_RTC_H_
-#define VPX_VPX_RATECTRL_RTC_H_
+#ifndef VPX_VPX_INTERNAL_VPX_RATECTRL_RTC_H_
+#define VPX_VPX_INTERNAL_VPX_RATECTRL_RTC_H_

 #include "vpx/vpx_encoder.h"

@@ -59,4 +59,4 @@ struct VpxRateControlRtcConfig {
   int aq_mode;
 };
 }  // namespace libvpx
-#endif
+#endif  // VPX_VPX_INTERNAL_VPX_RATECTRL_RTC_H_

diff --git a/vpx_ports/mips.h b/vpx_ports/mips.h
index bdc7525f7b..439de754fd 100644
--- a/vpx_ports/mips.h
+++ b/vpx_ports/mips.h
@@ -8,8 +8,8 @@
  * be found in the AUTHORS file in the root of the source tree.
 */

-#ifndef VPX_PORTS_MIPS_H_
-#define VPX_PORTS_MIPS_H_
+#ifndef VPX_VPX_PORTS_MIPS_H_
+#define VPX_VPX_PORTS_MIPS_H_

 #ifdef __cplusplus
 extern "C" {
@@ -24,4 +24,4 @@ int mips_cpu_caps(void);
 }  // extern "C"
 #endif

-#endif  // VPX_PORTS_MIPS_H_
+#endif  // VPX_VPX_PORTS_MIPS_H_

From 935350958690c086c16ec645e4da53d241cd36bf Mon Sep 17 00:00:00 2001
From: James Zern
Date: Wed, 26 Jan 2022 19:41:41 -0800
Subject: [PATCH 199/926] vp8dx.h: add missing define for VP9_SET_BYTE_ALIGNMENT

Change-Id: I4e643c837bb010bd58f4fc8179045f8df18f8ae1
---
 vpx/vp8dx.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h
index af92f21ae3..08c3451445 100644
--- a/vpx/vp8dx.h
+++ b/vpx/vp8dx.h
@@ -189,6 +189,8 @@ VPX_CTRL_USE_TYPE(VP9D_GET_BIT_DEPTH, unsigned int *)
 #define VPX_CTRL_VP9D_GET_BIT_DEPTH
 VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE, int *)
 #define VPX_CTRL_VP9D_GET_FRAME_SIZE
+VPX_CTRL_USE_TYPE(VP9_SET_BYTE_ALIGNMENT, int)
+#define VPX_CTRL_VP9_SET_BYTE_ALIGNMENT
 VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int)
 #define VPX_CTRL_VP9_INVERT_TILE_DECODE_ORDER
 #define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER

From 531c60e2a2c0a98a0754502f3e6c28f3d5002c4d Mon Sep 17 00:00:00 2001
From: James Zern
Date: Wed, 26 Jan 2022 19:44:33 -0800
Subject: [PATCH 200/926] vp8dx.h,cosmetics: normalize #define/type order

Change-Id: I2db20130cc366bead5e576b375479917f9aee024
---
 vpx/vp8dx.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h
index 08c3451445..dcf7a62860 100644
--- a/vpx/vp8dx.h
+++ b/vpx/vp8dx.h
@@ -193,14 +193,14 @@ VPX_CTRL_USE_TYPE(VP9_SET_BYTE_ALIGNMENT, int)
 #define VPX_CTRL_VP9_SET_BYTE_ALIGNMENT
 VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int)
 #define VPX_CTRL_VP9_INVERT_TILE_DECODE_ORDER
-#define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER
 VPX_CTRL_USE_TYPE(VP9_DECODE_SVC_SPATIAL_LAYER, int)
-#define VPX_CTRL_VP9_SET_SKIP_LOOP_FILTER
+#define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER
 VPX_CTRL_USE_TYPE(VP9_SET_SKIP_LOOP_FILTER, int)
-#define VPX_CTRL_VP9_DECODE_SET_ROW_MT
+#define VPX_CTRL_VP9_SET_SKIP_LOOP_FILTER
 VPX_CTRL_USE_TYPE(VP9D_SET_ROW_MT, int)
-#define VPX_CTRL_VP9_SET_LOOP_FILTER_OPT
+#define VPX_CTRL_VP9_DECODE_SET_ROW_MT
 VPX_CTRL_USE_TYPE(VP9D_SET_LOOP_FILTER_OPT, int)
+#define VPX_CTRL_VP9_SET_LOOP_FILTER_OPT

 /*!\endcond */
 /*! @} - end defgroup vp8_decoder */

From 8a0af65f34bdf43fc63b4ce4ac9393aceab0abbf Mon Sep 17 00:00:00 2001
From: Jerome Jiang
Date: Tue, 28 Sep 2021 16:59:21 -0700
Subject: [PATCH 201/926] Use background segmentation mask with ROI RTC sample encoder

vpx_temporal_svc_encoder can take mask files as input when ROI_MAP is
set to 1.

Uses VP9 ROI and segmentation to skip background encoding when
source_sad is low and the corresponding block in the previous frame is
also skipped.
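For reference, the mask files are plain text, one per frame (the sample
encoder opens "<prefix>NNNNN.txt" per the snprintf below): a header line
giving the block-grid width and height, then rows of 0/1 values, one per
8x8 block. A hypothetical mask for a 32x16 clip (4x2 blocks) could be:

  4 2
  0 0 1 1
  0 0 1 1

read_mask() inverts each bit, so a 0 in the file marks a background
block; it only becomes a skip (segment 3) candidate once it has also
stayed background across the 10-frame stability window tracked in
prev_mask_map.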
Change-Id: I8590e6f9a88cecfa1d7f375d4cc480f0f2af87b6 --- examples/vpx_temporal_svc_encoder.c | 99 +++++++++++++++++-- test/test-data.mk | 1 + test/test-data.sha1 | 1 + test/test.mk | 1 + test/vp9_roi_test.cc | 143 ++++++++++++++++++++++++++++ vp9/common/vp9_seg_common.h | 5 + vp9/encoder/vp9_aq_cyclicrefresh.c | 4 +- vp9/encoder/vp9_encodeframe.c | 42 ++++++-- vp9/encoder/vp9_encoder.c | 19 +++- 9 files changed, 292 insertions(+), 23 deletions(-) create mode 100644 test/vp9_roi_test.cc diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c index ad3e79c713..e528179f3f 100644 --- a/examples/vpx_temporal_svc_encoder.c +++ b/examples/vpx_temporal_svc_encoder.c @@ -240,6 +240,38 @@ static void set_roi_map(const char *enc_name, vpx_codec_enc_cfg_t *cfg, } } } + +static void set_roi_skip_map(vpx_codec_enc_cfg_t *cfg, vpx_roi_map_t *roi, + int *skip_map, int *prev_mask_map, int frame_num) { + const int block_size = 8; + unsigned int i, j; + roi->rows = (cfg->g_h + block_size - 1) / block_size; + roi->cols = (cfg->g_w + block_size - 1) / block_size; + zero(roi->skip); + zero(roi->delta_q); + zero(roi->delta_lf); + memset(roi->ref_frame, -1, sizeof(roi->ref_frame)); + roi->ref_frame[1] = 1; + // Use segment 3 for skip. + roi->skip[3] = 1; + roi->roi_map = + (uint8_t *)calloc(roi->rows * roi->cols, sizeof(*roi->roi_map)); + for (i = 0; i < roi->rows; ++i) { + for (j = 0; j < roi->cols; ++j) { + const int idx = i * roi->cols + j; + // Use segment 3 for skip. + // prev_mask_map keeps track of blocks that have been stably on segment 3 + // for the past 10 frames. Only skip when the block is on segment 3 in + // both current map and prev_mask_map. + if (skip_map[idx] == 1 && prev_mask_map[idx] == 1) roi->roi_map[idx] = 3; + // Reset it every 10 frames so it doesn't propagate for too many frames. 
+ if (frame_num % 10 == 0) + prev_mask_map[idx] = skip_map[idx]; + else if (prev_mask_map[idx] == 1 && skip_map[idx] == 0) + prev_mask_map[idx] = 0; + } + } +} #endif // Temporal scaling parameters: @@ -574,6 +606,23 @@ static void set_temporal_layer_pattern(int layering_mode, } } +#if ROI_MAP +static void read_mask(FILE *mask_file, int *seg_map) { + int mask_rows, mask_cols, i, j; + int *map_start = seg_map; + fscanf(mask_file, "%d %d\n", &mask_cols, &mask_rows); + for (i = 0; i < mask_rows; i++) { + for (j = 0; j < mask_cols; j++) { + fscanf(mask_file, "%d ", &seg_map[j]); + // reverse the bit + seg_map[j] = 1 - seg_map[j]; + } + seg_map += mask_cols; + } + seg_map = map_start; +} +#endif + int main(int argc, char **argv) { VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS] = { NULL }; vpx_codec_ctx_t codec; @@ -613,7 +662,14 @@ int main(int argc, char **argv) { double sum_bitrate = 0.0; double sum_bitrate2 = 0.0; double framerate = 30.0; - +#if ROI_MAP + FILE *mask_file = NULL; + int block_size = 8; + int mask_rows = 0; + int mask_cols = 0; + int *mask_map; + int *prev_mask_map; +#endif zero(rc.layer_target_bitrate); memset(&layer_id, 0, sizeof(vpx_svc_layer_id_t)); memset(&input_ctx, 0, sizeof(input_ctx)); @@ -657,9 +713,15 @@ int main(int argc, char **argv) { die("Invalid layering mode (0..12) %s", argv[12]); } +#if ROI_MAP + if (argc != min_args + mode_to_num_layers[layering_mode] + 1) { + die("Invalid number of arguments"); + } +#else if (argc != min_args + mode_to_num_layers[layering_mode]) { die("Invalid number of arguments"); } +#endif input_ctx.filename = argv[1]; open_input_file(&input_ctx); @@ -817,6 +879,13 @@ int main(int argc, char **argv) { #endif // CONFIG_VP9_HIGHBITDEPTH die("Failed to initialize encoder"); +#if ROI_MAP + mask_rows = (cfg.g_h + block_size - 1) / block_size; + mask_cols = (cfg.g_w + block_size - 1) / block_size; + mask_map = (int *)calloc(mask_rows * mask_cols, sizeof(*mask_map)); + prev_mask_map = (int *)calloc(mask_rows * mask_cols, sizeof(*mask_map)); +#endif + if (strncmp(encoder->name, "vp8", 3) == 0) { vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed); vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kVp8DenoiserOff); @@ -827,7 +896,6 @@ int main(int argc, char **argv) { if (vpx_codec_control(&codec, VP8E_SET_ROI_MAP, &roi)) die_codec(&codec, "Failed to set ROI map"); #endif - } else if (strncmp(encoder->name, "vp9", 3) == 0) { vpx_svc_extra_cfg_t svc_params; memset(&svc_params, 0, sizeof(svc_params)); @@ -843,12 +911,7 @@ int main(int argc, char **argv) { vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0); vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, get_msb(cfg.g_threads)); vpx_codec_control(&codec, VP9E_SET_DISABLE_LOOPFILTER, 0); -#if ROI_MAP - set_roi_map(encoder->name, &cfg, &roi); - if (vpx_codec_control(&codec, VP9E_SET_ROI_MAP, &roi)) - die_codec(&codec, "Failed to set ROI map"); - vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 0); -#endif + if (cfg.g_threads > 1) vpx_codec_control(&codec, VP9E_SET_ROW_MT, 1); else @@ -881,6 +944,9 @@ int main(int argc, char **argv) { struct vpx_usec_timer timer; vpx_codec_iter_t iter = NULL; const vpx_codec_cx_pkt_t *pkt; +#if ROI_MAP + char mask_file_name[255]; +#endif // Update the temporal layer_id. No spatial layers in this test. 
layer_id.spatial_layer_id = 0; layer_id.temporal_layer_id = @@ -894,6 +960,19 @@ int main(int argc, char **argv) { } flags = layer_flags[frame_cnt % flag_periodicity]; if (layering_mode == 0) flags = 0; +#if ROI_MAP + snprintf(mask_file_name, sizeof(mask_file_name), "%s%05d.txt", + argv[argc - 1], frame_cnt); + mask_file = fopen(mask_file_name, "r"); + if (mask_file != NULL) { + read_mask(mask_file, mask_map); + fclose(mask_file); + // set_roi_map(encoder->name, &cfg, &roi); + set_roi_skip_map(&cfg, &roi, mask_map, prev_mask_map, frame_cnt); + if (vpx_codec_control(&codec, VP9E_SET_ROI_MAP, &roi)) + die_codec(&codec, "Failed to set ROI map"); + } +#endif frame_avail = read_frame(&input_ctx, &raw); if (frame_avail) ++rc.layer_input_frames[layer_id.temporal_layer_id]; vpx_usec_timer_start(&timer); @@ -963,6 +1042,10 @@ int main(int argc, char **argv) { ++frame_cnt; pts += frame_duration; } +#if ROI_MAP + free(mask_map); + free(prev_mask_map); +#endif close_input_file(&input_ctx); printout_rate_control_summary(&rc, &cfg, frame_cnt); printf("\n"); diff --git a/test/test-data.mk b/test/test-data.mk index 46fe359898..62a9d6ef14 100644 --- a/test/test-data.mk +++ b/test/test-data.mk @@ -6,6 +6,7 @@ LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288_nv12.yuv LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_odd.yuv LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += desktop_office1.1280_720-020.yuv LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += slides_code_term_web_plot.1920_1080.yuv +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += desktopqvga.320_240.yuv LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_420_20f.y4m LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_422_20f.y4m diff --git a/test/test-data.sha1 b/test/test-data.sha1 index 668992fba2..55f92a25df 100644 --- a/test/test-data.sha1 +++ b/test/test-data.sha1 @@ -869,3 +869,4 @@ bac455906360b45338a16dd626ac5f19bc36a307 *desktop_office1.1280_720-020.yuv 518a0be998afece76d3df76047d51e256c591ff2 *invalid-bug-148271109.ivf d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-bug-148271109.ivf.res ad18ca16f0a249fb3b7c38de0d9b327fed273f96 *hantro_collage_w352h288_nv12.yuv +8a0b2c350539859463d3546a67876c83ff6ff0ac *desktopqvga.320_240.yuv diff --git a/test/test.mk b/test/test.mk index 41dfd5d835..6df4572904 100644 --- a/test/test.mk +++ b/test/test.mk @@ -156,6 +156,7 @@ LIBVPX_TEST_SRCS-yes += superframe_test.cc LIBVPX_TEST_SRCS-yes += tile_independence_test.cc LIBVPX_TEST_SRCS-yes += vp9_boolcoder_test.cc LIBVPX_TEST_SRCS-yes += vp9_encoder_parms_get_to_decoder.cc +LIBVPX_TEST_SRCS-yes += vp9_roi_test.cc endif LIBVPX_TEST_SRCS-yes += convolve_test.cc diff --git a/test/vp9_roi_test.cc b/test/vp9_roi_test.cc new file mode 100644 index 0000000000..52dfd9e029 --- /dev/null +++ b/test/vp9_roi_test.cc @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "memory" + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "test/yuv_video_source.h" + +#define MASK_WIDTH 40 +#define MASK_HEIGHT 30 +#define MASK_SIZE MASK_WIDTH *MASK_HEIGHT + +namespace { + +const int mask[MASK_SIZE] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, + 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0 +}; + +class RoiMaskBackgroundSkip : public ::libvpx_test::EncoderTest, + public ::testing::Test { + protected: + RoiMaskBackgroundSkip() : EncoderTest(&::libvpx_test::kVP9) {} + virtual ~RoiMaskBackgroundSkip() { free(roi_.roi_map); } + + virtual void SetUp() { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + SetRoi(); + } + + void SetRoi() { + const int block_size = 8; + unsigned int i, j; + roi_.rows = (cfg_.g_h + block_size - 1) / block_size; + roi_.cols = (cfg_.g_w + block_size - 1) / block_size; + memset(&roi_.skip, 0, sizeof(roi_.skip)); + memset(&roi_.delta_q, 0, sizeof(roi_.delta_q)); + memset(&roi_.delta_lf, 0, sizeof(roi_.delta_lf)); + memset(roi_.ref_frame, -1, sizeof(roi_.ref_frame)); + roi_.ref_frame[1] = 1; + // Use segment 3 for skip. + roi_.skip[3] = 1; + roi_.roi_map = + (uint8_t *)calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map)); + for (i = 0; i < roi_.rows; ++i) { + for (j = 0; j < roi_.cols; ++j) { + const int idx = i * roi_.cols + j; + if (mask[idx] == 1) roi_.roi_map[idx] = 3; + } + } + } + + virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, 7); + encoder->Control(VP9E_SET_AQ_MODE, 3); + } + encoder->Control(VP9E_SET_ROI_MAP, &roi_); + } + + private: + vpx_roi_map_t roi_; +}; + +TEST_F(RoiMaskBackgroundSkip, RoiMaskNoMismatch) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_undershoot_pct = 20; + cfg_.rc_undershoot_pct = 20; + cfg_.rc_dropframe_thresh = 10; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 50; + cfg_.rc_end_usage = VPX_CBR; + cfg_.rc_target_bitrate = 200; + cfg_.g_lag_in_frames = 0; + cfg_.kf_max_dist = 9999; + + ::libvpx_test::I420VideoSource video("desktopqvga.320_240.yuv", 320, 240, 30, + 1, 0, 150); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} +} // namespace diff --git a/vp9/common/vp9_seg_common.h b/vp9/common/vp9_seg_common.h index b63e4f4999..5e71c2fca5 100644 --- a/vp9/common/vp9_seg_common.h +++ b/vp9/common/vp9_seg_common.h @@ -25,6 +25,11 @@ extern "C" { #define PREDICTION_PROBS 3 +// Segment ID used to skip background encoding +#define BACKGROUND_SEG_SKIP_ID 3 +// Number of frames that don't skip after a key frame +#define FRAMES_NO_SKIPPING_AFTER_KEY 20 + // Segment level features. typedef enum { SEG_LVL_ALT_Q = 0, // Use alternate Quantizer .... 
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c index f06fe47268..e336179e90 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -497,7 +497,9 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { rc->avg_frame_low_motion < thresh_low_motion && rc->frames_since_key > 40) || (!cpi->use_svc && rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh && - rc->frames_since_key > 20)) { + rc->frames_since_key > 20) || + (cpi->roi.enabled && cpi->roi.skip[BACKGROUND_SEG_SKIP_ID] && + rc->frames_since_key > FRAMES_NO_SKIPPING_AFTER_KEY)) { cr->apply_cyclic_refresh = 0; return; } diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 131c4887f2..fc4089865d 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -5513,16 +5513,6 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, x->arf_frame_usage = 0; x->lastgolden_frame_usage = 0; - if (seg->enabled) { - const uint8_t *const map = - seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - int segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col); - seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP); - if (seg_skip) { - partition_search_type = FIXED_PARTITION; - } - } - if (cpi->compute_source_sad_onepass && cpi->sf.use_source_sad) { int shift = cpi->Source->y_stride * (mi_row << 3) + (mi_col << 3); int sb_offset2 = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3); @@ -5534,6 +5524,38 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, partition_search_type = REFERENCE_PARTITION; } + if (seg->enabled) { + const uint8_t *const map = + seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; + int segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col); + seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP); + + if (cpi->roi.enabled && cpi->roi.skip[BACKGROUND_SEG_SKIP_ID] && + cpi->rc.frames_since_key > FRAMES_NO_SKIPPING_AFTER_KEY && + x->content_state_sb > kLowSadLowSumdiff) { + // For ROI with skip, force segment = 0 (no skip) over whole + // superblock to avoid artifacts if temporal change in source_sad is + // not 0. 
+ int xi, yi; + const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64]; + const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64]; + const int xmis = VPXMIN(cm->mi_cols - mi_col, bw); + const int ymis = VPXMIN(cm->mi_rows - mi_row, bh); + const int block_index = mi_row * cm->mi_cols + mi_col; + set_mode_info_offsets(cm, x, xd, mi_row, mi_col); + for (yi = 0; yi < ymis; yi++) + for (xi = 0; xi < xmis; xi++) { + int map_offset = block_index + yi * cm->mi_cols + xi; + cpi->segmentation_map[map_offset] = 0; + } + set_segment_index(cpi, x, mi_row, mi_col, BLOCK_64X64, 0); + seg_skip = 0; + } + if (seg_skip) { + partition_search_type = FIXED_PARTITION; + } + } + // Set the partition type of the 64X64 block switch (partition_search_type) { case VAR_BASED_PARTITION: diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 7e80835f6c..ac0efced7f 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -618,7 +618,7 @@ static void apply_roi_map(VP9_COMP *cpi) { } if (skip[i] != 0) { vp9_enable_segfeature(seg, i, SEG_LVL_SKIP); - vp9_set_segdata(seg, i, SEG_LVL_SKIP, skip[i]); + vp9_set_segdata(seg, i, SEG_LVL_SKIP, 0); } if (ref_frame[i] >= 0) { int valid_ref = 1; @@ -4137,11 +4137,22 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi); } else { #endif - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + // If ROI is enabled and skip feature is used for segmentation, apply cyclic + // refresh but not apply ROI for skip for the first 20 frames (defined by + // FRAMES_NO_SKIPPING_AFTER_KEY) after key frame to improve quality. + if (cpi->roi.enabled && !frame_is_intra_only(cm)) { + if (cpi->roi.skip[BACKGROUND_SEG_SKIP_ID]) { + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + vp9_cyclic_refresh_setup(cpi); + if (cpi->rc.frames_since_key > FRAMES_NO_SKIPPING_AFTER_KEY) + apply_roi_map(cpi); + } else { + apply_roi_map(cpi); + } + } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { vp9_cyclic_refresh_setup(cpi); - } else if (cpi->roi.enabled && !frame_is_intra_only(cm)) { - apply_roi_map(cpi); } + #if !CONFIG_REALTIME_ONLY } #endif From 479758aeb15c82c8faf6d7a905999c7512284c64 Mon Sep 17 00:00:00 2001 From: Jin Bo Date: Tue, 6 Jul 2021 17:18:48 +0800 Subject: [PATCH 202/926] libvpx[loongarch]: Add loongarch support. LSX and LASX are enabled by default if compiler supports them. 
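Callers query the new caps interface at run time, as with the existing
MIPS and PPC ports. A minimal dispatch sketch (the function name and the
kernel-selection comments are illustrative, not part of this patch):

  #include "vpx_ports/loongarch.h"

  void pick_kernels(void) {
    const int caps = loongarch_cpu_caps();
    if (caps & HAS_LASX) {
      /* prefer the 256-bit LASX code paths */
    } else if (caps & HAS_LSX) {
      /* use the 128-bit LSX code paths */
    } else {
      /* fall back to the plain C implementations */
    }
  }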
Bug: webm:1754
Change-Id: Ic36b113bc4313c50e9d2bbab91199b3aa46d00dc
---
 build/make/Makefile                  |  6 ++++
 build/make/configure.sh              | 47 ++++++++++++++++++++++++++++
 build/make/rtcd.pl                   | 47 ++++++++++++++++++++++++++++
 configure                            |  5 +++
 vp8/common/generic/systemdependent.c |  4 +++
 vpx_ports/loongarch.h                | 29 +++++++++++++++++
 vpx_ports/loongarch_cpudetect.c      | 40 +++++++++++++++++++++++
 vpx_ports/vpx_ports.mk               |  3 ++
 8 files changed, 181 insertions(+)
 create mode 100644 vpx_ports/loongarch.h
 create mode 100644 vpx_ports/loongarch_cpudetect.c

diff --git a/build/make/Makefile b/build/make/Makefile
index 9ca97c8c64..b7a873cc81 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -151,6 +151,12 @@ $(BUILD_PFX)%_vsx.c.o: CFLAGS += -maltivec -mvsx
 $(BUILD_PFX)%_msa.c.d: CFLAGS += -mmsa
 $(BUILD_PFX)%_msa.c.o: CFLAGS += -mmsa

+# LOONGARCH
+$(BUILD_PFX)%_lsx.c.d: CFLAGS += -mlsx
+$(BUILD_PFX)%_lsx.c.o: CFLAGS += -mlsx
+$(BUILD_PFX)%_lasx.c.d: CFLAGS += -mlasx
+$(BUILD_PFX)%_lasx.c.o: CFLAGS += -mlasx
+
 $(BUILD_PFX)%.c.d: %.c
 	$(if $(quiet),@echo "    [DEP] $@")
 	$(qexec)mkdir -p $(dir $@)

diff --git a/build/make/configure.sh b/build/make/configure.sh
index b24e79a0d2..581042e38e 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -449,6 +449,17 @@ EOF
   fi
 }

+check_inline_asm() {
+  log check_inline_asm "$@"
+  name="$1"
+  code="$2"
+  shift 2
+  disable_feature $name
+  check_cc "$@" <<EOF && enable_feature $name
+void foo(void) { __asm__ volatile($code); }
+EOF
+}
+
 write_common_config_banner() {
   print_webm_license config.mk "##" ""
   echo '# This file automatically generated by configure. Do not edit!' >> config.mk
@@ -766,6 +777,12 @@ process_common_toolchain() {
       *mips32el*)
         tgt_isa=mips32
         ;;
+      loongarch32*)
+        tgt_isa=loongarch32
+        ;;
+      loongarch64*)
+        tgt_isa=loongarch64
+        ;;
     esac

     # detect tgt_os
@@ -834,6 +851,11 @@ process_common_toolchain() {
     ppc*)
       enable_feature ppc
       ;;
+    loongarch*)
+      soft_enable lsx
+      soft_enable lasx
+      enable_feature loongarch
+      ;;
   esac

   # PIC is probably what we want when building shared libs
@@ -1419,6 +1441,15 @@ EOF
         ;;
     esac
     ;;
+  loongarch*)
+    link_with_cc=gcc
+    setup_gnu_toolchain
+
+    enabled lsx && check_inline_asm lsx '"vadd.b $vr0, $vr1, $vr1"'
+    enabled lsx && soft_enable runtime_cpu_detect
+    enabled lasx && check_inline_asm lasx '"xvadd.b $xr0, $xr1, $xr1"'
+    enabled lasx && soft_enable runtime_cpu_detect
+    ;;
   *-gcc|generic-gnu)
    link_with_cc=gcc
    enable_feature gcc
@@ -1521,6 +1552,22 @@ EOF
       ;;
   esac

+  # only for LOONGARCH platforms
+  case ${toolchain} in
+    loongarch*)
+      if enabled big_endian; then
+        if enabled lsx; then
+          echo "lsx optimizations are available only for little endian platforms"
+          disable_feature lsx
+        fi
+        if enabled lasx; then
+          echo "lasx optimizations are available only for little endian platforms"
+          disable_feature lasx
+        fi
+      fi
+      ;;
+  esac
+
   # glibc needs these
   if enabled linux; then
     add_cflags -D_LARGEFILE_SOURCE

diff --git a/build/make/rtcd.pl b/build/make/rtcd.pl
index acb9f6e466..8ed776add8 100755
--- a/build/make/rtcd.pl
+++ b/build/make/rtcd.pl
@@ -387,6 +387,37 @@ ()
   common_bottom;
 }

+sub loongarch() {
+  determine_indirection("c", @ALL_ARCHS);
+
+  # Assign the helper variable for each enabled extension
+  foreach my $opt (@ALL_ARCHS) {
+    my $opt_uc = uc $opt;
+    eval "\$have_${opt}=\"flags & HAS_${opt_uc}\"";
+  }
+
+  common_top;
+
+  print <<EOF;
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/loongarch.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = loongarch_cpu_caps();
+
+    (void)flags;
+
+$indirections
+}
+#endif
+EOF
+
+  common_bottom;
+}
+
 sub unoptimized() {
@@ -461,4 +492,20 @@
   ppc;
+} elsif ($opts{arch} =~ /loongarch/ ) {
+  @ALL_ARCHS = filter(qw/lsx lasx/);
+  open CONFIG_FILE, $opts{config} or
+    die "Error opening config file '$opts{config}': $!\n";
+  while (<CONFIG_FILE>) {
+    if (/HAVE_LSX=yes/) {
+      @ALL_ARCHS = filter("$opts{arch}", qw/lsx/);
+      last;
+    }
+    if (/HAVE_LASX=yes/) {
+      @ALL_ARCHS = filter("$opts{arch}", qw/lasx/);
+      last;
+    }
+  }
+  close CONFIG_FILE;
+  loongarch;
 } else {
   unoptimized;
 }

diff --git a/configure b/configure
index b68f9fd781..434ebbe366 100755
--- a/configure
+++ b/configure
@@ -114,6 +114,8 @@ all_platforms="${all_platforms} armv7-win32-vs14"
all_platforms="${all_platforms} armv7-win32-vs15" all_platforms="${all_platforms} armv7s-darwin-gcc" all_platforms="${all_platforms} armv8-linux-gcc" +all_platforms="${all_platforms} loongarch32-linux-gcc" +all_platforms="${all_platforms} loongarch64-linux-gcc" all_platforms="${all_platforms} mips32-linux-gcc" all_platforms="${all_platforms} mips64-linux-gcc" all_platforms="${all_platforms} ppc64le-linux-gcc" @@ -237,6 +239,7 @@ ARCH_LIST=" x86 x86_64 ppc + loongarch " ARCH_EXT_LIST_X86=" mmx @@ -252,6 +255,8 @@ ARCH_EXT_LIST_X86=" ARCH_EXT_LIST_LOONGSON=" mmi + lsx + lasx " ARCH_EXT_LIST=" diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c index cd1b02c9cc..71529bdfd8 100644 --- a/vp8/common/generic/systemdependent.c +++ b/vp8/common/generic/systemdependent.c @@ -18,6 +18,8 @@ #include "vpx_ports/ppc.h" #elif VPX_ARCH_MIPS #include "vpx_ports/mips.h" +#elif VPX_ARCH_LOONGARCH +#include "vpx_ports/loongarch.h" #endif #include "vp8/common/onyxc_int.h" #include "vp8/common/systemdependent.h" @@ -100,6 +102,8 @@ void vp8_machine_specific_config(VP8_COMMON *ctx) { ctx->cpu_caps = ppc_simd_caps(); #elif VPX_ARCH_MIPS ctx->cpu_caps = mips_cpu_caps(); +#elif VPX_ARCH_LOONGARCH + ctx->cpu_caps = loongarch_cpu_caps(); #else // generic-gnu targets. ctx->cpu_caps = 0; diff --git a/vpx_ports/loongarch.h b/vpx_ports/loongarch.h new file mode 100644 index 0000000000..d93ff9f5f0 --- /dev/null +++ b/vpx_ports/loongarch.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2021 Loongson Technology Corporation Limited + * Contributed by Jin Bo + * Contributed by Lu Wang + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_PORTS_LOONGARCH_H_ +#define VPX_VPX_PORTS_LOONGARCH_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#define HAS_LSX 0x01 +#define HAS_LASX 0x02 + +int loongarch_cpu_caps(void); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_PORTS_LOONGARCH_H_ diff --git a/vpx_ports/loongarch_cpudetect.c b/vpx_ports/loongarch_cpudetect.c new file mode 100644 index 0000000000..7b4322d35e --- /dev/null +++ b/vpx_ports/loongarch_cpudetect.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2021 Loongson Technology Corporation Limited + * Contributed by Jin Bo + * Contributed by Lu Wang + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_config.h" +#include "vpx_ports/loongarch.h" + +#define LOONGARCH_CFG2 0x02 +#define LOONGARCH_CFG2_LSX (1 << 6) +#define LOONGARCH_CFG2_LASX (1 << 7) + +#if CONFIG_RUNTIME_CPU_DETECT +#if defined(__loongarch__) && defined(__linux__) +int loongarch_cpu_caps(void) { + int reg = 0; + int flag = 0; + + __asm__ volatile("cpucfg %0, %1 \n\t" : "+&r"(reg) : "r"(LOONGARCH_CFG2)); + if (reg & LOONGARCH_CFG2_LSX) flag |= HAS_LSX; + + if (reg & LOONGARCH_CFG2_LASX) flag |= HAS_LASX; + + return flag; +} +#else /* end __loongarch__ && __linux__ */ +#error \ + "--enable-runtime-cpu-detect selected, but no CPU detection method " \ +"available for your platform. Reconfigure with --disable-runtime-cpu-detect." +#endif +#else /* end CONFIG_RUNTIME_CPU_DETECT */ +int loongarch_cpu_caps(void) { return 0; } +#endif diff --git a/vpx_ports/vpx_ports.mk b/vpx_ports/vpx_ports.mk index e5001be496..e30e87cefb 100644 --- a/vpx_ports/vpx_ports.mk +++ b/vpx_ports/vpx_ports.mk @@ -45,6 +45,9 @@ PORTS_SRCS-$(VPX_ARCH_PPC) += ppc.h PORTS_SRCS-$(VPX_ARCH_MIPS) += mips_cpudetect.c PORTS_SRCS-$(VPX_ARCH_MIPS) += mips.h +PORTS_SRCS-$(VPX_ARCH_LOONGARCH) += loongarch_cpudetect.c +PORTS_SRCS-$(VPX_ARCH_LOONGARCH) += loongarch.h + ifeq ($(VPX_ARCH_MIPS), yes) PORTS_SRCS-yes += asmdefs_mmi.h endif From 0494625b7b386d6634c19b47d39e1608c3a5bcec Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 28 Jan 2022 11:47:08 -0800 Subject: [PATCH 203/926] vpx/vp8[cd]x.h,cosmetics: normalize ctrls to enum order Change-Id: I49bbd956b3a64008d1abe54de87d7831bc3eede6 --- vpx/vp8cx.h | 121 ++++++++++++++++++---------------------------------- vpx/vp8dx.h | 12 +++--- 2 files changed, 47 insertions(+), 86 deletions(-) diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 47c38d3b5e..6b02aa8657 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -952,28 +952,12 @@ typedef struct vpx_svc_spatial_layer_sync { * */ -VPX_CTRL_USE_TYPE(VP8E_SET_FRAME_FLAGS, int) -#define VPX_CTRL_VP8E_SET_FRAME_FLAGS -VPX_CTRL_USE_TYPE(VP8E_SET_TEMPORAL_LAYER_ID, int) -#define VPX_CTRL_VP8E_SET_TEMPORAL_LAYER_ID VPX_CTRL_USE_TYPE(VP8E_SET_ROI_MAP, vpx_roi_map_t *) #define VPX_CTRL_VP8E_SET_ROI_MAP -VPX_CTRL_USE_TYPE(VP9E_SET_ROI_MAP, vpx_roi_map_t *) -#define VPX_CTRL_VP9E_SET_ROI_MAP VPX_CTRL_USE_TYPE(VP8E_SET_ACTIVEMAP, vpx_active_map_t *) #define VPX_CTRL_VP8E_SET_ACTIVEMAP VPX_CTRL_USE_TYPE(VP8E_SET_SCALEMODE, vpx_scaling_mode_t *) #define VPX_CTRL_VP8E_SET_SCALEMODE - -VPX_CTRL_USE_TYPE(VP9E_SET_SVC, int) -#define VPX_CTRL_VP9E_SET_SVC -VPX_CTRL_USE_TYPE(VP9E_SET_SVC_PARAMETERS, void *) -#define VPX_CTRL_VP9E_SET_SVC_PARAMETERS -VPX_CTRL_USE_TYPE(VP9E_REGISTER_CX_CALLBACK, void *) -#define VPX_CTRL_VP9E_REGISTER_CX_CALLBACK -VPX_CTRL_USE_TYPE(VP9E_SET_SVC_LAYER_ID, vpx_svc_layer_id_t *) -#define VPX_CTRL_VP9E_SET_SVC_LAYER_ID - VPX_CTRL_USE_TYPE(VP8E_SET_CPUUSED, int) #define VPX_CTRL_VP8E_SET_CPUUSED VPX_CTRL_USE_TYPE(VP8E_SET_ENABLEAUTOALTREF, unsigned int) @@ -986,7 +970,10 @@ VPX_CTRL_USE_TYPE(VP8E_SET_STATIC_THRESHOLD, unsigned int) #define VPX_CTRL_VP8E_SET_STATIC_THRESHOLD VPX_CTRL_USE_TYPE(VP8E_SET_TOKEN_PARTITIONS, int) /* vp8e_token_partitions */ #define VPX_CTRL_VP8E_SET_TOKEN_PARTITIONS - +VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER, int *) +#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER +VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64, int *) +#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER_64 VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_MAXFRAMES, unsigned int) #define VPX_CTRL_VP8E_SET_ARNR_MAXFRAMES VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_STRENGTH, unsigned int) @@ -997,133 
+984,107 @@ VPX_CTRL_USE_TYPE(VP8E_SET_TUNING, int) /* vp8e_tuning */ #define VPX_CTRL_VP8E_SET_TUNING VPX_CTRL_USE_TYPE(VP8E_SET_CQ_LEVEL, unsigned int) #define VPX_CTRL_VP8E_SET_CQ_LEVEL - -VPX_CTRL_USE_TYPE(VP9E_SET_TILE_COLUMNS, int) -#define VPX_CTRL_VP9E_SET_TILE_COLUMNS -VPX_CTRL_USE_TYPE(VP9E_SET_TILE_ROWS, int) -#define VPX_CTRL_VP9E_SET_TILE_ROWS - -VPX_CTRL_USE_TYPE(VP9E_SET_TPL, int) -#define VPX_CTRL_VP9E_SET_TPL - -VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER, int *) -#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER -VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64, int *) -#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER_64 -VPX_CTRL_USE_TYPE(VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, int *) -#define VPX_CTRL_VP9E_GET_LAST_QUANTIZER_SVC_LAYERS - -VPX_CTRL_USE_TYPE(VP9E_GET_SVC_LAYER_ID, vpx_svc_layer_id_t *) -#define VPX_CTRL_VP9E_GET_SVC_LAYER_ID - VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTRA_BITRATE_PCT, unsigned int) #define VPX_CTRL_VP8E_SET_MAX_INTRA_BITRATE_PCT +VPX_CTRL_USE_TYPE(VP8E_SET_FRAME_FLAGS, int) +#define VPX_CTRL_VP8E_SET_FRAME_FLAGS VPX_CTRL_USE_TYPE(VP9E_SET_MAX_INTER_BITRATE_PCT, unsigned int) #define VPX_CTRL_VP9E_SET_MAX_INTER_BITRATE_PCT - -VPX_CTRL_USE_TYPE(VP8E_SET_GF_CBR_BOOST_PCT, unsigned int) -#define VPX_CTRL_VP8E_SET_GF_CBR_BOOST_PCT - -VPX_CTRL_USE_TYPE(VP8E_SET_SCREEN_CONTENT_MODE, unsigned int) -#define VPX_CTRL_VP8E_SET_SCREEN_CONTENT_MODE - VPX_CTRL_USE_TYPE(VP9E_SET_GF_CBR_BOOST_PCT, unsigned int) #define VPX_CTRL_VP9E_SET_GF_CBR_BOOST_PCT - +VPX_CTRL_USE_TYPE(VP8E_SET_TEMPORAL_LAYER_ID, int) +#define VPX_CTRL_VP8E_SET_TEMPORAL_LAYER_ID +VPX_CTRL_USE_TYPE(VP8E_SET_SCREEN_CONTENT_MODE, unsigned int) +#define VPX_CTRL_VP8E_SET_SCREEN_CONTENT_MODE VPX_CTRL_USE_TYPE(VP9E_SET_LOSSLESS, unsigned int) #define VPX_CTRL_VP9E_SET_LOSSLESS - +VPX_CTRL_USE_TYPE(VP9E_SET_TILE_COLUMNS, int) +#define VPX_CTRL_VP9E_SET_TILE_COLUMNS +VPX_CTRL_USE_TYPE(VP9E_SET_TILE_ROWS, int) +#define VPX_CTRL_VP9E_SET_TILE_ROWS VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PARALLEL_DECODING, unsigned int) #define VPX_CTRL_VP9E_SET_FRAME_PARALLEL_DECODING - VPX_CTRL_USE_TYPE(VP9E_SET_AQ_MODE, unsigned int) #define VPX_CTRL_VP9E_SET_AQ_MODE - -VPX_CTRL_USE_TYPE(VP9E_SET_ALT_REF_AQ, int) -#define VPX_CTRL_VP9E_SET_ALT_REF_AQ - VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PERIODIC_BOOST, unsigned int) #define VPX_CTRL_VP9E_SET_FRAME_PERIODIC_BOOST - VPX_CTRL_USE_TYPE(VP9E_SET_NOISE_SENSITIVITY, unsigned int) #define VPX_CTRL_VP9E_SET_NOISE_SENSITIVITY - +VPX_CTRL_USE_TYPE(VP9E_SET_SVC, int) +#define VPX_CTRL_VP9E_SET_SVC +VPX_CTRL_USE_TYPE(VP9E_SET_ROI_MAP, vpx_roi_map_t *) +#define VPX_CTRL_VP9E_SET_ROI_MAP +VPX_CTRL_USE_TYPE(VP9E_SET_SVC_PARAMETERS, void *) +#define VPX_CTRL_VP9E_SET_SVC_PARAMETERS +VPX_CTRL_USE_TYPE(VP9E_SET_SVC_LAYER_ID, vpx_svc_layer_id_t *) +#define VPX_CTRL_VP9E_SET_SVC_LAYER_ID VPX_CTRL_USE_TYPE(VP9E_SET_TUNE_CONTENT, int) /* vp9e_tune_content */ #define VPX_CTRL_VP9E_SET_TUNE_CONTENT - +VPX_CTRL_USE_TYPE(VP9E_GET_SVC_LAYER_ID, vpx_svc_layer_id_t *) +#define VPX_CTRL_VP9E_GET_SVC_LAYER_ID +VPX_CTRL_USE_TYPE(VP9E_REGISTER_CX_CALLBACK, void *) +#define VPX_CTRL_VP9E_REGISTER_CX_CALLBACK VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_SPACE, int) #define VPX_CTRL_VP9E_SET_COLOR_SPACE - +VPX_CTRL_USE_TYPE(VP9E_SET_TEMPORAL_LAYERING_MODE, + int) /* VP9E_TEMPORAL_LAYERING_MODE */ +#define VPX_CTRL_VP9E_SET_TEMPORAL_LAYERING_MODE VPX_CTRL_USE_TYPE(VP9E_SET_MIN_GF_INTERVAL, unsigned int) #define VPX_CTRL_VP9E_SET_MIN_GF_INTERVAL - VPX_CTRL_USE_TYPE(VP9E_SET_MAX_GF_INTERVAL, unsigned int) #define VPX_CTRL_VP9E_SET_MAX_GF_INTERVAL - 
VPX_CTRL_USE_TYPE(VP9E_GET_ACTIVEMAP, vpx_active_map_t *) #define VPX_CTRL_VP9E_GET_ACTIVEMAP - VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_RANGE, int) #define VPX_CTRL_VP9E_SET_COLOR_RANGE - VPX_CTRL_USE_TYPE(VP9E_SET_SVC_REF_FRAME_CONFIG, vpx_svc_ref_frame_config_t *) #define VPX_CTRL_VP9E_SET_SVC_REF_FRAME_CONFIG - VPX_CTRL_USE_TYPE(VP9E_SET_RENDER_SIZE, int *) #define VPX_CTRL_VP9E_SET_RENDER_SIZE - VPX_CTRL_USE_TYPE(VP9E_SET_TARGET_LEVEL, unsigned int) #define VPX_CTRL_VP9E_SET_TARGET_LEVEL - VPX_CTRL_USE_TYPE(VP9E_SET_ROW_MT, unsigned int) #define VPX_CTRL_VP9E_SET_ROW_MT - VPX_CTRL_USE_TYPE(VP9E_GET_LEVEL, int *) #define VPX_CTRL_VP9E_GET_LEVEL - -VPX_CTRL_USE_TYPE(VP9E_GET_LOOPFILTER_LEVEL, int *) -#define VPX_CTRL_VP9E_GET_LOOPFILTER_LEVEL - +VPX_CTRL_USE_TYPE(VP9E_SET_ALT_REF_AQ, int) +#define VPX_CTRL_VP9E_SET_ALT_REF_AQ +VPX_CTRL_USE_TYPE(VP8E_SET_GF_CBR_BOOST_PCT, unsigned int) +#define VPX_CTRL_VP8E_SET_GF_CBR_BOOST_PCT VPX_CTRL_USE_TYPE(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, unsigned int) #define VPX_CTRL_VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST - VPX_CTRL_USE_TYPE(VP9E_SET_SVC_INTER_LAYER_PRED, unsigned int) #define VPX_CTRL_VP9E_SET_SVC_INTER_LAYER_PRED - VPX_CTRL_USE_TYPE(VP9E_SET_SVC_FRAME_DROP_LAYER, vpx_svc_frame_drop_t *) #define VPX_CTRL_VP9E_SET_SVC_FRAME_DROP_LAYER - VPX_CTRL_USE_TYPE(VP9E_GET_SVC_REF_FRAME_CONFIG, vpx_svc_ref_frame_config_t *) #define VPX_CTRL_VP9E_GET_SVC_REF_FRAME_CONFIG - VPX_CTRL_USE_TYPE(VP9E_SET_SVC_GF_TEMPORAL_REF, unsigned int) #define VPX_CTRL_VP9E_SET_SVC_GF_TEMPORAL_REF - VPX_CTRL_USE_TYPE(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, vpx_svc_spatial_layer_sync_t *) #define VPX_CTRL_VP9E_SET_SVC_SPATIAL_LAYER_SYNC - +VPX_CTRL_USE_TYPE(VP9E_SET_TPL, int) +#define VPX_CTRL_VP9E_SET_TPL VPX_CTRL_USE_TYPE(VP9E_SET_POSTENCODE_DROP, unsigned int) #define VPX_CTRL_VP9E_SET_POSTENCODE_DROP - VPX_CTRL_USE_TYPE(VP9E_SET_DELTA_Q_UV, int) #define VPX_CTRL_VP9E_SET_DELTA_Q_UV - VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, int) #define VPX_CTRL_VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR - VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_LOOPFILTER, int) #define VPX_CTRL_VP9E_SET_DISABLE_LOOPFILTER - +VPX_CTRL_USE_TYPE(VP9E_SET_EXTERNAL_RATE_CONTROL, vpx_rc_funcs_t *) +#define VPX_CTRL_VP9E_SET_EXTERNAL_RATE_CONTROL VPX_CTRL_USE_TYPE(VP9E_SET_RTC_EXTERNAL_RATECTRL, int) #define VPX_CTRL_VP9E_SET_RTC_EXTERNAL_RATECTRL - +VPX_CTRL_USE_TYPE(VP9E_GET_LOOPFILTER_LEVEL, int *) +#define VPX_CTRL_VP9E_GET_LOOPFILTER_LEVEL +VPX_CTRL_USE_TYPE(VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, int *) +#define VPX_CTRL_VP9E_GET_LAST_QUANTIZER_SVC_LAYERS VPX_CTRL_USE_TYPE(VP8E_SET_RTC_EXTERNAL_RATECTRL, int) #define VPX_CTRL_VP8E_SET_RTC_EXTERNAL_RATECTRL -VPX_CTRL_USE_TYPE(VP9E_SET_EXTERNAL_RATE_CONTROL, vpx_rc_funcs_t *) -#define VPX_CTRL_VP9E_SET_EXTERNAL_RATE_CONTROL - /*!\endcond */ /*! 
@} - end defgroup vp8_encoder */ #ifdef __cplusplus diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h index dcf7a62860..506a8936be 100644 --- a/vpx/vp8dx.h +++ b/vpx/vp8dx.h @@ -177,26 +177,26 @@ VPX_CTRL_USE_TYPE(VP8D_GET_FRAME_CORRUPTED, int *) #define VPX_CTRL_VP8D_GET_FRAME_CORRUPTED VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_USED, int *) #define VPX_CTRL_VP8D_GET_LAST_REF_USED -VPX_CTRL_USE_TYPE(VPXD_GET_LAST_QUANTIZER, int *) -#define VPX_CTRL_VPXD_GET_LAST_QUANTIZER VPX_CTRL_USE_TYPE(VPXD_SET_DECRYPTOR, vpx_decrypt_init *) #define VPX_CTRL_VPXD_SET_DECRYPTOR VPX_CTRL_USE_TYPE(VP8D_SET_DECRYPTOR, vpx_decrypt_init *) #define VPX_CTRL_VP8D_SET_DECRYPTOR +VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE, int *) +#define VPX_CTRL_VP9D_GET_FRAME_SIZE VPX_CTRL_USE_TYPE(VP9D_GET_DISPLAY_SIZE, int *) #define VPX_CTRL_VP9D_GET_DISPLAY_SIZE VPX_CTRL_USE_TYPE(VP9D_GET_BIT_DEPTH, unsigned int *) #define VPX_CTRL_VP9D_GET_BIT_DEPTH -VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE, int *) -#define VPX_CTRL_VP9D_GET_FRAME_SIZE VPX_CTRL_USE_TYPE(VP9_SET_BYTE_ALIGNMENT, int) #define VPX_CTRL_VP9_SET_BYTE_ALIGNMENT VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int) #define VPX_CTRL_VP9_INVERT_TILE_DECODE_ORDER -VPX_CTRL_USE_TYPE(VP9_DECODE_SVC_SPATIAL_LAYER, int) -#define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER VPX_CTRL_USE_TYPE(VP9_SET_SKIP_LOOP_FILTER, int) #define VPX_CTRL_VP9_SET_SKIP_LOOP_FILTER +VPX_CTRL_USE_TYPE(VP9_DECODE_SVC_SPATIAL_LAYER, int) +#define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER +VPX_CTRL_USE_TYPE(VPXD_GET_LAST_QUANTIZER, int *) +#define VPX_CTRL_VPXD_GET_LAST_QUANTIZER VPX_CTRL_USE_TYPE(VP9D_SET_ROW_MT, int) #define VPX_CTRL_VP9_DECODE_SET_ROW_MT VPX_CTRL_USE_TYPE(VP9D_SET_LOOP_FILTER_OPT, int) From fc2a31cfb901e8988dc382f094b843bf9fcd4433 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 1 Feb 2022 11:57:05 -0800 Subject: [PATCH 204/926] vp9_thread_test: parameterize VP9DecodeMultiThreadedTest on a per-file basis; this will make sharding more effective Change-Id: Ib797681a7cc3bd7ec835bb0c1c7a8d9f23512a0d --- test/vp9_thread_test.cc | 158 +++++++++++++++++----------------------- 1 file changed, 68 insertions(+), 90 deletions(-) diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc index 5cac9ea0ee..1ceef8185c 100644 --- a/test/vp9_thread_test.cc +++ b/test/vp9_thread_test.cc @@ -148,11 +148,6 @@ TEST(VPxWorkerThreadTest, TestInterfaceAPI) { // ----------------------------------------------------------------------------- // Multi-threaded decode tests #if CONFIG_WEBM_IO -struct FileList { - const char *name; - const char *expected_md5; -}; - // Decodes |filename| with |num_threads|. Returns the md5 of the decoded frames. string DecodeFile(const string &filename, int num_threads) { libvpx_test::WebMVideoSource video(filename); @@ -182,16 +177,6 @@ string DecodeFile(const string &filename, int num_threads) { return string(md5.Get()); } -void DecodeFiles(const FileList files[]) { - for (const FileList *iter = files; iter->name != nullptr; ++iter) { - SCOPED_TRACE(iter->name); - for (int t = 1; t <= 8; ++t) { - EXPECT_EQ(iter->expected_md5, DecodeFile(iter->name, t)) - << "threads = " << t; - } - } -} - // Trivial serialized thread worker interface implementation. // Note any worker that requires synchronization between other workers will // hang. @@ -228,88 +213,81 @@ TEST(VPxWorkerThreadTest, TestSerialInterface) { EXPECT_EQ(expected_md5, DecodeFile(filename, 2)); } -TEST(VP9DecodeMultiThreadedTest, NoTilesNonFrameParallel) { - // no tiles or frame parallel; this exercises loop filter threading. 
- EXPECT_EQ("b35a1b707b28e82be025d960aba039bc", - DecodeFile("vp90-2-03-size-226x226.webm", 2)); -} +struct FileParam { + const char *name; + const char *expected_md5; + friend std::ostream &operator<<(std::ostream &os, const FileParam ¶m) { + return os << "file name: " << param.name + << " digest: " << param.expected_md5; + } +}; -TEST(VP9DecodeMultiThreadedTest, FrameParallel) { - static const FileList files[] = { { "vp90-2-08-tile_1x2_frame_parallel.webm", - "68ede6abd66bae0a2edf2eb9232241b6" }, - { "vp90-2-08-tile_1x4_frame_parallel.webm", - "368ebc6ebf3a5e478d85b2c3149b2848" }, - { "vp90-2-08-tile_1x8_frame_parallel.webm", - "17e439da2388aff3a0f69cb22579c6c1" }, - { nullptr, nullptr } }; +class VP9DecodeMultiThreadedTest : public ::testing::TestWithParam { +}; - DecodeFiles(files); +TEST_P(VP9DecodeMultiThreadedTest, Decode) { + for (int t = 1; t <= 8; ++t) { + EXPECT_EQ(GetParam().expected_md5, DecodeFile(GetParam().name, t)) + << "threads = " << t; + } } -TEST(VP9DecodeMultiThreadedTest, FrameParallelResize) { - static const FileList files[] = { - { "vp90-2-14-resize-fp-tiles-1-16.webm", - "0cd5e632c326297e975f38949c31ea94" }, - { "vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm", - "5c78a96a42e7f4a4f6b2edcdb791e44c" }, - { "vp90-2-14-resize-fp-tiles-1-2.webm", - "e030450ae85c3277be2a418769df98e2" }, - { "vp90-2-14-resize-fp-tiles-1-4.webm", - "312eed4e2b64eb7a4e7f18916606a430" }, - { "vp90-2-14-resize-fp-tiles-16-1.webm", - "1755c16d8af16a9cb3fe7338d90abe52" }, - { "vp90-2-14-resize-fp-tiles-16-2.webm", - "500300592d3fcb6f12fab25e48aaf4df" }, - { "vp90-2-14-resize-fp-tiles-16-4.webm", - "47c48379fa6331215d91c67648e1af6e" }, - { "vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm", - "eecf17290739bc708506fa4827665989" }, - { "vp90-2-14-resize-fp-tiles-16-8.webm", - "29b6bb54e4c26b5ca85d5de5fed94e76" }, - { "vp90-2-14-resize-fp-tiles-1-8.webm", - "1b6f175e08cd82cf84bb800ac6d1caa3" }, - { "vp90-2-14-resize-fp-tiles-2-16.webm", - "ca3b03e4197995d8d5444ede7a6c0804" }, - { "vp90-2-14-resize-fp-tiles-2-1.webm", - "99aec065369d70bbb78ccdff65afed3f" }, - { "vp90-2-14-resize-fp-tiles-2-4.webm", - "22d0ebdb49b87d2920a85aea32e1afd5" }, - { "vp90-2-14-resize-fp-tiles-2-8.webm", - "c2115cf051c62e0f7db1d4a783831541" }, - { "vp90-2-14-resize-fp-tiles-4-16.webm", - "c690d7e1719b31367564cac0af0939cb" }, - { "vp90-2-14-resize-fp-tiles-4-1.webm", - "a926020b2cc3e15ad4cc271853a0ff26" }, - { "vp90-2-14-resize-fp-tiles-4-2.webm", - "42699063d9e581f1993d0cf890c2be78" }, - { "vp90-2-14-resize-fp-tiles-4-8.webm", - "7f76d96036382f45121e3d5aa6f8ec52" }, - { "vp90-2-14-resize-fp-tiles-8-16.webm", - "76a43fcdd7e658542913ea43216ec55d" }, - { "vp90-2-14-resize-fp-tiles-8-1.webm", - "8e3fbe89486ca60a59299dea9da91378" }, - { "vp90-2-14-resize-fp-tiles-8-2.webm", - "ae96f21f21b6370cc0125621b441fc52" }, - { "vp90-2-14-resize-fp-tiles-8-4.webm", - "3eb4f24f10640d42218f7fd7b9fd30d4" }, - { nullptr, nullptr } - }; +const FileParam kNoTilesNonFrameParallelFiles[] = { + { "vp90-2-03-size-226x226.webm", "b35a1b707b28e82be025d960aba039bc" } +}; - DecodeFiles(files); -} +const FileParam kFrameParallelFiles[] = { + { "vp90-2-08-tile_1x2_frame_parallel.webm", + "68ede6abd66bae0a2edf2eb9232241b6" }, + { "vp90-2-08-tile_1x4_frame_parallel.webm", + "368ebc6ebf3a5e478d85b2c3149b2848" }, + { "vp90-2-08-tile_1x8_frame_parallel.webm", + "17e439da2388aff3a0f69cb22579c6c1" }, +}; -TEST(VP9DecodeMultiThreadedTest, NonFrameParallel) { - static const FileList files[] = { - { "vp90-2-08-tile_1x2.webm", "570b4a5d5a70d58b5359671668328a16" }, - { 
"vp90-2-08-tile_1x4.webm", "988d86049e884c66909d2d163a09841a" }, - { "vp90-2-08-tile_1x8.webm", "0941902a52e9092cb010905eab16364c" }, - { "vp90-2-08-tile-4x1.webm", "06505aade6647c583c8e00a2f582266f" }, - { "vp90-2-08-tile-4x4.webm", "85c2299892460d76e2c600502d52bfe2" }, - { nullptr, nullptr } - }; +const FileParam kFrameParallelResizeFiles[] = { + { "vp90-2-14-resize-fp-tiles-1-16.webm", "0cd5e632c326297e975f38949c31ea94" }, + { "vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm", + "5c78a96a42e7f4a4f6b2edcdb791e44c" }, + { "vp90-2-14-resize-fp-tiles-1-2.webm", "e030450ae85c3277be2a418769df98e2" }, + { "vp90-2-14-resize-fp-tiles-1-4.webm", "312eed4e2b64eb7a4e7f18916606a430" }, + { "vp90-2-14-resize-fp-tiles-16-1.webm", "1755c16d8af16a9cb3fe7338d90abe52" }, + { "vp90-2-14-resize-fp-tiles-16-2.webm", "500300592d3fcb6f12fab25e48aaf4df" }, + { "vp90-2-14-resize-fp-tiles-16-4.webm", "47c48379fa6331215d91c67648e1af6e" }, + { "vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm", + "eecf17290739bc708506fa4827665989" }, + { "vp90-2-14-resize-fp-tiles-16-8.webm", "29b6bb54e4c26b5ca85d5de5fed94e76" }, + { "vp90-2-14-resize-fp-tiles-1-8.webm", "1b6f175e08cd82cf84bb800ac6d1caa3" }, + { "vp90-2-14-resize-fp-tiles-2-16.webm", "ca3b03e4197995d8d5444ede7a6c0804" }, + { "vp90-2-14-resize-fp-tiles-2-1.webm", "99aec065369d70bbb78ccdff65afed3f" }, + { "vp90-2-14-resize-fp-tiles-2-4.webm", "22d0ebdb49b87d2920a85aea32e1afd5" }, + { "vp90-2-14-resize-fp-tiles-2-8.webm", "c2115cf051c62e0f7db1d4a783831541" }, + { "vp90-2-14-resize-fp-tiles-4-16.webm", "c690d7e1719b31367564cac0af0939cb" }, + { "vp90-2-14-resize-fp-tiles-4-1.webm", "a926020b2cc3e15ad4cc271853a0ff26" }, + { "vp90-2-14-resize-fp-tiles-4-2.webm", "42699063d9e581f1993d0cf890c2be78" }, + { "vp90-2-14-resize-fp-tiles-4-8.webm", "7f76d96036382f45121e3d5aa6f8ec52" }, + { "vp90-2-14-resize-fp-tiles-8-16.webm", "76a43fcdd7e658542913ea43216ec55d" }, + { "vp90-2-14-resize-fp-tiles-8-1.webm", "8e3fbe89486ca60a59299dea9da91378" }, + { "vp90-2-14-resize-fp-tiles-8-2.webm", "ae96f21f21b6370cc0125621b441fc52" }, + { "vp90-2-14-resize-fp-tiles-8-4.webm", "3eb4f24f10640d42218f7fd7b9fd30d4" }, +}; - DecodeFiles(files); -} +const FileParam kNonFrameParallelFiles[] = { + { "vp90-2-08-tile_1x2.webm", "570b4a5d5a70d58b5359671668328a16" }, + { "vp90-2-08-tile_1x4.webm", "988d86049e884c66909d2d163a09841a" }, + { "vp90-2-08-tile_1x8.webm", "0941902a52e9092cb010905eab16364c" }, + { "vp90-2-08-tile-4x1.webm", "06505aade6647c583c8e00a2f582266f" }, + { "vp90-2-08-tile-4x4.webm", "85c2299892460d76e2c600502d52bfe2" }, +}; + +INSTANTIATE_TEST_SUITE_P(NoTilesNonFrameParallel, VP9DecodeMultiThreadedTest, + ::testing::ValuesIn(kNoTilesNonFrameParallelFiles)); +INSTANTIATE_TEST_SUITE_P(FrameParallel, VP9DecodeMultiThreadedTest, + ::testing::ValuesIn(kFrameParallelFiles)); +INSTANTIATE_TEST_SUITE_P(FrameParallelResize, VP9DecodeMultiThreadedTest, + ::testing::ValuesIn(kFrameParallelResizeFiles)); +INSTANTIATE_TEST_SUITE_P(NonFrameParallel, VP9DecodeMultiThreadedTest, + ::testing::ValuesIn(kNonFrameParallelFiles)); #endif // CONFIG_WEBM_IO INSTANTIATE_TEST_SUITE_P(Synchronous, VPxWorkerThreadTest, ::testing::Bool()); From 847a0ef84f4d5bc3c28124742fe47bb277a1f5fe Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 1 Feb 2022 16:24:07 -0800 Subject: [PATCH 205/926] vp9_roi_test: apply iwyu Change-Id: I715c27e329495940d989f95df65ac10e021261d2 --- test/vp9_roi_test.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/test/vp9_roi_test.cc b/test/vp9_roi_test.cc index 
52dfd9e029..e8373c4c0b 100644
--- a/test/vp9_roi_test.cc
+++ b/test/vp9_roi_test.cc
@@ -8,7 +8,9 @@
  * be found in the AUTHORS file in the root of the source tree.
  */

-#include "memory"
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>

 #include "third_party/googletest/src/include/gtest/gtest.h"

@@ -16,8 +18,11 @@
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
 #include "test/util.h"
+#include "test/video_source.h"
 #include "test/y4m_video_source.h"
 #include "test/yuv_video_source.h"
+#include "vpx/vp8cx.h"
+#include "vpx/vpx_encoder.h"

 #define MASK_WIDTH 40
 #define MASK_HEIGHT 30

From 74c0f504c4187fd0f923209767c530776588728d Mon Sep 17 00:00:00 2001
From: Marco Paniconi
Date: Wed, 2 Feb 2022 16:17:58 -0800
Subject: [PATCH 206/926] rtc-vp9: Fix to tests for intra-only frame.

Fix some issues with the test, and add a new test that verifies that we
can decode the base stream starting at the middle of the sequence where
an intra-only frame is inserted.

Change-Id: I398d23927113eb58ef64694feca25e60ce60a5f7
---
 test/svc_end_to_end_test.cc | 54 +++++++++++++++++++++++++++++++------
 1 file changed, 46 insertions(+), 8 deletions(-)

diff --git a/test/svc_end_to_end_test.cc b/test/svc_end_to_end_test.cc
index 518824d03f..e59e337f1b 100644
--- a/test/svc_end_to_end_test.cc
+++ b/test/svc_end_to_end_test.cc
@@ -171,9 +171,14 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc,
         decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER,
                          decode_to_layer_before_sync_);
       } else {
-        if (decode_to_layer_after_sync_ >= 0)
-          decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER,
-                           decode_to_layer_after_sync_);
+        if (decode_to_layer_after_sync_ >= 0) {
+          int decode_to_layer = decode_to_layer_after_sync_;
+          // Overlay frame is additional layer for intra-only.
+          if (video->frame() == frame_to_sync_ && intra_only_test_ &&
+              decode_to_layer_after_sync_ == 0 && number_spatial_layers_ > 1)
+            decode_to_layer += 1;
+          decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER, decode_to_layer);
+        }
       }
     }
 #endif
@@ -246,7 +251,7 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc,
       cfg_.temporal_layering_mode = 2;
     } else if (num_temporal_layer == 1) {
       cfg_.ts_rate_decimator[0] = 1;
-      cfg_.temporal_layering_mode = 1;
+      cfg_.temporal_layering_mode = 0;
     }
   }
 };
@@ -390,6 +395,37 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc2SL3TLSyncFrameVGADenoise) {
 }
 #endif

+// Encode 3 spatial, 3 temporal layer but don't start decoding.
+// During the sequence insert intra-only on base/qvga layer at frame 20
+// and start decoding only QVGA layer from there.
+TEST_P(SyncFrameOnePassCbrSvc,
+       OnePassCbrSvc3SL3TLSyncFrameStartDecodeOnIntraOnlyQVGA) {
+  SetSvcConfig(3, 3);
+  frame_to_start_decode_ = 20;
+  frame_to_sync_ = 20;
+  decode_to_layer_before_sync_ = 2;
+  decode_to_layer_after_sync_ = 0;
+  intra_only_test_ = true;
+
+  // Set up svc layer sync structure.
+  svc_layer_sync_.base_layer_intra_only = 1;
+  svc_layer_sync_.spatial_layer_sync[0] = 1;
+  svc_layer_sync_.spatial_layer_sync[1] = 0;
+  svc_layer_sync_.spatial_layer_sync[2] = 0;
+
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  cfg_.rc_target_bitrate = 600;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+ if (0 && decode_to_layer_before_sync_ == decode_to_layer_after_sync_) { + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); + } +#endif +} + // Start decoding from beginning of sequence, during sequence insert intra-only // on base/qvga layer. Decode all layers. TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncFrameIntraOnlyQVGA) { @@ -397,8 +433,9 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncFrameIntraOnlyQVGA) { frame_to_start_decode_ = 0; frame_to_sync_ = 20; decode_to_layer_before_sync_ = 2; - // The superframe containing intra-only layer will have 4 frames. Thus set the - // layer to decode after sync frame to 3. + // The superframe containing intra-only layer will have +1 frames. Thus set + // the layer to decode after sync frame to +1 from + // decode_to_layer_before_sync. decode_to_layer_after_sync_ = 3; intra_only_test_ = true; @@ -426,8 +463,9 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncFrameIntraOnlyVGA) { frame_to_start_decode_ = 0; frame_to_sync_ = 20; decode_to_layer_before_sync_ = 2; - // The superframe containing intra-only layer will have 4 frames. Thus set the - // layer to decode after sync frame to 3. + // The superframe containing intra-only layer will have +1 frames. Thus set + // the layer to decode after sync frame to +1 from + // decode_to_layer_before_sync. decode_to_layer_after_sync_ = 3; intra_only_test_ = true; From e2cc35cb673a65ffa14bd36f62390aa221c54393 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Sat, 5 Feb 2022 12:14:37 -0800 Subject: [PATCH 207/926] Update error messages in validate_img() Change-Id: I4aa6d2e16e077d29e4e9eabfc7056fcfed6786d6 --- vp9/vp9_cx_iface.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 9f03ed1728..76274437c6 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -380,8 +380,8 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx, case VPX_IMG_FMT_I440: if (ctx->cfg.g_profile != (unsigned int)PROFILE_1) { ERROR( - "Invalid image format. I422, I444, I440, NV12 images are " - "not supported in profile."); + "Invalid image format. I422, I444, I440 images are not supported " + "in profile."); } break; case VPX_IMG_FMT_I42216: @@ -396,8 +396,8 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx, break; default: ERROR( - "Invalid image format. Only YV12, I420, I422, I444 images are " - "supported."); + "Invalid image format. 
Only YV12, I420, I422, I444, I440, NV12 " + "images are supported."); break; } From b22edeb26b8b47155ee94f2d9093cf7cf108bf07 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Sat, 5 Feb 2022 14:32:49 -0800 Subject: [PATCH 208/926] Handle NV12 in vpx_img_chroma_subsampling() Change-Id: Ibac9f6f8dcdcae0d0c10ae1a118d13baf2407270 --- vp9/encoder/vp9_encoder.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 8711fa2c07..4609a6bb26 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2219,6 +2219,7 @@ static INLINE void vpx_img_chroma_subsampling(vpx_img_fmt_t fmt, switch (fmt) { case VPX_IMG_FMT_I420: case VPX_IMG_FMT_YV12: + case VPX_IMG_FMT_NV12: case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I42016: case VPX_IMG_FMT_I42216: *subsampling_x = 1; break; @@ -2229,6 +2230,7 @@ static INLINE void vpx_img_chroma_subsampling(vpx_img_fmt_t fmt, case VPX_IMG_FMT_I420: case VPX_IMG_FMT_I440: case VPX_IMG_FMT_YV12: + case VPX_IMG_FMT_NV12: case VPX_IMG_FMT_I42016: case VPX_IMG_FMT_I44016: *subsampling_y = 1; break; default: *subsampling_y = 0; break; From 85a9bdc6cc0ab6be4a2fb2c93f9e1551688489f6 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Wed, 10 Nov 2021 15:05:42 +0800 Subject: [PATCH 209/926] vpx_util[loongarch]: Add loongson_intrinsics.h v1.0.5. Bug: webm:1755 Change-Id: Id2fa999bdb8788bd4285114c748c547fa262a95e --- vpx_util/loongson_intrinsics.h | 1869 ++++++++++++++++++++++++++++++++ 1 file changed, 1869 insertions(+) create mode 100644 vpx_util/loongson_intrinsics.h diff --git a/vpx_util/loongson_intrinsics.h b/vpx_util/loongson_intrinsics.h new file mode 100644 index 0000000000..a34b6e8b44 --- /dev/null +++ b/vpx_util/loongson_intrinsics.h @@ -0,0 +1,1869 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + * + */ + +#ifndef VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_ +#define VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_ + +/* + * Copyright (c) 2021 Loongson Technology Corporation Limited + * All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + * + * Contributed by Shiyou Yin + * Xiwei Gu + * Lu Wang + * + * This file is a header file for loongarch builtin extension. + * + */ + +#ifndef LOONGSON_INTRINSICS_H +#define LOONGSON_INTRINSICS_H + +/** + * MAJOR version: Macro usage changes. + * MINOR version: Add new functions, or bug fixes. + * MICRO version: Comment changes or implementation changes. 
+ */
+#define LSOM_VERSION_MAJOR 1
+#define LSOM_VERSION_MINOR 0
+#define LSOM_VERSION_MICRO 5
+
+#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
+  {                                               \
+    _OUT0 = _INS(_IN0);                           \
+    _OUT1 = _INS(_IN1);                           \
+  }
+
+#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
+  {                                                           \
+    _OUT0 = _INS(_IN0, _IN1);                                 \
+    _OUT1 = _INS(_IN2, _IN3);                                 \
+  }
+
+#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
+  {                                                                       \
+    _OUT0 = _INS(_IN0, _IN1, _IN2);                                       \
+    _OUT1 = _INS(_IN3, _IN4, _IN5);                                       \
+  }
+
+#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
+  {                                                                         \
+    DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1);                              \
+    DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3);                              \
+  }
+
+#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \
+                  _OUT1, _OUT2, _OUT3)                                         \
+  {                                                                            \
+    DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1);                     \
+    DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3);                     \
+  }
+
+#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \
+                  _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3)             \
+  {                                                                           \
+    DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1);        \
+    DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3);      \
+  }
+
+#ifdef __loongarch_sx
+#include <lsxintrin.h>
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs  - in_c, in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Signed byte elements from in_h are multiplied by
+ *               signed byte elements from in_l, and then added adjacent to
+ *               each other to get results with the twice size of input.
+ *               The results are then added to the signed half-word elements
+ *               from in_c.
+ * Example     : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
+ *        in_c : 1,2,3,4, 1,2,3,4
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ *         out : 23,40,41,26, 23,40,41,26
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h,
+                                        __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
+  out = __lsx_vmaddwod_h_b(out, in_h, in_l);
+  return out;
+}
+
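+/*
+ * Illustrative usage sketch (an assumption for this note, not part of the
+ * upstream header): accumulating the byte-wise dot product of two rows with
+ * the helpers above, assuming LSX is available and src0/src1 each point to
+ * 16 readable bytes.
+ *   __m128i row0, row1;
+ *   __m128i acc = __lsx_vldi(0);  // zero the halfword accumulator
+ *   DUP2_ARG2(__lsx_vld, src0, 0, src1, 0, row0, row1);
+ *   acc = __lsx_vdp2add_h_b(acc, row0, row1);
+ */
+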
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs  - in_c, in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied by
+ *               unsigned byte elements from in_l, and then added adjacent to
+ *               each other to get results with the twice size of input.
+ *               The results are then added to the signed half-word elements
+ *               from in_c.
+ * Example     : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l)
+ *        in_c : 1,2,3,4, 1,2,3,4
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ *         out : 23,40,41,26, 23,40,41,26
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h,
+                                         __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
+  out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of half-word vector elements
+ * Arguments   : Inputs  - in_c, in_h, in_l
+ *               Outputs - out
+ *               Return Type - __m128i
+ * Details     : Signed half-word elements from in_h are multiplied by
+ *               signed half-word elements from in_l, and then added adjacent
+ *               to each other to get results with the twice size of input.
+ *               The results are then added to the signed word elements from
+ *               in_c.
+ * Example     : out = __lsx_vdp2add_w_h(in_c, in_h, in_l)
+ *        in_c : 1,2,3,4
+ *        in_h : 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1
+ *         out : 23,40,41,26
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h,
+                                        __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
+  out = __lsx_vmaddwod_w_h(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs  - in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Signed byte elements from in_h are multiplied by
+ *               signed byte elements from in_l, and then added adjacent to
+ *               each other to get results with the twice size of input.
+ * Example     : out = __lsx_vdp2_h_b(in_h, in_l)
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ *         out : 22,38,38,22, 22,38,38,22
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmulwev_h_b(in_h, in_l);
+  out = __lsx_vmaddwod_h_b(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs  - in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied by
+ *               unsigned byte elements from in_l, and then added adjacent to
+ *               each other to get results with the twice size of input.
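+ *               (Per halfword lane i this computes
+ *               out[i] = in_h[2*i] * in_l[2*i] + in_h[2*i+1] * in_l[2*i+1].)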
+ * Example : out = __lsx_vdp2_h_bu(in_h, in_l) + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 + * out : 22,38,38,22, 22,38,38,22 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) { + __m128i out; + + out = __lsx_vmulwev_h_bu(in_h, in_l); + out = __lsx_vmaddwod_h_bu(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of byte vector elements + * Arguments : Inputs - in_h, in_l + * Outputs - out + * Return Type - halfword + * Details : Unsigned byte elements from in_h are multiplied by + * signed byte elements from in_l, and then added adjacent to + * each other to get results with the twice size of input. + * Example : out = __lsx_vdp2_h_bu_b(in_h, in_l) + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1 + * out : 22,38,38,22, 22,38,38,6 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) { + __m128i out; + + out = __lsx_vmulwev_h_bu_b(in_h, in_l); + out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of byte vector elements + * Arguments : Inputs - in_h, in_l + * Outputs - out + * Return Type - halfword + * Details : Signed byte elements from in_h are multiplied by + * signed byte elements from in_l, and then added adjacent to + * each other to get results with the twice size of input. + * Example : out = __lsx_vdp2_w_h(in_h, in_l) + * in_h : 1,2,3,4, 5,6,7,8 + * in_l : 8,7,6,5, 4,3,2,1 + * out : 22,38,38,22 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) { + __m128i out; + + out = __lsx_vmulwev_w_h(in_h, in_l); + out = __lsx_vmaddwod_w_h(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Clip all halfword elements of input vector between min & max + * out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) : + * (_in)) + * Arguments : Inputs - _in (input vector) + * - min (min threshold) + * - max (max threshold) + * Outputs - out (output vector with clipped elements) + * Return Type - signed halfword + * Example : out = __lsx_vclip_h(_in) + * _in : -8,2,280,249, -8,255,280,249 + * min : 1,1,1,1, 1,1,1,1 + * max : 9,9,9,9, 9,9,9,9 + * out : 1,2,9,9, 1,9,9,9 + * ============================================================================= + */ +static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) { + __m128i out; + + out = __lsx_vmax_h(min, _in); + out = __lsx_vmin_h(max, out); + return out; +} + +/* + * ============================================================================= + * Description : Set each element of vector between 0 and 255 + * Arguments : Inputs - _in + * Outputs - out + * Return Type - halfword + * Details : Signed byte elements from _in are clamped between 0 and 255. 
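+ *               (Equivalently: out[i] = min(max(_in[i], 0), 255) for each
+ *               element.)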
+ * Example : out = __lsx_vclip255_h(_in) + * _in : -8,255,280,249, -8,255,280,249 + * out : 0,255,255,249, 0,255,255,249 + * ============================================================================= + */ +static inline __m128i __lsx_vclip255_h(__m128i _in) { + __m128i out; + + out = __lsx_vmaxi_h(_in, 0); + out = __lsx_vsat_hu(out, 7); + return out; +} + +/* + * ============================================================================= + * Description : Set each element of vector between 0 and 255 + * Arguments : Inputs - _in + * Outputs - out + * Return Type - word + * Details : Signed byte elements from _in are clamped between 0 and 255. + * Example : out = __lsx_vclip255_w(_in) + * _in : -8,255,280,249 + * out : 0,255,255,249 + * ============================================================================= + */ +static inline __m128i __lsx_vclip255_w(__m128i _in) { + __m128i out; + + out = __lsx_vmaxi_w(_in, 0); + out = __lsx_vsat_wu(out, 7); + return out; +} + +/* + * ============================================================================= + * Description : Swap two variables + * Arguments : Inputs - _in0, _in1 + * Outputs - _in0, _in1 (in-place) + * Details : Swapping of two input variables using xor + * Example : LSX_SWAP(_in0, _in1) + * _in0 : 1,2,3,4 + * _in1 : 5,6,7,8 + * _in0(out) : 5,6,7,8 + * _in1(out) : 1,2,3,4 + * ============================================================================= + */ +#define LSX_SWAP(_in0, _in1) \ + { \ + _in0 = __lsx_vxor_v(_in0, _in1); \ + _in1 = __lsx_vxor_v(_in0, _in1); \ + _in0 = __lsx_vxor_v(_in0, _in1); \ + } + +/* + * ============================================================================= + * Description : Transpose 4x4 block with word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + * Details : + * Example : + * 1, 2, 3, 4 1, 5, 9,13 + * 5, 6, 7, 8 to 2, 6,10,14 + * 9,10,11,12 =====> 3, 7,11,15 + * 13,14,15,16 4, 8,12,16 + * ============================================================================= + */ +#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + __m128i _t0, _t1, _t2, _t3; \ + \ + _t0 = __lsx_vilvl_w(_in1, _in0); \ + _t1 = __lsx_vilvh_w(_in1, _in0); \ + _t2 = __lsx_vilvl_w(_in3, _in2); \ + _t3 = __lsx_vilvh_w(_in3, _in2); \ + _out0 = __lsx_vilvl_d(_t2, _t0); \ + _out1 = __lsx_vilvh_d(_t2, _t0); \ + _out2 = __lsx_vilvl_d(_t3, _t1); \ + _out3 = __lsx_vilvh_d(_t3, _t1); \ + } + +/* + * ============================================================================= + * Description : Transpose 8x8 block with byte elements in vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7 + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, + * _out7 + * Details : The rows of the matrix become columns, and the columns + * become rows. 
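+ *               (Only the low eight bytes of each input register take part,
+ *               so this is an 8x8 byte transpose held in the low halves of
+ *               eight LSX registers.)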
+ * Example : LSX_TRANSPOSE8x8_B + * _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00 + * _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00 + * _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00 + * _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00 + * _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00 + * _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00 + * _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00 + * _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00 + * + * _ out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00 + * _ out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00 + * _ out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00 + * _ out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00 + * _ out4 : 04,14,24,34,44,54,64,74, 00,00,00,00,00,00,00,00 + * _ out5 : 05,15,25,35,45,55,65,75, 00,00,00,00,00,00,00,00 + * _ out6 : 06,16,26,36,46,56,66,76, 00,00,00,00,00,00,00,00 + * _ out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00 + * ============================================================================= + */ +#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + __m128i zero = { 0 }; \ + __m128i shuf8 = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; \ + __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ + \ + _t0 = __lsx_vilvl_b(_in2, _in0); \ + _t1 = __lsx_vilvl_b(_in3, _in1); \ + _t2 = __lsx_vilvl_b(_in6, _in4); \ + _t3 = __lsx_vilvl_b(_in7, _in5); \ + _t4 = __lsx_vilvl_b(_t1, _t0); \ + _t5 = __lsx_vilvh_b(_t1, _t0); \ + _t6 = __lsx_vilvl_b(_t3, _t2); \ + _t7 = __lsx_vilvh_b(_t3, _t2); \ + _out0 = __lsx_vilvl_w(_t6, _t4); \ + _out2 = __lsx_vilvh_w(_t6, _t4); \ + _out4 = __lsx_vilvl_w(_t7, _t5); \ + _out6 = __lsx_vilvh_w(_t7, _t5); \ + _out1 = __lsx_vshuf_b(zero, _out0, shuf8); \ + _out3 = __lsx_vshuf_b(zero, _out2, shuf8); \ + _out5 = __lsx_vshuf_b(zero, _out4, shuf8); \ + _out7 = __lsx_vshuf_b(zero, _out6, shuf8); \ + } + +/* + * ============================================================================= + * Description : Transpose 8x8 block with half-word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + * Details : + * Example : + * 00,01,02,03,04,05,06,07 00,10,20,30,40,50,60,70 + * 10,11,12,13,14,15,16,17 01,11,21,31,41,51,61,71 + * 20,21,22,23,24,25,26,27 02,12,22,32,42,52,62,72 + * 30,31,32,33,34,35,36,37 to 03,13,23,33,43,53,63,73 + * 40,41,42,43,44,45,46,47 ======> 04,14,24,34,44,54,64,74 + * 50,51,52,53,54,55,56,57 05,15,25,35,45,55,65,75 + * 60,61,62,63,64,65,66,67 06,16,26,36,46,56,66,76 + * 70,71,72,73,74,75,76,77 07,17,27,37,47,57,67,77 + * ============================================================================= + */ +#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ + \ + _s0 = __lsx_vilvl_h(_in6, _in4); \ + _s1 = __lsx_vilvl_h(_in7, _in5); \ + _t0 = __lsx_vilvl_h(_s1, _s0); \ + _t1 = __lsx_vilvh_h(_s1, _s0); \ + _s0 = __lsx_vilvh_h(_in6, _in4); \ + _s1 = __lsx_vilvh_h(_in7, _in5); \ + _t2 = __lsx_vilvl_h(_s1, _s0); \ + _t3 = __lsx_vilvh_h(_s1, _s0); \ + _s0 = __lsx_vilvl_h(_in2, _in0); \ + _s1 = __lsx_vilvl_h(_in3, _in1); \ + _t4 = __lsx_vilvl_h(_s1, _s0); \ + _t5 = __lsx_vilvh_h(_s1, _s0); \ + _s0 = __lsx_vilvh_h(_in2, _in0); \ + _s1 = 
__lsx_vilvh_h(_in3, _in1); \ + _t6 = __lsx_vilvl_h(_s1, _s0); \ + _t7 = __lsx_vilvh_h(_s1, _s0); \ + \ + _out0 = __lsx_vpickev_d(_t0, _t4); \ + _out2 = __lsx_vpickev_d(_t1, _t5); \ + _out4 = __lsx_vpickev_d(_t2, _t6); \ + _out6 = __lsx_vpickev_d(_t3, _t7); \ + _out1 = __lsx_vpickod_d(_t0, _t4); \ + _out3 = __lsx_vpickod_d(_t1, _t5); \ + _out5 = __lsx_vpickod_d(_t2, _t6); \ + _out7 = __lsx_vpickod_d(_t3, _t7); \ + } + +/* + * ============================================================================= + * Description : Transpose input 8x4 byte block into 4x8 + * Arguments : Inputs - _in0, _in1, _in2, _in3 (input 8x4 byte block) + * Outputs - _out0, _out1, _out2, _out3 (output 4x8 byte block) + * Return Type - as per RTYPE + * Details : The rows of the matrix become columns, and the columns become + * rows. + * Example : LSX_TRANSPOSE8x4_B + * _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00 + * + * _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00 + * _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00 + * _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00 + * _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00 + * ============================================================================= + */ +#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3) \ + { \ + __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ + \ + _tmp0_m = __lsx_vpackev_w(_in4, _in0); \ + _tmp1_m = __lsx_vpackev_w(_in5, _in1); \ + _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \ + _tmp0_m = __lsx_vpackev_w(_in6, _in2); \ + _tmp1_m = __lsx_vpackev_w(_in7, _in3); \ + \ + _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \ + _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m); \ + _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m); \ + \ + _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m); \ + _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m); \ + _out1 = __lsx_vilvh_d(_out2, _out0); \ + _out3 = __lsx_vilvh_d(_out0, _out2); \ + } + +/* + * ============================================================================= + * Description : Transpose 16x8 block with byte elements in vectors + * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, in8 + * in9, in10, in11, in12, in13, in14, in15 + * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + * Details : + * Example : + * 000,001,002,003,004,005,006,007 + * 008,009,010,011,012,013,014,015 + * 016,017,018,019,020,021,022,023 + * 024,025,026,027,028,029,030,031 + * 032,033,034,035,036,037,038,039 + * 040,041,042,043,044,045,046,047 000,008,...,112,120 + * 048,049,050,051,052,053,054,055 001,009,...,113,121 + * 056,057,058,059,060,061,062,063 to 002,010,...,114,122 + * 064,068,066,067,068,069,070,071 =====> 003,011,...,115,123 + * 072,073,074,075,076,077,078,079 004,012,...,116,124 + * 080,081,082,083,084,085,086,087 005,013,...,117,125 + * 088,089,090,091,092,093,094,095 006,014,...,118,126 + * 096,097,098,099,100,101,102,103 007,015,...,119,127 + * 104,105,106,107,108,109,110,111 + * 112,113,114,115,116,117,118,119 + * 120,121,122,123,124,125,126,127 + * 
============================================================================= + */ +#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ + _out6, _out7) \ + { \ + __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \ + __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ + DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \ + _tmp0, _tmp1, _tmp2, _tmp3); \ + DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15, \ + _in13, _tmp4, _tmp5, _tmp6, _tmp7); \ + DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2); \ + DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3); \ + DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6); \ + DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7); \ + DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4); \ + DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6); \ + DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5); \ + DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7); \ + DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2); \ + DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3); \ + DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6); \ + DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7); \ + } + +/* + * ============================================================================= + * Description : Butterfly of 4 input vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + * Details : Butterfly operation + * Example : + * out0 = in0 + in3; + * out1 = in1 + in2; + * out2 = in1 - in2; + * out3 = in0 - in3; + * ============================================================================= + */ +#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lsx_vadd_b(_in0, _in3); \ + _out1 = __lsx_vadd_b(_in1, _in2); \ + _out2 = __lsx_vsub_b(_in1, _in2); \ + _out3 = __lsx_vsub_b(_in0, _in3); \ + } +#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lsx_vadd_h(_in0, _in3); \ + _out1 = __lsx_vadd_h(_in1, _in2); \ + _out2 = __lsx_vsub_h(_in1, _in2); \ + _out3 = __lsx_vsub_h(_in0, _in3); \ + } +#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lsx_vadd_w(_in0, _in3); \ + _out1 = __lsx_vadd_w(_in1, _in2); \ + _out2 = __lsx_vsub_w(_in1, _in2); \ + _out3 = __lsx_vsub_w(_in0, _in3); \ + } +#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lsx_vadd_d(_in0, _in3); \ + _out1 = __lsx_vadd_d(_in1, _in2); \ + _out2 = __lsx_vsub_d(_in1, _in2); \ + _out3 = __lsx_vsub_d(_in0, _in3); \ + } + +/* + * ============================================================================= + * Description : Butterfly of 8 input vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3, ~ + * Outputs - _out0, _out1, _out2, _out3, ~ + * Details : Butterfly operation + * Example : + * _out0 = _in0 + _in7; + * _out1 = _in1 + _in6; + * _out2 = _in2 + _in5; + * _out3 = _in3 + _in4; + * _out4 = _in3 - _in4; + * _out5 = _in2 - _in5; + * _out6 = _in1 - _in6; + * _out7 = _in0 - _in7; + * ============================================================================= + */ +#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, 
_out6, \
+                          _out7)                                          \
+  {                                                                       \
+    _out0 = __lsx_vadd_b(_in0, _in7);                                     \
+    _out1 = __lsx_vadd_b(_in1, _in6);                                     \
+    _out2 = __lsx_vadd_b(_in2, _in5);                                     \
+    _out3 = __lsx_vadd_b(_in3, _in4);                                     \
+    _out4 = __lsx_vsub_b(_in3, _in4);                                     \
+    _out5 = __lsx_vsub_b(_in2, _in5);                                     \
+    _out6 = __lsx_vsub_b(_in1, _in6);                                     \
+    _out7 = __lsx_vsub_b(_in0, _in7);                                     \
+  }
+
+#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                          _out7)                                           \
+  {                                                                        \
+    _out0 = __lsx_vadd_h(_in0, _in7);                                      \
+    _out1 = __lsx_vadd_h(_in1, _in6);                                      \
+    _out2 = __lsx_vadd_h(_in2, _in5);                                      \
+    _out3 = __lsx_vadd_h(_in3, _in4);                                      \
+    _out4 = __lsx_vsub_h(_in3, _in4);                                      \
+    _out5 = __lsx_vsub_h(_in2, _in5);                                      \
+    _out6 = __lsx_vsub_h(_in1, _in6);                                      \
+    _out7 = __lsx_vsub_h(_in0, _in7);                                      \
+  }
+
+#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                          _out7)                                           \
+  {                                                                        \
+    _out0 = __lsx_vadd_w(_in0, _in7);                                      \
+    _out1 = __lsx_vadd_w(_in1, _in6);                                      \
+    _out2 = __lsx_vadd_w(_in2, _in5);                                      \
+    _out3 = __lsx_vadd_w(_in3, _in4);                                      \
+    _out4 = __lsx_vsub_w(_in3, _in4);                                      \
+    _out5 = __lsx_vsub_w(_in2, _in5);                                      \
+    _out6 = __lsx_vsub_w(_in1, _in6);                                      \
+    _out7 = __lsx_vsub_w(_in0, _in7);                                      \
+  }
+
+#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                          _out7)                                           \
+  {                                                                        \
+    _out0 = __lsx_vadd_d(_in0, _in7);                                      \
+    _out1 = __lsx_vadd_d(_in1, _in6);                                      \
+    _out2 = __lsx_vadd_d(_in2, _in5);                                      \
+    _out3 = __lsx_vadd_d(_in3, _in4);                                      \
+    _out4 = __lsx_vsub_d(_in3, _in4);                                      \
+    _out5 = __lsx_vsub_d(_in2, _in5);                                      \
+    _out6 = __lsx_vsub_d(_in1, _in6);                                      \
+    _out7 = __lsx_vsub_d(_in0, _in7);                                      \
+  }
+
+#endif  // LSX
+
+#ifdef __loongarch_asx
+#include <lasxintrin.h>
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed halfword
+ * Details     : Unsigned byte elements from in_h are multiplied with
+ *               unsigned byte elements from in_l producing a result
+ *               twice the size of input i.e. signed halfword.
+ *               Then this multiplied results of adjacent odd-even elements
+ *               are added to the out vector
+ * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmulwev_h_bu(in_h, in_l);
+  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed halfword
+ * Details     : Signed byte elements from in_h are multiplied with
+ *               signed byte elements from in_l producing a result
+ *               twice the size of input i.e. signed halfword.
+ * Then this multiplication results of adjacent odd-even elements + * are added to the out vector + * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_h_b(in_h, in_l); + out = __lasx_xvmaddwod_h_b(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of halfword vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - signed word + * Details : Signed halfword elements from in_h are multiplied with + * signed halfword elements from in_l producing a result + * twice the size of input i.e. signed word. + * Then this multiplied results of adjacent odd-even elements + * are added to the out vector. + * Example : out = __lasx_xvdp2_w_h(in_h, in_l) + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 + * out : 22,38,38,22, 22,38,38,22 + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_w_h(in_h, in_l); + out = __lasx_xvmaddwod_w_h(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of word vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - signed double + * Details : Signed word elements from in_h are multiplied with + * signed word elements from in_l producing a result + * twice the size of input i.e. signed double-word. + * Then this multiplied results of adjacent odd-even elements + * are added to the out vector. + * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_d_w(in_h, in_l); + out = __lasx_xvmaddwod_d_w(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of halfword vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - signed word + * Details : Unsigned halfword elements from in_h are multiplied with + * signed halfword elements from in_l producing a result + * twice the size of input i.e. unsigned word. + * Multiplication result of adjacent odd-even elements + * are added to the out vector + * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_w_hu_h(in_h, in_l); + out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product & addition of byte vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - halfword + * Details : Signed byte elements from in_h are multiplied with + * signed byte elements from in_l producing a result + * twice the size of input i.e. signed halfword. + * Then this multiplied results of adjacent odd-even elements + * are added to the in_c vector. 
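+ *               (Per halfword lane i this computes out[i] = in_c[i] +
+ *               in_h[2*i] * in_l[2*i] + in_h[2*i+1] * in_l[2*i+1].)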
+ * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l); + out = __lasx_xvmaddwod_h_b(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of halfword vector elements + * Arguments : Inputs - in_c, in_h, in_l + * Output - out + * Return Type - per RTYPE + * Details : Signed halfword elements from in_h are multiplied with + * signed halfword elements from in_l producing a result + * twice the size of input i.e. signed word. + * Multiplication result of adjacent odd-even elements + * are added to the in_c vector. + * Example : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) + * in_c : 1,2,3,4, 1,2,3,4 + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8, + * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1, + * out : 23,40,41,26, 23,40,41,26 + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l); + out = __lasx_xvmaddwod_w_h(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of halfword vector elements + * Arguments : Inputs - in_c, in_h, in_l + * Output - out + * Return Type - signed word + * Details : Unsigned halfword elements from in_h are multiplied with + * unsigned halfword elements from in_l producing a result + * twice the size of input i.e. signed word. + * Multiplication result of adjacent odd-even elements + * are added to the in_c vector. + * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l); + out = __lasx_xvmaddwod_w_hu(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of halfword vector elements + * Arguments : Inputs - in_c, in_h, in_l + * Output - out + * Return Type - signed word + * Details : Unsigned halfword elements from in_h are multiplied with + * signed halfword elements from in_l producing a result + * twice the size of input i.e. signed word. + * Multiplication result of adjacent odd-even elements + * are added to the in_c vector + * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l); + out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Vector Unsigned Dot Product and Subtract + * Arguments : Inputs - in_c, in_h, in_l + * Output - out + * Return Type - signed halfword + * Details : Unsigned byte elements from in_h are multiplied with + * unsigned byte elements from in_l producing a result + * twice the size of input i.e. signed halfword. 
+ * Multiplication result of adjacent odd-even elements + * are added together and subtracted from double width elements + * in_c vector. + * Example : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_h_bu(in_h, in_l); + out = __lasx_xvmaddwod_h_bu(out, in_h, in_l); + out = __lasx_xvsub_h(in_c, out); + return out; +} + +/* + * ============================================================================= + * Description : Vector Signed Dot Product and Subtract + * Arguments : Inputs - in_c, in_h, in_l + * Output - out + * Return Type - signed word + * Details : Signed halfword elements from in_h are multiplied with + * Signed halfword elements from in_l producing a result + * twice the size of input i.e. signed word. + * Multiplication result of adjacent odd-even elements + * are added together and subtracted from double width elements + * in_c vector. + * Example : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l) + * in_c : 0,0,0,0, 0,0,0,0 + * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1 + * in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1 + * out : -7,-3,0,0, 0,-1,0,-1 + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_w_h(in_h, in_l); + out = __lasx_xvmaddwod_w_h(out, in_h, in_l); + out = __lasx_xvsub_w(in_c, out); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of halfword vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - signed word + * Details : Signed halfword elements from in_h are multiplied with + * signed halfword elements from in_l producing a result + * four times the size of input i.e. signed doubleword. + * Then this multiplication results of four adjacent elements + * are added together and stored to the out vector. + * Example : out = __lasx_xvdp4_d_h(in_h, in_l) + * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1 + * in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1 + * out : -2,0,1,1 + * ============================================================================= + */ +static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_w_h(in_h, in_l); + out = __lasx_xvmaddwod_w_h(out, in_h, in_l); + out = __lasx_xvhaddw_d_w(out, out); + return out; +} + +/* + * ============================================================================= + * Description : The high half of the vector elements are expanded and + * added after being doubled. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_h vector and the in_l vector are added after the + * higher half of the two-fold sign extension (signed byte + * to signed halfword) and stored to the out vector. 
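+ *               (Per 128-bit lane: out[i] = (int16)in_h[8 + i] +
+ *               (int16)in_l[8 + i] for i = 0..7.)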
+ * Example     : See out = __lasx_xvaddwh_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvilvh_b(in_h, in_l);
+  out = __lasx_xvhaddw_h_b(out, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The high half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are added after the
+ *               higher half of the two-fold sign extension (signed halfword
+ *               to signed word) and stored to the out vector.
+ * Example     : out = __lasx_xvaddwh_w_h(in_h, in_l)
+ *        in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ *        in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
+ *         out : 1,0,0,-1, 1,0,0, 2
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvilvh_h(in_h, in_l);
+  out = __lasx_xvhaddw_w_h(out, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are added after the
+ *               lower half of the two-fold sign extension (signed byte
+ *               to signed halfword) and stored to the out vector.
+ * Example     : See out = __lasx_xvaddwl_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvilvl_b(in_h, in_l);
+  out = __lasx_xvhaddw_h_b(out, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are added after the
+ *               lower half of the two-fold sign extension (signed halfword
+ *               to signed word) and stored to the out vector.
+ * Example     : out = __lasx_xvaddwl_w_h(in_h, in_l)
+ *        in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ *        in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
+ *         out : 5,-1,4,2, 1,0,2,-1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvilvl_h(in_h, in_l);
+  out = __lasx_xvhaddw_w_h(out, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are added after the
+ *               lower half of the two-fold zero extension (unsigned byte
+ *               to unsigned halfword) and stored to the out vector.
+ * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvilvl_b(in_h, in_l); + out = __lasx_xvhaddw_hu_bu(out, out); + return out; +} + +/* + * ============================================================================= + * Description : The low half of the vector elements are expanded and + * added after being doubled. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_l vector after double zero extension (unsigned byte to + * signed halfword),added to the in_h vector. + * Example : See out = __lasx_xvaddw_w_w_h(in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvsllwil_hu_bu(in_l, 0); + out = __lasx_xvadd_h(in_h, out); + return out; +} + +/* + * ============================================================================= + * Description : The low half of the vector elements are expanded and + * added after being doubled. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_l vector after double sign extension (signed halfword to + * signed word), added to the in_h vector. + * Example : out = __lasx_xvaddw_w_w_h(in_h, in_l) + * in_h : 0, 1,0,0, -1,0,0,1, + * in_l : 2,-1,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1, + * out : 2, 0,1,2, -1,0,1,1, + * ============================================================================= + */ +static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvsllwil_w_h(in_l, 0); + out = __lasx_xvadd_w(in_h, out); + return out; +} + +/* + * ============================================================================= + * Description : Multiplication and addition calculation after expansion + * of the lower half of the vector. + * Arguments : Inputs - in_c, in_h, in_l + * Output - out + * Details : The in_h vector and the in_l vector are multiplied after + * the lower half of the two-fold sign extension (signed halfword + * to signed word), and the result is added to the vector in_c, + * then stored to the out vector. + * Example : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l) + * in_c : 1,2,3,4, 5,6,7,8 + * in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8 + * in_l : 200, 300, 400, 500, 2000, 3000, 4000, 5000, + * -200,-300,-400,-500, -2000,-3000,-4000,-5000 + * out : 201, 602,1203,2004, -995, -1794,-2793,-3992 + * ============================================================================= + */ +static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i tmp0, tmp1, out; + + tmp0 = __lasx_xvsllwil_w_h(in_h, 0); + tmp1 = __lasx_xvsllwil_w_h(in_l, 0); + tmp0 = __lasx_xvmul_w(tmp0, tmp1); + out = __lasx_xvadd_w(tmp0, in_c); + return out; +} + +/* + * ============================================================================= + * Description : Multiplication and addition calculation after expansion + * of the higher half of the vector. + * Arguments : Inputs - in_c, in_h, in_l + * Output - out + * Details : The in_h vector and the in_l vector are multiplied after + * the higher half of the two-fold sign extension (signed + * halfword to signed word), and the result is added to + * the vector in_c, then stored to the out vector. 
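+ *               (Per 128-bit lane: out[i] = in_c[i] +
+ *               (int32)in_h[4 + i] * (int32)in_l[4 + i] for i = 0..3.)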
+ * Example : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i tmp0, tmp1, out; + + tmp0 = __lasx_xvilvh_h(in_h, in_h); + tmp1 = __lasx_xvilvh_h(in_l, in_l); + tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1); + out = __lasx_xvadd_w(tmp0, in_c); + return out; +} + +/* + * ============================================================================= + * Description : Multiplication calculation after expansion of the lower + * half of the vector. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_h vector and the in_l vector are multiplied after + * the lower half of the two-fold sign extension (signed + * halfword to signed word), then stored to the out vector. + * Example : out = __lasx_xvmulwl_w_h(in_h, in_l) + * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1 + * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1 + * out : 6,1,3,0, 0,0,1,0 + * ============================================================================= + */ +static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) { + __m256i tmp0, tmp1, out; + + tmp0 = __lasx_xvsllwil_w_h(in_h, 0); + tmp1 = __lasx_xvsllwil_w_h(in_l, 0); + out = __lasx_xvmul_w(tmp0, tmp1); + return out; +} + +/* + * ============================================================================= + * Description : Multiplication calculation after expansion of the lower + * half of the vector. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_h vector and the in_l vector are multiplied after + * the lower half of the two-fold sign extension (signed + * halfword to signed word), then stored to the out vector. + * Example : out = __lasx_xvmulwh_w_h(in_h, in_l) + * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1 + * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1 + * out : 0,0,0,0, 0,0,0,1 + * ============================================================================= + */ +static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) { + __m256i tmp0, tmp1, out; + + tmp0 = __lasx_xvilvh_h(in_h, in_h); + tmp1 = __lasx_xvilvh_h(in_l, in_l); + out = __lasx_xvmulwev_w_h(tmp0, tmp1); + return out; +} + +/* + * ============================================================================= + * Description : The low half of the vector elements are added to the high half + * after being doubled, then saturated. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_h vector adds the in_l vector after the lower half of + * the two-fold zero extension (unsigned byte to unsigned + * halfword) and then saturated. The results are stored to the out + * vector. + * Example : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l) + * in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1 + * in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1, + * 0,0,0,1 + * out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2, + * ============================================================================= + */ +static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) { + __m256i tmp1, out; + __m256i zero = { 0 }; + + tmp1 = __lasx_xvilvl_b(zero, in_l); + out = __lasx_xvsadd_hu(in_h, tmp1); + return out; +} + +/* + * ============================================================================= + * Description : Clip all halfword elements of input vector between min & max + * out = ((in) < (min)) ? (min) : (((in) > (max)) ? 
(max) : (in)) + * Arguments : Inputs - in (input vector) + * - min (min threshold) + * - max (max threshold) + * Outputs - in (output vector with clipped elements) + * Return Type - signed halfword + * Example : out = __lasx_xvclip_h(in, min, max) + * in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5 + * min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1 + * max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9 + * out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5 + * ============================================================================= + */ +static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) { + __m256i out; + + out = __lasx_xvmax_h(min, in); + out = __lasx_xvmin_h(max, out); + return out; +} + +/* + * ============================================================================= + * Description : Clip all signed halfword elements of input vector + * between 0 & 255 + * Arguments : Inputs - in (input vector) + * Outputs - out (output vector with clipped elements) + * Return Type - signed halfword + * Example : See out = __lasx_xvclip255_w(in) + * ============================================================================= + */ +static inline __m256i __lasx_xvclip255_h(__m256i in) { + __m256i out; + + out = __lasx_xvmaxi_h(in, 0); + out = __lasx_xvsat_hu(out, 7); + return out; +} + +/* + * ============================================================================= + * Description : Clip all signed word elements of input vector + * between 0 & 255 + * Arguments : Inputs - in (input vector) + * Output - out (output vector with clipped elements) + * Return Type - signed word + * Example : out = __lasx_xvclip255_w(in) + * in : -8,255,280,249, -8,255,280,249 + * out : 0,255,255,249, 0,255,255,249 + * ============================================================================= + */ +static inline __m256i __lasx_xvclip255_w(__m256i in) { + __m256i out; + + out = __lasx_xvmaxi_w(in, 0); + out = __lasx_xvsat_wu(out, 7); + return out; +} + +/* + * ============================================================================= + * Description : Indexed halfword element values are replicated to all + * elements in output vector. If 'idx < 8' use xvsplati_l_*, + * if 'idx >= 8' use xvsplati_h_*. + * Arguments : Inputs - in, idx + * Output - out + * Details : Idx element value from in vector is replicated to all + * elements in out vector. + * Valid index range for halfword operation is 0-7 + * Example : out = __lasx_xvsplati_l_h(in, idx) + * in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0 + * idx : 0x02 + * out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11 + * ============================================================================= + */ +static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) { + __m256i out; + + out = __lasx_xvpermi_q(in, in, 0x02); + out = __lasx_xvreplve_h(out, idx); + return out; +} + +/* + * ============================================================================= + * Description : Indexed halfword element values are replicated to all + * elements in output vector. If 'idx < 8' use xvsplati_l_*, + * if 'idx >= 8' use xvsplati_h_*. + * Arguments : Inputs - in, idx + * Output - out + * Details : Idx element value from in vector is replicated to all + * elements in out vector. 
+ * Valid index range for halfword operation is 0-7 + * Example : out = __lasx_xvsplati_h_h(in, idx) + * in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0 + * idx : 0x09 + * out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2 + * ============================================================================= + */ +static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) { + __m256i out; + + out = __lasx_xvpermi_q(in, in, 0x13); + out = __lasx_xvreplve_h(out, idx); + return out; +} + +/* + * ============================================================================= + * Description : Transpose 4x4 block with double-word elements in vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3 + * Outputs - _out0, _out1, _out2, _out3 + * Example : LASX_TRANSPOSE4x4_D + * _in0 : 1,2,3,4 + * _in1 : 1,2,3,4 + * _in2 : 1,2,3,4 + * _in3 : 1,2,3,4 + * + * _out0 : 1,1,1,1 + * _out1 : 2,2,2,2 + * _out2 : 3,3,3,3 + * _out3 : 4,4,4,4 + * ============================================================================= + */ +#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \ + _out3) \ + { \ + __m256i _tmp0, _tmp1, _tmp2, _tmp3; \ + _tmp0 = __lasx_xvilvl_d(_in1, _in0); \ + _tmp1 = __lasx_xvilvh_d(_in1, _in0); \ + _tmp2 = __lasx_xvilvl_d(_in3, _in2); \ + _tmp3 = __lasx_xvilvh_d(_in3, _in2); \ + _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20); \ + _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31); \ + _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20); \ + _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31); \ + } + +/* + * ============================================================================= + * Description : Transpose 8x8 block with word elements in vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7 + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, + * _out7 + * Example : LASX_TRANSPOSE8x8_W + * _in0 : 1,2,3,4,5,6,7,8 + * _in1 : 2,2,3,4,5,6,7,8 + * _in2 : 3,2,3,4,5,6,7,8 + * _in3 : 4,2,3,4,5,6,7,8 + * _in4 : 5,2,3,4,5,6,7,8 + * _in5 : 6,2,3,4,5,6,7,8 + * _in6 : 7,2,3,4,5,6,7,8 + * _in7 : 8,2,3,4,5,6,7,8 + * + * _out0 : 1,2,3,4,5,6,7,8 + * _out1 : 2,2,2,2,2,2,2,2 + * _out2 : 3,3,3,3,3,3,3,3 + * _out3 : 4,4,4,4,4,4,4,4 + * _out4 : 5,5,5,5,5,5,5,5 + * _out5 : 6,6,6,6,6,6,6,6 + * _out6 : 7,7,7,7,7,7,7,7 + * _out7 : 8,8,8,8,8,8,8,8 + * ============================================================================= + */ +#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + __m256i _s0_m, _s1_m; \ + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ + \ + _s0_m = __lasx_xvilvl_w(_in2, _in0); \ + _s1_m = __lasx_xvilvl_w(_in3, _in1); \ + _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m); \ + _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m); \ + _s0_m = __lasx_xvilvh_w(_in2, _in0); \ + _s1_m = __lasx_xvilvh_w(_in3, _in1); \ + _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m); \ + _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m); \ + _s0_m = __lasx_xvilvl_w(_in6, _in4); \ + _s1_m = __lasx_xvilvl_w(_in7, _in5); \ + _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m); \ + _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m); \ + _s0_m = __lasx_xvilvh_w(_in6, _in4); \ + _s1_m = __lasx_xvilvh_w(_in7, _in5); \ + _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m); \ + _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m); \ + _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20); \ + _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20); \ + _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20); \ + _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20); 
\ + _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31); \ + _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31); \ + _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31); \ + _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31); \ + } + +/* + * ============================================================================= + * Description : Transpose input 16x8 byte block + * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, + * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15 + * (input 16x8 byte block) + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, + * _out7 (output 8x16 byte block) + * Details : The rows of the matrix become columns, and the columns become + * rows. + * Example : See LASX_TRANSPOSE16x8_H + * ============================================================================= + */ +#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ + _out6, _out7) \ + { \ + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ + \ + _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \ + _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \ + _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \ + _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \ + _tmp4_m = __lasx_xvilvl_b(_in10, _in8); \ + _tmp5_m = __lasx_xvilvl_b(_in11, _in9); \ + _tmp6_m = __lasx_xvilvl_b(_in14, _in12); \ + _tmp7_m = __lasx_xvilvl_b(_in15, _in13); \ + _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \ + _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \ + _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \ + _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \ + _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m); \ + _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m); \ + _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m); \ + _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m); \ + _tmp0_m = __lasx_xvilvl_w(_out2, _out0); \ + _tmp2_m = __lasx_xvilvh_w(_out2, _out0); \ + _tmp4_m = __lasx_xvilvl_w(_out3, _out1); \ + _tmp6_m = __lasx_xvilvh_w(_out3, _out1); \ + _tmp1_m = __lasx_xvilvl_w(_out6, _out4); \ + _tmp3_m = __lasx_xvilvh_w(_out6, _out4); \ + _tmp5_m = __lasx_xvilvl_w(_out7, _out5); \ + _tmp7_m = __lasx_xvilvh_w(_out7, _out5); \ + _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m); \ + _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m); \ + _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m); \ + _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m); \ + _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m); \ + _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m); \ + _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m); \ + _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m); \ + } + +/* + * ============================================================================= + * Description : Transpose input 16x8 byte block + * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, + * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15 + * (input 16x8 byte block) + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, + * _out7 (output 8x16 byte block) + * Details : The rows of the matrix become columns, and the columns become + * rows. 
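+ *               (Sixteen rows of eight halfwords become eight rows of
+ *               sixteen halfwords: output row r holds element r of input
+ *               rows 0..15 in order.)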
+ * Example : LASX_TRANSPOSE16x8_H + * _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in2 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in3 : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in4 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in5 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in6 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in7 : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in8 : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in9 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * + * _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6 + * _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2 + * _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3 + * _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4 + * _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5 + * _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6 + * _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 + * _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 + * ============================================================================= + */ +#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ + _out6, _out7) \ + { \ + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ + __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ + \ + _tmp0_m = __lasx_xvilvl_h(_in2, _in0); \ + _tmp1_m = __lasx_xvilvl_h(_in3, _in1); \ + _tmp2_m = __lasx_xvilvl_h(_in6, _in4); \ + _tmp3_m = __lasx_xvilvl_h(_in7, _in5); \ + _tmp4_m = __lasx_xvilvl_h(_in10, _in8); \ + _tmp5_m = __lasx_xvilvl_h(_in11, _in9); \ + _tmp6_m = __lasx_xvilvl_h(_in14, _in12); \ + _tmp7_m = __lasx_xvilvl_h(_in15, _in13); \ + _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \ + _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \ + _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \ + _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \ + _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \ + _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \ + _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \ + _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \ + _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \ + _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \ + _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \ + _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \ + _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \ + _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \ + _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \ + _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \ + _out0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \ + _out1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \ + _out2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \ + _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \ + \ + _tmp0_m = __lasx_xvilvh_h(_in2, _in0); \ + _tmp1_m = __lasx_xvilvh_h(_in3, _in1); \ + _tmp2_m = __lasx_xvilvh_h(_in6, _in4); \ + _tmp3_m = __lasx_xvilvh_h(_in7, _in5); \ + _tmp4_m = __lasx_xvilvh_h(_in10, _in8); \ + _tmp5_m = __lasx_xvilvh_h(_in11, _in9); \ + _tmp6_m = __lasx_xvilvh_h(_in14, _in12); \ + _tmp7_m = __lasx_xvilvh_h(_in15, _in13); \ + _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \ + _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \ + _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \ + _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \ + _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \ + _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \ + _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \ + _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \ + _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \ + _tmp2_m = 
__lasx_xvilvh_d(_t2, _t0); \ + _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \ + _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \ + _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \ + _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \ + _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \ + _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \ + _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \ + _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \ + _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \ + _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \ + } + +/* + * ============================================================================= + * Description : Transpose 4x4 block with halfword elements in vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3 + * Outputs - _out0, _out1, _out2, _out3 + * Return Type - signed halfword + * Details : The rows of the matrix become columns, and the columns become + * rows. + * Example : See LASX_TRANSPOSE8x8_H + * ============================================================================= + */ +#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \ + _out3) \ + { \ + __m256i _s0_m, _s1_m; \ + \ + _s0_m = __lasx_xvilvl_h(_in1, _in0); \ + _s1_m = __lasx_xvilvl_h(_in3, _in2); \ + _out0 = __lasx_xvilvl_w(_s1_m, _s0_m); \ + _out2 = __lasx_xvilvh_w(_s1_m, _s0_m); \ + _out1 = __lasx_xvilvh_d(_out0, _out0); \ + _out3 = __lasx_xvilvh_d(_out2, _out2); \ + } + +/* + * ============================================================================= + * Description : Transpose input 8x8 byte block + * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7 + * (input 8x8 byte block) + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, + * _out7 (output 8x8 byte block) + * Example : See LASX_TRANSPOSE8x8_H + * ============================================================================= + */ +#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ + _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \ + _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \ + _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \ + _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \ + _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \ + _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \ + _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \ + _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \ + _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m); \ + _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m); \ + _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m); \ + _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m); \ + _out1 = __lasx_xvbsrl_v(_out0, 8); \ + _out3 = __lasx_xvbsrl_v(_out2, 8); \ + _out5 = __lasx_xvbsrl_v(_out4, 8); \ + _out7 = __lasx_xvbsrl_v(_out6, 8); \ + } + +/* + * ============================================================================= + * Description : Transpose 8x8 block with halfword elements in vectors. + * Arguments : Inputs - _in0, _in1, ~ + * Outputs - _out0, _out1, ~ + * Details : The rows of the matrix become columns, and the columns become + * rows. 
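+ * Note : The low and high 128-bit halves are transposed independently, so two 8x8 halfword blocks are handled at once; the example below duplicates the same data in both halves.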
+ * Example : LASX_TRANSPOSE8x8_H + * _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8 + * _in2 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8 + * _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * _in4 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8 + * _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * _in6 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8 + * + * _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9 + * _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2 + * _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3 + * _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4 + * _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5 + * _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6 + * _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7 + * _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8 + * ============================================================================= + */ +#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + __m256i _s0_m, _s1_m; \ + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ + \ + _s0_m = __lasx_xvilvl_h(_in6, _in4); \ + _s1_m = __lasx_xvilvl_h(_in7, _in5); \ + _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m); \ + _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m); \ + _s0_m = __lasx_xvilvh_h(_in6, _in4); \ + _s1_m = __lasx_xvilvh_h(_in7, _in5); \ + _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m); \ + _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m); \ + \ + _s0_m = __lasx_xvilvl_h(_in2, _in0); \ + _s1_m = __lasx_xvilvl_h(_in3, _in1); \ + _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m); \ + _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m); \ + _s0_m = __lasx_xvilvh_h(_in2, _in0); \ + _s1_m = __lasx_xvilvh_h(_in3, _in1); \ + _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m); \ + _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m); \ + \ + _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m); \ + _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m); \ + _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m); \ + _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m); \ + _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m); \ + _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m); \ + _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m); \ + _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m); \ + } + +/* + * ============================================================================= + * Description : Butterfly of 4 input vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3 + * Outputs - _out0, _out1, _out2, _out3 + * Details : Butterfly operation + * Example : LASX_BUTTERFLY_4 + * _out0 = _in0 + _in3; + * _out1 = _in1 + _in2; + * _out2 = _in1 - _in2; + * _out3 = _in0 - _in3; + * ============================================================================= + */ +#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lasx_xvadd_b(_in0, _in3); \ + _out1 = __lasx_xvadd_b(_in1, _in2); \ + _out2 = __lasx_xvsub_b(_in1, _in2); \ + _out3 = __lasx_xvsub_b(_in0, _in3); \ + } +#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lasx_xvadd_h(_in0, _in3); \ + _out1 = __lasx_xvadd_h(_in1, _in2); \ + _out2 = __lasx_xvsub_h(_in1, _in2); \ + _out3 = __lasx_xvsub_h(_in0, _in3); \ + } +#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lasx_xvadd_w(_in0, _in3); \ + _out1 = __lasx_xvadd_w(_in1, _in2); \ + _out2 = __lasx_xvsub_w(_in1, _in2); \ + _out3 = __lasx_xvsub_w(_in0, _in3); \ + } +#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { 
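/* double-word variant; same add/sub butterfly as the _B/_H/_W forms above */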
\ + _out0 = __lasx_xvadd_d(_in0, _in3); \ + _out1 = __lasx_xvadd_d(_in1, _in2); \ + _out2 = __lasx_xvsub_d(_in1, _in2); \ + _out3 = __lasx_xvsub_d(_in0, _in3); \ + } + +/* + * ============================================================================= + * Description : Butterfly of 8 input vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3, ~ + * Outputs - _out0, _out1, _out2, _out3, ~ + * Details : Butterfly operation + * Example : LASX_BUTTERFLY_8 + * _out0 = _in0 + _in7; + * _out1 = _in1 + _in6; + * _out2 = _in2 + _in5; + * _out3 = _in3 + _in4; + * _out4 = _in3 - _in4; + * _out5 = _in2 - _in5; + * _out6 = _in1 - _in6; + * _out7 = _in0 - _in7; + * ============================================================================= + */ +#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + _out0 = __lasx_xvadd_b(_in0, _in7); \ + _out1 = __lasx_xvadd_b(_in1, _in6); \ + _out2 = __lasx_xvadd_b(_in2, _in5); \ + _out3 = __lasx_xvadd_b(_in3, _in4); \ + _out4 = __lasx_xvsub_b(_in3, _in4); \ + _out5 = __lasx_xvsub_b(_in2, _in5); \ + _out6 = __lasx_xvsub_b(_in1, _in6); \ + _out7 = __lasx_xvsub_b(_in0, _in7); \ + } + +#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + _out0 = __lasx_xvadd_h(_in0, _in7); \ + _out1 = __lasx_xvadd_h(_in1, _in6); \ + _out2 = __lasx_xvadd_h(_in2, _in5); \ + _out3 = __lasx_xvadd_h(_in3, _in4); \ + _out4 = __lasx_xvsub_h(_in3, _in4); \ + _out5 = __lasx_xvsub_h(_in2, _in5); \ + _out6 = __lasx_xvsub_h(_in1, _in6); \ + _out7 = __lasx_xvsub_h(_in0, _in7); \ + } + +#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + _out0 = __lasx_xvadd_w(_in0, _in7); \ + _out1 = __lasx_xvadd_w(_in1, _in6); \ + _out2 = __lasx_xvadd_w(_in2, _in5); \ + _out3 = __lasx_xvadd_w(_in3, _in4); \ + _out4 = __lasx_xvsub_w(_in3, _in4); \ + _out5 = __lasx_xvsub_w(_in2, _in5); \ + _out6 = __lasx_xvsub_w(_in1, _in6); \ + _out7 = __lasx_xvsub_w(_in0, _in7); \ + } + +#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + _out0 = __lasx_xvadd_d(_in0, _in7); \ + _out1 = __lasx_xvadd_d(_in1, _in6); \ + _out2 = __lasx_xvadd_d(_in2, _in5); \ + _out3 = __lasx_xvadd_d(_in3, _in4); \ + _out4 = __lasx_xvsub_d(_in3, _in4); \ + _out5 = __lasx_xvsub_d(_in2, _in5); \ + _out6 = __lasx_xvsub_d(_in1, _in6); \ + _out7 = __lasx_xvsub_d(_in0, _in7); \ + } + +#endif // LASX + +/* + * ============================================================================= + * Description : Print out elements in vector. + * Arguments : Inputs - RTYPE, _element_num, _in0, _enter + * Outputs - + * Details : Print out '_element_num' elements in 'RTYPE' vector '_in0', if + * '_enter' is TRUE, prefix "\nVP:" will be added first. 
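+ * Note : Debugging helper only; assumes <stdio.h> has been included by the caller and that 'RTYPE' supports the GCC vector subscript extension.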
+ * Example : VECT_PRINT(v4i32,4,in0,1); // in0: 1,2,3,4 + * VP:1,2,3,4, + * ============================================================================= + */ +#define VECT_PRINT(RTYPE, element_num, in0, enter) \ + { \ + RTYPE _tmp0 = (RTYPE)in0; \ + int _i = 0; \ + if (enter) printf("\nVP:"); \ + for (_i = 0; _i < element_num; _i++) printf("%d,", _tmp0[_i]); \ + } + +#endif /* LOONGSON_INTRINSICS_H */ +#endif /* VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_ */ From b3cc4b625d1d2c9a0913dcfbda97dd3bf845f998 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Wed, 10 Nov 2021 15:21:17 +0800 Subject: [PATCH 210/926] vp8[loongarch]: Optimize vp8_loop/sixtap, vpx_dc with LSX. 1. vp8_loop_filter_mbh, vp8_loop_filter_mbv 2. vp8_sixtap_predict16x16, vp8_sixtap_predict8x8 3. vpx_dc_predictor_16x16, vpx_dc_predictor_8x8 ./vpxdec --progress -o YUV_1920X1080.yuv original_1200f/VP8_1920X1080.webm before: 37.77fps after : 220.90fps Bug: webm:1755 Change-Id: I1a3ce16f0c872261d813b6531cfdf25bd59bb774 --- vp8/common/loongarch/loopfilter_filters_lsx.c | 393 ++++++ vp8/common/loongarch/sixtap_filter_lsx.c | 1164 +++++++++++++++++ vp8/common/rtcd_defs.pl | 8 +- vp8/vp8_common.mk | 4 + vpx_dsp/loongarch/intrapred_lsx.c | 98 ++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 7 files changed, 1666 insertions(+), 6 deletions(-) create mode 100644 vp8/common/loongarch/loopfilter_filters_lsx.c create mode 100644 vp8/common/loongarch/sixtap_filter_lsx.c create mode 100644 vpx_dsp/loongarch/intrapred_lsx.c diff --git a/vp8/common/loongarch/loopfilter_filters_lsx.c b/vp8/common/loongarch/loopfilter_filters_lsx.c new file mode 100644 index 0000000000..484b3d6ad0 --- /dev/null +++ b/vp8/common/loongarch/loopfilter_filters_lsx.c @@ -0,0 +1,393 @@ +/* + * Copyright (c) 2021 Loongson Technology Corporation Limited + * Contributed by Lu Wang + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vp8_rtcd.h" +#include "vp8/common/loopfilter.h" +#include "vpx_util/loongson_intrinsics.h" + +#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \ + { \ + __m128i p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \ + __m128i u, filt, t1, t2, filt_sign, q0_sub_p0; \ + __m128i filt_r, filt_l; \ + __m128i temp0, temp1, temp2, temp3; \ + const __m128i cnst4b = __lsx_vldi(4); \ + const __m128i cnst3b = __lsx_vldi(3); \ + const __m128i cnst9h = __lsx_vldi(1033); \ + const __m128i cnst63h = __lsx_vldi(1087); \ + \ + p2_m = __lsx_vxori_b(p2, 0x80); \ + p1_m = __lsx_vxori_b(p1, 0x80); \ + p0_m = __lsx_vxori_b(p0, 0x80); \ + q0_m = __lsx_vxori_b(q0, 0x80); \ + q1_m = __lsx_vxori_b(q1, 0x80); \ + q2_m = __lsx_vxori_b(q2, 0x80); \ + \ + filt = __lsx_vssub_b(p1_m, q1_m); \ + q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vand_v(filt, mask); \ + \ + t2 = __lsx_vand_v(filt, hev); \ + hev = __lsx_vxori_b(hev, 0xff); \ + filt = __lsx_vand_v(hev, filt); \ + t1 = __lsx_vsadd_b(t2, cnst4b); \ + t1 = __lsx_vsra_b(t1, cnst3b); \ + t2 = __lsx_vsadd_b(t2, cnst3b); \ + t2 = __lsx_vsra_b(t2, cnst3b); \ + q0_m = __lsx_vssub_b(q0_m, t1); \ + p0_m = __lsx_vsadd_b(p0_m, t2); \ + filt_sign = __lsx_vslti_b(filt, 0); \ + filt_r = __lsx_vilvl_b(filt_sign, filt); \ + filt_l = __lsx_vilvh_b(filt_sign, filt); \ + temp0 = __lsx_vmul_h(filt_r, cnst9h); \ + temp1 = __lsx_vadd_h(temp0, cnst63h); \ + temp2 = __lsx_vmul_h(filt_l, cnst9h); \ + temp3 = __lsx_vadd_h(temp2, cnst63h); \ + \ + u = __lsx_vssrani_b_h(temp3, temp1, 7); \ + q2_m = __lsx_vssub_b(q2_m, u); \ + p2_m = __lsx_vsadd_b(p2_m, u); \ + q2 = __lsx_vxori_b(q2_m, 0x80); \ + p2 = __lsx_vxori_b(p2_m, 0x80); \ + \ + temp1 = __lsx_vadd_h(temp1, temp0); \ + temp3 = __lsx_vadd_h(temp3, temp2); \ + \ + u = __lsx_vssrani_b_h(temp3, temp1, 7); \ + q1_m = __lsx_vssub_b(q1_m, u); \ + p1_m = __lsx_vsadd_b(p1_m, u); \ + q1 = __lsx_vxori_b(q1_m, 0x80); \ + p1 = __lsx_vxori_b(p1_m, 0x80); \ + \ + temp1 = __lsx_vadd_h(temp1, temp0); \ + temp3 = __lsx_vadd_h(temp3, temp2); \ + \ + u = __lsx_vssrani_b_h(temp3, temp1, 7); \ + q0_m = __lsx_vssub_b(q0_m, u); \ + p0_m = __lsx_vsadd_b(p0_m, u); \ + q0 = __lsx_vxori_b(q0_m, 0x80); \ + p0 = __lsx_vxori_b(p0_m, 0x80); \ + } + +#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + limit_in, b_limit_in, thresh_in, hev_out, mask_out, \ + flat_out) \ + { \ + __m128i p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ + __m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ + \ + p3_asub_p2_m = __lsx_vabsd_bu(p3_in, p2_in); \ + p2_asub_p1_m = __lsx_vabsd_bu(p2_in, p1_in); \ + p1_asub_p0_m = __lsx_vabsd_bu(p1_in, p0_in); \ + q1_asub_q0_m = __lsx_vabsd_bu(q1_in, q0_in); \ + q2_asub_q1_m = __lsx_vabsd_bu(q2_in, q1_in); \ + q3_asub_q2_m = __lsx_vabsd_bu(q3_in, q2_in); \ + p0_asub_q0_m = __lsx_vabsd_bu(p0_in, q0_in); \ + p1_asub_q1_m = __lsx_vabsd_bu(p1_in, q1_in); \ + flat_out = __lsx_vmax_bu(p1_asub_p0_m, q1_asub_q0_m); \ + hev_out = __lsx_vslt_bu(thresh_in, flat_out); \ + p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p0_asub_q0_m); \ + p1_asub_q1_m = __lsx_vsrli_b(p1_asub_q1_m, 1); \ + p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p1_asub_q1_m); \ + mask_out = __lsx_vslt_bu(b_limit_in, p0_asub_q0_m); \ + mask_out = __lsx_vmax_bu(flat_out, mask_out); \ + p3_asub_p2_m = __lsx_vmax_bu(p3_asub_p2_m, p2_asub_p1_m); \ + mask_out = __lsx_vmax_bu(p3_asub_p2_m, mask_out); \ + 
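/* fold the q-side neighbour deltas into the same mask before the limit test */ \ +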
q2_asub_q1_m = __lsx_vmax_bu(q2_asub_q1_m, q3_asub_q2_m); \ + mask_out = __lsx_vmax_bu(q2_asub_q1_m, mask_out); \ + mask_out = __lsx_vslt_bu(limit_in, mask_out); \ + mask_out = __lsx_vxori_b(mask_out, 0xff); \ + } + +#define VP8_ST6x1_B(in0, in0_idx, in1, in1_idx, pdst, stride) \ + { \ + __lsx_vstelm_w(in0, pdst, 0, in0_idx); \ + __lsx_vstelm_h(in1, pdst + stride, 0, in1_idx); \ + } + +static inline void mbloop_filter_horizontal_edge_y_lsx( + uint8_t *src, int32_t pitch, const uint8_t b_limit_in, + const uint8_t limit_in, const uint8_t thresh_in) { + uint8_t *temp_src; + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i mask, hev, flat, thresh, limit, b_limit; + + DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit); + thresh = __lsx_vldrepl_b(&thresh_in, 0); + + temp_src = src - pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, p3, p2, p1, p0); + temp_src += pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, q0, q1, q2, q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev); + + temp_src = src - pitch_x3; + __lsx_vstx(p2, temp_src, 0); + __lsx_vstx(p1, temp_src, pitch); + __lsx_vstx(p0, temp_src, pitch_x2); + __lsx_vstx(q0, temp_src, pitch_x3); + temp_src += pitch_x4; + __lsx_vstx(q1, temp_src, 0); + __lsx_vstx(q2, temp_src, pitch); +} + +static inline void mbloop_filter_horizontal_edge_uv_lsx( + uint8_t *src_u, uint8_t *src_v, int32_t pitch, const uint8_t b_limit_in, + const uint8_t limit_in, const uint8_t thresh_in) { + uint8_t *temp_src; + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i mask, hev, flat, thresh, limit, b_limit; + __m128i p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u; + __m128i p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v; + + DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit); + thresh = __lsx_vldrepl_b(&thresh_in, 0); + + temp_src = src_u - pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, p3_u, p2_u, p1_u, p0_u); + temp_src += pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, q0_u, q1_u, q2_u, q3_u); + temp_src = src_v - pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, p3_v, p2_v, p1_v, p0_v); + temp_src += pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, q0_v, q1_v, q2_v, q3_v); + + DUP4_ARG2(__lsx_vilvl_d, p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, + p2, p1, p0); + DUP4_ARG2(__lsx_vilvl_d, q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, + q1, q2, q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev); + + src_u -= pitch_x3; + __lsx_vstelm_d(p2, src_u, 0, 0); + __lsx_vstelm_d(p1, src_u + pitch, 0, 0); + __lsx_vstelm_d(p0, src_u + pitch_x2, 0, 0); + __lsx_vstelm_d(q0, src_u + pitch_x3, 0, 0); + src_u += pitch_x4; + __lsx_vstelm_d(q1, src_u, 0, 0); + src_u += pitch; + __lsx_vstelm_d(q2, src_u, 0, 0); + + src_v -= pitch_x3; + __lsx_vstelm_d(p2, src_v, 0, 1); + __lsx_vstelm_d(p1, src_v + pitch, 
0, 1); + __lsx_vstelm_d(p0, src_v + pitch_x2, 0, 1); + __lsx_vstelm_d(q0, src_v + pitch_x3, 0, 1); + src_v += pitch_x4; + __lsx_vstelm_d(q1, src_v, 0, 1); + src_v += pitch; + __lsx_vstelm_d(q2, src_v, 0, 1); +} + +static inline void mbloop_filter_vertical_edge_y_lsx(uint8_t *src, + int32_t pitch, + const uint8_t b_limit_in, + const uint8_t limit_in, + const uint8_t thresh_in) { + uint8_t *temp_src; + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i mask, hev, flat, thresh, limit, b_limit; + __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8; + __m128i row9, row10, row11, row12, row13, row14, row15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit); + thresh = __lsx_vldrepl_b(&thresh_in, 0); + temp_src = src - 4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, row0, row1, row2, row3); + temp_src += pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, row4, row5, row6, row7); + temp_src += pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, row8, row9, row10, row11); + temp_src += pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, row12, row13, row14, row15); + temp_src -= pitch_x4; + LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev); + DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, tmp0, tmp1); + tmp3 = __lsx_vilvl_h(tmp1, tmp0); + tmp4 = __lsx_vilvh_h(tmp1, tmp0); + DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, tmp0, tmp1); + tmp6 = __lsx_vilvl_h(tmp1, tmp0); + tmp7 = __lsx_vilvh_h(tmp1, tmp0); + tmp2 = __lsx_vilvl_b(q2, q1); + tmp5 = __lsx_vilvh_b(q2, q1); + + temp_src = src - 3; + VP8_ST6x1_B(tmp3, 0, tmp2, 0, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp3, 1, tmp2, 1, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp3, 2, tmp2, 2, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp3, 3, tmp2, 3, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp4, 0, tmp2, 4, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp4, 1, tmp2, 5, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp4, 2, tmp2, 6, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp4, 3, tmp2, 7, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp6, 0, tmp5, 0, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp6, 1, tmp5, 1, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp6, 2, tmp5, 2, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp6, 3, tmp5, 3, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp7, 0, tmp5, 4, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp7, 1, tmp5, 5, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp7, 2, tmp5, 6, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp7, 3, tmp5, 7, temp_src, 4); +} + +static inline void mbloop_filter_vertical_edge_uv_lsx( + uint8_t *src_u, uint8_t *src_v, int32_t pitch, const uint8_t b_limit_in, + const uint8_t limit_in, const uint8_t thresh_in) { + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + 
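/* after LSX_TRANSPOSE16x8_B, the eight u rows occupy the low 8 bytes of each p/q vector and the eight v rows the high 8 bytes, so both chroma planes are filtered in a single pass */ +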
__m128i mask, hev, flat, thresh, limit, b_limit; + __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8; + __m128i row9, row10, row11, row12, row13, row14, row15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit); + thresh = __lsx_vldrepl_b(&thresh_in, 0); + + src_u -= 4; + DUP4_ARG2(__lsx_vldx, src_u, 0, src_u, pitch, src_u, pitch_x2, src_u, + pitch_x3, row0, row1, row2, row3); + src_u += pitch_x4; + DUP4_ARG2(__lsx_vldx, src_u, 0, src_u, pitch, src_u, pitch_x2, src_u, + pitch_x3, row4, row5, row6, row7); + src_v -= 4; + DUP4_ARG2(__lsx_vldx, src_v, 0, src_v, pitch, src_v, pitch_x2, src_v, + pitch_x3, row8, row9, row10, row11); + src_v += pitch_x4; + DUP4_ARG2(__lsx_vldx, src_v, 0, src_v, pitch, src_v, pitch_x2, src_v, + pitch_x3, row12, row13, row14, row15); + LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev); + + DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, tmp0, tmp1); + tmp3 = __lsx_vilvl_h(tmp1, tmp0); + tmp4 = __lsx_vilvh_h(tmp1, tmp0); + DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, tmp0, tmp1); + tmp6 = __lsx_vilvl_h(tmp1, tmp0); + tmp7 = __lsx_vilvh_h(tmp1, tmp0); + tmp2 = __lsx_vilvl_b(q2, q1); + tmp5 = __lsx_vilvh_b(q2, q1); + + src_u += 1 - pitch_x4; + VP8_ST6x1_B(tmp3, 0, tmp2, 0, src_u, 4); + src_u += pitch; + VP8_ST6x1_B(tmp3, 1, tmp2, 1, src_u, 4); + src_u += pitch; + VP8_ST6x1_B(tmp3, 2, tmp2, 2, src_u, 4); + src_u += pitch; + VP8_ST6x1_B(tmp3, 3, tmp2, 3, src_u, 4); + src_u += pitch; + VP8_ST6x1_B(tmp4, 0, tmp2, 4, src_u, 4); + src_u += pitch; + VP8_ST6x1_B(tmp4, 1, tmp2, 5, src_u, 4); + src_u += pitch; + VP8_ST6x1_B(tmp4, 2, tmp2, 6, src_u, 4); + src_u += pitch; + VP8_ST6x1_B(tmp4, 3, tmp2, 7, src_u, 4); + + src_v += 1 - pitch_x4; + VP8_ST6x1_B(tmp6, 0, tmp5, 0, src_v, 4); + src_v += pitch; + VP8_ST6x1_B(tmp6, 1, tmp5, 1, src_v, 4); + src_v += pitch; + VP8_ST6x1_B(tmp6, 2, tmp5, 2, src_v, 4); + src_v += pitch; + VP8_ST6x1_B(tmp6, 3, tmp5, 3, src_v, 4); + src_v += pitch; + VP8_ST6x1_B(tmp7, 0, tmp5, 4, src_v, 4); + src_v += pitch; + VP8_ST6x1_B(tmp7, 1, tmp5, 5, src_v, 4); + src_v += pitch; + VP8_ST6x1_B(tmp7, 2, tmp5, 6, src_v, 4); + src_v += pitch; + VP8_ST6x1_B(tmp7, 3, tmp5, 7, src_v, 4); +} + +void vp8_loop_filter_mbh_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v, + int32_t pitch_y, int32_t pitch_u_v, + loop_filter_info *lpf_info_ptr) { + mbloop_filter_horizontal_edge_y_lsx(src_y, pitch_y, *lpf_info_ptr->mblim, + *lpf_info_ptr->lim, + *lpf_info_ptr->hev_thr); + if (src_u) { + mbloop_filter_horizontal_edge_uv_lsx( + src_u, src_v, pitch_u_v, *lpf_info_ptr->mblim, *lpf_info_ptr->lim, + *lpf_info_ptr->hev_thr); + } +} + +void vp8_loop_filter_mbv_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v, + int32_t pitch_y, int32_t pitch_u_v, + loop_filter_info *lpf_info_ptr) { + mbloop_filter_vertical_edge_y_lsx(src_y, pitch_y, *lpf_info_ptr->mblim, + *lpf_info_ptr->lim, *lpf_info_ptr->hev_thr); + if (src_u) { + mbloop_filter_vertical_edge_uv_lsx(src_u, src_v, pitch_u_v, + *lpf_info_ptr->mblim, *lpf_info_ptr->lim, + *lpf_info_ptr->hev_thr); + } +} diff --git a/vp8/common/loongarch/sixtap_filter_lsx.c b/vp8/common/loongarch/sixtap_filter_lsx.c new file mode 100644 index 0000000000..75fe533d98 --- /dev/null +++ 
b/vp8/common/loongarch/sixtap_filter_lsx.c @@ -0,0 +1,1164 @@ +/* + * Copyright (c) 2021 Loongson Technology Corporation Limited + * Contributed by Lu Wang + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vp8/common/filter.h" +#include "vpx_ports/mem.h" +#include "vpx_util/loongson_intrinsics.h" + +DECLARE_ALIGNED(16, static const int8_t, vp8_subpel_filters_lsx[7][8]) = { + { 0, -6, 123, 12, -1, 0, 0, 0 }, + { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */ + { 0, -9, 93, 50, -6, 0, 0, 0 }, + { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */ + { 0, -6, 50, 93, -9, 0, 0, 0 }, + { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */ + { 0, -1, 12, 123, -6, 0, 0, 0 }, +}; + +static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + +#define DPADD_H3(in0, in1, in2, coeff0, coeff1, coeff2) \ + ({ \ + __m128i out0_m; \ + \ + out0_m = __lsx_vdp2_h_b(in0, coeff0); \ + out0_m = __lsx_vdp2add_h_b(out0_m, in1, coeff1); \ + out0_m = __lsx_vdp2add_h_b(out0_m, in2, coeff2); \ + \ + out0_m; \ + }) + +#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_h0, filt_h1, \ + filt_h2) \ + ({ \ + __m128i vec0_m, vec1_m, vec2_m; \ + __m128i hz_out_m; \ + \ + DUP2_ARG3(__lsx_vshuf_b, src0, src1, mask0, src0, src1, mask1, vec0_m, \ + vec1_m); \ + vec2_m = __lsx_vshuf_b(src0, src1, mask2); \ + hz_out_m = DPADD_H3(vec0_m, vec1_m, vec2_m, filt_h0, filt_h1, filt_h2); \ + \ + hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT); \ + hz_out_m = __lsx_vsat_h(hz_out_m, 7); \ + \ + hz_out_m; \ + }) + +#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, filt0, filt1, filt2, out0, out1, \ + out2, out3) \ + ({ \ + __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + \ + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, vec0_m, \ + vec1_m); \ + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0, vec2_m, \ + vec3_m); \ + DUP4_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, vec2_m, filt0, \ + vec3_m, filt0, out0, out1, out2, out3); \ + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, vec0_m, \ + vec1_m); \ + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1, vec2_m, \ + vec3_m); \ + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, vec4_m, \ + vec5_m); \ + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2, vec6_m, \ + vec7_m); \ + DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec0_m, filt1, out1, vec1_m, filt1, \ + out2, vec2_m, filt1, out3, vec3_m, filt1, out0, out1, out2, \ + out3); \ + DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2, \ + out2, vec6_m, filt2, out3, vec7_m, filt2, out0, out1, out2, \ + out3); \ + }) + +#define FILT_4TAP_DPADD_H(vec0, vec1, filt0, filt1) \ + ({ \ + __m128i tmp0; \ + \ + tmp0 = __lsx_vdp2_h_b(vec0, filt0); \ + tmp0 = __lsx_vdp2add_h_b(tmp0, vec1, filt1); \ + \ + tmp0; \ + }) + +#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1) \ + ({ 
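/* 4-tap analogue of HORIZ_6TAP_FILT above: two shuffled source windows instead of three */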
\ + __m128i vec0_m, vec1_m; \ + __m128i hz_out_m; \ + \ + DUP2_ARG3(__lsx_vshuf_b, src0, src1, mask0, src0, src1, mask1, vec0_m, \ + vec1_m); \ + hz_out_m = FILT_4TAP_DPADD_H(vec0_m, vec1_m, filt_h0, filt_h1); \ + hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT); \ + hz_out_m = __lsx_vsat_h(hz_out_m, 7); \ + \ + hz_out_m; \ + }) + +#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + filt0, filt1, out0, out1, out2, out3) \ + ({ \ + __m128i vec0_m, vec1_m, vec2_m, vec3_m; \ + \ + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, vec0_m, \ + vec1_m); \ + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0, vec2_m, \ + vec3_m); \ + DUP4_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, vec2_m, filt0, \ + vec3_m, filt0, out0, out1, out2, out3); \ + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, vec0_m, \ + vec1_m); \ + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1, vec2_m, \ + vec3_m); \ + DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec0_m, filt1, out1, vec1_m, filt1, \ + out2, vec2_m, filt1, out3, vec3_m, filt1, out0, out1, out2, \ + out3); \ + }) + +static void common_hz_6t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, filt0, filt1, filt2; + __m128i mask0, mask1, mask2, tmp0, tmp1; + __m128i filt, out0, out1, out2, out3; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= 2; + + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + filt2 = __lsx_vreplvei_h(filt, 2); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src += src_stride_x4; + HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, + filt1, filt2, out0, out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2, + VP8_FILTER_SHIFT, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vstelm_d(tmp0, dst, 0, 0); + __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + + for (loop_cnt = (height >> 2) - 1; loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src += src_stride_x4; + HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + filt0, filt1, filt2, out0, out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2, + VP8_FILTER_SHIFT, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vstelm_d(tmp0, dst, 0, 0); + __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + } +} + +static void 
common_hz_6t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2; + __m128i mask0, mask1, mask2, out; + __m128i filt, out0, out1, out2, out3, out4, out5, out6, out7; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= 2; + + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + filt2 = __lsx_vreplvei_h(filt, 2); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src2, src4, src6); + src += 8; + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src1, src3, src5, src7); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src6, 128, src7, 128, src4, + src5, src6, src7); + src += src_stride_x4 - 8; + + HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + filt0, filt1, filt2, out0, out1, out2, out3); + HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2, + filt0, filt1, filt2, out4, out5, out6, out7); + DUP4_ARG2(__lsx_vsrari_h, out0, VP8_FILTER_SHIFT, out1, VP8_FILTER_SHIFT, + out2, VP8_FILTER_SHIFT, out3, VP8_FILTER_SHIFT, out0, out1, out2, + out3); + DUP4_ARG2(__lsx_vsrari_h, out4, VP8_FILTER_SHIFT, out5, VP8_FILTER_SHIFT, + out6, VP8_FILTER_SHIFT, out7, VP8_FILTER_SHIFT, out4, out5, out6, + out7); + DUP4_ARG2(__lsx_vsat_h, out0, 7, out1, 7, out2, 7, out3, 7, out0, out1, + out2, out3); + DUP4_ARG2(__lsx_vsat_h, out4, 7, out5, 7, out6, 7, out7, 7, out4, out5, + out6, out7); + out = __lsx_vpickev_b(out1, out0); + out = __lsx_vxori_b(out, 128); + __lsx_vst(out, dst, 0); + out = __lsx_vpickev_b(out3, out2); + out = __lsx_vxori_b(out, 128); + __lsx_vstx(out, dst, dst_stride); + out = __lsx_vpickev_b(out5, out4); + out = __lsx_vxori_b(out, 128); + __lsx_vstx(out, dst, dst_stride_x2); + out = __lsx_vpickev_b(out7, out6); + out = __lsx_vxori_b(out, 128); + __lsx_vstx(out, dst, dst_stride_x3); + dst += dst_stride_x4; + } +} + +static void common_vt_6t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src7, src8, src9, src10; + __m128i src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r; + __m128i src109_r, filt0, filt1, filt2; + __m128i tmp0, tmp1; + __m128i filt, out0_r, out1_r, out2_r, out3_r; + + src -= src_stride_x2; + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + filt2 = __lsx_vreplvei_h(filt, 2); + + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, 
src0, src1, src2, src3); + src += src_stride_x4; + src4 = __lsx_vld(src, 0); + src += src_stride; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src4 = __lsx_vxori_b(src4, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src2, src1, src4, src3, + src10_r, src32_r, src21_r, src43_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src7, src8, src9, src10); + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vilvl_b, src7, src4, src8, src7, src9, src8, src10, src9, + src76_r, src87_r, src98_r, src109_r); + out0_r = DPADD_H3(src10_r, src32_r, src76_r, filt0, filt1, filt2); + out1_r = DPADD_H3(src21_r, src43_r, src87_r, filt0, filt1, filt2); + out2_r = DPADD_H3(src32_r, src76_r, src98_r, filt0, filt1, filt2); + out3_r = DPADD_H3(src43_r, src87_r, src109_r, filt0, filt1, filt2); + DUP2_ARG3(__lsx_vssrarni_b_h, out1_r, out0_r, VP8_FILTER_SHIFT, out3_r, + out2_r, VP8_FILTER_SHIFT, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vstelm_d(tmp0, dst, 0, 0); + __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + + src10_r = src76_r; + src32_r = src98_r; + src21_r = src87_r; + src43_r = src109_r; + src4 = src10; + } +} + +static void common_vt_6t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + __m128i src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; + __m128i src65_l, src87_l, filt0, filt1, filt2; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + + src -= src_stride_x2; + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + filt2 = __lsx_vreplvei_h(filt, 2); + + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src1, src2, src3); + src += src_stride_x4; + src4 = __lsx_vldx(src, 0); + src += src_stride; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src4 = __lsx_vxori_b(src4, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src4, src3, src2, src1, + src10_r, src32_r, src43_r, src21_r); + DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src4, src3, src2, src1, + src10_l, src32_l, src43_l, src21_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src5, src6, src7, src8); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5, + src6, src7, src8); + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, + src54_r, src65_r, src76_r, src87_r); + DUP4_ARG2(__lsx_vilvh_b, src5, src4, src6, src5, src7, src6, src8, 
src7, + src54_l, src65_l, src76_l, src87_l); + out0_r = DPADD_H3(src10_r, src32_r, src54_r, filt0, filt1, filt2); + out1_r = DPADD_H3(src21_r, src43_r, src65_r, filt0, filt1, filt2); + out2_r = DPADD_H3(src32_r, src54_r, src76_r, filt0, filt1, filt2); + out3_r = DPADD_H3(src43_r, src65_r, src87_r, filt0, filt1, filt2); + out0_l = DPADD_H3(src10_l, src32_l, src54_l, filt0, filt1, filt2); + out1_l = DPADD_H3(src21_l, src43_l, src65_l, filt0, filt1, filt2); + out2_l = DPADD_H3(src32_l, src54_l, src76_l, filt0, filt1, filt2); + out3_l = DPADD_H3(src43_l, src65_l, src87_l, filt0, filt1, filt2); + DUP4_ARG3(__lsx_vssrarni_b_h, out0_l, out0_r, VP8_FILTER_SHIFT, out1_l, + out1_r, VP8_FILTER_SHIFT, out2_l, out2_r, VP8_FILTER_SHIFT, + out3_l, out3_r, VP8_FILTER_SHIFT, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp2, 128, tmp3, 128, tmp0, + tmp1, tmp2, tmp3); + __lsx_vstx(tmp0, dst, 0); + __lsx_vstx(tmp1, dst, dst_stride); + __lsx_vstx(tmp2, dst, dst_stride_x2); + __lsx_vstx(tmp3, dst, dst_stride_x3); + dst += dst_stride_x4; + + src10_r = src54_r; + src32_r = src76_r; + src21_r = src65_r; + src43_r = src87_r; + src10_l = src54_l; + src32_l = src76_l; + src21_l = src65_l; + src43_l = src87_l; + src4 = src8; + } +} + +static void common_hv_6ht_6vt_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i filt_hz0, filt_hz1, filt_hz2; + __m128i mask0, mask1, mask2, vec0, vec1; + __m128i filt, filt_vt0, filt_vt1, filt_vt2; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + __m128i hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7; + __m128i tmp0, tmp1, tmp2, tmp3; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= (2 + src_stride_x2); + + filt = __lsx_vld(filter_horiz, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1); + filt_hz2 = __lsx_vreplvei_h(filt, 2); + + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src1, src2, src3); + src += src_stride_x4; + src4 = __lsx_vldx(src, 0); + src += src_stride; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src4 = __lsx_vxori_b(src4, 128); + + hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + filt = __lsx_vld(filter_vert, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1); + filt_vt2 = __lsx_vreplvei_h(filt, 2); + + DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out2, + hz_out1, hz_out4, hz_out3, out0, out1, out3, out4); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + 
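/* each pass filters four fresh rows horizontally with the 6-tap kernel, then applies the vertical 6-tap over the sliding window of horizontal outputs */ +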
DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src5, src6, src7, src8); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5, + src6, src7, src8); + hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + out2 = __lsx_vpackev_b(hz_out5, hz_out4); + tmp0 = DPADD_H3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + + hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + out5 = __lsx_vpackev_b(hz_out6, hz_out5); + tmp1 = DPADD_H3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); + + hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + out7 = __lsx_vpackev_b(hz_out7, hz_out6); + tmp2 = DPADD_H3(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2); + + hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + out6 = __lsx_vpackev_b(hz_out8, hz_out7); + tmp3 = DPADD_H3(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2); + + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, VP8_FILTER_SHIFT, tmp3, tmp2, + VP8_FILTER_SHIFT, vec0, vec1); + DUP2_ARG2(__lsx_vxori_b, vec0, 128, vec1, 128, vec0, vec1); + + __lsx_vstelm_d(vec0, dst, 0, 0); + __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(vec1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(vec1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + + hz_out4 = hz_out8; + out0 = out2; + out1 = out7; + out3 = out5; + out4 = out6; + } +} + +static void common_hv_6ht_6vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + common_hv_6ht_6vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + common_hv_6ht_6vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride, + filter_horiz, filter_vert, height); +} + +static void common_hz_4t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, filt0, filt1, mask0, mask1; + __m128i tmp0, tmp1; + __m128i filt, out0, out1, out2, out3; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= 1; + + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src1, src2, src3); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, + filt1, out0, out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2, + VP8_FILTER_SHIFT, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vstelm_d(tmp0, dst, 0, 0); + __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + } +} + +static void 
common_hz_4t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i filt0, filt1, mask0, mask1; + __m128i filt, out0, out1, out2, out3, out4, out5, out6, out7; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= 1; + + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src2, src4, src6); + src += 8; + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src1, src3, src5, src7); + src += src_stride_x4 - 8; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src6, 128, src7, 128, src4, + src5, src6, src7); + HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, + filt1, out0, out1, out2, out3); + HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0, + filt1, out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2, + VP8_FILTER_SHIFT, out5, out4, VP8_FILTER_SHIFT, out7, out6, + VP8_FILTER_SHIFT, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out2, 128, out3, 128, out0, + out1, out2, out3); + __lsx_vstx(out0, dst, 0); + __lsx_vstx(out1, dst, dst_stride); + __lsx_vstx(out2, dst, dst_stride_x2); + __lsx_vstx(out3, dst, dst_stride_x3); + dst += dst_stride_x4; + } +} + +static void common_vt_4t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src7, src8, src9, src10; + __m128i src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1; + __m128i tmp0, tmp1; + __m128i filt, out0_r, out1_r, out2_r, out3_r; + + src -= src_stride; + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + + DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1); + src2 = __lsx_vldx(src, src_stride_x2); + src += src_stride_x3; + + DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); + src2 = __lsx_vxori_b(src2, 128); + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src7, src8, src9, src10); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + DUP4_ARG2(__lsx_vilvl_b, src7, src2, src8, src7, src9, src8, src10, src9, + src72_r, src87_r, src98_r, src109_r); + out0_r = FILT_4TAP_DPADD_H(src10_r, src72_r, filt0, filt1); + 
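/* out1_r..out3_r slide the same 4-tap window down one row each */ +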
out1_r = FILT_4TAP_DPADD_H(src21_r, src87_r, filt0, filt1); + out2_r = FILT_4TAP_DPADD_H(src72_r, src98_r, filt0, filt1); + out3_r = FILT_4TAP_DPADD_H(src87_r, src109_r, filt0, filt1); + DUP2_ARG3(__lsx_vssrarni_b_h, out1_r, out0_r, VP8_FILTER_SHIFT, out3_r, + out2_r, VP8_FILTER_SHIFT, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vstelm_d(tmp0, dst, 0, 0); + __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + + src10_r = src98_r; + src21_r = src109_r; + src2 = src10; + } +} + +static void common_vt_4t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src5, src6; + __m128i src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l; + __m128i src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + + src -= src_stride; + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + + DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1); + src2 = __lsx_vldx(src, src_stride_x2); + src += src_stride_x3; + + DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); + src2 = __lsx_vxori_b(src2, 128); + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src3, src4, src5, src6); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3, + src4, src5, src6); + DUP4_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src5, src4, src6, src5, + src32_r, src43_r, src54_r, src65_r); + DUP4_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src5, src4, src6, src5, + src32_l, src43_l, src54_l, src65_l); + out0_r = FILT_4TAP_DPADD_H(src10_r, src32_r, filt0, filt1); + out1_r = FILT_4TAP_DPADD_H(src21_r, src43_r, filt0, filt1); + out2_r = FILT_4TAP_DPADD_H(src32_r, src54_r, filt0, filt1); + out3_r = FILT_4TAP_DPADD_H(src43_r, src65_r, filt0, filt1); + out0_l = FILT_4TAP_DPADD_H(src10_l, src32_l, filt0, filt1); + out1_l = FILT_4TAP_DPADD_H(src21_l, src43_l, filt0, filt1); + out2_l = FILT_4TAP_DPADD_H(src32_l, src54_l, filt0, filt1); + out3_l = FILT_4TAP_DPADD_H(src43_l, src65_l, filt0, filt1); + DUP4_ARG3(__lsx_vssrarni_b_h, out0_l, out0_r, VP8_FILTER_SHIFT, out1_l, + out1_r, VP8_FILTER_SHIFT, out2_l, out2_r, VP8_FILTER_SHIFT, + out3_l, out3_r, VP8_FILTER_SHIFT, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp2, 128, tmp3, 128, tmp0, + tmp1, tmp2, tmp3); + __lsx_vstx(tmp0, dst, 0); + __lsx_vstx(tmp1, dst, dst_stride); + __lsx_vstx(tmp2, dst, dst_stride_x2); + __lsx_vstx(tmp3, dst, dst_stride_x3); + dst += dst_stride_x4; + + src10_r = src54_r; + src21_r = src65_r; + src10_l = src54_l; + src21_l = src65_l; + src2 = src6; + } +} + +static inline void common_hv_4ht_4vt_8w_lsx( + uint8_t *RESTRICT 
src, int32_t src_stride, uint8_t *RESTRICT dst, + int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1; + __m128i mask0, mask1, out0, out1; + __m128i filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3; + __m128i hz_out0, hz_out1, hz_out2, hz_out3; + __m128i vec0, vec1, vec2, vec3, vec4; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= 1 + src_stride; + + filt = __lsx_vld(filter_horiz, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1); + src2 = __lsx_vldx(src, src_stride_x2); + src += src_stride_x3; + + DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); + src2 = __lsx_vxori_b(src2, 128); + hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2); + + filt = __lsx_vld(filter_vert, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src3, src4, src5, src6); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3, + src4, src5, src6); + hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1); + vec1 = __lsx_vpackev_b(hz_out3, hz_out2); + tmp0 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1); + + hz_out0 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1); + vec3 = __lsx_vpackev_b(hz_out0, hz_out3); + tmp1 = FILT_4TAP_DPADD_H(vec2, vec3, filt_vt0, filt_vt1); + + hz_out1 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1); + vec4 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp2 = FILT_4TAP_DPADD_H(vec1, vec4, filt_vt0, filt_vt1); + + hz_out2 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec0, vec1); + tmp3 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1); + + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(out1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + + vec0 = vec4; + vec2 = vec1; + } +} + +static void common_hv_4ht_4vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + common_hv_4ht_4vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + common_hv_4ht_4vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride, + filter_horiz, filter_vert, height); +} + +static inline void common_hv_6ht_4vt_8w_lsx( + uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, + int32_t dst_stride, const 
int8_t *filter_horiz, const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + + __m128i src0, src1, src2, src3, src4, src5, src6; + __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2; + __m128i filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3; + __m128i tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3; + __m128i out0, out1; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= (2 + src_stride); + + filt = __lsx_vld(filter_horiz, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1); + filt_hz2 = __lsx_vreplvei_h(filt, 2); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + + DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1); + src2 = __lsx_vldx(src, src_stride_x2); + src += src_stride_x3; + + DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); + src2 = __lsx_vxori_b(src2, 128); + hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2); + + filt = __lsx_vld(filter_vert, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src3, src4, src5, src6); + src += src_stride_x4; + DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3, + src4, src5, src6); + + hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + vec1 = __lsx_vpackev_b(hz_out3, hz_out2); + tmp0 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1); + + hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + vec3 = __lsx_vpackev_b(hz_out0, hz_out3); + tmp1 = FILT_4TAP_DPADD_H(vec2, vec3, filt_vt0, filt_vt1); + + hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp2 = FILT_4TAP_DPADD_H(vec1, vec0, filt_vt0, filt_vt1); + + hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec1, vec2); + tmp3 = FILT_4TAP_DPADD_H(vec1, vec2, filt_vt0, filt_vt1); + + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(out1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + } +} + +static void common_hv_6ht_4vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + common_hv_6ht_4vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + common_hv_6ht_4vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride, + filter_horiz, filter_vert, height); +} + +static inline void 
common_hv_4ht_6vt_8w_lsx( + uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, + int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i filt_hz0, filt_hz1, mask0, mask1; + __m128i filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + __m128i hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7; + __m128i vec0, vec1; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= 1 + src_stride_x2; + + filt = __lsx_vld(filter_horiz, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src1, src2, src3); + src += src_stride_x4; + src4 = __lsx_vld(src, 0); + src += src_stride; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src4 = __lsx_vxori_b(src4, 128); + hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1); + hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1); + hz_out4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1); + DUP2_ARG2(__lsx_vpackev_b, hz_out2, hz_out1, hz_out4, hz_out3, out3, out4); + + filt = __lsx_vld(filter_vert, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1); + filt_vt2 = __lsx_vreplvei_h(filt, 2); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src5, src6, src7, src8); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5, + src6, src7, src8); + hz_out5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1); + out2 = __lsx_vpackev_b(hz_out5, hz_out4); + tmp0 = DPADD_H3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + + hz_out6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1); + out5 = __lsx_vpackev_b(hz_out6, hz_out5); + tmp1 = DPADD_H3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); + + hz_out7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1); + out6 = __lsx_vpackev_b(hz_out7, hz_out6); + tmp2 = DPADD_H3(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2); + + hz_out8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1); + out7 = __lsx_vpackev_b(hz_out8, hz_out7); + tmp3 = DPADD_H3(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, vec0, vec1); + DUP2_ARG2(__lsx_vxori_b, vec0, 128, vec1, 128, vec0, vec1); + __lsx_vstelm_d(vec0, dst, 0, 0); + __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(vec1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(vec1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + hz_out4 = hz_out8; + out0 = out2; + out1 = out6; + out3 = out5; + out4 = out7; + } +} + +static void 
common_hv_4ht_6vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + common_hv_4ht_6vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + common_hv_4ht_6vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride, + filter_horiz, filter_vert, height); +} + +typedef void (*PVp8SixtapPredictFunc1)( + uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, + int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, + int32_t height); + +typedef void (*PVp8SixtapPredictFunc2)(uint8_t *RESTRICT src, + int32_t src_stride, + uint8_t *RESTRICT dst, + int32_t dst_stride, const int8_t *filter, + int32_t height); + +void vp8_sixtap_predict8x8_lsx(uint8_t *RESTRICT src, int32_t src_stride, + int32_t xoffset, int32_t yoffset, + uint8_t *RESTRICT dst, int32_t dst_stride) { + const int8_t *h_filter = vp8_subpel_filters_lsx[xoffset - 1]; + const int8_t *v_filter = vp8_subpel_filters_lsx[yoffset - 1]; + + static PVp8SixtapPredictFunc1 Predict8x8Funcs1[4] = { + common_hv_6ht_6vt_8w_lsx, + common_hv_6ht_4vt_8w_lsx, + common_hv_4ht_6vt_8w_lsx, + common_hv_4ht_4vt_8w_lsx, + }; + + static PVp8SixtapPredictFunc2 Predict8x8Funcs2[4] = { common_vt_6t_8w_lsx, + common_vt_4t_8w_lsx, + common_hz_6t_8w_lsx, + common_hz_4t_8w_lsx }; + + if (yoffset < 8 && xoffset < 8) { + if (yoffset) { + if (xoffset) { + switch (xoffset & 1) { + case 0: + switch (yoffset & 1) { + case 0: + Predict8x8Funcs1[0](src, src_stride, dst, dst_stride, h_filter, + v_filter, 8); + break; + + case 1: + Predict8x8Funcs1[1](src, src_stride, dst, dst_stride, h_filter, + v_filter + 1, 8); + break; + } + break; + + case 1: + switch (yoffset & 1) { + case 0: + Predict8x8Funcs1[2](src, src_stride, dst, dst_stride, + h_filter + 1, v_filter, 8); + break; + + case 1: + Predict8x8Funcs1[3](src, src_stride, dst, dst_stride, + h_filter + 1, v_filter + 1, 8); + break; + } + break; + } + } else { + switch (yoffset & 1) { + case 0: + Predict8x8Funcs2[0](src, src_stride, dst, dst_stride, v_filter, 8); + break; + + case 1: + Predict8x8Funcs2[1](src, src_stride, dst, dst_stride, v_filter + 1, + 8); + break; + } + } + } else { + switch (xoffset & 1) { + case 1: + Predict8x8Funcs2[3](src, src_stride, dst, dst_stride, h_filter + 1, + 8); + break; + } + switch (xoffset) { + case 0: vp8_copy_mem8x8(src, src_stride, dst, dst_stride); break; + case 2: + case 4: + case 6: + Predict8x8Funcs2[2](src, src_stride, dst, dst_stride, h_filter, 8); + break; + } + } + } +} + +void vp8_sixtap_predict16x16_lsx(uint8_t *RESTRICT src, int32_t src_stride, + int32_t xoffset, int32_t yoffset, + uint8_t *RESTRICT dst, int32_t dst_stride) { + const int8_t *h_filter = vp8_subpel_filters_lsx[xoffset - 1]; + const int8_t *v_filter = vp8_subpel_filters_lsx[yoffset - 1]; + + static PVp8SixtapPredictFunc1 Predict16x16Funcs1[4] = { + common_hv_6ht_6vt_16w_lsx, + common_hv_6ht_4vt_16w_lsx, + common_hv_4ht_6vt_16w_lsx, + common_hv_4ht_4vt_16w_lsx, + }; + + static PVp8SixtapPredictFunc2 Predict16x16Funcs2[4] = { + common_vt_6t_16w_lsx, common_vt_4t_16w_lsx, common_hz_6t_16w_lsx, + common_hz_4t_16w_lsx + }; + + if (yoffset < 8 && xoffset < 8) { + if (yoffset) { + if (xoffset) { + switch (xoffset & 1) { + case 0: + switch (yoffset & 1) { + case 0: + Predict16x16Funcs1[0](src, src_stride, dst, dst_stride, + h_filter, v_filter, 16); + break; + + case 1: + Predict16x16Funcs1[1](src, src_stride, dst, dst_stride, + h_filter, v_filter + 
1, 16);
+              break;
+          }
+          break;
+
+        case 1:
+          switch (yoffset & 1) {
+            case 0:
+              Predict16x16Funcs1[2](src, src_stride, dst, dst_stride,
+                                    h_filter + 1, v_filter, 16);
+              break;
+
+            case 1:
+              Predict16x16Funcs1[3](src, src_stride, dst, dst_stride,
+                                    h_filter + 1, v_filter + 1, 16);
+              break;
+          }
+          break;
+      }
+    } else {
+      switch (yoffset & 1) {
+        case 0:
+          Predict16x16Funcs2[0](src, src_stride, dst, dst_stride, v_filter,
+                                16);
+          break;
+
+        case 1:
+          Predict16x16Funcs2[1](src, src_stride, dst, dst_stride,
+                                v_filter + 1, 16);
+          break;
+      }
+    }
+  } else {
+    switch (xoffset & 1) {
+      case 1:
+        Predict16x16Funcs2[3](src, src_stride, dst, dst_stride, h_filter + 1,
+                              16);
+        break;
+    }
+    switch (xoffset) {
+      case 0: vp8_copy_mem16x16(src, src_stride, dst, dst_stride); break;
+      case 2:
+      case 4:
+      case 6:
+        Predict16x16Funcs2[2](src, src_stride, dst, dst_stride, h_filter, 16);
+        break;
+    }
+  }
+}
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index 8452b5e854..40117e3677 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -47,13 +47,13 @@ ()
 # Loopfilter
 #
 add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_mbv sse2 neon dspr2 msa mmi/;
+specialize qw/vp8_loop_filter_mbv sse2 neon dspr2 msa mmi lsx/;

 add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi";
 specialize qw/vp8_loop_filter_bv sse2 neon dspr2 msa mmi/;

 add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_mbh sse2 neon dspr2 msa mmi/;
+specialize qw/vp8_loop_filter_mbh sse2 neon dspr2 msa mmi lsx/;

 add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi";
 specialize qw/vp8_loop_filter_bh sse2 neon dspr2 msa mmi/;
@@ -146,10 +146,10 @@ ()
 # Subpixel
 #
 add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch";
-specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 neon dspr2 msa mmi/;
+specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 neon dspr2 msa mmi lsx/;

 add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch";
-specialize qw/vp8_sixtap_predict8x8 sse2 ssse3 neon dspr2 msa mmi/;
+specialize qw/vp8_sixtap_predict8x8 sse2 ssse3 neon dspr2 msa mmi lsx/;

 add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch";
 specialize qw/vp8_sixtap_predict8x4 sse2 ssse3 neon dspr2 msa mmi/;
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 286a93a056..909924ce8d 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -124,6 +124,10 @@ ifeq ($(CONFIG_POSTPROC),yes)
 VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c
 endif

+# common (loongarch LSX intrinsics)
+VP8_COMMON_SRCS-$(HAVE_LSX) += common/loongarch/loopfilter_filters_lsx.c
+VP8_COMMON_SRCS-$(HAVE_LSX) += common/loongarch/sixtap_filter_lsx.c
+
 # common (neon intrinsics)
 VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/loopfilter_arm.c
 VP8_COMMON_SRCS-$(HAVE_NEON) += 
common/arm/loopfilter_arm.h diff --git a/vpx_dsp/loongarch/intrapred_lsx.c b/vpx_dsp/loongarch/intrapred_lsx.c new file mode 100644 index 0000000000..f990211791 --- /dev/null +++ b/vpx_dsp/loongarch/intrapred_lsx.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2021 Loongson Technology Corporation Limited + * Contributed by Lu Wang + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +static inline void intra_predict_dc_8x8_lsx(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t dst_stride) { + uint64_t val0, val1; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i store, sum_h, sum_w, sum_d; + __m128i src = { 0 }; + + val0 = *(const uint64_t *)src_top; + val1 = *(const uint64_t *)src_left; + DUP2_ARG3(__lsx_vinsgr2vr_d, src, val0, 0, src, val1, 1, src, src); + sum_h = __lsx_vhaddw_hu_bu(src, src); + sum_w = __lsx_vhaddw_wu_hu(sum_h, sum_h); + sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w); + sum_w = __lsx_vpickev_w(sum_d, sum_d); + sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w); + sum_w = __lsx_vsrari_w(sum_d, 4); + store = __lsx_vreplvei_b(sum_w, 0); + + __lsx_vstelm_d(store, dst, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride_x3, 0, 0); + dst += dst_stride_x4; + __lsx_vstelm_d(store, dst, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride_x3, 0, 0); +} + +static inline void intra_predict_dc_16x16_lsx(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, + int32_t dst_stride) { + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i top, left, out; + __m128i sum_h, sum_top, sum_left; + __m128i sum_w; + __m128i sum_d; + + DUP2_ARG2(__lsx_vld, src_top, 0, src_left, 0, top, left); + DUP2_ARG2(__lsx_vhaddw_hu_bu, top, top, left, left, sum_top, sum_left); + sum_h = __lsx_vadd_h(sum_top, sum_left); + sum_w = __lsx_vhaddw_wu_hu(sum_h, sum_h); + sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w); + sum_w = __lsx_vpickev_w(sum_d, sum_d); + sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w); + sum_w = __lsx_vsrari_w(sum_d, 5); + out = __lsx_vreplvei_b(sum_w, 0); + + __lsx_vstx(out, dst, 0); + __lsx_vstx(out, dst, dst_stride); + __lsx_vstx(out, dst, dst_stride_x2); + __lsx_vstx(out, dst, dst_stride_x3); + dst += dst_stride_x4; + __lsx_vstx(out, dst, 0); + __lsx_vstx(out, dst, dst_stride); + __lsx_vstx(out, dst, dst_stride_x2); + __lsx_vstx(out, dst, dst_stride_x3); + dst += dst_stride_x4; + __lsx_vstx(out, dst, 0); + __lsx_vstx(out, dst, dst_stride); + __lsx_vstx(out, dst, dst_stride_x2); + __lsx_vstx(out, dst, dst_stride_x3); + dst += dst_stride_x4; + __lsx_vstx(out, dst, 0); + __lsx_vstx(out, dst, dst_stride); + __lsx_vstx(out, dst, dst_stride_x2); + __lsx_vstx(out, dst, dst_stride_x3); +} + +void vpx_dc_predictor_8x8_lsx(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_8x8_lsx(above, left, dst, y_stride); +} 
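+
+/* Note: the vhaddw reduction above is the vector form of the scalar DC
+ * rule dc = (sum(above[0..7]) + sum(left[0..7]) + 8) >> 4; __lsx_vsrari_w
+ * performs the rounded shift and __lsx_vreplvei_b broadcasts the result
+ * byte before the eight 8-byte row stores. The 16x16 variant below sums
+ * 32 samples and shifts by 5. */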
+ +void vpx_dc_predictor_16x16_lsx(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_16x16_lsx(above, left, dst, y_stride); +} diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 01653102ef..c948e12a39 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -74,6 +74,7 @@ endif # CONFIG_POSTPROC DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM) DSP_SRCS-$(HAVE_NEON) += arm/intrapred_neon.c DSP_SRCS-$(HAVE_MSA) += mips/intrapred_msa.c +DSP_SRCS-$(HAVE_LSX) += loongarch/intrapred_lsx.c DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred4_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred8_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred16_dspr2.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 0144b90c26..c721e190b0 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -117,7 +117,7 @@ () add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; # TODO(crbug.com/webm/1522): Re-enable vsx implementation. -specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa sse2/; +specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa sse2 lsx/; add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_top_predictor_8x8 neon msa sse2/; @@ -155,7 +155,7 @@ () specialize qw/vpx_tm_predictor_16x16 neon msa sse2 vsx/; add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_dc_predictor_16x16 dspr2 neon msa sse2 vsx/; +specialize qw/vpx_dc_predictor_16x16 dspr2 neon msa sse2 vsx lsx/; add_proto qw/void vpx_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_top_predictor_16x16 neon msa sse2 vsx/; From 232ad814de14e5941857c62a023b6fd66e967bb6 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Tue, 8 Feb 2022 12:36:39 -0800 Subject: [PATCH 211/926] rtc-vp9: Fix intra-only for bypass mode Allow intra-only frame in svc to also work in bypass (flexible-svc) mode. Added unittest for the flexible svc case. And fix the gld_fb_idx for (SL0, TL1) in bypass/flexible mode pattern in the sample encoder: force it to be 0 (same as lst_fb_idx), since the slot is unused on SL0. Change-Id: Iada9d1b052e470a0d5d25220809ad0c87cd46268 --- examples/vp9_spatial_svc_encoder.c | 3 +- test/svc_end_to_end_test.cc | 148 +++++++++++++++++++++++++++-- vp9/encoder/vp9_ratectrl.c | 16 +++- vp9/encoder/vp9_svc_layercontext.c | 6 ++ 4 files changed, 163 insertions(+), 10 deletions(-) diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c index c37e608d17..455f6c9036 100644 --- a/examples/vp9_spatial_svc_encoder.c +++ b/examples/vp9_spatial_svc_encoder.c @@ -579,7 +579,8 @@ static void set_frame_flags_bypass_mode_ex0( ref_frame_config->alt_fb_idx[sl] = 0; } else if (tl == 1) { ref_frame_config->lst_fb_idx[sl] = sl; - ref_frame_config->gld_fb_idx[sl] = num_spatial_layers + sl - 1; + ref_frame_config->gld_fb_idx[sl] = + (sl == 0) ? 0 : num_spatial_layers + sl - 1; ref_frame_config->alt_fb_idx[sl] = num_spatial_layers + sl; } // Set the reference and update flags. 
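For reference, the buffer-slot layout that the hunk above produces for the
0101 bypass pattern can be sketched as follows. This is an illustrative
sketch only, not part of the change: print_bypass_slots is a hypothetical
helper, and the TL0 assignments are taken from the matching test helper
added to test/svc_end_to_end_test.cc below.

#include <stdio.h>

/* Hypothetical helper (not in the patch): mirrors the lst/gld/alt buffer
 * assignment used by the 0101 bypass pattern for non-key inter frames. */
static void print_bypass_slots(int num_spatial_layers, int tl) {
  int sl;
  for (sl = 0; sl < num_spatial_layers; ++sl) {
    int lst = sl, gld, alt;
    if (tl == 0) {
      gld = sl ? sl - 1 : 0;
      alt = 0;
    } else {
      /* The fix above: on SL0 the golden slot is forced to 0 (same as
       * lst_fb_idx) instead of num_spatial_layers - 1, a slot SL0 never
       * references. */
      gld = (sl == 0) ? 0 : num_spatial_layers + sl - 1;
      alt = num_spatial_layers + sl;
    }
    printf("tl=%d sl=%d: lst=%d gld=%d alt=%d\n", tl, sl, lst, gld, alt);
  }
}

int main(void) {
  print_bypass_slots(3, 0); /* TL0 frames */
  print_bypass_slots(3, 1); /* TL1 frames */
  return 0;
}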
diff --git a/test/svc_end_to_end_test.cc b/test/svc_end_to_end_test.cc
index e59e337f1b..c0556d8b7d 100644
--- a/test/svc_end_to_end_test.cc
+++ b/test/svc_end_to_end_test.cc
@@ -15,6 +15,7 @@
 #include "test/svc_test.h"
 #include "test/util.h"
 #include "test/y4m_video_source.h"
+#include "vp9/common/vp9_onyxc_int.h"
 #include "vpx/vpx_codec.h"
 #include "vpx_ports/bitops.h"

@@ -139,6 +140,91 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc,
     return current_video_frame_ >= frame_to_start_decode_;
   }

+  // Example pattern for spatial layers and 2 temporal layers used in the
+  // bypass/flexible mode. The pattern corresponds to the pattern
+  // VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in
+  // non-flexible mode.
+  void set_frame_flags_bypass_mode(
+      int tl, int num_spatial_layers, int is_key_frame,
+      vpx_svc_ref_frame_config_t *ref_frame_config) {
+    int sl;
+    for (sl = 0; sl < num_spatial_layers; ++sl)
+      ref_frame_config->update_buffer_slot[sl] = 0;
+
+    for (sl = 0; sl < num_spatial_layers; ++sl) {
+      // Set the buffer idx.
+      if (tl == 0) {
+        ref_frame_config->lst_fb_idx[sl] = sl;
+        if (sl) {
+          if (is_key_frame) {
+            ref_frame_config->lst_fb_idx[sl] = sl - 1;
+            ref_frame_config->gld_fb_idx[sl] = sl;
+          } else {
+            ref_frame_config->gld_fb_idx[sl] = sl - 1;
+          }
+        } else {
+          ref_frame_config->gld_fb_idx[sl] = 0;
+        }
+        ref_frame_config->alt_fb_idx[sl] = 0;
+      } else if (tl == 1) {
+        ref_frame_config->lst_fb_idx[sl] = sl;
+        ref_frame_config->gld_fb_idx[sl] =
+            (sl == 0) ? 0 : num_spatial_layers + sl - 1;
+        ref_frame_config->alt_fb_idx[sl] = num_spatial_layers + sl;
+      }
+      // Set the reference and update flags.
+      if (!tl) {
+        if (!sl) {
+          // Base spatial and base temporal (sl = 0, tl = 0)
+          ref_frame_config->reference_last[sl] = 1;
+          ref_frame_config->reference_golden[sl] = 0;
+          ref_frame_config->reference_alt_ref[sl] = 0;
+          ref_frame_config->update_buffer_slot[sl] |=
+              1 << ref_frame_config->lst_fb_idx[sl];
+        } else {
+          if (is_key_frame) {
+            ref_frame_config->reference_last[sl] = 1;
+            ref_frame_config->reference_golden[sl] = 0;
+            ref_frame_config->reference_alt_ref[sl] = 0;
+            ref_frame_config->update_buffer_slot[sl] |=
+                1 << ref_frame_config->gld_fb_idx[sl];
+          } else {
+            // Non-zero spatial layer.
+            ref_frame_config->reference_last[sl] = 1;
+            ref_frame_config->reference_golden[sl] = 1;
+            ref_frame_config->reference_alt_ref[sl] = 1;
+            ref_frame_config->update_buffer_slot[sl] |=
+                1 << ref_frame_config->lst_fb_idx[sl];
+          }
+        }
+      } else if (tl == 1) {
+        if (!sl) {
+          // Base spatial and top temporal (tl = 1)
+          ref_frame_config->reference_last[sl] = 1;
+          ref_frame_config->reference_golden[sl] = 0;
+          ref_frame_config->reference_alt_ref[sl] = 0;
+          ref_frame_config->update_buffer_slot[sl] |=
+              1 << ref_frame_config->alt_fb_idx[sl];
+        } else {
+          // Non-zero spatial.
+          if (sl < num_spatial_layers - 1) {
+            ref_frame_config->reference_last[sl] = 1;
+            ref_frame_config->reference_golden[sl] = 1;
+            ref_frame_config->reference_alt_ref[sl] = 0;
+            ref_frame_config->update_buffer_slot[sl] |=
+                1 << ref_frame_config->alt_fb_idx[sl];
+          } else if (sl == num_spatial_layers - 1) {
+            // Top spatial and top temporal (non-reference -- doesn't
+            // update any reference buffers).
+ ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 1; + ref_frame_config->reference_alt_ref[sl] = 0; + } + } + } + } + } + virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { current_video_frame_ = video->frame(); @@ -158,6 +244,20 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, encoder->Control(VP9E_SET_DISABLE_LOOPFILTER, loopfilter_off_); } + if (flexible_mode_) { + vpx_svc_layer_id_t layer_id; + layer_id.spatial_layer_id = 0; + layer_id.temporal_layer_id = (video->frame() % 2 != 0); + temporal_layer_id_ = layer_id.temporal_layer_id; + for (int i = 0; i < number_spatial_layers_; i++) { + layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_; + ref_frame_config.duration[i] = 1; + } + encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); + set_frame_flags_bypass_mode(layer_id.temporal_layer_id, + number_spatial_layers_, 0, &ref_frame_config); + encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config); + } if (video->frame() == frame_to_sync_) { encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync_); } @@ -226,6 +326,8 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, vpx_svc_spatial_layer_sync_t svc_layer_sync_; unsigned int mismatch_nframes_; unsigned int num_nonref_frames_; + bool flexible_mode_; + vpx_svc_ref_frame_config_t ref_frame_config; private: virtual void SetConfig(const int num_temporal_layer) { @@ -275,6 +377,7 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLFullSync) { ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); cfg_.rc_target_bitrate = 600; + flexible_mode_ = false; AssignLayerBitrates(); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); #if CONFIG_VP9_DECODER @@ -302,6 +405,7 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc2SL3TLSyncToVGA) { ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, 0, 400); cfg_.rc_target_bitrate = 400; + flexible_mode_ = false; AssignLayerBitrates(); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); #if CONFIG_VP9_DECODER @@ -329,6 +433,7 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncToHD) { ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); cfg_.rc_target_bitrate = 600; + flexible_mode_ = false; AssignLayerBitrates(); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); #if CONFIG_VP9_DECODER @@ -356,6 +461,7 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncToVGAHD) { ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); cfg_.rc_target_bitrate = 600; + flexible_mode_ = false; AssignLayerBitrates(); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); #if CONFIG_VP9_DECODER @@ -385,6 +491,7 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc2SL3TLSyncFrameVGADenoise) { ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, 0, 400); cfg_.rc_target_bitrate = 400; + flexible_mode_ = false; AssignLayerBitrates(); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); #if CONFIG_VP9_DECODER @@ -395,6 +502,34 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc2SL3TLSyncFrameVGADenoise) { } #endif +// Encode 3 spatial, 2 temporal layer in flexible mode but don't +// start decoding. During the sequence insert intra-only on base/qvga +// layer at frame 20 and start decoding only QVGA layer from there. 
+TEST_P(SyncFrameOnePassCbrSvc, + OnePassCbrSvc3SL3TLSyncFrameStartDecodeOnIntraOnlyQVGAFlex) { + SetSvcConfig(3, 2); + frame_to_start_decode_ = 20; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = 2; + decode_to_layer_after_sync_ = 0; + intra_only_test_ = true; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 1; + svc_layer_sync_.spatial_layer_sync[0] = 1; + svc_layer_sync_.spatial_layer_sync[1] = 0; + svc_layer_sync_.spatial_layer_sync[2] = 0; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + cfg_.rc_target_bitrate = 600; + flexible_mode_ = true; + AssignLayerBitrates(); + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Can't check mismatch here because only base is decoded at + // frame sync, whereas encoder continues encoding all layers. +} + // Encode 3 spatial, 3 temporal layer but don't start decoding. // During the sequence insert intra-only on base/qvga layer at frame 20 // and start decoding only QVGA layer from there. @@ -415,15 +550,11 @@ TEST_P(SyncFrameOnePassCbrSvc, ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); cfg_.rc_target_bitrate = 600; + flexible_mode_ = false; AssignLayerBitrates(); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); -#if CONFIG_VP9_DECODER - // The non-reference frames are expected to be mismatched frames as the - // encoder will avoid loopfilter on these frames. - if (0 && decode_to_layer_before_sync_ == decode_to_layer_after_sync_) { - EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); - } -#endif + // Can't check mismatch here because only base is decoded at + // frame sync, whereas encoder continues encoding all layers. } // Start decoding from beginning of sequence, during sequence insert intra-only @@ -447,6 +578,7 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncFrameIntraOnlyQVGA) { ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); cfg_.rc_target_bitrate = 600; + flexible_mode_ = false; AssignLayerBitrates(); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); #if CONFIG_VP9_DECODER @@ -477,6 +609,7 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncFrameIntraOnlyVGA) { ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); cfg_.rc_target_bitrate = 600; + flexible_mode_ = false; AssignLayerBitrates(); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); #if CONFIG_VP9_DECODER @@ -502,6 +635,7 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc1SL3TLSyncFrameIntraOnlyQVGA) { ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); cfg_.rc_target_bitrate = 600; + flexible_mode_ = false; AssignLayerBitrates(); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); #if CONFIG_VP9_DECODER diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index ac346115fb..0852973914 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -2214,7 +2214,6 @@ static void set_intra_only_frame(VP9_COMP *cpi) { // only 3 reference buffers can be updated, but for temporal layers > 1 // we generally need to use buffer slots 4 and 5. 
if ((cm->current_video_frame == 0 && svc->number_temporal_layers > 1) || - svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS || svc->number_spatial_layers > 3 || svc->number_temporal_layers > 3 || svc->number_spatial_layers == 1) return; @@ -2235,11 +2234,15 @@ static void set_intra_only_frame(VP9_COMP *cpi) { cpi->lst_fb_idx = -1; cpi->gld_fb_idx = -1; cpi->alt_fb_idx = -1; + svc->update_buffer_slot[0] = 0; // For intra-only frame we need to refresh all slots that were // being used for the base layer (fb_idx_base[i] == 1). // Start with assigning last first, then golden and then alt. for (i = 0; i < REF_FRAMES; ++i) { - if (svc->fb_idx_base[i] == 1) count++; + if (svc->fb_idx_base[i] == 1) { + svc->update_buffer_slot[0] |= 1 << i; + count++; + } if (count == 1 && cpi->lst_fb_idx == -1) cpi->lst_fb_idx = i; if (count == 2 && cpi->gld_fb_idx == -1) cpi->gld_fb_idx = i; if (count == 3 && cpi->alt_fb_idx == -1) cpi->alt_fb_idx = i; @@ -2248,6 +2251,12 @@ static void set_intra_only_frame(VP9_COMP *cpi) { // to the lst_fb_idx. if (cpi->gld_fb_idx == -1) cpi->gld_fb_idx = cpi->lst_fb_idx; if (cpi->alt_fb_idx == -1) cpi->alt_fb_idx = cpi->lst_fb_idx; + if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + cpi->ext_refresh_last_frame = 0; + cpi->ext_refresh_golden_frame = 0; + cpi->ext_refresh_alt_ref_frame = 0; + cpi->ref_frame_flags = 0; + } } } @@ -2390,6 +2399,9 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { set_intra_only_frame(cpi); target = vp9_calc_iframe_target_size_one_pass_cbr(cpi); } + // Overlay frame predicts from LAST (intra-only) + if (svc->previous_frame_is_intra_only) cpi->ref_frame_flags |= VP9_LAST_FLAG; + // Any update/change of global cyclic refresh parameters (amount/delta-qp) // should be done here, before the frame qp is selected. if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 6655643654..a57a70ab16 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -1234,6 +1234,7 @@ void vp9_svc_check_spatial_layer_sync(VP9_COMP *const cpi) { void vp9_svc_update_ref_frame_buffer_idx(VP9_COMP *const cpi) { SVC *const svc = &cpi->svc; + int i = 0; // Update the usage of frame buffer index for base spatial layers. if (svc->spatial_layer_id == 0) { if ((cpi->ref_frame_flags & VP9_LAST_FLAG) || cpi->refresh_last_frame) @@ -1242,6 +1243,11 @@ void vp9_svc_update_ref_frame_buffer_idx(VP9_COMP *const cpi) { svc->fb_idx_base[cpi->gld_fb_idx] = 1; if ((cpi->ref_frame_flags & VP9_ALT_FLAG) || cpi->refresh_alt_ref_frame) svc->fb_idx_base[cpi->alt_fb_idx] = 1; + // For bypass/flexible mode: check for refresh slots. 
+ if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + for (i = 0; i < REF_FRAMES; ++i) + if (svc->update_buffer_slot[0] & (1 << i)) svc->fb_idx_base[i] = 1; + } } } From cafe7cc1f10cfea74edb2ded7c3df2d69fcf1eee Mon Sep 17 00:00:00 2001 From: Gregor Jasny Date: Thu, 10 Feb 2022 09:01:49 +0100 Subject: [PATCH 212/926] support visual studio 2022 (vs17) Change-Id: I8380283d09b0c90183f224399f953dcc527181c5 --- README | 2 ++ build/make/gen_msvs_sln.sh | 5 +++-- build/make/gen_msvs_vcxproj.sh | 5 ++++- configure | 2 ++ 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/README b/README index ddbcb9f695..a083ebf90e 100644 --- a/README +++ b/README @@ -103,6 +103,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: x86-win32-vs14 x86-win32-vs15 x86-win32-vs16 + x86-win32-vs17 x86_64-android-gcc x86_64-darwin9-gcc x86_64-darwin10-gcc @@ -124,6 +125,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: x86_64-win64-vs14 x86_64-win64-vs15 x86_64-win64-vs16 + x86_64-win64-vs17 generic-gnu The generic-gnu target, in conjunction with the CROSS environment variable, diff --git a/build/make/gen_msvs_sln.sh b/build/make/gen_msvs_sln.sh index d1adfd749c..0b312850fe 100755 --- a/build/make/gen_msvs_sln.sh +++ b/build/make/gen_msvs_sln.sh @@ -25,7 +25,7 @@ files. Options: --help Print this message --out=outfile Redirect output to a file - --ver=version Version (14-16) of visual studio to generate for + --ver=version Version (14-17) of visual studio to generate for --target=isa-os-cc Target specifier EOF exit 1 @@ -219,6 +219,7 @@ for opt in "$@"; do 14) vs_year=2015 ;; 15) vs_year=2017 ;; 16) vs_year=2019 ;; + 17) vs_year=2022 ;; *) die Unrecognized Visual Studio Version in $opt ;; esac ;; @@ -232,7 +233,7 @@ done outfile=${outfile:-/dev/stdout} mkoutfile=${mkoutfile:-/dev/stdout} case "${vs_ver}" in - 1[4-6]) + 1[4-7]) # VS has used Format Version 12.00 continuously since vs11. 
sln_vers="12.00" sln_vers_str="Visual Studio ${vs_year}" diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh index 6f91ad4781..58bb66b9e3 100755 --- a/build/make/gen_msvs_vcxproj.sh +++ b/build/make/gen_msvs_vcxproj.sh @@ -170,7 +170,7 @@ for opt in "$@"; do --ver=*) vs_ver="$optval" case "$optval" in - 1[4-6]) + 1[4-7]) ;; *) die Unrecognized Visual Studio Version in $opt ;; @@ -344,6 +344,9 @@ generate_vcxproj() { if [ "$vs_ver" = "16" ]; then tag_content PlatformToolset v142 fi + if [ "$vs_ver" = "17" ]; then + tag_content PlatformToolset v143 + fi tag_content CharacterSet Unicode if [ "$config" = "Release" ]; then tag_content WholeProgramOptimization true diff --git a/configure b/configure index 434ebbe366..beea650329 100755 --- a/configure +++ b/configure @@ -142,6 +142,7 @@ all_platforms="${all_platforms} x86-win32-gcc" all_platforms="${all_platforms} x86-win32-vs14" all_platforms="${all_platforms} x86-win32-vs15" all_platforms="${all_platforms} x86-win32-vs16" +all_platforms="${all_platforms} x86-win32-vs17" all_platforms="${all_platforms} x86_64-android-gcc" all_platforms="${all_platforms} x86_64-darwin9-gcc" all_platforms="${all_platforms} x86_64-darwin10-gcc" @@ -164,6 +165,7 @@ all_platforms="${all_platforms} x86_64-win64-gcc" all_platforms="${all_platforms} x86_64-win64-vs14" all_platforms="${all_platforms} x86_64-win64-vs15" all_platforms="${all_platforms} x86_64-win64-vs16" +all_platforms="${all_platforms} x86_64-win64-vs17" all_platforms="${all_platforms} generic-gnu" # all_targets is a list of all targets that can be configured From 2da19ac0332f0b60cd2e4e2c7fa1748eb3ac85a7 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 11 Feb 2022 12:43:29 -0800 Subject: [PATCH 213/926] svc_datarate_test.cc: remove stale TODO Bug: webm:1554 Change-Id: I547223763b86c6a24fa32851f7b30ebab4b7472a --- test/svc_datarate_test.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc index 95d82ce54e..291cb01280 100644 --- a/test/svc_datarate_test.cc +++ b/test/svc_datarate_test.cc @@ -1354,7 +1354,6 @@ TEST_P(DatarateOnePassCbrSvcSmallKF, OnePassCbrSvc3SL3TLSmallKf) { ResetModel(); AssignLayerBitrates(); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - // TODO(jianj): webm:1554 CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.70, 1.15); #if CONFIG_VP9_DECODER From 3b21aeac8b7d5a52b6360d878cb4df593e87113e Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Wed, 23 Feb 2022 11:23:52 +0800 Subject: [PATCH 214/926] vp9[loongarch]: Optimize lpf_horizontal/vertical_16_dual with LSX Change-Id: I82c6bc16ea57c3f7ac5f4d212a12a5f70cb55ffc --- vpx_dsp/loongarch/loopfilter_16_lsx.c | 1330 +++++++++++++++++++++++++ vpx_dsp/loongarch/loopfilter_lsx.h | 167 ++++ vpx_dsp/vpx_dsp.mk | 2 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 4 files changed, 1501 insertions(+), 2 deletions(-) create mode 100644 vpx_dsp/loongarch/loopfilter_16_lsx.c create mode 100644 vpx_dsp/loongarch/loopfilter_lsx.h diff --git a/vpx_dsp/loongarch/loopfilter_16_lsx.c b/vpx_dsp/loongarch/loopfilter_16_lsx.c new file mode 100644 index 0000000000..cbaefcd6e0 --- /dev/null +++ b/vpx_dsp/loongarch/loopfilter_16_lsx.c @@ -0,0 +1,1330 @@ +/* + * Copyright (c) 2022 Loongson Technology Corporation Limited + * Contributed by Hecai Yuan + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/loopfilter_lsx.h" +#include "vpx_ports/mem.h" + +#define LSX_LD_8(_src, _stride, _stride2, _stride3, _stride4, _in0, _in1, \ + _in2, _in3, _in4, _in5, _in6, _in7) \ + { \ + _in0 = __lsx_vld(_src, 0); \ + _in1 = __lsx_vldx(_src, _stride); \ + _in2 = __lsx_vldx(_src, _stride2); \ + _in3 = __lsx_vldx(_src, _stride3); \ + _src += _stride4; \ + _in4 = __lsx_vld(_src, 0); \ + _in5 = __lsx_vldx(_src, _stride); \ + _in6 = __lsx_vldx(_src, _stride2); \ + _in7 = __lsx_vldx(_src, _stride3); \ + } + +#define LSX_ST_8(_dst0, _dst1, _dst2, _dst3, _dst4, _dst5, _dst6, _dst7, _dst, \ + _stride, _stride2, _stride3, _stride4) \ + { \ + __lsx_vst(_dst0, _dst, 0); \ + __lsx_vstx(_dst1, _dst, _stride); \ + __lsx_vstx(_dst2, _dst, _stride2); \ + __lsx_vstx(_dst3, _dst, _stride3); \ + _dst += _stride4; \ + __lsx_vst(_dst4, _dst, 0); \ + __lsx_vstx(_dst5, _dst, _stride); \ + __lsx_vstx(_dst6, _dst, _stride2); \ + __lsx_vstx(_dst7, _dst, _stride3); \ + } + +static int32_t hz_lpf_t4_and_t8_16w(uint8_t *dst, int32_t stride, + uint8_t *filter48, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + __m128i flat, mask, hev, thresh, b_limit, limit; + __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h; + __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; + __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; + __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; + __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; + __m128i zero = __lsx_vldi(0); + + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + + /* load vector elements */ + DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst, + -stride, p3, p2, p1, p0); + + q0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); + q3 = __lsx_vldx(dst, stride3); + + thresh = __lsx_vreplgr2vr_b(*thresh_ptr); + b_limit = __lsx_vreplgr2vr_b(*b_limit_ptr); + limit = __lsx_vreplgr2vr_b(*limit_ptr); + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__lsx_bz_v(flat)) { + __lsx_vstx(p1_out, dst, -stride2); + __lsx_vstx(p0_out, dst, -stride); + __lsx_vst(q0_out, dst, 0); + __lsx_vstx(q1_out, dst, stride); + + return 1; + } + + DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, + p1_l, p0_l); + DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, + q2_l, q3_l); + + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_h, p2_h, + p1_h, p0_h); + DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_h, q1_h, + q2_h, q3_h); + VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, + p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); + + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l, + p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l, + p1_filt8_l, 
p0_filt8_l, q0_filt8_l); + DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l, + q1_filt8_l, q2_filt8_l); + + /* store pixel values */ + DUP4_ARG3(__lsx_vbitsel_v, p2, p2_filt8_l, flat, p1_out, p1_filt8_l, flat, + p0_out, p0_filt8_l, flat, q0_out, q0_filt8_l, flat, p2_out, p1_out, + p0_out, q0_out); + DUP2_ARG3(__lsx_vbitsel_v, q1_out, q1_filt8_l, flat, q2, q2_filt8_l, flat, + q1_out, q2_out); + + __lsx_vst(p2_out, filter48, 0); + __lsx_vst(p1_out, filter48, 16); + __lsx_vst(p0_out, filter48, 32); + __lsx_vst(q0_out, filter48, 48); + __lsx_vst(q1_out, filter48, 64); + __lsx_vst(q2_out, filter48, 80); + __lsx_vst(flat, filter48, 96); + + return 0; +} + +static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + uint8_t *dst_tmp0 = dst - stride4; + uint8_t *dst_tmp1 = dst + stride4; + + __m128i flat, flat2, filter8; + __m128i zero = __lsx_vldi(0); + __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + __m128i out_h, out_l; + v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in; + v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in; + v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in; + v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in; + v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in; + v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in; + v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in; + v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in; + v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h; + + flat = __lsx_vld(filter48, 96); + + DUP4_ARG2(__lsx_vldx, dst_tmp0, -stride4, dst_tmp0, -stride3, dst_tmp0, + -stride2, dst_tmp0, -stride, p7, p6, p5, p4); + + p3 = __lsx_vld(dst_tmp0, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp0, stride, dst_tmp0, stride2, p2, p1); + p0 = __lsx_vldx(dst_tmp0, stride3); + + q0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); + q3 = __lsx_vldx(dst, stride3); + + q4 = __lsx_vld(dst_tmp1, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, q5, q6); + q7 = __lsx_vldx(dst_tmp1, stride3); + + VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__lsx_bz_v(flat2)) { + DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32, filter48, 48, + p2, p1, p0, q0); + DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2); + __lsx_vstx(p2, dst, -stride3); + __lsx_vstx(p1, dst, -stride2); + __lsx_vstx(p0, dst, -stride); + __lsx_vst(q0, dst, 0); + __lsx_vstx(q1, dst, stride); + __lsx_vstx(q2, dst, stride2); + } else { + dst = dst_tmp0 - stride3; + + p7_l_in = (v8u16)__lsx_vilvl_b(zero, p7); + p6_l_in = (v8u16)__lsx_vilvl_b(zero, p6); + p5_l_in = (v8u16)__lsx_vilvl_b(zero, p5); + p4_l_in = (v8u16)__lsx_vilvl_b(zero, p4); + p3_l_in = (v8u16)__lsx_vilvl_b(zero, p3); + p2_l_in = (v8u16)__lsx_vilvl_b(zero, p2); + p1_l_in = (v8u16)__lsx_vilvl_b(zero, p1); + p0_l_in = (v8u16)__lsx_vilvl_b(zero, p0); + + q0_l_in = (v8u16)__lsx_vilvl_b(zero, q0); + + tmp0_l = p7_l_in << 3; + tmp0_l -= p7_l_in; + tmp0_l += p6_l_in; + tmp0_l += q0_l_in; + tmp1_l = p6_l_in + p5_l_in; + tmp1_l += p4_l_in; + tmp1_l += p3_l_in; + tmp1_l += p2_l_in; + tmp1_l += p1_l_in; + tmp1_l += p0_l_in; + tmp1_l += tmp0_l; + + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + p7_h_in = (v8u16)__lsx_vilvh_b(zero, p7); + p6_h_in = (v8u16)__lsx_vilvh_b(zero, p6); + p5_h_in = (v8u16)__lsx_vilvh_b(zero, p5); + p4_h_in = (v8u16)__lsx_vilvh_b(zero, p4); + + p3_h_in = (v8u16)__lsx_vilvh_b(zero, p3); + p2_h_in = (v8u16)__lsx_vilvh_b(zero, p2); + p1_h_in = (v8u16)__lsx_vilvh_b(zero, p1); + p0_h_in 
= (v8u16)__lsx_vilvh_b(zero, p0); + q0_h_in = (v8u16)__lsx_vilvh_b(zero, q0); + + tmp0_h = p7_h_in << 3; + tmp0_h -= p7_h_in; + tmp0_h += p6_h_in; + tmp0_h += q0_h_in; + tmp1_h = p6_h_in + p5_h_in; + tmp1_h += p4_h_in; + tmp1_h += p3_h_in; + tmp1_h += p2_h_in; + tmp1_h += p1_h_in; + tmp1_h += p0_h_in; + tmp1_h += tmp0_h; + + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + p6 = __lsx_vbitsel_v(p6, out_l, flat2); + __lsx_vst(p6, dst, 0); + dst += stride; + + /* p5 */ + q1_l_in = (v8u16)__lsx_vilvl_b(zero, q1); + tmp0_l = p5_l_in - p6_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q1_h_in = (v8u16)__lsx_vilvh_b(zero, q1); + tmp0_h = p5_h_in - p6_h_in; + tmp0_h += q1_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + p5 = __lsx_vbitsel_v(p5, out_l, flat2); + __lsx_vst(p5, dst, 0); + dst += stride; + + /* p4 */ + q2_l_in = (v8u16)__lsx_vilvl_b(zero, q2); + tmp0_l = p4_l_in - p5_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q2_h_in = (v8u16)__lsx_vilvh_b(zero, q2); + tmp0_h = p4_h_in - p5_h_in; + tmp0_h += q2_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + p4 = __lsx_vbitsel_v(p4, out_l, flat2); + __lsx_vst(p4, dst, 0); + dst += stride; + + /* p3 */ + q3_l_in = (v8u16)__lsx_vilvl_b(zero, q3); + tmp0_l = p3_l_in - p4_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q3_h_in = (v8u16)__lsx_vilvh_b(zero, q3); + tmp0_h = p3_h_in - p4_h_in; + tmp0_h += q3_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + p3 = __lsx_vbitsel_v(p3, out_l, flat2); + __lsx_vst(p3, dst, 0); + dst += stride; + + /* p2 */ + q4_l_in = (v8u16)__lsx_vilvl_b(zero, q4); + filter8 = __lsx_vld(filter48, 0); + tmp0_l = p2_l_in - p3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q4_h_in = (v8u16)__lsx_vilvh_b(zero, q4); + tmp0_h = p2_h_in - p3_h_in; + tmp0_h += q4_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* p1 */ + q5_l_in = (v8u16)__lsx_vilvl_b(zero, q5); + filter8 = __lsx_vld(filter48, 16); + tmp0_l = p1_l_in - p2_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q5_h_in = (v8u16)__lsx_vilvh_b(zero, q5); + tmp0_h = p1_h_in - p2_h_in; + tmp0_h += q5_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* p0 */ + q6_l_in = (v8u16)__lsx_vilvl_b(zero, q6); + filter8 = __lsx_vld(filter48, 32); + tmp0_l = p0_l_in - p1_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q6_h_in = (v8u16)__lsx_vilvh_b(zero, q6); + tmp0_h = p0_h_in - p1_h_in; + tmp0_h += q6_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + 
out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* q0 */ + q7_l_in = (v8u16)__lsx_vilvl_b(zero, q7); + filter8 = __lsx_vld(filter48, 48); + tmp0_l = q7_l_in - p0_l_in; + tmp0_l += q0_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q7_h_in = (v8u16)__lsx_vilvh_b(zero, q7); + tmp0_h = q7_h_in - p0_h_in; + tmp0_h += q0_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* q1 */ + filter8 = __lsx_vld(filter48, 64); + tmp0_l = q7_l_in - q0_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p6_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q0_h_in; + tmp0_h += q1_h_in; + tmp0_h -= p6_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* q2 */ + filter8 = __lsx_vld(filter48, 80); + tmp0_l = q7_l_in - q1_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p5_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q1_h_in; + tmp0_h += q2_h_in; + tmp0_h -= p5_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* q3 */ + tmp0_l = q7_l_in - q2_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p4_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q2_h_in; + tmp0_h += q3_h_in; + tmp0_h -= p4_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + q3 = __lsx_vbitsel_v(q3, out_l, flat2); + __lsx_vst(q3, dst, 0); + dst += stride; + + /* q4 */ + tmp0_l = q7_l_in - q3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p3_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q3_h_in; + tmp0_h += q4_h_in; + tmp0_h -= p3_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + q4 = __lsx_vbitsel_v(q4, out_l, flat2); + __lsx_vst(q4, dst, 0); + dst += stride; + + /* q5 */ + tmp0_l = q7_l_in - q4_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p2_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q4_h_in; + tmp0_h += q5_h_in; + tmp0_h -= p2_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + q5 = __lsx_vbitsel_v(q5, out_l, flat2); + __lsx_vst(q5, dst, 0); + dst += stride; + + /* q6 */ + tmp0_l = q7_l_in - q5_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p1_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q5_h_in; + tmp0_h += q6_h_in; + tmp0_h -= p1_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + q6 = __lsx_vbitsel_v(q6, out_l, flat2); + __lsx_vst(q6, dst, 0); + } +} + +static void mb_lpf_horizontal_edge_dual(uint8_t *dst, int32_t stride, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + DECLARE_ALIGNED(16, uint8_t, filter48[16 * 8]); + uint8_t early_exit = 0; + + early_exit = 
hz_lpf_t4_and_t8_16w(dst, stride, &filter48[0], b_limit_ptr,
+                                    limit_ptr, thresh_ptr);
+
+  if (early_exit == 0) {
+    hz_lpf_t16_16w(dst, stride, filter48);
+  }
+}
+
+static void mb_lpf_horizontal_edge(uint8_t *dst, int32_t stride,
+                                   const uint8_t *b_limit_ptr,
+                                   const uint8_t *limit_ptr,
+                                   const uint8_t *thresh_ptr, int32_t count) {
+  if (count == 1) {
+    __m128i flat2, mask, hev, flat, thresh, b_limit, limit;
+    __m128i p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
+    __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    __m128i p0_filter16, p1_filter16;
+    __m128i p2_filter8, p1_filter8, p0_filter8;
+    __m128i q0_filter8, q1_filter8, q2_filter8;
+    __m128i p7_l, p6_l, p5_l, p4_l, q7_l, q6_l, q5_l, q4_l;
+    __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l;
+    __m128i zero = __lsx_vldi(0);
+    __m128i tmp0, tmp1, tmp2;
+
+    int32_t stride2 = stride << 1;
+    int32_t stride3 = stride2 + stride;
+    int32_t stride4 = stride << 2;
+    uint8_t *dst_tmp0 = dst - stride4;
+    uint8_t *dst_tmp1 = dst + stride4;
+
+    /* load vector elements */
+    DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst,
+              -stride, p3, p2, p1, p0);
+    q0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
+    q3 = __lsx_vldx(dst, stride3);
+
+    thresh = __lsx_vreplgr2vr_b(*thresh_ptr);
+    b_limit = __lsx_vreplgr2vr_b(*b_limit_ptr);
+    limit = __lsx_vreplgr2vr_b(*limit_ptr);
+
+    /* filter_mask* */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+                 mask, flat);
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+    flat = __lsx_vilvl_d(zero, flat);
+    if (__lsx_bz_v(flat)) {
+      __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
+      __lsx_vstelm_d(p0_out, dst - stride, 0, 0);
+      __lsx_vstelm_d(q0_out, dst, 0, 0);
+      __lsx_vstelm_d(q1_out, dst + stride, 0, 0);
+    } else {
+      /* convert 8 bit input data into 16 bit */
+      DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l,
+                p2_l, p1_l, p0_l);
+      DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l,
+                q1_l, q2_l, q3_l);
+      VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8,
+                  p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
+
+      /* convert 16 bit output data into 8 bit */
+      DUP4_ARG2(__lsx_vpickev_b, zero, p2_filter8, zero, p1_filter8, zero,
+                p0_filter8, zero, q0_filter8, p2_filter8, p1_filter8,
+                p0_filter8, q0_filter8);
+      DUP2_ARG2(__lsx_vpickev_b, zero, q1_filter8, zero, q2_filter8, q1_filter8,
+                q2_filter8);
+
+      /* store pixel values */
+      p2_out = __lsx_vbitsel_v(p2, p2_filter8, flat);
+      p1_out = __lsx_vbitsel_v(p1_out, p1_filter8, flat);
+      p0_out = __lsx_vbitsel_v(p0_out, p0_filter8, flat);
+      q0_out = __lsx_vbitsel_v(q0_out, q0_filter8, flat);
+      q1_out = __lsx_vbitsel_v(q1_out, q1_filter8, flat);
+      q2_out = __lsx_vbitsel_v(q2, q2_filter8, flat);
+
+      /* load 16 vector elements */
+      DUP4_ARG2(__lsx_vldx, dst_tmp0, -stride4, dst_tmp0, -stride3, dst_tmp0,
+                -stride2, dst_tmp0, -stride, p7, p6, p5, p4);
+      q4 = __lsx_vld(dst_tmp1, 0);
+      DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, q5, q6);
+      q7 = __lsx_vldx(dst_tmp1, stride3);
+
+      VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+      if (__lsx_bz_v(flat2)) {
+        dst -= stride3;
+        __lsx_vstelm_d(p2_out, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_out, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p0_out, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(q0_out, dst, 0, 0);
+        dst += stride;
+        
__lsx_vstelm_d(q1_out, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(q2_out, dst, 0, 0); + } else { + /* LSB(right) 8 pixel operation */ + DUP4_ARG2(__lsx_vilvl_b, zero, p7, zero, p6, zero, p5, zero, p4, p7_l, + p6_l, p5_l, p4_l); + DUP4_ARG2(__lsx_vilvl_b, zero, q4, zero, q5, zero, q6, zero, q7, q4_l, + q5_l, q6_l, q7_l); + + tmp0 = __lsx_vslli_h(p7_l, 3); + tmp0 = __lsx_vsub_h(tmp0, p7_l); + tmp0 = __lsx_vadd_h(tmp0, p6_l); + tmp0 = __lsx_vadd_h(tmp0, q0_l); + + dst = dst_tmp0 - stride3; + + /* calculation of p6 and p5 */ + tmp1 = __lsx_vadd_h(p6_l, p5_l); + tmp1 = __lsx_vadd_h(tmp1, p4_l); + tmp1 = __lsx_vadd_h(tmp1, p3_l); + tmp1 = __lsx_vadd_h(tmp1, p2_l); + tmp1 = __lsx_vadd_h(tmp1, p1_l); + tmp1 = __lsx_vadd_h(tmp1, p0_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp0 = __lsx_vsub_h(p5_l, p6_l); + tmp0 = __lsx_vadd_h(tmp0, q1_l); + tmp0 = __lsx_vsub_h(tmp0, p7_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, p6, p0_filter16, flat2, p5, p1_filter16, + flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of p4 and p3 */ + tmp0 = __lsx_vsub_h(p4_l, p5_l); + tmp0 = __lsx_vadd_h(tmp0, q2_l); + tmp0 = __lsx_vsub_h(tmp0, p7_l); + tmp2 = __lsx_vsub_h(p3_l, p4_l); + tmp2 = __lsx_vadd_h(tmp2, q3_l); + tmp2 = __lsx_vsub_h(tmp2, p7_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, p4, p0_filter16, flat2, p3, p1_filter16, + flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of p2 and p1 */ + tmp0 = __lsx_vsub_h(p2_l, p3_l); + tmp0 = __lsx_vadd_h(tmp0, q4_l); + tmp0 = __lsx_vsub_h(tmp0, p7_l); + tmp2 = __lsx_vsub_h(p1_l, p2_l); + tmp2 = __lsx_vadd_h(tmp2, q5_l); + tmp2 = __lsx_vsub_h(tmp2, p7_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, p2_out, p0_filter16, flat2, p1_out, + p1_filter16, flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of p0 and q0 */ + tmp0 = __lsx_vsub_h(p0_l, p1_l); + tmp0 = __lsx_vadd_h(tmp0, q6_l); + tmp0 = __lsx_vsub_h(tmp0, p7_l); + tmp2 = __lsx_vsub_h(q7_l, p0_l); + tmp2 = __lsx_vadd_h(tmp2, q0_l); + tmp2 = __lsx_vsub_h(tmp2, p7_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, p0_out, p0_filter16, flat2, q0_out, + p1_filter16, flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of q1 and q2 */ + tmp0 = __lsx_vsub_h(q7_l, q0_l); 
+ tmp0 = __lsx_vadd_h(tmp0, q1_l); + tmp0 = __lsx_vsub_h(tmp0, p6_l); + tmp2 = __lsx_vsub_h(q7_l, q1_l); + tmp2 = __lsx_vadd_h(tmp2, q2_l); + tmp2 = __lsx_vsub_h(tmp2, p5_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, q1_out, p0_filter16, flat2, q2_out, + p1_filter16, flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of q3 and q4 */ + tmp0 = __lsx_vsub_h(q7_l, q2_l); + tmp0 = __lsx_vadd_h(tmp0, q3_l); + tmp0 = __lsx_vsub_h(tmp0, p4_l); + tmp2 = __lsx_vsub_h(q7_l, q3_l); + tmp2 = __lsx_vadd_h(tmp2, q4_l); + tmp2 = __lsx_vsub_h(tmp2, p3_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, q3, p0_filter16, flat2, q4, p1_filter16, + flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of q5 and q6 */ + tmp0 = __lsx_vsub_h(q7_l, q4_l); + tmp0 = __lsx_vadd_h(tmp0, q5_l); + tmp0 = __lsx_vsub_h(tmp0, p2_l); + tmp2 = __lsx_vsub_h(q7_l, q5_l); + tmp2 = __lsx_vadd_h(tmp2, q6_l); + tmp2 = __lsx_vsub_h(tmp2, p1_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, q5, p0_filter16, flat2, q6, p1_filter16, + flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + } + } + } else { + mb_lpf_horizontal_edge_dual(dst, stride, b_limit_ptr, limit_ptr, + thresh_ptr); + } +} + +void vpx_lpf_horizontal_16_dual_lsx(uint8_t *dst, int32_t stride, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + mb_lpf_horizontal_edge(dst, stride, b_limit_ptr, limit_ptr, thresh_ptr, 2); +} + +static void transpose_16x16(uint8_t *input, int32_t in_stride, uint8_t *output, + int32_t out_stride) { + __m128i row0, row1, row2, row3, row4, row5, row6, row7; + __m128i row8, row9, row10, row11, row12, row13, row14, row15; + __m128i tmp0, tmp1, tmp4, tmp5, tmp6, tmp7; + __m128i tmp2, tmp3; + __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + int32_t in_stride2 = in_stride << 1; + int32_t in_stride3 = in_stride2 + in_stride; + int32_t in_stride4 = in_stride2 << 1; + int32_t out_stride2 = out_stride << 1; + int32_t out_stride3 = out_stride2 + out_stride; + int32_t out_stride4 = out_stride2 << 1; + + LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4, row0, row1, + row2, row3, row4, row5, row6, row7); + input += in_stride4; + LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4, row8, row9, + row10, row11, row12, row13, row14, row15); + + LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p7, p6, + p5, p4, p3, p2, p1, p0); + + /* transpose 16x8 matrix into 8x16 */ + /* total 8 intermediate register 
and 32 instructions */ + q7 = __lsx_vpackod_d(row8, row0); + q6 = __lsx_vpackod_d(row9, row1); + q5 = __lsx_vpackod_d(row10, row2); + q4 = __lsx_vpackod_d(row11, row3); + q3 = __lsx_vpackod_d(row12, row4); + q2 = __lsx_vpackod_d(row13, row5); + q1 = __lsx_vpackod_d(row14, row6); + q0 = __lsx_vpackod_d(row15, row7); + + DUP2_ARG2(__lsx_vpackev_b, q6, q7, q4, q5, tmp0, tmp1); + DUP2_ARG2(__lsx_vpackod_b, q6, q7, q4, q5, tmp4, tmp5); + + DUP2_ARG2(__lsx_vpackev_b, q2, q3, q0, q1, q5, q7); + DUP2_ARG2(__lsx_vpackod_b, q2, q3, q0, q1, tmp6, tmp7); + + DUP2_ARG2(__lsx_vpackev_h, tmp1, tmp0, q7, q5, tmp2, tmp3); + q0 = __lsx_vpackev_w(tmp3, tmp2); + q4 = __lsx_vpackod_w(tmp3, tmp2); + + tmp2 = __lsx_vpackod_h(tmp1, tmp0); + tmp3 = __lsx_vpackod_h(q7, q5); + q2 = __lsx_vpackev_w(tmp3, tmp2); + q6 = __lsx_vpackod_w(tmp3, tmp2); + + DUP2_ARG2(__lsx_vpackev_h, tmp5, tmp4, tmp7, tmp6, tmp2, tmp3); + q1 = __lsx_vpackev_w(tmp3, tmp2); + q5 = __lsx_vpackod_w(tmp3, tmp2); + + tmp2 = __lsx_vpackod_h(tmp5, tmp4); + tmp3 = __lsx_vpackod_h(tmp7, tmp6); + q3 = __lsx_vpackev_w(tmp3, tmp2); + q7 = __lsx_vpackod_w(tmp3, tmp2); + + LSX_ST_8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_stride, out_stride2, + out_stride3, out_stride4); + output += out_stride4; + LSX_ST_8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_stride, out_stride2, + out_stride3, out_stride4); +} + +static int32_t vt_lpf_t4_and_t8_16w(uint8_t *dst, uint8_t *filter48, + uint8_t *dst_org, int32_t stride, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + __m128i flat, mask, hev, thresh, b_limit, limit; + __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h; + __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; + __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; + __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; + __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; + __m128i vec0, vec1, vec2, vec3, vec4, vec5; + __m128i zero = __lsx_vldi(0); + + /* load vector elements */ + DUP4_ARG2(__lsx_vld, dst, -64, dst, -48, dst, -32, dst, -16, p3, p2, p1, p0); + DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3); + + thresh = __lsx_vreplgr2vr_b(*thresh_ptr); + b_limit = __lsx_vreplgr2vr_b(*b_limit_ptr); + limit = __lsx_vreplgr2vr_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + /* if flat is zero for all pixels, then no need to calculate other filter */ + if (__lsx_bz_v(flat)) { + DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); + vec2 = __lsx_vilvl_h(vec1, vec0); + vec3 = __lsx_vilvh_h(vec1, vec0); + DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); + vec4 = __lsx_vilvl_h(vec1, vec0); + vec5 = __lsx_vilvh_h(vec1, vec0); + + dst_org -= 2; + __lsx_vstelm_w(vec2, dst_org, 0, 0); + __lsx_vstelm_w(vec2, dst_org + stride, 0, 1); + __lsx_vstelm_w(vec2, dst_org + stride2, 0, 2); + __lsx_vstelm_w(vec2, dst_org + stride3, 0, 3); + dst_org += stride4; + __lsx_vstelm_w(vec3, dst_org, 0, 0); + __lsx_vstelm_w(vec3, dst_org + stride, 0, 1); + __lsx_vstelm_w(vec3, dst_org + stride2, 0, 2); + 
__lsx_vstelm_w(vec3, dst_org + stride3, 0, 3); + dst_org += stride4; + __lsx_vstelm_w(vec4, dst_org, 0, 0); + __lsx_vstelm_w(vec4, dst_org + stride, 0, 1); + __lsx_vstelm_w(vec4, dst_org + stride2, 0, 2); + __lsx_vstelm_w(vec4, dst_org + stride3, 0, 3); + dst_org += stride4; + __lsx_vstelm_w(vec5, dst_org, 0, 0); + __lsx_vstelm_w(vec5, dst_org + stride, 0, 1); + __lsx_vstelm_w(vec5, dst_org + stride2, 0, 2); + __lsx_vstelm_w(vec5, dst_org + stride3, 0, 3); + + return 1; + } + + DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, + p1_l, p0_l); + DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, + q2_l, q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_h, p2_h, + p1_h, p0_h); + DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_h, q1_h, + q2_h, q3_h); + VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, + p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); + + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l, + p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l); + DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l, + q1_filt8_l, q2_filt8_l); + + /* store pixel values */ + p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat); + p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat); + p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat); + q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat); + q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat); + q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat); + + __lsx_vst(p2_out, filter48, 0); + __lsx_vst(p1_out, filter48, 16); + __lsx_vst(p0_out, filter48, 32); + __lsx_vst(q0_out, filter48, 48); + __lsx_vst(q1_out, filter48, 64); + __lsx_vst(q2_out, filter48, 80); + __lsx_vst(flat, filter48, 96); + + return 0; +} + +static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride, + uint8_t *filter48) { + __m128i zero = __lsx_vldi(0); + __m128i flat, flat2, filter8; + __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + __m128i out_l, out_h; + v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in; + v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in; + v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in; + v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in; + v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in; + v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in; + v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in; + v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in; + v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h; + uint8_t *dst_tmp = dst - 128; + + flat = __lsx_vld(filter48, 96); + + DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, p7, + p6, p5, p4); + DUP4_ARG2(__lsx_vld, dst_tmp, 64, dst_tmp, 80, dst_tmp, 96, dst_tmp, 112, p3, + p2, p1, p0); + DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3); + DUP4_ARG2(__lsx_vld, dst, 64, dst, 80, dst, 96, dst, 112, q4, q5, q6, q7); + + VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + /* if flat2 is zero for all pixels, then no need to calculate other filter */ + if (__lsx_bz_v(flat2)) { + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + + DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32, filter48, 48, + p2, p1, p0, q0); + DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2); + + 
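/* flat2 == 0: interleave the six filter8 outputs (p2..q2) back into per-row order and store them across the vertical edge in the original image */ +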
DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1); + vec3 = __lsx_vilvl_h(vec1, vec0); + vec4 = __lsx_vilvh_h(vec1, vec0); + DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1); + vec6 = __lsx_vilvl_h(vec1, vec0); + vec7 = __lsx_vilvh_h(vec1, vec0); + vec2 = __lsx_vilvl_b(q2, q1); + vec5 = __lsx_vilvh_b(q2, q1); + + dst_org -= 3; + __lsx_vstelm_w(vec3, dst_org, 0, 0); + __lsx_vstelm_h(vec2, dst_org, 4, 0); + dst_org += stride; + __lsx_vstelm_w(vec3, dst_org, 0, 1); + __lsx_vstelm_h(vec2, dst_org, 4, 1); + dst_org += stride; + __lsx_vstelm_w(vec3, dst_org, 0, 2); + __lsx_vstelm_h(vec2, dst_org, 4, 2); + dst_org += stride; + __lsx_vstelm_w(vec3, dst_org, 0, 3); + __lsx_vstelm_h(vec2, dst_org, 4, 3); + dst_org += stride; + __lsx_vstelm_w(vec4, dst_org, 0, 0); + __lsx_vstelm_h(vec2, dst_org, 4, 4); + dst_org += stride; + __lsx_vstelm_w(vec4, dst_org, 0, 1); + __lsx_vstelm_h(vec2, dst_org, 4, 5); + dst_org += stride; + __lsx_vstelm_w(vec4, dst_org, 0, 2); + __lsx_vstelm_h(vec2, dst_org, 4, 6); + dst_org += stride; + __lsx_vstelm_w(vec4, dst_org, 0, 3); + __lsx_vstelm_h(vec2, dst_org, 4, 7); + dst_org += stride; + __lsx_vstelm_w(vec6, dst_org, 0, 0); + __lsx_vstelm_h(vec5, dst_org, 4, 0); + dst_org += stride; + __lsx_vstelm_w(vec6, dst_org, 0, 1); + __lsx_vstelm_h(vec5, dst_org, 4, 1); + dst_org += stride; + __lsx_vstelm_w(vec6, dst_org, 0, 2); + __lsx_vstelm_h(vec5, dst_org, 4, 2); + dst_org += stride; + __lsx_vstelm_w(vec6, dst_org, 0, 3); + __lsx_vstelm_h(vec5, dst_org, 4, 3); + dst_org += stride; + __lsx_vstelm_w(vec7, dst_org, 0, 0); + __lsx_vstelm_h(vec5, dst_org, 4, 4); + dst_org += stride; + __lsx_vstelm_w(vec7, dst_org, 0, 1); + __lsx_vstelm_h(vec5, dst_org, 4, 5); + dst_org += stride; + __lsx_vstelm_w(vec7, dst_org, 0, 2); + __lsx_vstelm_h(vec5, dst_org, 4, 6); + dst_org += stride; + __lsx_vstelm_w(vec7, dst_org, 0, 3); + __lsx_vstelm_h(vec5, dst_org, 4, 7); + + return 1; + } + + dst -= 7 * 16; + + p7_l_in = (v8u16)__lsx_vilvl_b(zero, p7); + p6_l_in = (v8u16)__lsx_vilvl_b(zero, p6); + p5_l_in = (v8u16)__lsx_vilvl_b(zero, p5); + p4_l_in = (v8u16)__lsx_vilvl_b(zero, p4); + p3_l_in = (v8u16)__lsx_vilvl_b(zero, p3); + p2_l_in = (v8u16)__lsx_vilvl_b(zero, p2); + p1_l_in = (v8u16)__lsx_vilvl_b(zero, p1); + p0_l_in = (v8u16)__lsx_vilvl_b(zero, p0); + q0_l_in = (v8u16)__lsx_vilvl_b(zero, q0); + + tmp0_l = p7_l_in << 3; + tmp0_l -= p7_l_in; + tmp0_l += p6_l_in; + tmp0_l += q0_l_in; + tmp1_l = p6_l_in + p5_l_in; + tmp1_l += p4_l_in; + tmp1_l += p3_l_in; + tmp1_l += p2_l_in; + tmp1_l += p1_l_in; + tmp1_l += p0_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + p7_h_in = (v8u16)__lsx_vilvh_b(zero, p7); + p6_h_in = (v8u16)__lsx_vilvh_b(zero, p6); + p5_h_in = (v8u16)__lsx_vilvh_b(zero, p5); + p4_h_in = (v8u16)__lsx_vilvh_b(zero, p4); + p3_h_in = (v8u16)__lsx_vilvh_b(zero, p3); + p2_h_in = (v8u16)__lsx_vilvh_b(zero, p2); + p1_h_in = (v8u16)__lsx_vilvh_b(zero, p1); + p0_h_in = (v8u16)__lsx_vilvh_b(zero, p0); + q0_h_in = (v8u16)__lsx_vilvh_b(zero, q0); + + tmp0_h = p7_h_in << 3; + tmp0_h -= p7_h_in; + tmp0_h += p6_h_in; + tmp0_h += q0_h_in; + tmp1_h = p6_h_in + p5_h_in; + tmp1_h += p4_h_in; + tmp1_h += p3_h_in; + tmp1_h += p2_h_in; + tmp1_h += p1_h_in; + tmp1_h += p0_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + p6 = __lsx_vbitsel_v(p6, out_l, flat2); + __lsx_vst(p6, dst, 0); + + /* p5 */ + q1_l_in = (v8u16)__lsx_vilvl_b(zero, q1); + tmp0_l = p5_l_in - p6_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p7_l_in; + 
tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q1_h_in = (v8u16)__lsx_vilvh_b(zero, q1); + tmp0_h = p5_h_in - p6_h_in; + tmp0_h += q1_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + p5 = __lsx_vbitsel_v(p5, out_l, flat2); + __lsx_vst(p5, dst, 16); + + /* p4 */ + q2_l_in = (v8u16)__lsx_vilvl_b(zero, q2); + tmp0_l = p4_l_in - p5_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q2_h_in = (v8u16)__lsx_vilvh_b(zero, q2); + tmp0_h = p4_h_in - p5_h_in; + tmp0_h += q2_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + p4 = __lsx_vbitsel_v(p4, out_l, flat2); + __lsx_vst(p4, dst, 16 * 2); + + /* p3 */ + q3_l_in = (v8u16)__lsx_vilvl_b(zero, q3); + tmp0_l = p3_l_in - p4_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q3_h_in = (v8u16)__lsx_vilvh_b(zero, q3); + tmp0_h = p3_h_in - p4_h_in; + tmp0_h += q3_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + p3 = __lsx_vbitsel_v(p3, out_l, flat2); + __lsx_vst(p3, dst, 16 * 3); + + /* p2 */ + q4_l_in = (v8u16)__lsx_vilvl_b(zero, q4); + filter8 = __lsx_vld(filter48, 0); + tmp0_l = p2_l_in - p3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q4_h_in = (v8u16)__lsx_vilvh_b(zero, q4); + tmp0_h = p2_h_in - p3_h_in; + tmp0_h += q4_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 4); + + /* p1 */ + q5_l_in = (v8u16)__lsx_vilvl_b(zero, q5); + filter8 = __lsx_vld(filter48, 16); + tmp0_l = p1_l_in - p2_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q5_h_in = (v8u16)__lsx_vilvh_b(zero, q5); + tmp0_h = p1_h_in - p2_h_in; + tmp0_h += q5_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)(tmp1_h), 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 5); + + /* p0 */ + q6_l_in = (v8u16)__lsx_vilvl_b(zero, q6); + filter8 = __lsx_vld(filter48, 32); + tmp0_l = p0_l_in - p1_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q6_h_in = (v8u16)__lsx_vilvh_b(zero, q6); + tmp0_h = p0_h_in - p1_h_in; + tmp0_h += q6_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 6); + + /* q0 */ + q7_l_in = (v8u16)__lsx_vilvl_b(zero, q7); + filter8 = __lsx_vld(filter48, 48); + tmp0_l = q7_l_in - p0_l_in; + tmp0_l += q0_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q7_h_in = (v8u16)__lsx_vilvh_b(zero, q7); + tmp0_h = q7_h_in - p0_h_in; + tmp0_h += q0_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 7); + + /* q1 */ + filter8 = __lsx_vld(filter48, 64); 
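+ /* the 15-tap window keeps sliding: replicated q7 enters on the right while the remaining p-side samples drop off on the left */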
+ tmp0_l = q7_l_in - q0_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p6_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q0_h_in; + tmp0_h += q1_h_in; + tmp0_h -= p6_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 8); + + /* q2 */ + filter8 = __lsx_vld(filter48, 80); + tmp0_l = q7_l_in - q1_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p5_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q1_h_in; + tmp0_h += q2_h_in; + tmp0_h -= p5_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 9); + + /* q3 */ + tmp0_l = q7_l_in - q2_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p4_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q2_h_in; + tmp0_h += q3_h_in; + tmp0_h -= p4_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + q3 = __lsx_vbitsel_v(q3, out_l, flat2); + __lsx_vst(q3, dst, 16 * 10); + + /* q4 */ + tmp0_l = q7_l_in - q3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p3_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q3_h_in; + tmp0_h += q4_h_in; + tmp0_h -= p3_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + q4 = __lsx_vbitsel_v(q4, out_l, flat2); + __lsx_vst(q4, dst, 16 * 11); + + /* q5 */ + tmp0_l = q7_l_in - q4_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p2_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q4_h_in; + tmp0_h += q5_h_in; + tmp0_h -= p2_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + q5 = __lsx_vbitsel_v(q5, out_l, flat2); + __lsx_vst(q5, dst, 16 * 12); + + /* q6 */ + tmp0_l = q7_l_in - q5_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p1_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q5_h_in; + tmp0_h += q6_h_in; + tmp0_h -= p1_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + q6 = __lsx_vbitsel_v(q6, out_l, flat2); + __lsx_vst(q6, dst, 16 * 13); + + return 0; +} + +void vpx_lpf_vertical_16_dual_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint8_t early_exit = 0; + DECLARE_ALIGNED(16, uint8_t, transposed_input[16 * 24]); + uint8_t *filter48 = &transposed_input[16 * 16]; + + transpose_16x16((src - 8), pitch, &transposed_input[0], 16); + + early_exit = + vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src, + pitch, b_limit_ptr, limit_ptr, thresh_ptr); + + if (early_exit == 0) { + early_exit = + vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, &filter48[0]); + + if (early_exit == 0) { + transpose_16x16(transposed_input, 16, (src - 8), pitch); + } + } +} diff --git a/vpx_dsp/loongarch/loopfilter_lsx.h b/vpx_dsp/loongarch/loopfilter_lsx.h new file mode 100644 index 0000000000..53e15fe6d5 --- /dev/null +++ b/vpx_dsp/loongarch/loopfilter_lsx.h @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_ +#define VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_ + +#include "vpx_util/loongson_intrinsics.h" + +#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + limit_in, b_limit_in, thresh_in, hev_out, mask_out, \ + flat_out) \ + { \ + __m128i p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ + __m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ + \ + /* absolute subtraction of pixel values */ \ + p3_asub_p2_m = __lsx_vabsd_bu(p3_in, p2_in); \ + p2_asub_p1_m = __lsx_vabsd_bu(p2_in, p1_in); \ + p1_asub_p0_m = __lsx_vabsd_bu(p1_in, p0_in); \ + q1_asub_q0_m = __lsx_vabsd_bu(q1_in, q0_in); \ + q2_asub_q1_m = __lsx_vabsd_bu(q2_in, q1_in); \ + q3_asub_q2_m = __lsx_vabsd_bu(q3_in, q2_in); \ + p0_asub_q0_m = __lsx_vabsd_bu(p0_in, q0_in); \ + p1_asub_q1_m = __lsx_vabsd_bu(p1_in, q1_in); \ + \ + /* calculation of hev */ \ + flat_out = __lsx_vmax_bu(p1_asub_p0_m, q1_asub_q0_m); \ + hev_out = __lsx_vslt_bu(thresh_in, flat_out); \ + \ + /* calculation of mask */ \ + p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p0_asub_q0_m); \ + p1_asub_q1_m = __lsx_vsrli_b(p1_asub_q1_m, 1); \ + p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p1_asub_q1_m); \ + mask_out = __lsx_vslt_bu(b_limit_in, p0_asub_q0_m); \ + mask_out = __lsx_vmax_bu(flat_out, mask_out); \ + p3_asub_p2_m = __lsx_vmax_bu(p3_asub_p2_m, p2_asub_p1_m); \ + mask_out = __lsx_vmax_bu(p3_asub_p2_m, mask_out); \ + q2_asub_q1_m = __lsx_vmax_bu(q2_asub_q1_m, q3_asub_q2_m); \ + mask_out = __lsx_vmax_bu(q2_asub_q1_m, mask_out); \ + \ + mask_out = __lsx_vslt_bu(limit_in, mask_out); \ + mask_out = __lsx_vxori_b(mask_out, 0xff); \ + } + +#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \ + { \ + __m128i p2_asub_p0, q2_asub_q0, p3_asub_p0, q3_asub_q0; \ + __m128i flat4_tmp = __lsx_vldi(1); \ + \ + DUP4_ARG2(__lsx_vabsd_bu, p2_in, p0_in, q2_in, q0_in, p3_in, p0_in, q3_in, \ + q0_in, p2_asub_p0, q2_asub_q0, p3_asub_p0, q3_asub_q0); \ + p2_asub_p0 = __lsx_vmax_bu(p2_asub_p0, q2_asub_q0); \ + flat_out = __lsx_vmax_bu(p2_asub_p0, flat_out); \ + p3_asub_p0 = __lsx_vmax_bu(p3_asub_p0, q3_asub_q0); \ + flat_out = __lsx_vmax_bu(p3_asub_p0, flat_out); \ + \ + flat_out = __lsx_vslt_bu(flat4_tmp, flat_out); \ + flat_out = __lsx_vxori_b(flat_out, 0xff); \ + flat_out = flat_out & (mask); \ + } + +#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \ + q6_in, q7_in, flat_in, flat2_out) \ + { \ + __m128i flat5_tmp = __lsx_vldi(1); \ + __m128i p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0; \ + __m128i p6_asub_p0, q6_asub_q0, p7_asub_p0, q7_asub_q0; \ + DUP4_ARG2(__lsx_vabsd_bu, p4_in, p0_in, q4_in, q0_in, p5_in, p0_in, q5_in, \ + q0_in, p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0); \ + DUP4_ARG2(__lsx_vabsd_bu, p6_in, p0_in, q6_in, q0_in, p7_in, p0_in, q7_in, \ + q0_in, p6_asub_p0, q6_asub_q0, p7_asub_p0, q7_asub_q0); \ + \ + DUP2_ARG2(__lsx_vmax_bu, p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0, \ + p4_asub_p0, flat2_out); \ + flat2_out = __lsx_vmax_bu(p4_asub_p0, flat2_out); \ + p6_asub_p0 = __lsx_vmax_bu(p6_asub_p0, q6_asub_q0); \ + flat2_out = __lsx_vmax_bu(p6_asub_p0, flat2_out); \ + p7_asub_p0 = 
__lsx_vmax_bu(p7_asub_p0, q7_asub_q0); \ + flat2_out = __lsx_vmax_bu(p7_asub_p0, flat2_out); \ + flat2_out = __lsx_vslt_bu(flat5_tmp, flat2_out); \ + flat2_out = __lsx_vxori_b(flat2_out, 0xff); \ + flat2_out = flat2_out & flat_in; \ + } + +#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask, hev, p1_out, \ + p0_out, q0_out, q1_out) \ + { \ + __m128i p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \ + const __m128i cnst4b = __lsx_vldi(4); \ + const __m128i cnst3b = __lsx_vldi(3); \ + DUP4_ARG2(__lsx_vxori_b, p1_in, 0x80, p0_in, 0x80, q0_in, 0x80, q1_in, \ + 0x80, p1_m, p0_m, q0_m, q1_m); \ + filt = __lsx_vssub_b(p1_m, q1_m); \ + filt &= hev; \ + \ + q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt &= mask; \ + DUP2_ARG2(__lsx_vsadd_b, filt, cnst4b, filt, cnst3b, t1, t2); \ + DUP2_ARG2(__lsx_vsrai_b, t1, 3, t2, 3, t1, t2); \ + \ + q0_m = __lsx_vssub_b(q0_m, t1); \ + p0_m = __lsx_vsadd_b(p0_m, t2); \ + DUP2_ARG2(__lsx_vxori_b, q0_m, 0x80, p0_m, 0x80, q0_out, p0_out); \ + \ + filt = __lsx_vsrari_b(t1, 1); \ + hev = __lsx_vxori_b(hev, 0xff); \ + filt &= hev; \ + q1_m = __lsx_vssub_b(q1_m, filt); \ + p1_m = __lsx_vsadd_b(p1_m, filt); \ + DUP2_ARG2(__lsx_vxori_b, q1_m, 0x80, p1_m, 0x80, q1_out, p1_out); \ + } + +#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \ + q1_filt8_out, q2_filt8_out) \ + { \ + __m128i tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \ + \ + tmp_filt8_2 = __lsx_vadd_h(p2_in, p1_in); \ + tmp_filt8_2 = __lsx_vadd_h(tmp_filt8_2, p0_in); \ + tmp_filt8_0 = __lsx_vslli_h(p3_in, 1); \ + \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_2); \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, q0_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, p3_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, p2_in); \ + p2_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, p1_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, q1_in); \ + p1_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = __lsx_vadd_h(q2_in, q1_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, q0_in); \ + tmp_filt8_2 = __lsx_vadd_h(tmp_filt8_2, tmp_filt8_1); \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_2, p0_in); \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, p3_in); \ + p0_filt8_out = __lsx_vsrari_h(tmp_filt8_0, 3); \ + \ + tmp_filt8_0 = __lsx_vadd_h(q2_in, q3_in); \ + tmp_filt8_0 = __lsx_vadd_h(p0_in, tmp_filt8_0); \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_1); \ + tmp_filt8_1 = __lsx_vadd_h(q3_in, q3_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, tmp_filt8_0); \ + q2_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \ + \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_2, q3_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, q0_in); \ + q0_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = __lsx_vsub_h(tmp_filt8_0, p2_in); \ + tmp_filt8_0 = __lsx_vadd_h(q1_in, q3_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_1); \ + q1_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \ + } + +#endif // VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_ diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index c948e12a39..eb530db5a1 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -189,6 +189,8 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_horiz_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_vert_dspr2.c 
+DSP_SRCS-$(HAVE_LSX) += loongarch/loopfilter_16_lsx.c + ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_NEON) += arm/highbd_loopfilter_neon.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_loopfilter_sse2.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index c721e190b0..9cd58d3b80 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -442,7 +442,7 @@ () specialize qw/vpx_lpf_vertical_16 sse2 neon dspr2 msa/; add_proto qw/void vpx_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/vpx_lpf_vertical_16_dual sse2 neon dspr2 msa/; +specialize qw/vpx_lpf_vertical_16_dual sse2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa/; @@ -460,7 +460,7 @@ () specialize qw/vpx_lpf_horizontal_16 sse2 avx2 neon dspr2 msa/; add_proto qw/void vpx_lpf_horizontal_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/vpx_lpf_horizontal_16_dual sse2 avx2 neon dspr2 msa/; +specialize qw/vpx_lpf_horizontal_16_dual sse2 avx2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa/; From 1365e7e1a56f0a9af5fbd247a973206484bc8e2b Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Tue, 1 Mar 2022 23:03:27 -0800 Subject: [PATCH 215/926] vp9-svc: Remove VP9E_SET_TEMPORAL_LAYERING_MODE The control was never implemented, no need to keep this. temporal_layering_mode is set in the config. Bug: webm:1753 Change-Id: I9a6eb50e82344605ab62775911783af82ac2d401 --- vpx/vp8cx.h | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 6b02aa8657..5665a5f036 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -494,18 +494,6 @@ enum vp8e_enc_control_id { */ VP9E_SET_COLOR_SPACE, - /*!\brief Codec control function to set temporal layering mode. - * \note Valid ranges: 0..3, default is "0" - * (VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING). - * 0 = VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING - * 1 = VP9E_TEMPORAL_LAYERING_MODE_BYPASS - * 2 = VP9E_TEMPORAL_LAYERING_MODE_0101 - * 3 = VP9E_TEMPORAL_LAYERING_MODE_0212 - * - * Supported in codecs: VP9 - */ - VP9E_SET_TEMPORAL_LAYERING_MODE, - /*!\brief Codec control function to set minimum interval between GF/ARF frames * * By default the value is set as 4. @@ -1026,9 +1014,6 @@ VPX_CTRL_USE_TYPE(VP9E_REGISTER_CX_CALLBACK, void *) #define VPX_CTRL_VP9E_REGISTER_CX_CALLBACK VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_SPACE, int) #define VPX_CTRL_VP9E_SET_COLOR_SPACE -VPX_CTRL_USE_TYPE(VP9E_SET_TEMPORAL_LAYERING_MODE, - int) /* VP9E_TEMPORAL_LAYERING_MODE */ -#define VPX_CTRL_VP9E_SET_TEMPORAL_LAYERING_MODE VPX_CTRL_USE_TYPE(VP9E_SET_MIN_GF_INTERVAL, unsigned int) #define VPX_CTRL_VP9E_SET_MIN_GF_INTERVAL VPX_CTRL_USE_TYPE(VP9E_SET_MAX_GF_INTERVAL, unsigned int) From 624b1367004801639ed35759d5f1759a092c8410 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Tue, 1 Mar 2022 09:48:13 +0800 Subject: [PATCH 216/926] vp9[loongarch]: Optimize horizontal/vertical_8_c 1. vpx_lpf_vertical_8_lsx 2. 
vpx_lpf_horizontal_8_lsx Bug: webm:1755 Change-Id: I6b05d6b1b2ac4d2a75beb9c9ca9700976fc3af55 --- test/lpf_test.cc | 12 ++ vpx_dsp/loongarch/loopfilter_8_lsx.c | 199 +++++++++++++++++++++++++++ vpx_dsp/vpx_dsp.mk | 2 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 4 files changed, 215 insertions(+), 2 deletions(-) create mode 100644 vpx_dsp/loongarch/loopfilter_8_lsx.c diff --git a/test/lpf_test.cc b/test/lpf_test.cc index 62c6f30a07..833dfb9a89 100644 --- a/test/lpf_test.cc +++ b/test/lpf_test.cc @@ -692,4 +692,16 @@ INSTANTIATE_TEST_SUITE_P( &vpx_lpf_vertical_8_dual_c, 8))); #endif // HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH) +#if HAVE_LSX && (!CONFIG_VP9_HIGHBITDEPTH) +INSTANTIATE_TEST_SUITE_P( + LSX, Loop8Test6Param, + ::testing::Values( + make_tuple(&vpx_lpf_horizontal_8_lsx, &vpx_lpf_horizontal_8_c, 8), + make_tuple(&vpx_lpf_horizontal_16_dual_lsx, + &vpx_lpf_horizontal_16_dual_c, 8), + make_tuple(&vpx_lpf_vertical_8_lsx, &vpx_lpf_vertical_8_c, 8), + make_tuple(&vpx_lpf_vertical_16_dual_lsx, &vpx_lpf_vertical_16_dual_c, + 8))); +#endif // HAVE_LSX && (!CONFIG_VP9_HIGHBITDEPTH) + } // namespace diff --git a/vpx_dsp/loongarch/loopfilter_8_lsx.c b/vpx_dsp/loongarch/loopfilter_8_lsx.c new file mode 100644 index 0000000000..facf6f30ec --- /dev/null +++ b/vpx_dsp/loongarch/loopfilter_8_lsx.c @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/loopfilter_lsx.h" + +void vpx_lpf_horizontal_8_lsx(uint8_t *dst, int32_t stride, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + __m128i mask, hev, flat, thresh, b_limit, limit; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + __m128i p2_filter8, p1_filter8, p0_filter8; + __m128i q0_filter8, q1_filter8, q2_filter8; + __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l; + __m128i zero = __lsx_vldi(0); + + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + + /* load vector elements */ + DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst, + -stride, p3, p2, p1, p0); + q0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); + q3 = __lsx_vldx(dst, stride3); + + thresh = __lsx_vreplgr2vr_b(*thresh_ptr); + b_limit = __lsx_vreplgr2vr_b(*b_limit_ptr); + limit = __lsx_vreplgr2vr_b(*limit_ptr); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + flat = __lsx_vilvl_d(zero, flat); + + if (__lsx_bz_v(flat)) { + __lsx_vstelm_d(p1_out, dst - stride2, 0, 0); + __lsx_vstelm_d(p0_out, dst - stride, 0, 0); + __lsx_vstelm_d(q0_out, dst, 0, 0); + __lsx_vstelm_d(q1_out, dst + stride, 0, 0); + } else { + DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, + p1_l, p0_l); + DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, + q2_l, q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8, + p1_filter8, p0_filter8, 
q0_filter8, q1_filter8, q2_filter8); + + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, zero, p2_filter8, zero, p1_filter8, zero, + p0_filter8, zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8, + q0_filter8); + DUP2_ARG2(__lsx_vpickev_b, zero, q1_filter8, zero, q2_filter8, q1_filter8, + q2_filter8); + DUP4_ARG3(__lsx_vbitsel_v, p2, p2_filter8, flat, p1_out, p1_filter8, flat, + p0_out, p0_filter8, flat, q0_out, q0_filter8, flat, p2_out, + p1_out, p0_out, q0_out); + DUP2_ARG3(__lsx_vbitsel_v, q1_out, q1_filter8, flat, q2, q2_filter8, flat, + q1_out, q2_out); + dst -= stride3; + + __lsx_vstelm_d(p2_out, dst, 0, 0); + __lsx_vstelm_d(p1_out, dst + stride, 0, 0); + __lsx_vstelm_d(p0_out, dst + stride2, 0, 0); + __lsx_vstelm_d(q0_out, dst + stride3, 0, 0); + + dst += stride4; + __lsx_vstelm_d(q1_out, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(q2_out, dst, 0, 0); + } +} + +void vpx_lpf_vertical_8_lsx(uint8_t *dst, int32_t stride, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p1_out, p0_out, q0_out, q1_out; + __m128i flat, mask, hev, thresh, b_limit, limit; + __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; + __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; + __m128i vec0, vec1, vec2, vec3, vec4; + __m128i zero = __lsx_vldi(0); + + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + uint8_t *dst_tmp = dst - 4; + + /* load vector elements */ + p3 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p2, p1); + p0 = __lsx_vldx(dst_tmp, stride3); + dst_tmp += stride4; + q0 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q1, q2); + q3 = __lsx_vldx(dst_tmp, stride3); + + LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, + q3); + + thresh = __lsx_vreplgr2vr_b(*thresh_ptr); + b_limit = __lsx_vreplgr2vr_b(*b_limit_ptr); + limit = __lsx_vreplgr2vr_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + flat = __lsx_vilvl_d(zero, flat); + + /* if flat is zero for all pixels, then no need to calculate other filter */ + if (__lsx_bz_v(flat)) { + /* Store 4 pixels p1-_q1 */ + DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); + vec2 = __lsx_vilvl_h(vec1, vec0); + vec3 = __lsx_vilvh_h(vec1, vec0); + + dst -= 2; + __lsx_vstelm_w(vec2, dst, 0, 0); + __lsx_vstelm_w(vec2, dst + stride, 0, 1); + __lsx_vstelm_w(vec2, dst + stride2, 0, 2); + __lsx_vstelm_w(vec2, dst + stride3, 0, 3); + dst += stride4; + __lsx_vstelm_w(vec3, dst, 0, 0); + __lsx_vstelm_w(vec3, dst + stride, 0, 1); + __lsx_vstelm_w(vec3, dst + stride2, 0, 2); + __lsx_vstelm_w(vec3, dst + stride3, 0, 3); + } else { + DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, + p1_l, p0_l); + DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, + q2_l, q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l, + p0_filt8_l, p0_filt8_l, q0_filt8_l, 
q0_filt8_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l); + DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l, + q1_filt8_l, q2_filt8_l); + /* store pixel values */ + p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat); + p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat); + p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat); + q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat); + q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat); + q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat); + + /* Store 6 pixels p2-_q2 */ + DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1); + vec2 = __lsx_vilvl_h(vec1, vec0); + vec3 = __lsx_vilvh_h(vec1, vec0); + vec4 = __lsx_vilvl_b(q2, q1); + + dst -= 3; + __lsx_vstelm_w(vec2, dst, 0, 0); + __lsx_vstelm_h(vec4, dst, 4, 0); + dst += stride; + __lsx_vstelm_w(vec2, dst, 0, 1); + __lsx_vstelm_h(vec4, dst, 4, 1); + dst += stride; + __lsx_vstelm_w(vec2, dst, 0, 2); + __lsx_vstelm_h(vec4, dst, 4, 2); + dst += stride; + __lsx_vstelm_w(vec2, dst, 0, 3); + __lsx_vstelm_h(vec4, dst, 4, 3); + dst += stride; + __lsx_vstelm_w(vec3, dst, 0, 0); + __lsx_vstelm_h(vec4, dst, 4, 4); + dst += stride; + __lsx_vstelm_w(vec3, dst, 0, 1); + __lsx_vstelm_h(vec4, dst, 4, 5); + dst += stride; + __lsx_vstelm_w(vec3, dst, 0, 2); + __lsx_vstelm_h(vec4, dst, 4, 6); + dst += stride; + __lsx_vstelm_w(vec3, dst, 0, 3); + __lsx_vstelm_h(vec4, dst, 4, 7); + } +} diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index eb530db5a1..976c652729 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -189,7 +189,9 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_horiz_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_vert_dspr2.c +DSP_SRCS-$(HAVE_LSX) += loongarch/loopfilter_lsx.h DSP_SRCS-$(HAVE_LSX) += loongarch/loopfilter_16_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/loopfilter_8_lsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_NEON) += arm/highbd_loopfilter_neon.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 9cd58d3b80..ce0780fdab 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -445,7 +445,7 @@ () specialize qw/vpx_lpf_vertical_16_dual sse2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa/; +specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/vpx_lpf_vertical_8_dual sse2 neon dspr2 msa/; @@ -463,7 +463,7 @@ () specialize qw/vpx_lpf_horizontal_16_dual sse2 avx2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa/; +specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/vpx_lpf_horizontal_8_dual sse2 neon dspr2 msa/; From 642529248f873d9da8b86e368d9e3af85a2a77a3 Mon Sep 17 00:00:00 2001 From: Johann Date: Sun, 13 Mar 2022 06:28:16 +0900 Subject: [PATCH 217/926] ads2gas[_apple].pl: remove unused 
stanzas Many of the features in ads2gas are no longer used. Remove all patterns which are no longer used in libvpx. Simplify between the two to minimize differences. Change-Id: Ia1151eb8b694cbe51845a1374a876cc7b798899c --- build/make/ads2gas.pl | 122 +++++-------------------------- build/make/ads2gas_apple.pl | 113 ++++++---------------------- build/make/thumb.pm | 5 +- vpx_dsp/arm/idct4x4_add_neon.asm | 2 +- 4 files changed, 42 insertions(+), 200 deletions(-) diff --git a/build/make/ads2gas.pl b/build/make/ads2gas.pl index b6a8f53eae..4b7a906d26 100755 --- a/build/make/ads2gas.pl +++ b/build/make/ads2gas.pl @@ -42,39 +42,11 @@ while () { - undef $comment; - undef $line; - $comment_char = ";"; - $comment_sub = "@"; - - # Handle comments. - if (/$comment_char/) - { - $comment = ""; - ($line, $comment) = /(.*?)$comment_char(.*)/; - $_ = $line; - } - # Load and store alignment s/@/,:/g; - # Hexadecimal constants prefaced by 0x - s/#&/#0x/g; - - # Convert :OR: to | - s/:OR:/ | /g; - - # Convert :AND: to & - s/:AND:/ & /g; - - # Convert :NOT: to ~ - s/:NOT:/ ~ /g; - - # Convert :SHL: to << - s/:SHL:/ << /g; - - # Convert :SHR: to >> - s/:SHR:/ >> /g; + # Comment character + s/;/@/; # Convert ELSE to .else s/\bELSE\b/.else/g; @@ -82,82 +54,31 @@ # Convert ENDIF to .endif s/\bENDIF\b/.endif/g; - # Convert ELSEIF to .elseif - s/\bELSEIF\b/.elseif/g; - - # Convert LTORG to .ltorg - s/\bLTORG\b/.ltorg/g; - - # Convert endfunc to nothing. - s/\bendfunc\b//ig; - - # Convert FUNCTION to nothing. - s/\bFUNCTION\b//g; - s/\bfunction\b//g; - - s/\bENTRY\b//g; - s/\bMSARMASM\b/0/g; - s/^\s+end\s+$//g; - - # Convert IF :DEF:to .if - # gcc doesn't have the ability to do a conditional - # if defined variable that is set by IF :DEF: on - # armasm, so convert it to a normal .if and then - # make sure to define a value elesewhere - if (s/\bIF :DEF:\b/.if /g) - { - s/=/==/g; - } - # Convert IF to .if - if (s/\bIF\b/.if/g) - { + if (s/\bIF\b/.if/g) { s/=+/==/g; } # Convert INCLUDE to .INCLUDE "file" s/INCLUDE(\s*)(.*)$/.include $1\"$2\"/; - # Code directive (ARM vs Thumb) - s/CODE([0-9][0-9])/.code $1/; - # No AREA required # But ALIGNs in AREA must be obeyed s/^\s*AREA.*ALIGN=([0-9])$/.text\n.p2align $1/; # If no ALIGN, strip the AREA and align to 4 bytes s/^\s*AREA.*$/.text\n.p2align 2/; - # DCD to .word - # This one is for incoming symbols - s/DCD\s+\|(\w*)\|/.long $1/; - - # DCW to .short - s/DCW\s+\|(\w*)\|/.short $1/; - s/DCW(.*)/.short $1/; - - # Constants defined in scope - s/DCD(.*)/.long $1/; - s/DCB(.*)/.byte $1/; - - # Make function visible to linker, and make additional symbol with - # prepended underscore + # Make function visible to linker. 
if ($elf) { s/EXPORT\s+\|([\$\w]*)\|/.global $1 \n\t.type $1, function/; } else { s/EXPORT\s+\|([\$\w]*)\|/.global $1/; } - s/IMPORT\s+\|([\$\w]*)\|/.global $1/; - - s/EXPORT\s+([\$\w]*)/.global $1/; - s/export\s+([\$\w]*)/.global $1/; - # No vertical bars required; make additional symbol with prepended - # underscore - s/^\|(\$?\w+)\|/_$1\n\t$1:/g; + # No vertical bars on function names + s/^\|(\$?\w+)\|/$1/g; # Labels need trailing colon -# s/^(\w+)/$1:/ if !/EQU/; - # put the colon at the end of the line in the macro s/^([a-zA-Z_0-9\$]+)/$1:/ if !/EQU/; # ALIGN directive @@ -165,7 +86,7 @@ if ($thumb) { # ARM code - we force everything to thumb with the declaration in the header - s/\sARM//g; + s/\s+ARM//g; } else { # ARM code s/\sARM/.arm/g; @@ -175,12 +96,8 @@ s/(push\s+)(r\d+)/stmdb sp\!, \{$2\}/g; s/(pop\s+)(r\d+)/ldmia sp\!, \{$2\}/g; - # NEON code - s/(vld1.\d+\s+)(q\d+)/$1\{$2\}/g; - s/(vtbl.\d+\s+[^,]+),([^,]+)/$1,\{$2\}/g; - if ($thumb) { - thumb::FixThumbInstructions($_, 0); + thumb::FixThumbInstructions($_); } # eabi_attributes numerical equivalents can be found in the @@ -193,22 +110,21 @@ # PRESERVE8 Stack 8-byte align is preserved s/\sPRESERVE8/.eabi_attribute 25, 1 \@Tag_ABI_align_preserved/g; } else { - s/\sREQUIRE8//; - s/\sPRESERVE8//; + s/\s+REQUIRE8//; + s/\s+PRESERVE8//; } # Use PROC and ENDP to give the symbols a .size directive. # This makes them show up properly in debugging tools like gdb and valgrind. - if (/\bPROC\b/) - { + if (/\bPROC\b/) { my $proc; /^_([\.0-9A-Z_a-z]\w+)\b/; $proc = $1; push(@proc_stack, $proc) if ($proc); s/\bPROC\b/@ $&/; } - if (/\bENDP\b/) - { + + if (/\bENDP\b/) { my $proc; s/\bENDP\b/@ $&/; $proc = pop(@proc_stack); @@ -220,18 +136,18 @@ # Begin macro definition if (/\bMACRO\b/) { + # Process next line down, which will be the macro definition $_ = ; s/^/.macro/; - s/\$//g; # remove formal param reference - s/;/@/g; # change comment characters + s/\$//g; # Remove $ from the variables in the declaration } - # For macros, use \ to reference formal params - s/\$/\\/g; # End macro definition - s/\bMEND\b/.endm/; # No need to tell it where to stop assembling + s/\$/\\/g; # Use \ to reference formal parameters + # End macro definition + + s/\bMEND\b/.endm/; # No need to tell it where to stop assembling next if /^\s*END\s*$/; print; - print "$comment_sub$comment\n" if defined $comment; } # Mark that this object doesn't need an executable stack. 
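A minimal before/after sketch of the translation ads2gas.pl performs, with the input line assumed for illustration rather than taken from this patch: the ARM ADS directive "EXPORT |vpx_idct4x4_16_add_neon|" becomes ".global vpx_idct4x4_16_add_neon" followed by ".type vpx_idct4x4_16_add_neon, function" when $elf is set, ';' comments become '@' comments, and MACRO/MEND pairs become .macro/.endm, per the substitutions above.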
diff --git a/build/make/ads2gas_apple.pl b/build/make/ads2gas_apple.pl index 848872fa7d..0a3fccc4b0 100755 --- a/build/make/ads2gas_apple.pl +++ b/build/make/ads2gas_apple.pl @@ -22,15 +22,12 @@ print "@ using the ads2gas_apple.pl script.\n\n"; print "\t.syntax unified\n"; -my %register_aliases; my %macro_aliases; my @mapping_list = ("\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7", "\$8", "\$9"); my @incoming_array; -my @imported_functions; - # Perl trim function to remove whitespace from the start and end of the string sub trim($) { @@ -46,25 +43,7 @@ ($) s/@/,:/g; # Comment character - s/;/ @/g; - - # Hexadecimal constants prefaced by 0x - s/#&/#0x/g; - - # Convert :OR: to | - s/:OR:/ | /g; - - # Convert :AND: to & - s/:AND:/ & /g; - - # Convert :NOT: to ~ - s/:NOT:/ ~ /g; - - # Convert :SHL: to << - s/:SHL:/ << /g; - - # Convert :SHR: to >> - s/:SHR:/ >> /g; + s/;/@/; # Convert ELSE to .else s/\bELSE\b/.else/g; @@ -72,100 +51,53 @@ ($) # Convert ENDIF to .endif s/\bENDIF\b/.endif/g; - # Convert ELSEIF to .elseif - s/\bELSEIF\b/.elseif/g; - - # Convert LTORG to .ltorg - s/\bLTORG\b/.ltorg/g; - - # Convert IF :DEF:to .if - # gcc doesn't have the ability to do a conditional - # if defined variable that is set by IF :DEF: on - # armasm, so convert it to a normal .if and then - # make sure to define a value elesewhere - if (s/\bIF :DEF:\b/.if /g) - { - s/=/==/g; - } - # Convert IF to .if - if (s/\bIF\b/.if/g) - { - s/=/==/g; + if (s/\bIF\b/.if/g) { + s/=+/==/g; } # Convert INCLUDE to .INCLUDE "file" s/INCLUDE(\s*)(.*)$/.include $1\"$2\"/; - # Code directive (ARM vs Thumb) - s/CODE([0-9][0-9])/.code $1/; - # No AREA required # But ALIGNs in AREA must be obeyed s/^\s*AREA.*ALIGN=([0-9])$/.text\n.p2align $1/; # If no ALIGN, strip the AREA and align to 4 bytes s/^\s*AREA.*$/.text\n.p2align 2/; - # DCD to .word - # This one is for incoming symbols - s/DCD\s+\|(\w*)\|/.long $1/; + # Make function visible to linker. 
+ s/EXPORT\s+\|([\$\w]*)\|/.globl _$1/; - # DCW to .short - s/DCW\s+\|(\w*)\|/.short $1/; - s/DCW(.*)/.short $1/; + # No vertical bars on function names + s/^\|(\$?\w+)\|/$1/g; - # Constants defined in scope - s/DCD(.*)/.long $1/; - s/DCB(.*)/.byte $1/; + # Labels and functions need a leading underscore and trailing colon + s/^([a-zA-Z_0-9\$]+)/_$1:/ if !/EQU/; - # Make function visible to linker, and make additional symbol with - # prepended underscore - s/EXPORT\s+\|([\$\w]*)\|/.globl _$1\n\t.globl $1/; - - # Prepend imported functions with _ - if (s/IMPORT\s+\|([\$\w]*)\|/.globl $1/) - { - $function = trim($1); - push(@imported_functions, $function); - } - - foreach $function (@imported_functions) - { - s/$function/_$function/; - } - - # No vertical bars required; make additional symbol with prepended - # underscore - s/^\|(\$?\w+)\|/_$1\n\t$1:/g; - - # Labels need trailing colon -# s/^(\w+)/$1:/ if !/EQU/; - # put the colon at the end of the line in the macro - s/^([a-zA-Z_0-9\$]+)/$1:/ if !/EQU/; + # Branches need to call the correct, underscored, function + s/^(\s+b[egln]?[teq]?\s+)([a-zA-Z_0-9\$]+)/$1 _$2/ if !/EQU/; # ALIGN directive s/\bALIGN\b/.balign/g; # Strip ARM - s/\sARM/@ ARM/g; + s/\s+ARM//; # Strip REQUIRE8 - #s/\sREQUIRE8/@ REQUIRE8/g; - s/\sREQUIRE8/@ /g; + s/\s+REQUIRE8//; # Strip PRESERVE8 - s/\sPRESERVE8/@ PRESERVE8/g; + s/\s+PRESERVE8//; # Strip PROC and ENDPROC - s/\bPROC\b/@/g; - s/\bENDP\b/@/g; + s/\bPROC\b//g; + s/\bENDP\b//g; # EQU directive - s/(.*)EQU(.*)/.set $1, $2/; + s/(\S+\s+)EQU(\s+\S+)/.set $1, $2/; # Begin macro definition - if (/\bMACRO\b/) - { + if (/\bMACRO\b/) { # Process next line down, which will be the macro definition $_ = ; @@ -187,16 +119,13 @@ ($) next; } - while (($key, $value) = each(%macro_aliases)) - { + while (($key, $value) = each(%macro_aliases)) { $key =~ s/\$/\\\$/; s/$key\b/$value/g; } + # End macro definition - # For macros, use \ to reference formal params -# s/\$/\\/g; # End macro definition - s/\bMEND\b/.endm/; # No need to tell it where to stop assembling + s/\bMEND\b/.endm/; # No need to tell it where to stop assembling next if /^\s*END\s*$/; - print; } diff --git a/build/make/thumb.pm b/build/make/thumb.pm index 9c49e2d8b7..ef4b316771 100644 --- a/build/make/thumb.pm +++ b/build/make/thumb.pm @@ -11,11 +11,8 @@ package thumb; -sub FixThumbInstructions($$) +sub FixThumbInstructions($) { - my $short_branches = $_[1]; - my $branch_shift_offset = $short_branches ? 1 : 0; - # Write additions with shifts, such as "add r10, r11, lsl #8", # in three operand form, "add r10, r10, r11, lsl #8". s/(add\s+)(r\d+),\s*(r\d+),\s*(lsl #\d+)/$1$2, $2, $3, $4/g; diff --git a/vpx_dsp/arm/idct4x4_add_neon.asm b/vpx_dsp/arm/idct4x4_add_neon.asm index 184d218941..175ba7fbc2 100644 --- a/vpx_dsp/arm/idct4x4_add_neon.asm +++ b/vpx_dsp/arm/idct4x4_add_neon.asm @@ -17,7 +17,7 @@ INCLUDE vpx_dsp/arm/idct_neon.asm.S - AREA Block, CODE, READONLY ; name this block of code + AREA Block, CODE, READONLY ;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int stride) ; ; r0 int16_t input From 4ee32be84be7dfa2b0c00ba04f4d85503d46e3f3 Mon Sep 17 00:00:00 2001 From: Johann Date: Sun, 13 Mar 2022 07:02:03 +0900 Subject: [PATCH 218/926] ads2gas_apple.pl: remove gcc-isms The gcc assembler was incompatible for a long time. It is now based on clang and accepts more modern syntax, although not enough to remove the script entirely. 
Change-Id: I667d29dca005ea02a995c1025c45eb844081f64b --- build/make/ads2gas_apple.pl | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/build/make/ads2gas_apple.pl b/build/make/ads2gas_apple.pl index 0a3fccc4b0..af10b436a9 100755 --- a/build/make/ads2gas_apple.pl +++ b/build/make/ads2gas_apple.pl @@ -94,35 +94,17 @@ ($) s/\bENDP\b//g; # EQU directive - s/(\S+\s+)EQU(\s+\S+)/.set $1, $2/; + s/(\S+\s+)EQU(\s+\S+)/.equ $1, $2/; # Begin macro definition if (/\bMACRO\b/) { # Process next line down, which will be the macro definition $_ = ; - - $trimmed = trim($_); - - # remove commas that are separating list - $trimmed =~ s/,//g; - - # string to array - @incoming_array = split(/\s+/, $trimmed); - - print ".macro @incoming_array[0]\n"; - - # remove the first element, as that is the name of the macro - shift (@incoming_array); - - @macro_aliases{@incoming_array} = @mapping_list; - - next; + s/^/.macro/; + s/\$//g; # Remove $ from the variables in the declaration } - while (($key, $value) = each(%macro_aliases)) { - $key =~ s/\$/\\\$/; - s/$key\b/$value/g; - } + s/\$/\\/g; # Use \ to reference formal parameters # End macro definition s/\bMEND\b/.endm/; # No need to tell it where to stop assembling From 220643c8627d158f75acbf7e1b7dcd9ae642261c Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Tue, 1 Mar 2022 16:33:47 +0800 Subject: [PATCH 219/926] vp9[loongarch]: Optimize convolve8_horiz/vert/c 1. vpx_convolve8_lsx 2. vpx_convolve8_vert_lsx 3. vpx_convolve8_horiz_lsx Bug: webm:1755 Change-Id: I9897e1ed6a904ac74d1078bd22b275af44db142d --- test/convolve_test.cc | 13 + vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c | 852 +++++++++++++++++++ vpx_dsp/loongarch/vpx_convolve8_lsx.c | 718 ++++++++++++++++ vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c | 870 ++++++++++++++++++++ vpx_dsp/loongarch/vpx_convolve_lsx.h | 117 +++ vpx_dsp/vpx_dsp.mk | 6 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +- 7 files changed, 2579 insertions(+), 3 deletions(-) create mode 100644 vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c create mode 100644 vpx_dsp/loongarch/vpx_convolve8_lsx.c create mode 100644 vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c create mode 100644 vpx_dsp/loongarch/vpx_convolve_lsx.h diff --git a/test/convolve_test.cc b/test/convolve_test.cc index 4b2dadefac..94b2814842 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -1449,6 +1449,19 @@ INSTANTIATE_TEST_SUITE_P(MSA, ConvolveTest, ::testing::ValuesIn(kArrayConvolve8_msa)); #endif // HAVE_MSA +#if HAVE_LSX +const ConvolveFunctions convolve8_lsx( + vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_lsx, + vpx_convolve8_avg_horiz_c, vpx_convolve8_vert_lsx, vpx_convolve8_avg_vert_c, + vpx_convolve8_lsx, vpx_convolve8_avg_c, vpx_scaled_horiz_c, + vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, vpx_scaled_avg_vert_c, + vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); + +const ConvolveParam kArrayConvolve8_lsx[] = { ALL_SIZES(convolve8_lsx) }; +INSTANTIATE_TEST_SUITE_P(LSX, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve8_lsx)); +#endif // HAVE_LSX + #if HAVE_VSX const ConvolveFunctions convolve8_vsx( vpx_convolve_copy_vsx, vpx_convolve_avg_vsx, vpx_convolve8_horiz_vsx, diff --git a/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c new file mode 100644 index 0000000000..3608fe326c --- /dev/null +++ b/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c @@ -0,0 +1,852 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. 
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
+#define LSX_LD_4(_src, _stride, _src0, _src1, _src2, _src3) \
+  {                                                         \
+    _src0 = __lsx_vld(_src, 0);                             \
+    _src += _stride;                                        \
+    _src1 = __lsx_vld(_src, 0);                             \
+    _src += _stride;                                        \
+    _src2 = __lsx_vld(_src, 0);                             \
+    _src += _stride;                                        \
+    _src3 = __lsx_vld(_src, 0);                             \
+  }
+
+static const uint8_t mc_filt_mask_arr[16 * 3] = {
+  /* 8 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+  /* 4 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+  /* 4 width cases */
+  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+static void common_hz_8t_4x4_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter) {
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i out, out0, out1;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+  src -= 3;
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filter0, filter1, filter2, filter3, out0, out1);
+  out = __lsx_vssrarni_b_h(out1, out0, 7);
+  out = __lsx_vxori_b(out, 128);
+  __lsx_vstelm_w(out, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out, dst, 0, 3);
+}
+
+static void common_hz_8t_4x8_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter) {
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i out0, out1, out2, out3;
+  uint8_t *_src = (uint8_t *)src - 3;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  src0 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(_src, src_stride3);
+  _src += src_stride4;
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filter0, filter1, filter2, filter3, out0, out1);
+  src0 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(_src, src_stride3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filter0,
filter1, filter2, filter3, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 3); +} + +static void common_hz_8t_4w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + if (height == 4) { + common_hz_8t_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_hz_8t_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_8t_8x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) { + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, out0, out1, + out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); +} + +static void common_hz_8t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt = height >> 2; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3; + uint8_t *_src = (uint8_t *)src - 3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; loop_cnt--;) { + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + 
__lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + } +} + +static void common_hz_8t_8w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + if (height == 4) { + common_hz_8t_8x4_lsx(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_8t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_hz_8t_16w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt = height >> 1; + int32_t stride = src_stride << 1; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; loop_cnt--;) { + const uint8_t *_src = src + src_stride; + DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src0, src2); + DUP2_ARG2(__lsx_vld, src, 8, _src, 8, src1, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vst(out0, dst, 0); + dst += dst_stride; + __lsx_vst(out1, dst, 0); + dst += dst_stride; + src += stride; + } +} + +static void common_hz_8t_32w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt = height >> 1; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = __lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + src += src_stride; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); + + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = __lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + src += src_stride; + + dst += dst_stride; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, 
out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); + dst += dst_stride; + } +} + +static void common_hz_8t_64w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + int32_t loop_cnt = height; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = __lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); + + DUP2_ARG2(__lsx_vld, src, 32, src, 48, src0, src2); + src3 = __lsx_vld(src, 56); + src1 = __lsx_vshuf_b(src2, src0, shuff); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vst(out0, dst, 32); + __lsx_vst(out1, dst, 48); + src += src_stride; + dst += dst_stride; + } +} + +static void common_hz_2t_4x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, mask; + __m128i filt0, vec0, vec1, res0, res1; + __m128i vec2, vec3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride + dst_stride2; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, vec3); + DUP2_ARG2(__lsx_vsrari_h, vec2, FILTER_BITS, vec3, FILTER_BITS, vec2, vec3); + DUP2_ARG2(__lsx_vpickev_b, vec2, vec2, vec3, vec3, res0, res1); + + __lsx_vstelm_w(res0, dst, 0, 0); + __lsx_vstelm_w(res0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(res1, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(res1, dst + dst_stride3, 0, 1); +} + +static void common_hz_2t_4x8_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i vec0, vec1, vec2, vec3, filt0; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i res0, res1, res2, res3; + __m128i vec4, vec5, vec6, vec7; + int32_t src_stride2 = src_stride << 1; + int32_t 
src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride + dst_stride2; + + uint8_t *src_tmp1 = src + src_stride4; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src5, + src6); + src7 = __lsx_vldx(src_tmp1, src_stride3); + + DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, src5, src4, mask, + src7, src6, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vsrari_h, vec4, FILTER_BITS, vec5, FILTER_BITS, vec6, + FILTER_BITS, vec7, FILTER_BITS, vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vpickev_b, vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, + res0, res1, res2, res3); + __lsx_vstelm_w(res0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(res1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res1, dst, 0, 1); + dst += dst_stride; + + __lsx_vstelm_w(res2, dst, 0, 0); + __lsx_vstelm_w(res2, dst + dst_stride, 0, 1); + __lsx_vstelm_w(res3, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(res3, dst + dst_stride3, 0, 1); +} + +static void common_hz_2t_4w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (height == 4) { + common_hz_2t_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_hz_2t_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_2t_8x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i filt0, mask; + __m128i src0, src1, src2, src3; + __m128i vec0, vec1, vec2, vec3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask, + src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2, + FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3); + DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, src0, src1); + + __lsx_vstelm_d(src0, dst, 0, 0); + __lsx_vstelm_d(src0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(src1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(src1, dst + dst_stride3, 0, 1); +} + +static void common_hz_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + __m128i filt0, mask; + __m128i src0, src1, src2, src3, out0, out1; + __m128i vec0, vec1, vec2, vec3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t 
dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask, + src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2, + FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask, + src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2, + FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3); + DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, out0, out1); + + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + + if (height == 16) { + uint8_t *dst_tmp1 = dst + dst_stride4; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, + mask, src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2, + FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1); + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, + mask, src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2, + FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3); + DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, out0, out1); + + __lsx_vstelm_d(out0, dst_tmp1, 0, 0); + __lsx_vstelm_d(out0, dst_tmp1 + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst_tmp1 + dst_stride2, 0, 0); + __lsx_vstelm_d(out1, dst_tmp1 + dst_stride3, 0, 1); + } +} + +static void common_hz_2t_8w_lsx(const uint8_t *src, int32_t 
src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (height == 4) { + common_hz_2t_8x4_lsx(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_2t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_hz_2t_16w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 2) - 1; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7, tmp; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + uint8_t *src_tmp1 = src + 8; + mask = __lsx_vld(mc_filt_mask_arr, 0); + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4); + src6 = __lsx_vldx(src, src_stride3); + src1 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3, + src5); + src7 = __lsx_vldx(src_tmp1, src_stride3); + src += src_stride4; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask, + src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6, mask, + src7, src7, mask, vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0, + out4, out5, out6, out7); + DUP4_ARG2(__lsx_vsrari_h, out0, FILTER_BITS, out1, FILTER_BITS, out2, + FILTER_BITS, out3, FILTER_BITS, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vsrari_h, out4, FILTER_BITS, out5, FILTER_BITS, out6, + FILTER_BITS, out7, FILTER_BITS, out4, out5, out6, out7); + + tmp = __lsx_vpickev_b(out1, out0); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + tmp = __lsx_vpickev_b(out3, out2); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + tmp = __lsx_vpickev_b(out5, out4); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + tmp = __lsx_vpickev_b(out7, out6); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + + for (; loop_cnt--;) { + src_tmp1 += src_stride4; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4); + src6 = __lsx_vldx(src, src_stride3); + + src1 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3, + src5); + src7 = __lsx_vldx(src_tmp1, src_stride3); + src += src_stride4; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, + mask, src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6, + mask, src7, src7, mask, vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + DUP4_ARG2(__lsx_vsrari_h, out0, FILTER_BITS, out1, FILTER_BITS, out2, + FILTER_BITS, out3, FILTER_BITS, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vsrari_h, out4, FILTER_BITS, out5, FILTER_BITS, out6, + FILTER_BITS, out7, FILTER_BITS, out4, out5, out6, out7); + + tmp = __lsx_vpickev_b(out1, out0); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + tmp = __lsx_vpickev_b(out3, out2); + __lsx_vst(tmp, dst, 0); + dst 
+= dst_stride; + tmp = __lsx_vpickev_b(out5, out4); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + tmp = __lsx_vpickev_b(out7, out6); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + } +} + +static void common_hz_2t_32w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 1); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7, tmp; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = __lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src4, src6); + src7 = __lsx_vld(src, 24); + src5 = __lsx_vshuf_b(src6, src4, shuff); + src += src_stride; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, + mask, src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6, + mask, src7, src7, mask, vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + DUP4_ARG2(__lsx_vsrari_h, out0, FILTER_BITS, out1, FILTER_BITS, out2, + FILTER_BITS, out3, FILTER_BITS, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vsrari_h, out4, FILTER_BITS, out5, FILTER_BITS, out6, + FILTER_BITS, out7, FILTER_BITS, out4, out5, out6, out7); + + tmp = __lsx_vpickev_b(out1, out0); + __lsx_vst(tmp, dst, 0); + tmp = __lsx_vpickev_b(out3, out2); + __lsx_vst(tmp, dst, 16); + dst += dst_stride; + + tmp = __lsx_vpickev_b(out5, out4); + __lsx_vst(tmp, dst, 0); + tmp = __lsx_vpickev_b(out7, out6); + __lsx_vst(tmp, dst, 16); + dst += dst_stride; + } +} + +static void common_hz_2t_64w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = height; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7, tmp; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + for (; loop_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src2, src4, + src6); + src7 = __lsx_vld(src, 56); + DUP2_ARG3(__lsx_vshuf_b, src2, src0, shuff, src4, src2, shuff, src1, src3); + src5 = __lsx_vshuf_b(src6, src4, shuff); + src += src_stride; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, + mask, src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6, + mask, src7, src7, mask, vec4, vec5, vec6, vec7); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + DUP4_ARG2(__lsx_vsrari_h, out0, FILTER_BITS, out1, FILTER_BITS, out2, + FILTER_BITS, out3, FILTER_BITS, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vsrari_h, out4, 
FILTER_BITS, out5, FILTER_BITS, out6, + FILTER_BITS, out7, FILTER_BITS, out4, out5, out6, out7); + + tmp = __lsx_vpickev_b(out1, out0); + __lsx_vst(tmp, dst, 0); + tmp = __lsx_vpickev_b(out3, out2); + __lsx_vst(tmp, dst, 16); + tmp = __lsx_vpickev_b(out5, out4); + __lsx_vst(tmp, dst, 32); + tmp = __lsx_vpickev_b(out7, out6); + __lsx_vst(tmp, dst, 48); + dst += dst_stride; + } +} + +void vpx_convolve8_horiz_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16_t *const filter_x = filter[x0_q4]; + int8_t cnt, filt_hor[8]; + + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + } + if (vpx_get_filter_taps(filter_x) == 2) { + switch (w) { + case 4: + common_hz_2t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 8: + common_hz_2t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 16: + common_hz_2t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 32: + common_hz_2t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 64: + common_hz_2t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + default: + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_hz_8t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 8: + common_hz_8t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + + case 16: + common_hz_8t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + + case 32: + common_hz_8t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + + case 64: + common_hz_8t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + default: + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/vpx_dsp/loongarch/vpx_convolve8_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_lsx.c new file mode 100644 index 0000000000..51a162bf3e --- /dev/null +++ b/vpx_dsp/loongarch/vpx_convolve8_lsx.c @@ -0,0 +1,718 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
+#define LSX_LD_4(_src, _stride, _src0, _src1, _src2, _src3) \
+  {                                                         \
+    _src0 = __lsx_vld(_src, 0);                             \
+    _src += _stride;                                        \
+    _src1 = __lsx_vld(_src, 0);                             \
+    _src += _stride;                                        \
+    _src2 = __lsx_vld(_src, 0);                             \
+    _src += _stride;                                        \
+    _src3 = __lsx_vld(_src, 0);                             \
+  }
+
+static const uint8_t mc_filt_mask_arr[16 * 3] = {
+  /* 8 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+  /* 4 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+  /* 4 width cases */
+  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+static void common_hv_8ht_8vt_4w_lsx(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter_horiz, int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+  __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  __m128i out0, out1;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+  src -= (3 + 3 * src_stride);
+  DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
+            filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  src += src_stride;
+  src4 = __lsx_vld(src, 0);
+  src += src_stride;
+  src5 = __lsx_vld(src, 0);
+  src += src_stride;
+  src6 = __lsx_vld(src, 0);
+  src += src_stride;
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+  src6 = __lsx_vxori_b(src6, 128);
+
+  tmp0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  tmp2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  tmp4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  tmp5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3);
+  DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
+            filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+  DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+  tmp2 = __lsx_vpackev_b(tmp5, tmp4);
+
+  for (; loop_cnt--;) {
+    LSX_LD_4(src, src_stride, src7, src8, src9, src10);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+              src8, src9, src10);
+    tmp3 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff);
+    tmp4 = __lsx_vpackev_b(tmp3, tmp4);
+    out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    src1 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    src0 = __lsx_vshuf_b(src1, tmp3, shuff);
+    src0 = __lsx_vpackev_b(src1, src0);
+    out1 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    out0 = __lsx_vssrarni_b_h(out1, out0, 7);
+    out0 = __lsx_vxori_b(out0, 128);
+
__lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + + tmp5 = src1; + tmp0 = tmp2; + tmp1 = tmp4; + tmp2 = src0; + } +} + +static void common_hv_8ht_8vt_8w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3; + __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3; + __m128i mask0, mask1, mask2, mask3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; + __m128i out0, out1; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= (3 + 3 * src_stride); + DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4, + filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + src += src_stride; + src4 = __lsx_vld(src, 0); + src += src_stride; + src5 = __lsx_vld(src, 0); + src += src_stride; + src6 = __lsx_vld(src, 0); + src += src_stride; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + + src0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + + DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, + filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4, src2, src1, + tmp0, tmp1, tmp2, tmp4); + DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6); + + for (; loop_cnt--;) { + LSX_LD_4(src, src_stride, src7, src8, src9, src10); + src += src_stride; + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + src7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp3 = __lsx_vpackev_b(src7, src6); + out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src0 = __lsx_vpackev_b(src8, src7); + out1 = FILT_8TAP_DPADD_S_H(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src1 = __lsx_vpackev_b(src9, src8); + src3 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src10 = 
HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src2 = __lsx_vpackev_b(src10, src9); + src4 = FILT_8TAP_DPADD_S_H(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + + src6 = src10; + tmp0 = tmp2; + tmp1 = tmp3; + tmp2 = src1; + tmp4 = tmp6; + tmp5 = src0; + tmp6 = src2; + } +} + +static void common_hv_8ht_8vt_16w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + + common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; +} + +static void common_hv_8ht_8vt_32w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_8ht_8vt_64w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 8; multiple8_cnt--;) { + common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_2ht_2vt_4x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + __m128i src0, src1, src2, src3, src4, mask; + __m128i filt_vt, filt_hz, vec0, vec1, res0, res1; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + + hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); + hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2); + + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + DUP2_ARG2(__lsx_vpickev_b, tmp0, tmp0, tmp1, tmp1, res0, res1); + + __lsx_vstelm_w(res0, dst, 0, 0); + __lsx_vstelm_w(res0, dst + dst_stride, 0, 1); + 
__lsx_vstelm_w(res1, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(res1, dst + dst_stride3, 0, 1); +} + +static void common_hv_2ht_2vt_4x8_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; + __m128i res0, res1, res2, res3; + __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + __m128i hz_out7, hz_out8, vec4, vec5, vec6, vec7; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src5, src6, src7, src8); + src += src_stride4; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS); + hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS); + hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); + + DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff, + hz_out1, hz_out3); + hz_out5 = __lsx_vshuf_b(hz_out6, hz_out4, shuff); + hz_out7 = __lsx_vpickod_d(hz_out8, hz_out6); + DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out5, + hz_out4, hz_out7, hz_out6, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3, + filt_vt, vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vsrari_h, vec4, FILTER_BITS, vec5, FILTER_BITS, vec6, + FILTER_BITS, vec7, FILTER_BITS, vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vpickev_b, vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, + res0, res1, res2, res3); + + __lsx_vstelm_w(res0, dst, 0, 0); + __lsx_vstelm_w(res0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(res1, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(res1, dst + dst_stride3, 0, 1); + dst += dst_stride4; + __lsx_vstelm_w(res2, dst, 0, 0); + __lsx_vstelm_w(res2, dst + dst_stride, 0, 1); + __lsx_vstelm_w(res3, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(res3, dst + dst_stride3, 0, 1); +} + +static void common_hv_2ht_2vt_4w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + if (height == 4) { + common_hv_2ht_2vt_4x4_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); + } else if (height == 8) { + common_hv_2ht_2vt_4x8_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); + } +} + +static void common_hv_2ht_2vt_8x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + __m128i src0, src1, src2, src3, src4, mask, out0, out1; + __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3; + __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + + int32_t src_stride2 = 
src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec1 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec2 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec3 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt); + + DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, + FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, out0, out1); + + __lsx_vstelm_d(out0, dst, 0, 0); + __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1); +} + +static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt = (height >> 3); + __m128i src0, src1, src2, src3, src4, mask, out0, out1; + __m128i filt_hz, filt_vt, vec0; + __m128i hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt); + + src0 = __lsx_vld(src, 0); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); + + DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt); + + DUP2_ARG2(__lsx_vsrari_h, tmp3, FILTER_BITS, tmp4, FILTER_BITS, tmp3, tmp4); + 
DUP2_ARG2(__lsx_vpickev_b, tmp2, tmp1, tmp4, tmp3, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp5 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp6 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp7 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp8 = __lsx_vdp2_h_bu(vec0, filt_vt); + + DUP4_ARG2(__lsx_vsrari_h, tmp5, FILTER_BITS, tmp6, FILTER_BITS, tmp7, + FILTER_BITS, tmp8, FILTER_BITS, tmp5, tmp6, tmp7, tmp8); + DUP2_ARG2(__lsx_vpickev_b, tmp6, tmp5, tmp8, tmp7, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + } +} + +static void common_hv_2ht_2vt_8w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + if (height == 4) { + common_hv_2ht_2vt_8x4_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); + } else { + common_hv_2ht_2vt_8x8mult_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + } +} + +static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt_hz, filt_vt, vec0, vec1; + __m128i tmp, tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt); + + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + for (; loop_cnt--;) { + uint8_t *src_tmp0 = src + 8; + + DUP2_ARG2(__lsx_vld, src, 0, src_tmp0, 0, src0, src1); + DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp0, src_stride, src, + src_stride2, src_tmp0, src_stride2, src2, src3, src4, src5); + DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp0, src_stride3, src6, src7); + src += src_stride4; + + hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); + DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2); + tmp = __lsx_vpickev_b(tmp2, tmp1); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out2 
= HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); + DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2); + tmp = __lsx_vpickev_b(tmp2, tmp1); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + + hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); + DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2); + tmp = __lsx_vpickev_b(tmp2, tmp1); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); + DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2); + tmp = __lsx_vpickev_b(tmp2, tmp1); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + } +} + +static void common_hv_2ht_2vt_32w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 16; + dst += 16; + + common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 16; + dst += 16; +} + +static void common_hv_2ht_2vt_64w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 16; + dst += 16; + } +} + +void vpx_convolve8_lsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int32_t x_step_q4, int y0_q4, + int32_t y_step_q4, int32_t w, int32_t h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; + int8_t cnt, filt_hor[8], filt_ver[8]; + + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + filt_ver[cnt] = filter_y[cnt]; + } + + if (vpx_get_filter_taps(filter_x) == 2 && + vpx_get_filter_taps(filter_y) == 2) { + switch (w) { + case 4: + common_hv_2ht_2vt_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 8: + common_hv_2ht_2vt_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 16: + common_hv_2ht_2vt_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 32: + common_hv_2ht_2vt_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 64: + common_hv_2ht_2vt_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], 
(int32_t)h); + break; + default: + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else if (vpx_get_filter_taps(filter_x) == 2 || + vpx_get_filter_taps(filter_y) == 2) { + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); + } else { + switch (w) { + case 4: + common_hv_8ht_8vt_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 8: + common_hv_8ht_8vt_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 16: + common_hv_8ht_8vt_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 32: + common_hv_8ht_8vt_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 64: + common_hv_8ht_8vt_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + default: + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c new file mode 100644 index 0000000000..c0bb10f3b7 --- /dev/null +++ b/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c @@ -0,0 +1,870 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
+static void common_vt_8t_4w_lsx(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height >> 2;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  __m128i reg0, reg1, reg2, reg3, reg4;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i out0, out1;
+  uint8_t *_src = (uint8_t *)src - src_stride3;
+
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+  src0 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(_src, src_stride3);
+  _src += src_stride4;
+  src4 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
+  _src += src_stride3;
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, tmp0,
+            tmp1, tmp2, tmp3);
+  DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5);
+  DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1);
+  reg2 = __lsx_vilvl_d(tmp5, tmp2);
+  /* xor with 128 biases the unsigned pixels into the signed range used by
+     the signed dot-product instructions */
+  DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1);
+  reg2 = __lsx_vxori_b(reg2, 128);
+
+  for (; loop_cnt--;) {
+    src7 = __lsx_vld(_src, 0);
+    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
+    src10 = __lsx_vldx(_src, src_stride3);
+    _src += src_stride4;
+    DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+              tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4);
+    DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4);
+    out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, reg3, filter0, filter1,
+                               filter2, filter3);
+    out1 = FILT_8TAP_DPADD_S_H(reg1, reg2, reg3, reg4, filter0, filter1,
+                               filter2, filter3);
+    out0 = __lsx_vssrarni_b_h(out1, out0, 7);
+    out0 = __lsx_vxori_b(out0, 128);
+    __lsx_vstelm_w(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 3);
+    dst += dst_stride;
+
+    reg0 = reg2;
+    reg1 = reg3;
+    reg2 = reg4;
+    src6 = src10;
+  }
+}
+
+static void common_vt_8t_8w_lsx(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height >> 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i out0, out1, out2, out3;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  src = src - src_stride3;
+
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+  src += src_stride4;
+  src4 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+  src += src_stride3;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  DUP2_ARG2(__lsx_vxori_b, src4, 128,
src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0, + reg1, reg2, reg3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5); + + for (; loop_cnt--;) { + src7 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src8, src9); + src10 = __lsx_vldx(src, src_stride3); + src += src_stride4; + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + tmp0, tmp1, tmp2, tmp3); + out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, tmp0, filter0, filter1, + filter2, filter3); + out1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, tmp1, filter0, filter1, + filter2, filter3); + out2 = FILT_8TAP_DPADD_S_H(reg1, reg2, tmp0, tmp2, filter0, filter1, + filter2, filter3); + out3 = FILT_8TAP_DPADD_S_H(reg4, reg5, tmp1, tmp3, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + + reg0 = reg2; + reg1 = tmp0; + reg2 = tmp2; + reg3 = reg5; + reg4 = tmp1; + reg5 = tmp3; + src6 = src10; + } +} + +static void common_vt_8t_16w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = height >> 2; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filter0, filter1, filter2, filter3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5; + __m128i reg6, reg7, reg8, reg9, reg10, reg11; + __m128i tmp0, tmp1, tmp2, tmp3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + // uint8_t *_src = (uint8_t *)src - src_stride3; + src -= src_stride3; + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src += src_stride3; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0, + reg1, reg2, reg3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5); + DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1, reg6, + reg7, reg8, reg9); + DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11); + + for (; loop_cnt--;) { + src7 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src8, src9); + src10 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9, + src4, src5, src7, src8); + tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, 
src0, filter0, filter1, + filter2, filter3); + tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0, filter1, + filter2, filter3); + tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0, filter1, + filter2, filter3); + tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + __lsx_vst(tmp1, dst, 0); + dst += dst_stride; + tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0, filter1, + filter2, filter3); + tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0, filter1, + filter2, filter3); + tmp2 = FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0, filter1, + filter2, filter3); + tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + __lsx_vst(tmp1, dst, 0); + dst += dst_stride; + + reg0 = reg2; + reg1 = src0; + reg2 = src2; + reg3 = reg5; + reg4 = src1; + reg5 = src3; + reg6 = reg8; + reg7 = src4; + reg8 = src7; + reg9 = reg11; + reg10 = src5; + reg11 = src8; + src6 = src10; + } +} + +static void common_vt_8t_16w_mult_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height, + int32_t width) { + uint8_t *src_tmp; + uint8_t *dst_tmp; + uint32_t cnt = width >> 4; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filter0, filter1, filter2, filter3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5; + __m128i reg6, reg7, reg8, reg9, reg10, reg11; + __m128i tmp0, tmp1, tmp2, tmp3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + src -= src_stride3; + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; cnt--;) { + uint32_t loop_cnt = height >> 2; + + src_tmp = src; + dst_tmp = dst; + + src0 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src1, + src2); + src3 = __lsx_vldx(src_tmp, src_stride3); + src_tmp += src_stride4; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src_tmp += src_stride3; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, + reg0, reg1, reg2, reg3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5); + DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1, + reg6, reg7, reg8, reg9); + DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11); + + for (; loop_cnt--;) { + src7 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src8, + src9); + src10 = __lsx_vldx(src_tmp, src_stride3); + src_tmp += src_stride4; + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, + src7, src8, src9, src10); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, 
src8, src7, src9, src8, src10, src9, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9, + src4, src5, src7, src8); + tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, src0, filter0, filter1, + filter2, filter3); + tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0, filter1, + filter2, filter3); + tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0, filter1, + filter2, filter3); + tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vst(tmp0, dst_tmp, 0); + __lsx_vstx(tmp1, dst_tmp, dst_stride); + tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0, filter1, + filter2, filter3); + tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0, filter1, + filter2, filter3); + tmp2 = FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0, filter1, + filter2, filter3); + tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vstx(tmp0, dst_tmp, dst_stride2); + __lsx_vstx(tmp1, dst_tmp, dst_stride3); + dst_tmp += dst_stride4; + + reg0 = reg2; + reg1 = src0; + reg2 = src2; + reg3 = reg5; + reg4 = src1; + reg5 = src3; + reg6 = reg8; + reg7 = src4; + reg8 = src7; + reg9 = reg11; + reg10 = src5; + reg11 = src8; + src6 = src10; + } + src += 16; + dst += 16; + } +} + +static void common_vt_8t_32w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_16w_mult_lsx(src, src_stride, dst, dst_stride, filter, height, + 32); +} + +static void common_vt_8t_64w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_16w_mult_lsx(src, src_stride, dst, dst_stride, filter, height, + 64); +} + +static void common_vt_2t_4x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, src4; + __m128i src10_l, src32_l, src21_l, src43_l, src2110, src4332; + __m128i filt0, tmp0, tmp1; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += (src_stride4 + src_stride); + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + src10_l, src21_l, src32_l, src43_l); + DUP2_ARG2(__lsx_vilvl_d, src21_l, src10_l, src43_l, src32_l, src2110, + src4332); + DUP2_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + src2110 = __lsx_vpickev_b(tmp1, tmp0); + + __lsx_vstelm_w(src2110, dst, 0, 0); + __lsx_vstelm_w(src2110, dst + dst_stride, 0, 1); + __lsx_vstelm_w(src2110, dst + dst_stride2, 0, 2); + __lsx_vstelm_w(src2110, dst + dst_stride3, 0, 3); +} + +static void common_vt_2t_4x8_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, 
src4, src5, src6, src7, src8; + __m128i src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; + __m128i src65_l, src87_l, src2110, src4332, src6554, src8776; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i filt0; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + uint8_t *dst_tmp1 = dst + dst_stride4; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src5, src6, src7, src8); + src += (src_stride4 + src_stride); + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + src10_l, src21_l, src32_l, src43_l); + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, + src54_l, src65_l, src76_l, src87_l); + DUP4_ARG2(__lsx_vilvl_d, src21_l, src10_l, src43_l, src32_l, src65_l, src54_l, + src87_l, src76_l, src2110, src4332, src6554, src8776); + DUP4_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, src6554, filt0, + src8776, filt0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, + FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, src2110, src4332); + + __lsx_vstelm_w(src2110, dst, 0, 0); + __lsx_vstelm_w(src2110, dst + dst_stride, 0, 1); + __lsx_vstelm_w(src2110, dst + dst_stride2, 0, 2); + __lsx_vstelm_w(src2110, dst + dst_stride3, 0, 3); + + __lsx_vstelm_w(src4332, dst_tmp1, 0, 0); + __lsx_vstelm_w(src4332, dst_tmp1 + dst_stride, 0, 1); + __lsx_vstelm_w(src4332, dst_tmp1 + dst_stride2, 0, 2); + __lsx_vstelm_w(src4332, dst_tmp1 + dst_stride3, 0, 3); +} + +static void common_vt_2t_4w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (height == 4) { + common_vt_2t_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_vt_2t_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_vt_2t_8x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0; + __m128i out0, out1; + __m128i tmp0, tmp1, tmp2, tmp3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + /* rearranging filter_y */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0, + vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, + FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, out0, out1); + + __lsx_vstelm_d(out0, dst, 0, 0); + __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); + 
__lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1); +} + +static void common_vt_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 3); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + __m128i out0, out1; + __m128i tmp0, tmp1, tmp2, tmp3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src5 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src6, src7) + src8 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, + vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, + FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1); + dst += dst_stride4; + + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, + FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1); + dst += dst_stride4; + + src0 = src8; + } +} + +static void common_vt_2t_8w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (height == 4) { + common_vt_2t_8x4_lsx(src, src_stride, dst, dst_stride, filter); + } else { + common_vt_2t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_vt_2t_16w_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int y0_q4, + int y_step_q4, int w, int height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + const int16_t *const filter_y = filter[y0_q4]; + int8_t cnt, filt_ver[8]; + + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 8; cnt--;) { + filt_ver[cnt] = filter_y[cnt]; + } + + filt0 = __lsx_vldrepl_h(&filt_ver[3], 0); + + src0 = 
__lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + tmp4 = __lsx_vpickev_b(tmp1, tmp0); + __lsx_vst(tmp4, dst, 0); + dst += dst_stride; + + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3); + DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); + tmp4 = __lsx_vpickev_b(tmp3, tmp2); + __lsx_vst(tmp4, dst, 0); + dst += dst_stride; + + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + tmp4 = __lsx_vpickev_b(tmp1, tmp0); + __lsx_vst(tmp4, dst, 0); + dst += dst_stride; + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp2, tmp3); + DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); + tmp4 = __lsx_vpickev_b(tmp3, tmp2); + __lsx_vst(tmp4, dst, 0); + dst += dst_stride; + + src0 = src4; + } +} + +static void common_vt_2t_32w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + uint8_t *src_tmp; + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + src5 = __lsx_vld(src, 16); + src += src_stride; + src_tmp = src + 16; + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src_tmp, 0, src1, src6); + DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src, + src_stride2, src_tmp, src_stride2, src2, src7, src3, src8); + DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, src4, src9); + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + src += src_stride4; + src_tmp += src_stride4; + + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + tmp4 = __lsx_vpickev_b(tmp1, tmp0); + __lsx_vst(tmp4, dst, 0); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3); + DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); + tmp4 = __lsx_vpickev_b(tmp3, tmp2); + __lsx_vstx(tmp4, dst, dst_stride); + + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + tmp4 = __lsx_vpickev_b(tmp1, tmp0); + __lsx_vstx(tmp4, dst, dst_stride2); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp2, tmp3); + DUP2_ARG2(__lsx_vsrari_h, 
tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); + tmp4 = __lsx_vpickev_b(tmp3, tmp2); + __lsx_vstx(tmp4, dst, dst_stride3); + + DUP2_ARG2(__lsx_vilvl_b, src6, src5, src7, src6, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src6, src5, src7, src6, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + tmp4 = __lsx_vpickev_b(tmp1, tmp0); + __lsx_vst(tmp4, dst, 16); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3); + DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); + dst += dst_stride; + tmp4 = __lsx_vpickev_b(tmp3, tmp2); + __lsx_vst(tmp4, dst, 16); + + DUP2_ARG2(__lsx_vilvl_b, src8, src7, src9, src8, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src8, src7, src9, src8, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + dst += dst_stride; + tmp4 = __lsx_vpickev_b(tmp1, tmp0); + __lsx_vst(tmp4, dst, 16); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp2, tmp3); + DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); + dst += dst_stride; + tmp4 = __lsx_vpickev_b(tmp3, tmp2); + __lsx_vst(tmp4, dst, 16); + + dst += dst_stride; + + src0 = src4; + src5 = src9; + } +} + +static void common_vt_2t_64w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 1); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + __m128i tmp, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + int32_t src_stride2 = src_stride << 1; + int32_t dst_stride2 = dst_stride << 1; + uint8_t *dst_tmp1 = dst + dst_stride; + + filt0 = __lsx_vldrepl_h(filter, 0); + + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src3, src6, + src9); + src += src_stride; + + for (; loop_cnt--;) { + uint8_t *src_tmp0 = src + src_stride; + + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src1, src4, src7, + src10); + DUP4_ARG2(__lsx_vld, src_tmp0, 0, src_tmp0, 16, src_tmp0, 32, src_tmp0, 48, + src2, src5, src8, src11); + src += src_stride2; + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + tmp = __lsx_vpickev_b(tmp1, tmp0); + __lsx_vst(tmp, dst, 0); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3); + DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); + tmp = __lsx_vpickev_b(tmp3, tmp2); + __lsx_vst(tmp, dst_tmp1, 0); + + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src5, src4, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src4, src3, src5, src4, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp4, tmp5); + DUP2_ARG2(__lsx_vsrari_h, tmp4, FILTER_BITS, tmp5, FILTER_BITS, tmp4, tmp5); + tmp = __lsx_vpickev_b(tmp5, tmp4); + __lsx_vst(tmp, dst, 16); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp6, tmp7); + DUP2_ARG2(__lsx_vsrari_h, tmp6, FILTER_BITS, tmp7, FILTER_BITS, tmp6, tmp7); + tmp = __lsx_vpickev_b(tmp7, tmp6); + __lsx_vst(tmp, dst_tmp1, 16); + + DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, 
vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + tmp = __lsx_vpickev_b(tmp1, tmp0); + __lsx_vst(tmp, dst, 32); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3); + DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); + tmp = __lsx_vpickev_b(tmp3, tmp2); + __lsx_vst(tmp, dst_tmp1, 32); + + DUP2_ARG2(__lsx_vilvl_b, src10, src9, src11, src10, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src10, src9, src11, src10, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp4, tmp5); + DUP2_ARG2(__lsx_vsrari_h, tmp4, FILTER_BITS, tmp5, FILTER_BITS, tmp4, tmp5); + tmp = __lsx_vpickev_b(tmp5, tmp4); + __lsx_vst(tmp, dst, 48); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp6, tmp7); + DUP2_ARG2(__lsx_vsrari_h, tmp6, FILTER_BITS, tmp7, FILTER_BITS, tmp6, tmp7); + tmp = __lsx_vpickev_b(tmp7, tmp6); + __lsx_vst(tmp, dst_tmp1, 48); + dst += dst_stride2; + dst_tmp1 += dst_stride2; + + src0 = src2; + src3 = src5; + src6 = src8; + src9 = src11; + } +} + +void vpx_convolve8_vert_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16_t *const filter_y = filter[y0_q4]; + int8_t cnt, filt_ver[8]; + + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 8; cnt--;) { + filt_ver[cnt] = filter_y[cnt]; + } + + if (vpx_get_filter_taps(filter_y) == 2) { + switch (w) { + case 4: + common_vt_2t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 8: + common_vt_2t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 16: + common_vt_2t_16w_lsx(src, src_stride, dst, dst_stride, filter, y0_q4, + y_step_q4, w, h); + break; + case 32: + common_vt_2t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 64: + common_vt_2t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + default: + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_vt_8t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 8: + common_vt_8t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 16: + common_vt_8t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 32: + common_vt_8t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 64: + common_vt_8t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + default: + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/vpx_dsp/loongarch/vpx_convolve_lsx.h b/vpx_dsp/loongarch/vpx_convolve_lsx.h new file mode 100644 index 0000000000..2fdb93db84 --- /dev/null +++ b/vpx_dsp/loongarch/vpx_convolve_lsx.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_ +#define VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_ + +#include "vpx_util/loongson_intrinsics.h" +#include "vpx_dsp/vpx_filter.h" + +#define FILT_8TAP_DPADD_S_H(_reg0, _reg1, _reg2, _reg3, _filter0, _filter1, \ + _filter2, _filter3) \ + ({ \ + __m128i _vec0, _vec1; \ + \ + _vec0 = __lsx_vdp2_h_b(_reg0, _filter0); \ + _vec0 = __lsx_vdp2add_h_b(_vec0, _reg1, _filter1); \ + _vec1 = __lsx_vdp2_h_b(_reg2, _filter2); \ + _vec1 = __lsx_vdp2add_h_b(_vec1, _reg3, _filter3); \ + _vec0 = __lsx_vsadd_h(_vec0, _vec1); \ + \ + _vec0; \ + }) + +#define HORIZ_8TAP_FILT(_src0, _src1, _mask0, _mask1, _mask2, _mask3, \ + _filt_h0, _filt_h1, _filt_h2, _filt_h3) \ + ({ \ + __m128i _tmp0, _tmp1, _tmp2, _tmp3; \ + __m128i _out; \ + \ + DUP4_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src1, _src0, _mask1, \ + _src1, _src0, _mask2, _src1, _src0, _mask3, _tmp0, _tmp1, _tmp2, \ + _tmp3); \ + _out = FILT_8TAP_DPADD_S_H(_tmp0, _tmp1, _tmp2, _tmp3, _filt_h0, _filt_h1, \ + _filt_h2, _filt_h3); \ + _out = __lsx_vsrari_h(_out, FILTER_BITS); \ + _out = __lsx_vsat_h(_out, 7); \ + \ + _out; \ + }) + +#define HORIZ_8TAP_4WID_4VECS_FILT(_src0, _src1, _src2, _src3, _mask0, _mask1, \ + _mask2, _mask3, _filter0, _filter1, \ + _filter2, _filter3, _out0, _out1) \ + { \ + __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \ + __m128i _reg0, _reg1, _reg2, _reg3; \ + \ + DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src3, _src2, _mask0, \ + _tmp0, _tmp1); \ + DUP2_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _reg0, _reg1); \ + DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask1, _src3, _src2, _mask1, \ + _tmp2, _tmp3); \ + DUP2_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp2, _filter1, _reg1, _tmp3, \ + _filter1, _reg0, _reg1); \ + DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask2, _src3, _src2, _mask2, \ + _tmp4, _tmp5); \ + DUP2_ARG2(__lsx_vdp2_h_b, _tmp4, _filter2, _tmp5, _filter2, _reg2, _reg3); \ + DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask3, _src3, _src2, _mask3, \ + _tmp6, _tmp7); \ + DUP2_ARG3(__lsx_vdp2add_h_b, _reg2, _tmp6, _filter3, _reg3, _tmp7, \ + _filter3, _reg2, _reg3); \ + DUP2_ARG2(__lsx_vsadd_h, _reg0, _reg2, _reg1, _reg3, _out0, _out1); \ + } + +#define HORIZ_8TAP_8WID_4VECS_FILT( \ + _src0, _src1, _src2, _src3, _mask0, _mask1, _mask2, _mask3, _filter0, \ + _filter1, _filter2, _filter3, _out0, _out1, _out2, _out3) \ + { \ + __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \ + __m128i _reg0, _reg1, _reg2, _reg3, _reg4, _reg5, _reg6, _reg7; \ + \ + DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask0, _src1, _src1, _mask0, \ + _src2, _src2, _mask0, _src3, _src3, _mask0, _tmp0, _tmp1, _tmp2, \ + _tmp3); \ + DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _tmp2, \ + _filter0, _tmp3, _filter0, _reg0, _reg1, _reg2, _reg3); \ + DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask2, _src1, _src1, _mask2, \ + _src2, _src2, _mask2, _src3, _src3, _mask2, _tmp0, _tmp1, _tmp2, \ + _tmp3); \ + DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter2, _tmp1, _filter2, _tmp2, \ + _filter2, _tmp3, _filter2, _reg4, _reg5, _reg6, _reg7); \ + DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask1, _src1, _src1, _mask1, \ + _src2, _src2, _mask1, _src3, _src3, _mask1, _tmp4, _tmp5, _tmp6, \ + _tmp7); \ + DUP4_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp4, _filter1, _reg1, _tmp5, \ + _filter1, _reg2, _tmp6, 
_filter1, _reg3, _tmp7, _filter1, _reg0, \ + _reg1, _reg2, _reg3); \ + DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask3, _src1, _src1, _mask3, \ + _src2, _src2, _mask3, _src3, _src3, _mask3, _tmp4, _tmp5, _tmp6, \ + _tmp7); \ + DUP4_ARG3(__lsx_vdp2add_h_b, _reg4, _tmp4, _filter3, _reg5, _tmp5, \ + _filter3, _reg6, _tmp6, _filter3, _reg7, _tmp7, _filter3, _reg4, \ + _reg5, _reg6, _reg7); \ + DUP4_ARG2(__lsx_vsadd_h, _reg0, _reg4, _reg1, _reg5, _reg2, _reg6, _reg3, \ + _reg7, _out0, _out1, _out2, _out3); \ + } + +#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \ + ({ \ + __m128i tmp0_m; \ + __m128i tmp1_m; \ + \ + tmp0_m = __lsx_vshuf_b(in1, in0, mask); \ + tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff); \ + tmp1_m = __lsx_vsrari_h(tmp1_m, shift); \ + \ + tmp1_m; \ + }) + +#endif // VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_ diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 976c652729..91a983d097 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -163,6 +163,12 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_vert_dspr2.c DSP_SRCS-$(HAVE_VSX) += ppc/vpx_convolve_vsx.c +# common (lsx) +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_horiz_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_vert_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_lsx.h + # loop filters DSP_SRCS-yes += loopfilter.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index ce0780fdab..49eea44389 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -374,13 +374,13 @@ () specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx mmi/; add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; +specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; +specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; +specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; From 31441d45f76819bb80dfc76f0a0f59f2501239e8 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Thu, 10 Mar 2022 14:56:42 +0800 Subject: [PATCH 220/926] vp9[loongarch]: Optimize convolve/convolve8_avg_c 1. vpx_convolve8_avg_lsx 2. 
vpx_convolve_avg_lsx Bug: webm:1755 Change-Id: I4af5c362a94f11d0b5d1760e18326660bdbc0559 --- test/convolve_test.cc | 4 +- vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c | 748 ++++++++++++++++++++++ vpx_dsp/loongarch/vpx_convolve_avg_lsx.c | 321 ++++++++++ vpx_dsp/vpx_dsp.mk | 2 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 5 files changed, 1075 insertions(+), 4 deletions(-) create mode 100644 vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c create mode 100644 vpx_dsp/loongarch/vpx_convolve_avg_lsx.c diff --git a/test/convolve_test.cc b/test/convolve_test.cc index 94b2814842..a631ec77f7 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -1451,9 +1451,9 @@ INSTANTIATE_TEST_SUITE_P(MSA, ConvolveTest, #if HAVE_LSX const ConvolveFunctions convolve8_lsx( - vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_lsx, + vpx_convolve_copy_c, vpx_convolve_avg_lsx, vpx_convolve8_horiz_lsx, vpx_convolve8_avg_horiz_c, vpx_convolve8_vert_lsx, vpx_convolve8_avg_vert_c, - vpx_convolve8_lsx, vpx_convolve8_avg_c, vpx_scaled_horiz_c, + vpx_convolve8_lsx, vpx_convolve8_avg_lsx, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); diff --git a/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c new file mode 100644 index 0000000000..27f5b5ca4f --- /dev/null +++ b/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c @@ -0,0 +1,748 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
+static void common_hv_8ht_8vt_and_aver_dst_4w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) {
+  uint32_t loop_cnt = height >> 2;
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+  __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  __m128i out0, out1;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  uint8_t *_src = (uint8_t *)src - 3 - src_stride3;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+  DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
+            filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+
+  src0 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(_src, src_stride3);
+  _src += src_stride4;
+  src4 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
+  _src += src_stride3;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+  src6 = __lsx_vxori_b(src6, 128);
+
+  tmp0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  tmp2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  tmp4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  tmp5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3);
+  DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
+            filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+  DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+  tmp2 = __lsx_vpackev_b(tmp5, tmp4);
+  for (; loop_cnt--;) {
+    src7 = __lsx_vld(_src, 0);
+    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
+    src10 = __lsx_vldx(_src, src_stride3);
+    _src += src_stride4;
+    /* load the four 4-byte dst rows that the filtered result is averaged
+       with */
+    src2 = __lsx_vldrepl_w(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src3 = __lsx_vldrepl_w(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src4 = __lsx_vldrepl_w(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src5 = __lsx_vldrepl_w(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_w, src3, src2, src5, src4, src2, src3);
+    src2 = __lsx_vilvl_d(src3, src2);
+    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+              src8, src9, src10);
+    tmp3 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff);
+    tmp4 = __lsx_vpackev_b(tmp3, tmp4);
+    out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    src1 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    src0 = __lsx_vshuf_b(src1, tmp3, shuff);
+    src0 = __lsx_vpackev_b(src1, src0);
+    out1 =
FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + out0 = __lsx_vssrarni_b_h(out1, out0, 7); + out0 = __lsx_vxori_b(out0, 128); + out0 = __lsx_vavgr_bu(out0, src2); + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + + tmp5 = src1; + tmp0 = tmp2; + tmp1 = tmp4; + tmp2 = src0; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_8w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt = height >> 2; + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3; + __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3; + __m128i mask0, mask1, mask2, mask3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; + __m128i out0, out1; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *_src = (uint8_t *)src - 3 - src_stride3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4, + filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + src4 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6); + _src += src_stride3; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + + src0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + + DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, + filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4, src2, src1, + tmp0, tmp1, tmp2, tmp4); + DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6); + + for (; loop_cnt--;) { + src7 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9); + src10 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + src7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp3 = __lsx_vpackev_b(src7, src6); + out0 = FILT_8TAP_DPADD_S_H(tmp0, 
tmp1, tmp2, tmp3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src0 = __lsx_vpackev_b(src8, src7); + out1 = FILT_8TAP_DPADD_S_H(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src1 = __lsx_vpackev_b(src9, src8); + src3 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src2 = __lsx_vpackev_b(src10, src9); + src4 = FILT_8TAP_DPADD_S_H(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + src5 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src7 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src8 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src9 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, src7, src5, src9, src8, src5, src7); + DUP2_ARG2(__lsx_vavgr_bu, out0, src5, out1, src7, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + + src6 = src10; + tmp0 = tmp2; + tmp1 = tmp3; + tmp2 = src1; + tmp4 = tmp6; + tmp5 = src0; + tmp6 = src2; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_16w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) { + common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 8; + dst += 8; + + common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 8; + dst += 8; +} + +static void common_hv_8ht_8vt_and_aver_dst_32w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_64w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + + for (multiple8_cnt = 8; multiple8_cnt--;) { + common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + + src += 8; + dst += 8; + } +} + +static void common_hv_2ht_2vt_and_aver_dst_4x4_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert) { + __m128i src0, src1, src2, src3, src4, mask; + __m128i filt_hz, filt_vt, vec0, vec1; + __m128i dst0, dst1, dst2, dst3; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, out; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + 
int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + /* rearranging filter */ + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); + hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + + dst0 = __lsx_vldrepl_w(dst, 0); + dst1 = __lsx_vldrepl_w(dst + dst_stride, 0); + dst2 = __lsx_vldrepl_w(dst + dst_stride2, 0); + dst3 = __lsx_vldrepl_w(dst + dst_stride3, 0); + dst0 = __lsx_vilvl_w(dst1, dst0); + dst1 = __lsx_vilvl_w(dst3, dst2); + dst0 = __lsx_vilvl_d(dst1, dst0); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + out = __lsx_vpickev_b(tmp1, tmp0); + out = __lsx_vavgr_bu(out, dst0); + __lsx_vstelm_w(out, dst, 0, 0); + __lsx_vstelm_w(out, dst + dst_stride, 0, 1); + __lsx_vstelm_w(out, dst + dst_stride2, 0, 2); + __lsx_vstelm_w(out, dst + dst_stride3, 0, 3); +} + +static void common_hv_2ht_2vt_and_aver_dst_4x8_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert) { + uint8_t *dst_tmp1; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; + __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + __m128i hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3; + __m128i dst0, dst1, dst2, dst3, dst4; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src5, src6, src7, src8); + src += src_stride4; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS); + hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS); + hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); + DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff, + hz_out1, hz_out3); + hz_out5 = __lsx_vshuf_b(hz_out6, hz_out4, shuff); + hz_out7 = __lsx_vpickod_d(hz_out8, hz_out6); + + dst0 = __lsx_vldrepl_w(dst, 0); + dst += dst_stride; + dst1 = __lsx_vldrepl_w(dst, 0); + dst += dst_stride; + dst2 = __lsx_vldrepl_w(dst, 0); + dst += dst_stride; + dst3 = __lsx_vldrepl_w(dst, 0); + dst += dst_stride; + dst0 = __lsx_vilvl_w(dst1, dst0); + dst1 = __lsx_vilvl_w(dst3, dst2); + dst0 = __lsx_vilvl_d(dst1, dst0); + + dst1 = 
__lsx_vldrepl_w(dst, 0);
+  dst += dst_stride;
+  dst2 = __lsx_vldrepl_w(dst, 0);
+  dst += dst_stride;
+  dst3 = __lsx_vldrepl_w(dst, 0);
+  dst += dst_stride;
+  dst4 = __lsx_vldrepl_w(dst, 0);
+  dst += dst_stride;
+  dst1 = __lsx_vilvl_w(dst2, dst1);
+  dst2 = __lsx_vilvl_w(dst4, dst3);
+  dst1 = __lsx_vilvl_d(dst2, dst1);
+
+  DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out5,
+            hz_out4, hz_out7, hz_out6, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3,
+            filt_vt, tmp0, tmp1, tmp2, tmp3);
+  DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2,
+            FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3);
+  DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, res0, res1);
+  DUP2_ARG2(__lsx_vavgr_bu, res0, dst0, res1, dst1, res0, res1);
+
+  /* dst was advanced past the 8 rows loaded above; rewind so the averaged
+     rows are stored back in place */
+  dst_tmp1 = dst - dst_stride * 8;
+  __lsx_vstelm_w(res0, dst_tmp1, 0, 0);
+  dst_tmp1 += dst_stride;
+  __lsx_vstelm_w(res0, dst_tmp1, 0, 1);
+  dst_tmp1 += dst_stride;
+  __lsx_vstelm_w(res0, dst_tmp1, 0, 2);
+  dst_tmp1 += dst_stride;
+  __lsx_vstelm_w(res0, dst_tmp1, 0, 3);
+  dst_tmp1 += dst_stride;
+
+  __lsx_vstelm_w(res1, dst_tmp1, 0, 0);
+  dst_tmp1 += dst_stride;
+  __lsx_vstelm_w(res1, dst_tmp1, 0, 1);
+  dst_tmp1 += dst_stride;
+  __lsx_vstelm_w(res1, dst_tmp1, 0, 2);
+  dst_tmp1 += dst_stride;
+  __lsx_vstelm_w(res1, dst_tmp1, 0, 3);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_4w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  if (height == 4) {
+    common_hv_2ht_2vt_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert);
+  } else if (height == 8) {
+    common_hv_2ht_2vt_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert);
+  }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_8x4_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert) {
+  __m128i src0, src1, src2, src3, src4, mask;
+  __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+  __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+  __m128i dst0, dst1, dst2, dst3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  uint8_t *dst_tmp = dst;
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+  /* rearranging filter */
+  filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+  filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  src += (src_stride4 + src_stride);
+
+  dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+  vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+  tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+  vec1 = __lsx_vpackev_b(hz_out0, hz_out1);
+  tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt);
+
+  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+  vec2 = __lsx_vpackev_b(hz_out1, hz_out0);
+  tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt);
+
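+  /* last tap: the src4 row is paired with the previous horizontal output
+     before the final vertical dot product */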
hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec3 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt); + + DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, + FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); + PCKEV_AVG_ST4_D(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); + dst -= dst_stride * 3; +} + +static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, mask; + __m128i filt_hz, filt_vt, vec0; + __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + __m128i dst0, dst1, dst2, dst3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + uint8_t *dst_tmp = dst; + + /* rearranging filter */ + mask = __lsx_vld(mc_filt_mask_arr, 0); + + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + src0 = __lsx_vld(src, 0); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); + + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); + + DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); + + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + PCKEV_AVG_ST4_D(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); + dst += dst_stride; + } +} + +static void common_hv_2ht_2vt_and_aver_dst_8w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + if (height == 4) { + common_hv_2ht_2vt_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } else { + common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( + src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height); + } +} + +static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + uint8_t *src_tmp1; + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1, tmp3; + +
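/* 16-wide rows are processed as two 8-pixel halves (src and src + 8) whose results are merged into a single 16-byte store per output row. */ +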
 int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + /* rearranging filter */ + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + for (; loop_cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4); + src6 = __lsx_vldx(src, src_stride3); + src_tmp1 = (uint8_t *)(src + 8); + src1 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3, + src5); + src7 = __lsx_vldx(src_tmp1, src_stride3); + src += src_stride4; + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + + hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + tmp3 = __lsx_vpickev_b(tmp1, tmp0); + tmp3 = __lsx_vavgr_bu(tmp3, dst0); + __lsx_vst(tmp3, dst, 0); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + tmp3 = __lsx_vpickev_b(tmp1, tmp0); + tmp3 = __lsx_vavgr_bu(tmp3, dst1); + __lsx_vst(tmp3, dst, 0); + dst += dst_stride; + + hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + tmp3 = __lsx_vpickev_b(tmp1, tmp0); + tmp3 = __lsx_vavgr_bu(tmp3, dst2); + __lsx_vst(tmp3, dst, 0); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + tmp3 = __lsx_vpickev_b(tmp1, tmp0); + tmp3 = __lsx_vavgr_bu(tmp3, dst3); + __lsx_vst(tmp3, dst, 0); + dst += dst_stride; + } +} + +
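/* The wider blocks are tiled as 16-pixel columns: the 16-wide kernel runs once per column with src and dst advanced by 16 bytes. */ +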
static void common_hv_2ht_2vt_and_aver_dst_32w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 16; + dst += 16; + + common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); +} + +static void common_hv_2ht_2vt_and_aver_dst_64w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 16; + dst += 16; + } +} + +void vpx_convolve8_avg_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; + int8_t cnt, filt_hor[8], filt_ver[8]; + + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + filt_ver[cnt] = filter_y[cnt]; + } + if (vpx_get_filter_taps(filter_x) == 2 && + vpx_get_filter_taps(filter_y) == 2) { + switch (w) { + case 4: + common_hv_2ht_2vt_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], h); + break; + case 8: + common_hv_2ht_2vt_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], h); + break; + case 16: + common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, + &filt_hor[3], &filt_ver[3], h); + break; + case 32: + common_hv_2ht_2vt_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, + &filt_hor[3], &filt_ver[3], h); + break; + case 64: + common_hv_2ht_2vt_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, + &filt_hor[3], &filt_ver[3], h); + break; + default: + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else if (vpx_get_filter_taps(filter_x) == 2 || + vpx_get_filter_taps(filter_y) == 2) { + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + } else { + switch (w) { + case 4: + common_hv_8ht_8vt_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 8: + common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 16: + common_hv_8ht_8vt_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 32: + common_hv_8ht_8vt_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 64: + common_hv_8ht_8vt_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + default: + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c b/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c new file mode 100644 index 0000000000..1dad29eeed --- /dev/null +++ b/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c @@ -0,0 +1,321 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +static void avg_width4_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int32_t height) { + int32_t cnt; + __m128i src0, src1; + __m128i dst0, dst1; + + int32_t src_stride2 = src_stride << 1; + + if ((height % 2) == 0) { + for (cnt = (height / 2); cnt--;) { + src0 = __lsx_vld(src, 0); + src1 = __lsx_vldx(src, src_stride); + src += src_stride2; + + dst0 = __lsx_vld(dst, 0); + dst1 = __lsx_vldx(dst, dst_stride); + DUP2_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, dst0, dst1); + + __lsx_vstelm_w(dst0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(dst1, dst, 0, 0); + dst += dst_stride; + } + } +} + +static void avg_width8_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int32_t height) { + int32_t cnt = (height / 4); + __m128i src0, src1, src2, src3; + __m128i dst0, dst1, dst2, dst3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + for (; cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + + DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + + __lsx_vstelm_d(dst0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(dst1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(dst2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(dst3, dst, 0, 0); + dst += dst_stride; + } +} + +static void avg_width16_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt = (height / 8); + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + for (; cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src4 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src7 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + dst4 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst5, dst6); + dst7 = __lsx_vldx(dst, dst_stride3); + dst -= dst_stride4; + + DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5, src6, dst6, src7, dst7, + 
dst4, dst5, dst6, dst7); + + __lsx_vst(dst0, dst, 0); + __lsx_vstx(dst1, dst, dst_stride); + __lsx_vstx(dst2, dst, dst_stride2); + __lsx_vstx(dst3, dst, dst_stride3); + dst += dst_stride4; + __lsx_vst(dst4, dst, 0); + __lsx_vstx(dst5, dst, dst_stride); + __lsx_vstx(dst6, dst, dst_stride2); + __lsx_vstx(dst7, dst, dst_stride3); + dst += dst_stride4; + } +} + +static void avg_width32_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt = (height / 8); + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i src8, src9, src10, src11, src12, src13, src14, src15; + __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + __m128i dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + for (; cnt--;) { + uint8_t *dst_tmp = dst; + uint8_t *dst_tmp1 = dst_tmp + 16; + uint8_t *src_tmp = src + 16; + + DUP2_ARG2(__lsx_vld, src, 0, src_tmp, 0, src0, src1); + DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src, + src_stride2, src_tmp, src_stride2, src2, src3, src4, src5); + DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, src6, src7); + src += src_stride4; + + DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp1, 0, dst0, dst1); + DUP4_ARG2(__lsx_vldx, dst_tmp, dst_stride, dst_tmp1, dst_stride, dst_tmp, + dst_stride2, dst_tmp1, dst_stride2, dst2, dst3, dst4, dst5); + DUP2_ARG2(__lsx_vldx, dst_tmp, dst_stride3, dst_tmp1, dst_stride3, dst6, + dst7); + dst_tmp += dst_stride4; + dst_tmp1 += dst_stride4; + + src_tmp = src + 16; + DUP2_ARG2(__lsx_vld, src, 0, src_tmp, 0, src8, src9); + DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src, + src_stride2, src_tmp, src_stride2, src10, src11, src12, src13); + DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, src14, src15); + src += src_stride4; + + DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp1, 0, dst8, dst9); + DUP4_ARG2(__lsx_vldx, dst_tmp, dst_stride, dst_tmp1, dst_stride, dst_tmp, + dst_stride2, dst_tmp1, dst_stride2, dst10, dst11, dst12, dst13); + DUP2_ARG2(__lsx_vldx, dst_tmp, dst_stride3, dst_tmp1, dst_stride3, dst14, + dst15); + DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5, src6, dst6, src7, dst7, + dst4, dst5, dst6, dst7); + DUP4_ARG2(__lsx_vavgr_bu, src8, dst8, src9, dst9, src10, dst10, src11, + dst11, dst8, dst9, dst10, dst11); + DUP4_ARG2(__lsx_vavgr_bu, src12, dst12, src13, dst13, src14, dst14, src15, + dst15, dst12, dst13, dst14, dst15); + + dst_tmp = dst + 16; + __lsx_vst(dst0, dst, 0); + __lsx_vstx(dst2, dst, dst_stride); + __lsx_vstx(dst4, dst, dst_stride2); + __lsx_vstx(dst6, dst, dst_stride3); + __lsx_vst(dst1, dst_tmp, 0); + __lsx_vstx(dst3, dst_tmp, dst_stride); + __lsx_vstx(dst5, dst_tmp, dst_stride2); + __lsx_vstx(dst7, dst_tmp, dst_stride3); + dst += dst_stride4; + + __lsx_vst(dst8, dst, 0); + __lsx_vstx(dst10, dst, dst_stride); + __lsx_vstx(dst12, dst, dst_stride2); + __lsx_vstx(dst14, dst, dst_stride3); + __lsx_vst(dst9, dst_tmp1, 0); + __lsx_vstx(dst11, dst_tmp1, dst_stride); + __lsx_vstx(dst13, dst_tmp1, dst_stride2); + __lsx_vstx(dst15, dst_tmp1, dst_stride3); + dst += dst_stride4; + } +} + +static void avg_width64_lsx(const uint8_t 
*src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt = (height / 4); + uint8_t *dst_tmp = dst; + + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i src8, src9, src10, src11, src12, src13, src14, src15; + __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + __m128i dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; + + for (; cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src4, src5, src6, + src7); + src += src_stride; + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src8, src9, src10, + src11); + src += src_stride; + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src12, src13, src14, + src15); + src += src_stride; + + DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, + dst0, dst1, dst2, dst3); + dst_tmp += dst_stride; + DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, + dst4, dst5, dst6, dst7); + dst_tmp += dst_stride; + DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, + dst8, dst9, dst10, dst11); + dst_tmp += dst_stride; + DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, + dst12, dst13, dst14, dst15); + dst_tmp += dst_stride; + + DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5, src6, dst6, src7, dst7, + dst4, dst5, dst6, dst7); + DUP4_ARG2(__lsx_vavgr_bu, src8, dst8, src9, dst9, src10, dst10, src11, + dst11, dst8, dst9, dst10, dst11); + DUP4_ARG2(__lsx_vavgr_bu, src12, dst12, src13, dst13, src14, dst14, src15, + dst15, dst12, dst13, dst14, dst15); + + __lsx_vst(dst0, dst, 0); + __lsx_vst(dst1, dst, 16); + __lsx_vst(dst2, dst, 32); + __lsx_vst(dst3, dst, 48); + dst += dst_stride; + __lsx_vst(dst4, dst, 0); + __lsx_vst(dst5, dst, 16); + __lsx_vst(dst6, dst, 32); + __lsx_vst(dst7, dst, 48); + dst += dst_stride; + __lsx_vst(dst8, dst, 0); + __lsx_vst(dst9, dst, 16); + __lsx_vst(dst10, dst, 32); + __lsx_vst(dst11, dst, 48); + dst += dst_stride; + __lsx_vst(dst12, dst, 0); + __lsx_vst(dst13, dst, 16); + __lsx_vst(dst14, dst, 32); + __lsx_vst(dst15, dst, 48); + dst += dst_stride; + } +} + +void vpx_convolve_avg_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, + int32_t w, int32_t h) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + switch (w) { + case 4: { + avg_width4_lsx(src, src_stride, dst, dst_stride, h); + break; + } + + case 8: { + avg_width8_lsx(src, src_stride, dst, dst_stride, h); + break; + } + case 16: { + avg_width16_lsx(src, src_stride, dst, dst_stride, h); + break; + } + case 32: { + avg_width32_lsx(src, src_stride, dst, dst_stride, h); + break; + } + case 64: { + avg_width64_lsx(src, src_stride, dst, dst_stride, h); + break; + } + default: { + int32_t lp, cnt; + for (cnt = h; cnt--;) { + for (lp = 0; lp < w; ++lp) { + dst[lp] = (((dst[lp] + src[lp]) + 1) >> 1); + } + src += src_stride; + dst += dst_stride; + } + break; + } + } +} diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 91a983d097..a880e1d285 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -167,6 +167,8 @@ DSP_SRCS-$(HAVE_VSX) += ppc/vpx_convolve_vsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_horiz_lsx.c DSP_SRCS-$(HAVE_LSX) += 
loongarch/vpx_convolve8_vert_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_avg_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_avg_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_lsx.h # loop filters diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 49eea44389..8b66722481 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -371,7 +371,7 @@ () specialize qw/vpx_convolve_copy neon dspr2 msa sse2 vsx/; add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx mmi/; +specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx mmi lsx/; add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; @@ -383,7 +383,7 @@ () specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; +specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; From bf672f23a5336cb54dbcb2e4417142139f44cc3e Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Fri, 11 Mar 2022 10:56:07 +0800 Subject: [PATCH 221/926] vp8[loongarch]: Optimize idct_add, filter_bv/bh 1. vp8_dc_only_idct_add_lsx 2. vp8_loop_filter_bh_lsx 3. vp8_loop_filter_bv_lsx Bug: webm:1755 Change-Id: I9b629767e2a4e9db8cbb3ee2369186502dc6eb00 --- vp8/common/loongarch/idct_lsx.c | 54 +++ vp8/common/loongarch/loopfilter_filters_lsx.c | 352 ++++++++++++++++++ vp8/common/rtcd_defs.pl | 6 +- vp8/vp8_common.mk | 1 + 4 files changed, 410 insertions(+), 3 deletions(-) create mode 100644 vp8/common/loongarch/idct_lsx.c diff --git a/vp8/common/loongarch/idct_lsx.c b/vp8/common/loongarch/idct_lsx.c new file mode 100644 index 0000000000..fb0b0384c4 --- /dev/null +++ b/vp8/common/loongarch/idct_lsx.c @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vp8_rtcd.h" +#include "vp8/common/blockd.h" +#include "vpx_util/loongson_intrinsics.h" + +static void idct4x4_addconst_lsx(int16_t in_dc, uint8_t *pred, + int32_t pred_stride, uint8_t *dest, + int32_t dest_stride) { + __m128i vec, res0, res1, res2, res3, dst0, dst1; + __m128i pred0, pred1, pred2, pred3; + __m128i zero = __lsx_vldi(0); + + int32_t pred_stride2 = pred_stride << 1; + int32_t pred_stride3 = pred_stride2 + pred_stride; + + vec = __lsx_vreplgr2vr_h(in_dc); + vec = __lsx_vsrari_h(vec, 3); + pred0 = __lsx_vld(pred, 0); + DUP2_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred1, pred2); + pred3 = __lsx_vldx(pred, pred_stride3); + DUP4_ARG2(__lsx_vilvl_b, zero, pred0, zero, pred1, zero, pred2, zero, pred3, + res0, res1, res2, res3); + DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec, res0, + res1, res2, res3); + res0 = __lsx_vclip255_h(res0); + res1 = __lsx_vclip255_h(res1); + res2 = __lsx_vclip255_h(res2); + res3 = __lsx_vclip255_h(res3); + + DUP2_ARG2(__lsx_vpickev_b, res1, res0, res3, res2, dst0, dst1); + dst0 = __lsx_vpickev_w(dst1, dst0); + __lsx_vstelm_w(dst0, dest, 0, 0); + dest += dest_stride; + __lsx_vstelm_w(dst0, dest, 0, 1); + dest += dest_stride; + __lsx_vstelm_w(dst0, dest, 0, 2); + dest += dest_stride; + __lsx_vstelm_w(dst0, dest, 0, 3); +} + +void vp8_dc_only_idct_add_lsx(int16_t input_dc, uint8_t *pred_ptr, + int32_t pred_stride, uint8_t *dst_ptr, + int32_t dst_stride) { + idct4x4_addconst_lsx(input_dc, pred_ptr, pred_stride, dst_ptr, dst_stride); +} diff --git a/vp8/common/loongarch/loopfilter_filters_lsx.c b/vp8/common/loongarch/loopfilter_filters_lsx.c index 484b3d6ad0..c48f794840 100644 --- a/vp8/common/loongarch/loopfilter_filters_lsx.c +++ b/vp8/common/loongarch/loopfilter_filters_lsx.c @@ -13,6 +13,41 @@ #include "vp8/common/loopfilter.h" #include "vpx_util/loongson_intrinsics.h" +#define VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev) \ + { \ + __m128i p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \ + const __m128i cnst4b = __lsx_vldi(4); \ + const __m128i cnst3b = __lsx_vldi(3); \ + \ + p1_m = __lsx_vxori_b(p1, 0x80); \ + p0_m = __lsx_vxori_b(p0, 0x80); \ + q0_m = __lsx_vxori_b(q0, 0x80); \ + q1_m = __lsx_vxori_b(q1, 0x80); \ + \ + filt = __lsx_vssub_b(p1_m, q1_m); \ + filt = __lsx_vand_v(filt, hev); \ + q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vand_v(filt, mask); \ + t1 = __lsx_vsadd_b(filt, cnst4b); \ + t1 = __lsx_vsra_b(t1, cnst3b); \ + t2 = __lsx_vsadd_b(filt, cnst3b); \ + t2 = __lsx_vsra_b(t2, cnst3b); \ + q0_m = __lsx_vssub_b(q0_m, t1); \ + q0 = __lsx_vxori_b(q0_m, 0x80); \ + p0_m = __lsx_vsadd_b(p0_m, t2); \ + p0 = __lsx_vxori_b(p0_m, 0x80); \ + filt = __lsx_vsrari_b(t1, 1); \ + hev = __lsx_vxori_b(hev, 0xff); \ + filt = __lsx_vand_v(filt, hev); \ + q1_m = __lsx_vssub_b(q1_m, filt); \ + q1 = __lsx_vxori_b(q1_m, 0x80); \ + p1_m = __lsx_vsadd_b(p1_m, filt); \ + p1 = __lsx_vxori_b(p1_m, 0x80); \ + } + #define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \ { \ __m128i p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \ @@ -116,6 +151,279 @@ __lsx_vstelm_h(in1, pdst + stride, 0, in1_idx); \ } +static void loop_filter_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + uint8_t *temp_src; +
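/* The two 8-pixel edges share one vector: each b_limit/limit/thresh pair is packed into the low and high doublewords (vilvl_d below), so both edges are filtered in a single pass. */ +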
int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + + __m128i mask, hev, flat; + __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + + temp_src = src - pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, p3, p2, p1, p0); + temp_src += pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, q0, q1, q2, q3); + + thresh0 = __lsx_vreplgr2vr_b(*thresh0_ptr); + thresh1 = __lsx_vreplgr2vr_b(*thresh1_ptr); + thresh0 = __lsx_vilvl_d(thresh1, thresh0); + + b_limit0 = __lsx_vreplgr2vr_b(*b_limit0_ptr); + b_limit1 = __lsx_vreplgr2vr_b(*b_limit1_ptr); + b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0); + + limit0 = __lsx_vreplgr2vr_b(*limit0_ptr); + limit1 = __lsx_vreplgr2vr_b(*limit1_ptr); + limit0 = __lsx_vilvl_d(limit1, limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); + + __lsx_vstx(p1, src, -pitch_x2); + __lsx_vstx(p0, src, -pitch); + __lsx_vst(q0, src, 0); + __lsx_vstx(q1, src, pitch); +} + +static void loop_filter_vertical_4_dual_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + uint8_t *src_tmp0 = src - 4; + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + __m128i mask, hev, flat; + __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i row0, row1, row2, row3, row4, row5, row6, row7; + __m128i row8, row9, row10, row11, row12, row13, row14, row15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + + row0 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row1, row2); + row3 = __lsx_vldx(src_tmp0, pitch_x3); + src_tmp0 += pitch_x4; + row4 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row5, row6); + row7 = __lsx_vldx(src_tmp0, pitch_x3); + src_tmp0 += pitch_x4; + + row8 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row9, row10); + row11 = __lsx_vldx(src_tmp0, pitch_x3); + src_tmp0 += pitch_x4; + row12 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row13, row14); + row15 = __lsx_vldx(src_tmp0, pitch_x3); + + LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + + thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0); + thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0); + thresh0 = __lsx_vilvl_d(thresh1, thresh0); + + b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0); + b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0); + b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0); + + limit0 = __lsx_vldrepl_b(limit0_ptr, 0); + limit1 = __lsx_vldrepl_b(limit1_ptr, 0); + limit0 = __lsx_vilvl_d(limit1, limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); + + DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1); + tmp2 = __lsx_vilvl_h(tmp1, tmp0); + tmp3 = __lsx_vilvh_h(tmp1, tmp0); + DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, tmp0, tmp1); + tmp4 = __lsx_vilvl_h(tmp1, tmp0); + tmp5 = __lsx_vilvh_h(tmp1, tmp0); + 
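/* After the filtered pixels are re-interleaved, each tmp vector holds four transposed 4-byte columns of (p1, p0, q0, q1); they are written back starting two pixels to the left of the edge. */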
+ src -= 2; + __lsx_vstelm_w(tmp2, src, 0, 0); + src += pitch; + __lsx_vstelm_w(tmp2, src, 0, 1); + src += pitch; + __lsx_vstelm_w(tmp2, src, 0, 2); + src += pitch; + __lsx_vstelm_w(tmp2, src, 0, 3); + src += pitch; + + __lsx_vstelm_w(tmp3, src, 0, 0); + src += pitch; + __lsx_vstelm_w(tmp3, src, 0, 1); + src += pitch; + __lsx_vstelm_w(tmp3, src, 0, 2); + src += pitch; + __lsx_vstelm_w(tmp3, src, 0, 3); + src += pitch; + + __lsx_vstelm_w(tmp4, src, 0, 0); + src += pitch; + __lsx_vstelm_w(tmp4, src, 0, 1); + src += pitch; + __lsx_vstelm_w(tmp4, src, 0, 2); + src += pitch; + __lsx_vstelm_w(tmp4, src, 0, 3); + src += pitch; + + __lsx_vstelm_w(tmp5, src, 0, 0); + src += pitch; + __lsx_vstelm_w(tmp5, src, 0, 1); + src += pitch; + __lsx_vstelm_w(tmp5, src, 0, 2); + src += pitch; + __lsx_vstelm_w(tmp5, src, 0, 3); +} + +static void loop_filter_horizontal_edge_uv_lsx(uint8_t *src_u, uint8_t *src_v, + int32_t pitch, + const uint8_t b_limit_in, + const uint8_t limit_in, + const uint8_t thresh_in) { + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i mask, hev, flat, thresh, limit, b_limit; + __m128i p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u; + __m128i p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v; + + thresh = __lsx_vreplgr2vr_b(thresh_in); + limit = __lsx_vreplgr2vr_b(limit_in); + b_limit = __lsx_vreplgr2vr_b(b_limit_in); + + DUP4_ARG2(__lsx_vldx, src_u, -pitch_x4, src_u, -pitch_x3, src_u, -pitch_x2, + src_u, -pitch, p3_u, p2_u, p1_u, p0_u); + q0_u = __lsx_vld(src_u, 0); + DUP2_ARG2(__lsx_vldx, src_u, pitch, src_u, pitch_x2, q1_u, q2_u); + q3_u = __lsx_vldx(src_u, pitch_x3); + + DUP4_ARG2(__lsx_vldx, src_v, -pitch_x4, src_v, -pitch_x3, src_v, -pitch_x2, + src_v, -pitch, p3_v, p2_v, p1_v, p0_v); + q0_v = __lsx_vld(src_v, 0); + DUP2_ARG2(__lsx_vldx, src_v, pitch, src_v, pitch_x2, q1_v, q2_v); + q3_v = __lsx_vldx(src_v, pitch_x3); + + /* the right (low) 8 elements of p3 are u pixels and + the left (high) 8 elements are v pixels */ + DUP4_ARG2(__lsx_vilvl_d, p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, + p2, p1, p0); + DUP4_ARG2(__lsx_vilvl_d, q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, + q1, q2, q3); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); + + __lsx_vstelm_d(p1, src_u - pitch_x2, 0, 0); + __lsx_vstelm_d(p0, src_u - pitch, 0, 0); + __lsx_vstelm_d(q0, src_u, 0, 0); + __lsx_vstelm_d(q1, src_u + pitch, 0, 0); + + __lsx_vstelm_d(p1, src_v - pitch_x2, 0, 1); + __lsx_vstelm_d(p0, src_v - pitch, 0, 1); + __lsx_vstelm_d(q0, src_v, 0, 1); + __lsx_vstelm_d(q1, src_v + pitch, 0, 1); +} + +static void loop_filter_vertical_edge_uv_lsx(uint8_t *src_u, uint8_t *src_v, + int32_t pitch, + const uint8_t b_limit_in, + const uint8_t limit_in, + const uint8_t thresh_in) { + uint8_t *src_u_tmp, *src_v_tmp; + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i mask, hev, flat, thresh, limit, b_limit; + __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8; + __m128i row9, row10, row11, row12, row13, row14, row15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + + thresh = __lsx_vreplgr2vr_b(thresh_in); + limit = __lsx_vreplgr2vr_b(limit_in); + b_limit = __lsx_vreplgr2vr_b(b_limit_in); + + src_u_tmp = src_u - 4; + row0 = __lsx_vld(src_u_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_u_tmp, pitch, src_u_tmp, 
pitch_x2, row1, row2); + row3 = __lsx_vldx(src_u_tmp, pitch_x3); + src_u_tmp += pitch_x4; + row4 = __lsx_vld(src_u_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_u_tmp, pitch, src_u_tmp, pitch_x2, row5, row6); + row7 = __lsx_vldx(src_u_tmp, pitch_x3); + + src_v_tmp = src_v - 4; + row8 = __lsx_vld(src_v_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_v_tmp, pitch, src_v_tmp, pitch_x2, row9, row10); + row11 = __lsx_vldx(src_v_tmp, pitch_x3); + src_v_tmp += pitch_x4; + row12 = __lsx_vld(src_v_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_v_tmp, pitch, src_v_tmp, pitch_x2, row13, row14); + row15 = __lsx_vldx(src_v_tmp, pitch_x3); + + LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); + + DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1); + tmp2 = __lsx_vilvl_h(tmp1, tmp0); + tmp3 = __lsx_vilvh_h(tmp1, tmp0); + + DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, tmp0, tmp1); + tmp4 = __lsx_vilvl_h(tmp1, tmp0); + tmp5 = __lsx_vilvh_h(tmp1, tmp0); + + src_u_tmp += 2; + __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x4, 0, 0); + __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x3, 0, 1); + __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x2, 0, 2); + __lsx_vstelm_w(tmp2, src_u_tmp - pitch, 0, 3); + + __lsx_vstelm_w(tmp3, src_u_tmp, 0, 0); + __lsx_vstelm_w(tmp3, src_u_tmp + pitch, 0, 1); + __lsx_vstelm_w(tmp3, src_u_tmp + pitch_x2, 0, 2); + __lsx_vstelm_w(tmp3, src_u_tmp + pitch_x3, 0, 3); + + src_v_tmp += 2; + __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x4, 0, 0); + __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x3, 0, 1); + __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x2, 0, 2); + __lsx_vstelm_w(tmp4, src_v_tmp - pitch, 0, 3); + + __lsx_vstelm_w(tmp5, src_v_tmp, 0, 0); + __lsx_vstelm_w(tmp5, src_v_tmp + pitch, 0, 1); + __lsx_vstelm_w(tmp5, src_v_tmp + pitch_x2, 0, 2); + __lsx_vstelm_w(tmp5, src_v_tmp + pitch_x3, 0, 3); +} + static inline void mbloop_filter_horizontal_edge_y_lsx( uint8_t *src, int32_t pitch, const uint8_t b_limit_in, const uint8_t limit_in, const uint8_t thresh_in) { @@ -391,3 +699,47 @@ void vp8_loop_filter_mbv_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v, *lpf_info_ptr->hev_thr); } } + +void vp8_loop_filter_bh_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v, + int32_t pitch_y, int32_t pitch_u_v, + loop_filter_info *lpf_info_ptr) { + loop_filter_horizontal_4_dual_lsx(src_y + 4 * pitch_y, pitch_y, + lpf_info_ptr->blim, lpf_info_ptr->lim, + lpf_info_ptr->hev_thr, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr); + loop_filter_horizontal_4_dual_lsx(src_y + 8 * pitch_y, pitch_y, + lpf_info_ptr->blim, lpf_info_ptr->lim, + lpf_info_ptr->hev_thr, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr); + loop_filter_horizontal_4_dual_lsx(src_y + 12 * pitch_y, pitch_y, + lpf_info_ptr->blim, lpf_info_ptr->lim, + lpf_info_ptr->hev_thr, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr); + if (src_u) { + loop_filter_horizontal_edge_uv_lsx( + src_u + (4 * pitch_u_v), src_v + (4 * pitch_u_v), pitch_u_v, + *lpf_info_ptr->blim, *lpf_info_ptr->lim, *lpf_info_ptr->hev_thr); + } +} + +void vp8_loop_filter_bv_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v, + int32_t pitch_y, int32_t pitch_u_v, + loop_filter_info *lpf_info_ptr) { + loop_filter_vertical_4_dual_lsx(src_y + 4, pitch_y, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr, + lpf_info_ptr->blim, 
lpf_info_ptr->lim, + lpf_info_ptr->hev_thr); + loop_filter_vertical_4_dual_lsx(src_y + 8, pitch_y, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr, + lpf_info_ptr->blim, lpf_info_ptr->lim, + lpf_info_ptr->hev_thr); + loop_filter_vertical_4_dual_lsx(src_y + 12, pitch_y, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr, + lpf_info_ptr->blim, lpf_info_ptr->lim, + lpf_info_ptr->hev_thr); + if (src_u) { + loop_filter_vertical_edge_uv_lsx(src_u + 4, src_v + 4, pitch_u_v, + *lpf_info_ptr->blim, *lpf_info_ptr->lim, + *lpf_info_ptr->hev_thr); + } +} diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 40117e3677..32601b4eba 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -50,13 +50,13 @@ () specialize qw/vp8_loop_filter_mbv sse2 neon dspr2 msa mmi lsx/; add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"; -specialize qw/vp8_loop_filter_bv sse2 neon dspr2 msa mmi/; +specialize qw/vp8_loop_filter_bv sse2 neon dspr2 msa mmi lsx/; add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"; specialize qw/vp8_loop_filter_mbh sse2 neon dspr2 msa mmi lsx/; add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"; -specialize qw/vp8_loop_filter_bh sse2 neon dspr2 msa mmi/; +specialize qw/vp8_loop_filter_bh sse2 neon dspr2 msa mmi lsx/; add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y_ptr, int y_stride, const unsigned char *blimit"; @@ -108,7 +108,7 @@ () #idct1_scalar_add add_proto qw/void vp8_dc_only_idct_add/, "short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride"; -specialize qw/vp8_dc_only_idct_add mmx neon dspr2 msa mmi/; +specialize qw/vp8_dc_only_idct_add mmx neon dspr2 msa mmi lsx/; # # RECON diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 909924ce8d..d485965d3d 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -127,6 +127,7 @@ endif # common (loongarch LSX intrinsics) VP8_COMMON_SRCS-$(HAVE_LSX) += common/loongarch/loopfilter_filters_lsx.c VP8_COMMON_SRCS-$(HAVE_LSX) += common/loongarch/sixtap_filter_lsx.c +VP8_COMMON_SRCS-$(HAVE_LSX) += common/loongarch/idct_lsx.c # common (neon intrinsics) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/loopfilter_arm.c
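For reference, the DC-only IDCT that idct_lsx.c vectorizes reduces to a rounded shift plus a clamped add. A minimal scalar sketch under that reading (the helper names here are illustrative, not part of the patch):

#include <stdint.h>

/* Clamp an int to the 8-bit pixel range. */
static uint8_t clamp_u8(int v) { return (v < 0) ? 0 : (v > 255) ? 255 : (uint8_t)v; }

/* Scalar sketch of idct4x4_addconst_lsx: round the DC term by 3 bits,
 * then add it to each of the 4x4 predictor pixels with clamping. */
static void dc_only_idct_add_sketch(int16_t input_dc, const uint8_t *pred,
                                    int pred_stride, uint8_t *dst,
                                    int dst_stride) {
  const int dc = (input_dc + 4) >> 3; /* matches __lsx_vsrari_h(vec, 3) */
  int r, c;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) dst[c] = clamp_u8(pred[c] + dc); /* __lsx_vclip255_h */
    pred += pred_stride;
    dst += dst_stride;
  }
}

The LSX version performs the same computation on all 16 pixels at once, widening the bytes, adding the replicated DC, clipping, and storing the block as four 4-byte rows.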
From f79d256cb28a4228df66a7a6d1cebbd9071e0639 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 11 Mar 2022 20:19:25 +0200 Subject: [PATCH 222/926] Make sure only NEON FDCT functions are called. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [NEON] Added vpx_fdct4x4_pass1_neon(), Added vpx_fdct8x8_pass1_notranspose_neon(), Added vpx_fdct8x8_pass1_neon() to avoid code duplication Refactored vpx_fdct4x4_neon() and vpx_fdct8x8_neon() to use the above Rename dct_body to vpx_fdct16x16_body to reuse later Add transpose_s16_16x16() I have run make test and all tests/configurations seem to pass. Profiled using this command on an Ampere Altra VM: sudo perf record -g ./vpxenc --codec=vp9 --height=1080 --width=1920 \ --fps=25/1 --limit=20 -o output.mkv \ ../original_videos_Sports_1080P_Sports_1080P-0063.mkv --debug --rt Before this optimization: 1.32% 1.32% vpxenc vpxenc [.] vpx_fdct4x4_neon 0.16% 0.16% vpxenc vpxenc [.] vpx_fdct4x4_c 0.79% 0.79% vpxenc vpxenc [.] vpx_fdct8x8_c 0.52% 0.52% vpxenc vpxenc [.] vpx_fdct8x8_neon 1.23% 1.23% vpxenc vpxenc [.] vpx_fdct16x16_c 0.54% 0.54% vpxenc vpxenc [.] vpx_fdct16x16_neon So, even though a _neon() version exists, the C version was called \ as well. After this patch: 1.42% 1.36% vpxenc vpxenc [.] vpx_fdct4x4_neon 0.87% 0.82% vpxenc vpxenc [.] vpx_fdct8x8_neon 0.74% 0.74% vpxenc vpxenc [.] vpx_fdct16x16_neon Change-Id: Id4e1dd315c67b4355fe4e5a1b59e181a349f16d0 --- vpx_dsp/arm/fdct16x16_neon.c | 319 +--------------------------------- vpx_dsp/arm/fdct16x16_neon.h | 327 +++++++++++++++++++++++++++++++++++ vpx_dsp/arm/fdct_neon.c | 61 ++----- vpx_dsp/arm/fdct_neon.h | 213 +++++++++++++++++++++++ vpx_dsp/arm/fwd_txfm_neon.c | 212 ++++------------------- vpx_dsp/arm/transpose_neon.h | 39 +++++ 6 files changed, 629 insertions(+), 542 deletions(-) create mode 100644 vpx_dsp/arm/fdct16x16_neon.h create mode 100644 vpx_dsp/arm/fdct_neon.h diff --git a/vpx_dsp/arm/fdct16x16_neon.c b/vpx_dsp/arm/fdct16x16_neon.c index 6b2bebd097..67f43246aa 100644 --- a/vpx_dsp/arm/fdct16x16_neon.c +++ b/vpx_dsp/arm/fdct16x16_neon.c @@ -15,6 +15,7 @@ #include "vpx_dsp/txfm_common.h" #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/fdct16x16_neon.h" // Some builds of gcc 4.9.2 and .3 have trouble with some of the inline // functions. @@ -27,316 +28,6 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { #else -static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) { - b[0] = vld1q_s16(a); - a += stride; - b[1] = vld1q_s16(a); - a += stride; - b[2] = vld1q_s16(a); - a += stride; - b[3] = vld1q_s16(a); - a += stride; - b[4] = vld1q_s16(a); - a += stride; - b[5] = vld1q_s16(a); - a += stride; - b[6] = vld1q_s16(a); - a += stride; - b[7] = vld1q_s16(a); - a += stride; - b[8] = vld1q_s16(a); - a += stride; - b[9] = vld1q_s16(a); - a += stride; - b[10] = vld1q_s16(a); - a += stride; - b[11] = vld1q_s16(a); - a += stride; - b[12] = vld1q_s16(a); - a += stride; - b[13] = vld1q_s16(a); - a += stride; - b[14] = vld1q_s16(a); - a += stride; - b[15] = vld1q_s16(a); -} - -// Store 8 16x8 values, assuming stride == 16. -static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) { - store_s16q_to_tran_low(a, b[0]); - a += 16; - store_s16q_to_tran_low(a, b[1]); - a += 16; - store_s16q_to_tran_low(a, b[2]); - a += 16; - store_s16q_to_tran_low(a, b[3]); - a += 16; - store_s16q_to_tran_low(a, b[4]); - a += 16; - store_s16q_to_tran_low(a, b[5]); - a += 16; - store_s16q_to_tran_low(a, b[6]); - a += 16; - store_s16q_to_tran_low(a, b[7]); -} - -// Load step of each pass. Add and subtract clear across the input, requiring -// all 16 values to be loaded. For the first pass it also multiplies by 4. - -// To maybe reduce register usage this could be combined with the load() step to -// get the first 4 and last 4 values, cross those, then load the middle 8 values -// and cross them. 
-static INLINE void cross_input(const int16x8_t *a /*[16]*/, - int16x8_t *b /*[16]*/, const int pass) { - if (pass == 0) { - b[0] = vshlq_n_s16(vaddq_s16(a[0], a[15]), 2); - b[1] = vshlq_n_s16(vaddq_s16(a[1], a[14]), 2); - b[2] = vshlq_n_s16(vaddq_s16(a[2], a[13]), 2); - b[3] = vshlq_n_s16(vaddq_s16(a[3], a[12]), 2); - b[4] = vshlq_n_s16(vaddq_s16(a[4], a[11]), 2); - b[5] = vshlq_n_s16(vaddq_s16(a[5], a[10]), 2); - b[6] = vshlq_n_s16(vaddq_s16(a[6], a[9]), 2); - b[7] = vshlq_n_s16(vaddq_s16(a[7], a[8]), 2); - - b[8] = vshlq_n_s16(vsubq_s16(a[7], a[8]), 2); - b[9] = vshlq_n_s16(vsubq_s16(a[6], a[9]), 2); - b[10] = vshlq_n_s16(vsubq_s16(a[5], a[10]), 2); - b[11] = vshlq_n_s16(vsubq_s16(a[4], a[11]), 2); - b[12] = vshlq_n_s16(vsubq_s16(a[3], a[12]), 2); - b[13] = vshlq_n_s16(vsubq_s16(a[2], a[13]), 2); - b[14] = vshlq_n_s16(vsubq_s16(a[1], a[14]), 2); - b[15] = vshlq_n_s16(vsubq_s16(a[0], a[15]), 2); - } else { - b[0] = vaddq_s16(a[0], a[15]); - b[1] = vaddq_s16(a[1], a[14]); - b[2] = vaddq_s16(a[2], a[13]); - b[3] = vaddq_s16(a[3], a[12]); - b[4] = vaddq_s16(a[4], a[11]); - b[5] = vaddq_s16(a[5], a[10]); - b[6] = vaddq_s16(a[6], a[9]); - b[7] = vaddq_s16(a[7], a[8]); - - b[8] = vsubq_s16(a[7], a[8]); - b[9] = vsubq_s16(a[6], a[9]); - b[10] = vsubq_s16(a[5], a[10]); - b[11] = vsubq_s16(a[4], a[11]); - b[12] = vsubq_s16(a[3], a[12]); - b[13] = vsubq_s16(a[2], a[13]); - b[14] = vsubq_s16(a[1], a[14]); - b[15] = vsubq_s16(a[0], a[15]); - } -} - -// Quarter round at the beginning of the second pass. Can't use vrshr (rounding) -// because this only adds 1, not 1 << 2. -static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) { - const int16x8_t one = vdupq_n_s16(1); - a[0] = vshrq_n_s16(vaddq_s16(a[0], one), 2); - a[1] = vshrq_n_s16(vaddq_s16(a[1], one), 2); - a[2] = vshrq_n_s16(vaddq_s16(a[2], one), 2); - a[3] = vshrq_n_s16(vaddq_s16(a[3], one), 2); - a[4] = vshrq_n_s16(vaddq_s16(a[4], one), 2); - a[5] = vshrq_n_s16(vaddq_s16(a[5], one), 2); - a[6] = vshrq_n_s16(vaddq_s16(a[6], one), 2); - a[7] = vshrq_n_s16(vaddq_s16(a[7], one), 2); - a[8] = vshrq_n_s16(vaddq_s16(a[8], one), 2); - a[9] = vshrq_n_s16(vaddq_s16(a[9], one), 2); - a[10] = vshrq_n_s16(vaddq_s16(a[10], one), 2); - a[11] = vshrq_n_s16(vaddq_s16(a[11], one), 2); - a[12] = vshrq_n_s16(vaddq_s16(a[12], one), 2); - a[13] = vshrq_n_s16(vaddq_s16(a[13], one), 2); - a[14] = vshrq_n_s16(vaddq_s16(a[14], one), 2); - a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2); -} - -// fdct_round_shift((a +/- b) * c) -static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b, - const tran_high_t c, int16x8_t *add, - int16x8_t *sub) { - const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c); - const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c); - const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), c); - const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), c); - const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c); - const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c); - const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14); - const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14); - const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14); - const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14); - *add = vcombine_s16(rounded0, rounded1); - *sub = vcombine_s16(rounded2, rounded3); -} - -// fdct_round_shift(a * c0 +/- b * c1) -static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b, - const tran_coef_t c0, - const tran_coef_t c1, int16x8_t *add, - int16x8_t *sub) { - const int32x4_t a0 = 
vmull_n_s16(vget_low_s16(a), c0); - const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c0); - const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), c1); - const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), c1); - const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), c0); - const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), c0); - const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c1); - const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c1); - const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14); - const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14); - const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14); - const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14); - *add = vcombine_s16(rounded0, rounded1); - *sub = vcombine_s16(rounded2, rounded3); -} - -// Transpose 8x8 to a new location. Don't use transpose_neon.h because those -// are all in-place. -static INLINE void transpose_8x8(const int16x8_t *a /*[8]*/, - int16x8_t *b /*[8]*/) { - // Swap 16 bit elements. - const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); - const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); - const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); - const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); - - // Swap 32 bit elements. - const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), - vreinterpretq_s32_s16(c1.val[0])); - const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), - vreinterpretq_s32_s16(c1.val[1])); - const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), - vreinterpretq_s32_s16(c3.val[0])); - const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), - vreinterpretq_s32_s16(c3.val[1])); - - // Swap 64 bit elements - const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); - const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); - const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); - const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); - - b[0] = e0.val[0]; - b[1] = e1.val[0]; - b[2] = e2.val[0]; - b[3] = e3.val[0]; - b[4] = e0.val[1]; - b[5] = e1.val[1]; - b[6] = e2.val[1]; - b[7] = e3.val[1]; -} - -// Main body of fdct16x16. 
-static void dct_body(const int16x8_t *in /*[16]*/, int16x8_t *out /*[16]*/) { - int16x8_t s[8]; - int16x8_t x[4]; - int16x8_t step[8]; - - // stage 1 - // From fwd_txfm.c: Work on the first eight values; fdct8(input, - // even_results);" - s[0] = vaddq_s16(in[0], in[7]); - s[1] = vaddq_s16(in[1], in[6]); - s[2] = vaddq_s16(in[2], in[5]); - s[3] = vaddq_s16(in[3], in[4]); - s[4] = vsubq_s16(in[3], in[4]); - s[5] = vsubq_s16(in[2], in[5]); - s[6] = vsubq_s16(in[1], in[6]); - s[7] = vsubq_s16(in[0], in[7]); - - // fdct4(step, step); - x[0] = vaddq_s16(s[0], s[3]); - x[1] = vaddq_s16(s[1], s[2]); - x[2] = vsubq_s16(s[1], s[2]); - x[3] = vsubq_s16(s[0], s[3]); - - // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) - // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) - butterfly_one_coeff(x[0], x[1], cospi_16_64, &out[0], &out[8]); - // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); - // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); - butterfly_two_coeff(x[3], x[2], cospi_24_64, cospi_8_64, &out[4], &out[12]); - - // Stage 2 - // Re-using source s5/s6 - // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) - // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) - butterfly_one_coeff(s[6], s[5], cospi_16_64, &s[6], &s[5]); - - // Stage 3 - x[0] = vaddq_s16(s[4], s[5]); - x[1] = vsubq_s16(s[4], s[5]); - x[2] = vsubq_s16(s[7], s[6]); - x[3] = vaddq_s16(s[7], s[6]); - - // Stage 4 - // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) - // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) - butterfly_two_coeff(x[3], x[0], cospi_28_64, cospi_4_64, &out[2], &out[14]); - // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) - // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) - butterfly_two_coeff(x[2], x[1], cospi_12_64, cospi_20_64, &out[10], &out[6]); - - // step 2 - // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" - // That file distinguished between "in_high" and "step1" but the only - // difference is that "in_high" is the first 8 values and "step 1" is the - // second. Here, since they are all in one array, "step1" values are += 8. 
- - // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) - // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) - // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) - // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) - butterfly_one_coeff(in[13], in[10], cospi_16_64, &s[5], &s[2]); - butterfly_one_coeff(in[12], in[11], cospi_16_64, &s[4], &s[3]); - - // step 3 - s[0] = vaddq_s16(in[8], s[3]); - s[1] = vaddq_s16(in[9], s[2]); - x[0] = vsubq_s16(in[9], s[2]); - x[1] = vsubq_s16(in[8], s[3]); - x[2] = vsubq_s16(in[15], s[4]); - x[3] = vsubq_s16(in[14], s[5]); - s[6] = vaddq_s16(in[14], s[5]); - s[7] = vaddq_s16(in[15], s[4]); - - // step 4 - // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64) - // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64) - butterfly_two_coeff(s[6], s[1], cospi_24_64, cospi_8_64, &s[6], &s[1]); - - // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) - // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64) - butterfly_two_coeff(x[0], x[3], cospi_8_64, cospi_24_64, &s[2], &s[5]); - - // step 5 - step[0] = vaddq_s16(s[0], s[1]); - step[1] = vsubq_s16(s[0], s[1]); - step[2] = vaddq_s16(x[1], s[2]); - step[3] = vsubq_s16(x[1], s[2]); - step[4] = vsubq_s16(x[2], s[5]); - step[5] = vaddq_s16(x[2], s[5]); - step[6] = vsubq_s16(s[7], s[6]); - step[7] = vaddq_s16(s[7], s[6]); - - // step 6 - // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64) - // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64) - // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64) - // out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * cospi_26_64) - // out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * cospi_6_64) - // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] * - // cospi_22_64) - // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64) - // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64) - butterfly_two_coeff(step[6], step[1], cospi_14_64, cospi_18_64, &out[9], - &out[7]); - butterfly_two_coeff(step[7], step[0], cospi_30_64, cospi_2_64, &out[1], - &out[15]); - butterfly_two_coeff(step[4], step[3], cospi_6_64, cospi_26_64, &out[13], - &out[3]); - butterfly_two_coeff(step[5], step[2], cospi_22_64, cospi_10_64, &out[5], - &out[11]); -} - void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { int16x8_t temp0[16]; int16x8_t temp1[16]; @@ -346,12 +37,12 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { // Left half. load(input, stride, temp0); cross_input(temp0, temp1, 0); - dct_body(temp1, temp0); + vpx_fdct16x16_body(temp1, temp0); // Right half. load(input + 8, stride, temp1); cross_input(temp1, temp2, 0); - dct_body(temp2, temp1); + vpx_fdct16x16_body(temp2, temp1); // Transpose top left and top right quarters into one contiguous location to // process to the top half. 
@@ -359,7 +50,7 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { transpose_8x8(&temp1[0], &temp2[8]); partial_round_shift(temp2); cross_input(temp2, temp3, 1); - dct_body(temp3, temp2); + vpx_fdct16x16_body(temp3, temp2); transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4], &temp2[5], &temp2[6], &temp2[7]); transpose_s16_8x8(&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12], @@ -375,7 +66,7 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { &temp1[13], &temp1[14], &temp1[15]); partial_round_shift(temp1); cross_input(temp1, temp0, 1); - dct_body(temp0, temp1); + vpx_fdct16x16_body(temp0, temp1); transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4], &temp1[5], &temp1[6], &temp1[7]); transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], diff --git a/vpx_dsp/arm/fdct16x16_neon.h b/vpx_dsp/arm/fdct16x16_neon.h new file mode 100644 index 0000000000..8391238991 --- /dev/null +++ b/vpx_dsp/arm/fdct16x16_neon.h @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_FDCT16x16_NEON_H_ +#define VPX_VPX_DSP_ARM_FDCT16x16_NEON_H_ + +#include <arm_neon.h> + +static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) { + b[0] = vld1q_s16(a); + a += stride; + b[1] = vld1q_s16(a); + a += stride; + b[2] = vld1q_s16(a); + a += stride; + b[3] = vld1q_s16(a); + a += stride; + b[4] = vld1q_s16(a); + a += stride; + b[5] = vld1q_s16(a); + a += stride; + b[6] = vld1q_s16(a); + a += stride; + b[7] = vld1q_s16(a); + a += stride; + b[8] = vld1q_s16(a); + a += stride; + b[9] = vld1q_s16(a); + a += stride; + b[10] = vld1q_s16(a); + a += stride; + b[11] = vld1q_s16(a); + a += stride; + b[12] = vld1q_s16(a); + a += stride; + b[13] = vld1q_s16(a); + a += stride; + b[14] = vld1q_s16(a); + a += stride; + b[15] = vld1q_s16(a); +} + +// Store 8 16x8 values, assuming stride == 16. +static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) { + store_s16q_to_tran_low(a, b[0]); + a += 16; + store_s16q_to_tran_low(a, b[1]); + a += 16; + store_s16q_to_tran_low(a, b[2]); + a += 16; + store_s16q_to_tran_low(a, b[3]); + a += 16; + store_s16q_to_tran_low(a, b[4]); + a += 16; + store_s16q_to_tran_low(a, b[5]); + a += 16; + store_s16q_to_tran_low(a, b[6]); + a += 16; + store_s16q_to_tran_low(a, b[7]); +} + +// Load step of each pass. Add and subtract clear across the input, requiring +// all 16 values to be loaded. For the first pass it also multiplies by 4. + +// To maybe reduce register usage this could be combined with the load() step to +// get the first 4 and last 4 values, cross those, then load the middle 8 values +// and cross them. 
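+// Note: the multiply by 4 in pass 0 adds precision headroom that +// partial_round_shift() strips again (a rounded divide by 4) before pass 1.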
+static INLINE void cross_input(const int16x8_t *a /*[16]*/, + int16x8_t *b /*[16]*/, const int pass) { + if (pass == 0) { + b[0] = vshlq_n_s16(vaddq_s16(a[0], a[15]), 2); + b[1] = vshlq_n_s16(vaddq_s16(a[1], a[14]), 2); + b[2] = vshlq_n_s16(vaddq_s16(a[2], a[13]), 2); + b[3] = vshlq_n_s16(vaddq_s16(a[3], a[12]), 2); + b[4] = vshlq_n_s16(vaddq_s16(a[4], a[11]), 2); + b[5] = vshlq_n_s16(vaddq_s16(a[5], a[10]), 2); + b[6] = vshlq_n_s16(vaddq_s16(a[6], a[9]), 2); + b[7] = vshlq_n_s16(vaddq_s16(a[7], a[8]), 2); + + b[8] = vshlq_n_s16(vsubq_s16(a[7], a[8]), 2); + b[9] = vshlq_n_s16(vsubq_s16(a[6], a[9]), 2); + b[10] = vshlq_n_s16(vsubq_s16(a[5], a[10]), 2); + b[11] = vshlq_n_s16(vsubq_s16(a[4], a[11]), 2); + b[12] = vshlq_n_s16(vsubq_s16(a[3], a[12]), 2); + b[13] = vshlq_n_s16(vsubq_s16(a[2], a[13]), 2); + b[14] = vshlq_n_s16(vsubq_s16(a[1], a[14]), 2); + b[15] = vshlq_n_s16(vsubq_s16(a[0], a[15]), 2); + } else { + b[0] = vaddq_s16(a[0], a[15]); + b[1] = vaddq_s16(a[1], a[14]); + b[2] = vaddq_s16(a[2], a[13]); + b[3] = vaddq_s16(a[3], a[12]); + b[4] = vaddq_s16(a[4], a[11]); + b[5] = vaddq_s16(a[5], a[10]); + b[6] = vaddq_s16(a[6], a[9]); + b[7] = vaddq_s16(a[7], a[8]); + + b[8] = vsubq_s16(a[7], a[8]); + b[9] = vsubq_s16(a[6], a[9]); + b[10] = vsubq_s16(a[5], a[10]); + b[11] = vsubq_s16(a[4], a[11]); + b[12] = vsubq_s16(a[3], a[12]); + b[13] = vsubq_s16(a[2], a[13]); + b[14] = vsubq_s16(a[1], a[14]); + b[15] = vsubq_s16(a[0], a[15]); + } +} + +// Quarter round at the beginning of the second pass. Can't use vrshr (rounding) +// because this only adds 1, not 1 << 2. +static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) { + const int16x8_t one = vdupq_n_s16(1); + a[0] = vshrq_n_s16(vaddq_s16(a[0], one), 2); + a[1] = vshrq_n_s16(vaddq_s16(a[1], one), 2); + a[2] = vshrq_n_s16(vaddq_s16(a[2], one), 2); + a[3] = vshrq_n_s16(vaddq_s16(a[3], one), 2); + a[4] = vshrq_n_s16(vaddq_s16(a[4], one), 2); + a[5] = vshrq_n_s16(vaddq_s16(a[5], one), 2); + a[6] = vshrq_n_s16(vaddq_s16(a[6], one), 2); + a[7] = vshrq_n_s16(vaddq_s16(a[7], one), 2); + a[8] = vshrq_n_s16(vaddq_s16(a[8], one), 2); + a[9] = vshrq_n_s16(vaddq_s16(a[9], one), 2); + a[10] = vshrq_n_s16(vaddq_s16(a[10], one), 2); + a[11] = vshrq_n_s16(vaddq_s16(a[11], one), 2); + a[12] = vshrq_n_s16(vaddq_s16(a[12], one), 2); + a[13] = vshrq_n_s16(vaddq_s16(a[13], one), 2); + a[14] = vshrq_n_s16(vaddq_s16(a[14], one), 2); + a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2); +} + +// fdct_round_shift((a +/- b) * c) +static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b, + const tran_high_t c, int16x8_t *add, + int16x8_t *sub) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c); + const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), c); + const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), c); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c); + const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14); + const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14); + const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14); + const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14); + *add = vcombine_s16(rounded0, rounded1); + *sub = vcombine_s16(rounded2, rounded3); +} + +// fdct_round_shift(a * c0 +/- b * c1) +static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b, + const tran_coef_t c0, + const tran_coef_t c1, int16x8_t *add, + int16x8_t *sub) { + const int32x4_t a0 = 
vmull_n_s16(vget_low_s16(a), c0); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c0); + const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), c1); + const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), c1); + const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), c0); + const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), c0); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c1); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c1); + const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14); + const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14); + const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14); + const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14); + *add = vcombine_s16(rounded0, rounded1); + *sub = vcombine_s16(rounded2, rounded3); +} + +// Transpose 8x8 to a new location. Don't use transpose_neon.h because those +// are all in-place. +static INLINE void transpose_8x8(const int16x8_t *a /*[8]*/, + int16x8_t *b /*[8]*/) { + // Swap 16 bit elements. + const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); + const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); + const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); + const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); + + // Swap 32 bit elements. + const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), + vreinterpretq_s32_s16(c1.val[0])); + const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), + vreinterpretq_s32_s16(c1.val[1])); + const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), + vreinterpretq_s32_s16(c3.val[0])); + const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), + vreinterpretq_s32_s16(c3.val[1])); + + // Swap 64 bit elements + const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); + const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); + const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); + const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); + + b[0] = e0.val[0]; + b[1] = e1.val[0]; + b[2] = e2.val[0]; + b[3] = e3.val[0]; + b[4] = e0.val[1]; + b[5] = e1.val[1]; + b[6] = e2.val[1]; + b[7] = e3.val[1]; +} + +// Main body of fdct16x16. 
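+// in[0..7] (the crossed sums) feed an 8-point DCT that produces the
+// even-indexed outputs; in[8..15] (the crossed differences) produce the
+// odd-indexed outputs in steps 2-6.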
+static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, + int16x8_t *out /*[16]*/) { + int16x8_t s[8]; + int16x8_t x[4]; + int16x8_t step[8]; + + // stage 1 + // From fwd_txfm.c: Work on the first eight values; fdct8(input, + // even_results);" + s[0] = vaddq_s16(in[0], in[7]); + s[1] = vaddq_s16(in[1], in[6]); + s[2] = vaddq_s16(in[2], in[5]); + s[3] = vaddq_s16(in[3], in[4]); + s[4] = vsubq_s16(in[3], in[4]); + s[5] = vsubq_s16(in[2], in[5]); + s[6] = vsubq_s16(in[1], in[6]); + s[7] = vsubq_s16(in[0], in[7]); + + // fdct4(step, step); + x[0] = vaddq_s16(s[0], s[3]); + x[1] = vaddq_s16(s[1], s[2]); + x[2] = vsubq_s16(s[1], s[2]); + x[3] = vsubq_s16(s[0], s[3]); + + // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) + // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff(x[0], x[1], cospi_16_64, &out[0], &out[8]); + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); + butterfly_two_coeff(x[3], x[2], cospi_24_64, cospi_8_64, &out[4], &out[12]); + + // Stage 2 + // Re-using source s5/s6 + // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) + // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) + butterfly_one_coeff(s[6], s[5], cospi_16_64, &s[6], &s[5]); + + // Stage 3 + x[0] = vaddq_s16(s[4], s[5]); + x[1] = vsubq_s16(s[4], s[5]); + x[2] = vsubq_s16(s[7], s[6]); + x[3] = vaddq_s16(s[7], s[6]); + + // Stage 4 + // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff(x[3], x[0], cospi_28_64, cospi_4_64, &out[2], &out[14]); + // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff(x[2], x[1], cospi_12_64, cospi_20_64, &out[10], &out[6]); + + // step 2 + // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" + // That file distinguished between "in_high" and "step1" but the only + // difference is that "in_high" is the first 8 values and "step 1" is the + // second. Here, since they are all in one array, "step1" values are += 8. 
+ + // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) + // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) + // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) + // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) + butterfly_one_coeff(in[13], in[10], cospi_16_64, &s[5], &s[2]); + butterfly_one_coeff(in[12], in[11], cospi_16_64, &s[4], &s[3]); + + // step 3 + s[0] = vaddq_s16(in[8], s[3]); + s[1] = vaddq_s16(in[9], s[2]); + x[0] = vsubq_s16(in[9], s[2]); + x[1] = vsubq_s16(in[8], s[3]); + x[2] = vsubq_s16(in[15], s[4]); + x[3] = vsubq_s16(in[14], s[5]); + s[6] = vaddq_s16(in[14], s[5]); + s[7] = vaddq_s16(in[15], s[4]); + + // step 4 + // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64) + // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64) + butterfly_two_coeff(s[6], s[1], cospi_24_64, cospi_8_64, &s[6], &s[1]); + + // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64) + butterfly_two_coeff(x[0], x[3], cospi_8_64, cospi_24_64, &s[2], &s[5]); + + // step 5 + step[0] = vaddq_s16(s[0], s[1]); + step[1] = vsubq_s16(s[0], s[1]); + step[2] = vaddq_s16(x[1], s[2]); + step[3] = vsubq_s16(x[1], s[2]); + step[4] = vsubq_s16(x[2], s[5]); + step[5] = vaddq_s16(x[2], s[5]); + step[6] = vsubq_s16(s[7], s[6]); + step[7] = vaddq_s16(s[7], s[6]); + + // step 6 + // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64) + // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64) + // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64) + // out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * cospi_26_64) + // out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * cospi_6_64) + // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] * + // cospi_22_64) + // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64) + // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64) + butterfly_two_coeff(step[6], step[1], cospi_14_64, cospi_18_64, &out[9], + &out[7]); + butterfly_two_coeff(step[7], step[0], cospi_30_64, cospi_2_64, &out[1], + &out[15]); + butterfly_two_coeff(step[4], step[3], cospi_6_64, cospi_26_64, &out[13], + &out[3]); + butterfly_two_coeff(step[5], step[2], cospi_22_64, cospi_10_64, &out[5], + &out[11]); +} + +#endif // VPX_VPX_DSP_ARM_FDCT16x16_NEON_H_ diff --git a/vpx_dsp/arm/fdct_neon.c b/vpx_dsp/arm/fdct_neon.c index 3708cbb11f..2827791f1e 100644 --- a/vpx_dsp/arm/fdct_neon.c +++ b/vpx_dsp/arm/fdct_neon.c @@ -15,6 +15,7 @@ #include "vpx_dsp/txfm_common.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/fdct_neon.h" #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" @@ -22,67 +23,25 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, int stride) { int i; // input[M * stride] * 16 - int16x4_t input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4); - int16x4_t input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4); - int16x4_t input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4); - int16x4_t input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4); + int16x4_t in[4]; + in[0] = vshl_n_s16(vld1_s16(input + 0 * stride), 4); + in[1] = vshl_n_s16(vld1_s16(input + 1 * stride), 4); + in[2] = vshl_n_s16(vld1_s16(input + 2 * stride), 4); + in[3] = vshl_n_s16(vld1_s16(input 
+ 3 * stride), 4); // If the very first value != 0, then add 1. if (input[0] != 0) { const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1)); - input_0 = vadd_s16(input_0, one); + in[0] = vadd_s16(in[0], one); } - for (i = 0; i < 2; ++i) { - const int16x8_t input_01 = vcombine_s16(input_0, input_1); - const int16x8_t input_32 = vcombine_s16(input_3, input_2); - - // in_0 +/- in_3, in_1 +/- in_2 - const int16x8_t s_01 = vaddq_s16(input_01, input_32); - const int16x8_t s_32 = vsubq_s16(input_01, input_32); - - // step_0 +/- step_1, step_2 +/- step_3 - const int16x4_t s_0 = vget_low_s16(s_01); - const int16x4_t s_1 = vget_high_s16(s_01); - const int16x4_t s_2 = vget_high_s16(s_32); - const int16x4_t s_3 = vget_low_s16(s_32); - - // (s_0 +/- s_1) * cospi_16_64 - // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c. - const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1); - const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1); - const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64); - const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64); - - // fdct_round_shift - int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS); - int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS); - - // s_3 * cospi_8_64 + s_2 * cospi_24_64 - // s_3 * cospi_24_64 - s_2 * cospi_8_64 - const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64); - const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64); - - const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64); - const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64); - - // fdct_round_shift - int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS); - int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS); - - transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3); - - input_0 = out_0; - input_1 = out_1; - input_2 = out_2; - input_3 = out_3; + vpx_fdct4x4_pass1_neon(in); } - { // Not quite a rounding shift. Only add 1 despite shifting by 2. const int16x8_t one = vdupq_n_s16(1); - int16x8_t out_01 = vcombine_s16(input_0, input_1); - int16x8_t out_23 = vcombine_s16(input_2, input_3); + int16x8_t out_01 = vcombine_s16(in[0], in[1]); + int16x8_t out_23 = vcombine_s16(in[2], in[3]); out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2); out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2); store_s16q_to_tran_low(final_output + 0 * 8, out_01); diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h new file mode 100644 index 0000000000..28d7d86bf8 --- /dev/null +++ b/vpx_dsp/arm/fdct_neon.h @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) {
+  const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
+  const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
+
+  // in_0 +/- in_3, in_1 +/- in_2
+  const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+  const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+  // step_0 +/- step_1, step_2 +/- step_3
+  const int16x4_t s_0 = vget_low_s16(s_01);
+  const int16x4_t s_1 = vget_high_s16(s_01);
+  const int16x4_t s_2 = vget_high_s16(s_32);
+  const int16x4_t s_3 = vget_low_s16(s_32);
+
+  // (s_0 +/- s_1) * cospi_16_64
+  // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
+  const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
+  const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
+  const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64);
+  const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64);
+
+  // fdct_round_shift
+  int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
+  int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS);
+
+  // s_3 * cospi_8_64 + s_2 * cospi_24_64
+  // s_3 * cospi_24_64 - s_2 * cospi_8_64
+  const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64);
+  const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64);
+
+  const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64);
+  const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64);
+
+  // fdct_round_shift
+  int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
+  int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS);
+
+  transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
+
+  in[0] = out_0;
+  in[1] = out_1;
+  in[2] = out_2;
+  in[3] = out_3;
+}
+
+static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in,
+                                                      int16x8_t *out) {
+  const int16x8_t v_s0 = vaddq_s16(in[0], in[7]);
+  const int16x8_t v_s1 = vaddq_s16(in[1], in[6]);
+  const int16x8_t v_s2 = vaddq_s16(in[2], in[5]);
+  const int16x8_t v_s3 = vaddq_s16(in[3], in[4]);
+  const int16x8_t v_s4 = vsubq_s16(in[3], in[4]);
+  const int16x8_t v_s5 = vsubq_s16(in[2], in[5]);
+  const int16x8_t v_s6 = vsubq_s16(in[1], in[6]);
+  const int16x8_t v_s7 = vsubq_s16(in[0], in[7]);
+  // fdct4(step, step);
+  int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
+  int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
+  int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
+  int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
+  // fdct4(step, step);
+  int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+  int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+  int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+  int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+  int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64);
+  int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64);
+  int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64);
+  int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64);
+  v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64);
+  v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64);
+  v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64);
+  v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64);
+  v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
+  v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
+  v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
+  v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
+  {
+    const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+    const
int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); + const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); + const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); + const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); + out[0] = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43 + out[2] = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63 + out[4] = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47 + out[6] = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67 + } + // Stage 2 + v_x0 = vsubq_s16(v_s6, v_s5); + v_x1 = vaddq_s16(v_s6, v_s5); + v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x8_t ab = vcombine_s16(a, b); + const int16x8_t cd = vcombine_s16(c, d); + // Stage 3 + v_x0 = vaddq_s16(v_s4, ab); + v_x1 = vsubq_s16(v_s4, ab); + v_x2 = vsubq_s16(v_s7, cd); + v_x3 = vaddq_s16(v_s7, cd); + } + // Stage 4 + v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64); + v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64); + v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), cospi_28_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64); + v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64); + v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64); + v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64); + v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64); + v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64); + v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64); + v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64); + v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); + const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); + const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); + const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); + out[1] = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53 + out[3] = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73 + out[5] = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57 + out[7] = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77 + } +} + +static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) { + int16x8_t out[8]; + vpx_fdct8x8_pass1_notranspose_neon(in, out); + // transpose 8x8 + // Can't use transpose_s16_8x8() because the values are arranged in two 4x8 + // columns. 
+ { + // 00 01 02 03 40 41 42 43 + // 10 11 12 13 50 51 52 53 + // 20 21 22 23 60 61 62 63 + // 30 31 32 33 70 71 72 73 + // 04 05 06 07 44 45 46 47 + // 14 15 16 17 54 55 56 57 + // 24 25 26 27 64 65 66 67 + // 34 35 36 37 74 75 76 77 + const int32x4x2_t r02_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out[0]), vreinterpretq_s32_s16(out[2])); + const int32x4x2_t r13_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out[1]), vreinterpretq_s32_s16(out[3])); + const int32x4x2_t r46_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out[4]), vreinterpretq_s32_s16(out[6])); + const int32x4x2_t r57_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out[5]), vreinterpretq_s32_s16(out[7])); + const int16x8x2_t r01_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]), + vreinterpretq_s16_s32(r13_s32.val[0])); + const int16x8x2_t r23_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]), + vreinterpretq_s16_s32(r13_s32.val[1])); + const int16x8x2_t r45_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]), + vreinterpretq_s16_s32(r57_s32.val[0])); + const int16x8x2_t r67_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]), + vreinterpretq_s16_s32(r57_s32.val[1])); + in[0] = r01_s16.val[0]; + in[1] = r01_s16.val[1]; + in[2] = r23_s16.val[0]; + in[3] = r23_s16.val[1]; + in[4] = r45_s16.val[0]; + in[5] = r45_s16.val[1]; + in[6] = r67_s16.val[0]; + in[7] = r67_s16.val[1]; + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + } +} +#endif // VPX_VPX_DSP_ARM_FDCT_NEON_H_ diff --git a/vpx_dsp/arm/fwd_txfm_neon.c b/vpx_dsp/arm/fwd_txfm_neon.c index 374a262b93..d9161c6d38 100644 --- a/vpx_dsp/arm/fwd_txfm_neon.c +++ b/vpx_dsp/arm/fwd_txfm_neon.c @@ -15,196 +15,54 @@ #include "vpx_dsp/txfm_common.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/fdct_neon.h" #include "vpx_dsp/arm/mem_neon.h" void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, int stride) { int i; // stage 1 - int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2); - int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2); - int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2); - int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2); - int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2); - int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2); - int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2); - int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2); + int16x8_t in[8]; + in[0] = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2); + in[1] = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2); + in[2] = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2); + in[3] = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2); + in[4] = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2); + in[5] = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2); + in[6] = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2); + in[7] = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2); for (i = 0; i < 2; ++i) { - int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7; - const int16x8_t v_s0 = vaddq_s16(input_0, input_7); - const int16x8_t v_s1 = vaddq_s16(input_1, input_6); - const int16x8_t v_s2 = vaddq_s16(input_2, input_5); - const int16x8_t v_s3 = vaddq_s16(input_3, input_4); - const int16x8_t v_s4 = vsubq_s16(input_3, input_4); - const int16x8_t v_s5 = vsubq_s16(input_2, input_5); - 
const int16x8_t v_s6 = vsubq_s16(input_1, input_6); - const int16x8_t v_s7 = vsubq_s16(input_0, input_7); - // fdct4(step, step); - int16x8_t v_x0 = vaddq_s16(v_s0, v_s3); - int16x8_t v_x1 = vaddq_s16(v_s1, v_s2); - int16x8_t v_x2 = vsubq_s16(v_s1, v_s2); - int16x8_t v_x3 = vsubq_s16(v_s0, v_s3); - // fdct4(step, step); - int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); - int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); - int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); - int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); - int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64); - int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64); - int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64); - int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64); - v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64); - v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64); - v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64); - v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64); - v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64); - v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64); - v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64); - v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64); - { - const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); - const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); - const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); - const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); - const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); - const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); - const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); - const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); - out_0 = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43 - out_2 = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63 - out_4 = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47 - out_6 = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67 - } - // Stage 2 - v_x0 = vsubq_s16(v_s6, v_s5); - v_x1 = vaddq_s16(v_s6, v_s5); - v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64); - v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64); - v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64); - v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64); - { - const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); - const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); - const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); - const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); - const int16x8_t ab = vcombine_s16(a, b); - const int16x8_t cd = vcombine_s16(c, d); - // Stage 3 - v_x0 = vaddq_s16(v_s4, ab); - v_x1 = vsubq_s16(v_s4, ab); - v_x2 = vsubq_s16(v_s7, cd); - v_x3 = vaddq_s16(v_s7, cd); - } - // Stage 4 - v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64); - v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64); - v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64); - v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), cospi_28_64); - v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64); - v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64); - v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64); - v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64); - v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64); - v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64); - v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64); - 
v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64); - v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64); - v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64); - v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64); - v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64); - { - const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); - const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); - const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); - const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); - const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); - const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); - const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); - const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); - out_1 = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53 - out_3 = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73 - out_5 = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57 - out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77 - } - // transpose 8x8 - // Can't use transpose_s16_8x8() because the values are arranged in two 4x8 - // columns. - { - // 00 01 02 03 40 41 42 43 - // 10 11 12 13 50 51 52 53 - // 20 21 22 23 60 61 62 63 - // 30 31 32 33 70 71 72 73 - // 04 05 06 07 44 45 46 47 - // 14 15 16 17 54 55 56 57 - // 24 25 26 27 64 65 66 67 - // 34 35 36 37 74 75 76 77 - const int32x4x2_t r02_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2)); - const int32x4x2_t r13_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3)); - const int32x4x2_t r46_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6)); - const int32x4x2_t r57_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7)); - const int16x8x2_t r01_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]), - vreinterpretq_s16_s32(r13_s32.val[0])); - const int16x8x2_t r23_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]), - vreinterpretq_s16_s32(r13_s32.val[1])); - const int16x8x2_t r45_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]), - vreinterpretq_s16_s32(r57_s32.val[0])); - const int16x8x2_t r67_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]), - vreinterpretq_s16_s32(r57_s32.val[1])); - input_0 = r01_s16.val[0]; - input_1 = r01_s16.val[1]; - input_2 = r23_s16.val[0]; - input_3 = r23_s16.val[1]; - input_4 = r45_s16.val[0]; - input_5 = r45_s16.val[1]; - input_6 = r67_s16.val[0]; - input_7 = r67_s16.val[1]; - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 - } + vpx_fdct8x8_pass1_neon(in); } // for { // from vpx_dct_sse2.c // Post-condition (division by two) // division of two 16 bits signed numbers using shifts // n / 2 = (n - (n >> 15)) >> 1 - const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15); - const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15); - const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15); - const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15); - const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15); - const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15); - const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15); - const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15); - input_0 = vhsubq_s16(input_0, sign_in0); - input_1 = vhsubq_s16(input_1, sign_in1); - input_2 = vhsubq_s16(input_2, sign_in2); - input_3 = vhsubq_s16(input_3, sign_in3); - 
input_4 = vhsubq_s16(input_4, sign_in4); - input_5 = vhsubq_s16(input_5, sign_in5); - input_6 = vhsubq_s16(input_6, sign_in6); - input_7 = vhsubq_s16(input_7, sign_in7); + const int16x8_t sign_in0 = vshrq_n_s16(in[0], 15); + const int16x8_t sign_in1 = vshrq_n_s16(in[1], 15); + const int16x8_t sign_in2 = vshrq_n_s16(in[2], 15); + const int16x8_t sign_in3 = vshrq_n_s16(in[3], 15); + const int16x8_t sign_in4 = vshrq_n_s16(in[4], 15); + const int16x8_t sign_in5 = vshrq_n_s16(in[5], 15); + const int16x8_t sign_in6 = vshrq_n_s16(in[6], 15); + const int16x8_t sign_in7 = vshrq_n_s16(in[7], 15); + in[0] = vhsubq_s16(in[0], sign_in0); + in[1] = vhsubq_s16(in[1], sign_in1); + in[2] = vhsubq_s16(in[2], sign_in2); + in[3] = vhsubq_s16(in[3], sign_in3); + in[4] = vhsubq_s16(in[4], sign_in4); + in[5] = vhsubq_s16(in[5], sign_in5); + in[6] = vhsubq_s16(in[6], sign_in6); + in[7] = vhsubq_s16(in[7], sign_in7); // store results - store_s16q_to_tran_low(final_output + 0 * 8, input_0); - store_s16q_to_tran_low(final_output + 1 * 8, input_1); - store_s16q_to_tran_low(final_output + 2 * 8, input_2); - store_s16q_to_tran_low(final_output + 3 * 8, input_3); - store_s16q_to_tran_low(final_output + 4 * 8, input_4); - store_s16q_to_tran_low(final_output + 5 * 8, input_5); - store_s16q_to_tran_low(final_output + 6 * 8, input_6); - store_s16q_to_tran_low(final_output + 7 * 8, input_7); + store_s16q_to_tran_low(final_output + 0 * 8, in[0]); + store_s16q_to_tran_low(final_output + 1 * 8, in[1]); + store_s16q_to_tran_low(final_output + 2 * 8, in[2]); + store_s16q_to_tran_low(final_output + 3 * 8, in[3]); + store_s16q_to_tran_low(final_output + 4 * 8, in[4]); + store_s16q_to_tran_low(final_output + 5 * 8, in[5]); + store_s16q_to_tran_low(final_output + 6 * 8, in[6]); + store_s16q_to_tran_low(final_output + 7 * 8, in[7]); } } diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h index 752308160d..c098ad31b6 100644 --- a/vpx_dsp/arm/transpose_neon.h +++ b/vpx_dsp/arm/transpose_neon.h @@ -1184,6 +1184,45 @@ static INLINE void transpose_u8_16x16( *o15 = e7.val[1]; } +static INLINE void transpose_s16_16x16(int16x8_t *in0, int16x8_t *in1) { + int16x8_t t[8]; + + // transpose the 4 8x8 quadrants separately but first swap quadrants 2 and 3. 
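+  // Transposing [A B; C D] in 8x8 blocks yields [A' C'; B' D'], so the two
+  // off-diagonal quadrants are swapped first and each 8x8 quadrant is then
+  // transposed in place below.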
+ t[0] = in0[8]; + t[1] = in0[9]; + t[2] = in0[10]; + t[3] = in0[11]; + t[4] = in0[12]; + t[5] = in0[13]; + t[6] = in0[14]; + t[7] = in0[15]; + in0[8] = in1[0]; + in0[9] = in1[1]; + in0[10] = in1[2]; + in0[11] = in1[3]; + in0[12] = in1[4]; + in0[13] = in1[5]; + in0[14] = in1[6]; + in0[15] = in1[7]; + in1[0] = t[0]; + in1[1] = t[1]; + in1[2] = t[2]; + in1[3] = t[3]; + in1[4] = t[4]; + in1[5] = t[5]; + in1[6] = t[6]; + in1[7] = t[7]; + + transpose_s16_8x8(&in0[0], &in0[1], &in0[2], &in0[3], &in0[4], &in0[5], + &in0[6], &in0[7]); + transpose_s16_8x8(&in0[8], &in0[9], &in0[10], &in0[11], &in0[12], &in0[13], + &in0[14], &in0[15]); + transpose_s16_8x8(&in1[0], &in1[1], &in1[2], &in1[3], &in1[4], &in1[5], + &in1[6], &in1[7]); + transpose_s16_8x8(&in1[8], &in1[9], &in1[10], &in1[11], &in1[12], &in1[13], + &in1[14], &in1[15]); +} + static INLINE void load_and_transpose_u8_4x8(const uint8_t *a, const int a_stride, uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, From f3711cae5a4cf5d673f7f27ea3a5975ab3f2982a Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 22 Mar 2022 13:07:31 -0700 Subject: [PATCH 223/926] Fix ClangTidy style warning Change-Id: I6c4711e488cda6b97af96d5e1b6b249786e709de --- test/svc_end_to_end_test.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test/svc_end_to_end_test.cc b/test/svc_end_to_end_test.cc index c0556d8b7d..7300ce6679 100644 --- a/test/svc_end_to_end_test.cc +++ b/test/svc_end_to_end_test.cc @@ -251,12 +251,13 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, temporal_layer_id_ = layer_id.temporal_layer_id; for (int i = 0; i < number_spatial_layers_; i++) { layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_; - ref_frame_config.duration[i] = 1; + ref_frame_config_.duration[i] = 1; } encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); set_frame_flags_bypass_mode(layer_id.temporal_layer_id, - number_spatial_layers_, 0, &ref_frame_config); - encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config); + number_spatial_layers_, 0, + &ref_frame_config_); + encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_); } if (video->frame() == frame_to_sync_) { encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync_); @@ -327,7 +328,7 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, unsigned int mismatch_nframes_; unsigned int num_nonref_frames_; bool flexible_mode_; - vpx_svc_ref_frame_config_t ref_frame_config; + vpx_svc_ref_frame_config_t ref_frame_config_; private: virtual void SetConfig(const int num_temporal_layer) { From f6344745d9887adb38a62a878a3b794b84240851 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 22 Mar 2022 13:51:27 -0700 Subject: [PATCH 224/926] ads2gas*.pl: strip trailing whitespace after transforms Change-Id: I0bea977b256e464231706c72cc14a5c8b6e90775 --- build/make/ads2gas.pl | 1 + build/make/ads2gas_apple.pl | 1 + 2 files changed, 2 insertions(+) diff --git a/build/make/ads2gas.pl b/build/make/ads2gas.pl index 4b7a906d26..c17c4114a2 100755 --- a/build/make/ads2gas.pl +++ b/build/make/ads2gas.pl @@ -147,6 +147,7 @@ s/\bMEND\b/.endm/; # No need to tell it where to stop assembling next if /^\s*END\s*$/; + s/[ \t]+$//; print; } diff --git a/build/make/ads2gas_apple.pl b/build/make/ads2gas_apple.pl index af10b436a9..34254f4abe 100755 --- a/build/make/ads2gas_apple.pl +++ b/build/make/ads2gas_apple.pl @@ -109,5 +109,6 @@ ($) s/\bMEND\b/.endm/; # No need to tell it where to stop assembling next if /^\s*END\s*$/; + s/[ \t]+$//; print; } From 
da0cfd3d592713a66863484f1c19aba2b775306f Mon Sep 17 00:00:00 2001 From: Johann Date: Wed, 23 Mar 2022 13:58:46 +0900 Subject: [PATCH 225/926] ads2gas: fix .size measurement The distance between PROC and END is used to generate .size information for debugging. When the leading underscore was removed the pattern used to match the function name broke. Change-Id: I90bf67d95ecdc2d214606e663773f88d2a2d6b9c --- build/make/ads2gas.pl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/build/make/ads2gas.pl b/build/make/ads2gas.pl index c17c4114a2..fcbd59b89b 100755 --- a/build/make/ads2gas.pl +++ b/build/make/ads2gas.pl @@ -118,7 +118,8 @@ # This makes them show up properly in debugging tools like gdb and valgrind. if (/\bPROC\b/) { my $proc; - /^_([\.0-9A-Z_a-z]\w+)\b/; + # Match the function name so it can be stored in $proc + /^([\.0-9A-Z_a-z]\w+)\b/; $proc = $1; push(@proc_stack, $proc) if ($proc); s/\bPROC\b/@ $&/; From 29cde7ec1a1d466e30c33b939616a4903a8abb3f Mon Sep 17 00:00:00 2001 From: Johann Date: Wed, 23 Mar 2022 14:18:58 +0900 Subject: [PATCH 226/926] ads2gas: maintain whitespace Don't use tabs during conversion. Save and restore existing spacing. Change-Id: Ib8f443db542c091d36e9ab9836e3e3e292d711f7 --- build/make/ads2gas.pl | 31 ++++++++++++++++--------------- build/make/ads2gas_apple.pl | 8 ++++---- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/build/make/ads2gas.pl b/build/make/ads2gas.pl index fcbd59b89b..c301b7f829 100755 --- a/build/make/ads2gas.pl +++ b/build/make/ads2gas.pl @@ -32,7 +32,7 @@ print "@ This file was created from a .asm file\n"; print "@ using the ads2gas.pl script.\n"; -print "\t.syntax unified\n"; +print ".syntax unified\n"; if ($thumb) { print "\t.thumb\n"; } @@ -60,19 +60,19 @@ } # Convert INCLUDE to .INCLUDE "file" - s/INCLUDE(\s*)(.*)$/.include $1\"$2\"/; + s/INCLUDE\s?(.*)$/.include \"$1\"/; # No AREA required # But ALIGNs in AREA must be obeyed - s/^\s*AREA.*ALIGN=([0-9])$/.text\n.p2align $1/; + s/^(\s*)\bAREA\b.*ALIGN=([0-9])$/$1.text\n$1.p2align $2/; # If no ALIGN, strip the AREA and align to 4 bytes - s/^\s*AREA.*$/.text\n.p2align 2/; + s/^(\s*)\bAREA\b.*$/$1.text\n$1.p2align 2/; # Make function visible to linker. if ($elf) { - s/EXPORT\s+\|([\$\w]*)\|/.global $1 \n\t.type $1, function/; + s/(\s*)EXPORT\s+\|([\$\w]*)\|/$1.global $2\n$1.type $2, function/; } else { - s/EXPORT\s+\|([\$\w]*)\|/.global $1/; + s/(\s*)EXPORT\s+\|([\$\w]*)\|/$1.global $2/; } # No vertical bars on function names @@ -85,11 +85,12 @@ s/\bALIGN\b/.balign/g; if ($thumb) { - # ARM code - we force everything to thumb with the declaration in the header - s/\s+ARM//g; + # ARM code - we force everything to thumb with the declaration in the + # header + s/\bARM\b//g; } else { # ARM code - s/\sARM/.arm/g; + s/\bARM\b/.arm/g; } # push/pop @@ -105,13 +106,13 @@ if ($elf) { # REQUIRE8 Stack is required to be 8-byte aligned - s/\sREQUIRE8/.eabi_attribute 24, 1 \@Tag_ABI_align_needed/g; + s/\bREQUIRE8\b/.eabi_attribute 24, 1 \@Tag_ABI_align_needed/g; # PRESERVE8 Stack 8-byte align is preserved - s/\sPRESERVE8/.eabi_attribute 25, 1 \@Tag_ABI_align_preserved/g; + s/\bPRESERVE8\b/.eabi_attribute 25, 1 \@Tag_ABI_align_preserved/g; } else { - s/\s+REQUIRE8//; - s/\s+PRESERVE8//; + s/\bREQUIRE8\b//; + s/\bPRESERVE8\b//; } # Use PROC and ENDP to give the symbols a .size directive. 
@@ -129,7 +130,7 @@ my $proc; s/\bENDP\b/@ $&/; $proc = pop(@proc_stack); - $_ = "\t.size $proc, .-$proc".$_ if ($proc and $elf); + $_ = ".size $proc, .-$proc".$_ if ($proc and $elf); } # EQU directive @@ -153,4 +154,4 @@ } # Mark that this object doesn't need an executable stack. -printf ("\t.section\t.note.GNU-stack,\"\",\%\%progbits\n") if $elf; +printf (" .section .note.GNU-stack,\"\",\%\%progbits\n") if $elf; diff --git a/build/make/ads2gas_apple.pl b/build/make/ads2gas_apple.pl index 34254f4abe..62491c1918 100755 --- a/build/make/ads2gas_apple.pl +++ b/build/make/ads2gas_apple.pl @@ -20,7 +20,7 @@ print "@ This file was created from a .asm file\n"; print "@ using the ads2gas_apple.pl script.\n\n"; -print "\t.syntax unified\n"; +print ".syntax unified\n"; my %macro_aliases; @@ -57,13 +57,13 @@ ($) } # Convert INCLUDE to .INCLUDE "file" - s/INCLUDE(\s*)(.*)$/.include $1\"$2\"/; + s/INCLUDE\s?(.*)$/.include \"$1\"/; # No AREA required # But ALIGNs in AREA must be obeyed - s/^\s*AREA.*ALIGN=([0-9])$/.text\n.p2align $1/; + s/^(\s*)\bAREA\b.*ALIGN=([0-9])$/$1.text\n$1.p2align $2/; # If no ALIGN, strip the AREA and align to 4 bytes - s/^\s*AREA.*$/.text\n.p2align 2/; + s/^(\s*)\bAREA\b.*$/$1.text\n$1.p2align 2/; # Make function visible to linker. s/EXPORT\s+\|([\$\w]*)\|/.globl _$1/; From 9c424b7556ee44df2335332e079c59f0a8d3559b Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 26 Mar 2022 10:25:18 -0700 Subject: [PATCH 227/926] ads2armasm_ms.pl: fix thumb::FixThumbInstructions call broken since: 642529248 ads2gas[_apple].pl: remove unused stanzas Change-Id: I1eac77e2fe23cc3f162251e9e0102a4909f7b997 --- build/make/ads2armasm_ms.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/make/ads2armasm_ms.pl b/build/make/ads2armasm_ms.pl index 2a2c470ff8..dd4e0318c4 100755 --- a/build/make/ads2armasm_ms.pl +++ b/build/make/ads2armasm_ms.pl @@ -28,7 +28,7 @@ s/qsubaddx/qsax/i; s/qaddsubx/qasx/i; - thumb::FixThumbInstructions($_, 1); + thumb::FixThumbInstructions($_); s/ldrneb/ldrbne/i; s/ldrneh/ldrhne/i; From d60b671a73a4c8ebc2324ccb248a713652d6506b Mon Sep 17 00:00:00 2001 From: Johann Date: Wed, 23 Mar 2022 14:28:29 +0900 Subject: [PATCH 228/926] gcc 11 warning: mismatched bound MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clean up a new build warning with gcc11: argument 3 of type ‘const uint8_t * const[]’ with mismatched bound [-Warray-parameter=] Standardize sad functions with array sizes. 
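For illustration (hypothetical f(), not a function from the tree), the
warning fires when a declaration's array bound disagrees with the
definition's:

    void f(const uint8_t *const refs[4], uint32_t sad_array[4]);  /* prototype */
    void f(const uint8_t *const refs[], uint32_t sad_array[4]) {} /* mismatched bound */

Spelling the bounds out consistently in both places silences the warning and
documents the sizes callers must provide.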
Change-Id: Iea4144e61368f6a8279e2f3ae96c78aff06c8b41 --- vpx_dsp/arm/sad4d_neon.c | 122 +++++++++++++++++------------------ vpx_dsp/mips/sad_msa.c | 32 ++++----- vpx_dsp/sad.c | 48 +++++++++----- vpx_dsp/vpx_dsp_rtcd_defs.pl | 74 ++++++++++----------- vpx_dsp/x86/sad4d_avx2.c | 13 ++-- vpx_dsp/x86/sad4d_avx512.c | 6 +- 6 files changed, 155 insertions(+), 140 deletions(-) diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 5c7a0fcaf0..03f716c3d5 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -31,7 +31,7 @@ static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0, static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride, const uint8_t *const ref_array[4], const int ref_stride, const int height, - uint32_t *const res) { + uint32_t sad_array[4]) { int i; uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) }; #if !defined(__aarch64__) @@ -61,26 +61,26 @@ static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride, a[1] = vpadd_u16(vget_low_u16(abs[1]), vget_high_u16(abs[1])); r = vpaddlq_u16(vcombine_u16(a[0], a[1])); #endif - vst1q_u32(res, r); + vst1q_u32(sad_array, r); } void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 4, res); + uint32_t sad_array[4]) { + sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 4, sad_array); } void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 8, res); + uint32_t sad_array[4]) { + sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 8, sad_array); } //////////////////////////////////////////////////////////////////////////////// // Can handle 512 pixels' sad sum (such as 16x32 or 32x16) -static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/, - uint32_t *const res) { +static INLINE void sad_512_pel_final_neon(const uint16x8_t sum[4], + uint32_t sad_array[4]) { #if defined(__aarch64__) const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); @@ -95,21 +95,21 @@ static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/, const uint16x4_t b1 = vpadd_u16(a2, a3); const uint32x4_t r = vpaddlq_u16(vcombine_u16(b0, b1)); #endif - vst1q_u32(res, r); + vst1q_u32(sad_array, r); } #if defined(__arm__) || !defined(__ARM_FEATURE_DOTPROD) // Can handle 1024 pixels' sad sum (such as 32x32) -static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/, - uint32_t *const res) { +static INLINE void sad_1024_pel_final_neon(const uint16x8_t sum[4], + uint32_t sad_array[4]) { #if defined(__aarch64__) const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); const uint32x4_t b0 = vpaddlq_u16(a0); const uint32x4_t b1 = vpaddlq_u16(a1); const uint32x4_t r = vpaddq_u32(b0, b1); - vst1q_u32(res, r); + vst1q_u32(sad_array, r); #else const uint16x4_t a0 = vpadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); const uint16x4_t a1 = vpadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); @@ -119,13 +119,13 @@ static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/, const uint32x4_t b1 = vpaddlq_u16(vcombine_u16(a2, a3)); const uint32x2_t c0 = vpadd_u32(vget_low_u32(b0), vget_high_u32(b0)); const uint32x2_t c1 = vpadd_u32(vget_low_u32(b1), vget_high_u32(b1)); - vst1q_u32(res, 
vcombine_u32(c0, c1)); + vst1q_u32(sad_array, vcombine_u32(c0, c1)); #endif } // Can handle 2048 pixels' sad sum (such as 32x64 or 64x32) -static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/, - uint32_t *const res) { +static INLINE void sad_2048_pel_final_neon(const uint16x8_t sum[4], + uint32_t sad_array[4]) { #if defined(__aarch64__) const uint32x4_t a0 = vpaddlq_u16(sum[0]); const uint32x4_t a1 = vpaddlq_u16(sum[1]); @@ -134,7 +134,7 @@ static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/, const uint32x4_t b0 = vpaddq_u32(a0, a1); const uint32x4_t b1 = vpaddq_u32(a2, a3); const uint32x4_t r = vpaddq_u32(b0, b1); - vst1q_u32(res, r); + vst1q_u32(sad_array, r); #else const uint32x4_t a0 = vpaddlq_u16(sum[0]); const uint32x4_t a1 = vpaddlq_u16(sum[1]); @@ -146,13 +146,13 @@ static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/, const uint32x2_t b3 = vadd_u32(vget_low_u32(a3), vget_high_u32(a3)); const uint32x2_t c0 = vpadd_u32(b0, b1); const uint32x2_t c1 = vpadd_u32(b2, b3); - vst1q_u32(res, vcombine_u32(c0, c1)); + vst1q_u32(sad_array, vcombine_u32(c0, c1)); #endif } // Can handle 4096 pixels' sad sum (such as 64x64) -static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/, - uint32_t *const res) { +static INLINE void sad_4096_pel_final_neon(const uint16x8_t sum[8], + uint32_t sad_array[4]) { #if defined(__aarch64__) const uint32x4_t a0 = vpaddlq_u16(sum[0]); const uint32x4_t a1 = vpaddlq_u16(sum[1]); @@ -169,7 +169,7 @@ static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/, const uint32x4_t c0 = vpaddq_u32(b0, b1); const uint32x4_t c1 = vpaddq_u32(b2, b3); const uint32x4_t r = vpaddq_u32(c0, c1); - vst1q_u32(res, r); + vst1q_u32(sad_array, r); #else const uint32x4_t a0 = vpaddlq_u16(sum[0]); const uint32x4_t a1 = vpaddlq_u16(sum[1]); @@ -189,7 +189,7 @@ static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/, const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3)); const uint32x2_t d0 = vpadd_u32(c0, c1); const uint32x2_t d1 = vpadd_u32(c2, c3); - vst1q_u32(res, vcombine_u32(d0, d1)); + vst1q_u32(sad_array, vcombine_u32(d0, d1)); #endif } @@ -197,7 +197,7 @@ static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/, static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res, const int height) { + uint32_t sad_array[4], const int height) { int i, j; const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], ref_array[3] }; @@ -214,25 +214,25 @@ static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride, } } - sad_512_pel_final_neon(sum, res); + sad_512_pel_final_neon(sum, sad_array); } void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 4); + uint32_t sad_array[4]) { + sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 4); } void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8); + uint32_t sad_array[4]) { + sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 8); } void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, 
res, 16); + uint32_t sad_array[4]) { + sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16); } //////////////////////////////////////////////////////////////////////////////// @@ -249,7 +249,7 @@ static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res, const int height) { + uint32_t sad_array[4], const int height) { int i; uint32x4_t r0, r1; const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], @@ -267,7 +267,7 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, r0 = vpaddq_u32(sum[0], sum[1]); r1 = vpaddq_u32(sum[2], sum[3]); - vst1q_u32(res, vpaddq_u32(r0, r1)); + vst1q_u32(sad_array, vpaddq_u32(r0, r1)); } #else @@ -281,7 +281,7 @@ static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res, const int height) { + uint32_t sad_array[4], const int height) { int i; const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], ref_array[3] }; @@ -302,27 +302,27 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, ref_loop[3] += ref_stride; } - sad_512_pel_final_neon(sum, res); + sad_512_pel_final_neon(sum, sad_array); } #endif void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8); + uint32_t sad_array[4]) { + sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 8); } void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16); + uint32_t sad_array[4]) { + sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16); } void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 32); + uint32_t sad_array[4]) { + sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 32); } //////////////////////////////////////////////////////////////////////////////// @@ -332,7 +332,7 @@ void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res, const int height) { + uint32_t sad_array[4], const int height) { int i; uint32x4_t r0, r1; const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], @@ -365,25 +365,25 @@ static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, r0 = vpaddq_u32(sum[0], sum[1]); r1 = vpaddq_u32(sum[2], sum[3]); - vst1q_u32(res, vpaddq_u32(r0, r1)); + vst1q_u32(sad_array, vpaddq_u32(r0, r1)); } void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16); + uint32_t sad_array[4]) { + sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16); } void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 
res, 32); + uint32_t sad_array[4]) { + sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 32); } void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 64); + uint32_t sad_array[4]) { + sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 64); } #else @@ -422,26 +422,26 @@ static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { + uint32_t sad_array[4]) { uint16x8_t sum[4]; sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 16, sum); - sad_512_pel_final_neon(sum, res); + sad_512_pel_final_neon(sum, sad_array); } void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { + uint32_t sad_array[4]) { uint16x8_t sum[4]; sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 32, sum); - sad_1024_pel_final_neon(sum, res); + sad_1024_pel_final_neon(sum, sad_array); } void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { + uint32_t sad_array[4]) { uint16x8_t sum[4]; sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 64, sum); - sad_2048_pel_final_neon(sum, res); + sad_2048_pel_final_neon(sum, sad_array); } #endif @@ -453,7 +453,7 @@ void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { + uint32_t sad_array[4]) { int i; uint32x4_t r0, r1; const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], @@ -497,12 +497,12 @@ void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, r0 = vpaddq_u32(sum[0], sum[1]); r1 = vpaddq_u32(sum[2], sum[3]); - vst1q_u32(res, vpaddq_u32(r0, r1)); + vst1q_u32(sad_array, vpaddq_u32(r0, r1)); } void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { + uint32_t sad_array[4]) { int i; uint32x4_t r0, r1, r2, r3; const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], @@ -551,14 +551,14 @@ void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, r3 = vpaddq_u32(sum[6], sum[7]); r0 = vpaddq_u32(r0, r1); r1 = vpaddq_u32(r2, r3); - vst1q_u32(res, vpaddq_u32(r0, r1)); + vst1q_u32(sad_array, vpaddq_u32(r0, r1)); } #else void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { + uint32_t sad_array[4]) { int i; const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], ref_array[3] }; @@ -599,12 +599,12 @@ void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, ref_loop[3] += ref_stride; } - sad_2048_pel_final_neon(sum, res); + sad_2048_pel_final_neon(sum, sad_array); } void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { + uint32_t sad_array[4]) { int i; const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], ref_array[3] }; @@ -646,7 +646,7 @@ void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, ref_loop[3] += ref_stride; } - sad_4096_pel_final_neon(sum, res); + sad_4096_pel_final_neon(sum, sad_array); } #endif diff --git 
a/vpx_dsp/mips/sad_msa.c b/vpx_dsp/mips/sad_msa.c index ab681ae9f8..e3e91c4330 100644 --- a/vpx_dsp/mips/sad_msa.c +++ b/vpx_dsp/mips/sad_msa.c @@ -1040,77 +1040,77 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, #define VPX_SAD_4xHEIGHTx3_MSA(height) \ void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ + uint32_t sads[3]) { \ sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ } #define VPX_SAD_8xHEIGHTx3_MSA(height) \ void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ + uint32_t sads[3]) { \ sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ } #define VPX_SAD_16xHEIGHTx3_MSA(height) \ void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ + uint32_t sads[3]) { \ sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ } #define VPX_SAD_4xHEIGHTx8_MSA(height) \ void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ + uint32_t sads[8]) { \ sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ } #define VPX_SAD_8xHEIGHTx8_MSA(height) \ void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ + uint32_t sads[8]) { \ sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ } #define VPX_SAD_16xHEIGHTx8_MSA(height) \ void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ + uint32_t sads[8]) { \ sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ } #define VPX_SAD_4xHEIGHTx4D_MSA(height) \ void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ } #define VPX_SAD_8xHEIGHTx4D_MSA(height) \ void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ } #define VPX_SAD_16xHEIGHTx4D_MSA(height) \ void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ } #define VPX_SAD_32xHEIGHTx4D_MSA(height) \ void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ } #define VPX_SAD_64xHEIGHTx4D_MSA(height) \ void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ 
sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ } diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c index 769322019e..46d513b686 100644 --- a/vpx_dsp/sad.c +++ b/vpx_dsp/sad.c @@ -45,23 +45,39 @@ static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride, return sad(src_ptr, src_stride, comp_pred, m, m, n); \ } -// depending on call sites, pass **ref_array to avoid & in subsequent call and -// de-dup with 4D below. +// Compare |src_ptr| to |k| adjacent blocks starting at |ref_ptr|. +// |k| == {3,8}. Used in vp8 for an exhaustive search. +// src: ref: +// 0 1 2 3 0 1 2 3 x x +// 4 5 6 7 6 7 8 9 x x +// 8 9 10 11 12 13 14 15 x x +// 12 13 14 15 18 19 20 21 x x +// +// x 1 2 3 4 x +// x 7 8 9 10 x +// x 13 14 15 16 x +// x 19 20 21 22 x +// +// x x 2 3 4 5 +// x x 8 9 10 11 +// x x 14 15 16 17 +// x x 20 21 22 23 +// #define sadMxNxK(m, n, k) \ void vpx_sad##m##x##n##x##k##_c(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride, \ - uint32_t *sad_array) { \ + uint32_t sad_array[k]) { \ int i; \ for (i = 0; i < k; ++i) \ sad_array[i] = \ - vpx_sad##m##x##n##_c(src_ptr, src_stride, &ref_ptr[i], ref_stride); \ + vpx_sad##m##x##n##_c(src_ptr, src_stride, ref_ptr + i, ref_stride); \ } -// This appears to be equivalent to the above when k == 4 and refs is const +// Compare |src_ptr| to 4 distinct references in |ref_array[]| #define sadMxNx4D(m, n) \ void vpx_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *const ref_array[], \ - int ref_stride, uint32_t *sad_array) { \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ int i; \ for (i = 0; i < 4; ++i) \ sad_array[i] = \ @@ -181,15 +197,15 @@ static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride, return highbd_sadb(src_ptr, src_stride, comp_pred, m, m, n); \ } -#define highbd_sadMxNx4D(m, n) \ - void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *const ref_array[], \ - int ref_stride, uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < 4; ++i) { \ - sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride, \ - ref_array[i], ref_stride); \ - } \ +#define highbd_sadMxNx4D(m, n) \ + void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride, \ + ref_array[i], ref_stride); \ + } \ } /* clang-format off */ diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 8b66722481..06a8febb29 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -877,80 +877,80 @@ () # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally # # Blocks of 3 -add_proto qw/void vpx_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]"; specialize qw/vpx_sad16x16x3 sse3 ssse3 msa mmi/; -add_proto qw/void vpx_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]"; specialize qw/vpx_sad16x8x3 sse3 ssse3 msa mmi/; -add_proto 
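
For clarity, a hand expansion of the two C macros above (16x16 chosen arbitrarily; the *_expanded names are illustrative and not part of the library). The xK form steps one base pointer across k horizontally adjacent candidates, while the x4d form scores four unrelated reference blocks; the old "equivalent when k == 4" remark held only when the four ref_array entries happened to be consecutive addresses.

#include <stdint.h>
#include "./vpx_dsp_rtcd.h" /* vpx_sad16x16_c() */

/* Expansion of sadMxNxK(16, 16, 3): k adjacent candidates, one base pointer. */
void sad16x16x3_expanded(const uint8_t *src_ptr, int src_stride,
                         const uint8_t *ref_ptr, int ref_stride,
                         uint32_t sad_array[3]) {
  int i;
  for (i = 0; i < 3; ++i)
    sad_array[i] =
        vpx_sad16x16_c(src_ptr, src_stride, ref_ptr + i, ref_stride);
}

/* Expansion of sadMxNx4D(16, 16): four independent references. */
void sad16x16x4d_expanded(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t sad_array[4]) {
  int i;
  for (i = 0; i < 4; ++i)
    sad_array[i] =
        vpx_sad16x16_c(src_ptr, src_stride, ref_array[i], ref_stride);
}
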
qw/void vpx_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]"; specialize qw/vpx_sad8x16x3 sse3 msa mmi/; -add_proto qw/void vpx_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]"; specialize qw/vpx_sad8x8x3 sse3 msa mmi/; -add_proto qw/void vpx_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]"; specialize qw/vpx_sad4x4x3 sse3 msa mmi/; # Blocks of 8 -add_proto qw/void vpx_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; specialize qw/vpx_sad32x32x8 avx2/; -add_proto qw/void vpx_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; specialize qw/vpx_sad16x16x8 sse4_1 msa mmi/; -add_proto qw/void vpx_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; specialize qw/vpx_sad16x8x8 sse4_1 msa mmi/; -add_proto qw/void vpx_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; specialize qw/vpx_sad8x16x8 sse4_1 msa mmi/; -add_proto qw/void vpx_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; specialize qw/vpx_sad8x8x8 sse4_1 msa mmi/; -add_proto qw/void vpx_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; specialize qw/vpx_sad4x4x8 sse4_1 msa mmi/; # # Multi-block SAD, comparing a reference to N independent blocks # -add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int 
src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad32x16x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad16x32x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad16x16x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad16x8x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad8x16x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad8x8x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad8x4x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad4x8x4d/, "const uint8_t 
*src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad4x8x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/; add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size"; @@ -1064,43 +1064,43 @@ () # # Multi-block SAD, comparing a reference to N independent blocks # - add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad64x64x4d sse2/; - add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad64x32x4d sse2/; - add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad32x64x4d sse2/; - add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad32x32x4d sse2/; - add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad32x16x4d sse2/; - add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad16x32x4d sse2/; - add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad16x16x4d sse2/; - add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize 
qw/vpx_highbd_sad16x8x4d sse2/; - add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad8x16x4d sse2/; - add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad8x8x4d sse2/; - add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad8x4x4d sse2/; - add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad4x8x4d sse2/; - add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad4x4x4d sse2/; # diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c index 9dd0666918..5f1f757e25 100644 --- a/vpx_dsp/x86/sad4d_avx2.c +++ b/vpx_dsp/x86/sad4d_avx2.c @@ -11,8 +11,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -static INLINE void calc_final_4(const __m256i *const sums /*[4]*/, - uint32_t *sad_array) { +static INLINE void calc_final_4(const __m256i sums[4], uint32_t sad_array[4]) { const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]); const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]); const __m256i t2 = _mm256_hadd_epi32(t0, t1); @@ -22,8 +21,8 @@ static INLINE void calc_final_4(const __m256i *const sums /*[4]*/, } void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[/*4*/], int ref_stride, - uint32_t *sad_array /*[4]*/) { + const uint8_t *const ref_array[4], int ref_stride, + uint32_t sad_array[4]) { int i; const uint8_t *refs[4]; __m256i sums[4]; @@ -71,7 +70,7 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, void vpx_sad32x32x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - uint32_t *sad_array) { + uint32_t sad_array[8]) { int i; __m256i sums[8]; @@ -127,8 +126,8 @@ void vpx_sad32x32x8_avx2(const uint8_t *src_ptr, int src_stride, } void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[/*4*/], int ref_stride, - uint32_t *sad_array /*[4]*/) { + const uint8_t *const ref_array[4], int ref_stride, + uint32_t sad_array[4]) { __m256i sums[4]; int i; const uint8_t *refs[4]; diff --git a/vpx_dsp/x86/sad4d_avx512.c b/vpx_dsp/x86/sad4d_avx512.c index 2fa9108718..cfd23fedd9 100644 --- a/vpx_dsp/x86/sad4d_avx512.c +++ 
b/vpx_dsp/x86/sad4d_avx512.c @@ -12,8 +12,8 @@ #include "vpx/vpx_integer.h" void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[/*4*/], - int ref_stride, uint32_t *res /*[4]*/) { + const uint8_t *const ref_array[4], int ref_stride, + uint32_t sad_array[4]) { __m512i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; __m512i sum_ref0, sum_ref1, sum_ref2, sum_ref3; __m512i sum_mlow, sum_mhigh; @@ -78,6 +78,6 @@ void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum256), _mm256_extractf128_si256(sum256, 1)); - _mm_storeu_si128((__m128i *)(res), sum128); + _mm_storeu_si128((__m128i *)(sad_array), sum128); } } From afd60bd07d41e5d20a0b11eeeb104846d9517c65 Mon Sep 17 00:00:00 2001 From: Johann Date: Mon, 28 Mar 2022 15:27:46 +0900 Subject: [PATCH 229/926] remove sad x3,x8 specializations These would compute the sum of absolute differences (sad) for a group of 3 or 8 references. This was used as part of an exhaustive search. vp8 only uses these functions in speed 0 and best quality. For vp9 this is only used with the --enable-non-greedy-mv experiment. This removes the 3- and 8-at-a-time optimized functions and uses the fall back code which will process 1 or 4 (vpx_sadMxNx4d) at a time. For configure --target=x86_64-linux-gcc --enable-realtime-only: libvpx.a before: 3002424 after: 2937622 delta: 64802 after 'strip libvpx.a' before: 2116998 after: 2073090 delta: 43908 Change-Id: I566d06e027c327b3bede68649dd551bba81a848e --- test/sad_test.cc | 68 ------ vp8/common/rtcd_defs.pl | 5 - vp8/encoder/mcomp.c | 276 +---------------------- vp8/encoder/mcomp.h | 8 +- vp8/encoder/onyx_if.c | 11 - vp8/encoder/onyx_int.h | 1 - vp8/encoder/rdopt.c | 4 +- vp9/encoder/vp9_encoder.c | 50 ++-- vp9/encoder/vp9_mcomp.c | 23 -- vp9/encoder/vp9_mcomp.h | 10 - vpx_dsp/mips/sad_mmi.c | 23 -- vpx_dsp/mips/sad_msa.c | 426 ----------------------------------- vpx_dsp/sad.c | 41 +--- vpx_dsp/variance.h | 3 - vpx_dsp/vpx_dsp.mk | 3 - vpx_dsp/vpx_dsp_rtcd_defs.pl | 38 ---- vpx_dsp/x86/sad4d_avx2.c | 57 ----- vpx_dsp/x86/sad_sse3.asm | 376 ------------------------------- vpx_dsp/x86/sad_sse4.asm | 361 ----------------------------- vpx_dsp/x86/sad_ssse3.asm | 372 ------------------------------ 20 files changed, 33 insertions(+), 2123 deletions(-) delete mode 100644 vpx_dsp/x86/sad_sse3.asm delete mode 100644 vpx_dsp/x86/sad_sse4.asm delete mode 100644 vpx_dsp/x86/sad_ssse3.asm diff --git a/test/sad_test.cc b/test/sad_test.cc index ee10a46389..560c5f3823 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -56,8 +56,6 @@ typedef void (*SadMxNx8Func)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -typedef TestParams SadMxNx8Param; - using libvpx_test::ACMRandom; namespace { @@ -266,30 +264,6 @@ class SADTestBase : public ::testing::TestWithParam { ParamType params_; }; -class SADx8Test : public SADTestBase { - public: - SADx8Test() : SADTestBase(GetParam()) {} - - protected: - void SADs(unsigned int *results) const { - const uint8_t *reference = GetReferenceFromOffset(0); - - ASM_REGISTER_STATE_CHECK(params_.func( - source_data_, source_stride_, reference, reference_stride_, results)); - } - - void CheckSADs() const { - uint32_t reference_sad; - DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[8]); - - SADs(exp_sad); - for (int offset = 0; offset < 8; ++offset) { - reference_sad = ReferenceSAD(offset); - EXPECT_EQ(reference_sad, exp_sad[offset]) << "offset " << 
offset; - } - } -}; - class SADx4Test : public SADTestBase { public: SADx4Test() : SADTestBase(GetParam()) {} @@ -564,13 +538,6 @@ TEST_P(SADx4Test, DISABLED_Speed) { reference_stride_ = tmp_stride; } -TEST_P(SADx8Test, Regular) { - FillRandomWH(source_data_, source_stride_, params_.width, params_.height); - FillRandomWH(GetReferenceFromOffset(0), reference_stride_, params_.width + 8, - params_.height); - CheckSADs(); -} - //------------------------------------------------------------------------------ // C functions const SadMxNParam c_tests[] = { @@ -747,24 +714,6 @@ const SadMxNx4Param x4d_c_tests[] = { }; INSTANTIATE_TEST_SUITE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests)); -// TODO(angiebird): implement the marked-down sad functions -const SadMxNx8Param x8_c_tests[] = { - // SadMxNx8Param(64, 64, &vpx_sad64x64x8_c), - // SadMxNx8Param(64, 32, &vpx_sad64x32x8_c), - // SadMxNx8Param(32, 64, &vpx_sad32x64x8_c), - SadMxNx8Param(32, 32, &vpx_sad32x32x8_c), - // SadMxNx8Param(32, 16, &vpx_sad32x16x8_c), - // SadMxNx8Param(16, 32, &vpx_sad16x32x8_c), - SadMxNx8Param(16, 16, &vpx_sad16x16x8_c), - SadMxNx8Param(16, 8, &vpx_sad16x8x8_c), - SadMxNx8Param(8, 16, &vpx_sad8x16x8_c), - SadMxNx8Param(8, 8, &vpx_sad8x8x8_c), - // SadMxNx8Param(8, 4, &vpx_sad8x4x8_c), - // SadMxNx8Param(4, 8, &vpx_sad4x8x8_c), - SadMxNx8Param(4, 4, &vpx_sad4x4x8_c), -}; -INSTANTIATE_TEST_SUITE_P(C, SADx8Test, ::testing::ValuesIn(x8_c_tests)); - //------------------------------------------------------------------------------ // ARM functions #if HAVE_NEON @@ -992,18 +941,6 @@ INSTANTIATE_TEST_SUITE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests)); // Only functions are x3, which do not have tests. #endif // HAVE_SSSE3 -#if HAVE_SSE4_1 -const SadMxNx8Param x8_sse4_1_tests[] = { - SadMxNx8Param(16, 16, &vpx_sad16x16x8_sse4_1), - SadMxNx8Param(16, 8, &vpx_sad16x8x8_sse4_1), - SadMxNx8Param(8, 16, &vpx_sad8x16x8_sse4_1), - SadMxNx8Param(8, 8, &vpx_sad8x8x8_sse4_1), - SadMxNx8Param(4, 4, &vpx_sad4x4x8_sse4_1), -}; -INSTANTIATE_TEST_SUITE_P(SSE4_1, SADx8Test, - ::testing::ValuesIn(x8_sse4_1_tests)); -#endif // HAVE_SSE4_1 - #if HAVE_AVX2 const SadMxNParam avx2_tests[] = { SadMxNParam(64, 64, &vpx_sad64x64_avx2), @@ -1029,11 +966,6 @@ const SadMxNx4Param x4d_avx2_tests[] = { }; INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests)); -const SadMxNx8Param x8_avx2_tests[] = { - // SadMxNx8Param(64, 64, &vpx_sad64x64x8_c), - SadMxNx8Param(32, 32, &vpx_sad32x32x8_avx2), -}; -INSTANTIATE_TEST_SUITE_P(AVX2, SADx8Test, ::testing::ValuesIn(x8_avx2_tests)); #endif // HAVE_AVX2 #if HAVE_AVX512 diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 32601b4eba..c7911032f6 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -216,11 +216,6 @@ () # # Motion search # -add_proto qw/int vp8_full_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"; -specialize qw/vp8_full_search_sad sse3 sse4_1/; -$vp8_full_search_sad_sse3=vp8_full_search_sadx3; -$vp8_full_search_sad_sse4_1=vp8_full_search_sadx8; - add_proto qw/int vp8_refining_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"; specialize qw/vp8_refining_search_sad sse2 msa/; $vp8_refining_search_sad_sse2=vp8_refining_search_sadx4; diff 
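
To make the fallback mentioned in the commit message concrete, a minimal sketch (a hypothetical helper, not code from this patch) of scanning one row of an exhaustive search with the surviving primitives: the 4D SAD takes four independent reference pointers, and nothing prevents those from being four adjacent positions, which is the batching vp9's mesh search keeps; vp8's full search, in the next hunk, simply calls the scalar SAD once per candidate. Block size 16x16 is arbitrary here and motion-vector cost terms are omitted.

#include <limits.h>
#include <stdint.h>
#include "./vpx_dsp_rtcd.h" /* vpx_sad16x16(), vpx_sad16x16x4d() */

/* Hypothetical: best raw SAD over num_candidates positions along one row. */
static unsigned int scan_row_x4(const uint8_t *src, int src_stride,
                                const uint8_t *row, int ref_stride,
                                int num_candidates) {
  unsigned int best = UINT_MAX;
  int c = 0, i;
  while (c + 4 <= num_candidates) {
    const uint8_t *refs[4];
    uint32_t sads[4];
    for (i = 0; i < 4; ++i) refs[i] = row + c + i;
    /* One call scores four adjacent candidates. */
    vpx_sad16x16x4d(src, src_stride, refs, ref_stride, sads);
    for (i = 0; i < 4; ++i)
      if (sads[i] < best) best = sads[i];
    c += 4;
  }
  /* Scalar tail -- the path vp8 now takes for every candidate. */
  for (; c < num_candidates; ++c) {
    const unsigned int sad =
        vpx_sad16x16(src, src_stride, row + c, ref_stride);
    if (sad < best) best = sad;
  }
  return best;
}
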
--git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index 4ab6c7b3d0..769c2f5589 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -1280,10 +1280,10 @@ int vp8_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, } #endif // HAVE_SSE2 || HAVE_MSA -int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, - int sad_per_bit, int distance, - vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], - int_mv *center_mv) { +int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, + int sad_per_bit, int distance, + vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], + int_mv *center_mv) { unsigned char *what = (*(b->base_src) + b->src); int what_stride = b->src_stride; unsigned char *in_what; @@ -1323,217 +1323,6 @@ int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, best_mv->as_mv.row = ref_row; best_mv->as_mv.col = ref_col; - /* Baseline value at the centre */ - bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride) + - mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit); - - /* Apply further limits to prevent us looking using vectors that - * stretch beyiond the UMV border - */ - if (col_min < x->mv_col_min) col_min = x->mv_col_min; - - if (col_max > x->mv_col_max) col_max = x->mv_col_max; - - if (row_min < x->mv_row_min) row_min = x->mv_row_min; - - if (row_max > x->mv_row_max) row_max = x->mv_row_max; - - for (r = row_min; r < row_max; ++r) { - this_mv.as_mv.row = r; - check_here = r * mv_stride + in_what + col_min; - - for (c = col_min; c < col_max; ++c) { - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride); - - this_mv.as_mv.col = c; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); - - if (thissad < bestsad) { - bestsad = thissad; - best_mv->as_mv.row = r; - best_mv->as_mv.col = c; - bestaddress = check_here; - } - - check_here++; - } - } - - this_mv.as_mv.row = best_mv->as_mv.row << 3; - this_mv.as_mv.col = best_mv->as_mv.col << 3; - - return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad) + - mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); -} - -#if HAVE_SSSE3 -int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, - int sad_per_bit, int distance, - vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], - int_mv *center_mv) { - unsigned char *what = (*(b->base_src) + b->src); - int what_stride = b->src_stride; - unsigned char *in_what; - int pre_stride = x->e_mbd.pre.y_stride; - unsigned char *base_pre = x->e_mbd.pre.y_buffer; - int in_what_stride = pre_stride; - int mv_stride = pre_stride; - unsigned char *bestaddress; - int_mv *best_mv = &d->bmi.mv; - int_mv this_mv; - unsigned int bestsad; - unsigned int thissad; - int r, c; - - unsigned char *check_here; - - int ref_row = ref_mv->as_mv.row; - int ref_col = ref_mv->as_mv.col; - - int row_min = ref_row - distance; - int row_max = ref_row + distance; - int col_min = ref_col - distance; - int col_max = ref_col + distance; - - unsigned int sad_array[3]; - - int *mvsadcost[2]; - int_mv fcenter_mv; - - mvsadcost[0] = x->mvsadcost[0]; - mvsadcost[1] = x->mvsadcost[1]; - fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; - fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; - - /* Work out the mid point for the search */ - in_what = base_pre + d->offset; - bestaddress = in_what + (ref_row * pre_stride) + ref_col; - - best_mv->as_mv.row = ref_row; - best_mv->as_mv.col = ref_col; - - /* Baseline value at the centre */ - bestsad = fn_ptr->sdf(what, 
what_stride, bestaddress, in_what_stride) + - mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit); - - /* Apply further limits to prevent us looking using vectors that stretch - * beyond the UMV border - */ - if (col_min < x->mv_col_min) col_min = x->mv_col_min; - - if (col_max > x->mv_col_max) col_max = x->mv_col_max; - - if (row_min < x->mv_row_min) row_min = x->mv_row_min; - - if (row_max > x->mv_row_max) row_max = x->mv_row_max; - - for (r = row_min; r < row_max; ++r) { - this_mv.as_mv.row = r; - check_here = r * mv_stride + in_what + col_min; - c = col_min; - - while ((c + 2) < col_max) { - int i; - - fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array); - - for (i = 0; i < 3; ++i) { - thissad = sad_array[i]; - - if (thissad < bestsad) { - this_mv.as_mv.col = c; - thissad += - mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); - - if (thissad < bestsad) { - bestsad = thissad; - best_mv->as_mv.row = r; - best_mv->as_mv.col = c; - bestaddress = check_here; - } - } - - check_here++; - c++; - } - } - - while (c < col_max) { - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride); - - if (thissad < bestsad) { - this_mv.as_mv.col = c; - thissad += - mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); - - if (thissad < bestsad) { - bestsad = thissad; - best_mv->as_mv.row = r; - best_mv->as_mv.col = c; - bestaddress = check_here; - } - } - - check_here++; - c++; - } - } - - this_mv.as_mv.row = best_mv->as_mv.row << 3; - this_mv.as_mv.col = best_mv->as_mv.col << 3; - - return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad) + - mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); -} -#endif // HAVE_SSSE3 - -#if HAVE_SSE4_1 -int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, - int sad_per_bit, int distance, - vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], - int_mv *center_mv) { - unsigned char *what = (*(b->base_src) + b->src); - int what_stride = b->src_stride; - int pre_stride = x->e_mbd.pre.y_stride; - unsigned char *base_pre = x->e_mbd.pre.y_buffer; - unsigned char *in_what; - int in_what_stride = pre_stride; - int mv_stride = pre_stride; - unsigned char *bestaddress; - int_mv *best_mv = &d->bmi.mv; - int_mv this_mv; - unsigned int bestsad; - unsigned int thissad; - int r, c; - - unsigned char *check_here; - - int ref_row = ref_mv->as_mv.row; - int ref_col = ref_mv->as_mv.col; - - int row_min = ref_row - distance; - int row_max = ref_row + distance; - int col_min = ref_col - distance; - int col_max = ref_col + distance; - - DECLARE_ALIGNED(16, unsigned int, sad_array8[8]); - unsigned int sad_array[3]; - - int *mvsadcost[2]; - int_mv fcenter_mv; - - mvsadcost[0] = x->mvsadcost[0]; - mvsadcost[1] = x->mvsadcost[1]; - fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; - fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; - - /* Work out the mid point for the search */ - in_what = base_pre + d->offset; - bestaddress = in_what + (ref_row * pre_stride) + ref_col; - - best_mv->as_mv.row = ref_row; - best_mv->as_mv.col = ref_col; - /* Baseline value at the centre */ bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride) + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit); @@ -1552,61 +1341,8 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, for (r = row_min; r < row_max; ++r) { this_mv.as_mv.row = r; check_here = r * mv_stride + in_what + col_min; - c = col_min; - - while ((c + 7) < col_max) { - int i; - - fn_ptr->sdx8f(what, 
what_stride, check_here, in_what_stride, sad_array8); - - for (i = 0; i < 8; ++i) { - thissad = sad_array8[i]; - - if (thissad < bestsad) { - this_mv.as_mv.col = c; - thissad += - mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); - - if (thissad < bestsad) { - bestsad = thissad; - best_mv->as_mv.row = r; - best_mv->as_mv.col = c; - bestaddress = check_here; - } - } - - check_here++; - c++; - } - } - - while ((c + 2) < col_max) { - int i; - - fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array); - - for (i = 0; i < 3; ++i) { - thissad = sad_array[i]; - if (thissad < bestsad) { - this_mv.as_mv.col = c; - thissad += - mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); - - if (thissad < bestsad) { - bestsad = thissad; - best_mv->as_mv.row = r; - best_mv->as_mv.col = c; - bestaddress = check_here; - } - } - - check_here++; - c++; - } - } - - while (c < col_max) { + for (c = col_min; c < col_max; ++c) { thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride); if (thissad < bestsad) { @@ -1623,7 +1359,6 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, } check_here++; - c++; } } @@ -1633,7 +1368,6 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad) + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); } -#endif // HAVE_SSE4_1 int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int error_per_bit, diff --git a/vp8/encoder/mcomp.h b/vp8/encoder/mcomp.h index 57c18f523f..1ee6fe5dd6 100644 --- a/vp8/encoder/mcomp.h +++ b/vp8/encoder/mcomp.h @@ -50,10 +50,10 @@ fractional_mv_step_fp vp8_find_best_sub_pixel_step; fractional_mv_step_fp vp8_find_best_half_pixel_step; fractional_mv_step_fp vp8_skip_fractional_mv_step; -typedef int (*vp8_full_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d, - int_mv *ref_mv, int sad_per_bit, - int distance, vp8_variance_fn_ptr_t *fn_ptr, - int *mvcost[2], int_mv *center_mv); +int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, + int sad_per_bit, int distance, + vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], + int_mv *center_mv); typedef int (*vp8_refining_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int sad_per_bit, diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index f09177c7f5..ffb3867dd1 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -2012,36 +2012,26 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16; cpi->fn_ptr[BLOCK_16X16].vf = vpx_variance16x16; cpi->fn_ptr[BLOCK_16X16].svf = vpx_sub_pixel_variance16x16; - cpi->fn_ptr[BLOCK_16X16].sdx3f = vpx_sad16x16x3; - cpi->fn_ptr[BLOCK_16X16].sdx8f = vpx_sad16x16x8; cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d; cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8; cpi->fn_ptr[BLOCK_16X8].vf = vpx_variance16x8; cpi->fn_ptr[BLOCK_16X8].svf = vpx_sub_pixel_variance16x8; - cpi->fn_ptr[BLOCK_16X8].sdx3f = vpx_sad16x8x3; - cpi->fn_ptr[BLOCK_16X8].sdx8f = vpx_sad16x8x8; cpi->fn_ptr[BLOCK_16X8].sdx4df = vpx_sad16x8x4d; cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16; cpi->fn_ptr[BLOCK_8X16].vf = vpx_variance8x16; cpi->fn_ptr[BLOCK_8X16].svf = vpx_sub_pixel_variance8x16; - cpi->fn_ptr[BLOCK_8X16].sdx3f = vpx_sad8x16x3; - cpi->fn_ptr[BLOCK_8X16].sdx8f = vpx_sad8x16x8; cpi->fn_ptr[BLOCK_8X16].sdx4df = vpx_sad8x16x4d; cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8; cpi->fn_ptr[BLOCK_8X8].vf = 
vpx_variance8x8; cpi->fn_ptr[BLOCK_8X8].svf = vpx_sub_pixel_variance8x8; - cpi->fn_ptr[BLOCK_8X8].sdx3f = vpx_sad8x8x3; - cpi->fn_ptr[BLOCK_8X8].sdx8f = vpx_sad8x8x8; cpi->fn_ptr[BLOCK_8X8].sdx4df = vpx_sad8x8x4d; cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4; cpi->fn_ptr[BLOCK_4X4].vf = vpx_variance4x4; cpi->fn_ptr[BLOCK_4X4].svf = vpx_sub_pixel_variance4x4; - cpi->fn_ptr[BLOCK_4X4].sdx3f = vpx_sad4x4x3; - cpi->fn_ptr[BLOCK_4X4].sdx8f = vpx_sad4x4x8; cpi->fn_ptr[BLOCK_4X4].sdx4df = vpx_sad4x4x4d; #if VPX_ARCH_X86 || VPX_ARCH_X86_64 @@ -2052,7 +2042,6 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->fn_ptr[BLOCK_4X4].copymem = vp8_copy32xn; #endif - cpi->full_search_sad = vp8_full_search_sad; cpi->diamond_search_sad = vp8_diamond_search_sad; cpi->refining_search_sad = vp8_refining_search_sad; diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index 7f8298e44a..424f51b180 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -549,7 +549,6 @@ typedef struct VP8_COMP { unsigned char *partition_d_end[MAX_PARTITIONS]; fractional_mv_step_fp *find_fractional_mv_step; - vp8_full_search_fn_t full_search_sad; vp8_refining_search_fn_t refining_search_sad; vp8_diamond_search_fn_t diamond_search_sad; vp8_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SEGMENTS]; diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index 79a858e437..5821fc7346 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -1097,8 +1097,8 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, vp8_clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); - thissme = cpi->full_search_sad(x, c, e, &mvp_full, sadpb, 16, - v_fn_ptr, x->mvcost, bsi->ref_mv); + thissme = vp8_full_search_sad(x, c, e, &mvp_full, sadpb, 16, + v_fn_ptr, x->mvcost, bsi->ref_mv); if (thissme < bestsme) { bestsme = thissme; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 4609a6bb26..df73190426 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1569,15 +1569,13 @@ void vp9_set_rc_buffer_sizes(VP9_COMP *cpi) { } #if CONFIG_VP9_HIGHBITDEPTH -// TODO(angiebird): make sdx8f available for highbitdepth if needed #define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \ cpi->fn_ptr[BT].sdf = SDF; \ cpi->fn_ptr[BT].sdaf = SDAF; \ cpi->fn_ptr[BT].vf = VF; \ cpi->fn_ptr[BT].svf = SVF; \ cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx4df = SDX4DF; \ - cpi->fn_ptr[BT].sdx8f = NULL; + cpi->fn_ptr[BT].sdx4df = SDX4DF; #define MAKE_BFP_SAD_WRAPPER(fnname) \ static unsigned int fnname##_bits8(const uint8_t *src_ptr, \ @@ -2561,67 +2559,61 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, CHECK_MEM_ERROR(cm, cpi->source_diff_var, vpx_calloc(cm->MBs, sizeof(diff))); cpi->source_var_thresh = 0; cpi->frames_till_next_var_check = 0; -#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, SDX8F) \ - cpi->fn_ptr[BT].sdf = SDF; \ - cpi->fn_ptr[BT].sdaf = SDAF; \ - cpi->fn_ptr[BT].vf = VF; \ - cpi->fn_ptr[BT].svf = SVF; \ - cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx4df = SDX4DF; \ - cpi->fn_ptr[BT].sdx8f = SDX8F; +#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ + cpi->fn_ptr[BT].sdx4df = SDX4DF; - // TODO(angiebird): make sdx8f available for every block size BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg, vpx_variance32x16, vpx_sub_pixel_variance32x16, 
vpx_sub_pixel_avg_variance32x16, - vpx_sad32x16x4d, NULL) + vpx_sad32x16x4d) BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg, vpx_variance16x32, vpx_sub_pixel_variance16x32, vpx_sub_pixel_avg_variance16x32, - vpx_sad16x32x4d, NULL) + vpx_sad16x32x4d) BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg, vpx_variance64x32, vpx_sub_pixel_variance64x32, vpx_sub_pixel_avg_variance64x32, - vpx_sad64x32x4d, NULL) + vpx_sad64x32x4d) BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg, vpx_variance32x64, vpx_sub_pixel_variance32x64, vpx_sub_pixel_avg_variance32x64, - vpx_sad32x64x4d, NULL) + vpx_sad32x64x4d) BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg, vpx_variance32x32, vpx_sub_pixel_variance32x32, vpx_sub_pixel_avg_variance32x32, - vpx_sad32x32x4d, vpx_sad32x32x8) + vpx_sad32x32x4d) BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg, vpx_variance64x64, vpx_sub_pixel_variance64x64, vpx_sub_pixel_avg_variance64x64, - vpx_sad64x64x4d, NULL) + vpx_sad64x64x4d) BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg, vpx_variance16x16, vpx_sub_pixel_variance16x16, vpx_sub_pixel_avg_variance16x16, - vpx_sad16x16x4d, vpx_sad16x16x8) + vpx_sad16x16x4d) BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg, vpx_variance16x8, vpx_sub_pixel_variance16x8, vpx_sub_pixel_avg_variance16x8, - vpx_sad16x8x4d, vpx_sad16x8x8) + vpx_sad16x8x4d) BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg, vpx_variance8x16, vpx_sub_pixel_variance8x16, vpx_sub_pixel_avg_variance8x16, - vpx_sad8x16x4d, vpx_sad8x16x8) + vpx_sad8x16x4d) BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg, vpx_variance8x8, - vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d, - vpx_sad8x8x8) + vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d) BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg, vpx_variance8x4, - vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d, - NULL) + vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d) BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg, vpx_variance4x8, - vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d, - NULL) + vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d) BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg, vpx_variance4x4, - vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d, - vpx_sad4x4x8) + vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d) #if CONFIG_VP9_HIGHBITDEPTH highbd_set_var_fns(cpi); diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index cd67064203..1f08aa5de7 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -1796,29 +1796,6 @@ static int64_t exhaustive_mesh_search_single_step( end_col = VPXMIN(center_mv->col + range, mv_limits->col_max); for (r = start_row; r <= end_row; r += 1) { c = start_col; - // sdx8f may not be available some block size - if (fn_ptr->sdx8f) { - while (c + 7 <= end_col) { - unsigned int sads[8]; - const MV mv = { r, c }; - const uint8_t *buf = get_buf_from_mv(pre, &mv); - fn_ptr->sdx8f(src->buf, src->stride, buf, pre->stride, sads); - - for (i = 0; i < 8; ++i) { - int64_t sad = (int64_t)sads[i] << LOG2_PRECISION; - if (sad < best_sad) { - const MV mv = { r, c + i }; - sad += lambda * - vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - } - c += 8; - } - } while (c + 3 <= end_col) { unsigned int sads[4]; const uint8_t *addrs[4]; diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index 0c4d8f23c6..bdaf2ce77d 100644 --- a/vp9/encoder/vp9_mcomp.h 
+++ b/vp9/encoder/vp9_mcomp.h @@ -93,16 +93,6 @@ extern fractional_mv_step_fp vp9_skip_sub_pixel_tree; extern fractional_mv_step_fp vp9_return_max_sub_pixel_mv; extern fractional_mv_step_fp vp9_return_min_sub_pixel_mv; -typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x, const MV *ref_mv, - int sad_per_bit, int distance, - const vp9_variance_fn_ptr_t *fn_ptr, - const MV *center_mv, MV *best_mv); - -typedef int (*vp9_refining_search_fn_t)(const MACROBLOCK *x, MV *ref_mv, - int sad_per_bit, int distance, - const vp9_variance_fn_ptr_t *fn_ptr, - const MV *center_mv); - typedef int (*vp9_diamond_search_fn_t)( const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, diff --git a/vpx_dsp/mips/sad_mmi.c b/vpx_dsp/mips/sad_mmi.c index eaca4773f2..7f5882bca3 100644 --- a/vpx_dsp/mips/sad_mmi.c +++ b/vpx_dsp/mips/sad_mmi.c @@ -334,19 +334,6 @@ "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" #endif /* _MIPS_SIM == _ABIO32 */ -// depending on call sites, pass **ref_array to avoid & in subsequent call and -// de-dup with 4D below. -#define sadMxNxK_mmi(m, n, k) \ - void vpx_sad##m##x##n##x##k##_mmi(const uint8_t *src, int src_stride, \ - const uint8_t *ref_array, int ref_stride, \ - uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < (k); ++i) \ - sad_array[i] = \ - vpx_sad##m##x##n##_mmi(src, src_stride, &ref_array[i], ref_stride); \ - } - -// This appears to be equivalent to the above when k == 4 and refs is const #define sadMxNx4D_mmi(m, n) \ void vpx_sad##m##x##n##x4d_mmi(const uint8_t *src, int src_stride, \ const uint8_t *const ref_array[], \ @@ -583,10 +570,6 @@ static inline unsigned int vpx_sad16x(const uint8_t *src, int src_stride, vpx_sad16xN(32); vpx_sad16xN(16); vpx_sad16xN(8); -sadMxNxK_mmi(16, 16, 3); -sadMxNxK_mmi(16, 16, 8); -sadMxNxK_mmi(16, 8, 3); -sadMxNxK_mmi(16, 8, 8); sadMxNx4D_mmi(16, 32); sadMxNx4D_mmi(16, 16); sadMxNx4D_mmi(16, 8); @@ -681,10 +664,6 @@ static inline unsigned int vpx_sad8x(const uint8_t *src, int src_stride, vpx_sad8xN(16); vpx_sad8xN(8); vpx_sad8xN(4); -sadMxNxK_mmi(8, 16, 3); -sadMxNxK_mmi(8, 16, 8); -sadMxNxK_mmi(8, 8, 3); -sadMxNxK_mmi(8, 8, 8); sadMxNx4D_mmi(8, 16); sadMxNx4D_mmi(8, 8); sadMxNx4D_mmi(8, 4); @@ -777,8 +756,6 @@ static inline unsigned int vpx_sad4x(const uint8_t *src, int src_stride, vpx_sad4xN(8); vpx_sad4xN(4); -sadMxNxK_mmi(4, 4, 3); -sadMxNxK_mmi(4, 4, 8); sadMxNx4D_mmi(4, 8); sadMxNx4D_mmi(4, 4); diff --git a/vpx_dsp/mips/sad_msa.c b/vpx_dsp/mips/sad_msa.c index e3e91c4330..b0f8ff1fd9 100644 --- a/vpx_dsp/mips/sad_msa.c +++ b/vpx_dsp/mips/sad_msa.c @@ -159,380 +159,6 @@ static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride, return sad; } -static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - uint32_t src0, src1, src2, src3; - v16u8 src = { 0 }; - v16u8 ref = { 0 }; - v16u8 ref0, ref1, ref2, ref3, diff; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LW4(src_ptr, src_stride, src0, src1, src2, src3); - src_ptr += (4 * src_stride); - INSERT_W4_UB(src0, src1, src2, src3, src); - - LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); - ref_ptr += (4 * ref_stride); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad0 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, 
ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); -} - -static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref00, ref11, ref22, ref33; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33); - ref += (4 * ref_stride); - PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1, - ref0, ref1); - sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); -} - -static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src, ref, ref0, ref1, diff; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - - for (ht_cnt = (height >> 1); ht_cnt--;) { - src = LD_UB(src_ptr); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - - diff = __msa_asub_u_b(src, ref0); - sad0 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - - src = LD_UB(src_ptr); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - - diff = __msa_asub_u_b(src, ref0); - sad0 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); -} - -static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - uint32_t src0, src1, src2, src3; - v16u8 ref0, ref1, ref2, ref3, diff; - v16u8 src = { 0 }; - v16u8 ref = { 0 }; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - v8u16 sad4 = { 0 }; - v8u16 sad5 = { 0 }; - v8u16 sad6 = { 0 }; - v8u16 sad7 = { 0 }; - - for (ht_cnt 
= (height >> 2); ht_cnt--;) { - LW4(src_ptr, src_stride, src0, src1, src2, src3); - INSERT_W4_UB(src0, src1, src2, src3, src); - src_ptr += (4 * src_stride); - LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); - ref_ptr += (4 * ref_stride); - - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad0 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad3 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad4 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad5 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad6 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad7 += __msa_hadd_u_h(diff, diff); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); - sad_array[4] = HADD_UH_U32(sad4); - sad_array[5] = HADD_UH_U32(sad5); - sad_array[6] = HADD_UH_U32(sad6); - sad_array[7] = HADD_UH_U32(sad7); -} - -static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref00, ref11, ref22, ref33; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - v8u16 sad4 = { 0 }; - v8u16 sad5 = { 0 }; - v8u16 sad6 = { 0 }; - v8u16 sad7 = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33); - ref += (4 * ref_stride); - PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1, - ref0, ref1); - sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - 
SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad4 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad5 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad6 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad7 += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); - sad_array[4] = HADD_UH_U32(sad4); - sad_array[5] = HADD_UH_U32(sad5); - sad_array[6] = HADD_UH_U32(sad6); - sad_array[7] = HADD_UH_U32(sad7); -} - -static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src, ref0, ref1, ref; - v16u8 diff; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - v8u16 sad4 = { 0 }; - v8u16 sad5 = { 0 }; - v8u16 sad6 = { 0 }; - v8u16 sad7 = { 0 }; - - for (ht_cnt = (height >> 1); ht_cnt--;) { - src = LD_UB(src_ptr); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - - diff = __msa_asub_u_b(src, ref0); - sad0 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3); - diff = __msa_asub_u_b(src, ref); - sad3 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4); - diff = __msa_asub_u_b(src, ref); - sad4 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5); - diff = __msa_asub_u_b(src, ref); - sad5 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6); - diff = __msa_asub_u_b(src, ref); - sad6 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7); - diff = __msa_asub_u_b(src, ref); - sad7 += __msa_hadd_u_h(diff, diff); - - src = LD_UB(src_ptr); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - - diff = __msa_asub_u_b(src, ref0); - sad0 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3); - diff = __msa_asub_u_b(src, ref); - sad3 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4); - 
diff = __msa_asub_u_b(src, ref); - sad4 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5); - diff = __msa_asub_u_b(src, ref); - sad5 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6); - diff = __msa_asub_u_b(src, ref); - sad6 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7); - diff = __msa_asub_u_b(src, ref); - sad7 += __msa_hadd_u_h(diff, diff); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); - sad_array[4] = HADD_UH_U32(sad4); - sad_array[5] = HADD_UH_U32(sad5); - sad_array[6] = HADD_UH_U32(sad6); - sad_array[7] = HADD_UH_U32(sad7); -} - static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *const aref_ptr[], int32_t ref_stride, int32_t height, @@ -1037,48 +663,6 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, return sad_64width_msa(src, src_stride, ref, ref_stride, height); \ } -#define VPX_SAD_4xHEIGHTx3_MSA(height) \ - void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t sads[3]) { \ - sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define VPX_SAD_8xHEIGHTx3_MSA(height) \ - void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t sads[3]) { \ - sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define VPX_SAD_16xHEIGHTx3_MSA(height) \ - void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t sads[3]) { \ - sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define VPX_SAD_4xHEIGHTx8_MSA(height) \ - void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t sads[8]) { \ - sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define VPX_SAD_8xHEIGHTx8_MSA(height) \ - void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t sads[8]) { \ - sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define VPX_SAD_16xHEIGHTx8_MSA(height) \ - void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t sads[8]) { \ - sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - #define VPX_SAD_4xHEIGHTx4D_MSA(height) \ void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *const refs[4], \ @@ -1186,29 +770,21 @@ VPX_AVGSAD_16xHEIGHT_MSA(32); // 16x16 VPX_SAD_16xHEIGHT_MSA(16); -VPX_SAD_16xHEIGHTx3_MSA(16); -VPX_SAD_16xHEIGHTx8_MSA(16); VPX_SAD_16xHEIGHTx4D_MSA(16); VPX_AVGSAD_16xHEIGHT_MSA(16); // 16x8 VPX_SAD_16xHEIGHT_MSA(8); -VPX_SAD_16xHEIGHTx3_MSA(8); -VPX_SAD_16xHEIGHTx8_MSA(8); VPX_SAD_16xHEIGHTx4D_MSA(8); VPX_AVGSAD_16xHEIGHT_MSA(8); // 8x16 VPX_SAD_8xHEIGHT_MSA(16); -VPX_SAD_8xHEIGHTx3_MSA(16); -VPX_SAD_8xHEIGHTx8_MSA(16); VPX_SAD_8xHEIGHTx4D_MSA(16); VPX_AVGSAD_8xHEIGHT_MSA(16); // 8x8 VPX_SAD_8xHEIGHT_MSA(8); -VPX_SAD_8xHEIGHTx3_MSA(8); -VPX_SAD_8xHEIGHTx8_MSA(8); VPX_SAD_8xHEIGHTx4D_MSA(8); VPX_AVGSAD_8xHEIGHT_MSA(8); @@ -1224,7 +800,5 @@ VPX_AVGSAD_4xHEIGHT_MSA(8); // 4x4 VPX_SAD_4xHEIGHT_MSA(4); 
-VPX_SAD_4xHEIGHTx3_MSA(4); -VPX_SAD_4xHEIGHTx8_MSA(4); VPX_SAD_4xHEIGHTx4D_MSA(4); VPX_AVGSAD_4xHEIGHT_MSA(4); diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c index 46d513b686..b47c43430d 100644 --- a/vpx_dsp/sad.c +++ b/vpx_dsp/sad.c @@ -45,35 +45,7 @@ static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride, return sad(src_ptr, src_stride, comp_pred, m, m, n); \ } -// Compare |src_ptr| to |k| adjacent blocks starting at |ref_ptr|. -// |k| == {3,8}. Used in vp8 for an exhaustive search. -// src: ref: -// 0 1 2 3 0 1 2 3 x x -// 4 5 6 7 6 7 8 9 x x -// 8 9 10 11 12 13 14 15 x x -// 12 13 14 15 18 19 20 21 x x -// -// x 1 2 3 4 x -// x 7 8 9 10 x -// x 13 14 15 16 x -// x 19 20 21 22 x -// -// x x 2 3 4 5 -// x x 8 9 10 11 -// x x 14 15 16 17 -// x x 20 21 22 23 -// -#define sadMxNxK(m, n, k) \ - void vpx_sad##m##x##n##x##k##_c(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - uint32_t sad_array[k]) { \ - int i; \ - for (i = 0; i < k; ++i) \ - sad_array[i] = \ - vpx_sad##m##x##n##_c(src_ptr, src_stride, ref_ptr + i, ref_stride); \ - } - -// Compare |src_ptr| to 4 distinct references in |ref_array[]| +// Compare |src_ptr| to 4 distinct references in |ref_array[4]| #define sadMxNx4D(m, n) \ void vpx_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \ const uint8_t *const ref_array[4], \ @@ -99,7 +71,6 @@ sadMxNx4D(32, 64) // 32x32 sadMxN(32, 32) -sadMxNxK(32, 32, 8) sadMxNx4D(32, 32) // 32x16 @@ -112,26 +83,18 @@ sadMxNx4D(16, 32) // 16x16 sadMxN(16, 16) -sadMxNxK(16, 16, 3) -sadMxNxK(16, 16, 8) sadMxNx4D(16, 16) // 16x8 sadMxN(16, 8) -sadMxNxK(16, 8, 3) -sadMxNxK(16, 8, 8) sadMxNx4D(16, 8) // 8x16 sadMxN(8, 16) -sadMxNxK(8, 16, 3) -sadMxNxK(8, 16, 8) sadMxNx4D(8, 16) // 8x8 sadMxN(8, 8) -sadMxNxK(8, 8, 3) -sadMxNxK(8, 8, 8) sadMxNx4D(8, 8) // 8x4 @@ -144,8 +107,6 @@ sadMxNx4D(4, 8) // 4x4 sadMxN(4, 4) -sadMxNxK(4, 4, 3) -sadMxNxK(4, 4, 8) sadMxNx4D(4, 4) /* clang-format on */ diff --git a/vpx_dsp/variance.h b/vpx_dsp/variance.h index f8b44f03d1..755cb907d2 100644 --- a/vpx_dsp/variance.h +++ b/vpx_dsp/variance.h @@ -59,8 +59,6 @@ typedef struct variance_vtable { vpx_sad_fn_t sdf; vpx_variance_fn_t vf; vpx_subpixvariance_fn_t svf; - vpx_sad_multi_fn_t sdx3f; - vpx_sad_multi_fn_t sdx8f; vpx_sad_multi_d_fn_t sdx4df; #if VPX_ARCH_X86 || VPX_ARCH_X86_64 vp8_copy32xn_fn_t copymem; @@ -76,7 +74,6 @@ typedef struct vp9_variance_vtable { vpx_subpixvariance_fn_t svf; vpx_subp_avg_variance_fn_t svaf; vpx_sad_multi_d_fn_t sdx4df; - vpx_sad_multi_fn_t sdx8f; } vp9_variance_fn_ptr_t; #endif // CONFIG_VP9 diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index a880e1d285..a1e511cce0 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -358,9 +358,6 @@ DSP_SRCS-$(HAVE_MSA) += mips/subtract_msa.c DSP_SRCS-$(HAVE_MMI) += mips/sad_mmi.c DSP_SRCS-$(HAVE_MMI) += mips/subtract_mmi.c -DSP_SRCS-$(HAVE_SSE3) += x86/sad_sse3.asm -DSP_SRCS-$(HAVE_SSSE3) += x86/sad_ssse3.asm -DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c DSP_SRCS-$(HAVE_AVX512) += x86/sad4d_avx512.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 06a8febb29..91ff884a62 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -873,44 +873,6 @@ () add_proto qw/unsigned int vpx_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vpx_sad4x4_avg neon msa sse2 mmi/; -# -# 
Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally -# -# Blocks of 3 -add_proto qw/void vpx_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]"; -specialize qw/vpx_sad16x16x3 sse3 ssse3 msa mmi/; - -add_proto qw/void vpx_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]"; -specialize qw/vpx_sad16x8x3 sse3 ssse3 msa mmi/; - -add_proto qw/void vpx_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]"; -specialize qw/vpx_sad8x16x3 sse3 msa mmi/; - -add_proto qw/void vpx_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]"; -specialize qw/vpx_sad8x8x3 sse3 msa mmi/; - -add_proto qw/void vpx_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]"; -specialize qw/vpx_sad4x4x3 sse3 msa mmi/; - -# Blocks of 8 -add_proto qw/void vpx_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; -specialize qw/vpx_sad32x32x8 avx2/; - -add_proto qw/void vpx_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; -specialize qw/vpx_sad16x16x8 sse4_1 msa mmi/; - -add_proto qw/void vpx_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; -specialize qw/vpx_sad16x8x8 sse4_1 msa mmi/; - -add_proto qw/void vpx_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; -specialize qw/vpx_sad8x16x8 sse4_1 msa mmi/; - -add_proto qw/void vpx_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; -specialize qw/vpx_sad8x8x8 sse4_1 msa mmi/; - -add_proto qw/void vpx_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; -specialize qw/vpx_sad4x4x8 sse4_1 msa mmi/; - # # Multi-block SAD, comparing a reference to N independent blocks # diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c index 5f1f757e25..6c5c8ebc87 100644 --- a/vpx_dsp/x86/sad4d_avx2.c +++ b/vpx_dsp/x86/sad4d_avx2.c @@ -68,63 +68,6 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums, sad_array); } -void vpx_sad32x32x8_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - uint32_t sad_array[8]) { - int i; - __m256i sums[8]; - - sums[0] = _mm256_setzero_si256(); - sums[1] = _mm256_setzero_si256(); - sums[2] = _mm256_setzero_si256(); - sums[3] = _mm256_setzero_si256(); - sums[4] = _mm256_setzero_si256(); - sums[5] = _mm256_setzero_si256(); - sums[6] = _mm256_setzero_si256(); - sums[7] = _mm256_setzero_si256(); - - for (i = 0; i < 32; i++) { - __m256i r[8]; - - // load src and all ref[] - const __m256i s = _mm256_load_si256((const __m256i *)src_ptr); - r[0] = _mm256_loadu_si256((const __m256i *)&ref_ptr[0]); - r[1] = _mm256_loadu_si256((const __m256i *)&ref_ptr[1]); - r[2] = _mm256_loadu_si256((const __m256i *)&ref_ptr[2]); - r[3] = _mm256_loadu_si256((const __m256i *)&ref_ptr[3]); - r[4] = _mm256_loadu_si256((const __m256i *)&ref_ptr[4]); - r[5] = _mm256_loadu_si256((const __m256i *)&ref_ptr[5]); - r[6] = _mm256_loadu_si256((const __m256i *)&ref_ptr[6]); - r[7] = 
_mm256_loadu_si256((const __m256i *)&ref_ptr[7]); - - // sum of the absolute differences between every ref[] to src - r[0] = _mm256_sad_epu8(r[0], s); - r[1] = _mm256_sad_epu8(r[1], s); - r[2] = _mm256_sad_epu8(r[2], s); - r[3] = _mm256_sad_epu8(r[3], s); - r[4] = _mm256_sad_epu8(r[4], s); - r[5] = _mm256_sad_epu8(r[5], s); - r[6] = _mm256_sad_epu8(r[6], s); - r[7] = _mm256_sad_epu8(r[7], s); - - // sum every ref[] - sums[0] = _mm256_add_epi32(sums[0], r[0]); - sums[1] = _mm256_add_epi32(sums[1], r[1]); - sums[2] = _mm256_add_epi32(sums[2], r[2]); - sums[3] = _mm256_add_epi32(sums[3], r[3]); - sums[4] = _mm256_add_epi32(sums[4], r[4]); - sums[5] = _mm256_add_epi32(sums[5], r[5]); - sums[6] = _mm256_add_epi32(sums[6], r[6]); - sums[7] = _mm256_add_epi32(sums[7], r[7]); - - src_ptr += src_stride; - ref_ptr += ref_stride; - } - - calc_final_4(sums, sad_array); - calc_final_4(sums + 4, sad_array + 4); -} - void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]) { diff --git a/vpx_dsp/x86/sad_sse3.asm b/vpx_dsp/x86/sad_sse3.asm deleted file mode 100644 index acbd2e4fa1..0000000000 --- a/vpx_dsp/x86/sad_sse3.asm +++ /dev/null @@ -1,376 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "vpx_ports/x86_abi_support.asm" - -%macro STACK_FRAME_CREATE_X3 0 -%if ABI_IS_32BIT - %define src_ptr rsi - %define src_stride rax - %define ref_ptr rdi - %define ref_stride rdx - %define end_ptr rcx - %define ret_var rbx - %define result_ptr arg(4) - %define height dword ptr arg(4) - push rbp - mov rbp, rsp - push rsi - push rdi - push rbx - - mov rsi, arg(0) ; src_ptr - mov rdi, arg(2) ; ref_ptr - - movsxd rax, dword ptr arg(1) ; src_stride - movsxd rdx, dword ptr arg(3) ; ref_stride -%else - %if LIBVPX_YASM_WIN64 - SAVE_XMM 7, u - %define src_ptr rcx - %define src_stride rdx - %define ref_ptr r8 - %define ref_stride r9 - %define end_ptr r10 - %define ret_var r11 - %define result_ptr [rsp+xmm_stack_space+8+4*8] - %define height dword ptr [rsp+xmm_stack_space+8+4*8] - %else - %define src_ptr rdi - %define src_stride rsi - %define ref_ptr rdx - %define ref_stride rcx - %define end_ptr r9 - %define ret_var r10 - %define result_ptr r8 - %define height r8 - %endif -%endif - -%endmacro - -%macro STACK_FRAME_DESTROY_X3 0 - %define src_ptr - %define src_stride - %define ref_ptr - %define ref_stride - %define end_ptr - %define ret_var - %define result_ptr - %define height - -%if ABI_IS_32BIT - pop rbx - pop rdi - pop rsi - pop rbp -%else - %if LIBVPX_YASM_WIN64 - RESTORE_XMM - %endif -%endif - ret -%endmacro - -%macro PROCESS_16X2X3 5 -%if %1==0 - movdqa xmm0, XMMWORD PTR [%2] - lddqu xmm5, XMMWORD PTR [%3] - lddqu xmm6, XMMWORD PTR [%3+1] - lddqu xmm7, XMMWORD PTR [%3+2] - - psadbw xmm5, xmm0 - psadbw xmm6, xmm0 - psadbw xmm7, xmm0 -%else - movdqa xmm0, XMMWORD PTR [%2] - lddqu xmm1, XMMWORD PTR [%3] - lddqu xmm2, XMMWORD PTR [%3+1] - lddqu xmm3, XMMWORD PTR [%3+2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endif - movdqa xmm0, XMMWORD PTR [%2+%4] - lddqu xmm1, XMMWORD PTR [%3+%5] - lddqu xmm2, 
XMMWORD PTR [%3+%5+1] - lddqu xmm3, XMMWORD PTR [%3+%5+2] - -%if %1==0 || %1==1 - lea %2, [%2+%4*2] - lea %3, [%3+%5*2] -%endif - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endmacro - -%macro PROCESS_8X2X3 5 -%if %1==0 - movq mm0, QWORD PTR [%2] - movq mm5, QWORD PTR [%3] - movq mm6, QWORD PTR [%3+1] - movq mm7, QWORD PTR [%3+2] - - psadbw mm5, mm0 - psadbw mm6, mm0 - psadbw mm7, mm0 -%else - movq mm0, QWORD PTR [%2] - movq mm1, QWORD PTR [%3] - movq mm2, QWORD PTR [%3+1] - movq mm3, QWORD PTR [%3+2] - - psadbw mm1, mm0 - psadbw mm2, mm0 - psadbw mm3, mm0 - - paddw mm5, mm1 - paddw mm6, mm2 - paddw mm7, mm3 -%endif - movq mm0, QWORD PTR [%2+%4] - movq mm1, QWORD PTR [%3+%5] - movq mm2, QWORD PTR [%3+%5+1] - movq mm3, QWORD PTR [%3+%5+2] - -%if %1==0 || %1==1 - lea %2, [%2+%4*2] - lea %3, [%3+%5*2] -%endif - - psadbw mm1, mm0 - psadbw mm2, mm0 - psadbw mm3, mm0 - - paddw mm5, mm1 - paddw mm6, mm2 - paddw mm7, mm3 -%endmacro - -SECTION .text - -;void int vpx_sad16x16x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -globalsym(vpx_sad16x16x3_sse3) -sym(vpx_sad16x16x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rcx], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rcx+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rcx+8], xmm0 - - STACK_FRAME_DESTROY_X3 - -;void int vpx_sad16x8x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -globalsym(vpx_sad16x8x3_sse3) -sym(vpx_sad16x8x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rcx], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rcx+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rcx+8], xmm0 - - STACK_FRAME_DESTROY_X3 - -;void int vpx_sad8x16x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -globalsym(vpx_sad8x16x3_sse3) -sym(vpx_sad8x16x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - punpckldq mm5, mm6 
- - movq [rcx], mm5 - movd [rcx+8], mm7 - - STACK_FRAME_DESTROY_X3 - -;void int vpx_sad8x8x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -globalsym(vpx_sad8x8x3_sse3) -sym(vpx_sad8x8x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - punpckldq mm5, mm6 - - movq [rcx], mm5 - movd [rcx+8], mm7 - - STACK_FRAME_DESTROY_X3 - -;void int vpx_sad4x4x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -globalsym(vpx_sad4x4x3_sse3) -sym(vpx_sad4x4x3_sse3): - - STACK_FRAME_CREATE_X3 - - movd mm0, DWORD PTR [src_ptr] - movd mm1, DWORD PTR [ref_ptr] - - movd mm2, DWORD PTR [src_ptr+src_stride] - movd mm3, DWORD PTR [ref_ptr+ref_stride] - - punpcklbw mm0, mm2 - punpcklbw mm1, mm3 - - movd mm4, DWORD PTR [ref_ptr+1] - movd mm5, DWORD PTR [ref_ptr+2] - - movd mm2, DWORD PTR [ref_ptr+ref_stride+1] - movd mm3, DWORD PTR [ref_ptr+ref_stride+2] - - psadbw mm1, mm0 - - punpcklbw mm4, mm2 - punpcklbw mm5, mm3 - - psadbw mm4, mm0 - psadbw mm5, mm0 - - lea src_ptr, [src_ptr+src_stride*2] - lea ref_ptr, [ref_ptr+ref_stride*2] - - movd mm0, DWORD PTR [src_ptr] - movd mm2, DWORD PTR [ref_ptr] - - movd mm3, DWORD PTR [src_ptr+src_stride] - movd mm6, DWORD PTR [ref_ptr+ref_stride] - - punpcklbw mm0, mm3 - punpcklbw mm2, mm6 - - movd mm3, DWORD PTR [ref_ptr+1] - movd mm7, DWORD PTR [ref_ptr+2] - - psadbw mm2, mm0 - - paddw mm1, mm2 - - movd mm2, DWORD PTR [ref_ptr+ref_stride+1] - movd mm6, DWORD PTR [ref_ptr+ref_stride+2] - - punpcklbw mm3, mm2 - punpcklbw mm7, mm6 - - psadbw mm3, mm0 - psadbw mm7, mm0 - - paddw mm3, mm4 - paddw mm7, mm5 - - mov rcx, result_ptr - - punpckldq mm1, mm3 - - movq [rcx], mm1 - movd [rcx+8], mm7 - - STACK_FRAME_DESTROY_X3 diff --git a/vpx_dsp/x86/sad_sse4.asm b/vpx_dsp/x86/sad_sse4.asm deleted file mode 100644 index 0818ed5f0a..0000000000 --- a/vpx_dsp/x86/sad_sse4.asm +++ /dev/null @@ -1,361 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - -%include "vpx_ports/x86_abi_support.asm" - -%macro PROCESS_16X2X8 1 -%if %1 - movdqa xmm0, XMMWORD PTR [rsi] - movq xmm1, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - movq xmm2, MMWORD PTR [rdi+16] - punpcklqdq xmm1, xmm3 - punpcklqdq xmm3, xmm2 - - movdqa xmm2, xmm1 - mpsadbw xmm1, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - - psrldq xmm0, 8 - - movdqa xmm4, xmm3 - mpsadbw xmm3, xmm0, 0x0 - mpsadbw xmm4, xmm0, 0x5 - - paddw xmm1, xmm2 - paddw xmm1, xmm3 - paddw xmm1, xmm4 -%else - movdqa xmm0, XMMWORD PTR [rsi] - movq xmm5, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - movq xmm2, MMWORD PTR [rdi+16] - punpcklqdq xmm5, xmm3 - punpcklqdq xmm3, xmm2 - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - - psrldq xmm0, 8 - - movdqa xmm4, xmm3 - mpsadbw xmm3, xmm0, 0x0 - mpsadbw xmm4, xmm0, 0x5 - - paddw xmm5, xmm2 - paddw xmm5, xmm3 - paddw xmm5, xmm4 - - paddw xmm1, xmm5 -%endif - movdqa xmm0, XMMWORD PTR [rsi + rax] - movq xmm5, MMWORD PTR [rdi+ rdx] - movq xmm3, MMWORD PTR [rdi+ rdx+8] - movq xmm2, MMWORD PTR [rdi+ rdx+16] - punpcklqdq xmm5, xmm3 - punpcklqdq xmm3, xmm2 - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - - psrldq xmm0, 8 - movdqa xmm4, xmm3 - mpsadbw xmm3, xmm0, 0x0 - mpsadbw xmm4, xmm0, 0x5 - - paddw xmm5, xmm2 - paddw xmm5, xmm3 - paddw xmm5, xmm4 - - paddw xmm1, xmm5 -%endmacro - -%macro PROCESS_8X2X8 1 -%if %1 - movq xmm0, MMWORD PTR [rsi] - movq xmm1, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm1, xmm3 - - movdqa xmm2, xmm1 - mpsadbw xmm1, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - paddw xmm1, xmm2 -%else - movq xmm0, MMWORD PTR [rsi] - movq xmm5, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm5, xmm3 - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - paddw xmm5, xmm2 - - paddw xmm1, xmm5 -%endif - movq xmm0, MMWORD PTR [rsi + rax] - movq xmm5, MMWORD PTR [rdi+ rdx] - movq xmm3, MMWORD PTR [rdi+ rdx+8] - punpcklqdq xmm5, xmm3 - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - paddw xmm5, xmm2 - - paddw xmm1, xmm5 -%endmacro - -%macro PROCESS_4X2X8 1 -%if %1 - movd xmm0, [rsi] - movq xmm1, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm1, xmm3 - - mpsadbw xmm1, xmm0, 0x0 -%else - movd xmm0, [rsi] - movq xmm5, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm5, xmm3 - - mpsadbw xmm5, xmm0, 0x0 - - paddw xmm1, xmm5 -%endif - movd xmm0, [rsi + rax] - movq xmm5, MMWORD PTR [rdi+ rdx] - movq xmm3, MMWORD PTR [rdi+ rdx+8] - punpcklqdq xmm5, xmm3 - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - mpsadbw xmm5, xmm0, 0x0 - - paddw xmm1, xmm5 -%endmacro - -%macro WRITE_AS_INTS 0 - mov rdi, arg(4) ;Results - pxor xmm0, xmm0 - movdqa xmm2, xmm1 - punpcklwd xmm1, xmm0 - punpckhwd xmm2, xmm0 - - movdqa [rdi], xmm1 - movdqa [rdi + 16], xmm2 -%endmacro - -SECTION .text - -;void vpx_sad16x16x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array); -globalsym(vpx_sad16x16x8_sse4_1) -sym(vpx_sad16x16x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_16X2X8 1 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 
0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void vpx_sad16x8x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -globalsym(vpx_sad16x8x8_sse4_1) -sym(vpx_sad16x8x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_16X2X8 1 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void vpx_sad8x8x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -globalsym(vpx_sad8x8x8_sse4_1) -sym(vpx_sad8x8x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_8X2X8 1 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void vpx_sad8x16x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -globalsym(vpx_sad8x16x8_sse4_1) -sym(vpx_sad8x16x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_8X2X8 1 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void vpx_sad4x4x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -globalsym(vpx_sad4x4x8_sse4_1) -sym(vpx_sad4x4x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_4X2X8 1 - PROCESS_4X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - - - diff --git a/vpx_dsp/x86/sad_ssse3.asm b/vpx_dsp/x86/sad_ssse3.asm deleted file mode 100644 index a5bc6d7306..0000000000 --- a/vpx_dsp/x86/sad_ssse3.asm +++ /dev/null @@ -1,372 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - -%include "vpx_ports/x86_abi_support.asm" - -%macro PROCESS_16X2X3 1 -%if %1 - movdqa xmm0, XMMWORD PTR [rsi] - lddqu xmm5, XMMWORD PTR [rdi] - lddqu xmm6, XMMWORD PTR [rdi+1] - lddqu xmm7, XMMWORD PTR [rdi+2] - - psadbw xmm5, xmm0 - psadbw xmm6, xmm0 - psadbw xmm7, xmm0 -%else - movdqa xmm0, XMMWORD PTR [rsi] - lddqu xmm1, XMMWORD PTR [rdi] - lddqu xmm2, XMMWORD PTR [rdi+1] - lddqu xmm3, XMMWORD PTR [rdi+2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endif - movdqa xmm0, XMMWORD PTR [rsi+rax] - lddqu xmm1, XMMWORD PTR [rdi+rdx] - lddqu xmm2, XMMWORD PTR [rdi+rdx+1] - lddqu xmm3, XMMWORD PTR [rdi+rdx+2] - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endmacro - -%macro PROCESS_16X2X3_OFFSET 2 -%if %1 - movdqa xmm0, XMMWORD PTR [rsi] - movdqa xmm4, XMMWORD PTR [rdi] - movdqa xmm7, XMMWORD PTR [rdi+16] - - movdqa xmm5, xmm7 - palignr xmm5, xmm4, %2 - - movdqa xmm6, xmm7 - palignr xmm6, xmm4, (%2+1) - - palignr xmm7, xmm4, (%2+2) - - psadbw xmm5, xmm0 - psadbw xmm6, xmm0 - psadbw xmm7, xmm0 -%else - movdqa xmm0, XMMWORD PTR [rsi] - movdqa xmm4, XMMWORD PTR [rdi] - movdqa xmm3, XMMWORD PTR [rdi+16] - - movdqa xmm1, xmm3 - palignr xmm1, xmm4, %2 - - movdqa xmm2, xmm3 - palignr xmm2, xmm4, (%2+1) - - palignr xmm3, xmm4, (%2+2) - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endif - movdqa xmm0, XMMWORD PTR [rsi+rax] - movdqa xmm4, XMMWORD PTR [rdi+rdx] - movdqa xmm3, XMMWORD PTR [rdi+rdx+16] - - movdqa xmm1, xmm3 - palignr xmm1, xmm4, %2 - - movdqa xmm2, xmm3 - palignr xmm2, xmm4, (%2+1) - - palignr xmm3, xmm4, (%2+2) - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endmacro - -%macro PROCESS_16X16X3_OFFSET 2 -%2_aligned_by_%1: - - sub rdi, %1 - - PROCESS_16X2X3_OFFSET 1, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - - jmp %2_store_off - -%endmacro - -%macro PROCESS_16X8X3_OFFSET 2 -%2_aligned_by_%1: - - sub rdi, %1 - - PROCESS_16X2X3_OFFSET 1, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - - jmp %2_store_off - -%endmacro - -SECTION .text - -;void int vpx_sad16x16x3_ssse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -globalsym(vpx_sad16x16x3_ssse3) -sym(vpx_sad16x16x3_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - push rsi - push rdi - push rcx - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - mov rdx, 0xf - and rdx, rdi - - jmp .vpx_sad16x16x3_ssse3_skiptable -.vpx_sad16x16x3_ssse3_jumptable: - dd .vpx_sad16x16x3_ssse3_aligned_by_0 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_1 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_2 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_3 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_4 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_5 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_6 - 
.vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_7 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_8 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_9 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_10 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_11 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_12 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_13 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_14 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_15 - .vpx_sad16x16x3_ssse3_do_jump -.vpx_sad16x16x3_ssse3_skiptable: - - call .vpx_sad16x16x3_ssse3_do_jump -.vpx_sad16x16x3_ssse3_do_jump: - pop rcx ; get the address of do_jump - mov rax, .vpx_sad16x16x3_ssse3_jumptable - .vpx_sad16x16x3_ssse3_do_jump - add rax, rcx ; get the absolute address of vpx_sad16x16x3_ssse3_jumptable - - movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable - add rcx, rax - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - jmp rcx - - PROCESS_16X16X3_OFFSET 0, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 1, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 2, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 3, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 4, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 5, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 6, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 7, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 8, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 9, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 10, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 11, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 12, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 13, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 14, .vpx_sad16x16x3_ssse3 - -.vpx_sad16x16x3_ssse3_aligned_by_15: - PROCESS_16X2X3 1 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - -.vpx_sad16x16x3_ssse3_store_off: - mov rdi, arg(4) ;Results - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rdi], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rdi+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rdi+8], xmm0 - - ; begin epilog - pop rcx - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void int vpx_sad16x8x3_ssse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -globalsym(vpx_sad16x8x3_ssse3) -sym(vpx_sad16x8x3_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - push rsi - push rdi - push rcx - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - mov rdx, 0xf - and rdx, rdi - - jmp .vpx_sad16x8x3_ssse3_skiptable -.vpx_sad16x8x3_ssse3_jumptable: - dd .vpx_sad16x8x3_ssse3_aligned_by_0 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_1 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_2 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_3 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_4 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_5 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_6 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_7 - .vpx_sad16x8x3_ssse3_do_jump - 
dd .vpx_sad16x8x3_ssse3_aligned_by_8 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_9 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_10 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_11 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_12 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_13 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_14 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_15 - .vpx_sad16x8x3_ssse3_do_jump -.vpx_sad16x8x3_ssse3_skiptable: - - call .vpx_sad16x8x3_ssse3_do_jump -.vpx_sad16x8x3_ssse3_do_jump: - pop rcx ; get the address of do_jump - mov rax, .vpx_sad16x8x3_ssse3_jumptable - .vpx_sad16x8x3_ssse3_do_jump - add rax, rcx ; get the absolute address of vpx_sad16x8x3_ssse3_jumptable - - movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable - add rcx, rax - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - jmp rcx - - PROCESS_16X8X3_OFFSET 0, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 1, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 2, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 3, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 4, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 5, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 6, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 7, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 8, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 9, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 10, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 11, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 12, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 13, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 14, .vpx_sad16x8x3_ssse3 - -.vpx_sad16x8x3_ssse3_aligned_by_15: - - PROCESS_16X2X3 1 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - -.vpx_sad16x8x3_ssse3_store_off: - mov rdi, arg(4) ;Results - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rdi], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rdi+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rdi+8], xmm0 - - ; begin epilog - pop rcx - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret From 02808ecbccf4fa385a700cffdd1aac796f6f37ca Mon Sep 17 00:00:00 2001 From: Johann Date: Tue, 29 Mar 2022 12:40:12 +0900 Subject: [PATCH 230/926] remove skip_block from quantize Whether a block is skipped is handled by mi->skip. x->skip_block is kept exclusively to verify that the quantize functions are not called for skip blocks. 
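Schematically, every quantize entry point loses the dead argument; taking
vp9_quantize_fp as an example (prototype abridged from the vp9_rtcd_defs.pl
hunk below, before and after):

    /* before: skip_block only fed (void)skip_block; assert(!skip_block); */
    void vp9_quantize_fp(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         int skip_block, const int16_t *round_ptr,
                         const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                         uint16_t *eob_ptr, const int16_t *scan,
                         const int16_t *iscan);

    /* after: the argument and the asserts are gone */
    void vp9_quantize_fp(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         const int16_t *round_ptr, const int16_t *quant_ptr,
                         tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                         const int16_t *dequant_ptr, uint16_t *eob_ptr,
                         const int16_t *scan, const int16_t *iscan);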
Finishes the cleanup in 13eed991f Bug: libvpx:1612 Change-Id: I1598c3b682d3c5e6c57a15fa4cb5df2c65b3a58a --- test/vp9_quantize_test.cc | 89 ++++----- vp9/common/vp9_rtcd_defs.pl | 8 +- vp9/encoder/arm/neon/vp9_quantize_neon.c | 15 +- vp9/encoder/ppc/vp9_quantize_vsx.c | 15 +- vp9/encoder/vp9_encodemb.c | 187 ++++++++---------- vp9/encoder/vp9_encoder.c | 19 +- vp9/encoder/vp9_pickmode.c | 15 +- vp9/encoder/vp9_quantize.c | 45 ++--- vp9/encoder/x86/vp9_quantize_avx2.c | 11 +- vp9/encoder/x86/vp9_quantize_sse2.c | 11 +- vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm | 4 +- vpx_dsp/arm/quantize_neon.c | 10 +- vpx_dsp/ppc/quantize_vsx.c | 22 +-- vpx_dsp/quantize.c | 78 ++++---- vpx_dsp/quantize.h | 13 +- vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 +- vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 10 +- vpx_dsp/x86/quantize_avx.c | 11 +- vpx_dsp/x86/quantize_sse2.c | 6 +- vpx_dsp/x86/quantize_ssse3.c | 10 +- 20 files changed, 252 insertions(+), 335 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index cb4481b103..d54f1bc9cd 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -38,26 +38,24 @@ namespace { const int number_of_iterations = 100; typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count, - int skip_block, const int16_t *zbin, - const int16_t *round, const int16_t *quant, - const int16_t *quant_shift, tran_low_t *qcoeff, - tran_low_t *dqcoeff, const int16_t *dequant, - uint16_t *eob, const int16_t *scan, - const int16_t *iscan); + const int16_t *zbin, const int16_t *round, + const int16_t *quant, const int16_t *quant_shift, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, + const int16_t *scan, const int16_t *iscan); typedef std::tuple QuantizeParam; // Wrapper for FP version which does not use zbin or quant_shift. 
typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count, - int skip_block, const int16_t *round, - const int16_t *quant, tran_low_t *qcoeff, - tran_low_t *dqcoeff, const int16_t *dequant, - uint16_t *eob, const int16_t *scan, - const int16_t *iscan); + const int16_t *round, const int16_t *quant, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, + const int16_t *scan, const int16_t *iscan); template -void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, int skip_block, +void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, const int16_t *zbin, const int16_t *round, const int16_t *quant, const int16_t *quant_shift, tran_low_t *qcoeff, tran_low_t *dqcoeff, @@ -66,8 +64,7 @@ void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, int skip_block, (void)zbin; (void)quant_shift; - fn(coeff, count, skip_block, round, quant, qcoeff, dqcoeff, dequant, eob, - scan, iscan); + fn(coeff, count, round, quant, qcoeff, dqcoeff, dequant, eob, scan, iscan); } class VP9QuantizeBase : public AbstractBench { @@ -138,7 +135,6 @@ class VP9QuantizeBase : public AbstractBench { int16_t *r_ptr_; int16_t *q_ptr_; int count_; - int skip_block_; const scan_order *scan_; uint16_t eob_; }; @@ -157,8 +153,8 @@ class VP9QuantizeTest : public VP9QuantizeBase, }; void VP9QuantizeTest::Run() { - quantize_op_(coeff_.TopLeftPixel(), count_, skip_block_, zbin_ptr_, r_ptr_, - q_ptr_, quant_shift_ptr_, qcoeff_.TopLeftPixel(), + quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, + quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_->scan, scan_->iscan); } @@ -167,16 +163,14 @@ void VP9QuantizeTest::Run() { // determine if further multiplication operations are needed. // Based on vp9_quantize_fp_sse2(). inline void quant_fp_nz(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan, int is_32x32) { + const int16_t *round_ptr, const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, + int is_32x32) { int i, eob = -1; const int thr = dequant_ptr[1] >> (1 + is_32x32); (void)iscan; - (void)skip_block; - assert(!skip_block); // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. 
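As background for the test helper above: quant_fp_nz() mimics the SIMD
shortcut of skipping the quantization multiplies for AC coefficients whose
magnitude falls below a threshold derived from the dequant step
(thr = dequant_ptr[1] >> (1 + is_32x32)), since such coefficients quantize
to zero anyway. The following is a minimal scalar sketch of that idea, not
libvpx code — it omits the 16-bit clamp and the 32x32 rounding adjustment
that the real vp9_quantize_fp_c() performs:

    #include <stdint.h>

    typedef int32_t tran_low_t; /* int16_t in non-high-bitdepth builds */

    /* Hypothetical helper for illustration only: scalar "fp" quantize with
     * the below-threshold early-out that the SIMD versions model. */
    static void quantize_fp_sketch(const tran_low_t *coeff, intptr_t n_coeffs,
                                   const int16_t *round, const int16_t *quant,
                                   tran_low_t *qcoeff, tran_low_t *dqcoeff,
                                   const int16_t *dequant, uint16_t *eob,
                                   const int16_t *scan) {
      const int thr = dequant[1] >> 1; /* cf. dequant_ptr[1] >> (1 + is_32x32) */
      int last_nz = -1;
      intptr_t i;
      for (i = 0; i < n_coeffs; ++i) {
        const int rc = scan[i]; /* coefficient position of scan-order index i */
        const int c = coeff[rc];
        const int abs_c = c < 0 ? -c : c;
        int q = 0;
        /* rc != 0 selects the AC entry of round/quant/dequant; DC is index 0.
         * Below thr the rounded product shifts down to zero, so the
         * multiplies can be skipped (DC is always quantized in full). */
        if (rc == 0 || abs_c >= thr) {
          q = ((abs_c + round[rc != 0]) * quant[rc != 0]) >> 16;
        }
        qcoeff[rc] = c < 0 ? -q : q;
        dqcoeff[rc] = qcoeff[rc] * dequant[rc != 0];
        if (q) last_nz = (int)i;
      }
      *eob = (uint16_t)(last_nz + 1);
    }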
@@ -243,22 +237,20 @@ inline void quant_fp_nz(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } void quantize_fp_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { - quant_fp_nz(coeff_ptr, n_coeffs, skip_block, round_ptr, quant_ptr, qcoeff_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + quant_fp_nz(coeff_ptr, n_coeffs, round_ptr, quant_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 0); } void quantize_fp_32x32_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { - quant_fp_nz(coeff_ptr, n_coeffs, skip_block, round_ptr, quant_ptr, qcoeff_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + quant_fp_nz(coeff_ptr, n_coeffs, round_ptr, quant_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 1); } @@ -316,9 +308,6 @@ TEST_P(VP9QuantizeTest, OperationCheck) { eob_ = 0; for (int i = 0; i < number_of_iterations; ++i) { - // Test skip block for the first three iterations to catch all the different - // sizes. - const int skip_block = 0; TX_SIZE sz; if (max_size_ == 16) { sz = static_cast(i % 3); // TX_4X4, TX_8X8 TX_16X16 @@ -332,13 +321,13 @@ TEST_P(VP9QuantizeTest, OperationCheck) { GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - ref_quantize_op_(coeff_.TopLeftPixel(), count_, skip_block, zbin_ptr_, - r_ptr_, q_ptr_, quant_shift_ptr_, - ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), - dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, + quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), + ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, + scan_->scan, scan_->iscan); ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_.TopLeftPixel(), count_, skip_block, zbin_ptr_, r_ptr_, q_ptr_, + coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_->scan, scan_->iscan)); @@ -372,7 +361,6 @@ TEST_P(VP9QuantizeTest, EOBCheck) { const uint32_t max_index = max_size_ * max_size_ - 1; for (int i = 0; i < number_of_iterations; ++i) { - skip_block_ = 0; TX_SIZE sz; if (max_size_ == 16) { sz = static_cast(i % 3); // TX_4X4, TX_8X8 TX_16X16 @@ -391,13 +379,13 @@ TEST_P(VP9QuantizeTest, EOBCheck) { GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - ref_quantize_op_(coeff_.TopLeftPixel(), count_, skip_block_, zbin_ptr_, - r_ptr_, q_ptr_, quant_shift_ptr_, - ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), - dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, + quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), + ref_dqcoeff.TopLeftPixel(), 
dequant_ptr_, &ref_eob, + scan_->scan, scan_->iscan); ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_.TopLeftPixel(), count_, skip_block_, zbin_ptr_, r_ptr_, q_ptr_, + coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_->scan, scan_->iscan)); @@ -433,7 +421,6 @@ TEST_P(VP9QuantizeTest, DISABLED_Speed) { for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) { // zbin > coeff, zbin < coeff. for (int i = 0; i < 2; ++i) { - skip_block_ = 0; // TX_TYPE defines the scan order. That is not relevant to the speed test. // Pick the first one. const TX_TYPE tx_type = DCT_DCT; diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 6980b9b7fb..5146121a8d 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -128,10 +128,10 @@ () add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size"; -add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; +add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_quantize_fp neon sse2 avx2 vsx/, "$ssse3_x86_64"; -add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; +add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_quantize_fp_32x32 neon vsx/, "$ssse3_x86_64"; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { @@ -195,9 +195,9 @@ () # ENCODEMB INVOKE - add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" ; + add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" 
; # fdct functions add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c index d75a481796..236c3176c7 100644 --- a/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -43,11 +43,10 @@ static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, } void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, - int skip_block, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const int16_t *round_ptr, const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. int i; @@ -59,8 +58,6 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]); (void)scan; - (void)skip_block; - assert(!skip_block); // adjust for dc v_round = vsetq_lane_s16(round_ptr[0], v_round, 0); @@ -138,7 +135,7 @@ static INLINE int32x4_t extract_sign_bit(int32x4_t a) { } void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, - int skip_block, const int16_t *round_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, @@ -167,8 +164,6 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, uint16x8_t eob_max; (void)scan; (void)count; - (void)skip_block; - assert(!skip_block); // coeff * quant_ptr[]) >> 15 qcoeff = vqdmulhq_s16(qcoeff, quant); diff --git a/vp9/encoder/ppc/vp9_quantize_vsx.c b/vp9/encoder/ppc/vp9_quantize_vsx.c index 4f88b8fff6..4d31558471 100644 --- a/vp9/encoder/ppc/vp9_quantize_vsx.c +++ b/vp9/encoder/ppc/vp9_quantize_vsx.c @@ -39,11 +39,10 @@ static INLINE int16x8_t vec_max_across(int16x8_t a) { } void vp9_quantize_fp_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const int16_t *round_ptr, const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob; bool16x8_t zero_coeff0, zero_coeff1; @@ -56,8 +55,6 @@ void vp9_quantize_fp_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int16x8_t scan1 = vec_vsx_ld(16, iscan); (void)scan; - (void)skip_block; - assert(!skip_block); // First set of 8 coeff starts with DC + 7 AC qcoeff0 = vec_mulhi(vec_vaddshs(vec_abs(coeff0), round), quant); @@ -165,7 +162,7 @@ static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff, } void vp9_quantize_fp_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *round_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, @@ -194,9 +191,7 @@ void vp9_quantize_fp_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int16x8_t 
abs_coeff1 = vec_abs(coeff1); (void)scan; - (void)skip_block; (void)n_coeffs; - assert(!skip_block); mask0 = vec_cmpge(abs_coeff0, thres); round = vec_sra(vec_add(round, vec_ones_s16), vec_ones_u16); diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 7630a81103..fa222f9dcf 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -366,28 +366,28 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->round_fp, - p->quant_fp, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + vp9_highbd_quantize_fp_32x32(coeff, 1024, p->round_fp, p->quant_fp, + qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp(coeff, 256, x->skip_block, p->round_fp, - p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vp9_highbd_quantize_fp(coeff, 256, p->round_fp, p->quant_fp, qcoeff, + dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); break; case TX_8X8: vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp(coeff, 64, x->skip_block, p->round_fp, - p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vp9_highbd_quantize_fp(coeff, 64, p->round_fp, p->quant_fp, qcoeff, + dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp(coeff, 16, x->skip_block, p->round_fp, - p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vp9_highbd_quantize_fp(coeff, 16, p->round_fp, p->quant_fp, qcoeff, + dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); break; } return; @@ -397,29 +397,26 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->round_fp, - p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vp9_quantize_fp_32x32(coeff, 1024, p->round_fp, p->quant_fp, qcoeff, + dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); break; case TX_16X16: vpx_fdct16x16(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 256, x->skip_block, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp(coeff, 256, p->round_fp, p->quant_fp, qcoeff, dqcoeff, + pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_8X8: vpx_fdct8x8(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 64, x->skip_block, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp(coeff, 64, p->round_fp, p->quant_fp, qcoeff, dqcoeff, + pd->dequant, eob, scan_order->scan, scan_order->iscan); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp(coeff, 16, p->round_fp, p->quant_fp, qcoeff, dqcoeff, + pd->dequant, eob, scan_order->scan, 
scan_order->iscan); break; } } @@ -444,28 +441,24 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: vpx_highbd_fdct32x32_1(src_diff, coeff, diff_stride); - vpx_highbd_quantize_dc_32x32(coeff, x->skip_block, p->round, - p->quant_fp[0], qcoeff, dqcoeff, - pd->dequant[0], eob); + vpx_highbd_quantize_dc_32x32(coeff, p->round, p->quant_fp[0], qcoeff, + dqcoeff, pd->dequant[0], eob); break; case TX_16X16: vpx_highbd_fdct16x16_1(src_diff, coeff, diff_stride); - vpx_highbd_quantize_dc(coeff, 256, x->skip_block, p->round, - p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], - eob); + vpx_highbd_quantize_dc(coeff, 256, p->round, p->quant_fp[0], qcoeff, + dqcoeff, pd->dequant[0], eob); break; case TX_8X8: vpx_highbd_fdct8x8_1(src_diff, coeff, diff_stride); - vpx_highbd_quantize_dc(coeff, 64, x->skip_block, p->round, - p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], - eob); + vpx_highbd_quantize_dc(coeff, 64, p->round, p->quant_fp[0], qcoeff, + dqcoeff, pd->dequant[0], eob); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_highbd_quantize_dc(coeff, 16, x->skip_block, p->round, - p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], - eob); + vpx_highbd_quantize_dc(coeff, 16, p->round, p->quant_fp[0], qcoeff, + dqcoeff, pd->dequant[0], eob); break; } return; @@ -475,24 +468,24 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: vpx_fdct32x32_1(src_diff, coeff, diff_stride); - vpx_quantize_dc_32x32(coeff, x->skip_block, p->round, p->quant_fp[0], - qcoeff, dqcoeff, pd->dequant[0], eob); + vpx_quantize_dc_32x32(coeff, p->round, p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); break; case TX_16X16: vpx_fdct16x16_1(src_diff, coeff, diff_stride); - vpx_quantize_dc(coeff, 256, x->skip_block, p->round, p->quant_fp[0], - qcoeff, dqcoeff, pd->dequant[0], eob); + vpx_quantize_dc(coeff, 256, p->round, p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); break; case TX_8X8: vpx_fdct8x8_1(src_diff, coeff, diff_stride); - vpx_quantize_dc(coeff, 64, x->skip_block, p->round, p->quant_fp[0], - qcoeff, dqcoeff, pd->dequant[0], eob); + vpx_quantize_dc(coeff, 64, p->round, p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_quantize_dc(coeff, 16, x->skip_block, p->round, p->quant_fp[0], - qcoeff, dqcoeff, pd->dequant[0], eob); + vpx_quantize_dc(coeff, 16, p->round, p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); break; } } @@ -518,32 +511,28 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, - p->round, p->quant, p->quant_shift, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_highbd_quantize_b_32x32( + coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, + dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_highbd_quantize_b(coeff, 256, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + 
scan_order->scan, scan_order->iscan); break; case TX_8X8: vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_highbd_quantize_b(coeff, 64, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_highbd_quantize_b(coeff, 16, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); break; } return; @@ -553,29 +542,28 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); break; case TX_16X16: vpx_fdct16x16(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant, p->quant_shift, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); break; case TX_8X8: vpx_fdct8x8(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_quantize_b(coeff, 16, p->zbin, p->round, p->quant, p->quant_shift, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); break; } } @@ -869,10 +857,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_highbd_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, dst_stride, xd->bd); highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, - p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b_32x32( + coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, + dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -889,10 +876,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); else vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type); - vpx_highbd_quantize_b(coeff, 256, 
x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_highbd_quantize_b(coeff, 256, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, + eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -910,10 +896,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); else vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type); - vpx_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_highbd_quantize_b(coeff, 64, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, + eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -932,10 +917,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type); else x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_highbd_quantize_b(coeff, 16, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, + eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -964,10 +948,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, dst_stride); fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -980,9 +963,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_subtract_block(16, 16, src_diff, diff_stride, src, src_stride, dst, dst_stride); vp9_fht16x16(src_diff, coeff, diff_stride, tx_type); - vpx_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant, p->quant_shift, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -995,9 +978,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_subtract_block(8, 8, src_diff, diff_stride, src, src_stride, dst, dst_stride); vp9_fht8x8(src_diff, coeff, diff_stride, tx_type); - vpx_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift, + qcoeff, dqcoeff, 
pd->dequant, eob, scan_order->scan, + scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -1014,9 +997,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vp9_fht4x4(src_diff, coeff, diff_stride, tx_type); else x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_quantize_b(coeff, 16, p->zbin, p->round, p->quant, p->quant_shift, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 4609a6bb26..97805fc164 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -6620,19 +6620,22 @@ static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; const int shift = tx_size == TX_32X32 ? 0 : 2; + // skip block condition should be handled before this is called. + assert(!x->skip_block); + #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_quantize_fp_32x32(coeff, pix_num, x->skip_block, p->round_fp, - p->quant_fp, qcoeff, dqcoeff, pd->dequant, - &eob, scan_order->scan, scan_order->iscan); + vp9_highbd_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, + qcoeff, dqcoeff, pd->dequant, &eob, + scan_order->scan, scan_order->iscan); } else { - vp9_quantize_fp_32x32(coeff, pix_num, x->skip_block, p->round_fp, - p->quant_fp, qcoeff, dqcoeff, pd->dequant, &eob, - scan_order->scan, scan_order->iscan); + vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, + dqcoeff, pd->dequant, &eob, scan_order->scan, + scan_order->iscan); } #else - vp9_quantize_fp_32x32(coeff, pix_num, x->skip_block, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, &eob, scan_order->scan, + vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, + dqcoeff, pd->dequant, &eob, scan_order->scan, scan_order->iscan); #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index c8e167f25b..697c589ab3 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -771,24 +771,27 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, const int16_t *src_diff; src_diff = &p->src_diff[(r * diff_stride + c) << 2]; + // skip block condition should be handled before this is called. 
+ assert(!x->skip_block); + switch (tx_size) { case TX_16X16: vpx_hadamard_16x16(src_diff, diff_stride, coeff); - vp9_quantize_fp(coeff, 256, x->skip_block, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + vp9_quantize_fp(coeff, 256, p->round_fp, p->quant_fp, qcoeff, + dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_8X8: vpx_hadamard_8x8(src_diff, diff_stride, coeff); - vp9_quantize_fp(coeff, 64, x->skip_block, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + vp9_quantize_fp(coeff, 64, p->round_fp, p->quant_fp, qcoeff, + dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + vp9_quantize_fp(coeff, 16, p->round_fp, p->quant_fp, qcoeff, + dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; } diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index c996b75167..9058997b0f 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -22,15 +22,12 @@ #include "vp9/encoder/vp9_rd.h" void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const int16_t *round_ptr, const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { int i, eob = -1; (void)iscan; - (void)skip_block; - assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -56,7 +53,7 @@ void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *round_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, @@ -65,8 +62,6 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int eob = -1; (void)iscan; - (void)skip_block; - assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -91,15 +86,12 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, // TODO(jingning) Refactor this file and combine functions with similar // operations. 
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { int i, eob = -1; (void)iscan; - (void)skip_block; - assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -126,15 +118,13 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_quantize_fp_32x32_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { int i, eob = -1; (void)iscan; - (void)skip_block; - assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -176,16 +166,15 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, - x->skip_block, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - &p->eobs[block], scan, iscan); + vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, p->zbin, + p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, &p->eobs[block], scan, iscan); return; } #endif - vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, x->skip_block, - p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, &p->eobs[block], scan, iscan); + vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, + &p->eobs[block], scan, iscan); } static void invert_quant(int16_t *quant, int16_t *shift, int d) { diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c index 8dfdbd50f6..db18b1a7a4 100644 --- a/vp9/encoder/x86/vp9_quantize_avx2.c +++ b/vp9/encoder/x86/vp9_quantize_avx2.c @@ -47,18 +47,15 @@ static INLINE __m256i scan_eob_256(const __m256i *iscan_ptr, } void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const int16_t *round_ptr, const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { __m128i eob; __m256i round256, quant256, dequant256; __m256i eob256, thr256; (void)scan; - (void)skip_block; - assert(!skip_block); coeff_ptr += n_coeffs; iscan += n_coeffs; diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c index e3d803b8f0..4bcadaa6a1 100644 --- a/vp9/encoder/x86/vp9_quantize_sse2.c +++ 
b/vp9/encoder/x86/vp9_quantize_sse2.c @@ -18,11 +18,10 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const int16_t *round_ptr, const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { __m128i zero; __m128i thr; int nzflag; @@ -30,8 +29,6 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i round, quant, dequant; (void)scan; - (void)skip_block; - assert(!skip_block); coeff_ptr += n_coeffs; iscan += n_coeffs; diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm index 5703aa3bb6..680acfec69 100644 --- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm +++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm @@ -19,18 +19,18 @@ pw_1: times 8 dw 1 SECTION .text %macro QUANTIZE_FP 2 -cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, round, quant, \ +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, round, quant, \ qcoeff, dqcoeff, dequant, \ eob, scan, iscan ; actual quantize loop - setup pointers, rounders, etc. movifnidn coeffq, coeffmp movifnidn ncoeffq, ncoeffmp - mov r2, dequantmp movifnidn roundq, roundmp movifnidn quantq, quantmp mova m1, [roundq] ; m1 = round mova m2, [quantq] ; m2 = quant + mov r2, dequantmp %ifidn %1, fp_32x32 pcmpeqw m5, m5 psrlw m5, 15 diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index adef5f6e15..bd7818a074 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -32,8 +32,8 @@ static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, } void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, @@ -42,8 +42,6 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; (void)scan; - (void)skip_block; - assert(!skip_block); // Process first 8 values which include a dc component. { @@ -189,7 +187,7 @@ static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff, // Main difference is that zbin values are halved before comparison and dqcoeff // values are divided by 2. zbin is rounded but dqcoeff is not. void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, @@ -202,8 +200,6 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int i; (void)scan; (void)n_coeffs; // Because we will always calculate 32*32. - (void)skip_block; - assert(!skip_block); // Process first 8 values which include a dc component. 
{ diff --git a/vpx_dsp/ppc/quantize_vsx.c b/vpx_dsp/ppc/quantize_vsx.c index d85e63bd14..7cdcbeb405 100644 --- a/vpx_dsp/ppc/quantize_vsx.c +++ b/vpx_dsp/ppc/quantize_vsx.c @@ -95,8 +95,8 @@ static INLINE int16x8_t vec_max_across(int16x8_t a) { } void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, @@ -122,8 +122,6 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, zero_mask1 = vec_cmpge(coeff1_abs, zbin); (void)scan_ptr; - (void)skip_block; - assert(!skip_block); qcoeff0 = quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift, zero_mask0); @@ -196,12 +194,14 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = eob[0]; } -void vpx_quantize_b_32x32_vsx( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan_ptr, const int16_t *iscan_ptr) { +void vpx_quantize_b_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, + const int16_t *iscan_ptr) { // In stage 1, we quantize 16 coeffs (DC + 15 AC) // In stage 2, we loop 42 times and quantize 24 coeffs per iteration // (32 * 32 - 16) / 24 = 42 @@ -227,9 +227,7 @@ void vpx_quantize_b_32x32_vsx( int16x8_t coeff1_abs = vec_abs(coeff1); (void)scan_ptr; - (void)skip_block; (void)n_coeffs; - assert(!skip_block); // 32x32 quantization requires that zbin and round be divided by 2 zbin = vec_sra(vec_add(zbin, vec_ones_s16), vec_ones_u16); diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index 61818f692e..5d6ba64a8a 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -15,7 +15,7 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" -void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, +void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant, uint16_t *eob_ptr) { @@ -28,28 +28,26 @@ void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); - tmp = (tmp * quant) >> 16; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant; - if (tmp) eob = 0; - } + tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = (tmp * quant) >> 16; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant; + if (tmp) eob = 0; + *eob_ptr = eob + 1; } #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, - int skip_block, const int16_t *round_ptr, - const 
int16_t quant, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t dequant, - uint16_t *eob_ptr) { + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant, uint16_t *eob_ptr) { int eob = -1; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { + { const int coeff = coeff_ptr[0]; const int coeff_sign = (coeff >> 31); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; @@ -59,11 +57,12 @@ void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant; if (abs_qcoeff) eob = 0; } + *eob_ptr = eob + 1; } #endif -void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, +void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant, uint16_t *eob_ptr) { @@ -77,19 +76,18 @@ void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1), - INT16_MIN, INT16_MAX); - tmp = (tmp * quant) >> 15; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / 2; - if (tmp) eob = 0; - } + tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1), INT16_MIN, + INT16_MAX); + tmp = (tmp * quant) >> 15; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / 2; + if (tmp) eob = 0; + *eob_ptr = eob + 1; } #if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, +void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, @@ -100,7 +98,7 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { + { const int coeff = coeff_ptr[0]; const int coeff_sign = (coeff >> 31); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; @@ -110,23 +108,21 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant / 2; if (abs_qcoeff) eob = 0; } + *eob_ptr = eob + 1; } #endif void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { int i, non_zero_count = (int)n_coeffs, eob = -1; const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; (void)iscan; - (void)skip_block; - assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -166,8 +162,8 @@ void vpx_quantize_b_c(const 
tran_low_t *coeff_ptr, intptr_t n_coeffs, #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, @@ -176,8 +172,6 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; (void)iscan; - (void)skip_block; - assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -215,8 +209,8 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #endif void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, @@ -229,8 +223,6 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int idx_arr[1024]; int i, eob = -1; (void)iscan; - (void)skip_block; - assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -277,8 +269,8 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_32x32_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { @@ -290,8 +282,6 @@ void vpx_highbd_quantize_b_32x32_c( int idx_arr[1024]; int i, eob = -1; (void)iscan; - (void)skip_block; - assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); diff --git a/vpx_dsp/quantize.h b/vpx_dsp/quantize.h index 7cac140e9d..8e138445e2 100644 --- a/vpx_dsp/quantize.h +++ b/vpx_dsp/quantize.h @@ -18,22 +18,21 @@ extern "C" { #endif -void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, +void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant, uint16_t *eob_ptr); -void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, +void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant, uint16_t *eob_ptr); #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t quant, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t dequant, - uint16_t *eob_ptr); -void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, + 
const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant, uint16_t *eob_ptr); +void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 06a8febb29..372903aff2 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -709,17 +709,17 @@ () # Quantization # if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { - add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b neon sse2 ssse3 avx vsx/; - add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx vsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b sse2/; - add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, 
const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b_32x32 sse2/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index 7149e4fb74..4535a0f7a2 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -18,7 +18,7 @@ #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, - int skip_block, const int16_t *zbin_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, @@ -39,8 +39,6 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); (void)scan; - (void)skip_block; - assert(!skip_block); memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); @@ -94,8 +92,8 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, } void vpx_highbd_quantize_b_32x32_sse2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { @@ -107,8 +105,6 @@ void vpx_highbd_quantize_b_32x32_sse2( const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); (void)scan; - (void)skip_block; - assert(!skip_block); zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); zbins[1] = _mm_set1_epi32(zbin1_tmp); diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index 0a91d36eaf..706e4e6413 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -21,8 +21,8 @@ #include "vpx_dsp/x86/quantize_ssse3.h" void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, @@ -39,8 +39,6 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i eob = zero, eob0; (void)scan; - (void)skip_block; - assert(!skip_block); *eob_ptr = 0; @@ -145,8 +143,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, @@ -166,8 +163,6 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, (void)scan; (void)n_coeffs; - (void)skip_block; - assert(!skip_block); // Setup global values. // The 32x32 halves zbin and round. 
diff --git a/vpx_dsp/x86/quantize_sse2.c b/vpx_dsp/x86/quantize_sse2.c index e38a4059ab..459d95f28b 100644 --- a/vpx_dsp/x86/quantize_sse2.c +++ b/vpx_dsp/x86/quantize_sse2.c @@ -18,8 +18,8 @@ #include "vpx_dsp/x86/quantize_sse2.h" void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, @@ -34,8 +34,6 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i eob, eob0; (void)scan; - (void)skip_block; - assert(!skip_block); // Setup global values. load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index fc1d91959f..9d2a88b7bc 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -18,8 +18,8 @@ #include "vpx_dsp/x86/quantize_ssse3.h" void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, @@ -34,8 +34,6 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i eob, eob0; (void)scan; - (void)skip_block; - assert(!skip_block); load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant, quant_shift_ptr, &shift); @@ -111,7 +109,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, @@ -131,8 +129,6 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, (void)scan; (void)n_coeffs; - (void)skip_block; - assert(!skip_block); // Setup global values. // The 32x32 halves zbin and round. 
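The net effect of the patch above is a single mechanical API change: skip_block is dropped from every quantizer prototype, and the skip decision is made once by the caller (the new asserts in vp9_encoder.c and vp9_pickmode.c document that contract). A minimal caller-side sketch of the pattern after the change; the wrapper and its guard are hypothetical and not code from the tree, though the vpx_quantize_b() call matches the updated prototype:

#include "./vpx_dsp_rtcd.h"
#include "vp9/encoder/vp9_block.h" /* MACROBLOCK; include set abbreviated */

/* Hypothetical wrapper: the skip check now lives with the caller instead of
 * being threaded through (and asserted away inside) every quantizer. */
static void quantize_b_unless_skipped(MACROBLOCK *x, int plane, int block,
                                      intptr_t n_coeffs, const int16_t *scan,
                                      const int16_t *iscan) {
  struct macroblock_plane *const p = &x->plane[plane];
  const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);

  if (x->skip_block) return; /* decided once, up front */
  vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, p->zbin, p->round,
                 p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant,
                 &p->eobs[block], scan, iscan);
}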
From 1239be9e5faf3f7c1603548ed669af35639b74f8 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 29 Mar 2022 20:00:00 -0700 Subject: [PATCH 231/926] sad4d_avx2: fix VS 2014 build error after: d60b671a7 gcc 11 warning: mismatched bound error C2719: 'sums': formal parameter with requested alignment of 32 won't be aligned Change-Id: Iaba46d00ef2334a5e2d9ee69b5d03478fdc73a60 --- vpx_dsp/x86/sad4d_avx2.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c index 5f1f757e25..81f1a916f0 100644 --- a/vpx_dsp/x86/sad4d_avx2.c +++ b/vpx_dsp/x86/sad4d_avx2.c @@ -11,7 +11,12 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -static INLINE void calc_final_4(const __m256i sums[4], uint32_t sad_array[4]) { +// Note with sums[4] some versions of Visual Studio may fail due to parameter +// alignment, though the functions should be equivalent: +// error C2719: 'sums': formal parameter with requested alignment of 32 won't be +// aligned +static INLINE void calc_final_4(const __m256i *const sums /*[4]*/, + uint32_t sad_array[4]) { const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]); const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]); const __m256i t2 = _mm256_hadd_epi32(t0, t1); From 247658efb0a29fbcd66a84ef67aab2794e517380 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 16 Mar 2022 16:27:27 +0200 Subject: [PATCH 232/926] Optimize FHT functions for NEON [NEON] Optimize vp9_fht4x4, vp9_fht8x8, vp9_fht16x16 for NEON Following change #3516278, the improvement for these functions is: Before: 4.10% 0.75% vpxenc vpxenc [.] vp9_fht16x16_c 2.93% 0.65% vpxenc vpxenc [.] vp9_fht8x8_c 0.93% 0.77% vpxenc vpxenc [.] vp9_fht4x4_c And after the patch: 0.69% 0.16% vpxenc vpxenc [.] vp9_fht16x16_neon 0.28% 0.28% vpxenc vpxenc [.] vp9_fht8x8_neon 0.54% 0.53% vpxenc vpxenc [.] vp9_fht4x4_neon Bug: webm:1634 Change-Id: I6748a0c4e0cfaafa3eefdd4848d0ac3aab6900e4 --- test/dct_test.cc | 5 +- vp9/common/vp9_rtcd_defs.pl | 6 +- vp9/encoder/arm/neon/vp9_dct_neon.c | 1460 +++++++++++++++++++++++++++ vp9/vp9cx.mk | 1 + 4 files changed, 1468 insertions(+), 4 deletions(-) create mode 100644 vp9/encoder/arm/neon/vp9_dct_neon.c diff --git a/test/dct_test.cc b/test/dct_test.cc index 9541869535..20e081a24c 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -641,8 +641,11 @@ static const FuncInfo ht_neon_func_info[] = { &highbd_iht_wrapper, 16, 2 }, #endif { &vp9_fht4x4_c, &iht_wrapper, 4, 1 }, + { &vp9_fht4x4_neon, &iht_wrapper, 4, 1 }, { &vp9_fht8x8_c, &iht_wrapper, 8, 1 }, - { &vp9_fht16x16_c, &iht_wrapper, 16, 1 } + { &vp9_fht8x8_neon, &iht_wrapper, 8, 1 }, + { &vp9_fht16x16_c, &iht_wrapper, 16, 1 }, + { &vp9_fht16x16_neon, &iht_wrapper, 16, 1 } }; INSTANTIATE_TEST_SUITE_P( diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 6980b9b7fb..951da7dbea 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -159,9 +159,9 @@ () # Note that there are more specializations appended when CONFIG_VP9_HIGHBITDEPTH # is off. -specialize qw/vp9_fht4x4 sse2/; -specialize qw/vp9_fht8x8 sse2/; -specialize qw/vp9_fht16x16 sse2/; +specialize qw/vp9_fht4x4 sse2 neon/; +specialize qw/vp9_fht8x8 sse2 neon/; +specialize qw/vp9_fht16x16 sse2 neon/; specialize qw/vp9_fwht4x4 sse2/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { # Note that these specializations are appended to the above ones. 
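For readers new to the specialize lines: vp9_rtcd_defs.pl drives libvpx's run-time CPU detection, which emits one dispatch pointer per function and aims it at the best variant the host supports during rtcd initialization. An abbreviated sketch of the generated shape for the new NEON variants (an assumption about the generator's output style, not its verbatim result):

/* Roughly what the generated vp9_rtcd.h and setup code look like; a sketch,
 * not verbatim generator output. */
void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
                  int tx_type);
void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride,
                     int tx_type);
RTCD_EXTERN void (*vp9_fht4x4)(const int16_t *input, tran_low_t *output,
                               int stride, int tx_type);

static void setup_rtcd_internal(void) {
  const int flags = arm_cpu_caps(); /* CPU feature probe on ARM targets */
  vp9_fht4x4 = vp9_fht4x4_c;
  if (flags & HAS_NEON) vp9_fht4x4 = vp9_fht4x4_neon;
  /* ... same pattern for vp9_fht8x8 and vp9_fht16x16 ... */
}

The new file below supplies those _neon definitions.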
diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c new file mode 100644 index 0000000000..a07a1608d7 --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_dct_neon.c @@ -0,0 +1,1460 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/fdct_neon.h" + +static INLINE void load_buffer_4x4(const int16_t *input, int16x8_t *in, + int stride) { + // { 0, 1, 1, 1, 1, 1, 1, 1 }; + const int16x8_t nonzero_bias_a = vextq_s16(vdupq_n_s16(0), vdupq_n_s16(1), 7); + // { 1, 0, 0, 0, 0, 0, 0, 0 }; + const int16x8_t nonzero_bias_b = vextq_s16(vdupq_n_s16(1), vdupq_n_s16(0), 7); + int16x8_t mask; + + int16x4_t input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4); + int16x4_t input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4); + int16x4_t input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4); + int16x4_t input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4); + + in[0] = vcombine_s16(input_0, input_1); + in[1] = vcombine_s16(input_2, input_3); + + // Copy the SSE method, use a mask to avoid an 'if' branch here to increase by + // one non-zero first elements + mask = vreinterpretq_s16_u16(vceqq_s16(in[0], nonzero_bias_a)); + in[0] = vaddq_s16(in[0], mask); + in[0] = vaddq_s16(in[0], nonzero_bias_b); +} + +static INLINE void write_buffer_4x4(tran_low_t *output, int16x8_t *res) { + const int16x8_t one_s16 = vdupq_n_s16(1); + res[0] = vaddq_s16(res[0], one_s16); + res[1] = vaddq_s16(res[1], one_s16); + res[0] = vshrq_n_s16(res[0], 2); + res[1] = vshrq_n_s16(res[1], 2); + store_s16q_to_tran_low(output + 0 * 8, res[0]); + store_s16q_to_tran_low(output + 1 * 8, res[1]); +} + +static INLINE void fadst4x4_neon(int16x8_t *in) { + int32x4_t u0, u1, u2, u3; + int16x4_t out_0, out_1, out_2, out_3; + const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING); + + const int16x4_t s0 = vget_low_s16(in[0]); // | x_00 | x_01 | x_02 | x_03 | + const int16x4_t s1 = vget_high_s16(in[0]); // | x_10 | x_11 | x_12 | x_13 | + const int16x4_t s2 = vget_low_s16(in[1]); // | x_20 | x_21 | x_22 | x_23 | + const int16x4_t s3 = vget_high_s16(in[1]); // | x_30 | x_31 | x_32 | x_33 | + + // s0 * sinpi_1_9, s0 * sinpi_4_9 + // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
+ const int32x4_t s0s1_9 = vmull_n_s16(s0, sinpi_1_9); + const int32x4_t s0s4_9 = vmull_n_s16(s0, sinpi_4_9); + // s1 * sinpi_1_9, s1 * sinpi_2_9 + const int32x4_t s1s1_9 = vmull_n_s16(s1, sinpi_1_9); + const int32x4_t s1s2_9 = vmull_n_s16(s1, sinpi_2_9); + // s2 * sinpi_3_9 + const int32x4_t s2s3_9 = vmull_n_s16(s2, sinpi_3_9); + // s3 * sinpi_2_9, s3 * sinpi_4_9 + const int32x4_t s3s2_9 = vmull_n_s16(s3, sinpi_2_9); + const int32x4_t s3s4_9 = vmull_n_s16(s3, sinpi_4_9); + + // (s0 + s1) * sinpi_3_9 + const int32x4_t s0_p_s1 = vaddl_s16(s0, s1); + const int32x4_t s0_p_s1_m_s3 = vsubw_s16(s0_p_s1, s3); + + // s_0 * sinpi_1_9 + s_1 * sinpi_2_9 + // s_0 * sinpi_4_9 - s_1 * sinpi_1_9 + const int32x4_t s0s1_9_p_s1s2_9 = vaddq_s32(s0s1_9, s1s2_9); + const int32x4_t s0s4_9_m_s1s1_9 = vsubq_s32(s0s4_9, s1s1_9); + /* + * t0 = s0s1_9 + s1s2_9 + s3s4_9 + * t1 = (s0 + s1) * sinpi_3_9 - s3 * sinpi_3_9 + * t2 = s0s4_9 - s1s1_9 + s3s2_9 + * t3 = s2s3_9 + */ + const int32x4_t t0 = vaddq_s32(s0s1_9_p_s1s2_9, s3s4_9); + const int32x4_t t1 = vmulq_n_s32(s0_p_s1_m_s3, sinpi_3_9); + const int32x4_t t2 = vaddq_s32(s0s4_9_m_s1s1_9, s3s2_9); + const int32x4_t t3 = s2s3_9; + /* + * u0 = t0 + t3 + * u1 = t1 + * u2 = t2 - t3 + * u3 = t2 - t0 + t3 + */ + u0 = vaddq_s32(t0, t3); + u1 = t1; + u2 = vsubq_s32(t2, t3); + u3 = vaddq_s32(vsubq_s32(t2, t0), t3); + + // fdct_round_shift + u0 = vaddq_s32(u0, k__DCT_CONST_ROUNDING); + u1 = vaddq_s32(u1, k__DCT_CONST_ROUNDING); + u2 = vaddq_s32(u2, k__DCT_CONST_ROUNDING); + u3 = vaddq_s32(u3, k__DCT_CONST_ROUNDING); + + out_0 = vshrn_n_s32(u0, DCT_CONST_BITS); + out_1 = vshrn_n_s32(u1, DCT_CONST_BITS); + out_2 = vshrn_n_s32(u2, DCT_CONST_BITS); + out_3 = vshrn_n_s32(u3, DCT_CONST_BITS); + + transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3); + + in[0] = vcombine_s16(out_0, out_1); + in[1] = vcombine_s16(out_2, out_3); +} + +void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + int16x8_t in[2]; + + switch (tx_type) { + case DCT_DCT: vpx_fdct4x4_neon(input, output, stride); break; + case ADST_DCT: + load_buffer_4x4(input, in, stride); + fadst4x4_neon(in); + vpx_fdct4x4_pass1_neon((int16x4_t *)in); + write_buffer_4x4(output, in); + break; + case DCT_ADST: + load_buffer_4x4(input, in, stride); + vpx_fdct4x4_pass1_neon((int16x4_t *)in); + fadst4x4_neon(in); + write_buffer_4x4(output, in); + break; + default: + assert(tx_type == ADST_ADST); + load_buffer_4x4(input, in, stride); + fadst4x4_neon(in); + fadst4x4_neon(in); + write_buffer_4x4(output, in); + break; + } +} + +static INLINE void load_buffer_8x8(const int16_t *input, int16x8_t *in, + int stride) { + in[0] = vshlq_n_s16(vld1q_s16(input + 0 * stride), 2); + in[1] = vshlq_n_s16(vld1q_s16(input + 1 * stride), 2); + in[2] = vshlq_n_s16(vld1q_s16(input + 2 * stride), 2); + in[3] = vshlq_n_s16(vld1q_s16(input + 3 * stride), 2); + in[4] = vshlq_n_s16(vld1q_s16(input + 4 * stride), 2); + in[5] = vshlq_n_s16(vld1q_s16(input + 5 * stride), 2); + in[6] = vshlq_n_s16(vld1q_s16(input + 6 * stride), 2); + in[7] = vshlq_n_s16(vld1q_s16(input + 7 * stride), 2); +} + +/* right shift and rounding + * first get the sign bit (bit 15). + * If bit == 1, it's the simple case of shifting right by one bit. + * If bit == 2, it essentially computes the expression: + * + * out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; + * + * for each row. 
+ */ +static INLINE void right_shift_8x8(int16x8_t *res, const int bit) { + int16x8_t sign0 = vshrq_n_s16(res[0], 15); + int16x8_t sign1 = vshrq_n_s16(res[1], 15); + int16x8_t sign2 = vshrq_n_s16(res[2], 15); + int16x8_t sign3 = vshrq_n_s16(res[3], 15); + int16x8_t sign4 = vshrq_n_s16(res[4], 15); + int16x8_t sign5 = vshrq_n_s16(res[5], 15); + int16x8_t sign6 = vshrq_n_s16(res[6], 15); + int16x8_t sign7 = vshrq_n_s16(res[7], 15); + + if (bit == 2) { + const int16x8_t const_rounding = vdupq_n_s16(1); + res[0] = vaddq_s16(res[0], const_rounding); + res[1] = vaddq_s16(res[1], const_rounding); + res[2] = vaddq_s16(res[2], const_rounding); + res[3] = vaddq_s16(res[3], const_rounding); + res[4] = vaddq_s16(res[4], const_rounding); + res[5] = vaddq_s16(res[5], const_rounding); + res[6] = vaddq_s16(res[6], const_rounding); + res[7] = vaddq_s16(res[7], const_rounding); + } + + res[0] = vsubq_s16(res[0], sign0); + res[1] = vsubq_s16(res[1], sign1); + res[2] = vsubq_s16(res[2], sign2); + res[3] = vsubq_s16(res[3], sign3); + res[4] = vsubq_s16(res[4], sign4); + res[5] = vsubq_s16(res[5], sign5); + res[6] = vsubq_s16(res[6], sign6); + res[7] = vsubq_s16(res[7], sign7); + + if (bit == 1) { + res[0] = vshrq_n_s16(res[0], 1); + res[1] = vshrq_n_s16(res[1], 1); + res[2] = vshrq_n_s16(res[2], 1); + res[3] = vshrq_n_s16(res[3], 1); + res[4] = vshrq_n_s16(res[4], 1); + res[5] = vshrq_n_s16(res[5], 1); + res[6] = vshrq_n_s16(res[6], 1); + res[7] = vshrq_n_s16(res[7], 1); + } else { + res[0] = vshrq_n_s16(res[0], 2); + res[1] = vshrq_n_s16(res[1], 2); + res[2] = vshrq_n_s16(res[2], 2); + res[3] = vshrq_n_s16(res[3], 2); + res[4] = vshrq_n_s16(res[4], 2); + res[5] = vshrq_n_s16(res[5], 2); + res[6] = vshrq_n_s16(res[6], 2); + res[7] = vshrq_n_s16(res[7], 2); + } +} + +static INLINE void write_buffer_8x8(tran_low_t *output, int16x8_t *res, + int stride) { + store_s16q_to_tran_low(output + 0 * stride, res[0]); + store_s16q_to_tran_low(output + 1 * stride, res[1]); + store_s16q_to_tran_low(output + 2 * stride, res[2]); + store_s16q_to_tran_low(output + 3 * stride, res[3]); + store_s16q_to_tran_low(output + 4 * stride, res[4]); + store_s16q_to_tran_low(output + 5 * stride, res[5]); + store_s16q_to_tran_low(output + 6 * stride, res[6]); + store_s16q_to_tran_low(output + 7 * stride, res[7]); +} + +static INLINE void fadst8x8_neon(int16x8_t *in) { + int16x4_t x0_lo, x0_hi, x1_lo, x1_hi, x2_lo, x2_hi, x3_lo, x3_hi, x4_lo, + x4_hi, x5_lo, x5_hi, x6_lo, x6_hi, x7_lo, x7_hi; + int32x4_t s0_lo, s0_hi, s1_lo, s1_hi, s2_lo, s2_hi, s3_lo, s3_hi, s4_lo, + s4_hi, s5_lo, s5_hi, s6_lo, s6_hi, s7_lo, s7_hi; + int32x4_t t0_lo, t0_hi, t1_lo, t1_hi, t2_lo, t2_hi, t3_lo, t3_hi, t4_lo, + t4_hi, t5_lo, t5_hi, t6_lo, t6_hi, t7_lo, t7_hi; + const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING); + + x0_lo = vget_low_s16(in[7]); + x0_hi = vget_high_s16(in[7]); + x1_lo = vget_low_s16(in[0]); + x1_hi = vget_high_s16(in[0]); + x2_lo = vget_low_s16(in[5]); + x2_hi = vget_high_s16(in[5]); + x3_lo = vget_low_s16(in[2]); + x3_hi = vget_high_s16(in[2]); + x4_lo = vget_low_s16(in[3]); + x4_hi = vget_high_s16(in[3]); + x5_lo = vget_low_s16(in[4]); + x5_hi = vget_high_s16(in[4]); + x6_lo = vget_low_s16(in[1]); + x6_hi = vget_high_s16(in[1]); + x7_lo = vget_low_s16(in[6]); + x7_hi = vget_high_s16(in[6]); + + // stage 1 + // s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + s0_lo = vaddq_s32(vmull_n_s16(x0_lo, cospi_2_64), + vmull_n_s16(x1_lo, cospi_30_64)); + s0_hi = vaddq_s32(vmull_n_s16(x0_hi, cospi_2_64), + vmull_n_s16(x1_hi, 
cospi_30_64)); + // s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + s1_lo = vsubq_s32(vmull_n_s16(x0_lo, cospi_30_64), + vmull_n_s16(x1_lo, cospi_2_64)); + s1_hi = vsubq_s32(vmull_n_s16(x0_hi, cospi_30_64), + vmull_n_s16(x1_hi, cospi_2_64)); + // s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + s2_lo = vaddq_s32(vmull_n_s16(x2_lo, cospi_10_64), + vmull_n_s16(x3_lo, cospi_22_64)); + s2_hi = vaddq_s32(vmull_n_s16(x2_hi, cospi_10_64), + vmull_n_s16(x3_hi, cospi_22_64)); + // s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + s3_lo = vsubq_s32(vmull_n_s16(x2_lo, cospi_22_64), + vmull_n_s16(x3_lo, cospi_10_64)); + s3_hi = vsubq_s32(vmull_n_s16(x2_hi, cospi_22_64), + vmull_n_s16(x3_hi, cospi_10_64)); + // s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + s4_lo = vaddq_s32(vmull_n_s16(x4_lo, cospi_18_64), + vmull_n_s16(x5_lo, cospi_14_64)); + s4_hi = vaddq_s32(vmull_n_s16(x4_hi, cospi_18_64), + vmull_n_s16(x5_hi, cospi_14_64)); + // s5 = cospi_14_64 * x4 - cospi_18_64 * x5; + s5_lo = vsubq_s32(vmull_n_s16(x4_lo, cospi_14_64), + vmull_n_s16(x5_lo, cospi_18_64)); + s5_hi = vsubq_s32(vmull_n_s16(x4_hi, cospi_14_64), + vmull_n_s16(x5_hi, cospi_18_64)); + // s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + s6_lo = vaddq_s32(vmull_n_s16(x6_lo, cospi_26_64), + vmull_n_s16(x7_lo, cospi_6_64)); + s6_hi = vaddq_s32(vmull_n_s16(x6_hi, cospi_26_64), + vmull_n_s16(x7_hi, cospi_6_64)); + // s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + s7_lo = vsubq_s32(vmull_n_s16(x6_lo, cospi_6_64), + vmull_n_s16(x7_lo, cospi_26_64)); + s7_hi = vsubq_s32(vmull_n_s16(x6_hi, cospi_6_64), + vmull_n_s16(x7_hi, cospi_26_64)); + + // fdct_round_shift + t0_lo = vaddq_s32(s0_lo, s4_lo); + t0_hi = vaddq_s32(s0_hi, s4_hi); + t1_lo = vaddq_s32(s1_lo, s5_lo); + t1_hi = vaddq_s32(s1_hi, s5_hi); + t2_lo = vaddq_s32(s2_lo, s6_lo); + t2_hi = vaddq_s32(s2_hi, s6_hi); + t3_lo = vaddq_s32(s3_lo, s7_lo); + t3_hi = vaddq_s32(s3_hi, s7_hi); + t4_lo = vsubq_s32(s0_lo, s4_lo); + t4_hi = vsubq_s32(s0_hi, s4_hi); + t5_lo = vsubq_s32(s1_lo, s5_lo); + t5_hi = vsubq_s32(s1_hi, s5_hi); + t6_lo = vsubq_s32(s2_lo, s6_lo); + t6_hi = vsubq_s32(s2_hi, s6_hi); + t7_lo = vsubq_s32(s3_lo, s7_lo); + t7_hi = vsubq_s32(s3_hi, s7_hi); + + t0_lo = vaddq_s32(t0_lo, k__DCT_CONST_ROUNDING); + t0_hi = vaddq_s32(t0_hi, k__DCT_CONST_ROUNDING); + t1_lo = vaddq_s32(t1_lo, k__DCT_CONST_ROUNDING); + t1_hi = vaddq_s32(t1_hi, k__DCT_CONST_ROUNDING); + t2_lo = vaddq_s32(t2_lo, k__DCT_CONST_ROUNDING); + t2_hi = vaddq_s32(t2_hi, k__DCT_CONST_ROUNDING); + t3_lo = vaddq_s32(t3_lo, k__DCT_CONST_ROUNDING); + t3_hi = vaddq_s32(t3_hi, k__DCT_CONST_ROUNDING); + t4_lo = vaddq_s32(t4_lo, k__DCT_CONST_ROUNDING); + t4_hi = vaddq_s32(t4_hi, k__DCT_CONST_ROUNDING); + t5_lo = vaddq_s32(t5_lo, k__DCT_CONST_ROUNDING); + t5_hi = vaddq_s32(t5_hi, k__DCT_CONST_ROUNDING); + t6_lo = vaddq_s32(t6_lo, k__DCT_CONST_ROUNDING); + t6_hi = vaddq_s32(t6_hi, k__DCT_CONST_ROUNDING); + t7_lo = vaddq_s32(t7_lo, k__DCT_CONST_ROUNDING); + t7_hi = vaddq_s32(t7_hi, k__DCT_CONST_ROUNDING); + + t0_lo = vshrq_n_s32(t0_lo, DCT_CONST_BITS); + t0_hi = vshrq_n_s32(t0_hi, DCT_CONST_BITS); + t1_lo = vshrq_n_s32(t1_lo, DCT_CONST_BITS); + t1_hi = vshrq_n_s32(t1_hi, DCT_CONST_BITS); + t2_lo = vshrq_n_s32(t2_lo, DCT_CONST_BITS); + t2_hi = vshrq_n_s32(t2_hi, DCT_CONST_BITS); + t3_lo = vshrq_n_s32(t3_lo, DCT_CONST_BITS); + t3_hi = vshrq_n_s32(t3_hi, DCT_CONST_BITS); + t4_lo = vshrq_n_s32(t4_lo, DCT_CONST_BITS); + t4_hi = vshrq_n_s32(t4_hi, DCT_CONST_BITS); + t5_lo = vshrq_n_s32(t5_lo, DCT_CONST_BITS); + t5_hi = vshrq_n_s32(t5_hi, DCT_CONST_BITS); + t6_lo = 
vshrq_n_s32(t6_lo, DCT_CONST_BITS); + t6_hi = vshrq_n_s32(t6_hi, DCT_CONST_BITS); + t7_lo = vshrq_n_s32(t7_lo, DCT_CONST_BITS); + t7_hi = vshrq_n_s32(t7_hi, DCT_CONST_BITS); + + // stage 2 + s0_lo = t0_lo; + s0_hi = t0_hi; + s1_lo = t1_lo; + s1_hi = t1_hi; + s2_lo = t2_lo; + s2_hi = t2_hi; + s3_lo = t3_lo; + s3_hi = t3_hi; + s4_lo = vaddq_s32(vmulq_n_s32(t4_lo, cospi_8_64), + vmulq_n_s32(t5_lo, cospi_24_64)); + s4_hi = vaddq_s32(vmulq_n_s32(t4_hi, cospi_8_64), + vmulq_n_s32(t5_hi, cospi_24_64)); + s5_lo = vsubq_s32(vmulq_n_s32(t4_lo, cospi_24_64), + vmulq_n_s32(t5_lo, cospi_8_64)); + s5_hi = vsubq_s32(vmulq_n_s32(t4_hi, cospi_24_64), + vmulq_n_s32(t5_hi, cospi_8_64)); + s6_lo = vaddq_s32(vmulq_n_s32(t6_lo, -cospi_24_64), + vmulq_n_s32(t7_lo, cospi_8_64)); + s6_hi = vaddq_s32(vmulq_n_s32(t6_hi, -cospi_24_64), + vmulq_n_s32(t7_hi, cospi_8_64)); + s7_lo = vaddq_s32(vmulq_n_s32(t6_lo, cospi_8_64), + vmulq_n_s32(t7_lo, cospi_24_64)); + s7_hi = vaddq_s32(vmulq_n_s32(t6_hi, cospi_8_64), + vmulq_n_s32(t7_hi, cospi_24_64)); + + // s0 + s2 + t0_lo = vaddq_s32(s0_lo, s2_lo); + t0_hi = vaddq_s32(s0_hi, s2_hi); + // s1 + s3 + t1_lo = vaddq_s32(s1_lo, s3_lo); + t1_hi = vaddq_s32(s1_hi, s3_hi); + // s0 - s2 + t2_lo = vsubq_s32(s0_lo, s2_lo); + t2_hi = vsubq_s32(s0_hi, s2_hi); + // s1 - s3 + t3_lo = vsubq_s32(s1_lo, s3_lo); + t3_hi = vsubq_s32(s1_hi, s3_hi); + // s4 + s6 + t4_lo = vaddq_s32(s4_lo, s6_lo); + t4_hi = vaddq_s32(s4_hi, s6_hi); + // s5 + s7 + t5_lo = vaddq_s32(s5_lo, s7_lo); + t5_hi = vaddq_s32(s5_hi, s7_hi); + // s4 - s6 + t6_lo = vsubq_s32(s4_lo, s6_lo); + t6_hi = vsubq_s32(s4_hi, s6_hi); + // s5 - s7 + t7_lo = vsubq_s32(s5_lo, s7_lo); + t7_hi = vsubq_s32(s5_hi, s7_hi); + + // fdct_round_shift + t4_lo = vaddq_s32(t4_lo, k__DCT_CONST_ROUNDING); + t4_hi = vaddq_s32(t4_hi, k__DCT_CONST_ROUNDING); + t5_lo = vaddq_s32(t5_lo, k__DCT_CONST_ROUNDING); + t5_hi = vaddq_s32(t5_hi, k__DCT_CONST_ROUNDING); + t6_lo = vaddq_s32(t6_lo, k__DCT_CONST_ROUNDING); + t6_hi = vaddq_s32(t6_hi, k__DCT_CONST_ROUNDING); + t7_lo = vaddq_s32(t7_lo, k__DCT_CONST_ROUNDING); + t7_hi = vaddq_s32(t7_hi, k__DCT_CONST_ROUNDING); + t4_lo = vshrq_n_s32(t4_lo, DCT_CONST_BITS); + t4_hi = vshrq_n_s32(t4_hi, DCT_CONST_BITS); + t5_lo = vshrq_n_s32(t5_lo, DCT_CONST_BITS); + t5_hi = vshrq_n_s32(t5_hi, DCT_CONST_BITS); + t6_lo = vshrq_n_s32(t6_lo, DCT_CONST_BITS); + t6_hi = vshrq_n_s32(t6_hi, DCT_CONST_BITS); + t7_lo = vshrq_n_s32(t7_lo, DCT_CONST_BITS); + t7_hi = vshrq_n_s32(t7_hi, DCT_CONST_BITS); + + // stage 3 + // cospi_16_64 * (x2 + x3) + s2_lo = vmulq_n_s32(vaddq_s32(t2_lo, t3_lo), cospi_16_64); + s2_hi = vmulq_n_s32(vaddq_s32(t2_hi, t3_hi), cospi_16_64); + // cospi_16_64 * (x2 - x3) + s3_lo = vmulq_n_s32(vsubq_s32(t2_lo, t3_lo), cospi_16_64); + s3_hi = vmulq_n_s32(vsubq_s32(t2_hi, t3_hi), cospi_16_64); + // cospi_16_64 * (x6 + x7) + s6_lo = vmulq_n_s32(vaddq_s32(t6_lo, t7_lo), cospi_16_64); + s6_hi = vmulq_n_s32(vaddq_s32(t6_hi, t7_hi), cospi_16_64); + // cospi_16_64 * (x2 - x3) + s7_lo = vmulq_n_s32(vsubq_s32(t6_lo, t7_lo), cospi_16_64); + s7_hi = vmulq_n_s32(vsubq_s32(t6_hi, t7_hi), cospi_16_64); + + // final fdct_round_shift + t2_lo = vaddq_s32(s2_lo, k__DCT_CONST_ROUNDING); + t2_hi = vaddq_s32(s2_hi, k__DCT_CONST_ROUNDING); + t3_lo = vaddq_s32(s3_lo, k__DCT_CONST_ROUNDING); + t3_hi = vaddq_s32(s3_hi, k__DCT_CONST_ROUNDING); + t6_lo = vaddq_s32(s6_lo, k__DCT_CONST_ROUNDING); + t6_hi = vaddq_s32(s6_hi, k__DCT_CONST_ROUNDING); + t7_lo = vaddq_s32(s7_lo, k__DCT_CONST_ROUNDING); + t7_hi = vaddq_s32(s7_hi, k__DCT_CONST_ROUNDING); 
+ + x2_lo = vshrn_n_s32(t2_lo, DCT_CONST_BITS); + x2_hi = vshrn_n_s32(t2_hi, DCT_CONST_BITS); + x3_lo = vshrn_n_s32(t3_lo, DCT_CONST_BITS); + x3_hi = vshrn_n_s32(t3_hi, DCT_CONST_BITS); + x6_lo = vshrn_n_s32(t6_lo, DCT_CONST_BITS); + x6_hi = vshrn_n_s32(t6_hi, DCT_CONST_BITS); + x7_lo = vshrn_n_s32(t7_lo, DCT_CONST_BITS); + x7_hi = vshrn_n_s32(t7_hi, DCT_CONST_BITS); + + // x0, x1, x4, x5 narrow down to 16-bits directly + x0_lo = vmovn_s32(t0_lo); + x0_hi = vmovn_s32(t0_hi); + x1_lo = vmovn_s32(t1_lo); + x1_hi = vmovn_s32(t1_hi); + x4_lo = vmovn_s32(t4_lo); + x4_hi = vmovn_s32(t4_hi); + x5_lo = vmovn_s32(t5_lo); + x5_hi = vmovn_s32(t5_hi); + + in[0] = vcombine_s16(x0_lo, x0_hi); + in[1] = vnegq_s16(vcombine_s16(x4_lo, x4_hi)); + in[2] = vcombine_s16(x6_lo, x6_hi); + in[3] = vnegq_s16(vcombine_s16(x2_lo, x2_hi)); + in[4] = vcombine_s16(x3_lo, x3_hi); + in[5] = vnegq_s16(vcombine_s16(x7_lo, x7_hi)); + in[6] = vcombine_s16(x5_lo, x5_hi); + in[7] = vnegq_s16(vcombine_s16(x1_lo, x1_hi)); + + transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); +} + +void vp9_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + int16x8_t in[8]; + + switch (tx_type) { + case DCT_DCT: vpx_fdct8x8_neon(input, output, stride); break; + case ADST_DCT: + load_buffer_8x8(input, in, stride); + fadst8x8_neon(in); + vpx_fdct8x8_pass1_neon(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + case DCT_ADST: + load_buffer_8x8(input, in, stride); + vpx_fdct8x8_pass1_neon(in); + fadst8x8_neon(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + default: + assert(tx_type == ADST_ADST); + load_buffer_8x8(input, in, stride); + fadst8x8_neon(in); + fadst8x8_neon(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + } +} + +static INLINE void load_buffer_16x16(const int16_t *input, int16x8_t *in0, + int16x8_t *in1, int stride) { + // load first 8 columns + load_buffer_8x8(input, in0, stride); + load_buffer_8x8(input + 8 * stride, in0 + 8, stride); + + input += 8; + // load second 8 columns + load_buffer_8x8(input, in1, stride); + load_buffer_8x8(input + 8 * stride, in1 + 8, stride); +} + +static INLINE void write_buffer_16x16(tran_low_t *output, int16x8_t *in0, + int16x8_t *in1, int stride) { + // write first 8 columns + write_buffer_8x8(output, in0, stride); + write_buffer_8x8(output + 8 * stride, in0 + 8, stride); + + // write second 8 columns + output += 8; + write_buffer_8x8(output, in1, stride); + write_buffer_8x8(output + 8 * stride, in1 + 8, stride); +} + +static INLINE void right_shift_16x16(int16x8_t *res0, int16x8_t *res1) { + // perform rounding operations + right_shift_8x8(res0, 2); + right_shift_8x8(res0 + 8, 2); + right_shift_8x8(res1, 2); + right_shift_8x8(res1 + 8, 2); +} + +static void fdct16_8col(int16x8_t *in) { + // perform 16x16 1-D DCT for 8 columns + int16x8_t i[8], s1[8], s2[8], s3[8], t[8]; + int16x4_t t_lo[8], t_hi[8]; + int32x4_t u_lo[8], u_hi[8]; + const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING); + + // stage 1 + i[0] = vaddq_s16(in[0], in[15]); + i[1] = vaddq_s16(in[1], in[14]); + i[2] = vaddq_s16(in[2], in[13]); + i[3] = vaddq_s16(in[3], in[12]); + i[4] = vaddq_s16(in[4], in[11]); + i[5] = vaddq_s16(in[5], in[10]); + i[6] = vaddq_s16(in[6], in[9]); + i[7] = vaddq_s16(in[7], in[8]); + + vpx_fdct8x8_pass1_neon(i); + transpose_s16_8x8(&i[0], &i[1], &i[2], &i[3], &i[4], &i[5], &i[6], &i[7]); + + // step 2 + s1[0] = vsubq_s16(in[7], in[8]); + s1[1] = 
vsubq_s16(in[6], in[9]); + s1[2] = vsubq_s16(in[5], in[10]); + s1[3] = vsubq_s16(in[4], in[11]); + s1[4] = vsubq_s16(in[3], in[12]); + s1[5] = vsubq_s16(in[2], in[13]); + s1[6] = vsubq_s16(in[1], in[14]); + s1[7] = vsubq_s16(in[0], in[15]); + + t[2] = vsubq_s16(s1[5], s1[2]); + t[3] = vsubq_s16(s1[4], s1[3]); + t[4] = vaddq_s16(s1[4], s1[3]); + t[5] = vaddq_s16(s1[5], s1[2]); + + t_lo[2] = vget_low_s16(t[2]); + t_hi[2] = vget_high_s16(t[2]); + t_lo[3] = vget_low_s16(t[3]); + t_hi[3] = vget_high_s16(t[3]); + t_lo[4] = vget_low_s16(t[4]); + t_hi[4] = vget_high_s16(t[4]); + t_lo[5] = vget_low_s16(t[5]); + t_hi[5] = vget_high_s16(t[5]); + + u_lo[2] = vmull_n_s16(t_lo[2], cospi_16_64); + u_hi[2] = vmull_n_s16(t_hi[2], cospi_16_64); + u_lo[3] = vmull_n_s16(t_lo[3], cospi_16_64); + u_hi[3] = vmull_n_s16(t_hi[3], cospi_16_64); + u_lo[4] = vmull_n_s16(t_lo[4], cospi_16_64); + u_hi[4] = vmull_n_s16(t_hi[4], cospi_16_64); + u_lo[5] = vmull_n_s16(t_lo[5], cospi_16_64); + u_hi[5] = vmull_n_s16(t_hi[5], cospi_16_64); + + u_lo[2] = vaddq_s32(u_lo[2], k__DCT_CONST_ROUNDING); + u_hi[2] = vaddq_s32(u_hi[2], k__DCT_CONST_ROUNDING); + u_lo[3] = vaddq_s32(u_lo[3], k__DCT_CONST_ROUNDING); + u_hi[3] = vaddq_s32(u_hi[3], k__DCT_CONST_ROUNDING); + u_lo[4] = vaddq_s32(u_lo[4], k__DCT_CONST_ROUNDING); + u_hi[4] = vaddq_s32(u_hi[4], k__DCT_CONST_ROUNDING); + u_lo[5] = vaddq_s32(u_lo[5], k__DCT_CONST_ROUNDING); + u_hi[5] = vaddq_s32(u_hi[5], k__DCT_CONST_ROUNDING); + + t_lo[2] = vshrn_n_s32(u_lo[2], DCT_CONST_BITS); + t_hi[2] = vshrn_n_s32(u_hi[2], DCT_CONST_BITS); + t_lo[3] = vshrn_n_s32(u_lo[3], DCT_CONST_BITS); + t_hi[3] = vshrn_n_s32(u_hi[3], DCT_CONST_BITS); + t_lo[4] = vshrn_n_s32(u_lo[4], DCT_CONST_BITS); + t_hi[4] = vshrn_n_s32(u_hi[4], DCT_CONST_BITS); + t_lo[5] = vshrn_n_s32(u_lo[5], DCT_CONST_BITS); + t_hi[5] = vshrn_n_s32(u_hi[5], DCT_CONST_BITS); + + s2[2] = vcombine_s16(t_lo[2], t_hi[2]); + s2[3] = vcombine_s16(t_lo[3], t_hi[3]); + s2[4] = vcombine_s16(t_lo[4], t_hi[4]); + s2[5] = vcombine_s16(t_lo[5], t_hi[5]); + + // step 3 + s3[0] = vaddq_s16(s1[0], s2[3]); + s3[1] = vaddq_s16(s1[1], s2[2]); + s3[2] = vsubq_s16(s1[1], s2[2]); + s3[3] = vsubq_s16(s1[0], s2[3]); + s3[4] = vsubq_s16(s1[7], s2[4]); + s3[5] = vsubq_s16(s1[6], s2[5]); + s3[6] = vaddq_s16(s1[6], s2[5]); + s3[7] = vaddq_s16(s1[7], s2[4]); + + // step 4 + t_lo[0] = vget_low_s16(s3[0]); + t_hi[0] = vget_high_s16(s3[0]); + t_lo[1] = vget_low_s16(s3[1]); + t_hi[1] = vget_high_s16(s3[1]); + t_lo[2] = vget_low_s16(s3[2]); + t_hi[2] = vget_high_s16(s3[2]); + t_lo[3] = vget_low_s16(s3[3]); + t_hi[3] = vget_high_s16(s3[3]); + t_lo[4] = vget_low_s16(s3[4]); + t_hi[4] = vget_high_s16(s3[4]); + t_lo[5] = vget_low_s16(s3[5]); + t_hi[5] = vget_high_s16(s3[5]); + t_lo[6] = vget_low_s16(s3[6]); + t_hi[6] = vget_high_s16(s3[6]); + t_lo[7] = vget_low_s16(s3[7]); + t_hi[7] = vget_high_s16(s3[7]); + + u_lo[1] = vaddq_s32(vmull_n_s16(t_lo[1], -cospi_8_64), + vmull_n_s16(t_lo[6], cospi_24_64)); + u_hi[1] = vaddq_s32(vmull_n_s16(t_hi[1], -cospi_8_64), + vmull_n_s16(t_hi[6], cospi_24_64)); + u_lo[2] = vaddq_s32(vmull_n_s16(t_lo[2], cospi_24_64), + vmull_n_s16(t_lo[5], cospi_8_64)); + u_hi[2] = vaddq_s32(vmull_n_s16(t_hi[2], cospi_24_64), + vmull_n_s16(t_hi[5], cospi_8_64)); + u_lo[5] = vaddq_s32(vmull_n_s16(t_lo[2], cospi_8_64), + vmull_n_s16(t_lo[5], -cospi_24_64)); + u_hi[5] = vaddq_s32(vmull_n_s16(t_hi[2], cospi_8_64), + vmull_n_s16(t_hi[5], -cospi_24_64)); + u_lo[6] = vaddq_s32(vmull_n_s16(t_lo[1], cospi_24_64), + vmull_n_s16(t_lo[6], cospi_8_64)); + u_hi[6] = 
vaddq_s32(vmull_n_s16(t_hi[1], cospi_24_64), + vmull_n_s16(t_hi[6], cospi_8_64)); + + u_lo[1] = vaddq_s32(u_lo[1], k__DCT_CONST_ROUNDING); + u_hi[1] = vaddq_s32(u_hi[1], k__DCT_CONST_ROUNDING); + u_lo[2] = vaddq_s32(u_lo[2], k__DCT_CONST_ROUNDING); + u_hi[2] = vaddq_s32(u_hi[2], k__DCT_CONST_ROUNDING); + u_lo[5] = vaddq_s32(u_lo[5], k__DCT_CONST_ROUNDING); + u_hi[5] = vaddq_s32(u_hi[5], k__DCT_CONST_ROUNDING); + u_lo[6] = vaddq_s32(u_lo[6], k__DCT_CONST_ROUNDING); + u_hi[6] = vaddq_s32(u_hi[6], k__DCT_CONST_ROUNDING); + + t_lo[1] = vshrn_n_s32(u_lo[1], DCT_CONST_BITS); + t_hi[1] = vshrn_n_s32(u_hi[1], DCT_CONST_BITS); + t_lo[2] = vshrn_n_s32(u_lo[2], DCT_CONST_BITS); + t_hi[2] = vshrn_n_s32(u_hi[2], DCT_CONST_BITS); + t_lo[5] = vshrn_n_s32(u_lo[5], DCT_CONST_BITS); + t_hi[5] = vshrn_n_s32(u_hi[5], DCT_CONST_BITS); + t_lo[6] = vshrn_n_s32(u_lo[6], DCT_CONST_BITS); + t_hi[6] = vshrn_n_s32(u_hi[6], DCT_CONST_BITS); + + s2[1] = vcombine_s16(t_lo[1], t_hi[1]); + s2[2] = vcombine_s16(t_lo[2], t_hi[2]); + s2[5] = vcombine_s16(t_lo[5], t_hi[5]); + s2[6] = vcombine_s16(t_lo[6], t_hi[6]); + + // step 5 + s1[0] = vaddq_s16(s3[0], s2[1]); + s1[1] = vsubq_s16(s3[0], s2[1]); + s1[2] = vaddq_s16(s3[3], s2[2]); + s1[3] = vsubq_s16(s3[3], s2[2]); + s1[4] = vsubq_s16(s3[4], s2[5]); + s1[5] = vaddq_s16(s3[4], s2[5]); + s1[6] = vsubq_s16(s3[7], s2[6]); + s1[7] = vaddq_s16(s3[7], s2[6]); + + // step 6 + t_lo[0] = vget_low_s16(s1[0]); + t_hi[0] = vget_high_s16(s1[0]); + t_lo[1] = vget_low_s16(s1[1]); + t_hi[1] = vget_high_s16(s1[1]); + t_lo[2] = vget_low_s16(s1[2]); + t_hi[2] = vget_high_s16(s1[2]); + t_lo[3] = vget_low_s16(s1[3]); + t_hi[3] = vget_high_s16(s1[3]); + t_lo[4] = vget_low_s16(s1[4]); + t_hi[4] = vget_high_s16(s1[4]); + t_lo[5] = vget_low_s16(s1[5]); + t_hi[5] = vget_high_s16(s1[5]); + t_lo[6] = vget_low_s16(s1[6]); + t_hi[6] = vget_high_s16(s1[6]); + t_lo[7] = vget_low_s16(s1[7]); + t_hi[7] = vget_high_s16(s1[7]); + + // step1[0] * cospi_30_64 + step1[7] * cospi_2_64; + u_lo[0] = vaddq_s32(vmull_n_s16(t_lo[0], cospi_30_64), + vmull_n_s16(t_lo[7], cospi_2_64)); + u_hi[0] = vaddq_s32(vmull_n_s16(t_hi[0], cospi_30_64), + vmull_n_s16(t_hi[7], cospi_2_64)); + + // step1[1] * cospi_14_64 + step1[6] * cospi_18_64; + u_lo[1] = vaddq_s32(vmull_n_s16(t_lo[1], cospi_14_64), + vmull_n_s16(t_lo[6], cospi_18_64)); + u_hi[1] = vaddq_s32(vmull_n_s16(t_hi[1], cospi_14_64), + vmull_n_s16(t_hi[6], cospi_18_64)); + + // step1[2] * cospi_22_64 + step1[5] * cospi_10_64; + u_lo[2] = vaddq_s32(vmull_n_s16(t_lo[2], cospi_22_64), + vmull_n_s16(t_lo[5], cospi_10_64)); + u_hi[2] = vaddq_s32(vmull_n_s16(t_hi[2], cospi_22_64), + vmull_n_s16(t_hi[5], cospi_10_64)); + + // step1[3] * cospi_6_64 + step1[4] * cospi_26_64; + u_lo[3] = vaddq_s32(vmull_n_s16(t_lo[3], cospi_6_64), + vmull_n_s16(t_lo[4], cospi_26_64)); + u_hi[3] = vaddq_s32(vmull_n_s16(t_hi[3], cospi_6_64), + vmull_n_s16(t_hi[4], cospi_26_64)); + + // step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; + u_lo[4] = vaddq_s32(vmull_n_s16(t_lo[3], -cospi_26_64), + vmull_n_s16(t_lo[4], cospi_6_64)); + u_hi[4] = vaddq_s32(vmull_n_s16(t_hi[3], -cospi_26_64), + vmull_n_s16(t_hi[4], cospi_6_64)); + + // step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; + u_lo[5] = vaddq_s32(vmull_n_s16(t_lo[2], -cospi_10_64), + vmull_n_s16(t_lo[5], cospi_22_64)); + u_hi[5] = vaddq_s32(vmull_n_s16(t_hi[2], -cospi_10_64), + vmull_n_s16(t_hi[5], cospi_22_64)); + + // step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; + u_lo[6] = vaddq_s32(vmull_n_s16(t_lo[1], -cospi_18_64), + vmull_n_s16(t_lo[6], 
cospi_14_64)); + u_hi[6] = vaddq_s32(vmull_n_s16(t_hi[1], -cospi_18_64), + vmull_n_s16(t_hi[6], cospi_14_64)); + + // step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; + u_lo[7] = vaddq_s32(vmull_n_s16(t_lo[0], -cospi_2_64), + vmull_n_s16(t_lo[7], cospi_30_64)); + u_hi[7] = vaddq_s32(vmull_n_s16(t_hi[0], -cospi_2_64), + vmull_n_s16(t_hi[7], cospi_30_64)); + + // final fdct_round_shift + u_lo[0] = vaddq_s32(u_lo[0], k__DCT_CONST_ROUNDING); + u_hi[0] = vaddq_s32(u_hi[0], k__DCT_CONST_ROUNDING); + u_lo[1] = vaddq_s32(u_lo[1], k__DCT_CONST_ROUNDING); + u_hi[1] = vaddq_s32(u_hi[1], k__DCT_CONST_ROUNDING); + u_lo[2] = vaddq_s32(u_lo[2], k__DCT_CONST_ROUNDING); + u_hi[2] = vaddq_s32(u_hi[2], k__DCT_CONST_ROUNDING); + u_lo[3] = vaddq_s32(u_lo[3], k__DCT_CONST_ROUNDING); + u_hi[3] = vaddq_s32(u_hi[3], k__DCT_CONST_ROUNDING); + u_lo[4] = vaddq_s32(u_lo[4], k__DCT_CONST_ROUNDING); + u_hi[4] = vaddq_s32(u_hi[4], k__DCT_CONST_ROUNDING); + u_lo[5] = vaddq_s32(u_lo[5], k__DCT_CONST_ROUNDING); + u_hi[5] = vaddq_s32(u_hi[5], k__DCT_CONST_ROUNDING); + u_lo[6] = vaddq_s32(u_lo[6], k__DCT_CONST_ROUNDING); + u_hi[6] = vaddq_s32(u_hi[6], k__DCT_CONST_ROUNDING); + u_lo[7] = vaddq_s32(u_lo[7], k__DCT_CONST_ROUNDING); + u_hi[7] = vaddq_s32(u_hi[7], k__DCT_CONST_ROUNDING); + + t_lo[0] = vshrn_n_s32(u_lo[0], DCT_CONST_BITS); + t_hi[0] = vshrn_n_s32(u_hi[0], DCT_CONST_BITS); + t_lo[1] = vshrn_n_s32(u_lo[1], DCT_CONST_BITS); + t_hi[1] = vshrn_n_s32(u_hi[1], DCT_CONST_BITS); + t_lo[2] = vshrn_n_s32(u_lo[2], DCT_CONST_BITS); + t_hi[2] = vshrn_n_s32(u_hi[2], DCT_CONST_BITS); + t_lo[3] = vshrn_n_s32(u_lo[3], DCT_CONST_BITS); + t_hi[3] = vshrn_n_s32(u_hi[3], DCT_CONST_BITS); + t_lo[4] = vshrn_n_s32(u_lo[4], DCT_CONST_BITS); + t_hi[4] = vshrn_n_s32(u_hi[4], DCT_CONST_BITS); + t_lo[5] = vshrn_n_s32(u_lo[5], DCT_CONST_BITS); + t_hi[5] = vshrn_n_s32(u_hi[5], DCT_CONST_BITS); + t_lo[6] = vshrn_n_s32(u_lo[6], DCT_CONST_BITS); + t_hi[6] = vshrn_n_s32(u_hi[6], DCT_CONST_BITS); + t_lo[7] = vshrn_n_s32(u_lo[7], DCT_CONST_BITS); + t_hi[7] = vshrn_n_s32(u_hi[7], DCT_CONST_BITS); + + in[0] = i[0]; + in[2] = i[1]; + in[4] = i[2]; + in[6] = i[3]; + in[8] = i[4]; + in[10] = i[5]; + in[12] = i[6]; + in[14] = i[7]; + in[1] = vcombine_s16(t_lo[0], t_hi[0]); + in[3] = vcombine_s16(t_lo[4], t_hi[4]); + in[5] = vcombine_s16(t_lo[2], t_hi[2]); + in[7] = vcombine_s16(t_lo[6], t_hi[6]); + in[9] = vcombine_s16(t_lo[1], t_hi[1]); + in[11] = vcombine_s16(t_lo[5], t_hi[5]); + in[13] = vcombine_s16(t_lo[3], t_hi[3]); + in[15] = vcombine_s16(t_lo[7], t_hi[7]); +} + +static void fadst16_8col(int16x8_t *in) { + // perform 16x16 1-D ADST for 8 columns + int16x4_t x_lo[16], x_hi[16]; + int32x4_t s_lo[16], s_hi[16]; + int32x4_t t_lo[16], t_hi[16]; + const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING); + + x_lo[0] = vget_low_s16(in[15]); + x_hi[0] = vget_high_s16(in[15]); + x_lo[1] = vget_low_s16(in[0]); + x_hi[1] = vget_high_s16(in[0]); + x_lo[2] = vget_low_s16(in[13]); + x_hi[2] = vget_high_s16(in[13]); + x_lo[3] = vget_low_s16(in[2]); + x_hi[3] = vget_high_s16(in[2]); + x_lo[4] = vget_low_s16(in[11]); + x_hi[4] = vget_high_s16(in[11]); + x_lo[5] = vget_low_s16(in[4]); + x_hi[5] = vget_high_s16(in[4]); + x_lo[6] = vget_low_s16(in[9]); + x_hi[6] = vget_high_s16(in[9]); + x_lo[7] = vget_low_s16(in[6]); + x_hi[7] = vget_high_s16(in[6]); + x_lo[8] = vget_low_s16(in[7]); + x_hi[8] = vget_high_s16(in[7]); + x_lo[9] = vget_low_s16(in[8]); + x_hi[9] = vget_high_s16(in[8]); + x_lo[10] = vget_low_s16(in[5]); + x_hi[10] = vget_high_s16(in[5]); + 
x_lo[11] = vget_low_s16(in[10]); + x_hi[11] = vget_high_s16(in[10]); + x_lo[12] = vget_low_s16(in[3]); + x_hi[12] = vget_high_s16(in[3]); + x_lo[13] = vget_low_s16(in[12]); + x_hi[13] = vget_high_s16(in[12]); + x_lo[14] = vget_low_s16(in[1]); + x_hi[14] = vget_high_s16(in[1]); + x_lo[15] = vget_low_s16(in[14]); + x_hi[15] = vget_high_s16(in[14]); + + // stage 1 + // s0 = cospi_1_64 * x0 + cospi_31_64 * x1; + s_lo[0] = vaddq_s32(vmull_n_s16(x_lo[0], cospi_1_64), + vmull_n_s16(x_lo[1], cospi_31_64)); + s_hi[0] = vaddq_s32(vmull_n_s16(x_hi[0], cospi_1_64), + vmull_n_s16(x_hi[1], cospi_31_64)); + // s1 = cospi_31_64 * x0 - cospi_1_64 * x1; + s_lo[1] = vsubq_s32(vmull_n_s16(x_lo[0], cospi_31_64), + vmull_n_s16(x_lo[1], cospi_1_64)); + s_hi[1] = vsubq_s32(vmull_n_s16(x_hi[0], cospi_31_64), + vmull_n_s16(x_hi[1], cospi_1_64)); + // s2 = cospi_5_64 * x2 + cospi_27_64 * x3; + s_lo[2] = vaddq_s32(vmull_n_s16(x_lo[2], cospi_5_64), + vmull_n_s16(x_lo[3], cospi_27_64)); + s_hi[2] = vaddq_s32(vmull_n_s16(x_hi[2], cospi_5_64), + vmull_n_s16(x_hi[3], cospi_27_64)); + // s3 = cospi_27_64 * x2 - cospi_5_64 * x3; + s_lo[3] = vsubq_s32(vmull_n_s16(x_lo[2], cospi_27_64), + vmull_n_s16(x_lo[3], cospi_5_64)); + s_hi[3] = vsubq_s32(vmull_n_s16(x_hi[2], cospi_27_64), + vmull_n_s16(x_hi[3], cospi_5_64)); + // s4 = cospi_9_64 * x4 + cospi_23_64 * x5; + s_lo[4] = vaddq_s32(vmull_n_s16(x_lo[4], cospi_9_64), + vmull_n_s16(x_lo[5], cospi_23_64)); + s_hi[4] = vaddq_s32(vmull_n_s16(x_hi[4], cospi_9_64), + vmull_n_s16(x_hi[5], cospi_23_64)); + // s5 = cospi_23_64 * x4 - cospi_9_64 * x5; + s_lo[5] = vsubq_s32(vmull_n_s16(x_lo[4], cospi_23_64), + vmull_n_s16(x_lo[5], cospi_9_64)); + s_hi[5] = vsubq_s32(vmull_n_s16(x_hi[4], cospi_23_64), + vmull_n_s16(x_hi[5], cospi_9_64)); + // s6 = cospi_13_64 * x6 + cospi_19_64 * x7; + s_lo[6] = vaddq_s32(vmull_n_s16(x_lo[6], cospi_13_64), + vmull_n_s16(x_lo[7], cospi_19_64)); + s_hi[6] = vaddq_s32(vmull_n_s16(x_hi[6], cospi_13_64), + vmull_n_s16(x_hi[7], cospi_19_64)); + // s7 = cospi_19_64 * x6 - cospi_13_64 * x7; + s_lo[7] = vsubq_s32(vmull_n_s16(x_lo[6], cospi_19_64), + vmull_n_s16(x_lo[7], cospi_13_64)); + s_hi[7] = vsubq_s32(vmull_n_s16(x_hi[6], cospi_19_64), + vmull_n_s16(x_hi[7], cospi_13_64)); + // s8 = cospi_17_64 * x8 + cospi_15_64 * x9; + s_lo[8] = vaddq_s32(vmull_n_s16(x_lo[8], cospi_17_64), + vmull_n_s16(x_lo[9], cospi_15_64)); + s_hi[8] = vaddq_s32(vmull_n_s16(x_hi[8], cospi_17_64), + vmull_n_s16(x_hi[9], cospi_15_64)); + // s9 = cospi_15_64 * x8 - cospi_17_64 * x9; + s_lo[9] = vsubq_s32(vmull_n_s16(x_lo[8], cospi_15_64), + vmull_n_s16(x_lo[9], cospi_17_64)); + s_hi[9] = vsubq_s32(vmull_n_s16(x_hi[8], cospi_15_64), + vmull_n_s16(x_hi[9], cospi_17_64)); + // s10 = cospi_21_64 * x10 + cospi_11_64 * x11; + s_lo[10] = vaddq_s32(vmull_n_s16(x_lo[10], cospi_21_64), + vmull_n_s16(x_lo[11], cospi_11_64)); + s_hi[10] = vaddq_s32(vmull_n_s16(x_hi[10], cospi_21_64), + vmull_n_s16(x_hi[11], cospi_11_64)); + // s11 = cospi_11_64 * x10 - cospi_21_64 * x11; + s_lo[11] = vsubq_s32(vmull_n_s16(x_lo[10], cospi_11_64), + vmull_n_s16(x_lo[11], cospi_21_64)); + s_hi[11] = vsubq_s32(vmull_n_s16(x_hi[10], cospi_11_64), + vmull_n_s16(x_hi[11], cospi_21_64)); + // s12 = cospi_25_64 * x12 + cospi_7_64 * x13; + s_lo[12] = vaddq_s32(vmull_n_s16(x_lo[12], cospi_25_64), + vmull_n_s16(x_lo[13], cospi_7_64)); + s_hi[12] = vaddq_s32(vmull_n_s16(x_hi[12], cospi_25_64), + vmull_n_s16(x_hi[13], cospi_7_64)); + // s13 = cospi_7_64 * x12 - cospi_25_64 * x13; + s_lo[13] = vsubq_s32(vmull_n_s16(x_lo[12], 
cospi_7_64), + vmull_n_s16(x_lo[13], cospi_25_64)); + s_hi[13] = vsubq_s32(vmull_n_s16(x_hi[12], cospi_7_64), + vmull_n_s16(x_hi[13], cospi_25_64)); + // s14 = cospi_29_64 * x14 + cospi_3_64 * x15; + s_lo[14] = vaddq_s32(vmull_n_s16(x_lo[14], cospi_29_64), + vmull_n_s16(x_lo[15], cospi_3_64)); + s_hi[14] = vaddq_s32(vmull_n_s16(x_hi[14], cospi_29_64), + vmull_n_s16(x_hi[15], cospi_3_64)); + // s15 = cospi_3_64 * x14 - cospi_29_64 * x15; + s_lo[15] = vsubq_s32(vmull_n_s16(x_lo[14], cospi_3_64), + vmull_n_s16(x_lo[15], cospi_29_64)); + s_hi[15] = vsubq_s32(vmull_n_s16(x_hi[14], cospi_3_64), + vmull_n_s16(x_hi[15], cospi_29_64)); + + // fdct_round_shift + t_lo[0] = vaddq_s32(s_lo[0], s_lo[8]); + t_hi[0] = vaddq_s32(s_hi[0], s_hi[8]); + t_lo[1] = vaddq_s32(s_lo[1], s_lo[9]); + t_hi[1] = vaddq_s32(s_hi[1], s_hi[9]); + t_lo[2] = vaddq_s32(s_lo[2], s_lo[10]); + t_hi[2] = vaddq_s32(s_hi[2], s_hi[10]); + t_lo[3] = vaddq_s32(s_lo[3], s_lo[11]); + t_hi[3] = vaddq_s32(s_hi[3], s_hi[11]); + t_lo[4] = vaddq_s32(s_lo[4], s_lo[12]); + t_hi[4] = vaddq_s32(s_hi[4], s_hi[12]); + t_lo[5] = vaddq_s32(s_lo[5], s_lo[13]); + t_hi[5] = vaddq_s32(s_hi[5], s_hi[13]); + t_lo[6] = vaddq_s32(s_lo[6], s_lo[14]); + t_hi[6] = vaddq_s32(s_hi[6], s_hi[14]); + t_lo[7] = vaddq_s32(s_lo[7], s_lo[15]); + t_hi[7] = vaddq_s32(s_hi[7], s_hi[15]); + t_lo[8] = vsubq_s32(s_lo[0], s_lo[8]); + t_hi[8] = vsubq_s32(s_hi[0], s_hi[8]); + t_lo[9] = vsubq_s32(s_lo[1], s_lo[9]); + t_hi[9] = vsubq_s32(s_hi[1], s_hi[9]); + t_lo[10] = vsubq_s32(s_lo[2], s_lo[10]); + t_hi[10] = vsubq_s32(s_hi[2], s_hi[10]); + t_lo[11] = vsubq_s32(s_lo[3], s_lo[11]); + t_hi[11] = vsubq_s32(s_hi[3], s_hi[11]); + t_lo[12] = vsubq_s32(s_lo[4], s_lo[12]); + t_hi[12] = vsubq_s32(s_hi[4], s_hi[12]); + t_lo[13] = vsubq_s32(s_lo[5], s_lo[13]); + t_hi[13] = vsubq_s32(s_hi[5], s_hi[13]); + t_lo[14] = vsubq_s32(s_lo[6], s_lo[14]); + t_hi[14] = vsubq_s32(s_hi[6], s_hi[14]); + t_lo[15] = vsubq_s32(s_lo[7], s_lo[15]); + t_hi[15] = vsubq_s32(s_hi[7], s_hi[15]); + + t_lo[0] = vaddq_s32(t_lo[0], k__DCT_CONST_ROUNDING); + t_hi[0] = vaddq_s32(t_hi[0], k__DCT_CONST_ROUNDING); + t_lo[1] = vaddq_s32(t_lo[1], k__DCT_CONST_ROUNDING); + t_hi[1] = vaddq_s32(t_hi[1], k__DCT_CONST_ROUNDING); + t_lo[2] = vaddq_s32(t_lo[2], k__DCT_CONST_ROUNDING); + t_hi[2] = vaddq_s32(t_hi[2], k__DCT_CONST_ROUNDING); + t_lo[3] = vaddq_s32(t_lo[3], k__DCT_CONST_ROUNDING); + t_hi[3] = vaddq_s32(t_hi[3], k__DCT_CONST_ROUNDING); + t_lo[4] = vaddq_s32(t_lo[4], k__DCT_CONST_ROUNDING); + t_hi[4] = vaddq_s32(t_hi[4], k__DCT_CONST_ROUNDING); + t_lo[5] = vaddq_s32(t_lo[5], k__DCT_CONST_ROUNDING); + t_hi[5] = vaddq_s32(t_hi[5], k__DCT_CONST_ROUNDING); + t_lo[6] = vaddq_s32(t_lo[6], k__DCT_CONST_ROUNDING); + t_hi[6] = vaddq_s32(t_hi[6], k__DCT_CONST_ROUNDING); + t_lo[7] = vaddq_s32(t_lo[7], k__DCT_CONST_ROUNDING); + t_hi[7] = vaddq_s32(t_hi[7], k__DCT_CONST_ROUNDING); + t_lo[8] = vaddq_s32(t_lo[8], k__DCT_CONST_ROUNDING); + t_hi[8] = vaddq_s32(t_hi[8], k__DCT_CONST_ROUNDING); + t_lo[9] = vaddq_s32(t_lo[9], k__DCT_CONST_ROUNDING); + t_hi[9] = vaddq_s32(t_hi[9], k__DCT_CONST_ROUNDING); + t_lo[10] = vaddq_s32(t_lo[10], k__DCT_CONST_ROUNDING); + t_hi[10] = vaddq_s32(t_hi[10], k__DCT_CONST_ROUNDING); + t_lo[11] = vaddq_s32(t_lo[11], k__DCT_CONST_ROUNDING); + t_hi[11] = vaddq_s32(t_hi[11], k__DCT_CONST_ROUNDING); + t_lo[12] = vaddq_s32(t_lo[12], k__DCT_CONST_ROUNDING); + t_hi[12] = vaddq_s32(t_hi[12], k__DCT_CONST_ROUNDING); + t_lo[13] = vaddq_s32(t_lo[13], k__DCT_CONST_ROUNDING); + t_hi[13] = vaddq_s32(t_hi[13], 
k__DCT_CONST_ROUNDING); + t_lo[14] = vaddq_s32(t_lo[14], k__DCT_CONST_ROUNDING); + t_hi[14] = vaddq_s32(t_hi[14], k__DCT_CONST_ROUNDING); + t_lo[15] = vaddq_s32(t_lo[15], k__DCT_CONST_ROUNDING); + t_hi[15] = vaddq_s32(t_hi[15], k__DCT_CONST_ROUNDING); + + t_lo[0] = vshrq_n_s32(t_lo[0], DCT_CONST_BITS); + t_hi[0] = vshrq_n_s32(t_hi[0], DCT_CONST_BITS); + t_lo[1] = vshrq_n_s32(t_lo[1], DCT_CONST_BITS); + t_hi[1] = vshrq_n_s32(t_hi[1], DCT_CONST_BITS); + t_lo[2] = vshrq_n_s32(t_lo[2], DCT_CONST_BITS); + t_hi[2] = vshrq_n_s32(t_hi[2], DCT_CONST_BITS); + t_lo[3] = vshrq_n_s32(t_lo[3], DCT_CONST_BITS); + t_hi[3] = vshrq_n_s32(t_hi[3], DCT_CONST_BITS); + t_lo[4] = vshrq_n_s32(t_lo[4], DCT_CONST_BITS); + t_hi[4] = vshrq_n_s32(t_hi[4], DCT_CONST_BITS); + t_lo[5] = vshrq_n_s32(t_lo[5], DCT_CONST_BITS); + t_hi[5] = vshrq_n_s32(t_hi[5], DCT_CONST_BITS); + t_lo[6] = vshrq_n_s32(t_lo[6], DCT_CONST_BITS); + t_hi[6] = vshrq_n_s32(t_hi[6], DCT_CONST_BITS); + t_lo[7] = vshrq_n_s32(t_lo[7], DCT_CONST_BITS); + t_hi[7] = vshrq_n_s32(t_hi[7], DCT_CONST_BITS); + t_lo[8] = vshrq_n_s32(t_lo[8], DCT_CONST_BITS); + t_hi[8] = vshrq_n_s32(t_hi[8], DCT_CONST_BITS); + t_lo[9] = vshrq_n_s32(t_lo[9], DCT_CONST_BITS); + t_hi[9] = vshrq_n_s32(t_hi[9], DCT_CONST_BITS); + t_lo[10] = vshrq_n_s32(t_lo[10], DCT_CONST_BITS); + t_hi[10] = vshrq_n_s32(t_hi[10], DCT_CONST_BITS); + t_lo[11] = vshrq_n_s32(t_lo[11], DCT_CONST_BITS); + t_hi[11] = vshrq_n_s32(t_hi[11], DCT_CONST_BITS); + t_lo[12] = vshrq_n_s32(t_lo[12], DCT_CONST_BITS); + t_hi[12] = vshrq_n_s32(t_hi[12], DCT_CONST_BITS); + t_lo[13] = vshrq_n_s32(t_lo[13], DCT_CONST_BITS); + t_hi[13] = vshrq_n_s32(t_hi[13], DCT_CONST_BITS); + t_lo[14] = vshrq_n_s32(t_lo[14], DCT_CONST_BITS); + t_hi[14] = vshrq_n_s32(t_hi[14], DCT_CONST_BITS); + t_lo[15] = vshrq_n_s32(t_lo[15], DCT_CONST_BITS); + t_hi[15] = vshrq_n_s32(t_hi[15], DCT_CONST_BITS); + + // stage 2 + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + s_lo[4] = t_lo[4]; + s_hi[4] = t_hi[4]; + s_lo[5] = t_lo[5]; + s_hi[5] = t_hi[5]; + s_lo[6] = t_lo[6]; + s_hi[6] = t_hi[6]; + s_lo[7] = t_lo[7]; + s_hi[7] = t_hi[7]; + // s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s_lo[8] = vaddq_s32(vmulq_n_s32(t_lo[8], cospi_4_64), + vmulq_n_s32(t_lo[9], cospi_28_64)); + s_hi[8] = vaddq_s32(vmulq_n_s32(t_hi[8], cospi_4_64), + vmulq_n_s32(t_hi[9], cospi_28_64)); + // s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s_lo[9] = vsubq_s32(vmulq_n_s32(t_lo[8], cospi_28_64), + vmulq_n_s32(t_lo[9], cospi_4_64)); + s_hi[9] = vsubq_s32(vmulq_n_s32(t_hi[8], cospi_28_64), + vmulq_n_s32(t_hi[9], cospi_4_64)); + // s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s_lo[10] = vaddq_s32(vmulq_n_s32(t_lo[10], cospi_20_64), + vmulq_n_s32(t_lo[11], cospi_12_64)); + s_hi[10] = vaddq_s32(vmulq_n_s32(t_hi[10], cospi_20_64), + vmulq_n_s32(t_hi[11], cospi_12_64)); + // s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s_lo[11] = vsubq_s32(vmulq_n_s32(t_lo[10], cospi_12_64), + vmulq_n_s32(t_lo[11], cospi_20_64)); + s_hi[11] = vsubq_s32(vmulq_n_s32(t_hi[10], cospi_12_64), + vmulq_n_s32(t_hi[11], cospi_20_64)); + // s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + s_lo[12] = vaddq_s32(vmulq_n_s32(t_lo[12], -cospi_28_64), + vmulq_n_s32(t_lo[13], cospi_4_64)); + s_hi[12] = vaddq_s32(vmulq_n_s32(t_hi[12], -cospi_28_64), + vmulq_n_s32(t_hi[13], cospi_4_64)); + // s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s_lo[13] = vaddq_s32(vmulq_n_s32(t_lo[12], cospi_4_64), + 
vmulq_n_s32(t_lo[13], cospi_28_64));
+  s_hi[13] = vaddq_s32(vmulq_n_s32(t_hi[12], cospi_4_64),
+                       vmulq_n_s32(t_hi[13], cospi_28_64));
+  // s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+  s_lo[14] = vaddq_s32(vmulq_n_s32(t_lo[14], -cospi_12_64),
+                       vmulq_n_s32(t_lo[15], cospi_20_64));
+  s_hi[14] = vaddq_s32(vmulq_n_s32(t_hi[14], -cospi_12_64),
+                       vmulq_n_s32(t_hi[15], cospi_20_64));
+  // s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+  s_lo[15] = vaddq_s32(vmulq_n_s32(t_lo[14], cospi_20_64),
+                       vmulq_n_s32(t_lo[15], cospi_12_64));
+  s_hi[15] = vaddq_s32(vmulq_n_s32(t_hi[14], cospi_20_64),
+                       vmulq_n_s32(t_hi[15], cospi_12_64));
+
+  // s0 + s4
+  t_lo[0] = vaddq_s32(s_lo[0], s_lo[4]);
+  t_hi[0] = vaddq_s32(s_hi[0], s_hi[4]);
+  // s1 + s5
+  t_lo[1] = vaddq_s32(s_lo[1], s_lo[5]);
+  t_hi[1] = vaddq_s32(s_hi[1], s_hi[5]);
+  // s2 + s6
+  t_lo[2] = vaddq_s32(s_lo[2], s_lo[6]);
+  t_hi[2] = vaddq_s32(s_hi[2], s_hi[6]);
+  // s3 + s7
+  t_lo[3] = vaddq_s32(s_lo[3], s_lo[7]);
+  t_hi[3] = vaddq_s32(s_hi[3], s_hi[7]);
+  // s0 - s4
+  t_lo[4] = vsubq_s32(s_lo[0], s_lo[4]);
+  t_hi[4] = vsubq_s32(s_hi[0], s_hi[4]);
+  // s1 - s5
+  t_lo[5] = vsubq_s32(s_lo[1], s_lo[5]);
+  t_hi[5] = vsubq_s32(s_hi[1], s_hi[5]);
+  // s2 - s6
+  t_lo[6] = vsubq_s32(s_lo[2], s_lo[6]);
+  t_hi[6] = vsubq_s32(s_hi[2], s_hi[6]);
+  // s3 - s7
+  t_lo[7] = vsubq_s32(s_lo[3], s_lo[7]);
+  t_hi[7] = vsubq_s32(s_hi[3], s_hi[7]);
+  // s8 + s12
+  t_lo[8] = vaddq_s32(s_lo[8], s_lo[12]);
+  t_hi[8] = vaddq_s32(s_hi[8], s_hi[12]);
+  // s9 + s13
+  t_lo[9] = vaddq_s32(s_lo[9], s_lo[13]);
+  t_hi[9] = vaddq_s32(s_hi[9], s_hi[13]);
+  // s10 + s14
+  t_lo[10] = vaddq_s32(s_lo[10], s_lo[14]);
+  t_hi[10] = vaddq_s32(s_hi[10], s_hi[14]);
+  // s11 + s15
+  t_lo[11] = vaddq_s32(s_lo[11], s_lo[15]);
+  t_hi[11] = vaddq_s32(s_hi[11], s_hi[15]);
+  // s8 - s12
+  t_lo[12] = vsubq_s32(s_lo[8], s_lo[12]);
+  t_hi[12] = vsubq_s32(s_hi[8], s_hi[12]);
+  // s9 - s13
+  t_lo[13] = vsubq_s32(s_lo[9], s_lo[13]);
+  t_hi[13] = vsubq_s32(s_hi[9], s_hi[13]);
+  // s10 - s14
+  t_lo[14] = vsubq_s32(s_lo[10], s_lo[14]);
+  t_hi[14] = vsubq_s32(s_hi[10], s_hi[14]);
+  // s11 - s15
+  t_lo[15] = vsubq_s32(s_lo[11], s_lo[15]);
+  t_hi[15] = vsubq_s32(s_hi[11], s_hi[15]);
+
+  t_lo[8] = vaddq_s32(t_lo[8], k__DCT_CONST_ROUNDING);
+  t_hi[8] = vaddq_s32(t_hi[8], k__DCT_CONST_ROUNDING);
+  t_lo[9] = vaddq_s32(t_lo[9], k__DCT_CONST_ROUNDING);
+  t_hi[9] = vaddq_s32(t_hi[9], k__DCT_CONST_ROUNDING);
+  t_lo[10] = vaddq_s32(t_lo[10], k__DCT_CONST_ROUNDING);
+  t_hi[10] = vaddq_s32(t_hi[10], k__DCT_CONST_ROUNDING);
+  t_lo[11] = vaddq_s32(t_lo[11], k__DCT_CONST_ROUNDING);
+  t_hi[11] = vaddq_s32(t_hi[11], k__DCT_CONST_ROUNDING);
+  t_lo[12] = vaddq_s32(t_lo[12], k__DCT_CONST_ROUNDING);
+  t_hi[12] = vaddq_s32(t_hi[12], k__DCT_CONST_ROUNDING);
+  t_lo[13] = vaddq_s32(t_lo[13], k__DCT_CONST_ROUNDING);
+  t_hi[13] = vaddq_s32(t_hi[13], k__DCT_CONST_ROUNDING);
+  t_lo[14] = vaddq_s32(t_lo[14], k__DCT_CONST_ROUNDING);
+  t_hi[14] = vaddq_s32(t_hi[14], k__DCT_CONST_ROUNDING);
+  t_lo[15] = vaddq_s32(t_lo[15], k__DCT_CONST_ROUNDING);
+  t_hi[15] = vaddq_s32(t_hi[15], k__DCT_CONST_ROUNDING);
+  t_lo[8] = vshrq_n_s32(t_lo[8], DCT_CONST_BITS);
+  t_hi[8] = vshrq_n_s32(t_hi[8], DCT_CONST_BITS);
+  t_lo[9] = vshrq_n_s32(t_lo[9], DCT_CONST_BITS);
+  t_hi[9] = vshrq_n_s32(t_hi[9], DCT_CONST_BITS);
+  t_lo[10] = vshrq_n_s32(t_lo[10], DCT_CONST_BITS);
+  t_hi[10] = vshrq_n_s32(t_hi[10], DCT_CONST_BITS);
+  t_lo[11] = vshrq_n_s32(t_lo[11], DCT_CONST_BITS);
+  t_hi[11] = vshrq_n_s32(t_hi[11], DCT_CONST_BITS);
+  t_lo[12] = vshrq_n_s32(t_lo[12], DCT_CONST_BITS);
+  t_hi[12] = vshrq_n_s32(t_hi[12], DCT_CONST_BITS);
+  t_lo[13] = vshrq_n_s32(t_lo[13], DCT_CONST_BITS);
+  t_hi[13] = vshrq_n_s32(t_hi[13], DCT_CONST_BITS);
+  t_lo[14] = vshrq_n_s32(t_lo[14], DCT_CONST_BITS);
+  t_hi[14] = vshrq_n_s32(t_hi[14], DCT_CONST_BITS);
+  t_lo[15] = vshrq_n_s32(t_lo[15], DCT_CONST_BITS);
+  t_hi[15] = vshrq_n_s32(t_hi[15], DCT_CONST_BITS);
+
+  // stage 3
+  s_lo[0] = t_lo[0];
+  s_hi[0] = t_hi[0];
+  s_lo[1] = t_lo[1];
+  s_hi[1] = t_hi[1];
+  s_lo[2] = t_lo[2];
+  s_hi[2] = t_hi[2];
+  s_lo[3] = t_lo[3];
+  s_hi[3] = t_hi[3];
+  // s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+  s_lo[4] = vaddq_s32(vmulq_n_s32(t_lo[4], cospi_8_64),
+                      vmulq_n_s32(t_lo[5], cospi_24_64));
+  s_hi[4] = vaddq_s32(vmulq_n_s32(t_hi[4], cospi_8_64),
+                      vmulq_n_s32(t_hi[5], cospi_24_64));
+  // s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+  s_lo[5] = vaddq_s32(vmulq_n_s32(t_lo[4], cospi_24_64),
+                      vmulq_n_s32(t_lo[5], -cospi_8_64));
+  s_hi[5] = vaddq_s32(vmulq_n_s32(t_hi[4], cospi_24_64),
+                      vmulq_n_s32(t_hi[5], -cospi_8_64));
+  // s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+  s_lo[6] = vaddq_s32(vmulq_n_s32(t_lo[6], -cospi_24_64),
+                      vmulq_n_s32(t_lo[7], cospi_8_64));
+  s_hi[6] = vaddq_s32(vmulq_n_s32(t_hi[6], -cospi_24_64),
+                      vmulq_n_s32(t_hi[7], cospi_8_64));
+  // s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+  s_lo[7] = vaddq_s32(vmulq_n_s32(t_lo[6], cospi_8_64),
+                      vmulq_n_s32(t_lo[7], cospi_24_64));
+  s_hi[7] = vaddq_s32(vmulq_n_s32(t_hi[6], cospi_8_64),
+                      vmulq_n_s32(t_hi[7], cospi_24_64));
+  s_lo[8] = t_lo[8];
+  s_hi[8] = t_hi[8];
+  s_lo[9] = t_lo[9];
+  s_hi[9] = t_hi[9];
+  s_lo[10] = t_lo[10];
+  s_hi[10] = t_hi[10];
+  s_lo[11] = t_lo[11];
+  s_hi[11] = t_hi[11];
+  // s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+  s_lo[12] = vaddq_s32(vmulq_n_s32(t_lo[12], cospi_8_64),
+                       vmulq_n_s32(t_lo[13], cospi_24_64));
+  s_hi[12] = vaddq_s32(vmulq_n_s32(t_hi[12], cospi_8_64),
+                       vmulq_n_s32(t_hi[13], cospi_24_64));
+  // s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+  s_lo[13] = vaddq_s32(vmulq_n_s32(t_lo[12], cospi_24_64),
+                       vmulq_n_s32(t_lo[13], -cospi_8_64));
+  s_hi[13] = vaddq_s32(vmulq_n_s32(t_hi[12], cospi_24_64),
+                       vmulq_n_s32(t_hi[13], -cospi_8_64));
+  // s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+  s_lo[14] = vaddq_s32(vmulq_n_s32(t_lo[14], -cospi_24_64),
+                       vmulq_n_s32(t_lo[15], cospi_8_64));
+  s_hi[14] = vaddq_s32(vmulq_n_s32(t_hi[14], -cospi_24_64),
+                       vmulq_n_s32(t_hi[15], cospi_8_64));
+  // s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+  s_lo[15] = vaddq_s32(vmulq_n_s32(t_lo[14], cospi_8_64),
+                       vmulq_n_s32(t_lo[15], cospi_24_64));
+  s_hi[15] = vaddq_s32(vmulq_n_s32(t_hi[14], cospi_8_64),
+                       vmulq_n_s32(t_hi[15], cospi_24_64));
+
+  // s0 + s2
+  t_lo[0] = vaddq_s32(s_lo[0], s_lo[2]);
+  t_hi[0] = vaddq_s32(s_hi[0], s_hi[2]);
+  // s1 + s3
+  t_lo[1] = vaddq_s32(s_lo[1], s_lo[3]);
+  t_hi[1] = vaddq_s32(s_hi[1], s_hi[3]);
+  // s0 - s2
+  t_lo[2] = vsubq_s32(s_lo[0], s_lo[2]);
+  t_hi[2] = vsubq_s32(s_hi[0], s_hi[2]);
+  // s1 - s3
+  t_lo[3] = vsubq_s32(s_lo[1], s_lo[3]);
+  t_hi[3] = vsubq_s32(s_hi[1], s_hi[3]);
+  // s4 + s6
+  t_lo[4] = vaddq_s32(s_lo[4], s_lo[6]);
+  t_hi[4] = vaddq_s32(s_hi[4], s_hi[6]);
+  // s5 + s7
+  t_lo[5] = vaddq_s32(s_lo[5], s_lo[7]);
+  t_hi[5] = vaddq_s32(s_hi[5], s_hi[7]);
+  // s4 - s6
+  t_lo[6] = vsubq_s32(s_lo[4], s_lo[6]);
+  t_hi[6] = vsubq_s32(s_hi[4], s_hi[6]);
+  // s5 - s7
+  t_lo[7] = vsubq_s32(s_lo[5], s_lo[7]);
+  t_hi[7] = vsubq_s32(s_hi[5], s_hi[7]);
+  // s8 + s10
+  t_lo[8] = vaddq_s32(s_lo[8], s_lo[10]);
+  t_hi[8] = vaddq_s32(s_hi[8], s_hi[10]);
+  // s9 + s11
+  t_lo[9] =
vaddq_s32(s_lo[9], s_lo[11]); + t_hi[9] = vaddq_s32(s_hi[9], s_hi[11]); + // s8 - s10 + t_lo[10] = vsubq_s32(s_lo[8], s_lo[10]); + t_hi[10] = vsubq_s32(s_hi[8], s_hi[10]); + // s9 - s11 + t_lo[11] = vsubq_s32(s_lo[9], s_lo[11]); + t_hi[11] = vsubq_s32(s_hi[9], s_hi[11]); + // s12 + s14 + t_lo[12] = vaddq_s32(s_lo[12], s_lo[14]); + t_hi[12] = vaddq_s32(s_hi[12], s_hi[14]); + // s13 + s15 + t_lo[13] = vaddq_s32(s_lo[13], s_lo[15]); + t_hi[13] = vaddq_s32(s_hi[13], s_hi[15]); + // s12 - s14 + t_lo[14] = vsubq_s32(s_lo[12], s_lo[14]); + t_hi[14] = vsubq_s32(s_hi[12], s_hi[14]); + // s13 - s15 + t_lo[15] = vsubq_s32(s_lo[13], s_lo[15]); + t_hi[15] = vsubq_s32(s_hi[13], s_hi[15]); + + t_lo[4] = vaddq_s32(t_lo[4], k__DCT_CONST_ROUNDING); + t_hi[4] = vaddq_s32(t_hi[4], k__DCT_CONST_ROUNDING); + t_lo[5] = vaddq_s32(t_lo[5], k__DCT_CONST_ROUNDING); + t_hi[5] = vaddq_s32(t_hi[5], k__DCT_CONST_ROUNDING); + t_lo[6] = vaddq_s32(t_lo[6], k__DCT_CONST_ROUNDING); + t_hi[6] = vaddq_s32(t_hi[6], k__DCT_CONST_ROUNDING); + t_lo[7] = vaddq_s32(t_lo[7], k__DCT_CONST_ROUNDING); + t_hi[7] = vaddq_s32(t_hi[7], k__DCT_CONST_ROUNDING); + t_lo[12] = vaddq_s32(t_lo[12], k__DCT_CONST_ROUNDING); + t_hi[12] = vaddq_s32(t_hi[12], k__DCT_CONST_ROUNDING); + t_lo[13] = vaddq_s32(t_lo[13], k__DCT_CONST_ROUNDING); + t_hi[13] = vaddq_s32(t_hi[13], k__DCT_CONST_ROUNDING); + t_lo[14] = vaddq_s32(t_lo[14], k__DCT_CONST_ROUNDING); + t_hi[14] = vaddq_s32(t_hi[14], k__DCT_CONST_ROUNDING); + t_lo[15] = vaddq_s32(t_lo[15], k__DCT_CONST_ROUNDING); + t_hi[15] = vaddq_s32(t_hi[15], k__DCT_CONST_ROUNDING); + t_lo[4] = vshrq_n_s32(t_lo[4], DCT_CONST_BITS); + t_hi[4] = vshrq_n_s32(t_hi[4], DCT_CONST_BITS); + t_lo[5] = vshrq_n_s32(t_lo[5], DCT_CONST_BITS); + t_hi[5] = vshrq_n_s32(t_hi[5], DCT_CONST_BITS); + t_lo[6] = vshrq_n_s32(t_lo[6], DCT_CONST_BITS); + t_hi[6] = vshrq_n_s32(t_hi[6], DCT_CONST_BITS); + t_lo[7] = vshrq_n_s32(t_lo[7], DCT_CONST_BITS); + t_hi[7] = vshrq_n_s32(t_hi[7], DCT_CONST_BITS); + t_lo[12] = vshrq_n_s32(t_lo[12], DCT_CONST_BITS); + t_hi[12] = vshrq_n_s32(t_hi[12], DCT_CONST_BITS); + t_lo[13] = vshrq_n_s32(t_lo[13], DCT_CONST_BITS); + t_hi[13] = vshrq_n_s32(t_hi[13], DCT_CONST_BITS); + t_lo[14] = vshrq_n_s32(t_lo[14], DCT_CONST_BITS); + t_hi[14] = vshrq_n_s32(t_hi[14], DCT_CONST_BITS); + t_lo[15] = vshrq_n_s32(t_lo[15], DCT_CONST_BITS); + t_hi[15] = vshrq_n_s32(t_hi[15], DCT_CONST_BITS); + + // stage 4 + // s2 = (-cospi_16_64) * (x2 + x3); + s_lo[2] = vmulq_n_s32(vaddq_s32(t_lo[2], t_lo[3]), -cospi_16_64); + s_hi[2] = vmulq_n_s32(vaddq_s32(t_hi[2], t_hi[3]), -cospi_16_64); + // s3 = cospi_16_64 * (x2 - x3); + s_lo[3] = vmulq_n_s32(vsubq_s32(t_lo[2], t_lo[3]), cospi_16_64); + s_hi[3] = vmulq_n_s32(vsubq_s32(t_hi[2], t_hi[3]), cospi_16_64); + // s6 = cospi_16_64 * (x6 + x7); + s_lo[6] = vmulq_n_s32(vaddq_s32(t_lo[6], t_lo[7]), cospi_16_64); + s_hi[6] = vmulq_n_s32(vaddq_s32(t_hi[6], t_hi[7]), cospi_16_64); + // s7 = cospi_16_64 * (-x6 + x7); + s_lo[7] = vmulq_n_s32(vsubq_s32(t_lo[7], t_lo[6]), cospi_16_64); + s_hi[7] = vmulq_n_s32(vsubq_s32(t_hi[7], t_hi[6]), cospi_16_64); + // s10 = cospi_16_64 * (x10 + x11); + s_lo[10] = vmulq_n_s32(vaddq_s32(t_lo[10], t_lo[11]), cospi_16_64); + s_hi[10] = vmulq_n_s32(vaddq_s32(t_hi[10], t_hi[11]), cospi_16_64); + // s11 = cospi_16_64 * (-x10 + x11); + s_lo[11] = vmulq_n_s32(vsubq_s32(t_lo[11], t_lo[10]), cospi_16_64); + s_hi[11] = vmulq_n_s32(vsubq_s32(t_hi[11], t_hi[10]), cospi_16_64); + // s14 = (-cospi_16_64) * (x14 + x15); + s_lo[14] = vmulq_n_s32(vaddq_s32(t_lo[14], t_lo[15]), 
-cospi_16_64); + s_hi[14] = vmulq_n_s32(vaddq_s32(t_hi[14], t_hi[15]), -cospi_16_64); + // s15 = cospi_16_64 * (x14 - x15); + s_lo[15] = vmulq_n_s32(vsubq_s32(t_lo[14], t_lo[15]), cospi_16_64); + s_hi[15] = vmulq_n_s32(vsubq_s32(t_hi[14], t_hi[15]), cospi_16_64); + + // final fdct_round_shift + t_lo[2] = vaddq_s32(s_lo[2], k__DCT_CONST_ROUNDING); + t_hi[2] = vaddq_s32(s_hi[2], k__DCT_CONST_ROUNDING); + t_lo[3] = vaddq_s32(s_lo[3], k__DCT_CONST_ROUNDING); + t_hi[3] = vaddq_s32(s_hi[3], k__DCT_CONST_ROUNDING); + t_lo[6] = vaddq_s32(s_lo[6], k__DCT_CONST_ROUNDING); + t_hi[6] = vaddq_s32(s_hi[6], k__DCT_CONST_ROUNDING); + t_lo[7] = vaddq_s32(s_lo[7], k__DCT_CONST_ROUNDING); + t_hi[7] = vaddq_s32(s_hi[7], k__DCT_CONST_ROUNDING); + t_lo[10] = vaddq_s32(s_lo[10], k__DCT_CONST_ROUNDING); + t_hi[10] = vaddq_s32(s_hi[10], k__DCT_CONST_ROUNDING); + t_lo[11] = vaddq_s32(s_lo[11], k__DCT_CONST_ROUNDING); + t_hi[11] = vaddq_s32(s_hi[11], k__DCT_CONST_ROUNDING); + t_lo[14] = vaddq_s32(s_lo[14], k__DCT_CONST_ROUNDING); + t_hi[14] = vaddq_s32(s_hi[14], k__DCT_CONST_ROUNDING); + t_lo[15] = vaddq_s32(s_lo[15], k__DCT_CONST_ROUNDING); + t_hi[15] = vaddq_s32(s_hi[15], k__DCT_CONST_ROUNDING); + + x_lo[2] = vshrn_n_s32(t_lo[2], DCT_CONST_BITS); + x_hi[2] = vshrn_n_s32(t_hi[2], DCT_CONST_BITS); + x_lo[3] = vshrn_n_s32(t_lo[3], DCT_CONST_BITS); + x_hi[3] = vshrn_n_s32(t_hi[3], DCT_CONST_BITS); + x_lo[6] = vshrn_n_s32(t_lo[6], DCT_CONST_BITS); + x_hi[6] = vshrn_n_s32(t_hi[6], DCT_CONST_BITS); + x_lo[7] = vshrn_n_s32(t_lo[7], DCT_CONST_BITS); + x_hi[7] = vshrn_n_s32(t_hi[7], DCT_CONST_BITS); + x_lo[10] = vshrn_n_s32(t_lo[10], DCT_CONST_BITS); + x_hi[10] = vshrn_n_s32(t_hi[10], DCT_CONST_BITS); + x_lo[11] = vshrn_n_s32(t_lo[11], DCT_CONST_BITS); + x_hi[11] = vshrn_n_s32(t_hi[11], DCT_CONST_BITS); + x_lo[14] = vshrn_n_s32(t_lo[14], DCT_CONST_BITS); + x_hi[14] = vshrn_n_s32(t_hi[14], DCT_CONST_BITS); + x_lo[15] = vshrn_n_s32(t_lo[15], DCT_CONST_BITS); + x_hi[15] = vshrn_n_s32(t_hi[15], DCT_CONST_BITS); + + // x0, x1, x4, x5, x8, x9, x12, x13 narrow down to 16-bits directly + x_lo[0] = vmovn_s32(t_lo[0]); + x_hi[0] = vmovn_s32(t_hi[0]); + x_lo[1] = vmovn_s32(t_lo[1]); + x_hi[1] = vmovn_s32(t_hi[1]); + x_lo[4] = vmovn_s32(t_lo[4]); + x_hi[4] = vmovn_s32(t_hi[4]); + x_lo[5] = vmovn_s32(t_lo[5]); + x_hi[5] = vmovn_s32(t_hi[5]); + x_lo[8] = vmovn_s32(t_lo[8]); + x_hi[8] = vmovn_s32(t_hi[8]); + x_lo[9] = vmovn_s32(t_lo[9]); + x_hi[9] = vmovn_s32(t_hi[9]); + x_lo[12] = vmovn_s32(t_lo[12]); + x_hi[12] = vmovn_s32(t_hi[12]); + x_lo[13] = vmovn_s32(t_lo[13]); + x_hi[13] = vmovn_s32(t_hi[13]); + + in[0] = vcombine_s16(x_lo[0], x_hi[0]); + in[1] = vnegq_s16(vcombine_s16(x_lo[8], x_hi[8])); + in[2] = vcombine_s16(x_lo[12], x_hi[12]); + in[3] = vnegq_s16(vcombine_s16(x_lo[4], x_hi[4])); + in[4] = vcombine_s16(x_lo[6], x_hi[6]); + in[5] = vcombine_s16(x_lo[14], x_hi[14]); + in[6] = vcombine_s16(x_lo[10], x_hi[10]); + in[7] = vcombine_s16(x_lo[2], x_hi[2]); + in[8] = vcombine_s16(x_lo[3], x_hi[3]); + in[9] = vcombine_s16(x_lo[11], x_hi[11]); + in[10] = vcombine_s16(x_lo[15], x_hi[15]); + in[11] = vcombine_s16(x_lo[7], x_hi[7]); + in[12] = vcombine_s16(x_lo[5], x_hi[5]); + in[13] = vnegq_s16(vcombine_s16(x_lo[13], x_hi[13])); + in[14] = vcombine_s16(x_lo[9], x_hi[9]); + in[15] = vnegq_s16(vcombine_s16(x_lo[1], x_hi[1])); +} + +static void fdct16x16_neon(int16x8_t *in0, int16x8_t *in1) { + // Left half. + fdct16_8col(in0); + // Right half. 
+  fdct16_8col(in1);
+  transpose_s16_16x16(in0, in1);
+}
+
+static void fadst16x16_neon(int16x8_t *in0, int16x8_t *in1) {
+  fadst16_8col(in0);
+  fadst16_8col(in1);
+  transpose_s16_16x16(in0, in1);
+}
+
+void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride,
+                       int tx_type) {
+  int16x8_t in0[16], in1[16];
+
+  switch (tx_type) {
+    case DCT_DCT: vpx_fdct16x16_neon(input, output, stride); break;
+    case ADST_DCT:
+      load_buffer_16x16(input, in0, in1, stride);
+      fadst16x16_neon(in0, in1);
+      right_shift_16x16(in0, in1);
+      fdct16x16_neon(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case DCT_ADST:
+      load_buffer_16x16(input, in0, in1, stride);
+      fdct16x16_neon(in0, in1);
+      right_shift_16x16(in0, in1);
+      fadst16x16_neon(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    default:
+      assert(tx_type == ADST_ADST);
+      load_buffer_16x16(input, in0, in1, stride);
+      fadst16x16_neon(in0, in1);
+      right_shift_16x16(in0, in1);
+      fadst16x16_neon(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+  }
+}
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 38e99165af..92a7fddb9d 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -127,6 +127,7 @@ endif
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_intrin_sse2.c
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_frame_scale_ssse3.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
 ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c

From 2c32425851cb89a1623ac7f3cf3d7bbba7aa32c6 Mon Sep 17 00:00:00 2001
From: Cheng Chen
Date: Wed, 30 Mar 2022 15:47:17 -0700
Subject: [PATCH 233/926] L2E: Make SimpleEncode take vp9 level as an input

Level conformance is standardized in vp9. If a specific target level is
set, the vp9 encoder is required to produce a conformant bitstream, with
limits on frame size, rate, minimum alt-ref distance, etc.

This change makes the SimpleEncode environment take the target level as
an input. To make existing tests pass, we set the level to 0.
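
As an illustrative sketch of the new API (hedged: the constructor argument
order and LEVEL_4_1 come from this patch's diff, but the wrapper function
and its name are hypothetical, and namespace qualifiers are omitted), a
caller now passes the level between num_frames and the input path:

  #include "vp9/simple_encode.h"

  void EncodeAtLevel41(const char *infile, const char *outfile) {
    // 352x288 @ 30/1 fps, target bitrate 1000, 17 frames, VP9 level 4.1.
    SimpleEncode simple_encode(352, 288, 30, 1, 1000, 17, LEVEL_4_1,
                               infile, outfile);
    simple_encode.ComputeFirstPassStats();  // first pass, as in the tests
    simple_encode.StartEncode();
    // ... call EncodeFrame() once per coding frame ...
    simple_encode.EndEncode();
  }

The updated tests pass LEVEL_UNKNOWN (0) instead, which leaves the encoder's
pre-change behavior intact.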
Change-Id: Ia35224f75c2fe50338b5b86a50c84355f5daf6fd --- test/simple_encode_test.cc | 37 +++++++++++++++++++------------------ vp9/simple_encode.cc | 26 ++++++++++++++------------ vp9/simple_encode.h | 24 +++++++++++++++++++++++- vp9/vp9_cx_iface.c | 6 +++++- vp9/vp9_cx_iface.h | 1 + 5 files changed, 62 insertions(+), 32 deletions(-) diff --git a/test/simple_encode_test.cc b/test/simple_encode_test.cc index 03e28e3387..01fc258566 100644 --- a/test/simple_encode_test.cc +++ b/test/simple_encode_test.cc @@ -37,13 +37,14 @@ class SimpleEncodeTest : public ::testing::Test { const int frame_rate_den_ = 1; const int target_bitrate_ = 1000; const int num_frames_ = 17; + const int target_level_ = LEVEL_UNKNOWN; const std::string in_file_path_str_ = libvpx_test::GetDataPath() + "/bus_352x288_420_f20_b8.yuv"; }; TEST_F(SimpleEncodeTest, ComputeFirstPassStats) { SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, - target_bitrate_, num_frames_, + target_bitrate_, num_frames_, target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); std::vector> frame_stats = @@ -64,7 +65,7 @@ TEST_F(SimpleEncodeTest, ComputeFirstPassStats) { TEST_F(SimpleEncodeTest, ObserveFirstPassMotionVectors) { SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, - target_bitrate_, num_frames_, + target_bitrate_, num_frames_, target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); std::vector> fps_motion_vectors = @@ -86,7 +87,7 @@ TEST_F(SimpleEncodeTest, ObserveFirstPassMotionVectors) { TEST_F(SimpleEncodeTest, GetCodingFrameNum) { SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, - target_bitrate_, num_frames_, + target_bitrate_, num_frames_, target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); const int num_coding_frames = simple_encode.GetCodingFrameNum(); @@ -95,7 +96,7 @@ TEST_F(SimpleEncodeTest, GetCodingFrameNum) { TEST_F(SimpleEncodeTest, EncodeFrame) { SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, - target_bitrate_, num_frames_, + target_bitrate_, num_frames_, target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); int num_coding_frames = simple_encode.GetCodingFrameNum(); @@ -138,7 +139,7 @@ TEST_F(SimpleEncodeTest, EncodeFrame) { TEST_F(SimpleEncodeTest, ObserveKeyFrameMap) { SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, - target_bitrate_, num_frames_, + target_bitrate_, num_frames_, target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); std::vector key_frame_map = simple_encode.ObserveKeyFrameMap(); @@ -167,7 +168,7 @@ TEST_F(SimpleEncodeTest, ObserveKeyFrameMap) { TEST_F(SimpleEncodeTest, EncodeFrameWithTargetFrameBits) { SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, - target_bitrate_, num_frames_, + target_bitrate_, num_frames_, target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); const int num_coding_frames = simple_encode.GetCodingFrameNum(); @@ -205,7 +206,7 @@ TEST_F(SimpleEncodeTest, EncodeFrameWithTargetFrameBits) { TEST_F(SimpleEncodeTest, EncodeFrameWithQuantizeIndex) { SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, - target_bitrate_, num_frames_, + target_bitrate_, num_frames_, target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); const int num_coding_frames = simple_encode.GetCodingFrameNum(); @@ -237,7 +238,7 @@ 
TEST_F(SimpleEncodeTest, EncodeConsistencyTest) { // The first encode. SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, target_bitrate_, num_frames_, - in_file_path_str_.c_str()); + target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); const int num_coding_frames = simple_encode.GetCodingFrameNum(); simple_encode.StartEncode(); @@ -257,7 +258,7 @@ TEST_F(SimpleEncodeTest, EncodeConsistencyTest) { // The second encode with quantize index got from the first encode. SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, target_bitrate_, num_frames_, - in_file_path_str_.c_str()); + target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); const int num_coding_frames = simple_encode.GetCodingFrameNum(); EXPECT_EQ(static_cast(num_coding_frames), @@ -286,7 +287,7 @@ TEST_F(SimpleEncodeTest, EncodeConsistencyTest2) { const int num_units_4x4 = num_rows_4x4 * num_cols_4x4; // The first encode. SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, - target_bitrate_, num_frames_, + target_bitrate_, num_frames_, target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); const int num_coding_frames = simple_encode.GetCodingFrameNum(); @@ -309,7 +310,7 @@ TEST_F(SimpleEncodeTest, EncodeConsistencyTest2) { // The second encode. SimpleEncode simple_encode_2(width_, height_, frame_rate_num_, frame_rate_den_, target_bitrate_, num_frames_, - in_file_path_str_.c_str()); + target_level_, in_file_path_str_.c_str()); simple_encode_2.ComputeFirstPassStats(); const int num_coding_frames_2 = simple_encode_2.GetCodingFrameNum(); simple_encode_2.StartEncode(); @@ -357,7 +358,7 @@ TEST_F(SimpleEncodeTest, EncodeConsistencyTest3) { const int num_units_4x4 = num_rows_4x4 * num_cols_4x4; // The first encode. SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, - target_bitrate_, num_frames_, + target_bitrate_, num_frames_, target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); const int num_coding_frames = simple_encode.GetCodingFrameNum(); @@ -377,7 +378,7 @@ TEST_F(SimpleEncodeTest, EncodeConsistencyTest3) { // The second encode. SimpleEncode simple_encode_2(width_, height_, frame_rate_num_, frame_rate_den_, target_bitrate_, num_frames_, - in_file_path_str_.c_str()); + target_level_, in_file_path_str_.c_str()); simple_encode_2.ComputeFirstPassStats(); const int num_coding_frames_2 = simple_encode_2.GetCodingFrameNum(); simple_encode_2.StartEncode(); @@ -417,7 +418,7 @@ TEST_F(SimpleEncodeTest, EncodeConsistencySetExternalGroupOfPicturesMap) { // The first encode. SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, target_bitrate_, num_frames_, - in_file_path_str_.c_str()); + target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); simple_encode.StartEncode(); @@ -449,7 +450,7 @@ TEST_F(SimpleEncodeTest, EncodeConsistencySetExternalGroupOfPicturesMap) { // The external arfs are the same as the first encode. 
SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, target_bitrate_, num_frames_, - in_file_path_str_.c_str()); + target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); simple_encode.SetExternalGroupOfPicturesMap(gop_map.data(), gop_map.size()); const int num_coding_frames = simple_encode.GetCodingFrameNum(); @@ -471,7 +472,7 @@ TEST_F(SimpleEncodeTest, EncodeConsistencySetExternalGroupOfPicturesMap) { TEST_F(SimpleEncodeTest, SetExternalGroupOfPicturesMap) { SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, - target_bitrate_, num_frames_, + target_bitrate_, num_frames_, target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); @@ -541,7 +542,7 @@ TEST_F(SimpleEncodeTest, GetEncodeFrameInfo) { // Makes sure that the encode_frame_info obtained from GetEncodeFrameInfo() // matches the counterpart in encode_frame_result obtained from EncodeFrame() SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, - target_bitrate_, num_frames_, + target_bitrate_, num_frames_, target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); const int num_coding_frames = simple_encode.GetCodingFrameNum(); @@ -558,7 +559,7 @@ TEST_F(SimpleEncodeTest, GetEncodeFrameInfo) { TEST_F(SimpleEncodeTest, GetFramePixelCount) { SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, - target_bitrate_, num_frames_, + target_bitrate_, num_frames_, target_level_, in_file_path_str_.c_str()); EXPECT_EQ(simple_encode.GetFramePixelCount(), static_cast(width_ * height_ * 3 / 2)); diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index 6ba37a321c..1a0ada119f 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -782,11 +782,12 @@ static void UpdateEncodeConfig(const EncodeConfig &config, static VP9EncoderConfig GetEncodeConfig( int frame_width, int frame_height, vpx_rational_t frame_rate, - int target_bitrate, int encode_speed, vpx_enc_pass enc_pass, + int target_bitrate, int encode_speed, int target_level, + vpx_enc_pass enc_pass, const std::vector &encode_config_list) { - VP9EncoderConfig oxcf = - vp9_get_encoder_config(frame_width, frame_height, frame_rate, - target_bitrate, encode_speed, enc_pass); + VP9EncoderConfig oxcf = vp9_get_encoder_config( + frame_width, frame_height, frame_rate, target_bitrate, encode_speed, + target_level, enc_pass); for (const auto &config : encode_config_list) { UpdateEncodeConfig(config, &oxcf); } @@ -799,7 +800,7 @@ static VP9EncoderConfig GetEncodeConfig( SimpleEncode::SimpleEncode(int frame_width, int frame_height, int frame_rate_num, int frame_rate_den, - int target_bitrate, int num_frames, + int target_bitrate, int num_frames, int target_level, const char *infile_path, const char *outfile_path) { impl_ptr_ = std::unique_ptr(new EncodeImpl()); frame_width_ = frame_width; @@ -809,6 +810,7 @@ SimpleEncode::SimpleEncode(int frame_width, int frame_height, target_bitrate_ = target_bitrate; num_frames_ = num_frames; encode_speed_ = 0; + target_level_ = target_level; frame_coding_index_ = 0; show_frame_count_ = 0; @@ -860,9 +862,9 @@ StatusCode SimpleEncode::DumpEncodeConfigs(int pass, FILE *fp) { } const vpx_rational_t frame_rate = make_vpx_rational(frame_rate_num_, frame_rate_den_); - const VP9EncoderConfig oxcf = - GetEncodeConfig(frame_width_, frame_height_, frame_rate, target_bitrate_, - encode_speed_, enc_pass, impl_ptr_->encode_config_list); + const VP9EncoderConfig oxcf = GetEncodeConfig( + frame_width_, 
frame_height_, frame_rate, target_bitrate_, encode_speed_, + target_level_, enc_pass, impl_ptr_->encode_config_list); vp9_dump_encoder_config(&oxcf, fp); return StatusOk; } @@ -872,7 +874,7 @@ void SimpleEncode::ComputeFirstPassStats() { make_vpx_rational(frame_rate_num_, frame_rate_den_); const VP9EncoderConfig oxcf = GetEncodeConfig( frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_, - VPX_RC_FIRST_PASS, impl_ptr_->encode_config_list); + target_level_, VPX_RC_FIRST_PASS, impl_ptr_->encode_config_list); impl_ptr_->cpi = init_encoder(&oxcf, impl_ptr_->img_fmt); struct lookahead_ctx *lookahead = impl_ptr_->cpi->lookahead; int i; @@ -1038,7 +1040,7 @@ void SimpleEncode::StartEncode() { make_vpx_rational(frame_rate_num_, frame_rate_den_); VP9EncoderConfig oxcf = GetEncodeConfig( frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_, - VPX_RC_LAST_PASS, impl_ptr_->encode_config_list); + target_level_, VPX_RC_LAST_PASS, impl_ptr_->encode_config_list); vpx_fixed_buf_t stats; stats.buf = GetVectorData(impl_ptr_->first_pass_stats); @@ -1266,7 +1268,7 @@ int SimpleEncode::GetCodingFrameNum() const { make_vpx_rational(frame_rate_num_, frame_rate_den_); const VP9EncoderConfig oxcf = GetEncodeConfig( frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_, - VPX_RC_LAST_PASS, impl_ptr_->encode_config_list); + target_level_, VPX_RC_LAST_PASS, impl_ptr_->encode_config_list); FRAME_INFO frame_info = vp9_get_frame_info(&oxcf); fps_init_first_pass_info(&twopass.first_pass_info, GetVectorData(impl_ptr_->first_pass_stats), @@ -1285,7 +1287,7 @@ std::vector SimpleEncode::ComputeKeyFrameMap() const { make_vpx_rational(frame_rate_num_, frame_rate_den_); const VP9EncoderConfig oxcf = GetEncodeConfig( frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_, - VPX_RC_LAST_PASS, impl_ptr_->encode_config_list); + target_level_, VPX_RC_LAST_PASS, impl_ptr_->encode_config_list); TWO_PASS twopass; fps_init_first_pass_info(&twopass.first_pass_info, GetVectorData(impl_ptr_->first_pass_stats), diff --git a/vp9/simple_encode.h b/vp9/simple_encode.h index 8ec7069e83..7920e95ee9 100644 --- a/vp9/simple_encode.h +++ b/vp9/simple_encode.h @@ -44,6 +44,26 @@ enum RefFrameType { kRefFrameTypeNone = -1, }; +enum VP9_LEVEL { + LEVEL_UNKNOWN = 0, + LEVEL_AUTO = 1, + LEVEL_1 = 10, + LEVEL_1_1 = 11, + LEVEL_2 = 20, + LEVEL_2_1 = 21, + LEVEL_3 = 30, + LEVEL_3_1 = 31, + LEVEL_4 = 40, + LEVEL_4_1 = 41, + LEVEL_5 = 50, + LEVEL_5_1 = 51, + LEVEL_5_2 = 52, + LEVEL_6 = 60, + LEVEL_6_1 = 61, + LEVEL_6_2 = 62, + LEVEL_MAX = 255 +}; + enum GopMapFlag { kGopMapFlagStart = 1 << 0, // Indicate this location is the start of a group of pictures. @@ -343,7 +363,8 @@ class SimpleEncode { // format. 
SimpleEncode(int frame_width, int frame_height, int frame_rate_num, int frame_rate_den, int target_bitrate, int num_frames, - const char *infile_path, const char *outfile_path = nullptr); + int target_level, const char *infile_path, + const char *outfile_path = nullptr); ~SimpleEncode(); SimpleEncode(SimpleEncode &) = delete; SimpleEncode &operator=(const SimpleEncode &) = delete; @@ -513,6 +534,7 @@ class SimpleEncode { int target_bitrate_; int num_frames_; int encode_speed_; + int target_level_; std::FILE *in_file_; std::FILE *out_file_; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 76274437c6..b809ab3e6f 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -2143,6 +2143,7 @@ static vp9_extracfg get_extra_cfg() { VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height, vpx_rational_t frame_rate, int target_bitrate, int encode_speed, + int target_level, vpx_enc_pass enc_pass) { /* This function will generate the same VP9EncoderConfig used by the * vpxenc command given below. @@ -2154,6 +2155,7 @@ VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height, * FPS: frame_rate * BITRATE: target_bitrate * CPU_USED:encode_speed + * TARGET_LEVEL: target_level * * INPUT, OUTPUT, LIMIT will not affect VP9EncoderConfig * @@ -2166,6 +2168,7 @@ VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height, * FPS=30/1 * LIMIT=150 * CPU_USED=0 + * TARGET_LEVEL=0 * ./vpxenc --limit=$LIMIT --width=$WIDTH --height=$HEIGHT --fps=$FPS * --lag-in-frames=25 \ * --codec=vp9 --good --cpu-used=CPU_USED --threads=0 --profile=0 \ @@ -2174,7 +2177,7 @@ VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height, * --minsection-pct=0 --maxsection-pct=150 --arnr-maxframes=7 --psnr \ * --arnr-strength=5 --sharpness=0 --undershoot-pct=100 --overshoot-pct=100 \ * --frame-parallel=0 --tile-columns=0 --cpu-used=0 --end-usage=vbr \ - * --target-bitrate=$BITRATE -o $OUTPUT $INPUT + * --target-bitrate=$BITRATE --target-level=0 -o $OUTPUT $INPUT */ VP9EncoderConfig oxcf; @@ -2192,6 +2195,7 @@ VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height, oxcf.frame_parallel_decoding_mode = 0; oxcf.two_pass_vbrmax_section = 150; oxcf.speed = abs(encode_speed); + oxcf.target_level = target_level; return oxcf; } diff --git a/vp9/vp9_cx_iface.h b/vp9/vp9_cx_iface.h index 01338adb4e..f2de8507ff 100644 --- a/vp9/vp9_cx_iface.h +++ b/vp9/vp9_cx_iface.h @@ -20,6 +20,7 @@ extern "C" { VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height, vpx_rational_t frame_rate, int target_bitrate, int encode_speed, + int target_level, vpx_enc_pass enc_pass); void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf, FILE *fp); From 2200039d33c49a9f7a5c438656df143755b022c4 Mon Sep 17 00:00:00 2001 From: Johann Date: Wed, 30 Mar 2022 14:57:46 +0900 Subject: [PATCH 234/926] quantize: replace highbd versions The optimized quantize functions were already built to handle highbd values. The only difference is the clamping. All highbd functions expand to 32bits when running in highbd mode. Removes vpx_highbd_quantize_32x32_sse2 as it is slower than the C version in the worst case. 
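
To make the clamping point concrete: in highbd builds the coefficient type
tran_low_t is 32 bits wide, so the same kernels can carry 10- and 12-bit
intermediates, and only the final clamp depends on bit depth. A minimal
sketch of that idea (clamp_coeff is a hypothetical helper, not the
library's exact code; the tran_low_t typedef paraphrases vpx_dsp):

  #include <stdint.h>

  #if CONFIG_VP9_HIGHBITDEPTH
  typedef int32_t tran_low_t;  // wide enough for 10/12-bit intermediates
  #else
  typedef int16_t tran_low_t;  // 8-bit input coefficients fit in 16 bits
  #endif

  // Hypothetical clamp: 8-bit streams saturate to the int16 range
  // [-2^15, 2^15 - 1]; each extra input bit widens the range accordingly.
  static tran_low_t clamp_coeff(int32_t v, int bit_depth) {
    const int32_t hi = (1 << (bit_depth + 7)) - 1;
    const int32_t lo = -(1 << (bit_depth + 7));
    return (tran_low_t)(v < lo ? lo : (v > hi ? hi : v));
  }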
Bug: webm:1586 Change-Id: I49bf8a6a2041f78450bf43a4f655c67656b0f8d9 --- test/vp9_quantize_test.cc | 42 +++--- vp9/encoder/vp9_encodemb.c | 48 +++---- vp9/encoder/vp9_quantize.c | 8 -- vpx_dsp/quantize.h | 16 +++ vpx_dsp/vpx_dsp.mk | 3 - vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 -- vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 152 ---------------------- 7 files changed, 66 insertions(+), 211 deletions(-) delete mode 100644 vpx_dsp/x86/highbd_quantize_intrin_sse2.c diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index d54f1bc9cd..5773cd9835 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -30,6 +30,7 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/msvc.h" #include "vpx_ports/vpx_timer.h" +#include "vpx_dsp/quantize.h" using libvpx_test::ACMRandom; using libvpx_test::Buffer; @@ -464,22 +465,12 @@ using std::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, - ::testing::Values( - make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16, - false), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_10, 16, false), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_12, 16, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); - + ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(&vpx_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false))); #else INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, @@ -519,6 +510,24 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_SSSE3 #if HAVE_AVX +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + AVX, VP9QuantizeTest, + ::testing::Values( + make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c, VPX_BITS_8, 16, + false), + make_tuple(&vpx_quantize_b_avx, &vpx_highbd_quantize_b_c, VPX_BITS_10, + 16, false), + make_tuple(&vpx_quantize_b_avx, &vpx_highbd_quantize_b_c, VPX_BITS_12, + 16, false), + make_tuple(&vpx_quantize_b_32x32_avx, &vpx_highbd_quantize_b_32x32_c, + VPX_BITS_8, 32, false), + make_tuple(&vpx_quantize_b_32x32_avx, &vpx_highbd_quantize_b_32x32_c, + VPX_BITS_10, 32, false), + make_tuple(&vpx_quantize_b_32x32_avx, &vpx_highbd_quantize_b_32x32_c, + VPX_BITS_12, 32, false))); + +#else INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c, @@ -526,6 +535,7 @@ INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest, make_tuple(&vpx_quantize_b_32x32_avx, &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, false))); +#endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_AVX #if VPX_ARCH_X86_64 && HAVE_AVX2 diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index fa222f9dcf..e708555f89 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -511,28 +511,28 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_highbd_quantize_b_32x32( - coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, - 
dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); + vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 256, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant, p->quant_shift, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); break; case TX_8X8: vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 64, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 16, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_quantize_b(coeff, 16, p->zbin, p->round, p->quant, p->quant_shift, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); break; } return; @@ -857,9 +857,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_highbd_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, dst_stride, xd->bd); highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_highbd_quantize_b_32x32( - coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); + vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, + eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -876,9 +876,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); else vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type); - vpx_highbd_quantize_b(coeff, 256, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -896,9 +896,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); else vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type); - vpx_highbd_quantize_b(coeff, 64, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -917,9 +917,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type); else x->fwd_txfm4x4(src_diff, coeff, diff_stride); 
- vpx_highbd_quantize_b(coeff, 16, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + vpx_quantize_b(coeff, 16, p->zbin, p->round, p->quant, p->quant_shift, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index 9058997b0f..1c401e96b4 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -164,14 +164,6 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, return; } -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, p->zbin, - p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, &p->eobs[block], scan, iscan); - return; - } -#endif vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, &p->eobs[block], scan, iscan); diff --git a/vpx_dsp/quantize.h b/vpx_dsp/quantize.h index 8e138445e2..0fcd77941b 100644 --- a/vpx_dsp/quantize.h +++ b/vpx_dsp/quantize.h @@ -37,6 +37,22 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant, uint16_t *eob_ptr); + +// Only used for reference. The optimized versions can handle HBD. +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); + +void vpx_highbd_quantize_b_32x32_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); #endif #ifdef __cplusplus diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index a880e1d285..b930fbd0a3 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -318,9 +318,6 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.h DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c DSP_SRCS-$(HAVE_VSX) += ppc/quantize_vsx.c -ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) -DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c -endif # avg DSP_SRCS-yes += avg.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 372903aff2..63097b0b6c 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -714,14 +714,6 @@ () add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx vsx/; - - if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t 
*quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_highbd_quantize_b sse2/; - - add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_highbd_quantize_b_32x32 sse2/; - } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER if (vpx_config("CONFIG_ENCODERS") eq "yes") { diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c deleted file mode 100644 index 4535a0f7a2..0000000000 --- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include -#include - -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/vpx_dsp_common.h" -#include "vpx_mem/vpx_mem.h" -#include "vpx_ports/mem.h" - -#if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - int i, j, non_zero_regs = (int)count / 4, eob_i = -1; - __m128i zbins[2]; - __m128i nzbins[2]; - - zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1], - (int)zbin_ptr[0]); - zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]); - - nzbins[0] = _mm_setzero_si128(); - nzbins[1] = _mm_setzero_si128(); - nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); - nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); - - (void)scan; - - memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); - - // Pre-scan pass - for (i = ((int)count / 4) - 1; i >= 0; i--) { - __m128i coeffs, cmp1, cmp2; - int test; - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); - cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); - cmp1 = _mm_and_si128(cmp1, cmp2); - test = _mm_movemask_epi8(cmp1); - if (test == 0xffff) - non_zero_regs--; - else - break; - } - - // Quantization pass: - for (i = 0; i < non_zero_regs; i++) { - __m128i coeffs, coeffs_sign, tmp1, tmp2; - int test; - int abs_coeff[4]; - int coeff_sign[4]; - - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - coeffs_sign = _mm_srai_epi32(coeffs, 31); - coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); - tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); - tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); - tmp1 = _mm_or_si128(tmp1, tmp2); - test = _mm_movemask_epi8(tmp1); - _mm_storeu_si128((__m128i *)abs_coeff, coeffs); - _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); - - for (j = 0; j < 4; j++) { - if (test & (1 << (4 * j))) { - int k = 4 * i + j; - const int64_t tmp3 = 
abs_coeff[j] + round_ptr[k != 0]; - const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; - const uint32_t abs_qcoeff = - (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); - qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; - dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; - if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; - } - } - } - *eob_ptr = eob_i + 1; -} - -void vpx_highbd_quantize_b_32x32_sse2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - __m128i zbins[2]; - __m128i nzbins[2]; - int idx = 0; - int idx_arr[1024]; - int i, eob = -1; - const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); - const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); - (void)scan; - - zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); - zbins[1] = _mm_set1_epi32(zbin1_tmp); - - nzbins[0] = _mm_setzero_si128(); - nzbins[1] = _mm_setzero_si128(); - nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); - nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - // Pre-scan pass - for (i = 0; i < n_coeffs / 4; i++) { - __m128i coeffs, cmp1, cmp2; - int test; - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); - cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); - cmp1 = _mm_and_si128(cmp1, cmp2); - test = _mm_movemask_epi8(cmp1); - if (!(test & 0xf)) idx_arr[idx++] = i * 4; - if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; - if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; - if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; - } - - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = idx_arr[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; - } - *eob_ptr = eob + 1; -} -#endif From 3c98caa6a4eea59da3c5b37b128e18f16b722080 Mon Sep 17 00:00:00 2001 From: Johann Date: Thu, 31 Mar 2022 10:43:29 +0900 Subject: [PATCH 235/926] subpel variance: add speed test Was used to verify assembly speed versus an attempt to rewrite in intrinsics. 
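Note for anyone repeating the measurement: the test is checked in as
DISABLED_Speed, so gtest skips it unless disabled tests are explicitly
enabled. Assuming the usual test_libvpx binary name, an invocation
along these lines runs it:

  ./test_libvpx --gtest_also_run_disabled_tests \
      --gtest_filter='*SubpelVariance*DISABLED_Speed*'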
Change-Id: I011fe5494334b8fcda04b9d54c6093dbcfc55710
---
 test/variance_test.cc | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/test/variance_test.cc b/test/variance_test.cc
index 1b76b20419..660bbd0ed7 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -596,6 +596,7 @@ class SubpelVarianceTest
 protected:
   void RefTest();
   void ExtremeRefTest();
+  void SpeedTest();

   ACMRandom rnd_;
   uint8_t *src_;
@@ -681,6 +682,37 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::ExtremeRefTest() {
   }
 }

+template <typename SubpelVarianceFunctionType>
+void SubpelVarianceTest<SubpelVarianceFunctionType>::SpeedTest() {
+  // The only interesting points are 0, 4, and anything else. To make the loops
+  // simple we will use 0, 2 and 4.
+  for (int x = 0; x <= 4; x += 2) {
+    for (int y = 0; y <= 4; y += 2) {
+      if (!use_high_bit_depth()) {
+        memset(src_, 25, block_size());
+        memset(ref_, 50, block_size());
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        vpx_memset16(CONVERT_TO_SHORTPTR(src_), 25, block_size());
+        vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 50, block_size());
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+      unsigned int sse;
+      vpx_usec_timer timer;
+      vpx_usec_timer_start(&timer);
+      for (int i = 0; i < 1000000000 / block_size(); ++i) {
+        const uint32_t variance =
+            params_.func(ref_, width() + 1, x, y, src_, width(), &sse);
+        (void)variance;
+      }
+      vpx_usec_timer_mark(&timer);
+      const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+      printf("SubpelVariance %dx%d xoffset: %d yoffset: %d time: %5d ms\n",
+             width(), height(), x, y, elapsed_time / 1000);
+    }
+  }
+}
+
 template <>
 void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() {
   for (int x = 0; x < 8; ++x) {
@@ -736,6 +768,7 @@ TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
 TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
 TEST_P(VpxSubpelVarianceTest, Ref) { RefTest(); }
 TEST_P(VpxSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(VpxSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
 TEST_P(VpxSubpelAvgVarianceTest, Ref) { RefTest(); }

 INSTANTIATE_TEST_SUITE_P(C, SumOfSquaresTest,

From e6ede58a5a6cd34b82321c1b2c36ec14984e6ecd Mon Sep 17 00:00:00 2001
From: Johann
Date: Mon, 4 Nov 2019 15:58:07 -0600
Subject: [PATCH 236/926] remove unused vp8_encode_intra parameter

As a follow-up, also remove it from the other functions that carried
the argument.

BUG=webm:1612

Change-Id: I9d3cb785ab0d68c6fcae185043c896d8a135e284
---
 vp8/encoder/encodeframe.c | 20 ++++++--------------
 vp8/encoder/encodeintra.c |  3 +--
 vp8/encoder/encodeintra.h |  2 +-
 vp8/encoder/firstpass.c   |  2 +-
 4 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 4df35f6edb..620107500a 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -69,10 +69,9 @@ static const unsigned char VP8_VAR_OFFS[16] = { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

 /* Original activity measure from Tim T's code. */
-static unsigned int tt_activity_measure(VP8_COMP *cpi, MACROBLOCK *x) {
+static unsigned int tt_activity_measure(MACROBLOCK *x) {
   unsigned int act;
   unsigned int sse;
-  (void)cpi;
   /* TODO: This could also be done over smaller areas (8x8), but that would
    * require extensive changes elsewhere, as lambda is assumed to be fixed
    * over an entire MB in most of the code.
@@ -90,28 +89,21 @@ static unsigned int tt_activity_measure(VP8_COMP *cpi, MACROBLOCK *x) {
   return act;
 }

-/* Stub for alternative experimental activity measures. */
-static unsigned int alt_activity_measure(VP8_COMP *cpi, MACROBLOCK *x,
-                                         int use_dc_pred) {
-  return vp8_encode_intra(cpi, x, use_dc_pred);
-}
-
 /* Measure the activity of the current macroblock
  * What we measure here is TBD so abstracted to this function
  */
 #define ALT_ACT_MEASURE 1
-static unsigned int mb_activity_measure(VP8_COMP *cpi, MACROBLOCK *x,
-                                        int mb_row, int mb_col) {
+static unsigned int mb_activity_measure(MACROBLOCK *x, int mb_row, int mb_col) {
   unsigned int mb_activity;

   if (ALT_ACT_MEASURE) {
     int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);

-    /* Or use and alternative. */
-    mb_activity = alt_activity_measure(cpi, x, use_dc_pred);
+    /* Or use an alternative. */
+    mb_activity = vp8_encode_intra(x, use_dc_pred);
   } else {
     /* Original activity measure from Tim T's code. */
-    mb_activity = tt_activity_measure(cpi, x);
+    mb_activity = tt_activity_measure(x);
   }

   if (mb_activity < VP8_ACTIVITY_AVG_MIN) mb_activity = VP8_ACTIVITY_AVG_MIN;
@@ -264,7 +256,7 @@ static void build_activity_map(VP8_COMP *cpi) {
     vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);

     /* measure activity */
-    mb_activity = mb_activity_measure(cpi, x, mb_row, mb_col);
+    mb_activity = mb_activity_measure(x, mb_row, mb_col);

     /* Keep frame sum */
     activity_sum += mb_activity;
diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
index f89e7cb1fa..7d448c0ea0 100644
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -18,10 +18,9 @@
 #include "vp8/common/invtrans.h"
 #include "encodeintra.h"

-int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred) {
+int vp8_encode_intra(MACROBLOCK *x, int use_dc_pred) {
   int i;
   int intra_pred_var = 0;
-  (void)cpi;

   if (use_dc_pred) {
     x->e_mbd.mode_info_context->mbmi.mode = DC_PRED;
diff --git a/vp8/encoder/encodeintra.h b/vp8/encoder/encodeintra.h
index 021dc5ed76..9a378abf49 100644
--- a/vp8/encoder/encodeintra.h
+++ b/vp8/encoder/encodeintra.h
@@ -16,7 +16,7 @@ extern "C" {
 #endif

-int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred);
+int vp8_encode_intra(MACROBLOCK *x, int use_dc_pred);
 void vp8_encode_intra16x16mby(MACROBLOCK *x);
 void vp8_encode_intra16x16mbuv(MACROBLOCK *x);
 void vp8_encode_intra4x4mby(MACROBLOCK *mb);
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index 981c0fde35..14164ebc51 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -567,7 +567,7 @@ void vp8_first_pass(VP8_COMP *cpi) {
     vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);

     /* do intra 16x16 prediction */
-    this_error = vp8_encode_intra(cpi, x, use_dc_pred);
+    this_error = vp8_encode_intra(x, use_dc_pred);

     /* "intrapenalty" below deals with situations where the intra
      * and inter error scores are very low (eg a plain black frame)

From 89cfe3835c47dabf77d38edb3af190155984fa9a Mon Sep 17 00:00:00 2001
From: Johann
Date: Wed, 30 Mar 2022 15:33:40 +0900
Subject: [PATCH 237/926] quantize: remove highbd version

The only difference between the two code paths is the clamp. For 8-bit
it is purely an optimization. The values outside this range will still
saturate.
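To make the "purely an optimization" point concrete, here is a small
standalone program comparing the clamped 8-bit arithmetic against the
wide highbd arithmetic near the int16 boundary, using the non-32x32
shift amounts kept by the merged vpx_quantize_b_c below. The
round/quant/quant_shift constants are illustrative, not values taken
from the patch:

  #include <limits.h>
  #include <stdint.h>
  #include <stdio.h>

  static int clamp(int value, int low, int high) {
    return value < low ? low : (value > high ? high : value);
  }

  int main(void) {
    const int round = 32, quant = 20000, quant_shift = 24576;
    int abs_coeff;
    for (abs_coeff = 32700; abs_coeff <= 32767; abs_coeff += 67) {
      /* 8-bit path: clamp to int16 range, then the narrow multiply chain. */
      const int tmp = clamp(abs_coeff + round, INT16_MIN, INT16_MAX);
      const int q8 = ((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16;
      /* highbd path: no clamp, intermediates widened to 64 bits. */
      const int64_t tmp1 = abs_coeff + round;
      const int64_t tmp2 = ((tmp1 * quant) >> 16) + tmp1;
      const int qhbd = (int)((tmp2 * quant_shift) >> 16);
      printf("abs_coeff=%d clamped=%d wide=%d\n", abs_coeff, q8, qhbd);
    }
    return 0;
  }

For any coefficient whose rounded value still fits in int16 the two
paths agree exactly; at the very top of the range the clamped path
saturates slightly earlier, which is the harmless saturation the
message above refers to.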
Change-Id: I2a770b140690d99e151b00957789bd72f7a11e13 --- test/vp9_quantize_test.cc | 89 +++++++++++++++++++++---- vpx_dsp/quantize.c | 132 ++++++++------------------------------ vpx_dsp/quantize.h | 17 +---- 3 files changed, 104 insertions(+), 134 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 5773cd9835..fcc7ff99ec 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -30,7 +30,6 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/msvc.h" #include "vpx_ports/vpx_timer.h" -#include "vpx_dsp/quantize.h" using libvpx_test::ACMRandom; using libvpx_test::Buffer; @@ -467,10 +466,13 @@ INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_sse2, &vpx_highbd_quantize_b_c, + make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_10, 16, false), - make_tuple(&vpx_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_12, 16, false))); + make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, + VPX_BITS_12, 16, false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, + 16, true))); #else INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, @@ -484,6 +486,28 @@ INSTANTIATE_TEST_SUITE_P( #if HAVE_SSSE3 #if VPX_ARCH_X86_64 +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + SSSE3, VP9QuantizeTest, + ::testing::Values( + make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_8, 16, + false), + make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_10, 16, + false), + make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_12, 16, + false), + make_tuple(&vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, + VPX_BITS_8, 32, false), + make_tuple(&vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, + VPX_BITS_10, 32, false), + make_tuple(&vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, + VPX_BITS_12, 32, false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 32, + true))); +#else INSTANTIATE_TEST_SUITE_P( SSSE3, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, @@ -497,6 +521,24 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 32, true))); +#endif // #CONFIG_VP9_HIGHBITDEPTH +#else +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + SSSE3, VP9QuantizeTest, + ::testing::Values( + make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_8, 16, + false), + make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_10, 16, + false), + make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_12, 16, + false), + make_tuple(&vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, + VPX_BITS_8, 32, false), + make_tuple(&vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, + VPX_BITS_10, 32, false), + make_tuple(&vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, + VPX_BITS_12, 32, false))); #else INSTANTIATE_TEST_SUITE_P( SSSE3, VP9QuantizeTest, @@ -505,7 +547,7 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, false))); - +#endif // CONFIG_VP9_HIGHBITDEPTH #endif // VPX_ARCH_X86_64 #endif // HAVE_SSSE3 @@ -516,15 +558,15 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_avx, &vpx_highbd_quantize_b_c, VPX_BITS_10, - 16, false), - make_tuple(&vpx_quantize_b_avx, 
&vpx_highbd_quantize_b_c, VPX_BITS_12, - 16, false), - make_tuple(&vpx_quantize_b_32x32_avx, &vpx_highbd_quantize_b_32x32_c, + make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c, VPX_BITS_10, 16, + false), + make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c, VPX_BITS_12, 16, + false), + make_tuple(&vpx_quantize_b_32x32_avx, &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, false), - make_tuple(&vpx_quantize_b_32x32_avx, &vpx_highbd_quantize_b_32x32_c, + make_tuple(&vpx_quantize_b_32x32_avx, &vpx_quantize_b_32x32_c, VPX_BITS_10, 32, false), - make_tuple(&vpx_quantize_b_32x32_avx, &vpx_highbd_quantize_b_32x32_c, + make_tuple(&vpx_quantize_b_32x32_avx, &vpx_quantize_b_32x32_c, VPX_BITS_12, 32, false))); #else @@ -547,6 +589,28 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_AVX2 #if HAVE_NEON +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + NEON, VP9QuantizeTest, + ::testing::Values( + make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16, + false), + make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_10, 16, + false), + make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_12, 16, + false), + make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c, + VPX_BITS_8, 32, false), + make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c, + VPX_BITS_10, 32, false), + make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c, + VPX_BITS_12, 32, false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 32, + true))); +#else INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, @@ -560,6 +624,7 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 32, true))); +#endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_NEON #if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index 5d6ba64a8a..c29b99bb0c 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -147,66 +147,25 @@ void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; if (abs_coeff >= zbins[rc != 0]) { - int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); - tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * - quant_shift_ptr[rc != 0]) >> - 16; // quantization - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = (tran_low_t)(qcoeff_ptr[rc] * dequant_ptr[rc != 0]); - - if (tmp) eob = i; - } - } - *eob_ptr = eob + 1; -} - #if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - int i, non_zero_count = (int)n_coeffs, eob = -1; - const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; - const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - (void)iscan; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - // Pre-scan pass - for (i = (int)n_coeffs - 1; i >= 0; i--) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - - if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) - non_zero_count--; - else - break; - } - - // Quantization pass: All coefficients with index >= zero_flag are - // 
skippable. Note: zero_flag can be zero. - for (i = 0; i < non_zero_count; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - - if (abs_coeff >= zbins[rc != 0]) { + // High bit depth configurations do not clamp to INT16. const int64_t tmp1 = abs_coeff + round_ptr[rc != 0]; const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 16); +#else + const int tmp = + clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + const int abs_qcoeff = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + quant_shift_ptr[rc != 0]) >> + 16; // quantization +#endif // CONFIG_VP9_HIGHBITDEPTH qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + dqcoeff_ptr[rc] = (tran_low_t)(qcoeff_ptr[rc] * dequant_ptr[rc != 0]); if (abs_qcoeff) eob = i; } } *eob_ptr = eob + 1; } -#endif void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, @@ -243,15 +202,23 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int rc = scan[idx_arr[i]]; const int coeff = coeff_ptr[rc]; const int coeff_sign = (coeff >> 31); - int tmp; - int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); - tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) * - quant_shift_ptr[rc != 0]) >> - 15; - - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int abs_qcoeff = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); +#if CONFIG_VP9_HIGHBITDEPTH + // High bit depth configurations do not clamp to INT16. + { + const int64_t tmp = + ((abs_qcoeff * quant_ptr[rc != 0]) >> 16) + abs_qcoeff; + abs_qcoeff = (int)((tmp * quant_shift_ptr[rc != 0]) >> 15); + } +#else + abs_qcoeff = clamp(abs_qcoeff, INT16_MIN, INT16_MAX); + abs_qcoeff = ((((abs_qcoeff * quant_ptr[rc != 0]) >> 16) + abs_qcoeff) * + quant_shift_ptr[rc != 0]) >> + 15; +#endif // CONFIG_VP9_HIGHBITDEPTH + + qcoeff_ptr[rc] = (abs_qcoeff ^ coeff_sign) - coeff_sign; #if (VPX_ARCH_X86 || VPX_ARCH_X86_64) && !CONFIG_VP9_HIGHBITDEPTH // When tran_low_t is only 16 bits dqcoeff can outrange it. Rather than // truncating with a cast, saturate the value. 
This is easier to implement @@ -262,54 +229,7 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; #endif // VPX_ARCH_X86 && CONFIG_VP9_HIGHBITDEPTH - if (tmp) eob = idx_arr[i]; - } - *eob_ptr = eob + 1; -} - -#if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_quantize_b_32x32_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), - ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; - const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - - int idx = 0; - int idx_arr[1024]; - int i, eob = -1; - (void)iscan; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - // Pre-scan pass - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - - // If the coefficient is out of the base ZBIN range, keep it for - // quantization. - if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) idx_arr[idx++] = i; - } - - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = scan[idx_arr[i]]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; if (abs_qcoeff) eob = idx_arr[i]; } *eob_ptr = eob + 1; } -#endif diff --git a/vpx_dsp/quantize.h b/vpx_dsp/quantize.h index 0fcd77941b..9ac1a47418 100644 --- a/vpx_dsp/quantize.h +++ b/vpx_dsp/quantize.h @@ -38,22 +38,7 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant, uint16_t *eob_ptr); -// Only used for reference. The optimized versions can handle HBD. 
-void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan); - -void vpx_highbd_quantize_b_32x32_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan); -#endif +#endif // CONFIG_VP9_HIGHBITDEPTH #ifdef __cplusplus } // extern "C" From 176acaf9f6efb3603e920eb35630a16f8a88ad5e Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Tue, 22 Mar 2022 13:58:50 +0800 Subject: [PATCH 238/926] loongarch: Fix bugs Fix bugs from loopfilter_filters_lsx.c, vpx_convolve8_avg_lsx.c Bug: webm:1755 Change-Id: I7ee8e367d66a49f3be10d7e417837d3b6ef50bdb --- vp8/common/loongarch/loopfilter_filters_lsx.c | 36 +++-- vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c | 132 +++++++++--------- vpx_dsp/loongarch/vpx_convolve_lsx.h | 15 ++ 3 files changed, 98 insertions(+), 85 deletions(-) diff --git a/vp8/common/loongarch/loopfilter_filters_lsx.c b/vp8/common/loongarch/loopfilter_filters_lsx.c index c48f794840..a3ac76d258 100644 --- a/vp8/common/loongarch/loopfilter_filters_lsx.c +++ b/vp8/common/loongarch/loopfilter_filters_lsx.c @@ -32,9 +32,9 @@ filt = __lsx_vsadd_b(filt, q0_sub_p0); \ filt = __lsx_vand_v(filt, mask); \ t1 = __lsx_vsadd_b(filt, cnst4b); \ - t1 = __lsx_vsra_b(filt, cnst3b); \ + t1 = __lsx_vsra_b(t1, cnst3b); \ t2 = __lsx_vsadd_b(filt, cnst3b); \ - t2 = __lsx_vsra_b(filt, cnst3b); \ + t2 = __lsx_vsra_b(t2, cnst3b); \ q0_m = __lsx_vssub_b(q0_m, t1); \ q0 = __lsx_vxori_b(q0_m, 0x80); \ p0_m = __lsx_vsadd_b(p0_m, t2); \ @@ -158,7 +158,6 @@ static void loop_filter_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch, const uint8_t *b_limit1_ptr, const uint8_t *limit1_ptr, const uint8_t *thresh1_ptr) { - uint8_t *temp_src; int32_t pitch_x2 = pitch << 1; int32_t pitch_x3 = pitch_x2 + pitch; int32_t pitch_x4 = pitch << 2; @@ -167,12 +166,11 @@ static void loop_filter_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch, __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; __m128i p3, p2, p1, p0, q3, q2, q1, q0; - temp_src = src - pitch_x4; - DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, - temp_src, pitch_x3, p3, p2, p1, p0); - temp_src += pitch_x4; - DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, - temp_src, pitch_x3, q0, q1, q2, q3); + DUP4_ARG2(__lsx_vldx, src, -pitch_x4, src, -pitch_x3, src, -pitch_x2, src, + -pitch, p3, p2, p1, p0); + q0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch_x2, q1, q2); + q3 = __lsx_vldx(src, pitch_x3); thresh0 = __lsx_vreplgr2vr_b(*thresh0_ptr); thresh1 = __lsx_vreplgr2vr_b(*thresh1_ptr); @@ -336,15 +334,15 @@ static void loop_filter_horizontal_edge_uv_lsx(uint8_t *src_u, uint8_t *src_v, mask, flat); VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); - __lsx_vstelm_d(p1, src_u, 0, 0); - __lsx_vstelm_d(p0, src_u + pitch, 0, 0); - __lsx_vstelm_d(q0, src_u + pitch_x2, 0, 0); - __lsx_vstelm_d(q1, src_u + pitch_x3, 0, 0); + __lsx_vstelm_d(q1, src_u + pitch, 0, 0); + __lsx_vstelm_d(q0, src_u, 0, 0); + __lsx_vstelm_d(p0, src_u - pitch, 0, 0); + __lsx_vstelm_d(p1, src_u - pitch_x2, 0, 0); - 
__lsx_vstelm_d(p1, src_v, 0, 1); - __lsx_vstelm_d(p0, src_v + pitch, 0, 1); - __lsx_vstelm_d(q0, src_v + pitch_x2, 0, 1); - __lsx_vstelm_d(q1, src_v + pitch_x3, 0, 1); + __lsx_vstelm_d(q1, src_v + pitch, 0, 1); + __lsx_vstelm_d(q0, src_v, 0, 1); + __lsx_vstelm_d(p0, src_v - pitch, 0, 1); + __lsx_vstelm_d(p1, src_v - pitch_x2, 0, 1); } static void loop_filter_vertical_edge_uv_lsx(uint8_t *src_u, uint8_t *src_v, @@ -396,8 +394,8 @@ static void loop_filter_vertical_edge_uv_lsx(uint8_t *src_u, uint8_t *src_v, tmp2 = __lsx_vilvl_h(tmp1, tmp0); tmp3 = __lsx_vilvh_h(tmp1, tmp0); - tmp0 = __lsx_vilvl_b(p0, q1); - tmp1 = __lsx_vilvl_b(q1, q0); + tmp0 = __lsx_vilvh_b(p0, p1); + tmp1 = __lsx_vilvh_b(q1, q0); tmp4 = __lsx_vilvl_h(tmp1, tmp0); tmp5 = __lsx_vilvh_h(tmp1, tmp0); diff --git a/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c index 27f5b5ca4f..2b983552b6 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c @@ -12,6 +12,15 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/loongarch/vpx_convolve_lsx.h" +static const uint8_t mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + static void common_hv_8ht_8vt_and_aver_dst_4w_lsx( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) { @@ -90,7 +99,7 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_lsx( src0 = __lsx_vpackev_b(src1, src0); out1 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - out0 = __lsx_vssrarni_b_h(out1, out0, 7); + out0 = __lsx_vssrarni_b_h(out1, out0, FILTER_BITS); out0 = __lsx_vxori_b(out0, 128); out0 = __lsx_vavgr_bu(out0, src2); __lsx_vstelm_w(out0, dst, 0, 0); @@ -192,7 +201,8 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_lsx( src2 = __lsx_vpackev_b(src10, src9); src4 = FILT_8TAP_DPADD_S_H(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, FILTER_BITS, src4, src3, + FILTER_BITS, out0, out1); DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); src5 = __lsx_vldrepl_d(dst_tmp, 0); dst_tmp += dst_stride; @@ -233,8 +243,6 @@ static void common_hv_8ht_8vt_and_aver_dst_16w_lsx( common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height); - src += 8; - dst += 8; } static void common_hv_8ht_8vt_and_aver_dst_32w_lsx( @@ -315,7 +323,7 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_lsx( static void common_hv_2ht_2vt_and_aver_dst_4x8_lsx( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert) { - uint8_t *dst_tmp1; + uint8_t *dst_tmp = dst; __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1; __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; @@ -351,26 +359,25 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_lsx( hz_out5 = __lsx_vshuf_b(hz_out6, hz_out4, shuff); hz_out7 = __lsx_vpickod_d(hz_out8, hz_out6); - dst0 = __lsx_vldrepl_w(dst, 0); - dst += dst_stride; - dst1 = __lsx_vldrepl_w(dst, 0); - dst += dst_stride; - dst2 = __lsx_vldrepl_w(dst, 0); - dst += 
dst_stride; - dst3 = __lsx_vldrepl_w(dst, 0); - dst += dst_stride; + dst0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; dst0 = __lsx_vilvl_w(dst1, dst0); dst1 = __lsx_vilvl_w(dst3, dst2); dst0 = __lsx_vilvl_d(dst1, dst0); - dst1 = __lsx_vldrepl_w(dst, 0); - dst += dst_stride; - dst2 = __lsx_vldrepl_w(dst, 0); - dst += dst_stride; - dst3 = __lsx_vldrepl_w(dst, 0); - dst += dst_stride; - dst4 = __lsx_vldrepl_w(dst, 0); - dst += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst4 = __lsx_vldrepl_w(dst_tmp, 0); dst1 = __lsx_vilvl_w(dst2, dst1); dst2 = __lsx_vilvl_w(dst4, dst3); dst1 = __lsx_vilvl_d(dst2, dst1); @@ -384,23 +391,22 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_lsx( DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, res0, res1); DUP2_ARG2(__lsx_vavgr_bu, res0, dst0, res1, dst1, res0, res1); - dst_tmp1 = dst; - __lsx_vstelm_w(res0, dst_tmp1, 0, 0); - dst_tmp1 += dst_stride; - __lsx_vstelm_w(res0, dst_tmp1, 0, 1); - dst_tmp1 += dst_stride; - __lsx_vstelm_w(res0, dst_tmp1, 0, 2); - dst_tmp1 += dst_stride; - __lsx_vstelm_w(res0, dst_tmp1, 0, 3); - dst_tmp1 += dst_stride; - - __lsx_vstelm_w(res1, dst_tmp1, 0, 0); - dst_tmp1 += dst_stride; - __lsx_vstelm_w(res1, dst_tmp1, 0, 1); - dst_tmp1 += dst_stride; - __lsx_vstelm_w(res1, dst_tmp1, 0, 2); - dst_tmp1 += dst_stride; - __lsx_vstelm_w(res1, dst_tmp1, 0, 3); + __lsx_vstelm_w(res0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 3); + dst += dst_stride; + + __lsx_vstelm_w(res1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res1, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(res1, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(res1, dst, 0, 3); } static void common_hv_2ht_2vt_and_aver_dst_4w_lsx( @@ -431,12 +437,11 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_lsx( mask = __lsx_vld(mc_filt_mask_arr, 0); /* rearranging filter */ filt_hz = __lsx_vldrepl_h(filter_horiz, 0); - filt_vt = __lsx_vldrepl_h(filtrt_ver, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); src0 = __lsx_vld(src, 0); DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, src, src_stride4, src1, src2, src3, src4); - src += (src_stride4 + src_stride); dst0 = __lsx_vldrepl_d(dst_tmp, 0); dst_tmp += dst_stride; @@ -445,7 +450,6 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_lsx( dst2 = __lsx_vldrepl_d(dst_tmp, 0); dst_tmp += dst_stride; dst3 = __lsx_vldrepl_d(dst_tmp, 0); - dst_tmp += dst_stride; DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); @@ -462,12 +466,11 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_lsx( hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); vec3 = __lsx_vpackev_b(hz_out0, hz_out1); - tmp3 = __lsx_vdp2_h_bu(vec1, filt_vt); + tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt); DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); PCKEV_AVG_ST4_D(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); - dst -= 
dst_stride * 3; } static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( @@ -499,28 +502,28 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( for (; loop_cnt--;) { src1 = __lsx_vld(src, 0); DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); - src4 = __lsx_vlds(src, src_stride3); + src4 = __lsx_vldx(src, src_stride3); src += src_stride4; hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); - tmp0 = __lsx_vavgr_bu(vec0, filt_vt); + tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); vec0 = __lsx_vpackev_b(hz_out0, hz_out1); - tmp1 = __lsx_vavgr_bu(vec0, filt_vt); + tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); - tmp2 = __lsx_vavgr_bu(vec0, filt_vt); + tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); vec0 = __lsx_vpackev_b(hz_out0, hz_out1); - tmp3 = __lsx_vavgr_bu(vec0, filt_vt); + tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); - DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); dst0 = __lsx_vldrepl_d(dst_tmp, 0); dst_tmp += dst_stride; @@ -563,7 +566,7 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( int32_t dst_stride2 = dst_stride << 1; int32_t dst_stride3 = dst_stride2 + dst_stride; - int32_t dst_stride4 = dst_stride2 << 1; + int32_t dst_stride4 = dst_stride << 2; mask = __lsx_vld(mc_filt_mask_arr, 0); /* rearranging filter */ @@ -584,7 +587,7 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( src1 = __lsx_vld(src_tmp1, 0); DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3, src5); - src5 = __lsx_vldx(src_tmp1, src_stride3); + src7 = __lsx_vldx(src_tmp1, src_stride3); src += src_stride4; dst0 = __lsx_vld(dst, 0); DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); @@ -593,42 +596,39 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); - DUP2_ARG2(__lsx_vavgr_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); tmp3 = __lsx_vpickev_b(tmp1, tmp0); tmp3 = __lsx_vavgr_bu(tmp3, dst0); __lsx_vst(tmp3, dst, 0); - dst += dst_stride; hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); - DUP2_ARG2(__lsx_vavgr_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); tmp3 = __lsx_vpickev_b(tmp1, tmp0); tmp3 = __lsx_vavgr_bu(tmp3, dst1); - __lsx_vst(tmp3, dst, 0); - dst += dst_stride; + __lsx_vstx(tmp3, dst, dst_stride); hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 
FILTER_BITS); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); - DUP2_ARG2(__lsx_vavgr_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); tmp3 = __lsx_vpickev_b(tmp1, tmp0); tmp3 = __lsx_vavgr_bu(tmp3, dst2); - __lsx_vst(tmp3, dst, 0); - dst += dst_stride; + __lsx_vstx(tmp3, dst, dst_stride2); - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); - DUP2_ARG2(__lsx_vavgr_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); tmp3 = __lsx_vpickev_b(tmp1, tmp0); tmp3 = __lsx_vavgr_bu(tmp3, dst3); - __lsx_vst(tmp3, dst, 0); - dst += dst_stride; + __lsx_vstx(tmp3, dst, dst_stride3); + dst += dst_stride4; } } diff --git a/vpx_dsp/loongarch/vpx_convolve_lsx.h b/vpx_dsp/loongarch/vpx_convolve_lsx.h index 2fdb93db84..0e3dcae006 100644 --- a/vpx_dsp/loongarch/vpx_convolve_lsx.h +++ b/vpx_dsp/loongarch/vpx_convolve_lsx.h @@ -114,4 +114,19 @@ tmp1_m; \ }) +#define PCKEV_AVG_ST4_D(in0, in1, in2, in3, dst0, dst1, pdst, stride) \ + { \ + __m128i tmp0_m, tmp1_m; \ + \ + DUP2_ARG2(__lsx_vpickev_b, in1, in0, in3, in2, tmp0_m, tmp1_m); \ + DUP2_ARG2(__lsx_vavgr_bu, tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \ + __lsx_vstelm_d(tmp0_m, pdst, 0, 0); \ + pdst += stride; \ + __lsx_vstelm_d(tmp0_m, pdst, 0, 1); \ + pdst += stride; \ + __lsx_vstelm_d(tmp1_m, pdst, 0, 0); \ + pdst += stride; \ + __lsx_vstelm_d(tmp1_m, pdst, 0, 1); \ + } + #endif // VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_ From d4060647213d51125457ae151a2402bf95ebdf71 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Mon, 28 Mar 2022 17:12:57 +0800 Subject: [PATCH 239/926] vp8[loongarch]: Optimize dequant_idct_add_y/uv_block 1. vp8_dequant_idct_add_uv_block_lsx 2. 
vp8_dequant_idct_add_y_block_lsx Bug: webm:1755 Change-Id: I1f006daaefb2075b422bc72a3f69c5abee776e2e --- vp8/common/loongarch/idct_lsx.c | 271 ++++++++++++++++++++++++++++++++ vp8/common/rtcd_defs.pl | 4 +- 2 files changed, 273 insertions(+), 2 deletions(-) diff --git a/vp8/common/loongarch/idct_lsx.c b/vp8/common/loongarch/idct_lsx.c index fb0b0384c4..679019ff63 100644 --- a/vp8/common/loongarch/idct_lsx.c +++ b/vp8/common/loongarch/idct_lsx.c @@ -12,6 +12,107 @@ #include "vp8/common/blockd.h" #include "vpx_util/loongson_intrinsics.h" +static const int32_t cospi8sqrt2minus1 = 20091; +static const int32_t sinpi8sqrt2 = 35468; + +#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + __m128i tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + DUP2_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, tmp0_m, tmp1_m); \ + DUP2_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, tmp2_m, tmp3_m); \ + DUP2_ARG2(__lsx_vilvl_w, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \ + DUP2_ARG2(__lsx_vilvh_w, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \ + } + +#define TRANSPOSE_TWO_4x4_H(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + __m128i s4_m, s5_m, s6_m, s7_m; \ + \ + TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, s4_m, s5_m, s6_m, s7_m); \ + DUP2_ARG2(__lsx_vilvl_d, s6_m, s4_m, s7_m, s5_m, out0, out2); \ + out1 = __lsx_vilvh_d(s6_m, s4_m); \ + out3 = __lsx_vilvh_d(s7_m, s5_m); \ + } + +#define EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in) \ + ({ \ + __m128i out_m; \ + __m128i zero_m = __lsx_vldi(0); \ + __m128i tmp1_m, tmp2_m; \ + __m128i sinpi8_sqrt2_m = __lsx_vreplgr2vr_w(sinpi8sqrt2); \ + \ + tmp1_m = __lsx_vilvl_h(in, zero_m); \ + tmp2_m = __lsx_vilvh_h(in, zero_m); \ + tmp1_m = __lsx_vsrai_w(tmp1_m, 16); \ + tmp2_m = __lsx_vsrai_w(tmp2_m, 16); \ + tmp1_m = __lsx_vmul_w(tmp1_m, sinpi8_sqrt2_m); \ + tmp1_m = __lsx_vsrai_w(tmp1_m, 16); \ + tmp2_m = __lsx_vmul_w(tmp2_m, sinpi8_sqrt2_m); \ + tmp2_m = __lsx_vsrai_w(tmp2_m, 16); \ + out_m = __lsx_vpickev_h(tmp2_m, tmp1_m); \ + \ + out_m; \ + }) + +#define VP8_IDCT_1D_H(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + __m128i a1_m, b1_m, c1_m, d1_m; \ + __m128i c_tmp1_m, c_tmp2_m; \ + __m128i d_tmp1_m, d_tmp2_m; \ + __m128i const_cospi8sqrt2minus1_m; \ + \ + const_cospi8sqrt2minus1_m = __lsx_vreplgr2vr_h(cospi8sqrt2minus1); \ + a1_m = __lsx_vadd_h(in0, in2); \ + b1_m = __lsx_vsub_h(in0, in2); \ + c_tmp1_m = EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in1); \ + \ + c_tmp2_m = __lsx_vmuh_h(in3, const_cospi8sqrt2minus1_m); \ + c_tmp2_m = __lsx_vslli_h(c_tmp2_m, 1); \ + c_tmp2_m = __lsx_vsrai_h(c_tmp2_m, 1); \ + c_tmp2_m = __lsx_vadd_h(in3, c_tmp2_m); \ + c1_m = __lsx_vsub_h(c_tmp1_m, c_tmp2_m); \ + \ + d_tmp1_m = __lsx_vmuh_h(in1, const_cospi8sqrt2minus1_m); \ + d_tmp1_m = __lsx_vslli_h(d_tmp1_m, 1); \ + d_tmp1_m = __lsx_vsrai_h(d_tmp1_m, 1); \ + d_tmp1_m = __lsx_vadd_h(in1, d_tmp1_m); \ + d_tmp2_m = EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in3); \ + d1_m = __lsx_vadd_h(d_tmp1_m, d_tmp2_m); \ + LSX_BUTTERFLY_4_H(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \ + } + +#define VP8_IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + __m128i a1_m, b1_m, c1_m, d1_m; \ + __m128i c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m; \ + __m128i const_cospi8sqrt2minus1_m, sinpi8_sqrt2_m; \ + \ + const_cospi8sqrt2minus1_m = __lsx_vreplgr2vr_w(cospi8sqrt2minus1); \ + sinpi8_sqrt2_m = __lsx_vreplgr2vr_w(sinpi8sqrt2); \ + a1_m = __lsx_vadd_w(in0, in2); \ + b1_m = __lsx_vsub_w(in0, in2); \ + c_tmp1_m = __lsx_vmul_w(in1, sinpi8_sqrt2_m); \ + c_tmp1_m = __lsx_vsrai_w(c_tmp1_m, 
16); \ + c_tmp2_m = __lsx_vmul_w(in3, const_cospi8sqrt2minus1_m); \ + c_tmp2_m = __lsx_vsrai_w(c_tmp2_m, 16); \ + c_tmp2_m = __lsx_vadd_w(in3, c_tmp2_m); \ + c1_m = __lsx_vsub_w(c_tmp1_m, c_tmp2_m); \ + d_tmp1_m = __lsx_vmul_w(in1, const_cospi8sqrt2minus1_m); \ + d_tmp1_m = __lsx_vsrai_w(d_tmp1_m, 16); \ + d_tmp1_m = __lsx_vadd_w(in1, d_tmp1_m); \ + d_tmp2_m = __lsx_vmul_w(in3, sinpi8_sqrt2_m); \ + d_tmp2_m = __lsx_vsrai_w(d_tmp2_m, 16); \ + d1_m = __lsx_vadd_w(d_tmp1_m, d_tmp2_m); \ + LSX_BUTTERFLY_4_W(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \ + } + +#define UNPCK_SH_SW(in, out0, out1) \ + { \ + out0 = __lsx_vsllwil_w_h(in, 0); \ + out1 = __lsx_vexth_w_h(in); \ + } + static void idct4x4_addconst_lsx(int16_t in_dc, uint8_t *pred, int32_t pred_stride, uint8_t *dest, int32_t dest_stride) { @@ -52,3 +153,173 @@ void vp8_dc_only_idct_add_lsx(int16_t input_dc, uint8_t *pred_ptr, int32_t dst_stride) { idct4x4_addconst_lsx(input_dc, pred_ptr, pred_stride, dst_ptr, dst_stride); } + +static void dequant_idct4x4_addblk_2x_lsx(int16_t *input, + int16_t *dequant_input, uint8_t *dest, + int32_t dest_stride) { + __m128i dest0, dest1, dest2, dest3; + __m128i in0, in1, in2, in3, mul0, mul1, mul2, mul3, dequant_in0, dequant_in1; + __m128i hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3, res0, res1, res2, res3; + __m128i hz0l, hz1l, hz2l, hz3l, hz0r, hz1r, hz2r, hz3r; + __m128i vt0l, vt1l, vt2l, vt3l, vt0r, vt1r, vt2r, vt3r; + __m128i zero = __lsx_vldi(0); + + int32_t dest_stride2 = dest_stride << 1; + int32_t dest_stride3 = dest_stride2 + dest_stride; + + DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, in0, in1, in2, + in3); + DUP2_ARG2(__lsx_vld, dequant_input, 0, dequant_input, 16, dequant_in0, + dequant_in1); + + DUP4_ARG2(__lsx_vmul_h, in0, dequant_in0, in1, dequant_in1, in2, dequant_in0, + in3, dequant_in1, mul0, mul1, mul2, mul3); + DUP2_ARG2(__lsx_vpickev_d, mul2, mul0, mul3, mul1, in0, in2); + DUP2_ARG2(__lsx_vpickod_d, mul2, mul0, mul3, mul1, in1, in3); + + VP8_IDCT_1D_H(in0, in1, in2, in3, hz0, hz1, hz2, hz3); + TRANSPOSE_TWO_4x4_H(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3); + UNPCK_SH_SW(hz0, hz0r, hz0l); + UNPCK_SH_SW(hz1, hz1r, hz1l); + UNPCK_SH_SW(hz2, hz2r, hz2l); + UNPCK_SH_SW(hz3, hz3r, hz3l); + VP8_IDCT_1D_W(hz0l, hz1l, hz2l, hz3l, vt0l, vt1l, vt2l, vt3l); + DUP4_ARG2(__lsx_vsrari_w, vt0l, 3, vt1l, 3, vt2l, 3, vt3l, 3, vt0l, vt1l, + vt2l, vt3l); + VP8_IDCT_1D_W(hz0r, hz1r, hz2r, hz3r, vt0r, vt1r, vt2r, vt3r); + DUP4_ARG2(__lsx_vsrari_w, vt0r, 3, vt1r, 3, vt2r, 3, vt3r, 3, vt0r, vt1r, + vt2r, vt3r); + DUP4_ARG2(__lsx_vpickev_h, vt0l, vt0r, vt1l, vt1r, vt2l, vt2r, vt3l, vt3r, + vt0, vt1, vt2, vt3); + TRANSPOSE_TWO_4x4_H(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3); + dest0 = __lsx_vld(dest, 0); + DUP2_ARG2(__lsx_vldx, dest, dest_stride, dest, dest_stride2, dest1, dest2); + dest3 = __lsx_vldx(dest, dest_stride3); + DUP4_ARG2(__lsx_vilvl_b, zero, dest0, zero, dest1, zero, dest2, zero, dest3, + res0, res1, res2, res3); + DUP4_ARG2(__lsx_vadd_h, res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, + res1, res2, res3); + + res0 = __lsx_vclip255_h(res0); + res1 = __lsx_vclip255_h(res1); + res2 = __lsx_vclip255_h(res2); + res3 = __lsx_vclip255_h(res3); + DUP2_ARG2(__lsx_vpickev_b, res1, res0, res3, res2, vt0l, vt1l); + + __lsx_vstelm_d(vt0l, dest, 0, 0); + __lsx_vstelm_d(vt0l, dest + dest_stride, 0, 1); + __lsx_vstelm_d(vt1l, dest + dest_stride2, 0, 0); + __lsx_vstelm_d(vt1l, dest + dest_stride3, 0, 1); + + __lsx_vst(zero, input, 0); + __lsx_vst(zero, input, 16); + __lsx_vst(zero, input, 32); 
+ __lsx_vst(zero, input, 48); +} + +static void dequant_idct_addconst_2x_lsx(int16_t *input, int16_t *dequant_input, + uint8_t *dest, int32_t dest_stride) { + __m128i input_dc0, input_dc1, vec, res0, res1, res2, res3; + __m128i dest0, dest1, dest2, dest3; + __m128i zero = __lsx_vldi(0); + int32_t dest_stride2 = dest_stride << 1; + int32_t dest_stride3 = dest_stride2 + dest_stride; + + input_dc0 = __lsx_vreplgr2vr_h(input[0] * dequant_input[0]); + input_dc1 = __lsx_vreplgr2vr_h(input[16] * dequant_input[0]); + DUP2_ARG2(__lsx_vsrari_h, input_dc0, 3, input_dc1, 3, input_dc0, input_dc1); + vec = __lsx_vpickev_d(input_dc1, input_dc0); + input[0] = 0; + input[16] = 0; + dest0 = __lsx_vld(dest, 0); + DUP2_ARG2(__lsx_vldx, dest, dest_stride, dest, dest_stride2, dest1, dest2); + dest3 = __lsx_vldx(dest, dest_stride3); + DUP4_ARG2(__lsx_vilvl_b, zero, dest0, zero, dest1, zero, dest2, zero, dest3, + res0, res1, res2, res3); + DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec, res0, + res1, res2, res3); + res0 = __lsx_vclip255_h(res0); + res1 = __lsx_vclip255_h(res1); + res2 = __lsx_vclip255_h(res2); + res3 = __lsx_vclip255_h(res3); + + DUP2_ARG2(__lsx_vpickev_b, res1, res0, res3, res2, res0, res1); + __lsx_vstelm_d(res0, dest, 0, 0); + __lsx_vstelm_d(res0, dest + dest_stride, 0, 1); + __lsx_vstelm_d(res1, dest + dest_stride2, 0, 0); + __lsx_vstelm_d(res1, dest + dest_stride3, 0, 1); +} + +void vp8_dequant_idct_add_y_block_lsx(int16_t *q, int16_t *dq, uint8_t *dst, + int32_t stride, char *eobs) { + int16_t *eobs_h = (int16_t *)eobs; + uint8_t i; + + for (i = 4; i--;) { + if (eobs_h[0]) { + if (eobs_h[0] & 0xfefe) { + dequant_idct4x4_addblk_2x_lsx(q, dq, dst, stride); + } else { + dequant_idct_addconst_2x_lsx(q, dq, dst, stride); + } + } + + q += 32; + + if (eobs_h[1]) { + if (eobs_h[1] & 0xfefe) { + dequant_idct4x4_addblk_2x_lsx(q, dq, dst + 8, stride); + } else { + dequant_idct_addconst_2x_lsx(q, dq, dst + 8, stride); + } + } + + q += 32; + dst += (4 * stride); + eobs_h += 2; + } +} + +void vp8_dequant_idct_add_uv_block_lsx(int16_t *q, int16_t *dq, uint8_t *dst_u, + uint8_t *dst_v, int32_t stride, + char *eobs) { + int16_t *eobs_h = (int16_t *)eobs; + if (eobs_h[0]) { + if (eobs_h[0] & 0xfefe) { + dequant_idct4x4_addblk_2x_lsx(q, dq, dst_u, stride); + } else { + dequant_idct_addconst_2x_lsx(q, dq, dst_u, stride); + } + } + + q += 32; + dst_u += (stride * 4); + + if (eobs_h[1]) { + if (eobs_h[1] & 0xfefe) { + dequant_idct4x4_addblk_2x_lsx(q, dq, dst_u, stride); + } else { + dequant_idct_addconst_2x_lsx(q, dq, dst_u, stride); + } + } + + q += 32; + + if (eobs_h[2]) { + if (eobs_h[2] & 0xfefe) { + dequant_idct4x4_addblk_2x_lsx(q, dq, dst_v, stride); + } else { + dequant_idct_addconst_2x_lsx(q, dq, dst_v, stride); + } + } + q += 32; + dst_v += (stride * 4); + + if (eobs_h[3]) { + if (eobs_h[3] & 0xfefe) { + dequant_idct4x4_addblk_2x_lsx(q, dq, dst_v, stride); + } else { + dequant_idct_addconst_2x_lsx(q, dq, dst_v, stride); + } + } +} diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index c7911032f6..e4b40fa9ed 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -38,10 +38,10 @@ () specialize qw/vp8_dequant_idct_add mmx neon dspr2 msa mmi/; add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs"; -specialize qw/vp8_dequant_idct_add_y_block sse2 neon dspr2 msa mmi/; +specialize qw/vp8_dequant_idct_add_y_block sse2 neon dspr2 msa mmi lsx/; add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, 
short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs"; -specialize qw/vp8_dequant_idct_add_uv_block sse2 neon dspr2 msa mmi/; +specialize qw/vp8_dequant_idct_add_uv_block sse2 neon dspr2 msa mmi lsx/; # # Loopfilter From 8ff9f66b8de7bcec70296c1f304ab409330c3525 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Wed, 30 Mar 2022 14:04:10 +0800 Subject: [PATCH 240/926] vp9[loongarch]: Optimize vpx_convolve8_avg_horiz_c 1. vpx_convolve8_avg_horiz_lsx Bug: webm:1755 Change-Id: I0b6520be0afa1689da329f56ec6cd95c1730250c --- test/convolve_test.cc | 8 +- .../loongarch/vpx_convolve8_avg_horiz_lsx.c | 972 ++++++++++++++++++ vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c | 11 - vpx_dsp/loongarch/vpx_convolve8_lsx.c | 11 - vpx_dsp/loongarch/vpx_convolve_lsx.h | 11 + vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- 7 files changed, 989 insertions(+), 27 deletions(-) create mode 100644 vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c diff --git a/test/convolve_test.cc b/test/convolve_test.cc index a631ec77f7..5189be647a 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -1452,10 +1452,10 @@ INSTANTIATE_TEST_SUITE_P(MSA, ConvolveTest, #if HAVE_LSX const ConvolveFunctions convolve8_lsx( vpx_convolve_copy_c, vpx_convolve_avg_lsx, vpx_convolve8_horiz_lsx, - vpx_convolve8_avg_horiz_c, vpx_convolve8_vert_lsx, vpx_convolve8_avg_vert_c, - vpx_convolve8_lsx, vpx_convolve8_avg_lsx, vpx_scaled_horiz_c, - vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, vpx_scaled_avg_vert_c, - vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); + vpx_convolve8_avg_horiz_lsx, vpx_convolve8_vert_lsx, + vpx_convolve8_avg_vert_c, vpx_convolve8_lsx, vpx_convolve8_avg_lsx, + vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, + vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); const ConvolveParam kArrayConvolve8_lsx[] = { ALL_SIZES(convolve8_lsx) }; INSTANTIATE_TEST_SUITE_P(LSX, ConvolveTest, diff --git a/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c new file mode 100644 index 0000000000..1c59228813 --- /dev/null +++ b/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c @@ -0,0 +1,972 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
+static const uint8_t mc_filt_mask_arr[16 * 3] = {
+  /* 8 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+  /* 4 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+  /* 4 width cases */
+  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+static void common_hz_8t_and_aver_dst_4x4_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i tmp0, tmp1;
+  __m128i dst0, dst1, dst2, dst3;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filter0, filter1, filter2, filter3, tmp0, tmp1);
+  dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst0 = __lsx_vilvl_w(dst1, dst0);
+  dst1 = __lsx_vilvl_w(dst3, dst2);
+  dst0 = __lsx_vilvl_d(dst1, dst0);
+  tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7);
+  tmp0 = __lsx_vxori_b(tmp0, 128);
+  dst0 = __lsx_vavgr_bu(tmp0, dst0);
+  __lsx_vstelm_w(dst0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst0, dst, 0, 3);
+}
+
+static void common_hz_8t_and_aver_dst_4x8_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3, tmp0, tmp1, tmp2, tmp3;
+  __m128i dst0, dst1;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  src += src_stride;
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  tmp0 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  tmp1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  tmp2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  tmp3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  tmp0 = __lsx_vilvl_w(tmp1, tmp0);
+  tmp1 = __lsx_vilvl_w(tmp3, tmp2);
+  dst0 = __lsx_vilvl_d(tmp1, tmp0);
+
+  tmp0 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  tmp1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  tmp2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  tmp3 = __lsx_vldrepl_w(dst_tmp, 0);
+  tmp0 = __lsx_vilvl_w(tmp1, tmp0);
+  tmp1 = __lsx_vilvl_w(tmp3, tmp2);
+  dst1 = __lsx_vilvl_d(tmp1, tmp0);
+
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filter0, filter1, filter2, filter3, tmp0, tmp1);
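+  /* Rows 0..3 are filtered above; rows 4..7 are loaded and filtered below.
+     Both halves are then rounded/narrowed, re-biased to unsigned
+     (__lsx_vxori_b with 128 undoes the signed-range bias applied to the
+     source pixels) and averaged with the destination rows. */
+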
LSX_LD_4(src, src_stride, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, tmp2, tmp3); + DUP4_ARG3(__lsx_vssrarni_b_h, tmp0, tmp0, 7, tmp1, tmp1, 7, tmp2, tmp2, 7, + tmp3, tmp3, 7, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1); + __lsx_vstelm_w(dst0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(dst0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(dst0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(dst0, dst, 0, 3); + dst += dst_stride; + __lsx_vstelm_w(dst1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(dst1, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(dst1, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(dst1, dst, 0, 3); +} + +static void common_hz_8t_and_aver_dst_4w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (height == 4) { + common_hz_8t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_hz_8t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_8t_and_aver_dst_8w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + int32_t loop_cnt = height >> 2; + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i dst0, dst1, dst2, dst3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *_src = (uint8_t *)src - 3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + for (; loop_cnt--;) { + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, tmp0, + tmp1, tmp2, tmp3); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1); + __lsx_vstelm_d(dst0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(dst0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(dst1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(dst1, dst, 0, 1); + dst += dst_stride; + } +} + +static void common_hz_8t_and_aver_dst_16w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, 
int32_t height) { + int32_t loop_cnt = height >> 1; + int32_t dst_stride2 = dst_stride << 1; + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3); + src += src_stride; + dst0 = __lsx_vld(dst_tmp, 0); + dst1 = __lsx_vldx(dst_tmp, dst_stride); + dst_tmp += dst_stride2; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2, + mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2, + mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2, + mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2, src2, + mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15); + DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0, tmp3, + filter0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2, + tmp11, filter2, tmp8, tmp9, tmp10, tmp11); + DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1, tmp2, + tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3, + tmp10, tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6, + tmp7); + DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, dst2, dst3); + DUP2_ARG2(__lsx_vxori_b, dst2, 128, dst3, 128, dst2, dst3); + DUP2_ARG2(__lsx_vavgr_bu, dst0, dst2, dst1, dst3, dst0, dst1); + __lsx_vst(dst0, dst, 0); + __lsx_vstx(dst1, dst, dst_stride); + dst += dst_stride2; + } +} + +static void common_hz_8t_and_aver_dst_32w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = height; + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3, dst0, dst1; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = __lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + src += src_stride; + DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst, 16, dst0, dst1); + dst_tmp += dst_stride; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, 
src2, 128, src3, 128, src0, + src1, src2, src3); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2, + mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2, + mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2, + mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2, src2, + mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15); + DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0, tmp3, + filter0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2, + tmp11, filter2, tmp8, tmp9, tmp10, tmp11); + DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1, tmp2, + tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3, + tmp10, tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6, + tmp7); + DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + DUP2_ARG2(__lsx_vavgr_bu, dst0, tmp0, dst1, tmp1, dst0, dst1); + __lsx_vst(dst0, dst, 0); + __lsx_vst(dst1, dst, 16); + dst += dst_stride; + } +} + +static void common_hz_8t_and_aver_dst_64w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + int32_t loop_cnt = height; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3, dst0, dst1; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = __lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1); + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); + + DUP2_ARG2(__lsx_vld, src, 32, src, 48, src0, src2); + src3 = __lsx_vld(src, 56); + src1 = __lsx_vshuf_b(src2, src0, shuff); + DUP2_ARG2(__lsx_vld, dst, 32, dst, 48, dst0, dst1); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1); + __lsx_vst(out0, dst, 32); + __lsx_vst(out1, dst, 
48); + src += src_stride; + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_4x4_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, mask; + __m128i dst0, dst1, dst2, dst3, vec0, vec1, filt0; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + uint8_t *dst_tmp = dst; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + dst0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_w, dst1, dst0, dst3, dst2, dst0, dst1); + dst0 = __lsx_vilvl_d(dst1, dst0); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec0, vec1); + vec0 = __lsx_vssrarni_bu_h(vec1, vec0, FILTER_BITS); + vec0 = __lsx_vavgr_bu(vec0, dst0); + __lsx_vstelm_w(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(vec0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(vec0, dst, 0, 3); +} + +static void common_hz_2t_and_aver_dst_4x8_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3; + __m128i dst0, dst1, dst2, dst3, dst4; + __m128i vec4, vec5, vec6, vec7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *src_tmp1 = (uint8_t *)src + src_stride4; + uint8_t *dst_tmp = dst; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + + src4 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src5, + src6); + src7 = __lsx_vldx(src_tmp1, src_stride3); + + dst0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_w, dst1, dst0, dst3, dst2, dst0, dst1); + dst0 = __lsx_vilvl_d(dst1, dst0); + + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst4 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_w, dst2, dst1, dst4, dst3, dst1, dst2); + dst1 = __lsx_vilvl_d(dst2, dst1); + + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask, src7, src6, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec4, vec5, vec6, vec7); + DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5, + FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, res0, + res1, 
res2, res3); + DUP2_ARG2(__lsx_vilvl_d, res1, res0, res3, res2, res0, res2); + DUP2_ARG2(__lsx_vavgr_bu, res0, dst0, res2, dst1, res0, res2); + + __lsx_vstelm_w(res0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 3); + dst += dst_stride; + + __lsx_vstelm_w(res2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res2, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(res2, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(res2, dst, 0, 3); + dst += dst_stride; +} + +static void common_hz_2t_and_aver_dst_4w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (height == 4) { + common_hz_2t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_hz_2t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_2t_and_aver_dst_8x4_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, mask; + __m128i filt0, dst0, dst1, dst2, dst3; + __m128i vec0, vec1, vec2, vec3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + uint8_t *dst_tmp = dst; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec1); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec1, dst1, vec0, vec1); + __lsx_vstelm_d(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(vec1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec1, dst, 0, 1); +} + +static void common_hz_2t_and_aver_dst_8x8mult_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + __m128i src0, src1, src2, src3, mask; + __m128i filt0, dst0, dst1, dst2, dst3; + __m128i vec0, vec1, vec2, vec3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + uint8_t *dst_tmp = dst; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + src += src_stride; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec2); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += 
dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + + DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2); + __lsx_vstelm_d(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 1); + dst += dst_stride; + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + src += src_stride; + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec2); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2); + __lsx_vstelm_d(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 1); + dst += dst_stride; + + if (height == 16) { + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + src += src_stride; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec2); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2); + __lsx_vstelm_d(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 1); + dst += dst_stride; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec2); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2); + 
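+  /* __lsx_vavgr_bu computes the rounding average (a + b + 1) >> 1, i.e. the
+     "avg" half of convolve8_avg: the filtered rows are averaged with the
+     pixels already present in dst. */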
__lsx_vstelm_d(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 1); + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_8w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (height == 4) { + common_hz_2t_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_2t_and_aver_dst_8x8mult_lsx(src, src_stride, dst, dst_stride, + filter, height); + } +} + +static void common_hz_2t_and_aver_dst_16w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 2) - 1; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, dst0; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i res0, res1, res2, res3, res4, res5, res6, res7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *src_tmp1 = (uint8_t *)src + 8; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4); + src6 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + src1 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3, + src5); + src7 = __lsx_vldx(src_tmp1, src_stride3); + src_tmp1 += src_stride4; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + res0, res1, res2, res3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0, + res4, res5, res6, res7); + DUP4_ARG3(__lsx_vssrarni_bu_h, res1, res0, FILTER_BITS, res3, res2, + FILTER_BITS, res5, res4, FILTER_BITS, res7, res6, FILTER_BITS, res0, + res2, res4, res6); + dst0 = __lsx_vld(dst, 0); + res0 = __lsx_vavgr_bu(res0, dst0); + __lsx_vst(res0, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res2 = __lsx_vavgr_bu(res2, dst0); + __lsx_vst(res2, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res4 = __lsx_vavgr_bu(res4, dst0); + __lsx_vst(res4, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res6 = __lsx_vavgr_bu(res6, dst0); + __lsx_vst(res6, dst, 0); + dst += dst_stride; + + for (; loop_cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4); + src6 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + src1 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3, + src5); + src7 = __lsx_vldx(src_tmp1, src_stride3); + src_tmp1 += src_stride4; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + 
filt0, res0, res1, res2, res3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, res4, res5, res6, res7); + + DUP4_ARG3(__lsx_vssrarni_bu_h, res1, res0, FILTER_BITS, res3, res2, + FILTER_BITS, res5, res4, FILTER_BITS, res7, res6, FILTER_BITS, + res0, res2, res4, res6); + dst0 = __lsx_vld(dst, 0); + res0 = __lsx_vavgr_bu(res0, dst0); + __lsx_vst(res0, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res2 = __lsx_vavgr_bu(res2, dst0); + __lsx_vst(res2, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res4 = __lsx_vavgr_bu(res4, dst0); + __lsx_vst(res4, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res6 = __lsx_vavgr_bu(res6, dst0); + __lsx_vst(res6, dst, 0); + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_32w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 1); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, dst0, dst1; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i res0, res1, res2, res3, res4, res5, res6, res7; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + for (; loop_cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vld, src, 16, src, 24, src2, src3); + src1 = __lsx_vshuf_b(src2, src0, shuff); + src += src_stride; + src4 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vld, src, 16, src, 24, src6, src7); + src5 = __lsx_vshuf_b(src6, src4, shuff); + src += src_stride; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, res0, res1, res2, res3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, res4, res5, res6, res7); + DUP4_ARG3(__lsx_vssrarni_bu_h, res1, res0, FILTER_BITS, res3, res2, + FILTER_BITS, res5, res4, FILTER_BITS, res7, res6, FILTER_BITS, + res0, res2, res4, res6); + + DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1); + res0 = __lsx_vavgr_bu(res0, dst0); + __lsx_vst(res0, dst, 0); + res2 = __lsx_vavgr_bu(res2, dst1); + __lsx_vst(res2, dst, 16); + dst += dst_stride; + + DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1); + res4 = __lsx_vavgr_bu(res4, dst0); + __lsx_vst(res4, dst, 0); + res6 = __lsx_vavgr_bu(res6, dst1); + __lsx_vst(res6, dst, 16); + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_64w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = height; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, dst0, dst1, dst2, dst3; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + for (; loop_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src2, src4, + src6); + src7 = __lsx_vld(src, 56); + DUP2_ARG3(__lsx_vshuf_b, src2, src0, shuff, src4, src2, 
shuff, src1, src3); + src5 = __lsx_vshuf_b(src6, src4, shuff); + src += src_stride; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + out0, out2, out4, out6); + + DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, dst0, dst1, dst2, + dst3); + out0 = __lsx_vavgr_bu(out0, dst0); + __lsx_vst(out0, dst, 0); + out2 = __lsx_vavgr_bu(out2, dst1); + __lsx_vst(out2, dst, 16); + out4 = __lsx_vavgr_bu(out4, dst2); + __lsx_vst(out4, dst, 32); + out6 = __lsx_vavgr_bu(out6, dst3); + __lsx_vst(out6, dst, 48); + dst += dst_stride; + } +} + +void vpx_convolve8_avg_horiz_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16_t *const filter_x = filter[x0_q4]; + int8_t cnt, filt_hor[8]; + + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + } + + if (vpx_get_filter_taps(filter_x) == 2) { + switch (w) { + case 4: + common_hz_2t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 8: + common_hz_2t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 16: + common_hz_2t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + + case 32: + common_hz_2t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 64: + common_hz_2t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + default: + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_hz_8t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 8: + common_hz_8t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 16: + common_hz_8t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 32: + common_hz_8t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 64: + common_hz_8t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + default: + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c index 3608fe326c..5d67d65274 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c @@ -12,17 +12,6 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/loongarch/vpx_convolve_lsx.h" 
-#define LSX_LD_4(_src, _stride, _src0, _src1, _src2, _src3) \ - { \ - _src0 = __lsx_vld(_src, 0); \ - _src += _stride; \ - _src1 = __lsx_vld(_src, 0); \ - _src += _stride; \ - _src2 = __lsx_vld(_src, 0); \ - _src += _stride; \ - _src3 = __lsx_vld(_src, 0); \ - } - static const uint8_t mc_filt_mask_arr[16 * 3] = { /* 8 width cases */ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, diff --git a/vpx_dsp/loongarch/vpx_convolve8_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_lsx.c index 51a162bf3e..894c137203 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_lsx.c @@ -12,17 +12,6 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/loongarch/vpx_convolve_lsx.h" -#define LSX_LD_4(_src, _stride, _src0, _src1, _src2, _src3) \ - { \ - _src0 = __lsx_vld(_src, 0); \ - _src += _stride; \ - _src1 = __lsx_vld(_src, 0); \ - _src += _stride; \ - _src2 = __lsx_vld(_src, 0); \ - _src += _stride; \ - _src3 = __lsx_vld(_src, 0); \ - } - static const uint8_t mc_filt_mask_arr[16 * 3] = { /* 8 width cases */ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, diff --git a/vpx_dsp/loongarch/vpx_convolve_lsx.h b/vpx_dsp/loongarch/vpx_convolve_lsx.h index 0e3dcae006..d319bc4f7d 100644 --- a/vpx_dsp/loongarch/vpx_convolve_lsx.h +++ b/vpx_dsp/loongarch/vpx_convolve_lsx.h @@ -14,6 +14,17 @@ #include "vpx_util/loongson_intrinsics.h" #include "vpx_dsp/vpx_filter.h" +#define LSX_LD_4(_src, _stride, _src0, _src1, _src2, _src3) \ + { \ + _src0 = __lsx_vld(_src, 0); \ + _src += _stride; \ + _src1 = __lsx_vld(_src, 0); \ + _src += _stride; \ + _src2 = __lsx_vld(_src, 0); \ + _src += _stride; \ + _src3 = __lsx_vld(_src, 0); \ + } + #define FILT_8TAP_DPADD_S_H(_reg0, _reg1, _reg2, _reg3, _filter0, _filter1, \ _filter2, _filter3) \ ({ \ diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index bf348c1126..c844379364 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -164,6 +164,7 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_vert_dspr2.c DSP_SRCS-$(HAVE_VSX) += ppc/vpx_convolve_vsx.c # common (lsx) +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_avg_horiz_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_horiz_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_vert_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_lsx.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 73f28ff927..649aa17d1d 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -386,7 +386,7 @@ () specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; +specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; From 6ac395ed771a4fa986637197a71ac6fe58d57965 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 31 Mar 2022 12:10:03 -0700 Subject: [PATCH 241/926] Revert "quantize: remove highbd version" This reverts commit 89cfe3835c47dabf77d38edb3af190155984fa9a. 
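The restored vpx_highbd_quantize_b_c() differs from vpx_quantize_b_c() mainly
in the width of its intermediates; a sketch of the two quantization steps,
with variable names as in vpx_dsp/quantize.c below:

  /* vpx_quantize_b_c: 16-bit intermediates, clamped */
  int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
  tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
         quant_shift_ptr[rc != 0]) >>
        16;

  /* vpx_highbd_quantize_b_c: 64-bit intermediates and no clamp, because 10-
     and 12-bit coefficients can exceed INT16_MAX */
  const int64_t tmp1 = abs_coeff + round_ptr[rc != 0];
  const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
  const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 16);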
This is a prerequisite for reverting 2200039d33c49a9f7a5c438656df143755b022c4 which causes high bitdepth test failures Bug: webm:1586 Change-Id: I28f3b98f3339f3573b1492b88bf733dade133fc0 --- test/vp9_quantize_test.cc | 89 ++++--------------------- vpx_dsp/quantize.c | 132 ++++++++++++++++++++++++++++++-------- vpx_dsp/quantize.h | 17 ++++- 3 files changed, 134 insertions(+), 104 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index fcc7ff99ec..5773cd9835 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -30,6 +30,7 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/msvc.h" #include "vpx_ports/vpx_timer.h" +#include "vpx_dsp/quantize.h" using libvpx_test::ACMRandom; using libvpx_test::Buffer; @@ -466,13 +467,10 @@ INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, + make_tuple(&vpx_quantize_b_sse2, &vpx_highbd_quantize_b_c, VPX_BITS_10, 16, false), - make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, - VPX_BITS_12, 16, false), - make_tuple(&QuantFPWrapper, - &QuantFPWrapper, VPX_BITS_8, - 16, true))); + make_tuple(&vpx_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false))); #else INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, @@ -486,28 +484,6 @@ INSTANTIATE_TEST_SUITE_P( #if HAVE_SSSE3 #if VPX_ARCH_X86_64 -#if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_SUITE_P( - SSSE3, VP9QuantizeTest, - ::testing::Values( - make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_8, 16, - false), - make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_10, 16, - false), - make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_12, 16, - false), - make_tuple(&vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, - VPX_BITS_8, 32, false), - make_tuple(&vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, - VPX_BITS_10, 32, false), - make_tuple(&vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, - VPX_BITS_12, 32, false), - make_tuple(&QuantFPWrapper, - &QuantFPWrapper, VPX_BITS_8, 16, true), - make_tuple(&QuantFPWrapper, - &QuantFPWrapper, VPX_BITS_8, 32, - true))); -#else INSTANTIATE_TEST_SUITE_P( SSSE3, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, @@ -521,24 +497,6 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 32, true))); -#endif // #CONFIG_VP9_HIGHBITDEPTH -#else -#if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_SUITE_P( - SSSE3, VP9QuantizeTest, - ::testing::Values( - make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_8, 16, - false), - make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_10, 16, - false), - make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_12, 16, - false), - make_tuple(&vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, - VPX_BITS_8, 32, false), - make_tuple(&vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, - VPX_BITS_10, 32, false), - make_tuple(&vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, - VPX_BITS_12, 32, false))); #else INSTANTIATE_TEST_SUITE_P( SSSE3, VP9QuantizeTest, @@ -547,7 +505,7 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, false))); -#endif // CONFIG_VP9_HIGHBITDEPTH + #endif // VPX_ARCH_X86_64 #endif // HAVE_SSSE3 @@ -558,15 +516,15 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c, VPX_BITS_8, 16, 
false), - make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c, VPX_BITS_10, 16, - false), - make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c, VPX_BITS_12, 16, - false), - make_tuple(&vpx_quantize_b_32x32_avx, &vpx_quantize_b_32x32_c, + make_tuple(&vpx_quantize_b_avx, &vpx_highbd_quantize_b_c, VPX_BITS_10, + 16, false), + make_tuple(&vpx_quantize_b_avx, &vpx_highbd_quantize_b_c, VPX_BITS_12, + 16, false), + make_tuple(&vpx_quantize_b_32x32_avx, &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), - make_tuple(&vpx_quantize_b_32x32_avx, &vpx_quantize_b_32x32_c, + make_tuple(&vpx_quantize_b_32x32_avx, &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), - make_tuple(&vpx_quantize_b_32x32_avx, &vpx_quantize_b_32x32_c, + make_tuple(&vpx_quantize_b_32x32_avx, &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); #else @@ -589,28 +547,6 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_AVX2 #if HAVE_NEON -#if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_SUITE_P( - NEON, VP9QuantizeTest, - ::testing::Values( - make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16, - false), - make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_10, 16, - false), - make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_12, 16, - false), - make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c, - VPX_BITS_8, 32, false), - make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c, - VPX_BITS_10, 32, false), - make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c, - VPX_BITS_12, 32, false), - make_tuple(&QuantFPWrapper, - &QuantFPWrapper, VPX_BITS_8, 16, true), - make_tuple(&QuantFPWrapper, - &QuantFPWrapper, VPX_BITS_8, 32, - true))); -#else INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, @@ -624,7 +560,6 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 32, true))); -#endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_NEON #if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index c29b99bb0c..5d6ba64a8a 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -147,25 +147,66 @@ void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; if (abs_coeff >= zbins[rc != 0]) { + int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + quant_shift_ptr[rc != 0]) >> + 16; // quantization + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = (tran_low_t)(qcoeff_ptr[rc] * dequant_ptr[rc != 0]); + + if (tmp) eob = i; + } + } + *eob_ptr = eob + 1; +} + #if CONFIG_VP9_HIGHBITDEPTH - // High bit depth configurations do not clamp to INT16. 
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int i, non_zero_count = (int)n_coeffs, eob = -1; + const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + + if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) + non_zero_count--; + else + break; + } + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + if (abs_coeff >= zbins[rc != 0]) { const int64_t tmp1 = abs_coeff + round_ptr[rc != 0]; const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 16); -#else - const int tmp = - clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); - const int abs_qcoeff = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * - quant_shift_ptr[rc != 0]) >> - 16; // quantization -#endif // CONFIG_VP9_HIGHBITDEPTH qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = (tran_low_t)(qcoeff_ptr[rc] * dequant_ptr[rc != 0]); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; if (abs_qcoeff) eob = i; } } *eob_ptr = eob + 1; } +#endif void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, @@ -202,23 +243,15 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int rc = scan[idx_arr[i]]; const int coeff = coeff_ptr[rc]; const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int abs_qcoeff = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); -#if CONFIG_VP9_HIGHBITDEPTH - // High bit depth configurations do not clamp to INT16. - { - const int64_t tmp = - ((abs_qcoeff * quant_ptr[rc != 0]) >> 16) + abs_qcoeff; - abs_qcoeff = (int)((tmp * quant_shift_ptr[rc != 0]) >> 15); - } -#else - abs_qcoeff = clamp(abs_qcoeff, INT16_MIN, INT16_MAX); - abs_qcoeff = ((((abs_qcoeff * quant_ptr[rc != 0]) >> 16) + abs_qcoeff) * - quant_shift_ptr[rc != 0]) >> - 15; -#endif // CONFIG_VP9_HIGHBITDEPTH - - qcoeff_ptr[rc] = (abs_qcoeff ^ coeff_sign) - coeff_sign; + int tmp; + int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); + tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) * + quant_shift_ptr[rc != 0]) >> + 15; + + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; #if (VPX_ARCH_X86 || VPX_ARCH_X86_64) && !CONFIG_VP9_HIGHBITDEPTH // When tran_low_t is only 16 bits dqcoeff can outrange it. Rather than // truncating with a cast, saturate the value. 
This is easier to implement @@ -229,7 +262,54 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; #endif // VPX_ARCH_X86 && CONFIG_VP9_HIGHBITDEPTH + if (tmp) eob = idx_arr[i]; + } + *eob_ptr = eob + 1; +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_quantize_b_32x32_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), + ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + + int idx = 0; + int idx_arr[1024]; + int i, eob = -1; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) idx_arr[idx++] = i; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; if (abs_qcoeff) eob = idx_arr[i]; } *eob_ptr = eob + 1; } +#endif diff --git a/vpx_dsp/quantize.h b/vpx_dsp/quantize.h index 9ac1a47418..0fcd77941b 100644 --- a/vpx_dsp/quantize.h +++ b/vpx_dsp/quantize.h @@ -38,7 +38,22 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant, uint16_t *eob_ptr); -#endif // CONFIG_VP9_HIGHBITDEPTH +// Only used for reference. The optimized versions can handle HBD. +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); + +void vpx_highbd_quantize_b_32x32_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); +#endif #ifdef __cplusplus } // extern "C" From d00fd066e85176df0a21de3e99bad92ac2bacb00 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 31 Mar 2022 12:11:01 -0700 Subject: [PATCH 242/926] Revert "quantize: replace highbd versions" This reverts commit 2200039d33c49a9f7a5c438656df143755b022c4. This causes failures with VP9/EndToEndTestLarge.EndtoEndPSNRTest/*; it seems the assembly does not match the C code. 
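With the revert, high bit depth builds dispatch to the dedicated highbd
kernels again instead of routing 10- and 12-bit coefficients through the
8-bit path. Roughly, the TX_32X32 call site in vp9_encodemb.c goes back to
the following (the trailing arguments are completed from context, so treat
this as a sketch rather than the exact hunk):

  highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
  vpx_highbd_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant,
                              p->quant_shift, qcoeff, dqcoeff, pd->dequant,
                              &p->eobs[block], scan_order->scan,
                              scan_order->iscan);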
Bug: webm:1586 Change-Id: I4c63beebf88d4c12789d681b0d38014510b147fe --- test/vp9_quantize_test.cc | 42 +++--- vp9/encoder/vp9_encodemb.c | 48 +++---- vp9/encoder/vp9_quantize.c | 8 ++ vpx_dsp/quantize.h | 16 --- vpx_dsp/vpx_dsp.mk | 3 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 ++ vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 152 ++++++++++++++++++++++ 7 files changed, 211 insertions(+), 66 deletions(-) create mode 100644 vpx_dsp/x86/highbd_quantize_intrin_sse2.c diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 5773cd9835..d54f1bc9cd 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -30,7 +30,6 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/msvc.h" #include "vpx_ports/vpx_timer.h" -#include "vpx_dsp/quantize.h" using libvpx_test::ACMRandom; using libvpx_test::Buffer; @@ -465,12 +464,22 @@ using std::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_10, 16, false), - make_tuple(&vpx_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_12, 16, false))); + ::testing::Values( + make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16, + false), + make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), + make_tuple(&vpx_highbd_quantize_b_32x32_sse2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_sse2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_sse2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); + #else INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, @@ -510,24 +519,6 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_SSSE3 #if HAVE_AVX -#if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_SUITE_P( - AVX, VP9QuantizeTest, - ::testing::Values( - make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c, VPX_BITS_8, 16, - false), - make_tuple(&vpx_quantize_b_avx, &vpx_highbd_quantize_b_c, VPX_BITS_10, - 16, false), - make_tuple(&vpx_quantize_b_avx, &vpx_highbd_quantize_b_c, VPX_BITS_12, - 16, false), - make_tuple(&vpx_quantize_b_32x32_avx, &vpx_highbd_quantize_b_32x32_c, - VPX_BITS_8, 32, false), - make_tuple(&vpx_quantize_b_32x32_avx, &vpx_highbd_quantize_b_32x32_c, - VPX_BITS_10, 32, false), - make_tuple(&vpx_quantize_b_32x32_avx, &vpx_highbd_quantize_b_32x32_c, - VPX_BITS_12, 32, false))); - -#else INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c, @@ -535,7 +526,6 @@ INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest, make_tuple(&vpx_quantize_b_32x32_avx, &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, false))); -#endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_AVX #if VPX_ARCH_X86_64 && HAVE_AVX2 diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index e708555f89..fa222f9dcf 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -511,28 +511,28 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, 
pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b_32x32( + coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, + dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_highbd_quantize_b(coeff, 256, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); break; case TX_8X8: vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_highbd_quantize_b(coeff, 64, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 16, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_highbd_quantize_b(coeff, 16, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); break; } return; @@ -857,9 +857,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_highbd_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, dst_stride, xd->bd); highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b_32x32( + coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, + dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -876,9 +876,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); else vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type); - vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b(coeff, 256, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, + eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -896,9 +896,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); else vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type); - vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_highbd_quantize_b(coeff, 64, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, + eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -917,9 +917,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type); else x->fwd_txfm4x4(src_diff, coeff, 
diff_stride); - vpx_quantize_b(coeff, 16, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_highbd_quantize_b(coeff, 16, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, + eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index 1c401e96b4..9058997b0f 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -164,6 +164,14 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, return; } +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, p->zbin, + p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, &p->eobs[block], scan, iscan); + return; + } +#endif vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, &p->eobs[block], scan, iscan); diff --git a/vpx_dsp/quantize.h b/vpx_dsp/quantize.h index 0fcd77941b..8e138445e2 100644 --- a/vpx_dsp/quantize.h +++ b/vpx_dsp/quantize.h @@ -37,22 +37,6 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant, uint16_t *eob_ptr); - -// Only used for reference. The optimized versions can handle HBD. -void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan); - -void vpx_highbd_quantize_b_32x32_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan); #endif #ifdef __cplusplus diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index bf348c1126..a1e511cce0 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -318,6 +318,9 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.h DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c DSP_SRCS-$(HAVE_VSX) += ppc/quantize_vsx.c +ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c +endif # avg DSP_SRCS-yes += avg.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 73f28ff927..83dbcfdda2 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -714,6 +714,14 @@ () add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx vsx/; + + if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { + add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const 
int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/vpx_highbd_quantize_b sse2/;
+
+    add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/vpx_highbd_quantize_b_32x32 sse2/;
+  } # CONFIG_VP9_HIGHBITDEPTH
 } # CONFIG_VP9_ENCODER
 if (vpx_config("CONFIG_ENCODERS") eq "yes") {
diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
new file mode 100644
index 0000000000..4535a0f7a2
--- /dev/null
+++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+#include <string.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
+                                const int16_t *zbin_ptr,
+                                const int16_t *round_ptr,
+                                const int16_t *quant_ptr,
+                                const int16_t *quant_shift_ptr,
+                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                const int16_t *scan, const int16_t *iscan) {
+  int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
+  __m128i zbins[2];
+  __m128i nzbins[2];
+
+  zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1],
+                           (int)zbin_ptr[0]);
+  zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);
+
+  nzbins[0] = _mm_setzero_si128();
+  nzbins[1] = _mm_setzero_si128();
+  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+  (void)scan;
+
+  memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
+
+  // Pre-scan pass
+  for (i = ((int)count / 4) - 1; i >= 0; i--) {
+    __m128i coeffs, cmp1, cmp2;
+    int test;
+    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+    cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+    cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+    cmp1 = _mm_and_si128(cmp1, cmp2);
+    test = _mm_movemask_epi8(cmp1);
+    if (test == 0xffff)
+      non_zero_regs--;
+    else
+      break;
+  }
+
+  // Quantization pass:
+  for (i = 0; i < non_zero_regs; i++) {
+    __m128i coeffs, coeffs_sign, tmp1, tmp2;
+    int test;
+    int abs_coeff[4];
+    int coeff_sign[4];
+
+    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+    coeffs_sign = _mm_srai_epi32(coeffs, 31);
+    coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
+    tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
+    tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
+    tmp1 = _mm_or_si128(tmp1, tmp2);
+    test = _mm_movemask_epi8(tmp1);
+    _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
+    _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
+
+    for (j = 0; j < 4; j++) {
+      if (test & (1 << (4 * j))) {
+        int k = 4 * i + j;
+        const int64_t tmp3 = 
abs_coeff[j] + round_ptr[k != 0]; + const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; + const uint32_t abs_qcoeff = + (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); + qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; + dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; + if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; + } + } + } + *eob_ptr = eob_i + 1; +} + +void vpx_highbd_quantize_b_32x32_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + __m128i zbins[2]; + __m128i nzbins[2]; + int idx = 0; + int idx_arr[1024]; + int i, eob = -1; + const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); + const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); + (void)scan; + + zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); + zbins[1] = _mm_set1_epi32(zbin1_tmp); + + nzbins[0] = _mm_setzero_si128(); + nzbins[1] = _mm_setzero_si128(); + nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); + nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = 0; i < n_coeffs / 4; i++) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (!(test & 0xf)) idx_arr[idx++] = i * 4; + if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; + if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; + if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = idx_arr[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? 
iscan[idx_arr[i]] : eob; + } + *eob_ptr = eob + 1; +} +#endif From d04f78b5635e84a9099f2b0105562b87ba75f2cd Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 11 Apr 2022 11:08:12 -0700 Subject: [PATCH 243/926] rate_hist,show_histogram: fix crash w/0 buckets this can occur if 0 frames are encoded, e.g., due to --skip see also: https://crbug.com/aomedia/3243 Change-Id: I791d5ad6611dbcb60d790e6b705298328ec48126 --- rate_hist.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rate_hist.c b/rate_hist.c index d10e754fee..947950d481 100644 --- a/rate_hist.c +++ b/rate_hist.c @@ -196,7 +196,9 @@ static void show_histogram(const struct hist_bucket *bucket, int buckets, int width1, width2; int i; + if (!buckets) return; assert(bucket != NULL); + assert(buckets > 0); switch ((int)(log(bucket[buckets - 1].high) / log(10)) + 1) { case 1: From a3cd75e29bdc1ca9df81f944a6c873220509fda8 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 11 Apr 2022 11:41:19 -0700 Subject: [PATCH 244/926] vpxdec: add some allocation checks see also: https://crbug.com/aomedia/3244 Change-Id: I7d151e63a91b8c1a5ee4e861f0b8461eeece6a2f --- vpxdec.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/vpxdec.c b/vpxdec.c index ad368a230b..363eb1a24b 100644 --- a/vpxdec.c +++ b/vpxdec.c @@ -815,6 +815,10 @@ static int main_loop(int argc, const char **argv_) { ext_fb_list.num_external_frame_buffers = num_external_frame_buffers; ext_fb_list.ext_fb = (struct ExternalFrameBuffer *)calloc( num_external_frame_buffers, sizeof(*ext_fb_list.ext_fb)); + if (!ext_fb_list.ext_fb) { + fprintf(stderr, "Failed to allocate ExternalFrameBuffer\n"); + goto fail; + } if (vpx_codec_set_frame_buffer_functions(&decoder, get_vp9_frame_buffer, release_vp9_frame_buffer, &ext_fb_list)) { @@ -930,6 +934,11 @@ static int main_loop(int argc, const char **argv_) { } scaled_img = vpx_img_alloc(NULL, img->fmt, render_width, render_height, 16); + if (!scaled_img) { + fprintf(stderr, "Failed to allocate scaled image (%d x %d)\n", + render_width, render_height); + goto fail; + } scaled_img->bit_depth = img->bit_depth; } @@ -966,6 +975,10 @@ static int main_loop(int argc, const char **argv_) { if (!img_shifted) { img_shifted = vpx_img_alloc(NULL, shifted_fmt, img->d_w, img->d_h, 16); + if (!img_shifted) { + fprintf(stderr, "Failed to allocate image\n"); + goto fail; + } img_shifted->bit_depth = output_bit_depth; } if (output_bit_depth > img->bit_depth) { From 3a3645dbdc7822f52f7b136861e4447ea8d551f9 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Tue, 15 Mar 2022 16:15:36 +0800 Subject: [PATCH 245/926] vp9[loongarch]: Optimize sad64x64/32x32/16x16 1. vpx_sad64x64x4d_lsx 2. vpx_sad32x32x4d_lsx 3. vpx_sad16x16x4d_lsx 4. vpx_sad64x64_lsx 5. 
vpx_sad32x32_lsx Bug: webm:1755 Change-Id: Ief71c2216f697b261d7c1fc481c89c9f1a6098e6 --- test/sad_test.cc | 18 ++ vpx_dsp/loongarch/sad_lsx.c | 378 +++++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp.mk | 2 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 10 +- 4 files changed, 403 insertions(+), 5 deletions(-) create mode 100644 vpx_dsp/loongarch/sad_lsx.c diff --git a/test/sad_test.cc b/test/sad_test.cc index 560c5f3823..aec4cbc380 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1128,4 +1128,22 @@ const SadMxNx4Param x4d_mmi_tests[] = { }; INSTANTIATE_TEST_SUITE_P(MMI, SADx4Test, ::testing::ValuesIn(x4d_mmi_tests)); #endif // HAVE_MMI + +//------------------------------------------------------------------------------ +// loongarch functions +#if HAVE_LSX +const SadMxNParam lsx_tests[] = { + SadMxNParam(64, 64, &vpx_sad64x64_lsx), + SadMxNParam(32, 32, &vpx_sad32x32_lsx), +}; +INSTANTIATE_TEST_SUITE_P(LSX, SADTest, ::testing::ValuesIn(lsx_tests)); + +const SadMxNx4Param x4d_lsx_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_lsx), + SadMxNx4Param(32, 32, &vpx_sad32x32x4d_lsx), + SadMxNx4Param(16, 16, &vpx_sad16x16x4d_lsx), +}; +INSTANTIATE_TEST_SUITE_P(LSX, SADx4Test, ::testing::ValuesIn(x4d_lsx_tests)); +#endif // HAVE_LSX + } // namespace diff --git a/vpx_dsp/loongarch/sad_lsx.c b/vpx_dsp/loongarch/sad_lsx.c new file mode 100644 index 0000000000..59b268ca1f --- /dev/null +++ b/vpx_dsp/loongarch/sad_lsx.c @@ -0,0 +1,378 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +#define SAD_UB2_UH(in0, in1, ref0, ref1) \ + ({ \ + __m128i diff0_m, diff1_m, sad_m0; \ + __m128i sad_m = __lsx_vldi(0); \ + \ + diff0_m = __lsx_vabsd_bu(in0, ref0); \ + diff1_m = __lsx_vabsd_bu(in1, ref1); \ + \ + sad_m0 = __lsx_vhaddw_hu_bu(diff0_m, diff0_m); \ + sad_m = __lsx_vadd_h(sad_m, sad_m0); \ + sad_m0 = __lsx_vhaddw_hu_bu(diff1_m, diff1_m); \ + sad_m = __lsx_vadd_h(sad_m, sad_m0); \ + \ + sad_m; \ + }) + +#define HADD_UW_U32(in) \ + ({ \ + __m128i res0_m; \ + uint32_t sum_m; \ + res0_m = __lsx_vhaddw_du_wu(in, in); \ + res0_m = __lsx_vhaddw_qu_du(res0_m, res0_m); \ + sum_m = __lsx_vpickve2gr_w(res0_m, 0); \ + sum_m; \ + }) + +#define HADD_UH_U32(in) \ + ({ \ + __m128i res_m; \ + uint32_t sum_m; \ + res_m = __lsx_vhaddw_wu_hu(in, in); \ + sum_m = HADD_UW_U32(res_m); \ + sum_m; \ + }) + +static uint32_t sad_32width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt = (height >> 2); + __m128i src0, src1, ref0, ref1; + __m128i sad_tmp; + __m128i sad = __lsx_vldi(0); + + for (; ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); + ref += ref_stride; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); + ref += ref_stride; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); + ref += ref_stride; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); + ref += ref_stride; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + } + return HADD_UH_U32(sad); +} + +static uint32_t sad_64width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt = (height >> 1); + uint32_t sad = 0; + __m128i src0, src1, src2, src3; + __m128i ref0, ref1, ref2, ref3; + __m128i sad_tmp; + __m128i sad0 = __lsx_vldi(0); + __m128i sad1 = sad0; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + } + + sad = HADD_UH_U32(sad0); + sad += HADD_UH_U32(sad1); + + return sad; +} + +static void sad_16width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + int32_t ht_cnt = (height >> 1); + 
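+  // Editorial note (assumed semantics, mirroring the scalar
+  // vpx_sad16x16x4d_c reference): each iteration loads one 16-byte source
+  // row and the corresponding row from each of the four candidate
+  // references; __lsx_vabsd_bu forms per-byte |src - ref| and
+  // __lsx_vhaddw_hu_bu widens and pairwise-adds those differences into
+  // eight 16-bit lanes per accumulator, which HADD_UH_U32 reduces to a
+  // single 32-bit SAD per reference after the loop.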
const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + __m128i src, ref0, ref1, ref2, ref3, diff, sad_tmp; + __m128i sad0 = __lsx_vldi(0); + __m128i sad1 = sad0; + __m128i sad2 = sad0; + __m128i sad3 = sad0; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (; ht_cnt--;) { + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref0 = __lsx_vld(ref0_ptr, 0); + ref0_ptr += ref_stride; + ref1 = __lsx_vld(ref1_ptr, 0); + ref1_ptr += ref_stride; + ref2 = __lsx_vld(ref2_ptr, 0); + ref2_ptr += ref_stride; + ref3 = __lsx_vld(ref3_ptr, 0); + ref3_ptr += ref_stride; + + diff = __lsx_vabsd_bu(src, ref0); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + diff = __lsx_vabsd_bu(src, ref1); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + diff = __lsx_vabsd_bu(src, ref2); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad2 = __lsx_vadd_h(sad2, sad_tmp); + diff = __lsx_vabsd_bu(src, ref3); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad3 = __lsx_vadd_h(sad3, sad_tmp); + + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref0 = __lsx_vld(ref0_ptr, 0); + ref0_ptr += ref_stride; + ref1 = __lsx_vld(ref1_ptr, 0); + ref1_ptr += ref_stride; + ref2 = __lsx_vld(ref2_ptr, 0); + ref2_ptr += ref_stride; + ref3 = __lsx_vld(ref3_ptr, 0); + ref3_ptr += ref_stride; + + diff = __lsx_vabsd_bu(src, ref0); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + diff = __lsx_vabsd_bu(src, ref1); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + diff = __lsx_vabsd_bu(src, ref2); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad2 = __lsx_vadd_h(sad2, sad_tmp); + diff = __lsx_vabsd_bu(src, ref3); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad3 = __lsx_vadd_h(sad3, sad_tmp); + } + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + +static void sad_32width_x4d_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + int32_t ht_cnt = height; + __m128i src0, src1, ref0, ref1, sad_tmp; + __m128i sad0 = __lsx_vldi(0); + __m128i sad1 = sad0; + __m128i sad2 = sad0; + __m128i sad3 = sad0; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (; ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + src += src_stride; + + DUP2_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0, ref1); + ref0_ptr += ref_stride; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + + DUP2_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref0, ref1); + ref1_ptr += ref_stride; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP2_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref0, ref1); + ref2_ptr += ref_stride; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad2 = __lsx_vadd_h(sad2, sad_tmp); + + DUP2_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref0, ref1); + ref3_ptr += ref_stride; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad3 = __lsx_vadd_h(sad3, sad_tmp); + } + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + +static void sad_64width_x4d_lsx(const uint8_t *src, 
int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + int32_t ht_cnt = height; + __m128i src0, src1, src2, src3; + __m128i ref0, ref1, ref2, ref3; + __m128i sad, sad_tmp; + + __m128i sad0_0 = __lsx_vldi(0); + __m128i sad0_1 = sad0_0; + __m128i sad1_0 = sad0_0; + __m128i sad1_1 = sad0_0; + __m128i sad2_0 = sad0_0; + __m128i sad2_1 = sad0_0; + __m128i sad3_0 = sad0_0; + __m128i sad3_1 = sad0_0; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + + DUP4_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0_ptr, 32, ref0_ptr, 48, + ref0, ref1, ref2, ref3); + ref0_ptr += ref_stride; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad0_0 = __lsx_vadd_h(sad0_0, sad_tmp); + sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad0_1 = __lsx_vadd_h(sad0_1, sad_tmp); + + DUP4_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref1_ptr, 32, ref1_ptr, 48, + ref0, ref1, ref2, ref3); + ref1_ptr += ref_stride; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad1_0 = __lsx_vadd_h(sad1_0, sad_tmp); + sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad1_1 = __lsx_vadd_h(sad1_1, sad_tmp); + + DUP4_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref2_ptr, 32, ref2_ptr, 48, + ref0, ref1, ref2, ref3); + ref2_ptr += ref_stride; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad2_0 = __lsx_vadd_h(sad2_0, sad_tmp); + sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad2_1 = __lsx_vadd_h(sad2_1, sad_tmp); + + DUP4_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref3_ptr, 32, ref3_ptr, 48, + ref0, ref1, ref2, ref3); + ref3_ptr += ref_stride; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad3_0 = __lsx_vadd_h(sad3_0, sad_tmp); + sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad3_1 = __lsx_vadd_h(sad3_1, sad_tmp); + } + sad = __lsx_vhaddw_wu_hu(sad0_0, sad0_0); + sad_tmp = __lsx_vhaddw_wu_hu(sad0_1, sad0_1); + sad = __lsx_vadd_w(sad, sad_tmp); + sad_array[0] = HADD_UW_U32(sad); + + sad = __lsx_vhaddw_wu_hu(sad1_0, sad1_0); + sad_tmp = __lsx_vhaddw_wu_hu(sad1_1, sad1_1); + sad = __lsx_vadd_w(sad, sad_tmp); + sad_array[1] = HADD_UW_U32(sad); + + sad = __lsx_vhaddw_wu_hu(sad2_0, sad2_0); + sad_tmp = __lsx_vhaddw_wu_hu(sad2_1, sad2_1); + sad = __lsx_vadd_w(sad, sad_tmp); + sad_array[2] = HADD_UW_U32(sad); + + sad = __lsx_vhaddw_wu_hu(sad3_0, sad3_0); + sad_tmp = __lsx_vhaddw_wu_hu(sad3_1, sad3_1); + sad = __lsx_vadd_w(sad, sad_tmp); + sad_array[3] = HADD_UW_U32(sad); +} + +#define VPX_SAD_32xHT_LSX(height) \ + uint32_t vpx_sad32x##height##_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_32width_lsx(src, src_stride, ref, ref_stride, height); \ + } + +#define VPX_SAD_64xHT_LSX(height) \ + uint32_t vpx_sad64x##height##_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_64width_lsx(src, src_stride, ref, ref_stride, height); \ + } + +#define VPX_SAD_16xHTx4D_LSX(height) \ + void vpx_sad16x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_16width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define VPX_SAD_32xHTx4D_LSX(height) \ + void vpx_sad32x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \ + const 
uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_32width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define VPX_SAD_64xHTx4D_LSX(height) \ + void vpx_sad64x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_64width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define SAD64 VPX_SAD_64xHT_LSX(64) VPX_SAD_64xHTx4D_LSX(64) + +SAD64 + +#define SAD32 VPX_SAD_32xHT_LSX(32) VPX_SAD_32xHTx4D_LSX(32) + +SAD32 + +#define SAD16 VPX_SAD_16xHTx4D_LSX(16) + +SAD16 + +#undef SAD64 +#undef SAD32 +#undef SAD16 diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 62fed9e2ee..2289f6f225 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -356,6 +356,8 @@ DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c DSP_SRCS-$(HAVE_MSA) += mips/subtract_msa.c +DSP_SRCS-$(HAVE_LSX) += loongarch/sad_lsx.c + DSP_SRCS-$(HAVE_MMI) += mips/sad_mmi.c DSP_SRCS-$(HAVE_MMI) += mips/subtract_mmi.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index af9be16a77..7513af5f77 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -735,7 +735,7 @@ () # Single block SAD # add_proto qw/unsigned int vpx_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad64x64 neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad64x64 neon avx2 msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad64x32 neon avx2 msa sse2 vsx mmi/; @@ -744,7 +744,7 @@ () specialize qw/vpx_sad32x64 neon avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad32x32 neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad32x32 neon avx2 msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad32x16 neon avx2 msa sse2 vsx mmi/; @@ -877,7 +877,7 @@ () # Multi-block SAD, comparing a reference to N independent blocks # add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi/; +specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi lsx/; add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi/; @@ -886,7 +886,7 @@ () specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi/; add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi/; +specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi lsx/; add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad32x16x4d neon msa sse2 vsx mmi/; @@ -895,7 +895,7 @@ () specialize qw/vpx_sad16x32x4d neon msa sse2 vsx mmi/; add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const 
ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad16x16x4d neon msa sse2 vsx mmi/; +specialize qw/vpx_sad16x16x4d neon msa sse2 vsx mmi lsx/; add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad16x8x4d neon msa sse2 vsx mmi/; From caf65c14a82e8d66af9d016738d210b0b307d7eb Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Sat, 19 Mar 2022 10:44:27 +0800 Subject: [PATCH 246/926] vp9[loongarch]: Optimize vpx_variance64x64/32x32 1. vpx_variance64x64_lsx 2. vpx_variance32x32_lsx Bug: webm:1755 Change-Id: I45c5aa94cbbf7128473894a990d931acaa40e102 --- test/variance_test.cc | 7 ++ vpx_dsp/loongarch/variance_lsx.c | 161 +++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp.mk | 2 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 4 files changed, 172 insertions(+), 2 deletions(-) create mode 100644 vpx_dsp/loongarch/variance_lsx.c diff --git a/test/variance_test.cc b/test/variance_test.cc index 660bbd0ed7..8060875197 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1649,4 +1649,11 @@ INSTANTIATE_TEST_SUITE_P( SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_mmi, 0), SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_mmi, 0))); #endif // HAVE_MMI + +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P( + LSX, VpxVarianceTest, + ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_lsx), + VarianceParams(5, 5, &vpx_variance32x32_lsx))); +#endif } // namespace diff --git a/vpx_dsp/loongarch/variance_lsx.c b/vpx_dsp/loongarch/variance_lsx.c new file mode 100644 index 0000000000..8164e98189 --- /dev/null +++ b/vpx_dsp/loongarch/variance_lsx.c @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +#define HADD_SW_S32(in) \ + ({ \ + __m128i res0_m; \ + int32_t sum_m; \ + \ + res0_m = __lsx_vhaddw_d_w(in, in); \ + res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); \ + sum_m = __lsx_vpickve2gr_w(res0_m, 0); \ + sum_m; \ + }) + +#define CALC_MSE_AVG_B(src, ref, var, sub) \ + { \ + __m128i src_l0_m, src_l1_m; \ + __m128i res_l0_m, res_l1_m; \ + \ + src_l0_m = __lsx_vilvl_b(src, ref); \ + src_l1_m = __lsx_vilvh_b(src, ref); \ + DUP2_ARG2(__lsx_vhsubw_hu_bu, src_l0_m, src_l0_m, src_l1_m, src_l1_m, \ + res_l0_m, res_l1_m); \ + var = __lsx_vdp2add_w_h(var, res_l0_m, res_l0_m); \ + var = __lsx_vdp2add_w_h(var, res_l1_m, res_l1_m); \ + sub = __lsx_vadd_h(sub, res_l0_m); \ + sub = __lsx_vadd_h(sub, res_l1_m); \ + } + +#define VARIANCE_LARGE_WxH(sse, diff, shift) \ + (sse) - (((int64_t)(diff) * (diff)) >> (shift)) + +static uint32_t sse_diff_32width_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t ht_cnt = (height >> 2); + __m128i avg = __lsx_vldi(0); + __m128i src0, src1, ref0, ref1; + __m128i vec; + __m128i var = avg; + + for (; ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); + src_ptr += src_stride; + DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); + src_ptr += src_stride; + DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); + src_ptr += src_stride; + DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); + src_ptr += src_stride; + DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __lsx_vhaddw_w_h(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_64x64_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t *diff) { + int32_t ht_cnt = 32; + __m128i avg0 = __lsx_vldi(0); + __m128i src0, src1, src2, src3; + __m128i ref0, ref1, ref2, ref3; + __m128i vec0, vec1; + __m128i avg1 = avg0; + __m128i avg2 = avg0; + __m128i avg3 = avg0; + __m128i var = avg0; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + src_ptr += src_stride; + DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48, + ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + src_ptr += src_stride; + DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48, + ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + 
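+    // Editorial note (assumed semantics): CALC_MSE_AVG_B interleaves the
+    // src/ref bytes, forms signed 16-bit differences with __lsx_vhsubw_hu_bu,
+    // accumulates their squares into the 32-bit lanes of 'var' and the raw
+    // differences into the 16-bit lanes of the 'avg' accumulators; the
+    // reductions after the loop yield the scalar sse and diff consumed by
+    // VARIANCE_LARGE_WxH(sse, diff, 12).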
CALC_MSE_AVG_B(src3, ref3, var, avg3); + } + vec0 = __lsx_vhaddw_w_h(avg0, avg0); + vec1 = __lsx_vhaddw_w_h(avg1, avg1); + vec0 = __lsx_vadd_w(vec0, vec1); + vec1 = __lsx_vhaddw_w_h(avg2, avg2); + vec0 = __lsx_vadd_w(vec0, vec1); + vec1 = __lsx_vhaddw_w_h(avg3, avg3); + vec0 = __lsx_vadd_w(vec0, vec1); + *diff = HADD_SW_S32(vec0); + + return HADD_SW_S32(var); +} + +#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10); +#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12); + +#define VPX_VARIANCE_WDXHT_LSX(wd, ht) \ + uint32_t vpx_variance##wd##x##ht##_lsx( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, uint32_t *sse) { \ + int32_t diff; \ + \ + *sse = \ + sse_diff_##wd##width_lsx(src, src_stride, ref, ref_stride, ht, &diff); \ + \ + return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } + +VPX_VARIANCE_WDXHT_LSX(32, 32) + +uint32_t vpx_variance64x64_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + int32_t diff; + + *sse = sse_diff_64x64_lsx(src, src_stride, ref, ref_stride, &diff); + + return VARIANCE_64Wx64H(*sse, diff); +} diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 2289f6f225..198e0060f7 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -390,6 +390,8 @@ DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c +DSP_SRCS-$(HAVE_LSX) += loongarch/variance_lsx.c + DSP_SRCS-$(HAVE_MMI) += mips/variance_mmi.c DSP_SRCS-$(HAVE_SSE2) += x86/avg_pred_sse2.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 7513af5f77..bcc1b916cc 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1080,7 +1080,7 @@ () # Variance # add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance64x64 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance64x64 sse2 avx2 neon msa mmi vsx lsx/; add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_variance64x32 sse2 avx2 neon msa mmi vsx/; @@ -1089,7 +1089,7 @@ () specialize qw/vpx_variance32x64 sse2 avx2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x32 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance32x32 sse2 avx2 neon msa mmi vsx lsx/; add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_variance32x16 sse2 avx2 neon msa mmi vsx/; From d387c89e86de35fe3f12b5c9db2919bc82f90157 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Thu, 31 Mar 2022 15:01:30 +0800 Subject: [PATCH 247/926] Update loongson_intrinsics.h from v1.0.5 to v1.2.1 Bug: webm:1755 Change-Id: Ib636d2aa521332b76b6aa1b0aa0a9005aafbf32b --- vpx_util/loongson_intrinsics.h | 259 ++++++++++++++++++++++++++++++--- 1 file changed, 240 insertions(+), 19 deletions(-) diff --git a/vpx_util/loongson_intrinsics.h b/vpx_util/loongson_intrinsics.h index a34b6e8b44..b8b9e6db02 100644 --- a/vpx_util/loongson_intrinsics.h +++ b/vpx_util/loongson_intrinsics.h @@ -39,8 +39,8 @@ * MICRO version: Comment changes or implementation changes. 
*/ #define LSOM_VERSION_MAJOR 1 -#define LSOM_VERSION_MINOR 0 -#define LSOM_VERSION_MICRO 5 +#define LSOM_VERSION_MINOR 2 +#define LSOM_VERSION_MICRO 1 #define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \ { \ @@ -90,8 +90,8 @@ * Return Type - halfword * Details : Signed byte elements from in_h are multiplied by * signed byte elements from in_l, and then added adjacent to - * each other to get results with the twice size of input. - * Then the results plus to signed half-word elements from in_c. + * each other to get a result twice the size of input. Then + * the results are added to signed half-word elements from in_c. * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l) * in_c : 1,2,3,4, 1,2,3,4 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 @@ -116,9 +116,9 @@ static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h, * Return Type - halfword * Details : Unsigned byte elements from in_h are multiplied by * unsigned byte elements from in_l, and then added adjacent to - * each other to get results with the twice size of input. - * The results plus to signed half-word elements from in_c. - * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l) + * each other to get a result twice the size of input. + * The results are added to signed half-word elements from in_c. + * Example : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l) * in_c : 1,2,3,4, 1,2,3,4 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 @@ -134,6 +134,32 @@ static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h, return out; } +/* + * ============================================================================= + * Description : Dot product & addition of byte vector elements + * Arguments : Inputs - in_c, in_h, in_l + * Outputs - out + * Return Type - halfword + * Details : Unsigned byte elements from in_h are multiplied by + * signed byte elements from in_l, and then added adjacent to + * each other to get a result twice the size of input. + * The results are added to signed half-word elements from in_c. + * Example : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l) + * in_c : 1,1,1,1, 1,1,1,1 + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8 + * out : -4,-24,-60,-112, 6,26,62,114 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h, + __m128i in_l) { + __m128i out; + + out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l); + out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l); + return out; +} + /* * ============================================================================= * Description : Dot product & addition of half-word vector elements @@ -142,8 +168,8 @@ static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h, * Return Type - __m128i * Details : Signed half-word elements from in_h are multiplied by * signed half-word elements from in_l, and then added adjacent to - * each other to get results with the twice size of input. - * Then the results plus to signed word elements from in_c. + * each other to get a result twice the size of input. + * Then the results are added to signed word elements from in_c. 
* Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l) * in_c : 1,2,3,4 * in_h : 1,2,3,4, 5,6,7,8 @@ -168,7 +194,7 @@ static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h, * Return Type - halfword * Details : Signed byte elements from in_h are multiplied by * signed byte elements from in_l, and then added adjacent to - * each other to get results with the twice size of input. + * each other to get a result twice the size of input. * Example : out = __lsx_vdp2_h_b(in_h, in_l) * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 @@ -191,7 +217,7 @@ static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) { * Return Type - halfword * Details : Unsigned byte elements from in_h are multiplied by * unsigned byte elements from in_l, and then added adjacent to - * each other to get results with the twice size of input. + * each other to get a result twice the size of input. * Example : out = __lsx_vdp2_h_bu(in_h, in_l) * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 @@ -214,7 +240,7 @@ static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) { * Return Type - halfword * Details : Unsigned byte elements from in_h are multiplied by * signed byte elements from in_l, and then added adjacent to - * each other to get results with the twice size of input. + * each other to get a result twice the size of input. * Example : out = __lsx_vdp2_h_bu_b(in_h, in_l) * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1 @@ -237,7 +263,7 @@ static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) { * Return Type - halfword * Details : Signed byte elements from in_h are multiplied by * signed byte elements from in_l, and then added adjacent to - * each other to get results with the twice size of input. + * each other to get a result twice the size of input. * Example : out = __lsx_vdp2_w_h(in_h, in_l) * in_h : 1,2,3,4, 5,6,7,8 * in_l : 8,7,6,5, 4,3,2,1 @@ -252,6 +278,29 @@ static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) { return out; } +/* + * ============================================================================= + * Description : Dot product of byte vector elements + * Arguments : Inputs - in_h, in_l + * Outputs - out + * Return Type - double + * Details : Signed byte elements from in_h are multiplied by + * signed byte elements from in_l, and then added adjacent to + * each other to get a result twice the size of input. 
+ * Example : out = __lsx_vdp2_d_w(in_h, in_l) + * in_h : 1,2,3,4 + * in_l : 8,7,6,5 + * out : 22,38 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2_d_w(__m128i in_h, __m128i in_l) { + __m128i out; + + out = __lsx_vmulwev_d_w(in_h, in_l); + out = __lsx_vmaddwod_d_w(out, in_h, in_l); + return out; +} + /* * ============================================================================= * Description : Clip all halfword elements of input vector between min & max @@ -679,6 +728,132 @@ static inline __m128i __lsx_vclip255_w(__m128i _in) { _out7 = __lsx_vsub_d(_in0, _in7); \ } +/* + * ============================================================================= + * Description : Butterfly of 16 input vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3, ~ + * Outputs - _out0, _out1, _out2, _out3, ~ + * Details : Butterfly operation + * Example : + * _out0 = _in0 + _in15; + * _out1 = _in1 + _in14; + * _out2 = _in2 + _in13; + * _out3 = _in3 + _in12; + * _out4 = _in4 + _in11; + * _out5 = _in5 + _in10; + * _out6 = _in6 + _in9; + * _out7 = _in7 + _in8; + * _out8 = _in7 - _in8; + * _out9 = _in6 - _in9; + * _out10 = _in5 - _in10; + * _out11 = _in4 - _in11; + * _out12 = _in3 - _in12; + * _out13 = _in2 - _in13; + * _out14 = _in1 - _in14; + * _out15 = _in0 - _in15; + * ============================================================================= + */ + +#define LSX_BUTTERFLY_16_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ + _out6, _out7, _out8, _out9, _out10, _out11, _out12, \ + _out13, _out14, _out15) \ + { \ + _out0 = __lsx_vadd_b(_in0, _in15); \ + _out1 = __lsx_vadd_b(_in1, _in14); \ + _out2 = __lsx_vadd_b(_in2, _in13); \ + _out3 = __lsx_vadd_b(_in3, _in12); \ + _out4 = __lsx_vadd_b(_in4, _in11); \ + _out5 = __lsx_vadd_b(_in5, _in10); \ + _out6 = __lsx_vadd_b(_in6, _in9); \ + _out7 = __lsx_vadd_b(_in7, _in8); \ + \ + _out8 = __lsx_vsub_b(_in7, _in8); \ + _out9 = __lsx_vsub_b(_in6, _in9); \ + _out10 = __lsx_vsub_b(_in5, _in10); \ + _out11 = __lsx_vsub_b(_in4, _in11); \ + _out12 = __lsx_vsub_b(_in3, _in12); \ + _out13 = __lsx_vsub_b(_in2, _in13); \ + _out14 = __lsx_vsub_b(_in1, _in14); \ + _out15 = __lsx_vsub_b(_in0, _in15); \ + } + +#define LSX_BUTTERFLY_16_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ + _out6, _out7, _out8, _out9, _out10, _out11, _out12, \ + _out13, _out14, _out15) \ + { \ + _out0 = __lsx_vadd_h(_in0, _in15); \ + _out1 = __lsx_vadd_h(_in1, _in14); \ + _out2 = __lsx_vadd_h(_in2, _in13); \ + _out3 = __lsx_vadd_h(_in3, _in12); \ + _out4 = __lsx_vadd_h(_in4, _in11); \ + _out5 = __lsx_vadd_h(_in5, _in10); \ + _out6 = __lsx_vadd_h(_in6, _in9); \ + _out7 = __lsx_vadd_h(_in7, _in8); \ + \ + _out8 = __lsx_vsub_h(_in7, _in8); \ + _out9 = __lsx_vsub_h(_in6, _in9); \ + _out10 = __lsx_vsub_h(_in5, _in10); \ + _out11 = __lsx_vsub_h(_in4, _in11); \ + _out12 = __lsx_vsub_h(_in3, _in12); \ + _out13 = __lsx_vsub_h(_in2, _in13); \ + _out14 = __lsx_vsub_h(_in1, _in14); \ + _out15 = __lsx_vsub_h(_in0, _in15); \ + } + +#define LSX_BUTTERFLY_16_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ + _out6, _out7, _out8, _out9, _out10, _out11, _out12, \ + _out13, _out14, _out15) \ + { \ + _out0 = __lsx_vadd_w(_in0, 
_in15); \ + _out1 = __lsx_vadd_w(_in1, _in14); \ + _out2 = __lsx_vadd_w(_in2, _in13); \ + _out3 = __lsx_vadd_w(_in3, _in12); \ + _out4 = __lsx_vadd_w(_in4, _in11); \ + _out5 = __lsx_vadd_w(_in5, _in10); \ + _out6 = __lsx_vadd_w(_in6, _in9); \ + _out7 = __lsx_vadd_w(_in7, _in8); \ + \ + _out8 = __lsx_vsub_w(_in7, _in8); \ + _out9 = __lsx_vsub_w(_in6, _in9); \ + _out10 = __lsx_vsub_w(_in5, _in10); \ + _out11 = __lsx_vsub_w(_in4, _in11); \ + _out12 = __lsx_vsub_w(_in3, _in12); \ + _out13 = __lsx_vsub_w(_in2, _in13); \ + _out14 = __lsx_vsub_w(_in1, _in14); \ + _out15 = __lsx_vsub_w(_in0, _in15); \ + } + +#define LSX_BUTTERFLY_16_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ + _out6, _out7, _out8, _out9, _out10, _out11, _out12, \ + _out13, _out14, _out15) \ + { \ + _out0 = __lsx_vadd_d(_in0, _in15); \ + _out1 = __lsx_vadd_d(_in1, _in14); \ + _out2 = __lsx_vadd_d(_in2, _in13); \ + _out3 = __lsx_vadd_d(_in3, _in12); \ + _out4 = __lsx_vadd_d(_in4, _in11); \ + _out5 = __lsx_vadd_d(_in5, _in10); \ + _out6 = __lsx_vadd_d(_in6, _in9); \ + _out7 = __lsx_vadd_d(_in7, _in8); \ + \ + _out8 = __lsx_vsub_d(_in7, _in8); \ + _out9 = __lsx_vsub_d(_in6, _in9); \ + _out10 = __lsx_vsub_d(_in5, _in10); \ + _out11 = __lsx_vsub_d(_in4, _in11); \ + _out12 = __lsx_vsub_d(_in3, _in12); \ + _out13 = __lsx_vsub_d(_in2, _in13); \ + _out14 = __lsx_vsub_d(_in1, _in14); \ + _out15 = __lsx_vsub_d(_in0, _in15); \ + } + #endif // LSX #ifdef __loongarch_asx @@ -692,7 +867,7 @@ static inline __m128i __lsx_vclip255_w(__m128i _in) { * Details : Unsigned byte elements from in_h are multiplied with * unsigned byte elements from in_l producing a result * twice the size of input i.e. signed halfword. - * Then this multiplied results of adjacent odd-even elements + * Then these multiplied results of adjacent odd-even elements * are added to the out vector * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) * ============================================================================= @@ -714,7 +889,7 @@ static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) { * Details : Signed byte elements from in_h are multiplied with * signed byte elements from in_l producing a result * twice the size of input i.e. signed halfword. - * Then this multiplication results of adjacent odd-even elements + * Then these multiplication results of adjacent odd-even elements * are added to the out vector * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) * ============================================================================= @@ -736,7 +911,7 @@ static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) { * Details : Signed halfword elements from in_h are multiplied with * signed halfword elements from in_l producing a result * twice the size of input i.e. signed word. - * Then this multiplied results of adjacent odd-even elements + * Then these multiplied results of adjacent odd-even elements * are added to the out vector. * Example : out = __lasx_xvdp2_w_h(in_h, in_l) * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 @@ -761,7 +936,7 @@ static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) { * Details : Signed word elements from in_h are multiplied with * signed word elements from in_l producing a result * twice the size of input i.e. signed double-word. - * Then this multiplied results of adjacent odd-even elements + * Then these multiplied results of adjacent odd-even elements * are added to the out vector. 
* Example : See out = __lasx_xvdp2_w_h(in_h, in_l) * ============================================================================= @@ -805,7 +980,7 @@ static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) { * Details : Signed byte elements from in_h are multiplied with * signed byte elements from in_l producing a result * twice the size of input i.e. signed halfword. - * Then this multiplied results of adjacent odd-even elements + * Then these multiplied results of adjacent odd-even elements * are added to the in_c vector. * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) * ============================================================================= @@ -819,6 +994,52 @@ static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h, return out; } +/* + * ============================================================================= + * Description : Dot product & addition of byte vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - halfword + * Details : Unsigned byte elements from in_h are multiplied with + * unsigned byte elements from in_l producing a result + * twice the size of input i.e. signed halfword. + * Then these multiplied results of adjacent odd-even elements + * are added to the in_c vector. + * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l); + out = __lasx_xvmaddwod_h_bu(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product & addition of byte vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - halfword + * Details : Unsigned byte elements from in_h are multiplied with + * signed byte elements from in_l producing a result + * twice the size of input i.e. signed halfword. + * Then these multiplied results of adjacent odd-even elements + * are added to the in_c vector. + * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l); + out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l); + return out; +} + /* * ============================================================================= * Description : Dot product of halfword vector elements @@ -955,7 +1176,7 @@ static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, * Details : Signed halfword elements from in_h are multiplied with * signed halfword elements from in_l producing a result * four times the size of input i.e. signed doubleword. - * Then this multiplication results of four adjacent elements + * Then these multiplication results of four adjacent elements * are added together and stored to the out vector. 
* Example : out = __lasx_xvdp4_d_h(in_h, in_l) * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1 From e87f6d0a2c9fa5dd9267f9e930b5ab5921bdaae6 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 13 Apr 2022 22:14:33 -0700 Subject: [PATCH 248/926] vp8,define_gf_group: remove unused variable this clears a warning under clang-13: vp8/encoder/firstpass.c:1634:10: warning: variable 'mod_err_per_mb_accumulator' set but not used [-Wunused-but-set-variable] Change-Id: I694a99d56724be89090e01c45559237c0fda147a --- vp8/encoder/firstpass.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index 14164ebc51..ed177e3cb6 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -1631,7 +1631,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { double this_frame_mv_in_out = 0.0; double mv_in_out_accumulator = 0.0; double abs_mv_in_out_accumulator = 0.0; - double mod_err_per_mb_accumulator = 0.0; int max_bits = frame_max_bits(cpi); /* Max for a single frame */ @@ -1682,9 +1681,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { gf_group_err += mod_frame_err; - mod_err_per_mb_accumulator += - mod_frame_err / DOUBLE_DIVIDE_CHECK((double)cpi->common.MBs); - if (EOF == input_stats(cpi, &next_frame)) break; /* Test for the case where there is a brief flash but the prediction From a165f4ba64ec8c992ca57a1b4444cd4a19527dde Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 13 Apr 2022 22:16:30 -0700 Subject: [PATCH 249/926] vp9,update_mbgraph_frame_stats: rm unused variables this quiets warnings under clang-13 of the form: ../vp9/encoder/vp9_mbgraph.c:222:42: warning: variable 'gld_y_offset' set but not used [-Wunused-but-set-variable] Change-Id: I32170b90c07058f780b4e8100ee5217232149db8 --- vp9/encoder/vp9_mbgraph.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c index 831c79c175..7c2790cb98 100644 --- a/vp9/encoder/vp9_mbgraph.c +++ b/vp9/encoder/vp9_mbgraph.c @@ -219,7 +219,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, VP9_COMMON *const cm = &cpi->common; int mb_col, mb_row, offset = 0; - int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0; + int mb_y_offset = 0; MV gld_top_mv = { 0, 0 }; MODE_INFO mi_local; MODE_INFO mi_above, mi_left; @@ -243,8 +243,6 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { MV gld_left_mv = gld_top_mv; int mb_y_in_offset = mb_y_offset; - int arf_y_in_offset = arf_y_offset; - int gld_y_in_offset = gld_y_offset; // Set up limit values for motion vectors to prevent them extending outside // the UMV borders. @@ -266,8 +264,6 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, xd->left_mi = &mi_left; mb_y_in_offset += 16; - gld_y_in_offset += 16; - arf_y_in_offset += 16; x->mv_limits.col_min -= 16; x->mv_limits.col_max -= 16; } @@ -276,8 +272,6 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, xd->above_mi = &mi_above; mb_y_offset += buf->y_stride * 16; - gld_y_offset += golden_ref->y_stride * 16; - if (alt_ref) arf_y_offset += alt_ref->y_stride * 16; x->mv_limits.row_min -= 16; x->mv_limits.row_max -= 16; offset += cm->mb_cols; From 474a50c64837d05c68e6aa3d24ae096c53f2757d Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 6 Apr 2022 09:46:57 -0700 Subject: [PATCH 250/926] Fix int overflow in intermediate calculation This is not a complete fix to webm:1751. 
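As a minimal standalone sketch of the issue (illustrative values only; in
the encoder the affected variables live in vp8_pack_tokens() and
vp8_encode_bool()):

    #include <stdint.h>

    void sketch(void) {
      int lowvalue = 0xffffff; /* all 24 payload bits set */
      const int offset = 8;
      /* old code: lowvalue <<= offset;
       * 0xffffff << 8 == 0xffffff00 does not fit in a 32-bit signed int,
       * so the signed shift is undefined behavior. */
      lowvalue = (int)(((uint64_t)lowvalue << offset) & 0xffffff);
      /* lowvalue == 0xffff00: the shift is done in uint64_t, where it is
       * well defined, and the mask keeps only the low 24 payload bits. */
    }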
Bug: webm:1751 Change-Id: Ieed6c823744f5f0625d529db3746cfe4f549c8c0 --- vp8/encoder/bitstream.c | 6 ++---- vp8/encoder/boolhuff.h | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c index 87825fa6fe..0e97af5f2e 100644 --- a/vp8/encoder/bitstream.c +++ b/vp8/encoder/bitstream.c @@ -172,9 +172,8 @@ void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount) { validate_buffer(w->buffer + w->pos, 1, w->buffer_end, w->error); w->buffer[w->pos++] = (lowvalue >> (24 - offset)) & 0xff; - lowvalue <<= offset; shift = count; - lowvalue &= 0xffffff; + lowvalue = (int)(((uint64_t)lowvalue << offset) & 0xffffff); count -= 8; } @@ -223,9 +222,8 @@ void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount) { validate_buffer(w->buffer + w->pos, 1, w->buffer_end, w->error); w->buffer[w->pos++] = (lowvalue >> (24 - offset)) & 0xff; - lowvalue <<= offset; shift = count; - lowvalue &= 0xffffff; + lowvalue = (int)(((uint64_t)lowvalue << offset) & 0xffffff); count -= 8; } diff --git a/vp8/encoder/boolhuff.h b/vp8/encoder/boolhuff.h index 8cc61bdd44..a8c536b99c 100644 --- a/vp8/encoder/boolhuff.h +++ b/vp8/encoder/boolhuff.h @@ -94,9 +94,8 @@ static void vp8_encode_bool(BOOL_CODER *bc, int bit, int probability) { validate_buffer(bc->buffer + bc->pos, 1, bc->buffer_end, bc->error); bc->buffer[bc->pos++] = (lowvalue >> (24 - offset) & 0xff); - lowvalue <<= offset; shift = count; - lowvalue &= 0xffffff; + lowvalue = (int)(((uint64_t)lowvalue << offset) & 0xffffff); count -= 8; } From 73b8aade83cd7d0fffe29254b931a34ad4621510 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 13 Apr 2022 22:24:08 -0700 Subject: [PATCH 251/926] temporal_filter_sse4: remove unused function params this clears warnings under clang-13 of the form: ../vp9/encoder/x86/temporal_filter_sse4.c:275:39: warning: parameter 'u_pre' set but not used [-Wunused-but-set-parameter] Change-Id: I21519b5b0b9c21b04b174327415e0e73b56bdfda --- vp9/encoder/x86/temporal_filter_sse4.c | 179 ++++++++----------------- 1 file changed, 59 insertions(+), 120 deletions(-) diff --git a/vp9/encoder/x86/temporal_filter_sse4.c b/vp9/encoder/x86/temporal_filter_sse4.c index 437f49f5a0..bdbd66051d 100644 --- a/vp9/encoder/x86/temporal_filter_sse4.c +++ b/vp9/encoder/x86/temporal_filter_sse4.c @@ -270,13 +270,11 @@ static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist, // size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL, // else use top_weight for top half, and bottom weight for bottom half. 
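// (The four blk_fw entries presumably map to the 2x2 sub-blocks in raster
// order: top-left, top-right, bottom-left, bottom-right, judging from how
// the weight vectors are assembled from blk_fw[0..3] below.)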
static void vp9_apply_temporal_filter_luma_16( - const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, - int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, - int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, - int uv_pre_stride, unsigned int block_width, unsigned int block_height, - int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum, - uint16_t *y_count, const uint16_t *y_dist, const uint16_t *u_dist, - const uint16_t *v_dist, const int16_t *const *neighbors_first, + const uint8_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist, + const int16_t *const *neighbors_first, const int16_t *const *neighbors_second, int top_weight, int bottom_weight, const int *blk_fw) { const int rounding = (1 << strength) >> 1; @@ -301,7 +299,6 @@ static void vp9_apply_temporal_filter_luma_16( assert(strength <= 6); assert(block_width == 16); - (void)block_width; // Initialize the weights @@ -342,17 +339,12 @@ static void vp9_apply_temporal_filter_luma_16( accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, y_accum); - y_src += y_src_stride; y_pre += y_pre_stride; y_count += y_pre_stride; y_accum += y_pre_stride; y_dist += DIST_STRIDE; - u_src += uv_src_stride; - u_pre += uv_pre_stride; u_dist += DIST_STRIDE; - v_src += uv_src_stride; - v_pre += uv_pre_stride; v_dist += DIST_STRIDE; // Then all the rows except the last one @@ -392,11 +384,7 @@ static void vp9_apply_temporal_filter_luma_16( read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, &v_second); - u_src += uv_src_stride; - u_pre += uv_pre_stride; u_dist += DIST_STRIDE; - v_src += uv_src_stride; - v_pre += uv_pre_stride; v_dist += DIST_STRIDE; } @@ -413,7 +401,6 @@ static void vp9_apply_temporal_filter_luma_16( accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, y_accum); - y_src += y_src_stride; y_pre += y_pre_stride; y_count += y_pre_stride; y_accum += y_pre_stride; @@ -458,13 +445,10 @@ static void vp9_apply_temporal_filter_luma_16( // Perform temporal filter for the luma component. 
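// (The source-plane pointers could be dropped from these helpers because
// the src/pre differences they need appear to be folded into the
// precomputed *_dist tables already; only the prediction plane, the
// accumulators, and the dist tables are read below.)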
static void vp9_apply_temporal_filter_luma( - const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, - int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, - int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, - int uv_pre_stride, unsigned int block_width, unsigned int block_height, - int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, - uint32_t *y_accum, uint16_t *y_count, const uint16_t *y_dist, - const uint16_t *u_dist, const uint16_t *v_dist) { + const uint8_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) { unsigned int blk_col = 0, uv_blk_col = 0; const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x; const unsigned int mid_width = block_width >> 1, @@ -482,21 +466,16 @@ static void vp9_apply_temporal_filter_luma( neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; if (use_whole_blk) { vp9_apply_temporal_filter_luma_16( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, - u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, - block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, - y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, - v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, - bottom_weight, NULL); + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); } else { vp9_apply_temporal_filter_luma_16( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, - u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, - block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, - y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, - v_dist + uv_blk_col, neighbors_first, neighbors_second, 0, 0, blk_fw); + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, 0, 0, blk_fw); } return; @@ -506,9 +485,7 @@ static void vp9_apply_temporal_filter_luma( neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS; vp9_apply_temporal_filter_luma_16( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, - v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength, + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, bottom_weight, NULL); @@ -521,13 +498,10 @@ static void vp9_apply_temporal_filter_luma( for (; blk_col < mid_width; blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { vp9_apply_temporal_filter_luma_16( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, - u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height, - ss_x, ss_y, strength, 
use_whole_blk, y_accum + blk_col, - y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, - v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, - bottom_weight, NULL); + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); } if (!use_whole_blk) { @@ -539,21 +513,16 @@ static void vp9_apply_temporal_filter_luma( for (; blk_col < last_width; blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { vp9_apply_temporal_filter_luma_16( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, - u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height, - ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, - y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, - v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, - bottom_weight, NULL); + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); } // Right neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; vp9_apply_temporal_filter_luma_16( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, - v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength, + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, bottom_weight, NULL); @@ -564,10 +533,7 @@ static void vp9_apply_temporal_filter_luma( // blk_fw as an array of size 4 for the weights for each of the 4 subblocks, // else use top_weight for top half, and bottom weight for bottom half. 
static void vp9_apply_temporal_filter_chroma_8( - const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, - int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, - int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, - int uv_pre_stride, unsigned int uv_block_width, + const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int uv_block_height, int ss_x, int ss_y, int strength, uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist, @@ -587,8 +553,6 @@ static void vp9_apply_temporal_filter_chroma_8( // Loop variable unsigned int h; - (void)uv_block_width; - // Initilize weight if (blk_fw) { weight = _mm_setr_epi16(blk_fw[0], blk_fw[0], blk_fw[0], blk_fw[0], @@ -621,10 +585,8 @@ static void vp9_apply_temporal_filter_chroma_8( accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); - u_src += uv_src_stride; u_pre += uv_pre_stride; u_dist += DIST_STRIDE; - v_src += uv_src_stride; v_pre += uv_pre_stride; v_dist += DIST_STRIDE; u_count += uv_pre_stride; @@ -632,8 +594,6 @@ static void vp9_apply_temporal_filter_chroma_8( v_count += uv_pre_stride; v_accum += uv_pre_stride; - y_src += y_src_stride * (1 + ss_y); - y_pre += y_pre_stride * (1 + ss_y); y_dist += DIST_STRIDE * (1 + ss_y); // Then all the rows except the last one @@ -676,10 +636,8 @@ static void vp9_apply_temporal_filter_chroma_8( accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); - u_src += uv_src_stride; u_pre += uv_pre_stride; u_dist += DIST_STRIDE; - v_src += uv_src_stride; v_pre += uv_pre_stride; v_dist += DIST_STRIDE; u_count += uv_pre_stride; @@ -687,8 +645,6 @@ static void vp9_apply_temporal_filter_chroma_8( v_count += uv_pre_stride; v_accum += uv_pre_stride; - y_src += y_src_stride * (1 + ss_y); - y_pre += y_pre_stride * (1 + ss_y); y_dist += DIST_STRIDE * (1 + ss_y); } @@ -719,12 +675,10 @@ static void vp9_apply_temporal_filter_chroma_8( // Perform temporal filter for the chroma components. 
static void vp9_apply_temporal_filter_chroma( - const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, - int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, - int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, - int uv_pre_stride, unsigned int block_width, unsigned int block_height, - int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, - uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, + unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, + int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) { const unsigned int uv_width = block_width >> ss_x, uv_height = block_height >> ss_y; @@ -751,22 +705,17 @@ static void vp9_apply_temporal_filter_chroma( if (use_whole_blk) { vp9_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, - u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, - uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, - u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, - y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, - top_weight, bottom_weight, NULL); + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, + ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); } else { vp9_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, - u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, - uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, - u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, - y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, - 0, 0, blk_fw); + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, + ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, 0, 0, blk_fw); } return; @@ -782,10 +731,8 @@ static void vp9_apply_temporal_filter_chroma( } vp9_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, - v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, - strength, u_accum + uv_blk_col, u_count + uv_blk_col, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, bottom_weight, NULL); @@ -805,13 +752,11 @@ static void vp9_apply_temporal_filter_chroma( for (; uv_blk_col < uv_mid_width; blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { vp9_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, - u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, - uv_height, ss_x, ss_y, strength, u_accum + 
uv_blk_col, - u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, - y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, - top_weight, bottom_weight, NULL); + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); } if (!use_whole_blk) { @@ -823,13 +768,11 @@ static void vp9_apply_temporal_filter_chroma( for (; uv_blk_col < uv_last_width; blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { vp9_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, - u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, - uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, - u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, - y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, - top_weight, bottom_weight, NULL); + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); } // Right @@ -842,10 +785,8 @@ static void vp9_apply_temporal_filter_chroma( } vp9_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, - v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, - strength, u_accum + uv_blk_col, u_count + uv_blk_col, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, bottom_weight, NULL); @@ -922,14 +863,12 @@ void vp9_apply_temporal_filter_sse4_1( u_dist_ptr = u_dist + 1; v_dist_ptr = v_dist + 1; - vp9_apply_temporal_filter_luma( - y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride, - u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, - strength, blk_fw_ptr, use_whole_blk, y_accum, y_count, y_dist_ptr, - u_dist_ptr, v_dist_ptr); + vp9_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, block_height, + ss_x, ss_y, strength, blk_fw_ptr, + use_whole_blk, y_accum, y_count, y_dist_ptr, + u_dist_ptr, v_dist_ptr); vp9_apply_temporal_filter_chroma( - y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride, u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, strength, blk_fw_ptr, use_whole_blk, u_accum, u_count, v_accum, v_count, y_dist_ptr, u_dist_ptr, v_dist_ptr); From a067d8a5bcfda7ed8b967477c06dab3151be0d24 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Wed, 16 Mar 2022 14:16:33 +0800 Subject: [PATCH 252/926] vp9[loongarch]: Optimize vpx_fdct32x32/32x32_rd 1. vpx_fdct32x32_lsx 2. 
vpx_fdct32x32_rd_lsx Bug: webm:1755 Change-Id: I83bce11c0d905cf137545a46cd756aef9cedce47 --- test/dct32x32_test.cc | 9 + vpx_dsp/loongarch/fwd_dct32x32_lsx.c | 1176 ++++++++++++++++++++++++++ vpx_dsp/loongarch/fwd_txfm_lsx.h | 94 ++ vpx_dsp/loongarch/txfm_macros_lsx.h | 47 + vpx_dsp/vpx_dsp.mk | 3 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 6 files changed, 1331 insertions(+), 2 deletions(-) create mode 100644 vpx_dsp/loongarch/fwd_dct32x32_lsx.c create mode 100644 vpx_dsp/loongarch/fwd_txfm_lsx.h create mode 100644 vpx_dsp/loongarch/txfm_macros_lsx.h diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc index 8398e17e81..a764d187a3 100644 --- a/test/dct32x32_test.cc +++ b/test/dct32x32_test.cc @@ -396,4 +396,13 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&vpx_fdct32x32_rd_vsx, &vpx_idct32x32_1024_add_vsx, 1, VPX_BITS_8))); #endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P( + LSX, Trans32x32Test, + ::testing::Values(make_tuple(&vpx_fdct32x32_lsx, &vpx_idct32x32_1024_add_c, + 0, VPX_BITS_8), + make_tuple(&vpx_fdct32x32_rd_lsx, + &vpx_idct32x32_1024_add_c, 1, VPX_BITS_8))); +#endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/vpx_dsp/loongarch/fwd_dct32x32_lsx.c b/vpx_dsp/loongarch/fwd_dct32x32_lsx.c new file mode 100644 index 0000000000..e5c301b2c1 --- /dev/null +++ b/vpx_dsp/loongarch/fwd_dct32x32_lsx.c @@ -0,0 +1,1176 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/fwd_txfm_lsx.h" +#include "vpx_dsp/fwd_txfm.h" + +#define UNPCK_SH_SW(in, out0, out1) \ + { \ + out0 = __lsx_vsllwil_w_h(in, 0); \ + out1 = __lsx_vexth_w_h(in); \ + } + +static void fdct8x32_1d_column_load_butterfly(const int16_t *input, + int32_t src_stride, + int16_t *temp_buff) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i step0, step1, step2, step3; + __m128i in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; + __m128i step0_1, step1_1, step2_1, step3_1; + + int32_t stride = src_stride << 1; + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + const int16_t *input_tmp = (int16_t *)input; + + in0 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1, in2); + in3 = __lsx_vldx(input_tmp, stride3); + + input_tmp += stride2; + in0_1 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1_1, in2_1); + in3_1 = __lsx_vldx(input_tmp, stride3); + + input_tmp = input + (src_stride * 24); + in4_1 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5_1, in6_1); + in7_1 = __lsx_vldx(input_tmp, stride3); + + input_tmp += stride2; + in4 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5, in6); + in7 = __lsx_vldx(input_tmp, stride3); + + DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vslli_h, in0_1, 2, in1_1, 2, in2_1, 2, in3_1, 2, in0_1, in1_1, + in2_1, in3_1); + DUP4_ARG2(__lsx_vslli_h, in4_1, 2, in5_1, 2, in6_1, 2, in7_1, 2, in4_1, in5_1, + in6_1, in7_1); + LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2, + step3, in4, in5, in6, in7); + LSX_BUTTERFLY_8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, + in7_1); + + __lsx_vst(step0, temp_buff, 0); + __lsx_vst(step1, temp_buff, 16); + __lsx_vst(step2, temp_buff, 32); + __lsx_vst(step3, temp_buff, 48); + + __lsx_vst(in4, temp_buff, 448); + __lsx_vst(in5, temp_buff, 464); + __lsx_vst(in6, temp_buff, 480); + __lsx_vst(in7, temp_buff, 496); + + __lsx_vst(step0_1, temp_buff, 64); + __lsx_vst(step1_1, temp_buff, 80); + __lsx_vst(step2_1, temp_buff, 96); + __lsx_vst(step3_1, temp_buff, 112); + + __lsx_vst(in4_1, temp_buff, 384); + __lsx_vst(in5_1, temp_buff, 400); + __lsx_vst(in6_1, temp_buff, 416); + __lsx_vst(in7_1, temp_buff, 432); + + /* 3rd and 4th set */ + input_tmp = input + (src_stride * 8); + in0 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1, in2); + in3 = __lsx_vldx(input_tmp, stride3); + + input_tmp += stride2; + in0_1 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1_1, in2_1); + in3_1 = __lsx_vldx(input_tmp, stride3); + + input_tmp += stride2; + in4_1 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5_1, in6_1); + in7_1 = __lsx_vldx(input_tmp, stride3); + + input_tmp += stride2; + in4 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5, in6); + in7 = __lsx_vldx(input_tmp, stride3); + DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vslli_h, in0_1, 2, in1_1, 2, 
in2_1, 2, in3_1, 2, in0_1, in1_1, + in2_1, in3_1); + DUP4_ARG2(__lsx_vslli_h, in4_1, 2, in5_1, 2, in6_1, 2, in7_1, 2, in4_1, in5_1, + in6_1, in7_1); + + LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2, + step3, in4, in5, in6, in7); + LSX_BUTTERFLY_8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, + in7_1); + + __lsx_vst(step0, temp_buff, 128); + __lsx_vst(step1, temp_buff, 144); + __lsx_vst(step2, temp_buff, 160); + __lsx_vst(step3, temp_buff, 176); + + __lsx_vst(in4, temp_buff, 320); + __lsx_vst(in5, temp_buff, 336); + __lsx_vst(in6, temp_buff, 352); + __lsx_vst(in7, temp_buff, 368); + + __lsx_vst(step0_1, temp_buff, 192); + __lsx_vst(step1_1, temp_buff, 208); + __lsx_vst(step2_1, temp_buff, 224); + __lsx_vst(step3_1, temp_buff, 240); + + __lsx_vst(in4_1, temp_buff, 256); + __lsx_vst(in5_1, temp_buff, 272); + __lsx_vst(in6_1, temp_buff, 288); + __lsx_vst(in7_1, temp_buff, 304); +} + +static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i temp0, temp1; + + /* fdct even */ + DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, in0, in1, in2, + in3); + DUP4_ARG2(__lsx_vld, input, 192, input, 208, input, 224, input, 240, in12, + in13, in14, in15); + LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1, + vec2, vec3, in12, in13, in14, in15); + DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112, in4, in5, + in6, in7); + DUP4_ARG2(__lsx_vld, input, 128, input, 144, input, 160, input, 176, in8, in9, + in10, in11); + LSX_BUTTERFLY_8_H(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6, + vec7, in8, in9, in10, in11); + + /* Stage 3 */ + DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, + in1, in2, in3); + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, in4, in1, in0); + DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 0); + __lsx_vst(temp1, temp, 1024); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 512); + __lsx_vst(temp1, temp, 1536); + + DUP4_ARG2(__lsx_vsub_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, + vec6, vec5, vec4); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 256); + __lsx_vst(temp1, temp, 1792); + + DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 1280); + __lsx_vst(temp1, temp, 768); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, + vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, 
temp, 128); + __lsx_vst(temp1, temp, 1920); + + DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 1152); + __lsx_vst(temp1, temp, 896); + + DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5); + temp0 = __lsx_vneg_h(vec2); + DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1); + DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, + vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 640); + __lsx_vst(temp1, temp, 1408); + + DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 384); + __lsx_vst(temp1, temp, 1664); +} + +static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) { + __m128i in16, in17, in18, in19, in20, in21, in22, in23; + __m128i in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; + __m128i tmp0, tmp1; + + DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 160, input, 176, in20, in21, + in26, in27); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + DUP4_ARG2(__lsx_vld, input, 32, input, 48, input, 192, input, 208, in18, in19, + in28, in29); + + vec4 = __lsx_vsub_h(in19, in20); + __lsx_vst(vec4, input, 64); + vec4 = __lsx_vsub_h(in18, in21); + __lsx_vst(vec4, input, 80); + vec4 = __lsx_vsub_h(in29, in26); + __lsx_vst(vec4, input, 160); + vec4 = __lsx_vsub_h(in28, in27); + __lsx_vst(vec4, input, 176); + + in21 = __lsx_vadd_h(in18, in21); + in20 = __lsx_vadd_h(in19, in20); + in27 = __lsx_vadd_h(in28, in27); + in26 = __lsx_vadd_h(in29, in26); + + DUP4_ARG2(__lsx_vld, input, 96, input, 112, input, 128, input, 144, in22, + in23, in24, in25); + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + + DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 224, input, 240, in16, in17, + in30, in31); + + vec4 = __lsx_vsub_h(in17, in22); + __lsx_vst(vec4, input, 32); + vec4 = __lsx_vsub_h(in16, in23); + __lsx_vst(vec4, input, 48); + vec4 = __lsx_vsub_h(in31, in24); + __lsx_vst(vec4, input, 192); + vec4 = __lsx_vsub_h(in30, in25); + __lsx_vst(vec4, input, 208); + + DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16, + in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27, + in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, temp_ptr, 0); + __lsx_vst(vec4, temp_ptr, 1920); + + DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, temp_ptr, 896); + __lsx_vst(vec4, temp_ptr, 1024); + + DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23, + in26, in24, in20); + tmp0 = __lsx_vneg_h(in23); + 
DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25); + DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec4, temp_ptr, 1408); + __lsx_vst(vec5, temp_ptr, 512); + + DUP2_ARG2(__lsx_vadd_h, in26, in27, in24, in25, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec4, temp_ptr, 384); + __lsx_vst(vec5, temp_ptr, 1536); + + DUP4_ARG2(__lsx_vld, input, 32, input, 48, input, 64, input, 80, in22, in23, + in20, in21); + DUP4_ARG2(__lsx_vld, input, 160, input, 176, input, 192, input, 208, in26, + in27, in24, in25); + in16 = in20; + in17 = in21; + DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1); + DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26); + DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28, + in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + DUP2_ARG2(__lsx_vadd_h, in28, in29, in31, in30, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, temp_ptr, 1664); + __lsx_vst(vec4, temp_ptr, 256); + + DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, temp_ptr, 640); + __lsx_vst(vec4, temp_ptr, 1280); + + DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16, + in29, in30, in19); + tmp0 = __lsx_vneg_h(in16); + DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31); + DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, temp_ptr, 1152); + __lsx_vst(vec4, temp_ptr, 768); + + DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, temp_ptr, 128); + __lsx_vst(vec4, temp_ptr, 1792); +} + +static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride, + int16_t *tmp_buf, int16_t *tmp_buf_big) { + fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf); + fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big); + fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32)); +} + +static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff, + int16_t *output) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i step0, step1, step2, step3, step4, step5, step6, step7; + + DUP4_ARG2(__lsx_vld, temp_buff, 0, temp_buff, 64, temp_buff, 128, temp_buff, + 192, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vld, temp_buff, 256, temp_buff, 320, temp_buff, 384, + temp_buff, 448, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vld, temp_buff, 48, temp_buff, 112, temp_buff, 176, temp_buff, + 240, in8, in9, in10, in11); + DUP4_ARG2(__lsx_vld, temp_buff, 304, temp_buff, 368, temp_buff, 432, + temp_buff, 496, in12, in13, in14, in15); + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + 
LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, step0, step1, step2, step3, + step4, step5, step6, step7, in8, in9, in10, in11, in12, + in13, in14, in15); + + __lsx_vst(step0, output, 0); + __lsx_vst(step1, output, 16); + __lsx_vst(step2, output, 32); + __lsx_vst(step3, output, 48); + __lsx_vst(step4, output, 64); + __lsx_vst(step5, output, 80); + __lsx_vst(step6, output, 96); + __lsx_vst(step7, output, 112); + + __lsx_vst(in8, output, 384); + __lsx_vst(in9, output, 400); + __lsx_vst(in10, output, 416); + __lsx_vst(in11, output, 432); + __lsx_vst(in12, output, 448); + __lsx_vst(in13, output, 464); + __lsx_vst(in14, output, 480); + __lsx_vst(in15, output, 496); + + /* 2nd set */ + DUP4_ARG2(__lsx_vld, temp_buff, 16, temp_buff, 80, temp_buff, 144, temp_buff, + 208, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vld, temp_buff, 272, temp_buff, 336, temp_buff, 400, + temp_buff, 464, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vld, temp_buff, 32, temp_buff, 96, temp_buff, 160, temp_buff, + 224, in8, in9, in10, in11); + DUP4_ARG2(__lsx_vld, temp_buff, 288, temp_buff, 352, temp_buff, 416, + temp_buff, 480, in12, in13, in14, in15); + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, step0, step1, step2, step3, + step4, step5, step6, step7, in8, in9, in10, in11, in12, + in13, in14, in15); + + __lsx_vst(step0, output, 128); + __lsx_vst(step1, output, 144); + __lsx_vst(step2, output, 160); + __lsx_vst(step3, output, 176); + __lsx_vst(step4, output, 192); + __lsx_vst(step5, output, 208); + __lsx_vst(step6, output, 224); + __lsx_vst(step7, output, 240); + + __lsx_vst(in8, output, 256); + __lsx_vst(in9, output, 272); + __lsx_vst(in10, output, 288); + __lsx_vst(in11, output, 304); + __lsx_vst(in12, output, 320); + __lsx_vst(in13, output, 336); + __lsx_vst(in14, output, 352); + __lsx_vst(in15, output, 368); +} + +static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr, + int16_t *out) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l; + __m128i vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r; + __m128i tmp0_w, tmp1_w, tmp2_w, tmp3_w; + + /* fdct32 even */ + /* stage 2 */ + DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, in0, in1, in2, + in3); + DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112, in4, in5, + in6, in7); + DUP4_ARG2(__lsx_vld, input, 128, input, 144, input, 160, input, 176, in8, in9, + in10, in11); + DUP4_ARG2(__lsx_vld, input, 192, input, 208, input, 224, input, 240, in12, + in13, in14, in15); + + LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, + vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14, + in15); + + __lsx_vst(vec0, interm_ptr, 0); + __lsx_vst(vec1, interm_ptr, 16); + __lsx_vst(vec2, interm_ptr, 32); + __lsx_vst(vec3, interm_ptr, 48); + __lsx_vst(vec4, interm_ptr, 64); + __lsx_vst(vec5, interm_ptr, 80); + __lsx_vst(vec6, interm_ptr, 96); + __lsx_vst(vec7, interm_ptr, 112); + + __lsx_vst(in8, interm_ptr, 128); + __lsx_vst(in9, 
interm_ptr, 144); + __lsx_vst(in10, interm_ptr, 160); + __lsx_vst(in11, interm_ptr, 176); + __lsx_vst(in12, interm_ptr, 192); + __lsx_vst(in13, interm_ptr, 208); + __lsx_vst(in14, interm_ptr, 224); + __lsx_vst(in15, interm_ptr, 240); + + /* Stage 3 */ + UNPCK_SH_SW(vec0, vec0_l, vec0_r); + UNPCK_SH_SW(vec1, vec1_l, vec1_r); + UNPCK_SH_SW(vec2, vec2_l, vec2_r); + UNPCK_SH_SW(vec3, vec3_l, vec3_r); + UNPCK_SH_SW(vec4, vec4_l, vec4_r); + UNPCK_SH_SW(vec5, vec5_l, vec5_r); + UNPCK_SH_SW(vec6, vec6_l, vec6_r); + UNPCK_SH_SW(vec7, vec7_l, vec7_r); + DUP4_ARG2(__lsx_vadd_w, vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, + vec3_r, vec4_r, tmp0_w, tmp1_w, tmp2_w, tmp3_w); + LSX_BUTTERFLY_4_W(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, + vec5_r); + DUP4_ARG2(__lsx_vadd_w, vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, + vec3_l, vec4_l, vec0_r, vec1_r, vec2_r, vec3_r); + + tmp3_w = __lsx_vadd_w(vec0_r, vec3_r); + vec0_r = __lsx_vsub_w(vec0_r, vec3_r); + vec3_r = __lsx_vadd_w(vec1_r, vec2_r); + vec1_r = __lsx_vsub_w(vec1_r, vec2_r); + + DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64, + vec4_r, tmp3_w, vec6_r, vec3_r); + FDCT32_POSTPROC_NEG_W(vec4_r); + FDCT32_POSTPROC_NEG_W(tmp3_w); + FDCT32_POSTPROC_NEG_W(vec6_r); + FDCT32_POSTPROC_NEG_W(vec3_r); + DUP2_ARG2(__lsx_vpickev_h, vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); + __lsx_vst(vec5, out, 0); + __lsx_vst(vec4, out, 16); + + DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64, + vec4_r, tmp3_w, vec6_r, vec3_r); + FDCT32_POSTPROC_NEG_W(vec4_r); + FDCT32_POSTPROC_NEG_W(tmp3_w); + FDCT32_POSTPROC_NEG_W(vec6_r); + FDCT32_POSTPROC_NEG_W(vec3_r); + DUP2_ARG2(__lsx_vpickev_h, vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); + __lsx_vst(vec5, out, 32); + __lsx_vst(vec4, out, 48); + + DUP4_ARG2(__lsx_vld, interm_ptr, 0, interm_ptr, 16, interm_ptr, 32, + interm_ptr, 48, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, interm_ptr, 64, interm_ptr, 80, interm_ptr, 96, + interm_ptr, 112, vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, + vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + __lsx_vst(in4, out, 64); + __lsx_vst(in5, out, 112); + + DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + __lsx_vst(in4, out, 80); + __lsx_vst(in5, out, 96); + + DUP4_ARG2(__lsx_vld, interm_ptr, 128, interm_ptr, 144, interm_ptr, 160, + interm_ptr, 176, in8, in9, in10, in11); + DUP4_ARG2(__lsx_vld, interm_ptr, 192, interm_ptr, 208, interm_ptr, 224, + interm_ptr, 240, in12, in13, in14, in15); + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, + vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + __lsx_vst(in4, out, 128); + __lsx_vst(in5, out, 240); + + DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, 
in5); + __lsx_vst(in4, out, 144); + __lsx_vst(in5, out, 224); + + DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5); + tmp0_w = __lsx_vneg_h(vec2); + DOTP_CONST_PAIR(tmp0_w, vec5, cospi_24_64, cospi_8_64, in2, in1); + DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, + vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + __lsx_vst(in4, out, 160); + __lsx_vst(in5, out, 208); + + DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + __lsx_vst(in4, out, 192); + __lsx_vst(in5, out, 176); +} + +static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; + + /* fdct32 even */ + /* stage 2 */ + DUP4_ARG2(__lsx_vld, temp, 0, temp, 16, temp, 32, temp, 48, in0, in1, in2, + in3); + DUP4_ARG2(__lsx_vld, temp, 64, temp, 80, temp, 96, temp, 112, in4, in5, in6, + in7); + DUP4_ARG2(__lsx_vld, temp, 128, temp, 144, temp, 160, temp, 176, in8, in9, + in10, in11); + DUP4_ARG2(__lsx_vld, temp, 192, temp, 208, temp, 224, temp, 240, in12, in13, + in14, in15); + + LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, + vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14, + in15); + /* Stage 3 */ + DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, + in1, in2, in3); + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, in4, in1, in0); + DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 0); + __lsx_vst(temp1, out, 16); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 32); + __lsx_vst(temp1, out, 48); + + DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, + vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 64); + __lsx_vst(temp1, out, 112); + + DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 80); + __lsx_vst(temp1, out, 96); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, + vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 128); + __lsx_vst(temp1, out, 240); + + DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 144); + __lsx_vst(temp1, out, 224); + + DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5); + temp0 = __lsx_vneg_h(vec2); 
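+  /* feeding the negated vec2 into DOTP_CONST_PAIR realizes the rotation
+     term that needs -vec2, so the shared helper can be reused unchanged */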
+ DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1); + DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, + vec2, vec5) + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 160); + __lsx_vst(temp1, out, 208); + + DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 192); + __lsx_vst(temp1, out, 176); +} + +static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr, + int16_t *out) { + __m128i in16, in17, in18, in19, in20, in21, in22, in23; + __m128i in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; + __m128i tmp0, tmp1; + + in20 = __lsx_vld(temp, 64); + in21 = __lsx_vld(temp, 80); + in26 = __lsx_vld(temp, 160); + in27 = __lsx_vld(temp, 176); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + in18 = __lsx_vld(temp, 32); + in19 = __lsx_vld(temp, 48); + in28 = __lsx_vld(temp, 192); + in29 = __lsx_vld(temp, 208); + + vec4 = __lsx_vsub_h(in19, in20); + __lsx_vst(vec4, interm_ptr, 64); + vec4 = __lsx_vsub_h(in18, in21); + __lsx_vst(vec4, interm_ptr, 176); + vec4 = __lsx_vsub_h(in28, in27); + __lsx_vst(vec4, interm_ptr, 112); + vec4 = __lsx_vsub_h(in29, in26); + __lsx_vst(vec4, interm_ptr, 128); + + DUP4_ARG2(__lsx_vadd_h, in18, in21, in19, in20, in28, in27, in29, in26, in21, + in20, in27, in26); + + in22 = __lsx_vld(temp, 96); + in23 = __lsx_vld(temp, 112); + in24 = __lsx_vld(temp, 128); + in25 = __lsx_vld(temp, 144); + + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + + in16 = __lsx_vld(temp, 0); + in17 = __lsx_vld(temp, 16); + in30 = __lsx_vld(temp, 224); + in31 = __lsx_vld(temp, 240); + + vec4 = __lsx_vsub_h(in17, in22); + __lsx_vst(vec4, interm_ptr, 80); + vec4 = __lsx_vsub_h(in30, in25); + __lsx_vst(vec4, interm_ptr, 96); + vec4 = __lsx_vsub_h(in31, in24); + __lsx_vst(vec4, interm_ptr, 144); + vec4 = __lsx_vsub_h(in16, in23); + __lsx_vst(vec4, interm_ptr, 160); + + DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16, + in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + + DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27, + in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20); + + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec5, out, 0); + __lsx_vst(vec4, out, 240); + + DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21); + + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec5, out, 224); + __lsx_vst(vec4, out, 16); + + DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23, + in26, in24, in20); + tmp0 = __lsx_vneg_h(in23); + DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25); + DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20); + + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec4, out, 32); + __lsx_vst(vec5, out, 208); + + 
DUP2_ARG2(__lsx_vadd_h, in26, in27, in24, in25, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec4, out, 48); + __lsx_vst(vec5, out, 192); + + in20 = __lsx_vld(interm_ptr, 64); + in21 = __lsx_vld(interm_ptr, 176); + in27 = __lsx_vld(interm_ptr, 112); + in26 = __lsx_vld(interm_ptr, 128); + + in16 = in20; + in17 = in21; + DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1); + DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26); + + in22 = __lsx_vld(interm_ptr, 80); + in25 = __lsx_vld(interm_ptr, 96); + in24 = __lsx_vld(interm_ptr, 144); + in23 = __lsx_vld(interm_ptr, 160); + + DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28, + in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + DUP2_ARG2(__lsx_vadd_h, in28, in29, in31, in30, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec5, out, 64); + __lsx_vst(vec4, out, 176); + + DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec5, out, 80); + __lsx_vst(vec4, out, 160); + + DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16, + in29, in30, in19); + tmp0 = __lsx_vneg_h(in16); + DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31); + DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19); + + DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec5, out, 144); + __lsx_vst(vec4, out, 96); + + DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18); + + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec4, out, 112); + __lsx_vst(vec5, out, 128); +} + +static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; + + /* 1st set */ + in0 = __lsx_vld(temp, 0); + in4 = __lsx_vld(temp, 64); + in2 = __lsx_vld(temp, 128); + in6 = __lsx_vld(temp, 192); + in1 = __lsx_vld(temp, 256); + in7 = __lsx_vld(temp, 304); + in3 = __lsx_vld(temp, 384); + in5 = __lsx_vld(temp, 432); + + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + + /* 2nd set */ + in0_1 = __lsx_vld(temp, 32); + in1_1 = __lsx_vld(temp, 464); + in2_1 = __lsx_vld(temp, 160); + in3_1 = __lsx_vld(temp, 336); + in4_1 = __lsx_vld(temp, 96); + in5_1 = __lsx_vld(temp, 352); + in6_1 = __lsx_vld(temp, 224); + in7_1 = __lsx_vld(temp, 480); + + __lsx_vst(in0, output, 0); + __lsx_vst(in1, output, 64); + __lsx_vst(in2, output, 128); + __lsx_vst(in3, output, 192); + __lsx_vst(in4, output, 256); + __lsx_vst(in5, output, 320); + __lsx_vst(in6, output, 384); + __lsx_vst(in7, output, 448); + + LSX_TRANSPOSE8x8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); + + /* 3rd set */ + in0 = __lsx_vld(temp, 16); + in1 = __lsx_vld(temp, 272); + in2 = __lsx_vld(temp, 144); + in3 = __lsx_vld(temp, 400); + in4 = __lsx_vld(temp, 80); + in5 = __lsx_vld(temp, 416); + in6 = __lsx_vld(temp, 208); + in7 = __lsx_vld(temp, 288); + + __lsx_vst(in0_1, output, 16); + __lsx_vst(in1_1, 
output, 80); + __lsx_vst(in2_1, output, 144); + __lsx_vst(in3_1, output, 208); + __lsx_vst(in4_1, output, 272); + __lsx_vst(in5_1, output, 336); + __lsx_vst(in6_1, output, 400); + __lsx_vst(in7_1, output, 464); + + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + + __lsx_vst(in0, output, 32); + __lsx_vst(in1, output, 96); + __lsx_vst(in2, output, 160); + __lsx_vst(in3, output, 224); + __lsx_vst(in4, output, 288); + __lsx_vst(in5, output, 352); + __lsx_vst(in6, output, 416); + __lsx_vst(in7, output, 480); + + /* 4th set */ + in0_1 = __lsx_vld(temp, 48); + in1_1 = __lsx_vld(temp, 448); + in2_1 = __lsx_vld(temp, 176); + in3_1 = __lsx_vld(temp, 320); + in4_1 = __lsx_vld(temp, 112); + in5_1 = __lsx_vld(temp, 368); + in6_1 = __lsx_vld(temp, 240); + in7_1 = __lsx_vld(temp, 496); + + LSX_TRANSPOSE8x8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); + + __lsx_vst(in0_1, output, 48); + __lsx_vst(in1_1, output, 112); + __lsx_vst(in2_1, output, 176); + __lsx_vst(in3_1, output, 240); + __lsx_vst(in4_1, output, 304); + __lsx_vst(in5_1, output, 368); + __lsx_vst(in6_1, output, 432); + __lsx_vst(in7_1, output, 496); +} + +static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) { + fdct8x32_1d_row_load_butterfly(temp, temp_buf); + fdct8x32_1d_row_even(temp_buf, temp_buf); + fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128); + fdct8x32_1d_row_transpose_store(temp_buf, output); +} + +static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf, + int16_t *output) { + fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); + fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf); + fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128); + fdct8x32_1d_row_transpose_store(tmp_buf, output); +} + +void vpx_fdct32x32_lsx(const int16_t *input, int16_t *output, + int32_t src_stride) { + int i; + DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]); + DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); + + /* column transform */ + for (i = 0; i < 4; ++i) { + fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf, + tmp_buf_big + (8 * i)); + } + + /* row transform */ + fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output); + + /* row transform */ + for (i = 1; i < 4; ++i) { + fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256)); + } +} + +static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; + + /* fdct32 even */ + /* stage 2 */ + DUP4_ARG2(__lsx_vld, temp, 0, temp, 16, temp, 32, temp, 48, in0, in1, in2, + in3); + DUP4_ARG2(__lsx_vld, temp, 64, temp, 80, temp, 96, temp, 112, in4, in5, in6, + in7); + DUP4_ARG2(__lsx_vld, temp, 128, temp, 144, temp, 160, temp, 176, in8, in9, + in10, in11); + DUP4_ARG2(__lsx_vld, temp, 192, temp, 208, temp, 224, temp, 240, in12, in13, + in14, in15); + LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, + vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14, + in15); + + FDCT_POSTPROC_2V_NEG_H(vec0, vec1); + FDCT_POSTPROC_2V_NEG_H(vec2, vec3); + FDCT_POSTPROC_2V_NEG_H(vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec6, vec7); + FDCT_POSTPROC_2V_NEG_H(in8, in9); + FDCT_POSTPROC_2V_NEG_H(in10, in11); + FDCT_POSTPROC_2V_NEG_H(in12, in13); + 
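/* Clarifying comment, not part of the upstream patch: per its definition in vpx_dsp/loongarch/fwd_txfm_lsx.h (where __lsx_vldi(0x401) yields a vector of halfword 1s), FDCT_POSTPROC_2V_NEG_H computes (x + 1 + (x < 0)) >> 2 per int16 lane, i.e. a divide-by-4 with an extra +1 applied to negative lanes before the arithmetic shift. */ +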
FDCT_POSTPROC_2V_NEG_H(in14, in15); + + /* Stage 3 */ + DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, + in1, in2, in3); + + temp0 = __lsx_vadd_h(in0, in3); + in0 = __lsx_vsub_h(in0, in3); + in3 = __lsx_vadd_h(in1, in2); + in1 = __lsx_vsub_h(in1, in2); + + DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0); + __lsx_vst(temp0, out, 0); + __lsx_vst(temp1, out, 16); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + __lsx_vst(temp0, out, 32); + __lsx_vst(temp1, out, 48); + + DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, + vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + __lsx_vst(temp0, out, 64); + __lsx_vst(temp1, out, 112); + + DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + __lsx_vst(temp0, out, 80); + __lsx_vst(temp1, out, 96); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, + vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + __lsx_vst(temp0, out, 128); + __lsx_vst(temp1, out, 240); + + DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + __lsx_vst(temp0, out, 144); + __lsx_vst(temp1, out, 224); + + DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5); + temp0 = __lsx_vneg_h(vec2); + DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1); + DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, + vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + __lsx_vst(temp0, out, 160); + __lsx_vst(temp1, out, 208); + + DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + __lsx_vst(temp0, out, 192); + __lsx_vst(temp1, out, 176); +} + +static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr, + int16_t *out) { + __m128i in16, in17, in18, in19, in20, in21, in22, in23; + __m128i in24, in25, in26, in27, in28, in29, in30, in31; + __m128i vec4, vec5, tmp0, tmp1; + + in20 = __lsx_vld(temp, 64); + in21 = __lsx_vld(temp, 80); + in26 = __lsx_vld(temp, 160); + in27 = __lsx_vld(temp, 176); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + FDCT_POSTPROC_2V_NEG_H(in20, in21); + FDCT_POSTPROC_2V_NEG_H(in26, in27); + + in18 = __lsx_vld(temp, 32); + in19 = __lsx_vld(temp, 48); + in28 = __lsx_vld(temp, 192); + in29 = __lsx_vld(temp, 208); + + FDCT_POSTPROC_2V_NEG_H(in18, in19); + FDCT_POSTPROC_2V_NEG_H(in28, in29); + + vec4 = __lsx_vsub_h(in19, in20); + __lsx_vst(vec4, interm_ptr, 64); + vec4 = __lsx_vsub_h(in18, in21); + __lsx_vst(vec4, interm_ptr, 176); + vec4 = __lsx_vsub_h(in29, in26); + __lsx_vst(vec4, interm_ptr, 128); + vec4 = __lsx_vsub_h(in28, in27); + __lsx_vst(vec4, interm_ptr, 112); + + DUP4_ARG2(__lsx_vadd_h, in18, in21, in19, in20, in28, in27, in29, in26, in21, + in20, in27, in26); + + in22 = 
__lsx_vld(temp, 96); + in23 = __lsx_vld(temp, 112); + in24 = __lsx_vld(temp, 128); + in25 = __lsx_vld(temp, 144); + + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + FDCT_POSTPROC_2V_NEG_H(in22, in23); + FDCT_POSTPROC_2V_NEG_H(in24, in25); + + in16 = __lsx_vld(temp, 0); + in17 = __lsx_vld(temp, 16); + in30 = __lsx_vld(temp, 224); + in31 = __lsx_vld(temp, 240); + + FDCT_POSTPROC_2V_NEG_H(in16, in17); + FDCT_POSTPROC_2V_NEG_H(in30, in31); + + vec4 = __lsx_vsub_h(in17, in22); + __lsx_vst(vec4, interm_ptr, 80); + vec4 = __lsx_vsub_h(in30, in25); + __lsx_vst(vec4, interm_ptr, 96); + vec4 = __lsx_vsub_h(in31, in24); + __lsx_vst(vec4, interm_ptr, 144); + vec4 = __lsx_vsub_h(in16, in23); + __lsx_vst(vec4, interm_ptr, 160); + + DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16, + in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27, + in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + __lsx_vst(vec5, out, 0); + __lsx_vst(vec4, out, 240); + + DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + __lsx_vst(vec5, out, 224); + __lsx_vst(vec4, out, 16); + + DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23, + in26, in24, in20); + tmp0 = __lsx_vneg_h(in23); + DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25); + DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + __lsx_vst(vec4, out, 32); + __lsx_vst(vec5, out, 208); + + DUP2_ARG2(__lsx_vadd_h, in26, in27, in24, in25, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + __lsx_vst(vec4, out, 48); + __lsx_vst(vec5, out, 192); + + in20 = __lsx_vld(interm_ptr, 64); + in21 = __lsx_vld(interm_ptr, 176); + in27 = __lsx_vld(interm_ptr, 112); + in26 = __lsx_vld(interm_ptr, 128); + + in16 = in20; + in17 = in21; + DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1); + DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26); + + in22 = __lsx_vld(interm_ptr, 80); + in25 = __lsx_vld(interm_ptr, 96); + in24 = __lsx_vld(interm_ptr, 144); + in23 = __lsx_vld(interm_ptr, 160); + + DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28, + in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + in16 = __lsx_vadd_h(in28, in29); + in19 = __lsx_vadd_h(in31, in30); + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + __lsx_vst(vec5, out, 64); + __lsx_vst(vec4, out, 176); + + DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + __lsx_vst(vec5, out, 80); + __lsx_vst(vec4, out, 160); + + DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16, + in29, in30, in19); + tmp0 = __lsx_vneg_h(in16); + DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31); + DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19); + DOTP_CONST_PAIR(in19, in16, 
cospi_19_64, cospi_13_64, vec5, vec4); + __lsx_vst(vec5, out, 144); + __lsx_vst(vec4, out, 96); + + DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + __lsx_vst(vec4, out, 112); + __lsx_vst(vec5, out, 128); +} + +static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf, + int16_t *output) { + fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); + fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf); + fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128)); + fdct8x32_1d_row_transpose_store(tmp_buf, output); +} + +void vpx_fdct32x32_rd_lsx(const int16_t *input, int16_t *out, + int32_t src_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]); + DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); + + /* column transform */ + for (i = 0; i < 4; ++i) { + fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0], + &tmp_buf_big[0] + (8 * i)); + } + /* row transform */ + for (i = 0; i < 4; ++i) { + fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0], + out + (8 * i * 32)); + } +} diff --git a/vpx_dsp/loongarch/fwd_txfm_lsx.h b/vpx_dsp/loongarch/fwd_txfm_lsx.h new file mode 100644 index 0000000000..0e59852c42 --- /dev/null +++ b/vpx_dsp/loongarch/fwd_txfm_lsx.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_ +#define VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_ + +#include "vpx_dsp/loongarch/txfm_macros_lsx.h" +#include "vpx_dsp/txfm_common.h" + +#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \ + { \ + __m128i tp0_m, tp1_m; \ + __m128i one = __lsx_vreplgr2vr_h(1); \ + \ + tp0_m = __lsx_vslei_h(vec0, 0); \ + tp1_m = __lsx_vslei_h(vec1, 0); \ + tp0_m = __lsx_vxori_b(tp0_m, 255); \ + tp1_m = __lsx_vxori_b(tp1_m, 255); \ + vec0 = __lsx_vadd_h(vec0, one); \ + vec1 = __lsx_vadd_h(vec1, one); \ + tp0_m = __lsx_vand_v(one, tp0_m); \ + tp1_m = __lsx_vand_v(one, tp1_m); \ + vec0 = __lsx_vadd_h(vec0, tp0_m); \ + vec1 = __lsx_vadd_h(vec1, tp1_m); \ + vec0 = __lsx_vsrai_h(vec0, 2); \ + vec1 = __lsx_vsrai_h(vec1, 2); \ + } + +#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \ + { \ + __m128i tp0_m, tp1_m; \ + __m128i one_m = __lsx_vldi(0x401); \ + \ + tp0_m = __lsx_vslti_h(vec0, 0); \ + tp1_m = __lsx_vslti_h(vec1, 0); \ + vec0 = __lsx_vadd_h(vec0, one_m); \ + vec1 = __lsx_vadd_h(vec1, one_m); \ + tp0_m = __lsx_vand_v(one_m, tp0_m); \ + tp1_m = __lsx_vand_v(one_m, tp1_m); \ + vec0 = __lsx_vadd_h(vec0, tp0_m); \ + vec1 = __lsx_vadd_h(vec1, tp1_m); \ + vec0 = __lsx_vsrai_h(vec0, 2); \ + vec1 = __lsx_vsrai_h(vec1, 2); \ + } + +#define FDCT32_POSTPROC_NEG_W(vec) \ + { \ + __m128i temp_m; \ + __m128i one_m = __lsx_vreplgr2vr_w(1); \ + \ + temp_m = __lsx_vslti_w(vec, 0); \ + vec = __lsx_vadd_w(vec, one_m); \ + temp_m = __lsx_vand_v(one_m, temp_m); \ + vec = __lsx_vadd_w(vec, temp_m); \ + vec = __lsx_vsrai_w(vec, 2); \ + } + +#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right, \ + const0, const1, out0, out1, out2, out3) \ + { \ + __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ + __m128i tp0_m, tp1_m, tp2_m, tp3_m, _tmp0, _tmp1; \ + __m128i k0_m = 
__lsx_vreplgr2vr_w((int32_t)const0); \ + \ + s0_m = __lsx_vreplgr2vr_w((int32_t)const1); \ + k0_m = __lsx_vpackev_w(s0_m, k0_m); \ + \ + DUP2_ARG1(__lsx_vneg_w, reg1_left, reg1_right, _tmp0, _tmp1); \ + s1_m = __lsx_vilvl_w(_tmp0, reg0_left); \ + s0_m = __lsx_vilvh_w(_tmp0, reg0_left); \ + s3_m = __lsx_vilvl_w(reg0_left, reg1_left); \ + s2_m = __lsx_vilvh_w(reg0_left, reg1_left); \ + s5_m = __lsx_vilvl_w(_tmp1, reg0_right); \ + s4_m = __lsx_vilvh_w(_tmp1, reg0_right); \ + s7_m = __lsx_vilvl_w(reg0_right, reg1_right); \ + s6_m = __lsx_vilvh_w(reg0_right, reg1_right); \ + DUP2_ARG2(__lsx_vdp2_d_w, s0_m, k0_m, s1_m, k0_m, tp0_m, tp1_m); \ + DUP2_ARG2(__lsx_vdp2_d_w, s4_m, k0_m, s5_m, k0_m, tp2_m, tp3_m); \ + DUP2_ARG3(__lsx_vssrarni_w_d, tp0_m, tp1_m, DCT_CONST_BITS, tp2_m, tp3_m, \ + DCT_CONST_BITS, out0, out1); \ + DUP2_ARG2(__lsx_vdp2_d_w, s2_m, k0_m, s3_m, k0_m, tp0_m, tp1_m); \ + DUP2_ARG2(__lsx_vdp2_d_w, s6_m, k0_m, s7_m, k0_m, tp2_m, tp3_m); \ + DUP2_ARG3(__lsx_vssrarni_w_d, tp0_m, tp1_m, DCT_CONST_BITS, tp2_m, tp3_m, \ + DCT_CONST_BITS, out2, out3); \ + } + +#endif // VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_ diff --git a/vpx_dsp/loongarch/txfm_macros_lsx.h b/vpx_dsp/loongarch/txfm_macros_lsx.h new file mode 100644 index 0000000000..bc6f7dacc9 --- /dev/null +++ b/vpx_dsp/loongarch/txfm_macros_lsx.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_ +#define VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_ + +#include "vpx_util/loongson_intrinsics.h" + +#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \ + { \ + __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m; \ + __m128i k0_m, k1_m, k2_m, k3_m; \ + __m128i zero = __lsx_vldi(0); \ + \ + k0_m = __lsx_vreplgr2vr_h(cnst0); \ + k1_m = __lsx_vreplgr2vr_h(cnst1); \ + k2_m = __lsx_vpackev_h(k1_m, k0_m); \ + k0_m = __lsx_vpackev_h(zero, k0_m); \ + k1_m = __lsx_vpackev_h(k1_m, zero); \ + \ + s5_m = __lsx_vilvl_h(reg1, reg0); \ + s4_m = __lsx_vilvh_h(reg1, reg0); \ + s3_m = __lsx_vilvl_h(reg0, reg1); \ + s2_m = __lsx_vilvh_h(reg0, reg1); \ + \ + s1_m = __lsx_vdp2_w_h(s5_m, k0_m); \ + s0_m = __lsx_vdp2_w_h(s4_m, k0_m); \ + k3_m = __lsx_vdp2_w_h(s5_m, k1_m); \ + s1_m = __lsx_vsub_w(s1_m, k3_m); \ + k3_m = __lsx_vdp2_w_h(s4_m, k1_m); \ + s0_m = __lsx_vsub_w(s0_m, k3_m); \ + \ + out0 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \ + \ + s1_m = __lsx_vdp2_w_h(s3_m, k2_m); \ + s0_m = __lsx_vdp2_w_h(s2_m, k2_m); \ + out1 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \ + } + +#endif // VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_ diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 198e0060f7..6da133e383 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -211,6 +211,7 @@ endif # CONFIG_VP9 DSP_SRCS-yes += txfm_common.h DSP_SRCS-$(HAVE_SSE2) += x86/txfm_common_sse2.h DSP_SRCS-$(HAVE_MSA) += mips/txfm_macros_msa.h +DSP_SRCS-$(HAVE_LSX) += loongarch/txfm_macros_lsx.h # forward transform ifeq ($(CONFIG_VP9_ENCODER),yes) DSP_SRCS-yes += fwd_txfm.c @@ -231,9 +232,11 @@ DSP_SRCS-$(HAVE_NEON) += arm/fdct_partial_neon.c DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h 
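# Clarifying note, not part of the upstream patch: HAVE_LSX is resolved to yes or no at configure time, so each DSP_SRCS-$(HAVE_LSX) line added below collapses to DSP_SRCS-yes (compiled in) or DSP_SRCS-no (dropped), the same mechanism the MSA and VSX lines here rely on.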
DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c +DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_txfm_lsx.h ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_MSA) += mips/fwd_dct32x32_msa.c +DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_dct32x32_lsx.c endif # !CONFIG_VP9_HIGHBITDEPTH DSP_SRCS-$(HAVE_VSX) += ppc/fdct32x32_vsx.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index bcc1b916cc..436bee94dd 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -591,10 +591,10 @@ () specialize qw/vpx_fdct16x16_1 sse2 neon msa/; add_proto qw/void vpx_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct32x32 neon sse2 avx2 msa/; + specialize qw/vpx_fdct32x32 neon sse2 avx2 msa lsx/; add_proto qw/void vpx_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct32x32_rd sse2 avx2 neon msa vsx/; + specialize qw/vpx_fdct32x32_rd sse2 avx2 neon msa vsx lsx/; add_proto qw/void vpx_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct32x32_1 sse2 neon msa/; From 81e5841a167e351613ce8ffc068d7ebe83e666a1 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Thu, 14 Apr 2022 11:30:55 +0800 Subject: [PATCH 253/926] vp9[loongarch]: Optimize idct32x32_1024/1/34_add 1. vpx_idct32x32_1024_add_lsx 2. vpx_idct32x32_34_add_lsx 3. vpx_idct32x32_1_add_lsx Bug: webm:1755 Change-Id: I9c24f75e0d93613754d8e30da7e007b8d1374e60 --- test/dct32x32_test.cc | 6 +- test/partial_idct_test.cc | 14 + vpx_dsp/loongarch/fwd_txfm_lsx.h | 22 + vpx_dsp/loongarch/idct32x32_lsx.c | 834 ++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp.mk | 2 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +- 6 files changed, 878 insertions(+), 6 deletions(-) create mode 100644 vpx_dsp/loongarch/idct32x32_lsx.c diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc index a764d187a3..91bb8e01ea 100644 --- a/test/dct32x32_test.cc +++ b/test/dct32x32_test.cc @@ -400,9 +400,9 @@ INSTANTIATE_TEST_SUITE_P( #if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_SUITE_P( LSX, Trans32x32Test, ::testing::Values(make_tuple(&vpx_fdct32x32_lsx, &vpx_idct32x32_1024_add_c, 0, VPX_BITS_8), + ::testing::Values(make_tuple(&vpx_fdct32x32_lsx, + &vpx_idct32x32_1024_add_lsx, 0, VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_lsx, - &vpx_idct32x32_1024_add_c, 1, VPX_BITS_8))); + &vpx_idct32x32_1024_add_lsx, 1, VPX_BITS_8))); #endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc index a160120de1..7eb888a586 100644 --- a/test/partial_idct_test.cc +++ b/test/partial_idct_test.cc @@ -954,6 +954,20 @@ INSTANTIATE_TEST_SUITE_P(MSA, PartialIDctTest, ::testing::ValuesIn(msa_partial_idct_tests)); #endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH +const PartialInvTxfmParam lsx_partial_idct_tests[] = { + make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>, + &wrapper<vpx_idct32x32_1024_add_lsx>, TX_32X32, 1024, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>, + &wrapper<vpx_idct32x32_34_add_lsx>, TX_32X32, 34, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>, + &wrapper<vpx_idct32x32_1_add_lsx>, TX_32X32, 1, 8, 1), +}; + +INSTANTIATE_TEST_SUITE_P(LSX, PartialIDctTest, + ::testing::ValuesIn(lsx_partial_idct_tests)); +#endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH + #endif // !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/vpx_dsp/loongarch/fwd_txfm_lsx.h b/vpx_dsp/loongarch/fwd_txfm_lsx.h index 0e59852c42..a6f62dbc81 --- a/vpx_dsp/loongarch/fwd_txfm_lsx.h +++ 
b/vpx_dsp/loongarch/fwd_txfm_lsx.h @@ -91,4 +91,26 @@ DCT_CONST_BITS, out2, out3); \ } +#define VP9_ADDBLK_ST8x4_UB(dst, _stride, _stride2, _stride3, in0, in1, in2, \ + in3) \ + { \ + __m128i dst0_m, dst1_m, dst2_m, dst3_m; \ + __m128i tmp0_m, tmp1_m; \ + __m128i res0_m, res1_m, res2_m, res3_m; \ + \ + dst0_m = __lsx_vld(dst, 0); \ + DUP2_ARG2(__lsx_vldx, dst, _stride, dst, _stride2, dst1_m, dst2_m); \ + dst3_m = __lsx_vldx(dst, _stride3); \ + DUP4_ARG2(__lsx_vsllwil_hu_bu, dst0_m, 0, dst1_m, 0, dst2_m, 0, dst3_m, 0, \ + res0_m, res1_m, res2_m, res3_m); \ + DUP4_ARG2(__lsx_vadd_h, res0_m, in0, res1_m, in1, res2_m, in2, res3_m, \ + in3, res0_m, res1_m, res2_m, res3_m); \ + DUP2_ARG3(__lsx_vssrarni_bu_h, res1_m, res0_m, 0, res3_m, res2_m, 0, \ + tmp0_m, tmp1_m); \ + __lsx_vstelm_d(tmp0_m, dst, 0, 0); \ + __lsx_vstelm_d(tmp0_m, dst + _stride, 0, 1); \ + __lsx_vstelm_d(tmp1_m, dst + _stride2, 0, 0); \ + __lsx_vstelm_d(tmp1_m, dst + _stride3, 0, 1); \ + } + #endif // VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_ diff --git a/vpx_dsp/loongarch/idct32x32_lsx.c b/vpx_dsp/loongarch/idct32x32_lsx.c new file mode 100644 index 0000000000..d6890c28e1 --- /dev/null +++ b/vpx_dsp/loongarch/idct32x32_lsx.c @@ -0,0 +1,834 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/fwd_txfm_lsx.h" + +#define UNPCK_UB_SH(_in, _out0, _out1) \ + { \ + _out0 = __lsx_vsllwil_hu_bu(_in, 0); \ + _out1 = __lsx_vexth_hu_bu(_in); \ + } + +static void idct32x8_row_transpose_store(const int16_t *input, + int16_t *tmp_buf) { + __m128i m0, m1, m2, m3, m4, m5, m6, m7; + __m128i n0, n1, n2, n3, n4, n5, n6, n7; + + /* 1st & 2nd 8x8 */ + DUP4_ARG2(__lsx_vld, input, 0, input, 64, input, 128, input, 192, m0, n0, m1, + n1); + DUP4_ARG2(__lsx_vld, input, 256, input, 320, input, 384, input, 448, m2, n2, + m3, n3); + DUP4_ARG2(__lsx_vld, input, 16, input, 80, input, 144, input, 208, m4, n4, m5, + n5); + DUP4_ARG2(__lsx_vld, input, 272, input, 336, input, 400, input, 464, m6, n6, + m7, n7); + + LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + + __lsx_vst(m0, tmp_buf, 0); + __lsx_vst(n0, tmp_buf, 16); + __lsx_vst(m1, tmp_buf, 32); + __lsx_vst(n1, tmp_buf, 48); + __lsx_vst(m2, tmp_buf, 64); + __lsx_vst(n2, tmp_buf, 80); + __lsx_vst(m3, tmp_buf, 96); + __lsx_vst(n3, tmp_buf, 112); + __lsx_vst(m4, tmp_buf, 128); + __lsx_vst(n4, tmp_buf, 144); + __lsx_vst(m5, tmp_buf, 160); + __lsx_vst(n5, tmp_buf, 176); + __lsx_vst(m6, tmp_buf, 192); + __lsx_vst(n6, tmp_buf, 208); + __lsx_vst(m7, tmp_buf, 224); + __lsx_vst(n7, tmp_buf, 240); + + /* 3rd & 4th 8x8 */ + DUP4_ARG2(__lsx_vld, input, 32, input, 96, input, 160, input, 224, m0, n0, m1, + n1); + DUP4_ARG2(__lsx_vld, input, 288, input, 352, input, 416, input, 480, m2, n2, + m3, n3); + DUP4_ARG2(__lsx_vld, input, 48, input, 112, input, 176, input, 240, m4, n4, + m5, n5); + DUP4_ARG2(__lsx_vld, input, 304, input, 368, input, 432, input, 496, m6, n6, + m7, n7); + + LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + LSX_TRANSPOSE8x8_H(m4, 
n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + + __lsx_vst(m0, tmp_buf, 256); + __lsx_vst(n0, tmp_buf, 272); + __lsx_vst(m1, tmp_buf, 288); + __lsx_vst(n1, tmp_buf, 304); + __lsx_vst(m2, tmp_buf, 320); + __lsx_vst(n2, tmp_buf, 336); + __lsx_vst(m3, tmp_buf, 352); + __lsx_vst(n3, tmp_buf, 368); + __lsx_vst(m4, tmp_buf, 384); + __lsx_vst(n4, tmp_buf, 400); + __lsx_vst(m5, tmp_buf, 416); + __lsx_vst(n5, tmp_buf, 432); + __lsx_vst(m6, tmp_buf, 448); + __lsx_vst(n6, tmp_buf, 464); + __lsx_vst(m7, tmp_buf, 480); + __lsx_vst(n7, tmp_buf, 496); +} + +static void idct32x8_row_even_process_store(int16_t *tmp_buf, + int16_t *tmp_eve_buf) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; + __m128i tmp0; + + /* Even stage 1 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 64, tmp_buf, 128, tmp_buf, 192, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 256, tmp_buf, 320, tmp_buf, 384, tmp_buf, 448, + reg4, reg5, reg6, reg7); + + DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); + DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); + LSX_BUTTERFLY_4_H(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); + DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + + loc1 = vec3; + loc0 = vec1; + + DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); + DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); + LSX_BUTTERFLY_4_H(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); + LSX_BUTTERFLY_4_H(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); + LSX_BUTTERFLY_4_H(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); + + /* Even stage 2 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 32, tmp_buf, 96, tmp_buf, 160, tmp_buf, 224, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 288, tmp_buf, 352, tmp_buf, 416, tmp_buf, 480, + reg4, reg5, reg6, reg7); + DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); + DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); + + vec0 = __lsx_vadd_h(reg0, reg4); + reg0 = __lsx_vsub_h(reg0, reg4); + reg4 = __lsx_vadd_h(reg6, reg2); + reg6 = __lsx_vsub_h(reg6, reg2); + reg2 = __lsx_vadd_h(reg1, reg5); + reg1 = __lsx_vsub_h(reg1, reg5); + reg5 = __lsx_vadd_h(reg7, reg3); + reg7 = __lsx_vsub_h(reg7, reg3); + reg3 = vec0; + + vec1 = reg2; + reg2 = __lsx_vadd_h(reg3, reg4); + reg3 = __lsx_vsub_h(reg3, reg4); + reg4 = __lsx_vsub_h(reg5, vec1); + reg5 = __lsx_vadd_h(reg5, vec1); + + tmp0 = __lsx_vneg_h(reg6); + DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); + DOTP_CONST_PAIR(tmp0, reg1, cospi_24_64, cospi_8_64, reg6, reg1); + + vec0 = __lsx_vsub_h(reg0, reg6); + reg0 = __lsx_vadd_h(reg0, reg6); + vec1 = __lsx_vsub_h(reg7, reg1); + reg7 = __lsx_vadd_h(reg7, reg1); + + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); + DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); + + /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ + LSX_BUTTERFLY_4_H(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); + __lsx_vst(loc0, tmp_eve_buf, 240); + __lsx_vst(loc1, tmp_eve_buf, 0); + __lsx_vst(loc2, tmp_eve_buf, 224); + __lsx_vst(loc3, tmp_eve_buf, 16); + + LSX_BUTTERFLY_4_H(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0); + __lsx_vst(loc0, tmp_eve_buf, 208); + __lsx_vst(loc1, 
tmp_eve_buf, 32); + __lsx_vst(loc2, tmp_eve_buf, 192); + __lsx_vst(loc3, tmp_eve_buf, 48); + + /* Store 8 */ + LSX_BUTTERFLY_4_H(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); + __lsx_vst(loc0, tmp_eve_buf, 176); + __lsx_vst(loc1, tmp_eve_buf, 64); + __lsx_vst(loc2, tmp_eve_buf, 160); + __lsx_vst(loc3, tmp_eve_buf, 80); + + LSX_BUTTERFLY_4_H(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); + __lsx_vst(loc0, tmp_eve_buf, 144); + __lsx_vst(loc1, tmp_eve_buf, 96); + __lsx_vst(loc2, tmp_eve_buf, 128); + __lsx_vst(loc3, tmp_eve_buf, 112); +} + +static void idct32x8_row_odd_process_store(int16_t *tmp_buf, + int16_t *tmp_odd_buf) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + + /* Odd stage 1 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 16, tmp_buf, 112, tmp_buf, 144, tmp_buf, 240, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 272, tmp_buf, 368, tmp_buf, 400, tmp_buf, 496, + reg4, reg5, reg6, reg7); + + DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); + DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); + + vec0 = __lsx_vadd_h(reg0, reg3); + reg0 = __lsx_vsub_h(reg0, reg3); + reg3 = __lsx_vadd_h(reg7, reg4); + reg7 = __lsx_vsub_h(reg7, reg4); + reg4 = __lsx_vadd_h(reg1, reg2); + reg1 = __lsx_vsub_h(reg1, reg2); + reg2 = __lsx_vadd_h(reg6, reg5); + reg6 = __lsx_vsub_h(reg6, reg5); + reg5 = vec0; + + /* 4 Stores */ + DUP2_ARG2(__lsx_vadd_h, reg5, reg4, reg3, reg2, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 64); + __lsx_vst(vec1, tmp_odd_buf, 80); + + DUP2_ARG2(__lsx_vsub_h, reg5, reg4, reg3, reg2, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 0); + __lsx_vst(vec1, tmp_odd_buf, 16); + + /* 4 Stores */ + DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); + DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); + LSX_BUTTERFLY_4_H(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); + __lsx_vst(vec0, tmp_odd_buf, 96); + __lsx_vst(vec1, tmp_odd_buf, 112); + + DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); + __lsx_vst(vec2, tmp_odd_buf, 32); + __lsx_vst(vec3, tmp_odd_buf, 48); + + /* Odd stage 2 */ + /* 8 loads */ + DUP4_ARG2(__lsx_vld, tmp_buf, 48, tmp_buf, 80, tmp_buf, 176, tmp_buf, 208, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 304, tmp_buf, 336, tmp_buf, 432, tmp_buf, 464, + reg4, reg5, reg6, reg7); + + DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); + DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); + DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); + DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); + + /* 4 Stores */ + DUP4_ARG2(__lsx_vsub_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, + vec1, vec2, vec3); + DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); + DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); + + LSX_BUTTERFLY_4_H(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3); + __lsx_vst(vec0, tmp_odd_buf, 192); + __lsx_vst(vec1, tmp_odd_buf, 240); + + DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 160); + __lsx_vst(vec1, tmp_odd_buf, 176); + + /* 4 Stores */ + DUP4_ARG2(__lsx_vadd_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1, + vec2, vec0, vec3); + 
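/* Clarifying comment, not part of the upstream patch: LSX_BUTTERFLY_4_H, from vpx_util/loongson_intrinsics.h, is the sum/difference butterfly used throughout these transforms: out0 = in0 + in3, out1 = in1 + in2, out2 = in1 - in2, out3 = in0 - in3. */ +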
LSX_BUTTERFLY_4_H(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); + __lsx_vst(reg0, tmp_odd_buf, 208); + __lsx_vst(reg1, tmp_odd_buf, 224); + + DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); + __lsx_vst(reg0, tmp_odd_buf, 128); + __lsx_vst(reg1, tmp_odd_buf, 144); + + /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 16, tmp_odd_buf, 32, + tmp_odd_buf, 48, reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 128, tmp_odd_buf, 144, tmp_odd_buf, 160, + tmp_odd_buf, 176, reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, + loc1, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 0); + __lsx_vst(loc1, tmp_odd_buf, 16); + __lsx_vst(loc2, tmp_odd_buf, 32); + __lsx_vst(loc3, tmp_odd_buf, 48); + + DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg1, reg5, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + + DUP2_ARG2(__lsx_vsub_h, reg2, reg6, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 128); + __lsx_vst(loc1, tmp_odd_buf, 144); + __lsx_vst(loc2, tmp_odd_buf, 160); + __lsx_vst(loc3, tmp_odd_buf, 176); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 80, tmp_odd_buf, 96, + tmp_odd_buf, 112, reg1, reg2, reg0, reg3); + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 192, tmp_odd_buf, 208, tmp_odd_buf, 224, + tmp_odd_buf, 240, reg4, reg5, reg6, reg7); + + DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, + loc1, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 64); + __lsx_vst(loc1, tmp_odd_buf, 80); + __lsx_vst(loc2, tmp_odd_buf, 96); + __lsx_vst(loc3, tmp_odd_buf, 112); + + DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + DUP2_ARG2(__lsx_vsub_h, reg1, reg5, reg2, reg6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 192); + __lsx_vst(loc1, tmp_odd_buf, 208); + __lsx_vst(loc2, tmp_odd_buf, 224); + __lsx_vst(loc3, tmp_odd_buf, 240); +} + +static void idct_butterfly_transpose_store(int16_t *tmp_buf, + int16_t *tmp_eve_buf, + int16_t *tmp_odd_buf, int16_t *dst) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i m0, m1, m2, m3, m4, m5, m6, m7; + __m128i n0, n1, n2, n3, n4, n5, n6, n7; + __m128i reg0, reg1, reg2, reg3; + + /* FINAL BUTTERFLY : Dependency on Even & Odd */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 144, tmp_odd_buf, 224, + tmp_odd_buf, 96, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 0, tmp_eve_buf, 128, tmp_eve_buf, 64, + tmp_eve_buf, 192, loc0, loc1, loc2, loc3); + + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, + m4, m2, m6); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0, + reg1, reg2, reg3); + __lsx_vst(reg0, tmp_buf, 496); + __lsx_vst(reg1, tmp_buf, 368); + __lsx_vst(reg2, tmp_buf, 432); + __lsx_vst(reg3, tmp_buf, 304); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 208, tmp_odd_buf, 160, + tmp_odd_buf, 48, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 32, tmp_eve_buf, 160, tmp_eve_buf, 96, + tmp_eve_buf, 224, loc0, loc1, loc2, loc3); + + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, + m5, m3, m7); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, 
vec1, loc3, vec0, reg0, + reg1, reg2, reg3); + __lsx_vst(reg0, tmp_buf, 464); + __lsx_vst(reg1, tmp_buf, 336); + __lsx_vst(reg2, tmp_buf, 400); + __lsx_vst(reg3, tmp_buf, 272); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 32, tmp_odd_buf, 176, tmp_odd_buf, 192, + tmp_odd_buf, 112, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 16, tmp_eve_buf, 144, tmp_eve_buf, 80, + tmp_eve_buf, 208, loc0, loc1, loc2, loc3); + + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, + n4, n2, n6); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0, + reg1, reg2, reg3); + __lsx_vst(reg0, tmp_buf, 480); + __lsx_vst(reg1, tmp_buf, 352); + __lsx_vst(reg2, tmp_buf, 416); + __lsx_vst(reg3, tmp_buf, 288); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 80, tmp_odd_buf, 240, tmp_odd_buf, 128, + tmp_odd_buf, 16, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 48, tmp_eve_buf, 176, tmp_eve_buf, 112, + tmp_eve_buf, 240, loc0, loc1, loc2, loc3); + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, + n5, n3, n7); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0, + reg1, reg2, reg3); + __lsx_vst(reg0, tmp_buf, 448); + __lsx_vst(reg1, tmp_buf, 320); + __lsx_vst(reg2, tmp_buf, 384); + __lsx_vst(reg3, tmp_buf, 256); + + /* Transpose : 16 vectors */ + /* 1st & 2nd 8x8 */ + LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + __lsx_vst(m0, dst, 0); + __lsx_vst(n0, dst, 64); + __lsx_vst(m1, dst, 128); + __lsx_vst(n1, dst, 192); + __lsx_vst(m2, dst, 256); + __lsx_vst(n2, dst, 320); + __lsx_vst(m3, dst, 384); + __lsx_vst(n3, dst, 448); + + LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + __lsx_vst(m4, dst, 16); + __lsx_vst(n4, dst, 80); + __lsx_vst(m5, dst, 144); + __lsx_vst(n5, dst, 208); + __lsx_vst(m6, dst, 272); + __lsx_vst(n6, dst, 336); + __lsx_vst(m7, dst, 400); + __lsx_vst(n7, dst, 464); + + /* 3rd & 4th 8x8 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 256, tmp_buf, 272, tmp_buf, 288, tmp_buf, 304, + m0, n0, m1, n1); + DUP4_ARG2(__lsx_vld, tmp_buf, 320, tmp_buf, 336, tmp_buf, 352, tmp_buf, 368, + m2, n2, m3, n3); + DUP4_ARG2(__lsx_vld, tmp_buf, 384, tmp_buf, 400, tmp_buf, 416, tmp_buf, 432, + m4, n4, m5, n5); + DUP4_ARG2(__lsx_vld, tmp_buf, 448, tmp_buf, 464, tmp_buf, 480, tmp_buf, 496, + m6, n6, m7, n7); + LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + __lsx_vst(m0, dst, 32); + __lsx_vst(n0, dst, 96); + __lsx_vst(m1, dst, 160); + __lsx_vst(n1, dst, 224); + __lsx_vst(m2, dst, 288); + __lsx_vst(n2, dst, 352); + __lsx_vst(m3, dst, 416); + __lsx_vst(n3, dst, 480); + __lsx_vst(m4, dst, 48); + __lsx_vst(n4, dst, 112); + __lsx_vst(m5, dst, 176); + __lsx_vst(n5, dst, 240); + __lsx_vst(m6, dst, 304); + __lsx_vst(n6, dst, 368); + __lsx_vst(m7, dst, 432); + __lsx_vst(n7, dst, 496); +} + +static void idct32x8_1d_rows_lsx(const int16_t *input, int16_t *output) { + DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]); + DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); + DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); + + idct32x8_row_transpose_store(input, &tmp_buf[0]); + idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]); + idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]); + idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0], + 
output); +} + +static void idct8x32_column_even_process_store(int16_t *tmp_buf, + int16_t *tmp_eve_buf) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; + __m128i tmp0; + + /* Even stage 1 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 256, tmp_buf, 512, tmp_buf, 768, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 1024, tmp_buf, 1280, tmp_buf, 1536, tmp_buf, + 1792, reg4, reg5, reg6, reg7); + tmp_buf += 64; + + DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); + DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); + LSX_BUTTERFLY_4_H(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); + DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + + loc1 = vec3; + loc0 = vec1; + + DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); + DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); + LSX_BUTTERFLY_4_H(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); + LSX_BUTTERFLY_4_H(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); + LSX_BUTTERFLY_4_H(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); + + /* Even stage 2 */ + /* Load 8 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 256, tmp_buf, 512, tmp_buf, 768, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 1024, tmp_buf, 1280, tmp_buf, 1536, tmp_buf, + 1792, reg4, reg5, reg6, reg7); + DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); + DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); + + vec0 = __lsx_vadd_h(reg0, reg4); + reg0 = __lsx_vsub_h(reg0, reg4); + reg4 = __lsx_vadd_h(reg6, reg2); + reg6 = __lsx_vsub_h(reg6, reg2); + reg2 = __lsx_vadd_h(reg1, reg5); + reg1 = __lsx_vsub_h(reg1, reg5); + reg5 = __lsx_vadd_h(reg7, reg3); + reg7 = __lsx_vsub_h(reg7, reg3); + reg3 = vec0; + + vec1 = reg2; + reg2 = __lsx_vadd_h(reg3, reg4); + reg3 = __lsx_vsub_h(reg3, reg4); + reg4 = __lsx_vsub_h(reg5, vec1); + reg5 = __lsx_vadd_h(reg5, vec1); + + tmp0 = __lsx_vneg_h(reg6); + DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); + DOTP_CONST_PAIR(tmp0, reg1, cospi_24_64, cospi_8_64, reg6, reg1); + + vec0 = __lsx_vsub_h(reg0, reg6); + reg0 = __lsx_vadd_h(reg0, reg6); + vec1 = __lsx_vsub_h(reg7, reg1); + reg7 = __lsx_vadd_h(reg7, reg1); + + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); + DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); + + /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ + /* Store 8 */ + LSX_BUTTERFLY_4_H(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); + __lsx_vst(loc1, tmp_eve_buf, 0); + __lsx_vst(loc3, tmp_eve_buf, 16); + __lsx_vst(loc2, tmp_eve_buf, 224); + __lsx_vst(loc0, tmp_eve_buf, 240); + + LSX_BUTTERFLY_4_H(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0); + __lsx_vst(loc1, tmp_eve_buf, 32); + __lsx_vst(loc3, tmp_eve_buf, 48); + __lsx_vst(loc2, tmp_eve_buf, 192); + __lsx_vst(loc0, tmp_eve_buf, 208); + + /* Store 8 */ + LSX_BUTTERFLY_4_H(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); + __lsx_vst(loc1, tmp_eve_buf, 64); + __lsx_vst(loc3, tmp_eve_buf, 80); + __lsx_vst(loc2, tmp_eve_buf, 160); + __lsx_vst(loc0, tmp_eve_buf, 176); + + LSX_BUTTERFLY_4_H(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); + __lsx_vst(loc1, tmp_eve_buf, 96); + __lsx_vst(loc3, tmp_eve_buf, 112); + __lsx_vst(loc2, 
tmp_eve_buf, 128); + __lsx_vst(loc0, tmp_eve_buf, 144); +} + +static void idct8x32_column_odd_process_store(int16_t *tmp_buf, + int16_t *tmp_odd_buf) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + + /* Odd stage 1 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 64, tmp_buf, 448, tmp_buf, 576, tmp_buf, 960, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 1088, tmp_buf, 1472, tmp_buf, 1600, tmp_buf, + 1984, reg4, reg5, reg6, reg7); + + DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); + DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); + + vec0 = __lsx_vadd_h(reg0, reg3); + reg0 = __lsx_vsub_h(reg0, reg3); + reg3 = __lsx_vadd_h(reg7, reg4); + reg7 = __lsx_vsub_h(reg7, reg4); + reg4 = __lsx_vadd_h(reg1, reg2); + reg1 = __lsx_vsub_h(reg1, reg2); + reg2 = __lsx_vadd_h(reg6, reg5); + reg6 = __lsx_vsub_h(reg6, reg5); + reg5 = vec0; + + /* 4 Stores */ + DUP2_ARG2(__lsx_vadd_h, reg5, reg4, reg3, reg2, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 64); + __lsx_vst(vec1, tmp_odd_buf, 80); + DUP2_ARG2(__lsx_vsub_h, reg5, reg4, reg3, reg2, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 0); + __lsx_vst(vec1, tmp_odd_buf, 16); + + /* 4 Stores */ + DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); + DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); + LSX_BUTTERFLY_4_H(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); + DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); + __lsx_vst(vec0, tmp_odd_buf, 96); + __lsx_vst(vec1, tmp_odd_buf, 112); + __lsx_vst(vec2, tmp_odd_buf, 32); + __lsx_vst(vec3, tmp_odd_buf, 48); + + /* Odd stage 2 */ + /* 8 loads */ + DUP4_ARG2(__lsx_vld, tmp_buf, 192, tmp_buf, 320, tmp_buf, 704, tmp_buf, 832, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 1216, tmp_buf, 1344, tmp_buf, 1728, tmp_buf, + 1856, reg4, reg5, reg6, reg7); + DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); + DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); + DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); + DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); + + /* 4 Stores */ + DUP4_ARG2(__lsx_vsub_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, + vec1, vec2, vec3); + DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); + DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); + LSX_BUTTERFLY_4_H(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2); + __lsx_vst(vec0, tmp_odd_buf, 192); + __lsx_vst(vec1, tmp_odd_buf, 240); + DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 160); + __lsx_vst(vec1, tmp_odd_buf, 176); + + /* 4 Stores */ + DUP4_ARG2(__lsx_vadd_h, reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0, + vec1, vec2, vec3); + LSX_BUTTERFLY_4_H(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); + __lsx_vst(reg0, tmp_odd_buf, 208); + __lsx_vst(reg1, tmp_odd_buf, 224); + DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); + __lsx_vst(reg0, tmp_odd_buf, 128); + __lsx_vst(reg1, tmp_odd_buf, 144); + + /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 16, tmp_odd_buf, 32, + tmp_odd_buf, 48, reg0, reg1, reg2, 
reg3); + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 128, tmp_odd_buf, 144, tmp_odd_buf, 160, + tmp_odd_buf, 176, reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, + loc1, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 0); + __lsx_vst(loc1, tmp_odd_buf, 16); + __lsx_vst(loc2, tmp_odd_buf, 32); + __lsx_vst(loc3, tmp_odd_buf, 48); + + DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg1, reg5, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + DUP2_ARG2(__lsx_vsub_h, reg2, reg6, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 128); + __lsx_vst(loc1, tmp_odd_buf, 144); + __lsx_vst(loc2, tmp_odd_buf, 160); + __lsx_vst(loc3, tmp_odd_buf, 176); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 80, tmp_odd_buf, 96, + tmp_odd_buf, 112, reg1, reg2, reg0, reg3); + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 192, tmp_odd_buf, 208, tmp_odd_buf, 224, + tmp_odd_buf, 240, reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, + loc1, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 64); + __lsx_vst(loc1, tmp_odd_buf, 80); + __lsx_vst(loc2, tmp_odd_buf, 96); + __lsx_vst(loc3, tmp_odd_buf, 112); + + DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + DUP2_ARG2(__lsx_vsub_h, reg1, reg5, reg2, reg6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 192); + __lsx_vst(loc1, tmp_odd_buf, 208); + __lsx_vst(loc2, tmp_odd_buf, 224); + __lsx_vst(loc3, tmp_odd_buf, 240); +} + +static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, + int16_t *tmp_odd_buf, uint8_t *dst, + int32_t dst_stride) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i m0, m1, m2, m3, m4, m5, m6, m7; + __m128i n0, n1, n2, n3, n4, n5, n6, n7; + int32_t stride = dst_stride << 2; + int32_t stride2 = stride << 1; + int32_t stride3 = stride + stride2; + + /* FINAL BUTTERFLY : Dependency on Even & Odd */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 144, tmp_odd_buf, 224, + tmp_odd_buf, 96, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 0, tmp_eve_buf, 128, tmp_eve_buf, 64, + tmp_eve_buf, 192, loc0, loc1, loc2, loc3); + + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, + m4, m2, m6); + DUP4_ARG2(__lsx_vsrari_h, m0, 6, m2, 6, m4, 6, m6, 6, m0, m2, m4, m6); + VP9_ADDBLK_ST8x4_UB(dst, stride, stride2, stride3, m0, m2, m4, m6); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, + m2, m4, m0); + DUP4_ARG2(__lsx_vsrari_h, m0, 6, m2, 6, m4, 6, m6, 6, m0, m2, m4, m6); + VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), stride, stride2, stride3, m0, m2, + m4, m6); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 208, tmp_odd_buf, 160, + tmp_odd_buf, 48, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 32, tmp_eve_buf, 160, tmp_eve_buf, 96, + tmp_eve_buf, 224, loc0, loc1, loc2, loc3); + + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, + m5, m3, m7); + DUP4_ARG2(__lsx_vsrari_h, m1, 6, m3, 6, m5, 6, m7, 6, m1, m3, m5, m7); + VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), stride, stride2, stride3, m1, m3, + m5, m7); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, + m3, m5, m1); + DUP4_ARG2(__lsx_vsrari_h, m1, 6, m3, 6, m5, 6, 
m7, 6, m1, m3, m5, m7); + VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), stride, stride2, stride3, m1, m3, + m5, m7); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 32, tmp_odd_buf, 176, tmp_odd_buf, 192, + tmp_odd_buf, 112, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 16, tmp_eve_buf, 144, tmp_eve_buf, 80, + tmp_eve_buf, 208, loc0, loc1, loc2, loc3); + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, + n4, n2, n6); + DUP4_ARG2(__lsx_vsrari_h, n0, 6, n2, 6, n4, 6, n6, 6, n0, n2, n4, n6); + VP9_ADDBLK_ST8x4_UB((dst + dst_stride), stride, stride2, stride3, n0, n2, n4, + n6); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, + n2, n4, n0); + DUP4_ARG2(__lsx_vsrari_h, n0, 6, n2, 6, n4, 6, n6, 6, n0, n2, n4, n6); + VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), stride, stride2, stride3, n0, n2, + n4, n6); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 80, tmp_odd_buf, 240, tmp_odd_buf, 128, + tmp_odd_buf, 16, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 48, tmp_eve_buf, 176, tmp_eve_buf, 112, + tmp_eve_buf, 240, loc0, loc1, loc2, loc3); + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, + n5, n3, n7); + DUP4_ARG2(__lsx_vsrari_h, n1, 6, n3, 6, n5, 6, n7, 6, n1, n3, n5, n7); + VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), stride, stride2, stride3, n1, n3, + n5, n7); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, + n3, n5, n1); + DUP4_ARG2(__lsx_vsrari_h, n1, 6, n3, 6, n5, 6, n7, 6, n1, n3, n5, n7); + VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), stride, stride2, stride3, n1, n3, + n5, n7); +} + +static void idct8x32_1d_columns_addblk_lsx(int16_t *input, uint8_t *dst, + int32_t dst_stride) { + DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); + DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); + + idct8x32_column_even_process_store(input, &tmp_eve_buf[0]); + idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]); + idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst, + dst_stride); +} + +void vpx_idct32x32_1024_add_lsx(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); + int16_t *out_ptr = out_arr; + + /* transform rows */ + for (i = 0; i < 4; ++i) { + /* process 32 * 8 block */ + idct32x8_1d_rows_lsx((input + (i << 8)), (out_ptr + (i << 8))); + } + + for (i = 0; i < 4; ++i) { + /* process 8 * 32 block */ + idct8x32_1d_columns_addblk_lsx((out_ptr + (i << 3)), (dst + (i << 3)), + dst_stride); + } +} + +void vpx_idct32x32_34_add_lsx(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); + int16_t *out_ptr = out_arr; + __m128i zero = __lsx_vldi(0); + + for (i = 32; i--;) { + __lsx_vst(zero, out_ptr, 0); + __lsx_vst(zero, out_ptr, 16); + __lsx_vst(zero, out_ptr, 32); + __lsx_vst(zero, out_ptr, 48); + out_ptr += 32; + } + + out_ptr = out_arr; + + /* rows: only upper-left 8x8 has non-zero coeff */ + idct32x8_1d_rows_lsx(input, out_ptr); + + /* transform columns */ + for (i = 0; i < 4; ++i) { + /* process 8 * 32 block */ + idct8x32_1d_columns_addblk_lsx((out_ptr + (i << 3)), (dst + (i << 3)), + dst_stride); + } +} + +void vpx_idct32x32_1_add_lsx(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + int16_t out; + __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; + __m128i res0, res1, res2, res3, res4, res5, res6, res7, vec; + + out = 
ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO(out, 6); + + vec = __lsx_vreplgr2vr_h(out); + + for (i = 16; i--;) { + DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1); + dst2 = __lsx_vldx(dst, dst_stride); + dst3 = __lsx_vldx(dst + 16, dst_stride); + + UNPCK_UB_SH(dst0, res0, res4); + UNPCK_UB_SH(dst1, res1, res5); + UNPCK_UB_SH(dst2, res2, res6); + UNPCK_UB_SH(dst3, res3, res7); + + DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec, res0, + res1, res2, res3); + DUP4_ARG2(__lsx_vadd_h, res4, vec, res5, vec, res6, vec, res7, vec, res4, + res5, res6, res7); + DUP4_ARG3(__lsx_vssrarni_bu_h, res4, res0, 0, res5, res1, 0, res6, res2, 0, + res7, res3, 0, tmp0, tmp1, tmp2, tmp3); + __lsx_vst(tmp0, dst, 0); + __lsx_vst(tmp1, dst, 16); + dst += dst_stride; + __lsx_vst(tmp2, dst, 0); + __lsx_vst(tmp3, dst, 16); + dst += dst_stride; + } +} diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 6da133e383..3eba23c0af 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -269,6 +269,8 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c + +DSP_SRCS-$(HAVE_LSX) += loongarch/idct32x32_lsx.c else # CONFIG_VP9_HIGHBITDEPTH DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct4x4_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct8x8_add_neon.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 436bee94dd..7d78dc72ac 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -652,12 +652,12 @@ () $vpx_idct16x16_38_add_msa=vpx_idct16x16_256_add_msa; specialize qw/vpx_idct16x16_10_add dspr2 msa/; specialize qw/vpx_idct16x16_1_add dspr2 msa/; - specialize qw/vpx_idct32x32_1024_add dspr2 msa/; + specialize qw/vpx_idct32x32_1024_add dspr2 msa lsx/; specialize qw/vpx_idct32x32_135_add dspr2 msa/; $vpx_idct32x32_135_add_dspr2=vpx_idct32x32_1024_add_dspr2; $vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa; - specialize qw/vpx_idct32x32_34_add dspr2 msa/; - specialize qw/vpx_idct32x32_1_add dspr2 msa/; + specialize qw/vpx_idct32x32_34_add dspr2 msa lsx/; + specialize qw/vpx_idct32x32_1_add dspr2 msa lsx/; specialize qw/vpx_iwht4x4_16_add msa/; specialize qw/vpx_iwht4x4_1_add msa/; } # !CONFIG_VP9_HIGHBITDEPTH From c8b9bf2b289a5755c0cc1187ebf07e7af75ef37d Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 15 Apr 2022 22:23:05 -0700 Subject: [PATCH 254/926] vp8: fix some implicit unsigned -> int conversions fixes some warnings with clang-13 -fsanitize=integer: vp8/decoder/threading.c:77:27: runtime error: implicit conversion from type 'unsigned int' of value 4294967295 (32-bit, unsigned) to type 'int' changed the value to -1 (32-bit, signed) these bitmask constants were missed in: 1676cddaa vp8: fix some implicit signed -> unsigned conv warnings Bug: webm:1759 Change-Id: I5d894d08fd41e32b91b56a4d91276837b3415ee4 --- vp8/decoder/threading.c | 4 ++-- vp8/encoder/ethreading.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c index 491e2ce4c1..490f62d1b3 100644 --- a/vp8/decoder/threading.c +++ b/vp8/decoder/threading.c @@ -74,9 +74,9 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, memcpy(mbd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2)); memcpy(mbd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv)); 
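/* Clarifying comment, not part of the upstream patch: 0xffffffff and 0xfffffff8 have type unsigned int, so storing them in the int fullpixel_mask implicitly converts 4294967295 to -1 and 4294967288 to -8, which clang's -fsanitize=integer reports; ~0 and ~7 are plain ints with the same two's-complement bit patterns, so the replacement stores below involve no conversion. */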
- mbd->fullpixel_mask = 0xffffffff; + mbd->fullpixel_mask = ~0; - if (pc->full_pixel) mbd->fullpixel_mask = 0xfffffff8; + if (pc->full_pixel) mbd->fullpixel_mask = ~7; } for (i = 0; i < pc->mb_rows; ++i) diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c index 55a1528b14..cb35f4f491 100644 --- a/vp8/encoder/ethreading.c +++ b/vp8/encoder/ethreading.c @@ -470,8 +470,8 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x, setup_mbby_copy(&mbr_ei[i].mb, x); - mbd->fullpixel_mask = 0xffffffff; - if (cm->full_pixel) mbd->fullpixel_mask = 0xfffffff8; + mbd->fullpixel_mask = ~0; + if (cm->full_pixel) mbd->fullpixel_mask = ~7; vp8_zero(mb->coef_counts); vp8_zero(x->ymode_count); From 9750257826bea4c73557f0612b24e9b85baf7031 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 15 Apr 2022 22:29:31 -0700 Subject: [PATCH 255/926] vp8,get_sub_mv_ref_prob: change arguments to uint32_t this matches the call with int_mv::as_int and fixes a warning with clang-13 -fsanitize=integer: vp8/decoder/decodemv.c:240:32: runtime error: implicit conversion from type 'uint32_t' (aka 'unsigned int') of value 4282515456 (32-bit, unsigned) to type 'int' changed the value to -12451840 (32-bit, signed) Bug: webm:1759 Change-Id: I7c0aa72baa45421929afac26566e149adc6669d7 --- vp8/decoder/decodemv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vp8/decoder/decodemv.c b/vp8/decoder/decodemv.c index 94373852dc..51817a2cb9 100644 --- a/vp8/decoder/decodemv.c +++ b/vp8/decoder/decodemv.c @@ -173,7 +173,8 @@ const vp8_prob vp8_sub_mv_ref_prob3[8][VP8_SUBMVREFS - 1] = { { 208, 1, 1 } /* SUBMVREF_LEFT_ABOVE_ZED */ }; -static const vp8_prob *get_sub_mv_ref_prob(const int left, const int above) { +static const vp8_prob *get_sub_mv_ref_prob(const uint32_t left, + const uint32_t above) { int lez = (left == 0); int aez = (above == 0); int lea = (left == above); From 946bcdf9069c980edd4edad5721262efe26f75ba Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 15 Apr 2022 16:45:26 -0700 Subject: [PATCH 256/926] Upgrade GoogleTest to v1.11.0 The release tag is release-1.11.0. Ref: https://aomedia-review.googlesource.com/c/aom/+/156641 79c98a122 Upgrade GoogleTest to v1.11.0 Note the tree structure differs from libaom, but is left untouched to avoid breaking test include paths in this commit. 
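For context, libvpx test sources reach the vendored headers directly, e.g. (a representative include, not a line touched by this patch):
  #include "third_party/googletest/src/include/gtest/gtest.h"
Moving the headers to match libaom's nested layout would invalidate such paths, which is why the tree structure is kept as-is.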
Change-Id: Ia3c6861d45a3befc2decb1da5b1018bcfd38f95a --- third_party/googletest/README.libvpx | 8 +- third_party/googletest/src/CONTRIBUTORS | 25 + third_party/googletest/src/README.md | 169 ++- .../src/include/gtest/gtest-death-test.h | 45 +- .../src/include/gtest/gtest-matchers.h | 372 +++++-- .../src/include/gtest/gtest-message.h | 6 +- .../src/include/gtest/gtest-param-test.h | 14 +- .../src/include/gtest/gtest-printers.h | 654 +++++++----- .../googletest/src/include/gtest/gtest-spi.h | 6 +- .../src/include/gtest/gtest-test-part.h | 6 +- .../src/include/gtest/gtest-typed-test.h | 14 +- .../googletest/src/include/gtest/gtest.h | 90 +- .../src/include/gtest/gtest_pred_impl.h | 6 +- .../googletest/src/include/gtest/gtest_prod.h | 6 +- .../gtest/internal/custom/gtest-port.h | 6 +- .../gtest/internal/custom/gtest-printers.h | 6 +- .../src/include/gtest/internal/custom/gtest.h | 6 +- .../internal/gtest-death-test-internal.h | 6 +- .../include/gtest/internal/gtest-filepath.h | 8 +- .../include/gtest/internal/gtest-internal.h | 266 +++-- .../include/gtest/internal/gtest-param-util.h | 43 +- .../include/gtest/internal/gtest-port-arch.h | 9 +- .../src/include/gtest/internal/gtest-port.h | 175 +++- .../src/include/gtest/internal/gtest-string.h | 9 +- .../include/gtest/internal/gtest-type-util.h | 48 +- .../googletest/src/src/gtest-death-test.cc | 63 +- .../googletest/src/src/gtest-filepath.cc | 45 +- .../googletest/src/src/gtest-internal-inl.h | 35 +- third_party/googletest/src/src/gtest-port.cc | 56 +- .../googletest/src/src/gtest-printers.cc | 197 +++- .../googletest/src/src/gtest-typed-test.cc | 16 +- third_party/googletest/src/src/gtest.cc | 989 ++++++++++++------ 32 files changed, 2220 insertions(+), 1184 deletions(-) diff --git a/third_party/googletest/README.libvpx b/third_party/googletest/README.libvpx index ed55fb09f9..b9a74922f0 100644 --- a/third_party/googletest/README.libvpx +++ b/third_party/googletest/README.libvpx @@ -1,5 +1,5 @@ URL: https://github.com/google/googletest.git -Version: release-1.10.0-224-g23b2a3b1 +Version: release-1.11.0 License: BSD License File: LICENSE @@ -13,11 +13,9 @@ generation. 
Local Modifications: - Remove everything but: + CONTRIBUTORS googletest/ - CONTRIBUTORS include - LICENSE README.md src -- Enable kErrorOnUninstantiatedParameterizedTest and - kErrorOnUninstantiatedTypeParameterizedTest in gtest.cc + LICENSE diff --git a/third_party/googletest/src/CONTRIBUTORS b/third_party/googletest/src/CONTRIBUTORS index 1e4afe2182..76db0b40ff 100644 --- a/third_party/googletest/src/CONTRIBUTORS +++ b/third_party/googletest/src/CONTRIBUTORS @@ -5,34 +5,59 @@ Ajay Joshi Balázs Dán +Benoit Sigoure Bharat Mediratta +Bogdan Piloca Chandler Carruth Chris Prince Chris Taylor Dan Egnor +Dave MacLachlan +David Anderson +Dean Sturtevant Eric Roman +Gene Volovich Hady Zalek +Hal Burch Jeffrey Yasskin +Jim Keller +Joe Walnes +Jon Wray Jói Sigurðsson Keir Mierle Keith Ray Kenton Varda +Kostya Serebryany Krystian Kuzniarek +Lev Makhlis Manuel Klimek +Mario Tanev +Mark Paskin Markus Heule +Matthew Simmons Mika Raento +Mike Bland Miklós Fazekas +Neal Norwitz +Nermin Ozkiranartli +Owen Carlsen +Paneendra Ba Pasi Valminen Patrick Hanna Patrick Riley +Paul Menage Peter Kaminski +Piotr Kaminski Preston Jackson Rainer Klaffenboeck Russ Cox Russ Rufer Sean Mcafee Sigurður Ásgeirsson +Sverre Sundsdal +Takeshi Yoshino Tracy Bialik Vadim Berman Vlad Losev +Wolfgang Klier Zhanyong Wan diff --git a/third_party/googletest/src/README.md b/third_party/googletest/src/README.md index 904048f484..1f8b349ae7 100644 --- a/third_party/googletest/src/README.md +++ b/third_party/googletest/src/README.md @@ -2,39 +2,51 @@ #### Setup -To build Google Test and your tests that use it, you need to tell your build +To build GoogleTest and your tests that use it, you need to tell your build system where to find its headers and source files. The exact way to do it depends on which build system you use, and is usually straightforward. ### Build with CMake -Google Test comes with a CMake build script +GoogleTest comes with a CMake build script ([CMakeLists.txt](https://github.com/google/googletest/blob/master/CMakeLists.txt)) that can be used on a wide range of platforms ("C" stands for cross-platform.). If you don't have CMake installed already, you can download it for free from . CMake works by generating native makefiles or build projects that can be used in -the compiler environment of your choice. You can either build Google Test as a +the compiler environment of your choice. You can either build GoogleTest as a standalone project or it can be incorporated into an existing CMake build for another project. #### Standalone CMake Project -When building Google Test as a standalone project, the typical workflow starts -with: +When building GoogleTest as a standalone project, the typical workflow starts +with - mkdir mybuild # Create a directory to hold the build output. - cd mybuild - cmake ${GTEST_DIR} # Generate native build scripts. +``` +git clone https://github.com/google/googletest.git -b release-1.10.0 +cd googletest # Main directory of the cloned repository. +mkdir build # Create a directory to hold the build output. +cd build +cmake .. # Generate native build scripts for GoogleTest. +``` -If you want to build Google Test's samples, you should replace the last command -with +The above command also includes GoogleMock by default. And so, if you want to +build only GoogleTest, you should replace the last command with - cmake -Dgtest_build_samples=ON ${GTEST_DIR} +``` +cmake .. -DBUILD_GMOCK=OFF +``` If you are on a \*nix system, you should now see a Makefile in the current -directory. 
Just type 'make' to build gtest. +directory. Just type `make` to build GoogleTest. And then you can simply install +GoogleTest if you are a system administrator. + +``` +make +sudo make install # Install in /usr/local/ by default +``` If you use Windows and have Visual Studio installed, a `gtest.sln` file and several `.vcproj` files will be created. You can then build them using Visual @@ -44,13 +56,19 @@ On Mac OS X with Xcode installed, a `.xcodeproj` file will be generated. #### Incorporating Into An Existing CMake Project -If you want to use gtest in a project which already uses CMake, then a more -robust and flexible approach is to build gtest as part of that project directly. -This is done by making the GoogleTest source code available to the main build -and adding it using CMake's `add_subdirectory()` command. This has the -significant advantage that the same compiler and linker settings are used -between gtest and the rest of your project, so issues associated with using -incompatible libraries (eg debug/release), etc. are avoided. This is +If you want to use GoogleTest in a project which already uses CMake, the easiest +way is to get installed libraries and headers. + +* Import GoogleTest by using `find_package` (or `pkg_check_modules`). For + example, if `find_package(GTest CONFIG REQUIRED)` succeeds, you can use the + libraries as `GTest::gtest`, `GTest::gmock`. + +And a more robust and flexible approach is to build GoogleTest as part of that +project directly. This is done by making the GoogleTest source code available to +the main build and adding it using CMake's `add_subdirectory()` command. This +has the significant advantage that the same compiler and linker settings are +used between GoogleTest and the rest of your project, so issues associated with +using incompatible libraries (eg debug/release), etc. are avoided. This is particularly useful on Windows. Making GoogleTest's source code available to the main build can be done a few different ways: @@ -64,68 +82,23 @@ main build can be done a few different ways: possible or appropriate. Git submodules, for example, have their own set of advantages and drawbacks. * Use CMake to download GoogleTest as part of the build's configure step. This - is just a little more complex, but doesn't have the limitations of the other - methods. + approach doesn't have the limitations of the other methods. -The last of the above methods is implemented with a small piece of CMake code in -a separate file (e.g. `CMakeLists.txt.in`) which is copied to the build area and -then invoked as a sub-build _during the CMake stage_. That directory is then -pulled into the main build with `add_subdirectory()`. For example: +The last of the above methods is implemented with a small piece of CMake code +that downloads and pulls the GoogleTest code into the main build. -New file `CMakeLists.txt.in`: +Just add to your `CMakeLists.txt`: ```cmake -cmake_minimum_required(VERSION 2.8.2) - -project(googletest-download NONE) - -include(ExternalProject) -ExternalProject_Add(googletest - GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG master - SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-src" - BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-build" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" +include(FetchContent) +FetchContent_Declare( + googletest + # Specify the commit you depend on and update it regularly. 
+ URL https://github.com/google/googletest/archive/609281088cfefc76f9d0ce82e1ff6c30cc3591e5.zip ) -``` - -Existing build's `CMakeLists.txt`: - -```cmake -# Download and unpack googletest at configure time -configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt) -execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . - RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download ) -if(result) - message(FATAL_ERROR "CMake step for googletest failed: ${result}") -endif() -execute_process(COMMAND ${CMAKE_COMMAND} --build . - RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download ) -if(result) - message(FATAL_ERROR "Build step for googletest failed: ${result}") -endif() - -# Prevent overriding the parent project's compiler/linker -# settings on Windows +# For Windows: Prevent overriding the parent project's compiler/linker settings set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) - -# Add googletest directly to our build. This defines -# the gtest and gtest_main targets. -add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src - ${CMAKE_CURRENT_BINARY_DIR}/googletest-build - EXCLUDE_FROM_ALL) - -# The gtest/gtest_main targets carry header search path -# dependencies automatically when using CMake 2.8.11 or -# later. Otherwise we have to add them here ourselves. -if (CMAKE_VERSION VERSION_LESS 2.8.11) - include_directories("${gtest_SOURCE_DIR}/include") -endif() +FetchContent_MakeAvailable(googletest) # Now simply link against gtest or gtest_main as needed. Eg add_executable(example example.cpp) @@ -133,20 +106,18 @@ target_link_libraries(example gtest_main) add_test(NAME example_test COMMAND example) ``` -Note that this approach requires CMake 2.8.2 or later due to its use of the -`ExternalProject_Add()` command. The above technique is discussed in more detail -in [this separate article](http://crascit.com/2015/07/25/cmake-gtest/) which -also contains a link to a fully generalized implementation of the technique. +Note that this approach requires CMake 3.14 or later due to its use of the +`FetchContent_MakeAvailable()` command. ##### Visual Studio Dynamic vs Static Runtimes By default, new Visual Studio projects link the C runtimes dynamically but -Google Test links them statically. This will generate an error that looks +GoogleTest links them statically. This will generate an error that looks something like the following: gtest.lib(gtest-all.obj) : error LNK2038: mismatch detected for 'RuntimeLibrary': value 'MTd_StaticDebug' doesn't match value 'MDd_DynamicDebug' in main.obj -Google Test already has a CMake option for this: `gtest_force_shared_crt` +GoogleTest already has a CMake option for this: `gtest_force_shared_crt` Enabling this option will make gtest link the runtimes dynamically too, and match the project in which it is included. @@ -154,17 +125,17 @@ match the project in which it is included. #### C++ Standard Version An environment that supports C++11 is required in order to successfully build -Google Test. One way to ensure this is to specify the standard in the top-level +GoogleTest. One way to ensure this is to specify the standard in the top-level project, for example by using the `set(CMAKE_CXX_STANDARD 11)` command. If this -is not feasible, for example in a C project using Google Test for validation, +is not feasible, for example in a C project using GoogleTest for validation, then it can be specified by adding it to the options for cmake via the `DCMAKE_CXX_FLAGS` option. 
-### Tweaking Google Test +### Tweaking GoogleTest -Google Test can be used in diverse environments. The default configuration may +GoogleTest can be used in diverse environments. The default configuration may not work (or may not work well) out of the box in some environments. However, -you can easily tweak Google Test by defining control macros on the compiler +you can easily tweak GoogleTest by defining control macros on the compiler command line. Generally, these macros are named like `GTEST_XYZ` and you define them to either 1 or 0 to enable or disable a certain feature. @@ -173,12 +144,12 @@ We list the most frequently used macros below. For a complete list, see file ### Multi-threaded Tests -Google Test is thread-safe where the pthread library is available. After +GoogleTest is thread-safe where the pthread library is available. After `#include "gtest/gtest.h"`, you can check the `GTEST_IS_THREADSAFE` macro to see whether this is the case (yes if the macro is `#defined` to 1, no if it's undefined.). -If Google Test doesn't correctly detect whether pthread is available in your +If GoogleTest doesn't correctly detect whether pthread is available in your environment, you can force it with -DGTEST_HAS_PTHREAD=1 @@ -187,16 +158,16 @@ or -DGTEST_HAS_PTHREAD=0 -When Google Test uses pthread, you may need to add flags to your compiler and/or +When GoogleTest uses pthread, you may need to add flags to your compiler and/or linker to select the pthread library, or you'll get link errors. If you use the -CMake script or the deprecated Autotools script, this is taken care of for you. -If you use your own build script, you'll need to read your compiler and linker's -manual to figure out what flags to add. +CMake script, this is taken care of for you. If you use your own build script, +you'll need to read your compiler and linker's manual to figure out what flags +to add. ### As a Shared Library (DLL) -Google Test is compact, so most users can build and link it as a static library -for the simplicity. You can choose to use Google Test as a shared library (known +GoogleTest is compact, so most users can build and link it as a static library +for the simplicity. You can choose to use GoogleTest as a shared library (known as a DLL on Windows) if you prefer. To compile *gtest* as a shared library, add @@ -216,22 +187,22 @@ Note: while the above steps aren't technically necessary today when using some compilers (e.g. GCC), they may become necessary in the future, if we decide to improve the speed of loading the library (see for details). Therefore you are recommended -to always add the above flags when using Google Test as a shared library. -Otherwise a future release of Google Test may break your build script. +to always add the above flags when using GoogleTest as a shared library. +Otherwise a future release of GoogleTest may break your build script. ### Avoiding Macro Name Clashes In C++, macros don't obey namespaces. Therefore two libraries that both define a macro of the same name will clash if you `#include` both definitions. In case a -Google Test macro clashes with another library, you can force Google Test to +GoogleTest macro clashes with another library, you can force GoogleTest to rename its macro to avoid the conflict. 
-Specifically, if both Google Test and some other code define macro FOO, you can +Specifically, if both GoogleTest and some other code define macro FOO, you can add -DGTEST_DONT_DEFINE_FOO=1 -to the compiler flags to tell Google Test to change the macro's name from `FOO` +to the compiler flags to tell GoogleTest to change the macro's name from `FOO` to `GTEST_FOO`. Currently `FOO` can be `FAIL`, `SUCCEED`, or `TEST`. For example, with `-DGTEST_DONT_DEFINE_TEST=1`, you'll need to write diff --git a/third_party/googletest/src/include/gtest/gtest-death-test.h b/third_party/googletest/src/include/gtest/gtest-death-test.h index dc878ffbb3..9b4d4d1337 100644 --- a/third_party/googletest/src/include/gtest/gtest-death-test.h +++ b/third_party/googletest/src/include/gtest/gtest-death-test.h @@ -35,8 +35,8 @@ // directly. // GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ -#define GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ #include "gtest/internal/gtest-death-test-internal.h" @@ -97,6 +97,10 @@ GTEST_API_ bool InDeathTestChild(); // // ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!"); // +// The final parameter to each of these macros is a matcher applied to any data +// the sub-process wrote to stderr. For compatibility with existing tests, a +// bare string is interpreted as a regular expression matcher. +// // On the regular expressions used in death tests: // // GOOGLETEST_CM0005 DO NOT DELETE @@ -162,27 +166,27 @@ GTEST_API_ bool InDeathTestChild(); // directory in PATH. // -// Asserts that a given statement causes the program to exit, with an -// integer exit status that satisfies predicate, and emitting error output -// that matches regex. -# define ASSERT_EXIT(statement, predicate, regex) \ - GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_FATAL_FAILURE_) +// Asserts that a given `statement` causes the program to exit, with an +// integer exit status that satisfies `predicate`, and emitting error output +// that matches `matcher`. +# define ASSERT_EXIT(statement, predicate, matcher) \ + GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_FATAL_FAILURE_) -// Like ASSERT_EXIT, but continues on to successive tests in the +// Like `ASSERT_EXIT`, but continues on to successive tests in the // test suite, if any: -# define EXPECT_EXIT(statement, predicate, regex) \ - GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_NONFATAL_FAILURE_) +# define EXPECT_EXIT(statement, predicate, matcher) \ + GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_NONFATAL_FAILURE_) -// Asserts that a given statement causes the program to exit, either by +// Asserts that a given `statement` causes the program to exit, either by // explicitly exiting with a nonzero exit code or being killed by a -// signal, and emitting error output that matches regex. -# define ASSERT_DEATH(statement, regex) \ - ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex) +// signal, and emitting error output that matches `matcher`. 
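// For example (an illustration assuming gmock's string matchers are
// available; not text from this header):
//
//   EXPECT_DEATH(Crash(), "fatal error");                       // bare string = regex
//   EXPECT_DEATH(Crash(), ::testing::HasSubstr("fatal error")); // any string matcher
//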
+# define ASSERT_DEATH(statement, matcher) \ + ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher) -// Like ASSERT_DEATH, but continues on to successive tests in the +// Like `ASSERT_DEATH`, but continues on to successive tests in the // test suite, if any: -# define EXPECT_DEATH(statement, regex) \ - EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex) +# define EXPECT_DEATH(statement, matcher) \ + EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher) // Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*: @@ -190,11 +194,10 @@ GTEST_API_ bool InDeathTestChild(); class GTEST_API_ ExitedWithCode { public: explicit ExitedWithCode(int exit_code); + ExitedWithCode(const ExitedWithCode&) = default; + void operator=(const ExitedWithCode& other) = delete; bool operator()(int exit_status) const; private: - // No implementation - assignment is unsupported. - void operator=(const ExitedWithCode& other); - const int exit_code_; }; @@ -340,4 +343,4 @@ class GTEST_API_ KilledBySignal { } // namespace testing -#endif // GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ diff --git a/third_party/googletest/src/include/gtest/gtest-matchers.h b/third_party/googletest/src/include/gtest/gtest-matchers.h index a61cef4093..9fa34a05ba 100644 --- a/third_party/googletest/src/include/gtest/gtest-matchers.h +++ b/third_party/googletest/src/include/gtest/gtest-matchers.h @@ -32,13 +32,10 @@ // This file implements just enough of the matcher interface to allow // EXPECT_DEATH and friends to accept a matcher argument. -// IWYU pragma: private, include "testing/base/public/gunit.h" -// IWYU pragma: friend third_party/googletest/googlemock/.* -// IWYU pragma: friend third_party/googletest/googletest/.* - -#ifndef GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_ -#define GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_ +#include #include #include #include @@ -63,20 +60,16 @@ GTEST_DISABLE_MSC_WARNINGS_PUSH_( namespace testing { // To implement a matcher Foo for type T, define: -// 1. a class FooMatcherImpl that implements the -// MatcherInterface interface, and -// 2. a factory function that creates a Matcher object from a -// FooMatcherImpl*. -// -// The two-level delegation design makes it possible to allow a user -// to write "v" instead of "Eq(v)" where a Matcher is expected, which -// is impossible if we pass matchers by pointers. It also eases -// ownership management as Matcher objects can now be copied like -// plain values. - -// MatchResultListener is an abstract class. Its << operator can be -// used by a matcher to explain why a value matches or doesn't match. +// 1. a class FooMatcherMatcher that implements the matcher interface: +// using is_gtest_matcher = void; +// bool MatchAndExplain(const T&, std::ostream*); +// (MatchResultListener* can also be used instead of std::ostream*) +// void DescribeTo(std::ostream*); +// void DescribeNegationTo(std::ostream*); // +// 2. a factory function that creates a Matcher object from a +// FooMatcherMatcher. + class MatchResultListener { public: // Creates a listener object with the given underlying ostream. The @@ -113,7 +106,7 @@ inline MatchResultListener::~MatchResultListener() { // An instance of a subclass of this knows how to describe itself as a // matcher. 
-class MatcherDescriberInterface { +class GTEST_API_ MatcherDescriberInterface { public: virtual ~MatcherDescriberInterface() {} @@ -181,31 +174,6 @@ class MatcherInterface : public MatcherDescriberInterface { namespace internal { -// Converts a MatcherInterface to a MatcherInterface. -template -class MatcherInterfaceAdapter : public MatcherInterface { - public: - explicit MatcherInterfaceAdapter(const MatcherInterface* impl) - : impl_(impl) {} - ~MatcherInterfaceAdapter() override { delete impl_; } - - void DescribeTo(::std::ostream* os) const override { impl_->DescribeTo(os); } - - void DescribeNegationTo(::std::ostream* os) const override { - impl_->DescribeNegationTo(os); - } - - bool MatchAndExplain(const T& x, - MatchResultListener* listener) const override { - return impl_->MatchAndExplain(x, listener); - } - - private: - const MatcherInterface* const impl_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(MatcherInterfaceAdapter); -}; - struct AnyEq { template bool operator()(const A& a, const B& b) const { return a == b; } @@ -252,16 +220,35 @@ class StreamMatchResultListener : public MatchResultListener { GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamMatchResultListener); }; +struct SharedPayloadBase { + std::atomic ref{1}; + void Ref() { ref.fetch_add(1, std::memory_order_relaxed); } + bool Unref() { return ref.fetch_sub(1, std::memory_order_acq_rel) == 1; } +}; + +template +struct SharedPayload : SharedPayloadBase { + explicit SharedPayload(const T& v) : value(v) {} + explicit SharedPayload(T&& v) : value(std::move(v)) {} + + static void Destroy(SharedPayloadBase* shared) { + delete static_cast(shared); + } + + T value; +}; + // An internal class for implementing Matcher, which will derive // from it. We put functionalities common to all Matcher // specializations here to avoid code duplication. template -class MatcherBase { +class MatcherBase : private MatcherDescriberInterface { public: // Returns true if and only if the matcher matches x; also explains the // match result to 'listener'. bool MatchAndExplain(const T& x, MatchResultListener* listener) const { - return impl_->MatchAndExplain(x, listener); + GTEST_CHECK_(vtable_ != nullptr); + return vtable_->match_and_explain(*this, x, listener); } // Returns true if and only if this matcher matches x. @@ -271,11 +258,15 @@ class MatcherBase { } // Describes this matcher to an ostream. - void DescribeTo(::std::ostream* os) const { impl_->DescribeTo(os); } + void DescribeTo(::std::ostream* os) const final { + GTEST_CHECK_(vtable_ != nullptr); + vtable_->describe(*this, os, false); + } // Describes the negation of this matcher to an ostream. - void DescribeNegationTo(::std::ostream* os) const { - impl_->DescribeNegationTo(os); + void DescribeNegationTo(::std::ostream* os) const final { + GTEST_CHECK_(vtable_ != nullptr); + vtable_->describe(*this, os, true); } // Explains why x matches, or doesn't match, the matcher. @@ -288,31 +279,194 @@ class MatcherBase { // of the describer, which is only guaranteed to be alive when // this matcher object is alive. const MatcherDescriberInterface* GetDescriber() const { - return impl_.get(); + if (vtable_ == nullptr) return nullptr; + return vtable_->get_describer(*this); } protected: - MatcherBase() {} + MatcherBase() : vtable_(nullptr) {} // Constructs a matcher from its implementation. 
- explicit MatcherBase(const MatcherInterface* impl) : impl_(impl) {} - template - explicit MatcherBase( - const MatcherInterface* impl, - typename std::enable_if::value>::type* = - nullptr) - : impl_(new internal::MatcherInterfaceAdapter(impl)) {} + explicit MatcherBase(const MatcherInterface* impl) { + Init(impl); + } + + template ::type::is_gtest_matcher> + MatcherBase(M&& m) { // NOLINT + Init(std::forward(m)); + } + + MatcherBase(const MatcherBase& other) + : vtable_(other.vtable_), buffer_(other.buffer_) { + if (IsShared()) buffer_.shared->Ref(); + } + + MatcherBase& operator=(const MatcherBase& other) { + if (this == &other) return *this; + Destroy(); + vtable_ = other.vtable_; + buffer_ = other.buffer_; + if (IsShared()) buffer_.shared->Ref(); + return *this; + } - MatcherBase(const MatcherBase&) = default; - MatcherBase& operator=(const MatcherBase&) = default; - MatcherBase(MatcherBase&&) = default; - MatcherBase& operator=(MatcherBase&&) = default; + MatcherBase(MatcherBase&& other) + : vtable_(other.vtable_), buffer_(other.buffer_) { + other.vtable_ = nullptr; + } + + MatcherBase& operator=(MatcherBase&& other) { + if (this == &other) return *this; + Destroy(); + vtable_ = other.vtable_; + buffer_ = other.buffer_; + other.vtable_ = nullptr; + return *this; + } - virtual ~MatcherBase() {} + ~MatcherBase() override { Destroy(); } private: - std::shared_ptr> impl_; + struct VTable { + bool (*match_and_explain)(const MatcherBase&, const T&, + MatchResultListener*); + void (*describe)(const MatcherBase&, std::ostream*, bool negation); + // Returns the captured object if it implements the interface, otherwise + // returns the MatcherBase itself. + const MatcherDescriberInterface* (*get_describer)(const MatcherBase&); + // Called on shared instances when the reference count reaches 0. + void (*shared_destroy)(SharedPayloadBase*); + }; + + bool IsShared() const { + return vtable_ != nullptr && vtable_->shared_destroy != nullptr; + } + + // If the implementation uses a listener, call that. + template + static auto MatchAndExplainImpl(const MatcherBase& m, const T& value, + MatchResultListener* listener) + -> decltype(P::Get(m).MatchAndExplain(value, listener->stream())) { + return P::Get(m).MatchAndExplain(value, listener->stream()); + } + + template + static auto MatchAndExplainImpl(const MatcherBase& m, const T& value, + MatchResultListener* listener) + -> decltype(P::Get(m).MatchAndExplain(value, listener)) { + return P::Get(m).MatchAndExplain(value, listener); + } + + template + static void DescribeImpl(const MatcherBase& m, std::ostream* os, + bool negation) { + if (negation) { + P::Get(m).DescribeNegationTo(os); + } else { + P::Get(m).DescribeTo(os); + } + } + + template + static const MatcherDescriberInterface* GetDescriberImpl( + const MatcherBase& m) { + // If the impl is a MatcherDescriberInterface, then return it. + // Otherwise use MatcherBase itself. + // This allows us to implement the GetDescriber() function without support + // from the impl, but some users really want to get their impl back when + // they call GetDescriber(). + // We use std::get on a tuple as a workaround of not having `if constexpr`. + return std::get<( + std::is_convertible::value + ? 1 + : 0)>(std::make_tuple(&m, &P::Get(m))); + } + + template + const VTable* GetVTable() { + static constexpr VTable kVTable = {&MatchAndExplainImpl
<P>, + &DescribeImpl<P>, &GetDescriberImpl<P>
, + P::shared_destroy}; + return &kVTable; + } + + union Buffer { + // Add some types to give Buffer some common alignment/size use cases. + void* ptr; + double d; + int64_t i; + // And add one for the out-of-line cases. + SharedPayloadBase* shared; + }; + + void Destroy() { + if (IsShared() && buffer_.shared->Unref()) { + vtable_->shared_destroy(buffer_.shared); + } + } + + template + static constexpr bool IsInlined() { + return sizeof(M) <= sizeof(Buffer) && alignof(M) <= alignof(Buffer) && + std::is_trivially_copy_constructible::value && + std::is_trivially_destructible::value; + } + + template ()> + struct ValuePolicy { + static const M& Get(const MatcherBase& m) { + // When inlined along with Init, need to be explicit to avoid violating + // strict aliasing rules. + const M *ptr = static_cast( + static_cast(&m.buffer_)); + return *ptr; + } + static void Init(MatcherBase& m, M impl) { + ::new (static_cast(&m.buffer_)) M(impl); + } + static constexpr auto shared_destroy = nullptr; + }; + + template + struct ValuePolicy { + using Shared = SharedPayload; + static const M& Get(const MatcherBase& m) { + return static_cast(m.buffer_.shared)->value; + } + template + static void Init(MatcherBase& m, Arg&& arg) { + m.buffer_.shared = new Shared(std::forward(arg)); + } + static constexpr auto shared_destroy = &Shared::Destroy; + }; + + template + struct ValuePolicy*, B> { + using M = const MatcherInterface; + using Shared = SharedPayload>; + static const M& Get(const MatcherBase& m) { + return *static_cast(m.buffer_.shared)->value; + } + static void Init(MatcherBase& m, M* impl) { + m.buffer_.shared = new Shared(std::unique_ptr(impl)); + } + + static constexpr auto shared_destroy = &Shared::Destroy; + }; + + template + void Init(M&& m) { + using MM = typename std::decay::type; + using Policy = ValuePolicy; + vtable_ = GetVTable(); + Policy::Init(*this, std::forward(m)); + } + + const VTable* vtable_; + Buffer buffer_; }; } // namespace internal @@ -340,6 +494,10 @@ class Matcher : public internal::MatcherBase { nullptr) : internal::MatcherBase(impl) {} + template ::type::is_gtest_matcher> + Matcher(M&& m) : internal::MatcherBase(std::forward(m)) {} // NOLINT + // Implicit constructor here allows people to write // EXPECT_CALL(foo, Bar(5)) instead of EXPECT_CALL(foo, Bar(Eq(5))) sometimes Matcher(T value); // NOLINT @@ -357,6 +515,11 @@ class GTEST_API_ Matcher explicit Matcher(const MatcherInterface* impl) : internal::MatcherBase(impl) {} + template ::type::is_gtest_matcher> + Matcher(M&& m) // NOLINT + : internal::MatcherBase(std::forward(m)) {} + // Allows the user to write str instead of Eq(str) sometimes, where // str is a std::string object. Matcher(const std::string& s); // NOLINT @@ -376,6 +539,11 @@ class GTEST_API_ Matcher explicit Matcher(const MatcherInterface* impl) : internal::MatcherBase(impl) {} + template ::type::is_gtest_matcher> + Matcher(M&& m) // NOLINT + : internal::MatcherBase(std::forward(m)) {} + // Allows the user to write str instead of Eq(str) sometimes, where // str is a string object. Matcher(const std::string& s); // NOLINT @@ -397,6 +565,12 @@ class GTEST_API_ Matcher explicit Matcher(const MatcherInterface* impl) : internal::MatcherBase(impl) {} + template ::type::is_gtest_matcher> + Matcher(M&& m) // NOLINT + : internal::MatcherBase(std::forward(m)) { + } + // Allows the user to write str instead of Eq(str) sometimes, where // str is a std::string object. 
Matcher(const std::string& s); // NOLINT @@ -419,6 +593,11 @@ class GTEST_API_ Matcher explicit Matcher(const MatcherInterface* impl) : internal::MatcherBase(impl) {} + template ::type::is_gtest_matcher> + Matcher(M&& m) // NOLINT + : internal::MatcherBase(std::forward(m)) {} + // Allows the user to write str instead of Eq(str) sometimes, where // str is a std::string object. Matcher(const std::string& s); // NOLINT @@ -529,37 +708,32 @@ template class ComparisonBase { public: explicit ComparisonBase(const Rhs& rhs) : rhs_(rhs) {} + + using is_gtest_matcher = void; + template - operator Matcher() const { - return Matcher(new Impl(rhs_)); + bool MatchAndExplain(const Lhs& lhs, std::ostream*) const { + return Op()(lhs, Unwrap(rhs_)); + } + void DescribeTo(std::ostream* os) const { + *os << D::Desc() << " "; + UniversalPrint(Unwrap(rhs_), os); + } + void DescribeNegationTo(std::ostream* os) const { + *os << D::NegatedDesc() << " "; + UniversalPrint(Unwrap(rhs_), os); } private: template - static const T& Unwrap(const T& v) { return v; } + static const T& Unwrap(const T& v) { + return v; + } template - static const T& Unwrap(std::reference_wrapper v) { return v; } - - template - class Impl : public MatcherInterface { - public: - explicit Impl(const Rhs& rhs) : rhs_(rhs) {} - bool MatchAndExplain(Lhs lhs, - MatchResultListener* /* listener */) const override { - return Op()(lhs, Unwrap(rhs_)); - } - void DescribeTo(::std::ostream* os) const override { - *os << D::Desc() << " "; - UniversalPrint(Unwrap(rhs_), os); - } - void DescribeNegationTo(::std::ostream* os) const override { - *os << D::NegatedDesc() << " "; - UniversalPrint(Unwrap(rhs_), os); - } + static const T& Unwrap(std::reference_wrapper v) { + return v; + } - private: - Rhs rhs_; - }; Rhs rhs_; }; @@ -612,6 +786,10 @@ class GeMatcher : public ComparisonBase, Rhs, AnyGe> { static const char* NegatedDesc() { return "isn't >="; } }; +template ::value>::type> +using StringLike = T; + // Implements polymorphic matchers MatchesRegex(regex) and // ContainsRegex(regex), which can be used as a Matcher as long as // T can be converted to a string. @@ -672,9 +850,10 @@ inline PolymorphicMatcher MatchesRegex( const internal::RE* regex) { return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, true)); } -inline PolymorphicMatcher MatchesRegex( - const std::string& regex) { - return MatchesRegex(new internal::RE(regex)); +template +PolymorphicMatcher MatchesRegex( + const internal::StringLike& regex) { + return MatchesRegex(new internal::RE(std::string(regex))); } // Matches a string that contains regular expression 'regex'. @@ -683,9 +862,10 @@ inline PolymorphicMatcher ContainsRegex( const internal::RE* regex) { return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, false)); } -inline PolymorphicMatcher ContainsRegex( - const std::string& regex) { - return ContainsRegex(new internal::RE(regex)); +template +PolymorphicMatcher ContainsRegex( + const internal::StringLike& regex) { + return ContainsRegex(new internal::RE(std::string(regex))); } // Creates a polymorphic matcher that matches anything equal to x. 
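// Taken together, the MatcherBase rewrite (vtable_ plus inline buffer_) and
// the is_gtest_matcher tag on ComparisonBase mean simple matchers no longer
// heap-allocate a MatcherInterface. A minimal sketch of the new protocol
// (IsEven is a made-up example, not upstream code):
//
//   struct IsEven {
//     using is_gtest_matcher = void;
//     bool MatchAndExplain(int n, std::ostream*) const { return n % 2 == 0; }
//     void DescribeTo(std::ostream* os) const { *os << "is even"; }
//     void DescribeNegationTo(std::ostream* os) const { *os << "is odd"; }
//   };
//
//   ::testing::Matcher<int> m1 = IsEven();          // small and trivially
//                                                   // copyable: stored inline
//   ::testing::Matcher<int> m2 = ::testing::Gt(5);  // comparison matchers
//                                                   // take the same path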
@@ -747,4 +927,4 @@ inline internal::NeMatcher Ne(Rhs x) { GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 5046 -#endif // GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_ diff --git a/third_party/googletest/src/include/gtest/gtest-message.h b/third_party/googletest/src/include/gtest/gtest-message.h index 21899232a2..becfd49fcb 100644 --- a/third_party/googletest/src/include/gtest/gtest-message.h +++ b/third_party/googletest/src/include/gtest/gtest-message.h @@ -44,8 +44,8 @@ // GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ -#define GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ #include #include @@ -216,4 +216,4 @@ std::string StreamableToString(const T& streamable) { GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 -#endif // GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ diff --git a/third_party/googletest/src/include/gtest/gtest-param-test.h b/third_party/googletest/src/include/gtest/gtest-param-test.h index 5b039df9f6..804e702817 100644 --- a/third_party/googletest/src/include/gtest/gtest-param-test.h +++ b/third_party/googletest/src/include/gtest/gtest-param-test.h @@ -30,12 +30,9 @@ // Macros and functions for implementing parameterized tests // in Google C++ Testing and Mocking Framework (Google Test) // -// This file is generated by a SCRIPT. DO NOT EDIT BY HAND! -// // GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ -#define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ - +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ // Value-parameterized tests allow you to test your code with different // parameters without writing multiple copies of the same test. @@ -371,8 +368,6 @@ inline internal::ParamGenerator Bool() { // std::tuple where T1, T2, ..., TN are the types // of elements from sequences produces by gen1, gen2, ..., genN. // -// Combine can have up to 10 arguments. -// // Example: // // This will instantiate tests in test suite AnimalTest each one with @@ -428,7 +423,8 @@ internal::CartesianProductHolder Combine(const Generator&... g) { ->AddTestPattern( \ GTEST_STRINGIFY_(test_suite_name), GTEST_STRINGIFY_(test_name), \ new ::testing::internal::TestMetaFactory()); \ + test_suite_name, test_name)>(), \ + ::testing::internal::CodeLocation(__FILE__, __LINE__)); \ return 0; \ } \ static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \ @@ -508,4 +504,4 @@ internal::CartesianProductHolder Combine(const Generator&... 
g) { } // namespace testing -#endif // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ diff --git a/third_party/googletest/src/include/gtest/gtest-printers.h b/third_party/googletest/src/include/gtest/gtest-printers.h index 407d1f1859..076c9de1f4 100644 --- a/third_party/googletest/src/include/gtest/gtest-printers.h +++ b/third_party/googletest/src/include/gtest/gtest-printers.h @@ -97,10 +97,11 @@ // GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ -#define GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ #include +#include #include // NOLINT #include #include @@ -108,64 +109,124 @@ #include #include #include + #include "gtest/internal/gtest-internal.h" #include "gtest/internal/gtest-port.h" -#if GTEST_HAS_ABSL -#include "absl/strings/string_view.h" -#include "absl/types/optional.h" -#include "absl/types/variant.h" -#endif // GTEST_HAS_ABSL - namespace testing { -// Definitions in the 'internal' and 'internal2' name spaces are -// subject to change without notice. DO NOT USE THEM IN USER CODE! -namespace internal2 { +// Definitions in the internal* namespaces are subject to change without notice. +// DO NOT USE THEM IN USER CODE! +namespace internal { -// Prints the given number of bytes in the given object to the given -// ostream. -GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes, - size_t count, - ::std::ostream* os); +template +void UniversalPrint(const T& value, ::std::ostream* os); -// For selecting which printer to use when a given type has neither << -// nor PrintTo(). -enum TypeKind { - kProtobuf, // a protobuf type - kConvertibleToInteger, // a type implicitly convertible to BiggestInt - // (e.g. a named or unnamed enum type) -#if GTEST_INTERNAL_HAS_STRING_VIEW - kConvertibleToStringView, // a type implicitly convertible to - // absl::string_view or std::string_view -#endif - kOtherType // anything else +// Used to print an STL-style container when the user doesn't define +// a PrintTo() for it. +struct ContainerPrinter { + template (0)) == sizeof(IsContainer)) && + !IsRecursiveContainer::value>::type> + static void PrintValue(const T& container, std::ostream* os) { + const size_t kMaxCount = 32; // The maximum number of elements to print. + *os << '{'; + size_t count = 0; + for (auto&& elem : container) { + if (count > 0) { + *os << ','; + if (count == kMaxCount) { // Enough has been printed. + *os << " ..."; + break; + } + } + *os << ' '; + // We cannot call PrintTo(elem, os) here as PrintTo() doesn't + // handle `elem` being a native array. + internal::UniversalPrint(elem, os); + ++count; + } + + if (count > 0) { + *os << ' '; + } + *os << '}'; + } }; -// TypeWithoutFormatter::PrintValue(value, os) is called -// by the universal printer to print a value of type T when neither -// operator<< nor PrintTo() is defined for T, where kTypeKind is the -// "kind" of T as defined by enum TypeKind. -template -class TypeWithoutFormatter { - public: - // This default version is called when kTypeKind is kOtherType. +// Used to print a pointer that is neither a char pointer nor a member +// pointer, when the user doesn't define PrintTo() for it. (A member +// variable pointer or member function pointer doesn't really point to +// a location in the address space. Their representation is +// implementation-defined. Therefore they will be printed as raw +// bytes.) 
+struct FunctionPointerPrinter { + template ::value>::type> + static void PrintValue(T* p, ::std::ostream* os) { + if (p == nullptr) { + *os << "NULL"; + } else { + // T is a function type, so '*os << p' doesn't do what we want + // (it just prints p as bool). We want to print p as a const + // void*. + *os << reinterpret_cast(p); + } + } +}; + +struct PointerPrinter { + template + static void PrintValue(T* p, ::std::ostream* os) { + if (p == nullptr) { + *os << "NULL"; + } else { + // T is not a function type. We just call << to print p, + // relying on ADL to pick up user-defined << for their pointer + // types, if any. + *os << p; + } + } +}; + +namespace internal_stream_operator_without_lexical_name_lookup { + +// The presence of an operator<< here will terminate lexical scope lookup +// straight away (even though it cannot be a match because of its argument +// types). Thus, the two operator<< calls in StreamPrinter will find only ADL +// candidates. +struct LookupBlocker {}; +void operator<<(LookupBlocker, LookupBlocker); + +struct StreamPrinter { + template ::value>::type, + // Only accept types for which we can find a streaming operator via + // ADL (possibly involving implicit conversions). + typename = decltype(std::declval() + << std::declval())> static void PrintValue(const T& value, ::std::ostream* os) { - PrintBytesInObjectTo( - static_cast( - reinterpret_cast(std::addressof(value))), - sizeof(value), os); + // Call streaming operator found by ADL, possibly with implicit conversions + // of the arguments. + *os << value; } }; -// We print a protobuf using its ShortDebugString() when the string -// doesn't exceed this many characters; otherwise we print it using -// DebugString() for better readability. -const size_t kProtobufOneLinerMaxLength = 50; +} // namespace internal_stream_operator_without_lexical_name_lookup -template -class TypeWithoutFormatter { - public: +struct ProtobufPrinter { + // We print a protobuf using its ShortDebugString() when the string + // doesn't exceed this many characters; otherwise we print it using + // DebugString() for better readability. + static const size_t kProtobufOneLinerMaxLength = 50; + + template ::value>::type> static void PrintValue(const T& value, ::std::ostream* os) { std::string pretty_str = value.ShortDebugString(); if (pretty_str.length() > kProtobufOneLinerMaxLength) { @@ -175,9 +236,7 @@ class TypeWithoutFormatter { } }; -template -class TypeWithoutFormatter { - public: +struct ConvertibleToIntegerPrinter { // Since T has no << operator or PrintTo() but can be implicitly // converted to BiggestInt, we print it as a BiggestInt. // @@ -185,111 +244,73 @@ class TypeWithoutFormatter { // case printing it as an integer is the desired behavior. In case // T is not an enum, printing it as an integer is the best we can do // given that it has no user-defined printer. - static void PrintValue(const T& value, ::std::ostream* os) { - const internal::BiggestInt kBigInt = value; - *os << kBigInt; + static void PrintValue(internal::BiggestInt value, ::std::ostream* os) { + *os << value; } }; +struct ConvertibleToStringViewPrinter { #if GTEST_INTERNAL_HAS_STRING_VIEW -template -class TypeWithoutFormatter { - public: - // Since T has neither operator<< nor PrintTo() but can be implicitly - // converted to absl::string_view, we print it as a absl::string_view - // (or std::string_view). - // - // Note: the implementation is further below, as it depends on - // internal::PrintTo symbol which is defined later in the file. 
- static void PrintValue(const T& value, ::std::ostream* os); -}; + static void PrintValue(internal::StringView value, ::std::ostream* os) { + internal::UniversalPrint(value, os); + } #endif +}; -// Prints the given value to the given ostream. If the value is a -// protocol message, its debug string is printed; if it's an enum or -// of a type implicitly convertible to BiggestInt, it's printed as an -// integer; otherwise the bytes in the value are printed. This is -// what UniversalPrinter::Print() does when it knows nothing about -// type T and T has neither << operator nor PrintTo(). -// -// A user can override this behavior for a class type Foo by defining -// a << operator in the namespace where Foo is defined. -// -// We put this operator in namespace 'internal2' instead of 'internal' -// to simplify the implementation, as much code in 'internal' needs to -// use << in STL, which would conflict with our own << were it defined -// in 'internal'. -// -// Note that this operator<< takes a generic std::basic_ostream type instead of the more restricted std::ostream. If -// we define it to take an std::ostream instead, we'll get an -// "ambiguous overloads" compiler error when trying to print a type -// Foo that supports streaming to std::basic_ostream, as the compiler cannot tell whether -// operator<<(std::ostream&, const T&) or -// operator<<(std::basic_stream, const Foo&) is more -// specific. -template -::std::basic_ostream& operator<<( - ::std::basic_ostream& os, const T& x) { - TypeWithoutFormatter::value - ? kProtobuf - : std::is_convertible< - const T&, internal::BiggestInt>::value - ? kConvertibleToInteger - : -#if GTEST_INTERNAL_HAS_STRING_VIEW - std::is_convertible< - const T&, internal::StringView>::value - ? kConvertibleToStringView - : -#endif - kOtherType)>::PrintValue(x, &os); - return os; -} -} // namespace internal2 -} // namespace testing +// Prints the given number of bytes in the given object to the given +// ostream. +GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes, + size_t count, + ::std::ostream* os); +struct RawBytesPrinter { + // SFINAE on `sizeof` to make sure we have a complete type. + template + static void PrintValue(const T& value, ::std::ostream* os) { + PrintBytesInObjectTo( + static_cast( + // Load bearing cast to void* to support iOS + reinterpret_cast(std::addressof(value))), + sizeof(value), os); + } +}; -// This namespace MUST NOT BE NESTED IN ::testing, or the name look-up -// magic needed for implementing UniversalPrinter won't work. -namespace testing_internal { +struct FallbackPrinter { + template + static void PrintValue(const T&, ::std::ostream* os) { + *os << "(incomplete type)"; + } +}; -// Used to print a value that is not an STL-style container when the -// user doesn't define PrintTo() for it. -template -void DefaultPrintNonContainerTo(const T& value, ::std::ostream* os) { - // With the following statement, during unqualified name lookup, - // testing::internal2::operator<< appears as if it was declared in - // the nearest enclosing namespace that contains both - // ::testing_internal and ::testing::internal2, i.e. the global - // namespace. For more details, refer to the C++ Standard section - // 7.3.4-1 [namespace.udir]. This allows us to fall back onto - // testing::internal2::operator<< in case T doesn't come with a << - // operator. - - using ::testing::internal2::operator<<; - - // Assuming T is defined in namespace foo, in the next statement, - // the compiler will consider all of: - // - // 1. 
foo::operator<< (thanks to Koenig look-up), - // 2. ::operator<< (as the current namespace is enclosed in ::), - // 3. testing::internal2::operator<< (thanks to the using statement above). - // - // The operator<< whose type matches T best will be picked. - // - // We deliberately allow #2 to be a candidate, as sometimes it's - // impossible to define #1 (e.g. when foo is ::std, defining - // anything in it is undefined behavior unless you are a compiler - // vendor.). - *os << value; -} +// Try every printer in order and return the first one that works. +template +struct FindFirstPrinter : FindFirstPrinter {}; -} // namespace testing_internal +template +struct FindFirstPrinter< + T, decltype(Printer::PrintValue(std::declval(), nullptr)), + Printer, Printers...> { + using type = Printer; +}; -namespace testing { -namespace internal { +// Select the best printer in the following order: +// - Print containers (they have begin/end/etc). +// - Print function pointers. +// - Print object pointers. +// - Use the stream operator, if available. +// - Print protocol buffers. +// - Print types convertible to BiggestInt. +// - Print types convertible to StringView, if available. +// - Fallback to printing the raw bytes of the object. +template +void PrintWithFallback(const T& value, ::std::ostream* os) { + using Printer = typename FindFirstPrinter< + T, void, ContainerPrinter, FunctionPointerPrinter, PointerPrinter, + internal_stream_operator_without_lexical_name_lookup::StreamPrinter, + ProtobufPrinter, ConvertibleToIntegerPrinter, + ConvertibleToStringViewPrinter, RawBytesPrinter, FallbackPrinter>::type; + Printer::PrintValue(value, os); +} // FormatForComparison::Format(value) formats a // value of type ToPrint that is an operand of a comparison assertion @@ -339,6 +360,14 @@ GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char); GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char); GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t); GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t); +#ifdef __cpp_char8_t +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char8_t); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char8_t); +#endif +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char16_t); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char16_t); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char32_t); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char32_t); #undef GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_ @@ -356,6 +385,14 @@ GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t); GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string); GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::std::string); +#ifdef __cpp_char8_t +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char8_t, ::std::u8string); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char8_t, ::std::u8string); +#endif +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char16_t, ::std::u16string); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char16_t, ::std::u16string); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char32_t, ::std::u32string); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char32_t, ::std::u32string); #if GTEST_HAS_STD_WSTRING GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::std::wstring); @@ -388,85 +425,6 @@ std::string FormatForComparisonFailureMessage( template class UniversalPrinter; -template -void UniversalPrint(const T& value, ::std::ostream* os); - -enum DefaultPrinterType { - kPrintContainer, - kPrintPointer, - kPrintFunctionPointer, - kPrintOther, -}; -template struct WrapPrinterType {}; - -// Used to print an STL-style container when the user doesn't define -// a PrintTo() for it. 
-template -void DefaultPrintTo(WrapPrinterType /* dummy */, - const C& container, ::std::ostream* os) { - const size_t kMaxCount = 32; // The maximum number of elements to print. - *os << '{'; - size_t count = 0; - for (typename C::const_iterator it = container.begin(); - it != container.end(); ++it, ++count) { - if (count > 0) { - *os << ','; - if (count == kMaxCount) { // Enough has been printed. - *os << " ..."; - break; - } - } - *os << ' '; - // We cannot call PrintTo(*it, os) here as PrintTo() doesn't - // handle *it being a native array. - internal::UniversalPrint(*it, os); - } - - if (count > 0) { - *os << ' '; - } - *os << '}'; -} - -// Used to print a pointer that is neither a char pointer nor a member -// pointer, when the user doesn't define PrintTo() for it. (A member -// variable pointer or member function pointer doesn't really point to -// a location in the address space. Their representation is -// implementation-defined. Therefore they will be printed as raw -// bytes.) -template -void DefaultPrintTo(WrapPrinterType /* dummy */, - T* p, ::std::ostream* os) { - if (p == nullptr) { - *os << "NULL"; - } else { - // T is not a function type. We just call << to print p, - // relying on ADL to pick up user-defined << for their pointer - // types, if any. - *os << p; - } -} -template -void DefaultPrintTo(WrapPrinterType /* dummy */, - T* p, ::std::ostream* os) { - if (p == nullptr) { - *os << "NULL"; - } else { - // T is a function type, so '*os << p' doesn't do what we want - // (it just prints p as bool). We want to print p as a const - // void*. - *os << reinterpret_cast(p); - } -} - -// Used to print a non-container, non-pointer value when the user -// doesn't define PrintTo() for it. -template -void DefaultPrintTo(WrapPrinterType /* dummy */, - const T& value, ::std::ostream* os) { - ::testing_internal::DefaultPrintNonContainerTo(value, os); -} - // Prints the given value using the << operator if it has one; // otherwise prints the bytes in it. This is what // UniversalPrinter::Print() does when PrintTo() is not specialized @@ -480,36 +438,7 @@ void DefaultPrintTo(WrapPrinterType /* dummy */, // wants). template void PrintTo(const T& value, ::std::ostream* os) { - // DefaultPrintTo() is overloaded. The type of its first argument - // determines which version will be picked. - // - // Note that we check for container types here, prior to we check - // for protocol message types in our operator<<. The rationale is: - // - // For protocol messages, we want to give people a chance to - // override Google Mock's format by defining a PrintTo() or - // operator<<. For STL containers, other formats can be - // incompatible with Google Mock's format for the container - // elements; therefore we check for container types here to ensure - // that our format is used. - // - // Note that MSVC and clang-cl do allow an implicit conversion from - // pointer-to-function to pointer-to-object, but clang-cl warns on it. - // So don't use ImplicitlyConvertible if it can be helped since it will - // cause this warning, and use a separate overload of DefaultPrintTo for - // function pointers so that the `*os << p` in the object pointer overload - // doesn't cause that warning either. - DefaultPrintTo( - WrapPrinterType < - (sizeof(IsContainerTest(0)) == sizeof(IsContainer)) && - !IsRecursiveContainer::value - ? kPrintContainer - : !std::is_pointer::value - ? kPrintOther - : std::is_function::type>::value - ? 
kPrintFunctionPointer - : kPrintPointer > (), - value, os); + internal::PrintWithFallback(value, os); } // The following list of PrintTo() overloads tells @@ -540,6 +469,16 @@ inline void PrintTo(bool x, ::std::ostream* os) { // is implemented as an unsigned type. GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream* os); +GTEST_API_ void PrintTo(char32_t c, ::std::ostream* os); +inline void PrintTo(char16_t c, ::std::ostream* os) { + PrintTo(ImplicitCast_(c), os); +} +#ifdef __cpp_char8_t +inline void PrintTo(char8_t c, ::std::ostream* os) { + PrintTo(ImplicitCast_(c), os); +} +#endif + // Overloads for C strings. GTEST_API_ void PrintTo(const char* s, ::std::ostream* os); inline void PrintTo(char* s, ::std::ostream* os) { @@ -560,6 +499,23 @@ inline void PrintTo(const unsigned char* s, ::std::ostream* os) { inline void PrintTo(unsigned char* s, ::std::ostream* os) { PrintTo(ImplicitCast_(s), os); } +#ifdef __cpp_char8_t +// Overloads for u8 strings. +GTEST_API_ void PrintTo(const char8_t* s, ::std::ostream* os); +inline void PrintTo(char8_t* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +#endif +// Overloads for u16 strings. +GTEST_API_ void PrintTo(const char16_t* s, ::std::ostream* os); +inline void PrintTo(char16_t* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +// Overloads for u32 strings. +GTEST_API_ void PrintTo(const char32_t* s, ::std::ostream* os); +inline void PrintTo(char32_t* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} // MSVC can be configured to define wchar_t as a typedef of unsigned // short. It defines _NATIVE_WCHAR_T_DEFINED when wchar_t is a native @@ -594,6 +550,26 @@ inline void PrintTo(const ::std::string& s, ::std::ostream* os) { PrintStringTo(s, os); } +// Overloads for ::std::u8string +#ifdef __cpp_char8_t +GTEST_API_ void PrintU8StringTo(const ::std::u8string& s, ::std::ostream* os); +inline void PrintTo(const ::std::u8string& s, ::std::ostream* os) { + PrintU8StringTo(s, os); +} +#endif + +// Overloads for ::std::u16string +GTEST_API_ void PrintU16StringTo(const ::std::u16string& s, ::std::ostream* os); +inline void PrintTo(const ::std::u16string& s, ::std::ostream* os) { + PrintU16StringTo(s, os); +} + +// Overloads for ::std::u32string +GTEST_API_ void PrintU32StringTo(const ::std::u32string& s, ::std::ostream* os); +inline void PrintTo(const ::std::u32string& s, ::std::ostream* os) { + PrintU32StringTo(s, os); +} + // Overloads for ::std::wstring. #if GTEST_HAS_STD_WSTRING GTEST_API_ void PrintWideStringTo(const ::std::wstring&s, ::std::ostream* os); @@ -616,6 +592,43 @@ void PrintTo(std::reference_wrapper ref, ::std::ostream* os) { UniversalPrinter::Print(ref.get(), os); } +inline const void* VoidifyPointer(const void* p) { return p; } +inline const void* VoidifyPointer(volatile const void* p) { + return const_cast(p); +} + +template +void PrintSmartPointer(const Ptr& ptr, std::ostream* os, char) { + if (ptr == nullptr) { + *os << "(nullptr)"; + } else { + // We can't print the value. Just print the pointer.. 
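// (VoidifyPointer, defined above, funnels the element pointer through
// const void*, so operator<< prints an address; a char* element would
// otherwise be streamed as a C string. The parentheses around the call
// name suppress argument-dependent lookup. Editorial gloss, not upstream
// text.)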
+ *os << "(" << (VoidifyPointer)(ptr.get()) << ")"; + } +} +template ::value && + !std::is_array::value>::type> +void PrintSmartPointer(const Ptr& ptr, std::ostream* os, int) { + if (ptr == nullptr) { + *os << "(nullptr)"; + } else { + *os << "(ptr = " << (VoidifyPointer)(ptr.get()) << ", value = "; + UniversalPrinter::Print(*ptr, os); + *os << ")"; + } +} + +template +void PrintTo(const std::unique_ptr& ptr, std::ostream* os) { + (PrintSmartPointer)(ptr, os, 0); +} + +template +void PrintTo(const std::shared_ptr& ptr, std::ostream* os) { + (PrintSmartPointer)(ptr, os, 0); +} + // Helper function for printing a tuple. T must be instantiated with // a tuple type. template @@ -681,14 +694,46 @@ class UniversalPrinter { GTEST_DISABLE_MSC_WARNINGS_POP_() }; -#if GTEST_HAS_ABSL +// Remove any const-qualifiers before passing a type to UniversalPrinter. +template +class UniversalPrinter : public UniversalPrinter {}; + +#if GTEST_INTERNAL_HAS_ANY -// Printer for absl::optional +// Printer for std::any / absl::any + +template <> +class UniversalPrinter { + public: + static void Print(const Any& value, ::std::ostream* os) { + if (value.has_value()) { + *os << "value of type " << GetTypeName(value); + } else { + *os << "no value"; + } + } + + private: + static std::string GetTypeName(const Any& value) { +#if GTEST_HAS_RTTI + return internal::GetTypeName(value.type()); +#else + static_cast(value); // possibly unused + return ""; +#endif // GTEST_HAS_RTTI + } +}; + +#endif // GTEST_INTERNAL_HAS_ANY + +#if GTEST_INTERNAL_HAS_OPTIONAL + +// Printer for std::optional / absl::optional template -class UniversalPrinter<::absl::optional> { +class UniversalPrinter> { public: - static void Print(const ::absl::optional& value, ::std::ostream* os) { + static void Print(const Optional& value, ::std::ostream* os) { *os << '('; if (!value) { *os << "nullopt"; @@ -699,14 +744,22 @@ class UniversalPrinter<::absl::optional> { } }; -// Printer for absl::variant +#endif // GTEST_INTERNAL_HAS_OPTIONAL + +#if GTEST_INTERNAL_HAS_VARIANT + +// Printer for std::variant / absl::variant template -class UniversalPrinter<::absl::variant> { +class UniversalPrinter> { public: - static void Print(const ::absl::variant& value, ::std::ostream* os) { + static void Print(const Variant& value, ::std::ostream* os) { *os << '('; - absl::visit(Visitor{os}, value); +#if GTEST_HAS_ABSL + absl::visit(Visitor{os, value.index()}, value); +#else + std::visit(Visitor{os, value.index()}, value); +#endif // GTEST_HAS_ABSL *os << ')'; } @@ -714,14 +767,16 @@ class UniversalPrinter<::absl::variant> { struct Visitor { template void operator()(const U& u) const { - *os << "'" << GetTypeName() << "' with value "; + *os << "'" << GetTypeName() << "(index = " << index + << ")' with value "; UniversalPrint(u, os); } ::std::ostream* os; + std::size_t index; }; }; -#endif // GTEST_HAS_ABSL +#endif // GTEST_INTERNAL_HAS_VARIANT // UniversalPrintArray(begin, len, os) prints an array of 'len' // elements, starting at address 'begin'. @@ -750,6 +805,20 @@ void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) { GTEST_API_ void UniversalPrintArray( const char* begin, size_t len, ::std::ostream* os); +#ifdef __cpp_char8_t +// This overload prints a (const) char8_t array compactly. +GTEST_API_ void UniversalPrintArray(const char8_t* begin, size_t len, + ::std::ostream* os); +#endif + +// This overload prints a (const) char16_t array compactly. 
+GTEST_API_ void UniversalPrintArray(const char16_t* begin, size_t len, + ::std::ostream* os); + +// This overload prints a (const) char32_t array compactly. +GTEST_API_ void UniversalPrintArray(const char32_t* begin, size_t len, + ::std::ostream* os); + // This overload prints a (const) wchar_t array compactly. GTEST_API_ void UniversalPrintArray( const wchar_t* begin, size_t len, ::std::ostream* os); @@ -822,12 +891,55 @@ class UniversalTersePrinter { } }; template <> -class UniversalTersePrinter { +class UniversalTersePrinter : public UniversalTersePrinter { +}; + +#ifdef __cpp_char8_t +template <> +class UniversalTersePrinter { public: - static void Print(char* str, ::std::ostream* os) { - UniversalTersePrinter::Print(str, os); + static void Print(const char8_t* str, ::std::ostream* os) { + if (str == nullptr) { + *os << "NULL"; + } else { + UniversalPrint(::std::u8string(str), os); + } } }; +template <> +class UniversalTersePrinter + : public UniversalTersePrinter {}; +#endif + +template <> +class UniversalTersePrinter { + public: + static void Print(const char16_t* str, ::std::ostream* os) { + if (str == nullptr) { + *os << "NULL"; + } else { + UniversalPrint(::std::u16string(str), os); + } + } +}; +template <> +class UniversalTersePrinter + : public UniversalTersePrinter {}; + +template <> +class UniversalTersePrinter { + public: + static void Print(const char32_t* str, ::std::ostream* os) { + if (str == nullptr) { + *os << "NULL"; + } else { + UniversalPrint(::std::u32string(str), os); + } + } +}; +template <> +class UniversalTersePrinter + : public UniversalTersePrinter {}; #if GTEST_HAS_STD_WSTRING template <> @@ -900,16 +1012,6 @@ Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) { } // namespace internal -#if GTEST_INTERNAL_HAS_STRING_VIEW -namespace internal2 { -template -void TypeWithoutFormatter::PrintValue( - const T& value, ::std::ostream* os) { - internal::PrintTo(internal::StringView(value), os); -} -} // namespace internal2 -#endif - template ::std::string PrintToString(const T& value) { ::std::stringstream ss; @@ -924,4 +1026,4 @@ ::std::string PrintToString(const T& value) { // declarations from this file. 
#include "gtest/internal/custom/gtest-printers.h" -#endif // GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ diff --git a/third_party/googletest/src/include/gtest/gtest-spi.h b/third_party/googletest/src/include/gtest/gtest-spi.h index aa38870e8e..eacef44669 100644 --- a/third_party/googletest/src/include/gtest/gtest-spi.h +++ b/third_party/googletest/src/include/gtest/gtest-spi.h @@ -33,8 +33,8 @@ // GOOGLETEST_CM0004 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_GTEST_SPI_H_ -#define GTEST_INCLUDE_GTEST_GTEST_SPI_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_ #include "gtest/gtest.h" @@ -235,4 +235,4 @@ GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 }\ } while (::testing::internal::AlwaysFalse()) -#endif // GTEST_INCLUDE_GTEST_GTEST_SPI_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_ diff --git a/third_party/googletest/src/include/gtest/gtest-test-part.h b/third_party/googletest/src/include/gtest/gtest-test-part.h index 05a7985358..203fdf98c6 100644 --- a/third_party/googletest/src/include/gtest/gtest-test-part.h +++ b/third_party/googletest/src/include/gtest/gtest-test-part.h @@ -29,8 +29,8 @@ // // GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ -#define GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ #include #include @@ -181,4 +181,4 @@ class GTEST_API_ HasNewFatalFailureHelper GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 -#endif // GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ diff --git a/third_party/googletest/src/include/gtest/gtest-typed-test.h b/third_party/googletest/src/include/gtest/gtest-typed-test.h index 3ffa50b739..9fdc6be10d 100644 --- a/third_party/googletest/src/include/gtest/gtest-typed-test.h +++ b/third_party/googletest/src/include/gtest/gtest-typed-test.h @@ -29,8 +29,8 @@ // GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ -#define GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ // This header implements typed tests and type-parameterized tests. @@ -175,8 +175,6 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes); // Implements typed tests. -#if GTEST_HAS_TYPED_TEST - // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. // // Expands to the name of the typedef for the type parameters of the @@ -230,12 +228,8 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes); TYPED_TEST_SUITE #endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ -#endif // GTEST_HAS_TYPED_TEST - // Implements type-parameterized tests. -#if GTEST_HAS_TYPED_TEST_P - // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. 
// // Expands to the namespace name that the type-parameterized tests for @@ -332,6 +326,4 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes); INSTANTIATE_TYPED_TEST_SUITE_P #endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ -#endif // GTEST_HAS_TYPED_TEST_P - -#endif // GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ diff --git a/third_party/googletest/src/include/gtest/gtest.h b/third_party/googletest/src/include/gtest/gtest.h index 39cff08d65..7a5d057c4a 100644 --- a/third_party/googletest/src/include/gtest/gtest.h +++ b/third_party/googletest/src/include/gtest/gtest.h @@ -49,8 +49,8 @@ // GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_GTEST_H_ -#define GTEST_INCLUDE_GTEST_GTEST_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_H_ #include #include @@ -101,6 +101,10 @@ GTEST_DECLARE_bool_(catch_exceptions); // to let Google Test decide. GTEST_DECLARE_string_(color); +// This flag controls whether the test runner should continue execution past +// first failure. +GTEST_DECLARE_bool_(fail_fast); + // This flag sets up the filter to select by name using a glob pattern // the tests to run. If the filter is not given all tests are executed. GTEST_DECLARE_string_(filter); @@ -117,6 +121,9 @@ GTEST_DECLARE_bool_(list_tests); // in addition to its normal textual output. GTEST_DECLARE_string_(output); +// This flags control whether Google Test prints only test failures. +GTEST_DECLARE_bool_(brief); + // This flags control whether Google Test prints the elapsed time for each // test. GTEST_DECLARE_bool_(print_time); @@ -411,10 +418,10 @@ class GTEST_API_ Test { // The d'tor is virtual as we intend to inherit from Test. virtual ~Test(); - // Sets up the stuff shared by all tests in this test case. + // Sets up the stuff shared by all tests in this test suite. // // Google Test will call Foo::SetUpTestSuite() before running the first - // test in test case Foo. Hence a sub-class can define its own + // test in test suite Foo. Hence a sub-class can define its own // SetUpTestSuite() method to shadow the one defined in the super // class. static void SetUpTestSuite() {} @@ -422,12 +429,13 @@ class GTEST_API_ Test { // Tears down the stuff shared by all tests in this test suite. // // Google Test will call Foo::TearDownTestSuite() after running the last - // test in test case Foo. Hence a sub-class can define its own + // test in test suite Foo. Hence a sub-class can define its own // TearDownTestSuite() method to shadow the one defined in the super // class. static void TearDownTestSuite() {} - // Legacy API is deprecated but still available + // Legacy API is deprecated but still available. Use SetUpTestSuite and + // TearDownTestSuite instead. #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ static void TearDownTestCase() {} static void SetUpTestCase() {} @@ -665,7 +673,7 @@ class GTEST_API_ TestResult { // Protects mutable state of the property vector and of owned // properties, whose values may be updated. - internal::Mutex test_properites_mutex_; + internal::Mutex test_properties_mutex_; // The vector of TestPartResults std::vector test_part_results_; @@ -795,6 +803,9 @@ class GTEST_API_ TestInfo { // deletes it. void Run(); + // Skip and records the test result for this object. + void Skip(); + static void ClearTestResult(TestInfo* test_info) { test_info->result_.Clear(); } @@ -943,6 +954,9 @@ class GTEST_API_ TestSuite { // Runs every test in this TestSuite. 
void Run(); + // Skips the execution of tests under this TestSuite + void Skip(); + // Runs SetUpTestSuite() for this TestSuite. This wrapper is needed // for catching exceptions thrown from SetUpTestSuite(). void RunSetUpTestSuite() { @@ -1535,14 +1549,6 @@ AssertionResult CmpHelperEQ(const char* lhs_expression, return CmpHelperEQFailure(lhs_expression, rhs_expression, lhs, rhs); } -// With this overloaded version, we allow anonymous enums to be used -// in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous enums -// can be implicitly cast to BiggestInt. -GTEST_API_ AssertionResult CmpHelperEQ(const char* lhs_expression, - const char* rhs_expression, - BiggestInt lhs, - BiggestInt rhs); - class EqHelper { public: // This templatized version is for the general case. @@ -1599,11 +1605,6 @@ AssertionResult CmpHelperOpFailure(const char* expr1, const char* expr2, // ASSERT_?? and EXPECT_??. It is here just to avoid copy-and-paste // of similar code. // -// For each templatized helper function, we also define an overloaded -// version for BiggestInt in order to reduce code bloat and allow -// anonymous enums to be used with {ASSERT|EXPECT}_?? when compiled -// with gcc 4. -// // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. #define GTEST_IMPL_CMP_HELPER_(op_name, op)\ @@ -1615,22 +1616,20 @@ AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \ } else {\ return CmpHelperOpFailure(expr1, expr2, val1, val2, #op);\ }\ -}\ -GTEST_API_ AssertionResult CmpHelper##op_name(\ - const char* expr1, const char* expr2, BiggestInt val1, BiggestInt val2) +} // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. // Implements the helper function for {ASSERT|EXPECT}_NE -GTEST_IMPL_CMP_HELPER_(NE, !=); +GTEST_IMPL_CMP_HELPER_(NE, !=) // Implements the helper function for {ASSERT|EXPECT}_LE -GTEST_IMPL_CMP_HELPER_(LE, <=); +GTEST_IMPL_CMP_HELPER_(LE, <=) // Implements the helper function for {ASSERT|EXPECT}_LT -GTEST_IMPL_CMP_HELPER_(LT, <); +GTEST_IMPL_CMP_HELPER_(LT, <) // Implements the helper function for {ASSERT|EXPECT}_GE -GTEST_IMPL_CMP_HELPER_(GE, >=); +GTEST_IMPL_CMP_HELPER_(GE, >=) // Implements the helper function for {ASSERT|EXPECT}_GT -GTEST_IMPL_CMP_HELPER_(GT, >); +GTEST_IMPL_CMP_HELPER_(GT, >) #undef GTEST_IMPL_CMP_HELPER_ @@ -1807,12 +1806,6 @@ class GTEST_API_ AssertHelper { GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelper); }; -enum GTestColor { COLOR_DEFAULT, COLOR_RED, COLOR_GREEN, COLOR_YELLOW }; - -GTEST_API_ GTEST_ATTRIBUTE_PRINTF_(2, 3) void ColoredPrintf(GTestColor color, - const char* fmt, - ...); - } // namespace internal // The pure interface class that all value-parameterized tests inherit from. @@ -1969,19 +1962,38 @@ class TestWithParam : public Test, public WithParamInterface { // Boolean assertions. Condition can be either a Boolean expression or an // AssertionResult. For more information on how to use AssertionResult with // these macros see comments on that class. 
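// A minimal usage sketch of an AssertionResult-returning predicate
// (illustrative only, not part of the upstream header):
//
//   testing::AssertionResult IsEven(int n) {
//     if ((n % 2) == 0) return testing::AssertionSuccess();
//     return testing::AssertionFailure() << n << " is odd";
//   }
//
//   EXPECT_TRUE(IsEven(4));  // Passes.
//   EXPECT_TRUE(IsEven(5));  // Fails; the message includes "5 is odd".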
-#define EXPECT_TRUE(condition) \ +#define GTEST_EXPECT_TRUE(condition) \ GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \ GTEST_NONFATAL_FAILURE_) -#define EXPECT_FALSE(condition) \ +#define GTEST_EXPECT_FALSE(condition) \ GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \ GTEST_NONFATAL_FAILURE_) -#define ASSERT_TRUE(condition) \ +#define GTEST_ASSERT_TRUE(condition) \ GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \ GTEST_FATAL_FAILURE_) -#define ASSERT_FALSE(condition) \ +#define GTEST_ASSERT_FALSE(condition) \ GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \ GTEST_FATAL_FAILURE_) +// Define these macros to 1 to omit the definition of the corresponding +// EXPECT or ASSERT, which clashes with some users' own code. + +#if !GTEST_DONT_DEFINE_EXPECT_TRUE +#define EXPECT_TRUE(condition) GTEST_EXPECT_TRUE(condition) +#endif + +#if !GTEST_DONT_DEFINE_EXPECT_FALSE +#define EXPECT_FALSE(condition) GTEST_EXPECT_FALSE(condition) +#endif + +#if !GTEST_DONT_DEFINE_ASSERT_TRUE +#define ASSERT_TRUE(condition) GTEST_ASSERT_TRUE(condition) +#endif + +#if !GTEST_DONT_DEFINE_ASSERT_FALSE +#define ASSERT_FALSE(condition) GTEST_ASSERT_FALSE(condition) +#endif + // Macros for testing equalities and inequalities. // // * {ASSERT|EXPECT}_EQ(v1, v2): Tests that v1 == v2 @@ -2480,4 +2492,4 @@ inline int RUN_ALL_TESTS() { GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 -#endif // GTEST_INCLUDE_GTEST_GTEST_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_H_ diff --git a/third_party/googletest/src/include/gtest/gtest_pred_impl.h b/third_party/googletest/src/include/gtest/gtest_pred_impl.h index d514255c73..5029a9bb02 100644 --- a/third_party/googletest/src/include/gtest/gtest_pred_impl.h +++ b/third_party/googletest/src/include/gtest/gtest_pred_impl.h @@ -33,8 +33,8 @@ // Implements a family of generic predicate assertion macros. // GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ -#define GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ #include "gtest/gtest.h" @@ -356,4 +356,4 @@ AssertionResult AssertPred5Helper(const char* pred_text, } // namespace testing -#endif // GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ diff --git a/third_party/googletest/src/include/gtest/gtest_prod.h b/third_party/googletest/src/include/gtest/gtest_prod.h index e651671ebd..38b9d85a51 100644 --- a/third_party/googletest/src/include/gtest/gtest_prod.h +++ b/third_party/googletest/src/include/gtest/gtest_prod.h @@ -31,8 +31,8 @@ // Google C++ Testing and Mocking Framework definitions useful in production code. 
// GOOGLETEST_CM0003 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_GTEST_PROD_H_ -#define GTEST_INCLUDE_GTEST_GTEST_PROD_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_ // When you need to test the private or protected members of a class, // use the FRIEND_TEST macro to declare your tests as friends of the @@ -58,4 +58,4 @@ #define FRIEND_TEST(test_case_name, test_name)\ friend class test_case_name##_##test_name##_Test -#endif // GTEST_INCLUDE_GTEST_GTEST_PROD_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_ diff --git a/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h b/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h index cd85d956d2..db02881c0c 100644 --- a/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h +++ b/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h @@ -31,7 +31,7 @@ // // ** Custom implementation starts here ** -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ -#endif // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ diff --git a/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h b/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h index eb4467abca..b9495d8378 100644 --- a/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h +++ b/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h @@ -36,7 +36,7 @@ // // ** Custom implementation starts here ** -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ -#endif // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ diff --git a/third_party/googletest/src/include/gtest/internal/custom/gtest.h b/third_party/googletest/src/include/gtest/internal/custom/gtest.h index 4c8e07be23..afaaf17ba2 100644 --- a/third_party/googletest/src/include/gtest/internal/custom/gtest.h +++ b/third_party/googletest/src/include/gtest/internal/custom/gtest.h @@ -31,7 +31,7 @@ // // ** Custom implementation starts here ** -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_ -#endif // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h b/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h index 68bd353061..490296dfad 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h @@ -33,8 +33,8 @@ // death tests. They are subject to change without notice. 
// GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ #include "gtest/gtest-matchers.h" #include "gtest/internal/gtest-internal.h" @@ -301,4 +301,4 @@ InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag(); } // namespace internal } // namespace testing -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-filepath.h b/third_party/googletest/src/include/gtest/internal/gtest-filepath.h index c11b101516..0c033abc34 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-filepath.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-filepath.h @@ -37,8 +37,8 @@ // GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ #include "gtest/internal/gtest-string.h" @@ -195,7 +195,7 @@ class GTEST_API_ FilePath { void Normalize(); - // Returns a pointer to the last occurence of a valid path separator in + // Returns a pointer to the last occurrence of a valid path separator in // the FilePath. On Windows, for example, both '/' and '\' are valid path // separators. Returns NULL if no path separator was found. const char* FindLastPathSeparator() const; @@ -208,4 +208,4 @@ class GTEST_API_ FilePath { GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-internal.h b/third_party/googletest/src/include/gtest/internal/gtest-internal.h index 6bad8780b5..f8cbdbd81d 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-internal.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-internal.h @@ -34,8 +34,8 @@ // GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ #include "gtest/internal/gtest-port.h" @@ -90,7 +90,9 @@ #define GTEST_STRINGIFY_HELPER_(name, ...) #name #define GTEST_STRINGIFY_(...) GTEST_STRINGIFY_HELPER_(__VA_ARGS__, ) -namespace proto2 { class Message; } +namespace proto2 { +class MessageLite; +} namespace testing { @@ -285,7 +287,7 @@ class FloatingPoint { // // See the following article for more details on ULP: // http://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/ - static const size_t kMaxUlps = 4; + static const uint32_t kMaxUlps = 4; // Constructs a FloatingPoint from a raw floating-point number. 
// @@ -518,6 +520,7 @@ struct SuiteApiResolver : T { static SetUpTearDownSuiteFuncType GetSetUpCaseOrSuite(const char* filename, int line_num) { +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ SetUpTearDownSuiteFuncType test_case_fp = GetNotDefaultOrNull(&T::SetUpTestCase, &Test::SetUpTestCase); SetUpTearDownSuiteFuncType test_suite_fp = @@ -529,10 +532,16 @@ struct SuiteApiResolver : T { << filename << ":" << line_num; return test_case_fp != nullptr ? test_case_fp : test_suite_fp; +#else + (void)(filename); + (void)(line_num); + return &T::SetUpTestSuite; +#endif } static SetUpTearDownSuiteFuncType GetTearDownCaseOrSuite(const char* filename, int line_num) { +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ SetUpTearDownSuiteFuncType test_case_fp = GetNotDefaultOrNull(&T::TearDownTestCase, &Test::TearDownTestCase); SetUpTearDownSuiteFuncType test_suite_fp = @@ -544,6 +553,11 @@ struct SuiteApiResolver : T { << filename << ":" << line_num; return test_case_fp != nullptr ? test_case_fp : test_suite_fp; +#else + (void)(filename); + (void)(line_num); + return &T::TearDownTestSuite; +#endif } }; @@ -552,11 +566,11 @@ struct SuiteApiResolver : T { // // Arguments: // -// test_suite_name: name of the test suite +// test_suite_name: name of the test suite // name: name of the test -// type_param the name of the test's type parameter, or NULL if +// type_param: the name of the test's type parameter, or NULL if // this is not a typed or a type-parameterized test. -// value_param text representation of the test's value parameter, +// value_param: text representation of the test's value parameter, // or NULL if this is not a type-parameterized test. // code_location: code location where the test is defined // fixture_class_id: ID of the test fixture class @@ -576,8 +590,6 @@ GTEST_API_ TestInfo* MakeAndRegisterTestInfo( // and returns false. None of pstr, *pstr, and prefix can be NULL. GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr); -#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P - GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ /* class A needs to have dll-interface to be used by clients of class B */) @@ -809,8 +821,6 @@ class TypeParameterizedTestSuite { } }; -#endif // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P - // Returns the current OS stack trace as an std::string. // // The maximum number of stack frames to be included is specified by @@ -878,11 +888,34 @@ class GTEST_API_ Random { #define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \ typename std::remove_const::type>::type -// IsAProtocolMessage::value is a compile-time bool constant that's -// true if and only if T is type proto2::Message or a subclass of it. +// HasDebugStringAndShortDebugString::value is a compile-time bool constant +// that's true if and only if T has methods DebugString() and ShortDebugString() +// that return std::string. 
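+// For example, a type shaped like a proto2 message (hypothetical type, for
+// illustration only):
+//
+//   struct MessageLike {
+//     std::string DebugString() const;
+//     std::string ShortDebugString() const;
+//   };
+//
+// satisfies the trait, so HasDebugStringAndShortDebugString<MessageLike>::value
+// is true, while HasDebugStringAndShortDebugString<int>::value is false.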
template -struct IsAProtocolMessage - : public std::is_convertible {}; +class HasDebugStringAndShortDebugString { + private: + template + static auto CheckDebugString(C*) -> typename std::is_same< + std::string, decltype(std::declval().DebugString())>::type; + template + static std::false_type CheckDebugString(...); + + template + static auto CheckShortDebugString(C*) -> typename std::is_same< + std::string, decltype(std::declval().ShortDebugString())>::type; + template + static std::false_type CheckShortDebugString(...); + + using HasDebugStringType = decltype(CheckDebugString(nullptr)); + using HasShortDebugStringType = decltype(CheckShortDebugString(nullptr)); + + public: + static constexpr bool value = + HasDebugStringType::value && HasShortDebugStringType::value; +}; + +template +constexpr bool HasDebugStringAndShortDebugString::value; // When the compiler sees expression IsContainerTest(0), if C is an // STL-style container class, the first overload of IsContainerTest @@ -1118,8 +1151,6 @@ class NativeArray { const Element* array_; size_t size_; void (NativeArray::*clone_)(const Element*, size_t); - - GTEST_DISALLOW_ASSIGN_(NativeArray); }; // Backport of std::index_sequence. @@ -1143,12 +1174,18 @@ struct DoubleSequence, sizeofT> { // Backport of std::make_index_sequence. // It uses O(ln(N)) instantiation depth. template -struct MakeIndexSequence - : DoubleSequence::type, +struct MakeIndexSequenceImpl + : DoubleSequence::type, N / 2>::type {}; template <> -struct MakeIndexSequence<0> : IndexSequence<> {}; +struct MakeIndexSequenceImpl<0> : IndexSequence<> {}; + +template +using MakeIndexSequence = typename MakeIndexSequenceImpl::type; + +template +using IndexSequenceFor = typename MakeIndexSequence::type; template struct Ignore { @@ -1174,6 +1211,8 @@ struct ElemFromList { static_cast(nullptr)...)); }; +struct FlatTupleConstructTag {}; + template class FlatTuple; @@ -1184,7 +1223,9 @@ template struct FlatTupleElemBase, I> { using value_type = typename ElemFromList::type; FlatTupleElemBase() = default; - explicit FlatTupleElemBase(value_type t) : value(std::move(t)) {} + template + explicit FlatTupleElemBase(FlatTupleConstructTag, Arg&& t) + : value(std::forward(t)) {} value_type value; }; @@ -1196,8 +1237,30 @@ struct FlatTupleBase, IndexSequence> : FlatTupleElemBase, Idx>... { using Indices = IndexSequence; FlatTupleBase() = default; - explicit FlatTupleBase(T... t) - : FlatTupleElemBase, Idx>(std::move(t))... {} + template + explicit FlatTupleBase(FlatTupleConstructTag, Args&&... args) + : FlatTupleElemBase, Idx>(FlatTupleConstructTag{}, + std::forward(args))... {} + + template + const typename ElemFromList::type& Get() const { + return FlatTupleElemBase, I>::value; + } + + template + typename ElemFromList::type& Get() { + return FlatTupleElemBase, I>::value; + } + + template + auto Apply(F&& f) -> decltype(std::forward(f)(this->Get()...)) { + return std::forward(f)(Get()...); + } + + template + auto Apply(F&& f) const -> decltype(std::forward(f)(this->Get()...)) { + return std::forward(f)(Get()...); + } }; // Analog to std::tuple but with different tradeoffs. @@ -1218,17 +1281,12 @@ class FlatTuple public: FlatTuple() = default; - explicit FlatTuple(T... t) : FlatTuple::FlatTupleBase(std::move(t)...) {} + template + explicit FlatTuple(FlatTupleConstructTag tag, Args&&... args) + : FlatTuple::FlatTupleBase(tag, std::forward(args)...) 
{} - template - const typename ElemFromList::type& Get() const { - return static_cast*>(this)->value; - } - - template - typename ElemFromList::type& Get() { - return static_cast*>(this)->value; - } + using FlatTuple::FlatTupleBase::Apply; + using FlatTuple::FlatTupleBase::Get; }; // Utility functions to be called with static_assert to induce deprecation @@ -1261,6 +1319,22 @@ constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; } } // namespace internal } // namespace testing +namespace std { +// Some standard library implementations use `struct tuple_size` and some use +// `class tuple_size`. Clang warns about the mismatch. +// https://reviews.llvm.org/D55466 +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wmismatched-tags" +#endif +template +struct tuple_size> + : std::integral_constant {}; +#ifdef __clang__ +#pragma clang diagnostic pop +#endif +} // namespace std + #define GTEST_MESSAGE_AT_(file, line, message, result_type) \ ::testing::internal::AssertHelper(result_type, file, line, message) \ = ::testing::Message() @@ -1283,44 +1357,98 @@ constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; } // Suppress MSVC warning 4072 (unreachable code) for the code following // statement if it returns or throws (or doesn't return or throw in some // situations). +// NOTE: The "else" is important to keep this expansion to prevent a top-level +// "else" from attaching to our "if". #define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \ - if (::testing::internal::AlwaysTrue()) { statement; } + if (::testing::internal::AlwaysTrue()) { \ + statement; \ + } else /* NOLINT */ \ + static_assert(true, "") // User must have a semicolon after expansion. -#define GTEST_TEST_THROW_(statement, expected_exception, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::ConstCharPtr gtest_msg = "") { \ - bool gtest_caught_expected = false; \ - try { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } \ - catch (expected_exception const&) { \ - gtest_caught_expected = true; \ - } \ - catch (...) 
{ \ - gtest_msg.value = \ - "Expected: " #statement " throws an exception of type " \ - #expected_exception ".\n Actual: it throws a different type."; \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ - } \ - if (!gtest_caught_expected) { \ - gtest_msg.value = \ - "Expected: " #statement " throws an exception of type " \ - #expected_exception ".\n Actual: it throws nothing."; \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__): \ - fail(gtest_msg.value) +#if GTEST_HAS_EXCEPTIONS + +namespace testing { +namespace internal { + +class NeverThrown { + public: + const char* what() const noexcept { + return "this exception should never be thrown"; + } +}; + +} // namespace internal +} // namespace testing + +#if GTEST_HAS_RTTI + +#define GTEST_EXCEPTION_TYPE_(e) ::testing::internal::GetTypeName(typeid(e)) + +#else // GTEST_HAS_RTTI + +#define GTEST_EXCEPTION_TYPE_(e) \ + std::string { "an std::exception-derived error" } + +#endif // GTEST_HAS_RTTI + +#define GTEST_TEST_THROW_CATCH_STD_EXCEPTION_(statement, expected_exception) \ + catch (typename std::conditional< \ + std::is_same::type>::type, \ + std::exception>::value, \ + const ::testing::internal::NeverThrown&, const std::exception&>::type \ + e) { \ + gtest_msg.value = "Expected: " #statement \ + " throws an exception of type " #expected_exception \ + ".\n Actual: it throws "; \ + gtest_msg.value += GTEST_EXCEPTION_TYPE_(e); \ + gtest_msg.value += " with description \""; \ + gtest_msg.value += e.what(); \ + gtest_msg.value += "\"."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ + } + +#else // GTEST_HAS_EXCEPTIONS + +#define GTEST_TEST_THROW_CATCH_STD_EXCEPTION_(statement, expected_exception) + +#endif // GTEST_HAS_EXCEPTIONS + +#define GTEST_TEST_THROW_(statement, expected_exception, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::TrueWithString gtest_msg{}) { \ + bool gtest_caught_expected = false; \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } catch (expected_exception const&) { \ + gtest_caught_expected = true; \ + } \ + GTEST_TEST_THROW_CATCH_STD_EXCEPTION_(statement, expected_exception) \ + catch (...) 
{ \ + gtest_msg.value = "Expected: " #statement \ + " throws an exception of type " #expected_exception \ + ".\n Actual: it throws a different type."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ + } \ + if (!gtest_caught_expected) { \ + gtest_msg.value = "Expected: " #statement \ + " throws an exception of type " #expected_exception \ + ".\n Actual: it throws nothing."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ + } \ + } else /*NOLINT*/ \ + GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__) \ + : fail(gtest_msg.value.c_str()) #if GTEST_HAS_EXCEPTIONS -#define GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \ - catch (std::exception const& e) { \ - gtest_msg.value = ( \ - "it throws std::exception-derived exception with description: \"" \ - ); \ - gtest_msg.value += e.what(); \ - gtest_msg.value += "\"."; \ +#define GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \ + catch (std::exception const& e) { \ + gtest_msg.value = "it throws "; \ + gtest_msg.value += GTEST_EXCEPTION_TYPE_(e); \ + gtest_msg.value += " with description \""; \ + gtest_msg.value += e.what(); \ + gtest_msg.value += "\"."; \ goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \ } @@ -1367,7 +1495,7 @@ constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; } // Implements Boolean test assertions such as EXPECT_TRUE. expression can be // either a boolean expression or an AssertionResult. text is a textual -// represenation of expression as it was passed into the EXPECT_TRUE. +// representation of expression as it was passed into the EXPECT_TRUE. #define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ if (const ::testing::AssertionResult gtest_ar_ = \ @@ -1404,7 +1532,7 @@ constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; } class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ : public parent_class { \ public: \ - GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() {} \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() = default; \ ~GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() override = default; \ GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \ test_name)); \ @@ -1429,4 +1557,4 @@ constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; } test_suite_name, test_name)>); \ void GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::TestBody() -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-param-util.h b/third_party/googletest/src/include/gtest/internal/gtest-param-util.h index 7f7a13bf84..c2ef6e3124 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-param-util.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-param-util.h @@ -32,8 +32,8 @@ // GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ #include @@ -459,7 +459,7 @@ class ParameterizedTestSuiteInfoBase { // Base part of test suite name for display purposes. virtual const std::string& GetTestSuiteName() const = 0; - // Test case id to verify identity. + // Test suite id to verify identity. 
virtual TypeId GetTestSuiteTypeId() const = 0; // UnitTest class invokes this method to register tests in this // test suite right before running them in RUN_ALL_TESTS macro. @@ -478,7 +478,7 @@ class ParameterizedTestSuiteInfoBase { // // Report a the name of a test_suit as safe to ignore // as the side effect of construction of this type. -struct MarkAsIgnored { +struct GTEST_API_ MarkAsIgnored { explicit MarkAsIgnored(const char* test_suite); }; @@ -507,11 +507,11 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { CodeLocation code_location) : test_suite_name_(name), code_location_(code_location) {} - // Test case base name for display purposes. + // Test suite base name for display purposes. const std::string& GetTestSuiteName() const override { return test_suite_name_; } - // Test case id to verify identity. + // Test suite id to verify identity. TypeId GetTestSuiteTypeId() const override { return GetTypeId(); } // TEST_P macro uses AddTestPattern() to record information // about a single test in a LocalTestInfo structure. @@ -520,9 +520,10 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { // parameter index. For the test SequenceA/FooTest.DoBar/1 FooTest is // test suite base name and DoBar is test base name. void AddTestPattern(const char* test_suite_name, const char* test_base_name, - TestMetaFactoryBase* meta_factory) { - tests_.push_back(std::shared_ptr( - new TestInfo(test_suite_name, test_base_name, meta_factory))); + TestMetaFactoryBase* meta_factory, + CodeLocation code_location) { + tests_.push_back(std::shared_ptr(new TestInfo( + test_suite_name, test_base_name, meta_factory, code_location))); } // INSTANTIATE_TEST_SUITE_P macro uses AddGenerator() to record information // about a generator. @@ -589,7 +590,7 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { MakeAndRegisterTestInfo( test_suite_name.c_str(), test_name_stream.GetString().c_str(), nullptr, // No type parameter. - PrintToString(*param_it).c_str(), code_location_, + PrintToString(*param_it).c_str(), test_info->code_location, GetTestSuiteTypeId(), SuiteApiResolver::GetSetUpCaseOrSuite(file, line), SuiteApiResolver::GetTearDownCaseOrSuite(file, line), @@ -610,14 +611,17 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { // with TEST_P macro. struct TestInfo { TestInfo(const char* a_test_suite_base_name, const char* a_test_base_name, - TestMetaFactoryBase* a_test_meta_factory) + TestMetaFactoryBase* a_test_meta_factory, + CodeLocation a_code_location) : test_suite_base_name(a_test_suite_base_name), test_base_name(a_test_base_name), - test_meta_factory(a_test_meta_factory) {} + test_meta_factory(a_test_meta_factory), + code_location(a_code_location) {} const std::string test_suite_base_name; const std::string test_base_name; const std::unique_ptr > test_meta_factory; + const CodeLocation code_location; }; using TestInfoContainer = ::std::vector >; // Records data received from INSTANTIATE_TEST_SUITE_P macros: @@ -650,7 +654,7 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { // Check for invalid characters for (std::string::size_type index = 0; index < name.size(); ++index) { - if (!isalnum(name[index]) && name[index] != '_') + if (!IsAlNum(name[index]) && name[index] != '_') return false; } @@ -779,10 +783,15 @@ internal::ParamGenerator ValuesIn( namespace internal { // Used in the Values() function to provide polymorphic capabilities. 
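// For instance, Values(1, 2, 3) materializes a ValueArray<int, int, int>;
// its conversion operator can then produce a ParamGenerator<T> for any T the
// stored values convert to (e.g. ParamGenerator<double> for a test suite
// parameterized over double).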
+#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4100) +#endif + template class ValueArray { public: - ValueArray(Ts... v) : v_{std::move(v)...} {} + explicit ValueArray(Ts... v) : v_(FlatTupleConstructTag{}, std::move(v)...) {} template operator ParamGenerator() const { // NOLINT @@ -798,6 +807,10 @@ class ValueArray { FlatTuple v_; }; +#ifdef _MSC_VER +#pragma warning(pop) +#endif + template class CartesianProductGenerator : public ParamGeneratorInterface<::std::tuple> { @@ -931,4 +944,4 @@ class CartesianProductHolder { } // namespace internal } // namespace testing -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h b/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h index d3239b25ba..dd845915e3 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h @@ -32,8 +32,8 @@ // This header file defines the GTEST_OS_* macro. // It is separate from gtest-port.h so that custom/gtest-port.h can include it. -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ // Determines the platform on which Google Test is compiled. #ifdef __CYGWIN__ @@ -68,6 +68,7 @@ # define GTEST_OS_OS2 1 #elif defined __APPLE__ # define GTEST_OS_MAC 1 +# include # if TARGET_OS_IPHONE # define GTEST_OS_IOS 1 # endif @@ -106,6 +107,8 @@ #define GTEST_OS_ESP8266 1 #elif defined ESP32 #define GTEST_OS_ESP32 1 +#elif defined(__XTENSA__) +#define GTEST_OS_XTENSA 1 #endif // __CYGWIN__ -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-port.h b/third_party/googletest/src/include/gtest/internal/gtest-port.h index 60ff47164f..0953a781c0 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-port.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-port.h @@ -40,8 +40,8 @@ // GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ // Environment-describing macros // ----------------------------- @@ -199,9 +199,18 @@ // suppressed (constant conditional). // GTEST_INTENTIONAL_CONST_COND_POP_ - finish code section where MSVC C4127 // is suppressed. +// GTEST_INTERNAL_HAS_ANY - for enabling UniversalPrinter or +// UniversalPrinter specializations. +// GTEST_INTERNAL_HAS_OPTIONAL - for enabling UniversalPrinter +// or +// UniversalPrinter +// specializations. // GTEST_INTERNAL_HAS_STRING_VIEW - for enabling Matcher or // Matcher // specializations. +// GTEST_INTERNAL_HAS_VARIANT - for enabling UniversalPrinter or +// UniversalPrinter +// specializations. 
// // Synchronization: // Mutex, MutexLock, ThreadLocal, GetThreadCount() @@ -252,6 +261,8 @@ #include #include #include + +#include #include #include #include @@ -267,6 +278,7 @@ #endif #include // NOLINT +#include #include #include // NOLINT #include @@ -347,6 +359,10 @@ typedef struct _CRITICAL_SECTION GTEST_CRITICAL_SECTION; // WindowsTypesTest.CRITICAL_SECTIONIs_RTL_CRITICAL_SECTION. typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; #endif +#elif GTEST_OS_XTENSA +#include +// Xtensa toolchains define strcasecmp in the string.h header instead of +// strings.h. string.h is already included. #else // This assumes that non-Windows OSes provide unistd.h. For OSes where this // is not the case, we need to include headers that provide the functions @@ -367,7 +383,7 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // On Android, is only available starting with Gingerbread. # define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9) # else -# define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS) +#define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS && !GTEST_OS_XTENSA) # endif #endif @@ -452,7 +468,7 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // no support for it at least as recent as Froyo (2.2). #define GTEST_HAS_STD_WSTRING \ (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \ - GTEST_OS_HAIKU || GTEST_OS_ESP32 || GTEST_OS_ESP8266)) + GTEST_OS_HAIKU || GTEST_OS_ESP32 || GTEST_OS_ESP8266 || GTEST_OS_XTENSA)) #endif // GTEST_HAS_STD_WSTRING @@ -577,7 +593,7 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // By default, we assume that stream redirection is supported on all // platforms except known mobile ones. #if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \ - GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 + GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_XTENSA # define GTEST_HAS_STREAM_REDIRECTION 0 # else # define GTEST_HAS_STREAM_REDIRECTION 1 @@ -679,8 +695,8 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // A macro to disallow copy constructor and operator= // This should be used in the private: declarations for a class. #define GTEST_DISALLOW_COPY_AND_ASSIGN_(type) \ - type(type const &) = delete; \ - GTEST_DISALLOW_ASSIGN_(type) + type(type const&) = delete; \ + type& operator=(type const&) = delete // A macro to disallow move operator= // This should be used in the private: declarations for a class. @@ -690,8 +706,8 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // A macro to disallow move constructor and operator= // This should be used in the private: declarations for a class. #define GTEST_DISALLOW_MOVE_AND_ASSIGN_(type) \ - type(type &&) noexcept = delete; \ - GTEST_DISALLOW_MOVE_ASSIGN_(type) + type(type&&) noexcept = delete; \ + type& operator=(type&&) noexcept = delete // Tell the compiler to warn about unused return values for functions declared // with this macro. 
The macro should be used on function declarations @@ -918,8 +934,6 @@ class GTEST_API_ RE { const char* full_pattern_; // For FullMatch(); # endif - - GTEST_DISALLOW_ASSIGN_(RE); }; #endif // GTEST_USES_PCRE @@ -1926,6 +1940,19 @@ inline bool IsUpper(char ch) { inline bool IsXDigit(char ch) { return isxdigit(static_cast(ch)) != 0; } +#ifdef __cpp_char8_t +inline bool IsXDigit(char8_t ch) { + return isxdigit(static_cast(ch)) != 0; +} +#endif +inline bool IsXDigit(char16_t ch) { + const unsigned char low_byte = static_cast(ch); + return ch == low_byte && isxdigit(low_byte) != 0; +} +inline bool IsXDigit(char32_t ch) { + const unsigned char low_byte = static_cast(ch); + return ch == low_byte && isxdigit(low_byte) != 0; +} inline bool IsXDigit(wchar_t ch) { const unsigned char low_byte = static_cast(ch); return ch == low_byte && isxdigit(low_byte) != 0; @@ -1960,16 +1987,16 @@ namespace posix { typedef struct _stat StatStruct; # ifdef __BORLANDC__ -inline int IsATTY(int fd) { return isatty(fd); } +inline int DoIsATTY(int fd) { return isatty(fd); } inline int StrCaseCmp(const char* s1, const char* s2) { return stricmp(s1, s2); } inline char* StrDup(const char* src) { return strdup(src); } # else // !__BORLANDC__ # if GTEST_OS_WINDOWS_MOBILE -inline int IsATTY(int /* fd */) { return 0; } +inline int DoIsATTY(int /* fd */) { return 0; } # else -inline int IsATTY(int fd) { return _isatty(fd); } +inline int DoIsATTY(int fd) { return _isatty(fd); } # endif // GTEST_OS_WINDOWS_MOBILE inline int StrCaseCmp(const char* s1, const char* s2) { return _stricmp(s1, s2); @@ -1994,7 +2021,7 @@ inline bool IsDir(const StatStruct& st) { typedef struct stat StatStruct; inline int FileNo(FILE* file) { return fileno(file); } -inline int IsATTY(int fd) { return isatty(fd); } +inline int DoIsATTY(int fd) { return isatty(fd); } inline int Stat(const char* path, StatStruct* buf) { // stat function not implemented on ESP8266 return 0; @@ -2011,7 +2038,7 @@ inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); } typedef struct stat StatStruct; inline int FileNo(FILE* file) { return fileno(file); } -inline int IsATTY(int fd) { return isatty(fd); } +inline int DoIsATTY(int fd) { return isatty(fd); } inline int Stat(const char* path, StatStruct* buf) { return stat(path, buf); } inline int StrCaseCmp(const char* s1, const char* s2) { return strcasecmp(s1, s2); @@ -2022,6 +2049,17 @@ inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); } #endif // GTEST_OS_WINDOWS +inline int IsATTY(int fd) { + // DoIsATTY might change errno (for example ENOTTY in case you redirect stdout + // to a file on Linux), which is unexpected, so save the previous value, and + // restore it after the call. + int savedErrno = errno; + int isAttyValue = DoIsATTY(fd); + errno = savedErrno; + + return isAttyValue; +} + // Functions deprecated by MSVC 8.0. GTEST_DISABLE_MSC_DEPRECATED_PUSH_() @@ -2030,11 +2068,20 @@ GTEST_DISABLE_MSC_DEPRECATED_PUSH_() // StrError() aren't needed on Windows CE at this time and thus not // defined there. 
-#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT +#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \ + !GTEST_OS_WINDOWS_RT && !GTEST_OS_ESP8266 && !GTEST_OS_XTENSA inline int ChDir(const char* dir) { return chdir(dir); } #endif inline FILE* FOpen(const char* path, const char* mode) { +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW + struct wchar_codecvt : public std::codecvt {}; + std::wstring_convert converter; + std::wstring wide_path = converter.from_bytes(path); + std::wstring wide_mode = converter.from_bytes(mode); + return _wfopen(wide_path.c_str(), wide_mode.c_str()); +#else // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW return fopen(path, mode); +#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW } #if !GTEST_OS_WINDOWS_MOBILE inline FILE *FReopen(const char* path, const char* mode, FILE* stream) { @@ -2055,7 +2102,7 @@ inline const char* StrError(int errnum) { return strerror(errnum); } #endif inline const char* GetEnv(const char* name) { #if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \ - GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 + GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_XTENSA // We are on an embedded platform, which has no environment variables. static_cast(name); // To prevent 'unused argument' warning. return nullptr; @@ -2191,7 +2238,8 @@ using TimeInMillis = int64_t; // Represents time in milliseconds. // Parses 'str' for a 32-bit signed integer. If successful, writes the result // to *value and returns true; otherwise leaves *value unchanged and returns // false. -bool ParseInt32(const Message& src_text, const char* str, int32_t* value); +GTEST_API_ bool ParseInt32(const Message& src_text, const char* str, + int32_t* value); // Parses a bool/int32_t/string from the environment variable // corresponding to the given Google Test flag. @@ -2223,6 +2271,64 @@ const char* StringFromGTestEnv(const char* flag, const char* default_val); #endif // !defined(GTEST_INTERNAL_DEPRECATED) +#if GTEST_HAS_ABSL +// Always use absl::any for UniversalPrinter<> specializations if googletest +// is built with absl support. +#define GTEST_INTERNAL_HAS_ANY 1 +#include "absl/types/any.h" +namespace testing { +namespace internal { +using Any = ::absl::any; +} // namespace internal +} // namespace testing +#else +#ifdef __has_include +#if __has_include() && __cplusplus >= 201703L +// Otherwise for C++17 and higher use std::any for UniversalPrinter<> +// specializations. +#define GTEST_INTERNAL_HAS_ANY 1 +#include +namespace testing { +namespace internal { +using Any = ::std::any; +} // namespace internal +} // namespace testing +// The case where absl is configured NOT to alias std::any is not +// supported. +#endif // __has_include() && __cplusplus >= 201703L +#endif // __has_include +#endif // GTEST_HAS_ABSL + +#if GTEST_HAS_ABSL +// Always use absl::optional for UniversalPrinter<> specializations if +// googletest is built with absl support. +#define GTEST_INTERNAL_HAS_OPTIONAL 1 +#include "absl/types/optional.h" +namespace testing { +namespace internal { +template +using Optional = ::absl::optional; +} // namespace internal +} // namespace testing +#else +#ifdef __has_include +#if __has_include() && __cplusplus >= 201703L +// Otherwise for C++17 and higher use std::optional for UniversalPrinter<> +// specializations. 
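+// (The same probe guards each of these aliases in this header -- Any above,
+// Optional here, StringView and Variant below: the absl type wins when
+// GTEST_HAS_ABSL is set, otherwise the std type is used when __has_include
+// succeeds and __cplusplus >= 201703L.)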
+#define GTEST_INTERNAL_HAS_OPTIONAL 1 +#include +namespace testing { +namespace internal { +template +using Optional = ::std::optional; +} // namespace internal +} // namespace testing +// The case where absl is configured NOT to alias std::optional is not +// supported. +#endif // __has_include() && __cplusplus >= 201703L +#endif // __has_include +#endif // GTEST_HAS_ABSL + #if GTEST_HAS_ABSL // Always use absl::string_view for Matcher<> specializations if googletest // is built with absl support. @@ -2251,4 +2357,33 @@ using StringView = ::std::string_view; # endif // __has_include #endif // GTEST_HAS_ABSL -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ +#if GTEST_HAS_ABSL +// Always use absl::variant for UniversalPrinter<> specializations if googletest +// is built with absl support. +#define GTEST_INTERNAL_HAS_VARIANT 1 +#include "absl/types/variant.h" +namespace testing { +namespace internal { +template +using Variant = ::absl::variant; +} // namespace internal +} // namespace testing +#else +#ifdef __has_include +#if __has_include() && __cplusplus >= 201703L +// Otherwise for C++17 and higher use std::variant for UniversalPrinter<> +// specializations. +#define GTEST_INTERNAL_HAS_VARIANT 1 +#include +namespace testing { +namespace internal { +template +using Variant = ::std::variant; +} // namespace internal +} // namespace testing +// The case where absl is configured NOT to alias std::variant is not supported. +#endif // __has_include() && __cplusplus >= 201703L +#endif // __has_include +#endif // GTEST_HAS_ABSL + +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-string.h b/third_party/googletest/src/include/gtest/internal/gtest-string.h index 0b2a91a5dc..10f774f966 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-string.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-string.h @@ -38,8 +38,8 @@ // GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ #ifdef __BORLANDC__ // string.h is not guaranteed to provide strcpy on C++ Builder. @@ -149,6 +149,9 @@ class GTEST_API_ String { // Formats an int value as "%02d". static std::string FormatIntWidth2(int value); // "%02d" for width == 2 + // Formats an int value to given width with leading zeros. + static std::string FormatIntWidthN(int value, int width); + // Formats an int value as "%X". 
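// (Illustrative expectations for these helpers: FormatIntWidthN(7, 3) would
// yield "007" and FormatHexInt(255) would yield "FF".)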
   static std::string FormatHexInt(int value);
@@ -169,4 +172,4 @@ GTEST_API_ std::string StringStreamToString(::std::stringstream* stream);
 }  // namespace internal
 }  // namespace testing
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
diff --git a/third_party/googletest/src/include/gtest/internal/gtest-type-util.h b/third_party/googletest/src/include/gtest/internal/gtest-type-util.h
index 082fdad12c..b87a2e2cac 100644
--- a/third_party/googletest/src/include/gtest/internal/gtest-type-util.h
+++ b/third_party/googletest/src/include/gtest/internal/gtest-type-util.h
@@ -32,8 +32,8 @@
 
 // GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
 
 #include "gtest/internal/gtest-port.h"
 
@@ -64,38 +64,40 @@ inline std::string CanonicalizeForStdLibVersioning(std::string s) {
   return s;
 }
 
-// GetTypeName<T>() returns a human-readable name of type T.
-// NB: This function is also used in Google Mock, so don't move it inside of
-// the typed-test-only section below.
-template <typename T>
-std::string GetTypeName() {
-# if GTEST_HAS_RTTI
-
-  const char* const name = typeid(T).name();
-# if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC)
+#if GTEST_HAS_RTTI
+// GetTypeName(const std::type_info&) returns a human-readable name of type T.
+inline std::string GetTypeName(const std::type_info& type) {
+  const char* const name = type.name();
+#if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC)
   int status = 0;
   // gcc's implementation of typeid(T).name() mangles the type name,
   // so we have to demangle it.
-# if GTEST_HAS_CXXABI_H_
+#if GTEST_HAS_CXXABI_H_
   using abi::__cxa_demangle;
-# endif  // GTEST_HAS_CXXABI_H_
+#endif  // GTEST_HAS_CXXABI_H_
   char* const readable_name = __cxa_demangle(name, nullptr, nullptr, &status);
   const std::string name_str(status == 0 ? readable_name : name);
   free(readable_name);
   return CanonicalizeForStdLibVersioning(name_str);
-# else
+#else
   return name;
-# endif  // GTEST_HAS_CXXABI_H_ || __HP_aCC
-
-# else
+#endif  // GTEST_HAS_CXXABI_H_ || __HP_aCC
+}
+#endif  // GTEST_HAS_RTTI
 
+// GetTypeName<T>() returns a human-readable name of type T if and only if
+// RTTI is enabled, otherwise it returns a dummy type name.
+// NB: This function is also used in Google Mock, so don't move it inside of
+// the typed-test-only section below.
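+//
+// Illustrative usage (a sketch, not upstream text; the exact spelling of the
+// result is compiler-specific):
+//   std::string name = testing::internal::GetTypeName<std::vector<int>>();
+//   // With RTTI and demangling: something like "std::vector<int, ...>".
+//   // Without RTTI it falls back to an empty dummy name.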
+template <typename T>
+std::string GetTypeName() {
+#if GTEST_HAS_RTTI
+  return GetTypeName(typeid(T));
+#else
   return "";
-
-# endif  // GTEST_HAS_RTTI
+#endif  // GTEST_HAS_RTTI
 }
 
-#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
-
 // A unique type indicating an empty node
 struct None {};
 
@@ -171,8 +173,6 @@ struct GenerateTypeList {
   using type = typename proxy::type;
 };
 
-#endif  // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
-
 }  // namespace internal
 
 template <typename... Ts>
@@ -180,4 +180,4 @@ using Types = internal::ProxyTypeList<Ts...>;
 
 }  // namespace testing
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
diff --git a/third_party/googletest/src/src/gtest-death-test.cc b/third_party/googletest/src/src/gtest-death-test.cc
index 5d1031bea2..bf4f6331da 100644
--- a/third_party/googletest/src/src/gtest-death-test.cc
+++ b/third_party/googletest/src/src/gtest-death-test.cc
@@ -32,6 +32,7 @@
 
 #include "gtest/gtest-death-test.h"
 
+#include <functional>
 #include <utility>
 
 #include "gtest/internal/gtest-port.h"
@@ -247,7 +248,7 @@ static std::string DeathTestThreadWarning(size_t thread_count) {
     msg << "detected " << thread_count << " threads.";
   }
   msg << " See "
-         "https://github.com/google/googletest/blob/master/googletest/docs/"
+         "https://github.com/google/googletest/blob/master/docs/"
          "advanced.md#death-tests-and-threads"
       << " for more explanation and suggested solutions, especially if"
      << " this is the last message you see before your test times out.";
@@ -864,7 +865,7 @@ class Arguments {
   }
 
   int size() {
-    return args_.size() - 1;
+    return static_cast<int>(args_.size()) - 1;
   }
 
  private:
@@ -890,18 +891,17 @@ int FuchsiaDeathTest::Wait() {
 
   // Register to wait for the child process to terminate.
   status_zx = child_process_.wait_async(
-      port, kProcessKey, ZX_PROCESS_TERMINATED, ZX_WAIT_ASYNC_ONCE);
+      port, kProcessKey, ZX_PROCESS_TERMINATED, 0);
   GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
 
   // Register to wait for the socket to be readable or closed.
   status_zx = stderr_socket_.wait_async(
-      port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED,
-      ZX_WAIT_ASYNC_ONCE);
+      port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED, 0);
   GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
 
   // Register to wait for an exception.
   status_zx = exception_channel_.wait_async(
-      port, kExceptionKey, ZX_CHANNEL_READABLE, ZX_WAIT_ASYNC_ONCE);
+      port, kExceptionKey, ZX_CHANNEL_READABLE, 0);
   GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
 
   bool process_terminated = false;
@@ -941,8 +941,7 @@ int FuchsiaDeathTest::Wait() {
       } else {
         GTEST_DEATH_TEST_CHECK_(status_zx == ZX_ERR_SHOULD_WAIT);
         status_zx = stderr_socket_.wait_async(
-            port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED,
-            ZX_WAIT_ASYNC_ONCE);
+            port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED, 0);
         GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
       }
     } else {
@@ -955,12 +954,12 @@ int FuchsiaDeathTest::Wait() {
   ReadAndInterpretStatusByte();
 
   zx_info_process_t buffer;
-  status_zx = child_process_.get_info(
-      ZX_INFO_PROCESS, &buffer, sizeof(buffer), nullptr, nullptr);
+  status_zx = child_process_.get_info(ZX_INFO_PROCESS, &buffer, sizeof(buffer),
+                                      nullptr, nullptr);
   GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
 
-  GTEST_DEATH_TEST_CHECK_(buffer.exited);
-  set_status(buffer.return_code);
+  GTEST_DEATH_TEST_CHECK_(buffer.flags & ZX_INFO_PROCESS_FLAG_EXITED);
+  set_status(static_cast<int>(buffer.return_code));
   return status();
 }
 
@@ -1225,21 +1224,9 @@ struct ExecDeathTestArgs {
   int close_fd;       // File descriptor to close; the read end of a pipe
 };
 
-# if GTEST_OS_MAC
-inline char** GetEnviron() {
-  // When Google Test is built as a framework on MacOS X, the environ variable
-  // is unavailable. Apple's documentation (man environ) recommends using
-  // _NSGetEnviron() instead.
-  return *_NSGetEnviron();
-}
-# else
-// Some POSIX platforms expect you to declare environ. extern "C" makes
-// it reside in the global namespace.
+# if GTEST_OS_QNX
 extern "C" char** environ;
-inline char** GetEnviron() { return environ; }
-# endif  // GTEST_OS_MAC
-
-# if !GTEST_OS_QNX
+# else  // GTEST_OS_QNX
 // The main function for a threadsafe-style death test child process.
 // This function is called in a clone()-ed process and thus must avoid
 // any potentially unsafe operations like malloc or libc functions.
@@ -1259,18 +1246,18 @@ static int ExecDeathTestChildMain(void* child_arg) {
     return EXIT_FAILURE;
   }
 
-  // We can safely call execve() as it's a direct system call.  We
+  // We can safely call execv() as it's almost a direct system call. We
   // cannot use execvp() as it's a libc function and thus potentially
-  // unsafe.  Since execve() doesn't search the PATH, the user must
+  // unsafe. Since execv() doesn't search the PATH, the user must
   // invoke the test program via a valid path that contains at least
   // one path separator.
-  execve(args->argv[0], args->argv, GetEnviron());
-  DeathTestAbort(std::string("execve(") + args->argv[0] + ", ...) in " +
+  execv(args->argv[0], args->argv);
+  DeathTestAbort(std::string("execv(") + args->argv[0] + ", ...) in " +
                  original_dir + " failed: " + GetLastErrnoDescription());
 
   return EXIT_FAILURE;
 }
-# endif  //  !GTEST_OS_QNX
+# endif  // GTEST_OS_QNX
 
 # if GTEST_HAS_CLONE
 // Two utility routines that together determine the direction the stack
@@ -1284,19 +1271,24 @@ static int ExecDeathTestChildMain(void* child_arg) {
 // correct answer.
 static void StackLowerThanAddress(const void* ptr, bool* result)
     GTEST_NO_INLINE_;
+// Make sure sanitizers do not tamper with the stack here.
+// Ideally, we want to use `__builtin_frame_address` instead of a local variable
+// address with sanitizer disabled, but it does not work when the
+// compiler optimizes the stack frame out, which happens on PowerPC targets.
 // HWAddressSanitizer add a random tag to the MSB of the local variable address,
 // making comparison result unpredictable.
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
 GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
 static void StackLowerThanAddress(const void* ptr, bool* result) {
-  int dummy;
-  *result = (&dummy < ptr);
+  int dummy = 0;
+  *result = std::less<const void*>()(&dummy, ptr);
 }
 
 // Make sure AddressSanitizer does not tamper with the stack here.
 GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
 GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
 static bool StackGrowsDown() {
-  int dummy;
+  int dummy = 0;
   bool result;
   StackLowerThanAddress(&dummy, &result);
   return result;
@@ -1339,8 +1331,7 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
                                         fd_flags | FD_CLOEXEC));
   struct inheritance inherit = {0};
   // spawn is a system call.
-  child_pid =
-      spawn(args.argv[0], 0, nullptr, &inherit, args.argv, GetEnviron());
+  child_pid = spawn(args.argv[0], 0, nullptr, &inherit, args.argv, environ);
   // Restores the current working directory.
   GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1);
   GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd));
diff --git a/third_party/googletest/src/src/gtest-filepath.cc b/third_party/googletest/src/src/gtest-filepath.cc
index 9aad12fbd1..0b5629401b 100644
--- a/third_party/googletest/src/src/gtest-filepath.cc
+++ b/third_party/googletest/src/src/gtest-filepath.cc
@@ -92,8 +92,9 @@ static bool IsPathSeparator(char c) {
 
 // Returns the current working directory, or "" if unsuccessful.
 FilePath FilePath::GetCurrentDir() {
-#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \
-    GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_ESP32
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE ||        \
+    GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_ESP32 || \
+    GTEST_OS_XTENSA
   // These platforms do not have a current directory, so we just return
   // something reasonable.
   return FilePath(kCurrentDirectoryString);
@@ -209,7 +210,7 @@ bool FilePath::FileOrDirectoryExists() const {
   delete [] unicode;
   return attributes != kInvalidFileAttributes;
 #else
-  posix::StatStruct file_stat;
+  posix::StatStruct file_stat{};
   return posix::Stat(pathname_.c_str(), &file_stat) == 0;
 #endif  // GTEST_OS_WINDOWS_MOBILE
 }
@@ -236,7 +237,7 @@ bool FilePath::DirectoryExists() const {
     result = true;
   }
 #else
-  posix::StatStruct file_stat;
+  posix::StatStruct file_stat{};
   result = posix::Stat(path.c_str(), &file_stat) == 0 &&
       posix::IsDir(file_stat);
 #endif  // GTEST_OS_WINDOWS_MOBILE
@@ -323,7 +324,7 @@ bool FilePath::CreateFolder() const {
   delete [] unicode;
 #elif GTEST_OS_WINDOWS
   int result = _mkdir(pathname_.c_str());
-#elif GTEST_OS_ESP8266
+#elif GTEST_OS_ESP8266 || GTEST_OS_XTENSA
   // do nothing
   int result = 0;
 #else
@@ -349,33 +350,19 @@ FilePath FilePath::RemoveTrailingPathSeparator() const {
 // For example, "bar///foo" becomes "bar/foo". Does not eliminate other
 // redundancies that might be in a pathname involving "." or "..".
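+// Illustrative expectations for the rewrite below (a sketch, not upstream
+// tests): "bar///foo" -> "bar/foo", "/a//b/" -> "/a/b/", while "foo/./bar"
+// stays "foo/./bar" because "." is not a separator run.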
 void FilePath::Normalize() {
-  if (pathname_.c_str() == nullptr) {
-    pathname_ = "";
-    return;
-  }
-  const char* src = pathname_.c_str();
-  char* const dest = new char[pathname_.length() + 1];
-  char* dest_ptr = dest;
-  memset(dest_ptr, 0, pathname_.length() + 1);
-
-  while (*src != '\0') {
-    *dest_ptr = *src;
-    if (!IsPathSeparator(*src)) {
-      src++;
+  auto out = pathname_.begin();
+
+  for (const char character : pathname_) {
+    if (!IsPathSeparator(character)) {
+      *(out++) = character;
+    } else if (out == pathname_.begin() || *std::prev(out) != kPathSeparator) {
+      *(out++) = kPathSeparator;
     } else {
-#if GTEST_HAS_ALT_PATH_SEP_
-      if (*dest_ptr == kAlternatePathSeparator) {
-        *dest_ptr = kPathSeparator;
-      }
-#endif
-      while (IsPathSeparator(*src))
-        src++;
+      continue;
     }
-    dest_ptr++;
   }
-  *dest_ptr = '\0';
-  pathname_ = dest;
-  delete[] dest;
+
+  pathname_.erase(out, pathname_.end());
 }
 
 }  // namespace internal
diff --git a/third_party/googletest/src/src/gtest-internal-inl.h b/third_party/googletest/src/src/gtest-internal-inl.h
index e42ff47539..6d8cecbbb3 100644
--- a/third_party/googletest/src/src/gtest-internal-inl.h
+++ b/third_party/googletest/src/src/gtest-internal-inl.h
@@ -31,8 +31,8 @@
 // This file contains purely Google Test's internal implementation.  Please
 // DO NOT #INCLUDE IT IN A USER PROGRAM.
 
-#ifndef GTEST_SRC_GTEST_INTERNAL_INL_H_
-#define GTEST_SRC_GTEST_INTERNAL_INL_H_
+#ifndef GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_
+#define GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_
 
 #ifndef _WIN32_WCE
 # include <errno.h>
@@ -84,9 +84,11 @@ const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests";
 const char kBreakOnFailureFlag[] = "break_on_failure";
 const char kCatchExceptionsFlag[] = "catch_exceptions";
 const char kColorFlag[] = "color";
+const char kFailFast[] = "fail_fast";
 const char kFilterFlag[] = "filter";
 const char kListTestsFlag[] = "list_tests";
 const char kOutputFlag[] = "output";
+const char kBriefFlag[] = "brief";
 const char kPrintTimeFlag[] = "print_time";
 const char kPrintUTF8Flag[] = "print_utf8";
 const char kRandomSeedFlag[] = "random_seed";
@@ -164,10 +166,12 @@ class GTestFlagSaver {
     color_ = GTEST_FLAG(color);
     death_test_style_ = GTEST_FLAG(death_test_style);
     death_test_use_fork_ = GTEST_FLAG(death_test_use_fork);
+    fail_fast_ = GTEST_FLAG(fail_fast);
     filter_ = GTEST_FLAG(filter);
     internal_run_death_test_ = GTEST_FLAG(internal_run_death_test);
     list_tests_ = GTEST_FLAG(list_tests);
     output_ = GTEST_FLAG(output);
+    brief_ = GTEST_FLAG(brief);
     print_time_ = GTEST_FLAG(print_time);
     print_utf8_ = GTEST_FLAG(print_utf8);
     random_seed_ = GTEST_FLAG(random_seed);
@@ -187,9 +191,11 @@ class GTestFlagSaver {
     GTEST_FLAG(death_test_style) = death_test_style_;
     GTEST_FLAG(death_test_use_fork) = death_test_use_fork_;
     GTEST_FLAG(filter) = filter_;
+    GTEST_FLAG(fail_fast) = fail_fast_;
     GTEST_FLAG(internal_run_death_test) = internal_run_death_test_;
     GTEST_FLAG(list_tests) = list_tests_;
     GTEST_FLAG(output) = output_;
+    GTEST_FLAG(brief) = brief_;
     GTEST_FLAG(print_time) = print_time_;
     GTEST_FLAG(print_utf8) = print_utf8_;
     GTEST_FLAG(random_seed) = random_seed_;
@@ -208,10 +214,12 @@ class GTestFlagSaver {
   std::string color_;
   std::string death_test_style_;
   bool death_test_use_fork_;
+  bool fail_fast_;
  std::string filter_;
   std::string internal_run_death_test_;
   bool list_tests_;
   std::string output_;
+  bool brief_;
   bool print_time_;
   bool print_utf8_;
   int32_t random_seed_;
@@ -386,13 +394,6 @@ class GTEST_API_ UnitTestOptions {
 
   // Functions for processing the gtest_filter flag.
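+  // Illustrative gtest_filter values (drawn from the documented syntax, not
+  // from this change): "FooTest.*" selects every test in suite FooTest, and
+  // "*Null*:*Constructor*" selects any test whose full name contains "Null"
+  // or "Constructor".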
-  // Returns true if and only if the wildcard pattern matches the string.
-  // The first ':' or '\0' character in pattern marks the end of it.
-  //
-  // This recursive algorithm isn't very efficient, but is clear and
-  // works well enough for matching test names, which are short.
-  static bool PatternMatchesString(const char *pattern, const char *str);
-
   // Returns true if and only if the user-specified filter matches the test
   // suite name and the test name.
   static bool FilterMatchesTest(const std::string& test_suite_name,
@@ -647,10 +648,10 @@ class GTEST_API_ UnitTestImpl {
   // Arguments:
   //
   //   test_suite_name: name of the test suite
-  //   type_param:     the name of the test's type parameter, or NULL if
-  //                   this is not a typed or a type-parameterized test.
-  //   set_up_tc:      pointer to the function that sets up the test suite
-  //   tear_down_tc:   pointer to the function that tears down the test suite
+  //   type_param:      the name of the test's type parameter, or NULL if
+  //                    this is not a typed or a type-parameterized test.
+  //   set_up_tc:       pointer to the function that sets up the test suite
+  //   tear_down_tc:    pointer to the function that tears down the test suite
   TestSuite* GetTestSuite(const char* test_suite_name, const char* type_param,
                           internal::SetUpTestSuiteFunc set_up_tc,
                           internal::TearDownTestSuiteFunc tear_down_tc);
@@ -674,6 +675,7 @@ class GTEST_API_ UnitTestImpl {
   void AddTestInfo(internal::SetUpTestSuiteFunc set_up_tc,
                    internal::TearDownTestSuiteFunc tear_down_tc,
                    TestInfo* test_info) {
+#if GTEST_HAS_DEATH_TEST
     // In order to support thread-safe death tests, we need to
     // remember the original working directory when the test program
     // was first invoked.  We cannot do this in RUN_ALL_TESTS(), as
@@ -686,6 +688,7 @@ class GTEST_API_ UnitTestImpl {
       GTEST_CHECK_(!original_working_dir_.IsEmpty())
           << "Failed to get the current working directory.";
     }
+#endif  // GTEST_HAS_DEATH_TEST
 
     GetTestSuite(test_info->test_suite_name(), test_info->type_param(),
                  set_up_tc, tear_down_tc)
@@ -1161,13 +1164,13 @@ class StreamingListener : public EmptyTestEventListener {
   }
 
   // Note that "event=TestCaseStart" is a wire format and has to remain
-  // "case" for compatibilty
+  // "case" for compatibility
   void OnTestCaseStart(const TestCase& test_case) override {
     SendLn(std::string("event=TestCaseStart&name=") + test_case.name());
   }
 
   // Note that "event=TestCaseEnd" is a wire format and has to remain
-  // "case" for compatibilty
+  // "case" for compatibility
   void OnTestCaseEnd(const TestCase& test_case) override {
     SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed()) +
            "&elapsed_time=" + StreamableToString(test_case.elapsed_time()) +
@@ -1215,4 +1218,4 @@ class StreamingListener : public EmptyTestEventListener {
 
 GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
 
-#endif  // GTEST_SRC_GTEST_INTERNAL_INL_H_
+#endif  // GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_
diff --git a/third_party/googletest/src/src/gtest-port.cc b/third_party/googletest/src/src/gtest-port.cc
index a05c50a39b..53a4d37f97 100644
--- a/third_party/googletest/src/src/gtest-port.cc
+++ b/third_party/googletest/src/src/gtest-port.cc
@@ -198,7 +198,8 @@ size_t GetThreadCount() {
   if (sysctl(mib, miblen, NULL, &size, NULL, 0)) {
     return 0;
   }
-  mib[5] = size / mib[4];
+
+  mib[5] = static_cast<int>(size / static_cast<size_t>(mib[4]));
 
   // populate array of structs
   struct kinfo_proc info[mib[5]];
@@ -207,8 +208,8 @@ size_t GetThreadCount() {
   }
 
   // exclude empty members
-  int nthreads = 0;
-  for (int i = 0; i < size / mib[4]; i++) {
+  size_t nthreads = 0;
+  for (size_t i = 0; i < size / static_cast<size_t>(mib[4]); i++) {
     if (info[i].p_tid != -1) nthreads++;
   }
 
@@ -687,8 +688,8 @@ class ThreadLocalRegistryImpl {
   static Mutex thread_map_mutex_;
 };
 
-Mutex ThreadLocalRegistryImpl::mutex_(Mutex::kStaticMutex);
-Mutex ThreadLocalRegistryImpl::thread_map_mutex_(Mutex::kStaticMutex);
+Mutex ThreadLocalRegistryImpl::mutex_(Mutex::kStaticMutex);  // NOLINT
+Mutex ThreadLocalRegistryImpl::thread_map_mutex_(Mutex::kStaticMutex);  // NOLINT
 
 ThreadLocalValueHolderBase* ThreadLocalRegistry::GetValueOnCurrentThread(
     const ThreadLocalBase* thread_local_instance) {
@@ -1094,9 +1095,9 @@ class CapturedStream {
     filename_ = temp_file_path;
 # else
     // There's no guarantee that a test has write access to the current
-    // directory, so we create the temporary file in the /tmp directory
-    // instead. We use /tmp on most systems, and /sdcard on Android.
-    // That's because Android doesn't have /tmp.
+    // directory, so we create the temporary file in a temporary directory.
+    std::string name_template;
+
 #  if GTEST_OS_LINUX_ANDROID
     // Note: Android applications are expected to call the framework's
     // Context.getExternalStorageDirectory() method through JNI to get
@@ -1109,17 +1110,46 @@ class CapturedStream {
     // The location /data/local/tmp is directly accessible from native code.
     // '/sdcard' and other variants cannot be relied on, as they are not
     // guaranteed to be mounted, or may have a delay in mounting.
-    char name_template[] = "/data/local/tmp/gtest_captured_stream.XXXXXX";
+    name_template = "/data/local/tmp/";
+#  elif GTEST_OS_IOS
+    char user_temp_dir[PATH_MAX + 1];
+
+    // Documented alternative to NSTemporaryDirectory() (for obtaining creating
+    // a temporary directory) at
+    // https://developer.apple.com/library/archive/documentation/Security/Conceptual/SecureCodingGuide/Articles/RaceConditions.html#//apple_ref/doc/uid/TP40002585-SW10
+    //
+    // _CS_DARWIN_USER_TEMP_DIR (as well as _CS_DARWIN_USER_CACHE_DIR) is not
+    // documented in the confstr() man page at
+    // https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man3/confstr.3.html#//apple_ref/doc/man/3/confstr
+    // but are still available, according to the WebKit patches at
+    // https://trac.webkit.org/changeset/262004/webkit
+    // https://trac.webkit.org/changeset/263705/webkit
+    //
+    // The confstr() implementation falls back to getenv("TMPDIR"). See
+    // https://opensource.apple.com/source/Libc/Libc-1439.100.3/gen/confstr.c.auto.html
+    ::confstr(_CS_DARWIN_USER_TEMP_DIR, user_temp_dir, sizeof(user_temp_dir));
+
+    name_template = user_temp_dir;
+    if (name_template.back() != GTEST_PATH_SEP_[0])
+      name_template.push_back(GTEST_PATH_SEP_[0]);
+#  else
-    char name_template[] = "/tmp/captured_stream.XXXXXX";
-#  endif  // GTEST_OS_LINUX_ANDROID
-    const int captured_fd = mkstemp(name_template);
+    name_template = "/tmp/";
+#  endif
+    name_template.append("gtest_captured_stream.XXXXXX");
+
+    // mkstemp() modifies the string bytes in place, and does not go beyond the
+    // string's length. This results in well-defined behavior in C++17.
+    //
+    // The const_cast is needed below C++17. The constraints on std::string
+    // implementations in C++11 and above make the assumption behind the
+    // const_cast fairly safe.
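+    // (Sketch for clarity, not upstream text: with name_template ending in
+    // "gtest_captured_stream.XXXXXX", ::mkstemp() rewrites the six X's in
+    // place, e.g. to "gtest_captured_stream.k3Qz9A", and returns an open
+    // file descriptor for the newly created file.)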
+    const int captured_fd = ::mkstemp(const_cast<char*>(name_template.data()));
     if (captured_fd == -1) {
       GTEST_LOG_(WARNING)
           << "Failed to create tmp file " << name_template
           << " for test; does the test have access to the /tmp directory?";
     }
-    filename_ = name_template;
+    filename_ = std::move(name_template);
 # endif  // GTEST_OS_WINDOWS
     fflush(nullptr);
     dup2(captured_fd, fd_);
diff --git a/third_party/googletest/src/src/gtest-printers.cc b/third_party/googletest/src/src/gtest-printers.cc
index 3337be312e..1b68fcb500 100644
--- a/third_party/googletest/src/src/gtest-printers.cc
+++ b/third_party/googletest/src/src/gtest-printers.cc
@@ -42,11 +42,16 @@
 // defines Foo.
 
 #include "gtest/gtest-printers.h"
+
 #include <stdio.h>
+
 #include <cctype>
+#include <cstdint>
 #include <cwchar>
 #include <ostream>  // NOLINT
 #include <string>
+#include <type_traits>
+
 #include "gtest/internal/gtest-port.h"
 #include "src/gtest-internal-inl.h"
 
@@ -102,9 +107,19 @@ void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count,
   *os << ">";
 }
 
+// Helpers for widening a character to char32_t. Since the standard does not
+// specify if char / wchar_t is signed or unsigned, it is important to first
+// convert it to the unsigned type of the same width before widening it to
+// char32_t.
+template <typename CharType>
+char32_t ToChar32(CharType in) {
+  return static_cast<char32_t>(
+      static_cast<typename std::make_unsigned<CharType>::type>(in));
+}
+
 }  // namespace
 
-namespace internal2 {
+namespace internal {
 
 // Delegates to PrintBytesInObjectToImpl() to print the bytes in the
 // given object.  The delegation simplifies the implementation, which
@@ -116,10 +131,6 @@ void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count,
   PrintBytesInObjectToImpl(obj_bytes, count, os);
 }
 
-}  // namespace internal2
-
-namespace internal {
-
 // Depending on the value of a char (or wchar_t), we print it in one
 // of three formats:
 //   - as is if it's a printable ASCII (e.g. 'a', '2', ' '),
@@ -134,18 +145,15 @@ enum CharFormat {
 // Returns true if c is a printable ASCII character.  We test the
 // value of c directly instead of calling isprint(), which is buggy on
 // Windows Mobile.
-inline bool IsPrintableAscii(wchar_t c) {
-  return 0x20 <= c && c <= 0x7E;
-}
+inline bool IsPrintableAscii(char32_t c) { return 0x20 <= c && c <= 0x7E; }
 
-// Prints a wide or narrow char c as a character literal without the
-// quotes, escaping it when necessary; returns how c was formatted.
-// The template argument UnsignedChar is the unsigned version of Char,
-// which is the type of c.
-template <typename UnsignedChar, typename Char>
+// Prints c (of type char, char8_t, char16_t, char32_t, or wchar_t) as a
+// character literal without the quotes, escaping it when necessary; returns how
+// c was formatted.
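+// For example (an illustrative sketch, not upstream commentary): 'a' is
+// printed as-is and classified kAsIs; '\n' is printed as "\\n"
+// (kSpecialEscape); and char16_t(0x2603) is printed as "\\x2603" (kHexEscape).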
+template <typename Char>
 static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) {
-  wchar_t w_c = static_cast<wchar_t>(c);
-  switch (w_c) {
+  const char32_t u_c = ToChar32(c);
+  switch (u_c) {
     case L'\0':
       *os << "\\0";
       break;
@@ -177,13 +185,12 @@ static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) {
       *os << "\\v";
       break;
     default:
-      if (IsPrintableAscii(w_c)) {
+      if (IsPrintableAscii(u_c)) {
         *os << static_cast<char>(c);
         return kAsIs;
       } else {
         ostream::fmtflags flags = os->flags();
-        *os << "\\x" << std::hex << std::uppercase
-            << static_cast<int>(static_cast<UnsignedChar>(c));
+        *os << "\\x" << std::hex << std::uppercase << static_cast<uint32_t>(u_c);
         os->flags(flags);
         return kHexEscape;
       }
@@ -191,9 +198,9 @@ static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) {
   return kSpecialEscape;
 }
 
-// Prints a wchar_t c as if it's part of a string literal, escaping it when
+// Prints a char32_t c as if it's part of a string literal, escaping it when
 // necessary; returns how c was formatted.
-static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) {
+static CharFormat PrintAsStringLiteralTo(char32_t c, ostream* os) {
   switch (c) {
     case L'\'':
       *os << "'";
@@ -202,26 +209,68 @@ static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) {
       *os << "\\\"";
       return kSpecialEscape;
     default:
-      return PrintAsCharLiteralTo<wchar_t, wchar_t>(c, os);
+      return PrintAsCharLiteralTo(c, os);
   }
 }
 
+static const char* GetCharWidthPrefix(char) {
+  return "";
+}
+
+static const char* GetCharWidthPrefix(signed char) {
+  return "";
+}
+
+static const char* GetCharWidthPrefix(unsigned char) {
+  return "";
+}
+
+#ifdef __cpp_char8_t
+static const char* GetCharWidthPrefix(char8_t) {
+  return "u8";
+}
+#endif
+
+static const char* GetCharWidthPrefix(char16_t) {
+  return "u";
+}
+
+static const char* GetCharWidthPrefix(char32_t) {
+  return "U";
+}
+
+static const char* GetCharWidthPrefix(wchar_t) {
+  return "L";
+}
+
 // Prints a char c as if it's part of a string literal, escaping it when
 // necessary; returns how c was formatted.
 static CharFormat PrintAsStringLiteralTo(char c, ostream* os) {
-  return PrintAsStringLiteralTo(
-      static_cast<wchar_t>(static_cast<unsigned char>(c)), os);
+  return PrintAsStringLiteralTo(ToChar32(c), os);
+}
+
+#ifdef __cpp_char8_t
+static CharFormat PrintAsStringLiteralTo(char8_t c, ostream* os) {
+  return PrintAsStringLiteralTo(ToChar32(c), os);
+}
+#endif
+
+static CharFormat PrintAsStringLiteralTo(char16_t c, ostream* os) {
+  return PrintAsStringLiteralTo(ToChar32(c), os);
 }
 
-// Prints a wide or narrow character c and its code.  '\0' is printed
-// as "'\\0'", other unprintable characters are also properly escaped
-// using the standard C++ escape sequence.  The template argument
-// UnsignedChar is the unsigned version of Char, which is the type of c.
-template <typename UnsignedChar, typename Char>
+static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) {
+  return PrintAsStringLiteralTo(ToChar32(c), os);
+}
+
+// Prints a character c (of type char, char8_t, char16_t, char32_t, or wchar_t)
+// and its code.  '\0' is printed as "'\\0'", other unprintable characters are
+// also properly escaped using the standard C++ escape sequence.
+template <typename Char>
 void PrintCharAndCodeTo(Char c, ostream* os) {
   // First, print c as a literal in the most readable form we can find.
-  *os << ((sizeof(c) > 1) ? "L'" : "'");
-  const CharFormat format = PrintAsCharLiteralTo<UnsignedChar, Char>(c, os);
+  *os << GetCharWidthPrefix(c) << "'";
+  const CharFormat format = PrintAsCharLiteralTo(c, os);
   *os << "'";
 
   // To aid user debugging, we also print c's code in decimal, unless
@@ -242,21 +291,21 @@ void PrintCharAndCodeTo(Char c, ostream* os) {
   *os << ")";
 }
 
-void PrintTo(unsigned char c, ::std::ostream* os) {
-  PrintCharAndCodeTo<unsigned char>(c, os);
-}
-void PrintTo(signed char c, ::std::ostream* os) {
-  PrintCharAndCodeTo<unsigned char>(c, os);
-}
+void PrintTo(unsigned char c, ::std::ostream* os) { PrintCharAndCodeTo(c, os); }
+void PrintTo(signed char c, ::std::ostream* os) { PrintCharAndCodeTo(c, os); }
 
 // Prints a wchar_t as a symbol if it is printable or as its internal
 // code otherwise and also as its code.  L'\0' is printed as "L'\\0'".
-void PrintTo(wchar_t wc, ostream* os) {
-  PrintCharAndCodeTo<wchar_t>(wc, os);
-}
+void PrintTo(wchar_t wc, ostream* os) { PrintCharAndCodeTo(wc, os); }
+
+// TODO(dcheng): Consider making this delegate to PrintCharAndCodeTo() as well.
+void PrintTo(char32_t c, ::std::ostream* os) {
+  *os << std::hex << "U+" << std::uppercase << std::setfill('0') << std::setw(4)
+      << static_cast<uint32_t>(c);
 }
 
 // Prints the given array of characters to the ostream.  CharType must be either
-//  char or wchar_t.
+// char, char8_t, char16_t, char32_t, or wchar_t.
 // The array starts at begin, the length is len, it may include '\0' characters
 // and may not be NUL-terminated.
 template <typename CharType>
@@ -266,8 +315,8 @@ GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
 GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
 static CharFormat PrintCharsAsStringTo(
     const CharType* begin, size_t len, ostream* os) {
-  const char* const kQuoteBegin = sizeof(CharType) == 1 ? "\"" : "L\"";
-  *os << kQuoteBegin;
+  const char* const quote_prefix = GetCharWidthPrefix(*begin);
+  *os << quote_prefix << "\"";
   bool is_previous_hex = false;
   CharFormat print_format = kAsIs;
   for (size_t index = 0; index < len; ++index) {
@@ -276,7 +325,7 @@ static CharFormat PrintCharsAsStringTo(
       // Previous character is of '\x..' form and this character can be
       // interpreted as another hexadecimal digit in its number. Break string to
       // disambiguate.
-      *os << "\" " << kQuoteBegin;
+      *os << "\" " << quote_prefix << "\"";
     }
     is_previous_hex = PrintAsStringLiteralTo(cur, os) == kHexEscape;
     // Remember if any characters required hex escaping.
@@ -322,22 +371,57 @@ void UniversalPrintArray(const char* begin, size_t len, ostream* os) {
   UniversalPrintCharArray(begin, len, os);
 }
 
+#ifdef __cpp_char8_t
+// Prints a (const) char8_t array of 'len' elements, starting at address
+// 'begin'.
+void UniversalPrintArray(const char8_t* begin, size_t len, ostream* os) {
+  UniversalPrintCharArray(begin, len, os);
+}
+#endif
+
+// Prints a (const) char16_t array of 'len' elements, starting at address
+// 'begin'.
+void UniversalPrintArray(const char16_t* begin, size_t len, ostream* os) {
+  UniversalPrintCharArray(begin, len, os);
+}
+
+// Prints a (const) char32_t array of 'len' elements, starting at address
+// 'begin'.
+void UniversalPrintArray(const char32_t* begin, size_t len, ostream* os) {
+  UniversalPrintCharArray(begin, len, os);
+}
+
 // Prints a (const) wchar_t array of 'len' elements, starting at address
 // 'begin'.
 void UniversalPrintArray(const wchar_t* begin, size_t len, ostream* os) {
   UniversalPrintCharArray(begin, len, os);
 }
 
-// Prints the given C string to the ostream.
-void PrintTo(const char* s, ostream* os) {
+namespace {
+
+// Prints a null-terminated C-style string to the ostream.
+template <typename Char>
+void PrintCStringTo(const Char* s, ostream* os) {
   if (s == nullptr) {
     *os << "NULL";
   } else {
     *os << ImplicitCast_<const void*>(s) << " pointing to ";
-    PrintCharsAsStringTo(s, strlen(s), os);
+    PrintCharsAsStringTo(s, std::char_traits<Char>::length(s), os);
   }
 }
 
+}  // anonymous namespace
+
+void PrintTo(const char* s, ostream* os) { PrintCStringTo(s, os); }
+
+#ifdef __cpp_char8_t
+void PrintTo(const char8_t* s, ostream* os) { PrintCStringTo(s, os); }
+#endif
+
+void PrintTo(const char16_t* s, ostream* os) { PrintCStringTo(s, os); }
+
+void PrintTo(const char32_t* s, ostream* os) { PrintCStringTo(s, os); }
+
 // MSVC compiler can be configured to define wchar_t as a typedef
 // of unsigned short. Defining an overload for const wchar_t* in that case
 // would cause pointers to unsigned shorts be printed as wide strings,
 // possibly causing invalid memory accesses.
@@ -346,14 +430,7 @@ void PrintTo(const char* s, ostream* os) {
 // wchar_t is implemented as a native type.
 #if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
 // Prints the given wide C string to the ostream.
-void PrintTo(const wchar_t* s, ostream* os) {
-  if (s == nullptr) {
-    *os << "NULL";
-  } else {
-    *os << ImplicitCast_<const void*>(s) << " pointing to ";
-    PrintCharsAsStringTo(s, wcslen(s), os);
-  }
-}
+void PrintTo(const wchar_t* s, ostream* os) { PrintCStringTo(s, os); }
 #endif  // wchar_t is native
 
 namespace {
@@ -431,6 +508,20 @@ void PrintStringTo(const ::std::string& s, ostream* os) {
   }
 }
 
+#ifdef __cpp_char8_t
+void PrintU8StringTo(const ::std::u8string& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+#endif
+
+void PrintU16StringTo(const ::std::u16string& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+
+void PrintU32StringTo(const ::std::u32string& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+
 #if GTEST_HAS_STD_WSTRING
 void PrintWideStringTo(const ::std::wstring& s, ostream* os) {
   PrintCharsAsStringTo(s.data(), s.size(), os);
diff --git a/third_party/googletest/src/src/gtest-typed-test.cc b/third_party/googletest/src/src/gtest-typed-test.cc
index 1b1cfb0dc1..c02c3df659 100644
--- a/third_party/googletest/src/src/gtest-typed-test.cc
+++ b/third_party/googletest/src/src/gtest-typed-test.cc
@@ -35,8 +35,6 @@
 namespace testing {
 namespace internal {
 
-#if GTEST_HAS_TYPED_TEST_P
-
 // Skips to the first non-space char in str. Returns an empty string if str
 // contains only whitespace characters.
 static const char* SkipSpaces(const char* str) {
@@ -78,17 +76,7 @@ const char* TypedTestSuitePState::VerifyRegisteredTestNames(
       continue;
     }
 
-    bool found = false;
-    for (RegisteredTestIter it = registered_tests_.begin();
-         it != registered_tests_.end();
-         ++it) {
-      if (name == it->first) {
-        found = true;
-        break;
-      }
-    }
-
-    if (found) {
+    if (registered_tests_.count(name) != 0) {
       tests.insert(name);
     } else {
       errors << "No test named " << name
@@ -115,7 +103,5 @@ const char* TypedTestSuitePState::VerifyRegisteredTestNames(
   return registered_tests;
 }
 
-#endif  // GTEST_HAS_TYPED_TEST_P
-
 }  // namespace internal
 }  // namespace testing
diff --git a/third_party/googletest/src/src/gtest.cc b/third_party/googletest/src/src/gtest.cc
index b8f6a5c31c..21c611aff1 100644
--- a/third_party/googletest/src/src/gtest.cc
+++ b/third_party/googletest/src/src/gtest.cc
@@ -35,7 +35,6 @@
 #include "gtest/gtest-spi.h"
 
 #include <ctype.h>
-#include <math.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -44,6 +43,8 @@
 #include <wctype.h>
 
 #include <algorithm>
+#include <chrono>  // NOLINT
+#include <cmath>
 #include <iomanip>
 #include <limits>
 #include <list>
@@ -55,8 +56,6 @@
 
 #if GTEST_OS_LINUX
 
-# define GTEST_HAS_GETTIMEOFDAY_ 1
-
 # include <fcntl.h>   // NOLINT
 # include <limits.h>  // NOLINT
 # include <sched.h>   // NOLINT
@@ -68,7 +67,6 @@
 # include <string>
 
 #elif GTEST_OS_ZOS
-# define GTEST_HAS_GETTIMEOFDAY_ 1
 # include <sys/time.h>  // NOLINT
 
 // On z/OS we additionally need strings.h for strcasecmp.
@@ -86,7 +84,6 @@
 
 #ifdef _MSC_VER
 # include <crtdbg.h>  // NOLINT
-# include <debugapi.h>  // NOLINT
 #endif
 
 # include <io.h>  // NOLINT
@@ -95,16 +92,11 @@
 # include <windows.h>  // NOLINT
 
 # if GTEST_OS_WINDOWS_MINGW
-// MinGW has gettimeofday() but not _ftime64().
-# define GTEST_HAS_GETTIMEOFDAY_ 1
 #  include <sys/time.h>  // NOLINT
 # endif  // GTEST_OS_WINDOWS_MINGW
 
 #else
 
-// Assume other platforms have gettimeofday().
-# define GTEST_HAS_GETTIMEOFDAY_ 1
-
 // cpplint thinks that the header is already included, so we want to
 // silence it.
 # include <sys/time.h>  // NOLINT
@@ -213,6 +205,21 @@ static const char* GetDefaultFilter() {
   return kUniversalFilter;
 }
 
+// Bazel passes in the argument to '--test_runner_fail_fast' via the
+// TESTBRIDGE_TEST_RUNNER_FAIL_FAST environment variable.
+static bool GetDefaultFailFast() {
+  const char* const testbridge_test_runner_fail_fast =
+      internal::posix::GetEnv("TESTBRIDGE_TEST_RUNNER_FAIL_FAST");
+  if (testbridge_test_runner_fail_fast != nullptr) {
+    return strcmp(testbridge_test_runner_fail_fast, "1") == 0;
+  }
+  return false;
+}
+
+GTEST_DEFINE_bool_(
+    fail_fast, internal::BoolFromGTestEnv("fail_fast", GetDefaultFailFast()),
+    "True if and only if a test failure should stop further test execution.");
+
 GTEST_DEFINE_bool_(
     also_run_disabled_tests,
     internal::BoolFromGTestEnv("also_run_disabled_tests", false),
@@ -273,6 +280,10 @@ GTEST_DEFINE_string_(
     "executable's name and, if necessary, made unique by adding "
     "digits.");
 
+GTEST_DEFINE_bool_(
+    brief, internal::BoolFromGTestEnv("brief", false),
+    "True if only test failures should be displayed in text output.");
+
 GTEST_DEFINE_bool_(print_time, internal::BoolFromGTestEnv("print_time", true),
                    "True if and only if " GTEST_NAME_
                    " should display elapsed time in text output.");
@@ -479,7 +490,7 @@ void InsertSyntheticTestCase(const std::string& name, CodeLocation location,
         "removed but the rest got left behind.";
 
   std::string message =
-      "Paramaterized test suite " + name +
+      "Parameterized test suite " + name +
       (has_test_p ? kMissingInstantiation : kMissingTestCase) +
      "\n\n"
      "To suppress this error for this test suite, insert the following line "
      "(in a non-header) in the namespace it is defined in:"
      "\n\n"
      "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" + name + ");";
 
-  std::string full_name = "UninstantiatedParamaterizedTestSuite<" + name + ">";
+  std::string full_name = "UninstantiatedParameterizedTestSuite<" + name + ">";
   RegisterTest(  //
       "GoogleTestVerification", full_name.c_str(),
       nullptr,  // No type parameter.
@@ -534,7 +545,7 @@ void TypeParameterizedTestSuiteRegistry::CheckForInstantiations() {
     if (ignored.find(testcase.first) != ignored.end()) continue;
 
     std::string message =
-        "Type paramaterized test suite " + testcase.first +
+        "Type parameterized test suite " + testcase.first +
         " is defined via REGISTER_TYPED_TEST_SUITE_P, but never instantiated "
         "via INSTANTIATE_TYPED_TEST_SUITE_P. None of the test cases will run."
         "\n\n"
         "Ideally, TYPED_TEST_P definitions should only ever be included as "
         "part of binaries that intend to use them. (As opposed to, for "
         "example, being placed in a library that may be linked in to get other "
         "utilities.)"
         "\n\n"
         "To suppress this error for this test suite, insert the following line "
-        "(in a non-header) in the namespace it is definedin in:"
+        "(in a non-header) in the namespace it is defined in:"
         "\n\n"
         "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" + testcase.first +
         ");";
 
     std::string full_name =
-        "UninstantiatedTypeParamaterizedTestSuite<" + testcase.first + ">";
+        "UninstantiatedTypeParameterizedTestSuite<" + testcase.first + ">";
     RegisterTest(  //
         "GoogleTestVerification", full_name.c_str(),
         nullptr,  // No type parameter.
@@ -635,47 +646,82 @@ std::string UnitTestOptions::GetAbsolutePathToOutputFile() {
   return result.string();
 }
 
-// Returns true if and only if the wildcard pattern matches the string.
-// The first ':' or '\0' character in pattern marks the end of it.
+// Returns true if and only if the wildcard pattern matches the string. Each
+// pattern consists of regular characters, single-character wildcards (?), and
+// multi-character wildcards (*).
 //
-// This recursive algorithm isn't very efficient, but is clear and
-// works well enough for matching test names, which are short.
-bool UnitTestOptions::PatternMatchesString(const char *pattern,
-                                           const char *str) {
-  switch (*pattern) {
-    case '\0':
-    case ':':  // Either ':' or '\0' marks the end of the pattern.
-      return *str == '\0';
-    case '?':  // Matches any single character.
-      return *str != '\0' && PatternMatchesString(pattern + 1, str + 1);
-    case '*':  // Matches any string (possibly empty) of characters.
-      return (*str != '\0' && PatternMatchesString(pattern, str + 1)) ||
-          PatternMatchesString(pattern + 1, str);
-    default:  // Non-special character.  Matches itself.
-      return *pattern == *str &&
-          PatternMatchesString(pattern + 1, str + 1);
-  }
-}
-
-bool UnitTestOptions::MatchesFilter(
-    const std::string& name, const char* filter) {
-  const char *cur_pattern = filter;
-  for (;;) {
-    if (PatternMatchesString(cur_pattern, name.c_str())) {
-      return true;
+// This function implements a linear-time string globbing algorithm based on
+// https://research.swtch.com/glob.
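+//
+// Illustrative walk-through (a sketch, not upstream commentary): matching the
+// pattern "a*bc" against the name "abcbc", 'a' matches 'a' and '*' records a
+// restart point, first matching zero characters. The pattern's trailing "bc"
+// then matches the middle "bc" of the name but leaves input unconsumed, so the
+// match restarts with '*' consuming one more character per attempt; it
+// succeeds once '*' has absorbed "bc" and the pattern's "bc" lines up with
+// the name's tail.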
+static bool PatternMatchesString(const std::string& name_str,
+                                 const char* pattern, const char* pattern_end) {
+  const char* name = name_str.c_str();
+  const char* const name_begin = name;
+  const char* const name_end = name + name_str.size();
+
+  const char* pattern_next = pattern;
+  const char* name_next = name;
+
+  while (pattern < pattern_end || name < name_end) {
+    if (pattern < pattern_end) {
+      switch (*pattern) {
+        default:  // Match an ordinary character.
+          if (name < name_end && *name == *pattern) {
+            ++pattern;
+            ++name;
+            continue;
+          }
+          break;
+        case '?':  // Match any single character.
+          if (name < name_end) {
+            ++pattern;
+            ++name;
+            continue;
+          }
+          break;
+        case '*':
+          // Match zero or more characters. Start by skipping over the wildcard
+          // and matching zero characters from name. If that fails, restart and
+          // match one more character than the last attempt.
+          pattern_next = pattern;
+          name_next = name + 1;
+          ++pattern;
+          continue;
+      }
+    }
+    // Failed to match a character. Restart if possible.
+    if (name_begin < name_next && name_next <= name_end) {
+      pattern = pattern_next;
+      name = name_next;
+      continue;
     }
+    return false;
+  }
+  return true;
+}
 
-    // Finds the next pattern in the filter.
-    cur_pattern = strchr(cur_pattern, ':');
+bool UnitTestOptions::MatchesFilter(const std::string& name_str,
+                                    const char* filter) {
+  // The filter is a list of patterns separated by colons (:).
+  const char* pattern = filter;
+  while (true) {
+    // Find the bounds of this pattern.
+    const char* const next_sep = strchr(pattern, ':');
+    const char* const pattern_end =
+        next_sep != nullptr ? next_sep : pattern + strlen(pattern);
 
-    // Returns if no more pattern can be found.
-    if (cur_pattern == nullptr) {
-      return false;
+    // Check if this pattern matches name_str.
+    if (PatternMatchesString(name_str, pattern, pattern_end)) {
+      return true;
     }
 
-    // Skips the pattern separater (the ':' character).
-    cur_pattern++;
+    // Give up on this pattern. However, if we found a pattern separator (:),
+    // advance to the next pattern (skipping over the separator) and restart.
+    if (next_sep == nullptr) {
+      return false;
    }
+    pattern = next_sep + 1;
  }
+  return true;
 }
 
 // Returns true if and only if the user-specified filter matches the test
@@ -985,44 +1031,30 @@ std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) {
       );  // NOLINT
 }
 
-// Returns the current time in milliseconds.
+// A helper class for measuring elapsed times.
+class Timer {
+ public:
+  Timer() : start_(std::chrono::steady_clock::now()) {}
+
+  // Return time elapsed in milliseconds since the timer was created.
+  TimeInMillis Elapsed() {
+    return std::chrono::duration_cast<std::chrono::milliseconds>(
+               std::chrono::steady_clock::now() - start_)
+        .count();
+  }
+
+ private:
+  std::chrono::steady_clock::time_point start_;
+};
+
+// Returns a timestamp as milliseconds since the epoch. Note this time may jump
+// around subject to adjustments by the system, to measure elapsed time use
+// Timer instead.
 TimeInMillis GetTimeInMillis() {
-#if GTEST_OS_WINDOWS_MOBILE || defined(__BORLANDC__)
-  // Difference between 1970-01-01 and 1601-01-01 in milliseconds.
-  // http://analogous.blogspot.com/2005/04/epoch.html
-  const TimeInMillis kJavaEpochToWinFileTimeDelta =
-    static_cast<TimeInMillis>(116444736UL) * 100000UL;
-  const DWORD kTenthMicrosInMilliSecond = 10000;
-
-  SYSTEMTIME now_systime;
-  FILETIME now_filetime;
-  ULARGE_INTEGER now_int64;
-  GetSystemTime(&now_systime);
-  if (SystemTimeToFileTime(&now_systime, &now_filetime)) {
-    now_int64.LowPart = now_filetime.dwLowDateTime;
-    now_int64.HighPart = now_filetime.dwHighDateTime;
-    now_int64.QuadPart = (now_int64.QuadPart / kTenthMicrosInMilliSecond) -
-      kJavaEpochToWinFileTimeDelta;
-    return now_int64.QuadPart;
-  }
-  return 0;
-#elif GTEST_OS_WINDOWS && !GTEST_HAS_GETTIMEOFDAY_
-  __timeb64 now;
-
-  // MSVC 8 deprecates _ftime64(), so we want to suppress warning 4996
-  // (deprecated function) there.
-  GTEST_DISABLE_MSC_DEPRECATED_PUSH_()
-  _ftime64(&now);
-  GTEST_DISABLE_MSC_DEPRECATED_POP_()
-
-  return static_cast<TimeInMillis>(now.time) * 1000 + now.millitm;
-#elif GTEST_HAS_GETTIMEOFDAY_
-  struct timeval now;
-  gettimeofday(&now, nullptr);
-  return static_cast<TimeInMillis>(now.tv_sec) * 1000 + now.tv_usec / 1000;
-#else
-# error "Don't know how to get the current time on your system."
-#endif
+  return std::chrono::duration_cast<std::chrono::milliseconds>(
+             std::chrono::system_clock::now() -
+             std::chrono::system_clock::from_time_t(0))
+      .count();
 }
 
 // Utilities
@@ -1537,6 +1569,31 @@ AssertionResult DoubleNearPredFormat(const char* expr1,
   const double diff = fabs(val1 - val2);
   if (diff <= abs_error) return AssertionSuccess();
 
+  // Find the value which is closest to zero.
+  const double min_abs = std::min(fabs(val1), fabs(val2));
+  // Find the distance to the next double from that value.
+  const double epsilon =
+      nextafter(min_abs, std::numeric_limits<double>::infinity()) - min_abs;
+  // Detect the case where abs_error is so small that EXPECT_NEAR is
+  // effectively the same as EXPECT_EQUAL, and give an informative error
+  // message so that the situation can be more easily understood without
+  // requiring exotic floating-point knowledge.
+  // Don't do an epsilon check if abs_error is zero because that implies
+  // that an equality check was actually intended.
+  if (!(std::isnan)(val1) && !(std::isnan)(val2) && abs_error > 0 &&
+      abs_error < epsilon) {
+    return AssertionFailure()
+           << "The difference between " << expr1 << " and " << expr2 << " is "
+           << diff << ", where\n"
+           << expr1 << " evaluates to " << val1 << ",\n"
+           << expr2 << " evaluates to " << val2 << ".\nThe abs_error parameter "
+           << abs_error_expr << " evaluates to " << abs_error
+           << " which is smaller than the minimum distance between doubles for "
+              "numbers of this magnitude which is "
+           << epsilon
+           << ", thus making this EXPECT_NEAR check equivalent to "
+              "EXPECT_EQUAL. Consider using EXPECT_DOUBLE_EQ instead.";
+  }
   return AssertionFailure()
       << "The difference between " << expr1 << " and " << expr2
      << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n"
@@ -1599,57 +1656,6 @@ AssertionResult DoubleLE(const char* expr1, const char* expr2,
 
 namespace internal {
 
-// The helper function for {ASSERT|EXPECT}_EQ with int or enum
-// arguments.
-AssertionResult CmpHelperEQ(const char* lhs_expression,
-                            const char* rhs_expression,
-                            BiggestInt lhs,
-                            BiggestInt rhs) {
-  if (lhs == rhs) {
-    return AssertionSuccess();
-  }
-
-  return EqFailure(lhs_expression,
-                   rhs_expression,
-                   FormatForComparisonFailureMessage(lhs, rhs),
-                   FormatForComparisonFailureMessage(rhs, lhs),
-                   false);
-}
-
-// A macro for implementing the helper functions needed to implement
-// ASSERT_?? and EXPECT_?? with integer or enum arguments.  It is here
-// just to avoid copy-and-paste of similar code.
-#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
-AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
-                                   BiggestInt val1, BiggestInt val2) {\
-  if (val1 op val2) {\
-    return AssertionSuccess();\
-  } else {\
-    return AssertionFailure() \
-        << "Expected: (" << expr1 << ") " #op " (" << expr2\
-        << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\
-        << " vs " << FormatForComparisonFailureMessage(val2, val1);\
-  }\
-}
-
-// Implements the helper function for {ASSERT|EXPECT}_NE with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(NE, !=)
-// Implements the helper function for {ASSERT|EXPECT}_LE with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(LE, <=)
-// Implements the helper function for {ASSERT|EXPECT}_LT with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(LT, < )
-// Implements the helper function for {ASSERT|EXPECT}_GE with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(GE, >=)
-// Implements the helper function for {ASSERT|EXPECT}_GT with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(GT, > )
-
-#undef GTEST_IMPL_CMP_HELPER_
-
 // The helper function for {ASSERT|EXPECT}_STREQ.
 AssertionResult CmpHelperSTREQ(const char* lhs_expression,
                                const char* rhs_expression,
@@ -2123,8 +2129,13 @@ bool String::EndsWithCaseInsensitive(
 
 // Formats an int value as "%02d".
 std::string String::FormatIntWidth2(int value) {
+  return FormatIntWidthN(value, 2);
+}
+
+// Formats an int value to given width with leading zeros.
+std::string String::FormatIntWidthN(int value, int width) {
   std::stringstream ss;
-  ss << std::setfill('0') << std::setw(2) << value;
+  ss << std::setfill('0') << std::setw(width) << value;
   return ss.str();
 }
 
@@ -2176,7 +2187,9 @@ std::string AppendUserMessage(const std::string& gtest_msg,
   if (user_msg_string.empty()) {
     return gtest_msg;
  }
-
+  if (gtest_msg.empty()) {
+    return user_msg_string;
+  }
   return gtest_msg + "\n" + user_msg_string;
 }
 
@@ -2228,7 +2241,7 @@ void TestResult::RecordProperty(const std::string& xml_element,
   if (!ValidateTestProperty(xml_element, test_property)) {
     return;
   }
-  internal::MutexLock lock(&test_properites_mutex_);
+  internal::MutexLock lock(&test_properties_mutex_);
   const std::vector<TestProperty>::iterator property_with_matching_key =
       std::find_if(test_properties_.begin(), test_properties_.end(),
                    internal::TestPropertyKeyIs(test_property.key()));
@@ -2255,7 +2268,8 @@ static const char* const kReservedTestSuitesAttributes[] = {
 
 // The list of reserved attributes used in the <testsuite> element of XML
 // output.
 static const char* const kReservedTestSuiteAttributes[] = {
-    "disabled", "errors", "failures", "name", "tests", "time", "timestamp"};
+    "disabled", "errors", "failures", "name",
+    "tests",    "time",   "timestamp", "skipped"};
 
 // The list of reserved attributes used in the <testcase> element of XML
 // output.
 static const char* const kReservedTestCaseAttributes[] = {
@@ -2268,7 +2282,7 @@ static const char* const kReservedOutputTestCaseAttributes[] = {
     "classname",   "name", "status", "time",   "type_param",
     "value_param", "file", "line",   "result", "timestamp"};
 
-template <int kSize>
+template <size_t kSize>
 std::vector<std::string> ArrayAsVector(const char* const (&array)[kSize]) {
   return std::vector<std::string>(array, array + kSize);
 }
 
@@ -2712,6 +2726,7 @@ TestInfo::TestInfo(const std::string& a_test_suite_name,
       should_run_(false),
       is_disabled_(false),
       matches_filter_(false),
+      is_in_another_shard_(false),
       factory_(factory),
       result_() {}
 
@@ -2725,7 +2740,7 @@ namespace internal {
 //
 // Arguments:
 //
-//   test_suite_name: name of the test suite
+//   test_suite_name:  name of the test suite
 //   name:             name of the test
 //   type_param:       the name of the test's type parameter, or NULL if
 //                     this is not a typed or a type-parameterized test.
@@ -2827,7 +2842,8 @@ void TestInfo::Run() {
   // Notifies the unit test event listeners that a test is about to start.
   repeater->OnTestStart(*this);
 
-  const TimeInMillis start = internal::GetTimeInMillis();
+  result_.set_start_timestamp(internal::GetTimeInMillis());
+  internal::Timer timer;
 
   impl->os_stack_trace_getter()->UponLeavingGTest();
 
@@ -2852,8 +2868,7 @@ void TestInfo::Run() {
         test, &Test::DeleteSelf_, "the test fixture's destructor");
   }
 
-  result_.set_start_timestamp(start);
-  result_.set_elapsed_time(internal::GetTimeInMillis() - start);
+  result_.set_elapsed_time(timer.Elapsed());
 
   // Notifies the unit test event listener that a test has just finished.
   repeater->OnTestEnd(*this);
@@ -2863,6 +2878,28 @@ void TestInfo::Run() {
   impl->set_current_test_info(nullptr);
 }
 
+// Skip and records a skipped test result for this object.
+void TestInfo::Skip() {
+  if (!should_run_) return;
+
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_info(this);
+
+  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+
+  // Notifies the unit test event listeners that a test is about to start.
+  repeater->OnTestStart(*this);
+
+  const TestPartResult test_part_result =
+      TestPartResult(TestPartResult::kSkip, this->file(), this->line(), "");
+  impl->GetTestPartResultReporterForCurrentThread()->ReportTestPartResult(
+      test_part_result);
+
+  // Notifies the unit test event listener that a test has just finished.
+  repeater->OnTestEnd(*this);
+  impl->set_current_test_info(nullptr);
+}
+
 // class TestSuite
 
 // Gets the number of successful tests in this test suite.
@@ -2909,7 +2946,7 @@ int TestSuite::total_test_count() const {
 //
 // Arguments:
 //
-//   name:          name of the test suite
+//   a_name:        name of the test suite
 //   a_type_param:  the name of the test suite's type parameter, or NULL if
 //                  this is not a typed or a type-parameterized test suite.
// set_up_tc: pointer to the function that sets up the test suite @@ -2964,19 +3001,26 @@ void TestSuite::Run() { // Call both legacy and the new API repeater->OnTestSuiteStart(*this); // Legacy API is deprecated but still available -#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ repeater->OnTestCaseStart(*this); -#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ impl->os_stack_trace_getter()->UponLeavingGTest(); internal::HandleExceptionsInMethodIfSupported( this, &TestSuite::RunSetUpTestSuite, "SetUpTestSuite()"); start_timestamp_ = internal::GetTimeInMillis(); + internal::Timer timer; for (int i = 0; i < total_test_count(); i++) { GetMutableTestInfo(i)->Run(); + if (GTEST_FLAG(fail_fast) && GetMutableTestInfo(i)->result()->Failed()) { + for (int j = i + 1; j < total_test_count(); j++) { + GetMutableTestInfo(j)->Skip(); + } + break; + } } - elapsed_time_ = internal::GetTimeInMillis() - start_timestamp_; + elapsed_time_ = timer.Elapsed(); impl->os_stack_trace_getter()->UponLeavingGTest(); internal::HandleExceptionsInMethodIfSupported( @@ -2985,9 +3029,39 @@ void TestSuite::Run() { // Call both legacy and the new API repeater->OnTestSuiteEnd(*this); // Legacy API is deprecated but still available -#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ repeater->OnTestCaseEnd(*this); -#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + impl->set_current_test_suite(nullptr); +} + +// Skips all tests under this TestSuite. +void TestSuite::Skip() { + if (!should_run_) return; + + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + impl->set_current_test_suite(this); + + TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); + + // Call both legacy and the new API + repeater->OnTestSuiteStart(*this); +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + repeater->OnTestCaseStart(*this); +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + for (int i = 0; i < total_test_count(); i++) { + GetMutableTestInfo(i)->Skip(); + } + + // Call both legacy and the new API + repeater->OnTestSuiteEnd(*this); + // Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + repeater->OnTestCaseEnd(*this); +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ impl->set_current_test_suite(nullptr); } @@ -3039,7 +3113,7 @@ static std::string FormatTestSuiteCount(int test_suite_count) { static const char * TestPartResultTypeToString(TestPartResult::Type type) { switch (type) { case TestPartResult::kSkip: - return "Skipped"; + return "Skipped\n"; case TestPartResult::kSuccess: return "Success"; @@ -3056,6 +3130,9 @@ static const char * TestPartResultTypeToString(TestPartResult::Type type) { } namespace internal { +namespace { +enum class GTestColor { kDefault, kRed, kGreen, kYellow }; +} // namespace // Prints a TestPartResult to an std::string. static std::string PrintTestPartResultToString( @@ -3093,9 +3170,12 @@ static void PrintTestPartResult(const TestPartResult& test_part_result) { // Returns the character attribute for the given color. 
static WORD GetColorAttribute(GTestColor color) { switch (color) { - case COLOR_RED: return FOREGROUND_RED; - case COLOR_GREEN: return FOREGROUND_GREEN; - case COLOR_YELLOW: return FOREGROUND_RED | FOREGROUND_GREEN; + case GTestColor::kRed: + return FOREGROUND_RED; + case GTestColor::kGreen: + return FOREGROUND_GREEN; + case GTestColor::kYellow: + return FOREGROUND_RED | FOREGROUND_GREEN; default: return 0; } } @@ -3133,13 +3213,16 @@ static WORD GetNewColor(GTestColor color, WORD old_color_attrs) { #else -// Returns the ANSI color code for the given color. COLOR_DEFAULT is +// Returns the ANSI color code for the given color. GTestColor::kDefault is // an invalid input. static const char* GetAnsiColorCode(GTestColor color) { switch (color) { - case COLOR_RED: return "1"; - case COLOR_GREEN: return "2"; - case COLOR_YELLOW: return "3"; + case GTestColor::kRed: + return "1"; + case GTestColor::kGreen: + return "2"; + case GTestColor::kYellow: + return "3"; default: return nullptr; } @@ -3188,7 +3271,9 @@ bool ShouldUseColor(bool stdout_is_tty) { // cannot simply emit special characters and have the terminal change colors. // This routine must actually emit the characters rather than return a string // that would be colored when printed, as can be done on Linux. -void ColoredPrintf(GTestColor color, const char* fmt, ...) { + +GTEST_ATTRIBUTE_PRINTF_(2, 3) +static void ColoredPrintf(GTestColor color, const char *fmt, ...) { va_list args; va_start(args, fmt); @@ -3198,7 +3283,7 @@ void ColoredPrintf(GTestColor color, const char* fmt, ...) { #else static const bool in_color_mode = ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0); - const bool use_color = in_color_mode && (color != COLOR_DEFAULT); + const bool use_color = in_color_mode && (color != GTestColor::kDefault); #endif // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS if (!use_color) { @@ -3310,25 +3395,24 @@ void PrettyUnitTestResultPrinter::OnTestIterationStart( // Prints the filter if it's not *. This reminds the user that some // tests may be skipped. 
if (!String::CStringEquals(filter, kUniversalFilter)) { - ColoredPrintf(COLOR_YELLOW, - "Note: %s filter = %s\n", GTEST_NAME_, filter); + ColoredPrintf(GTestColor::kYellow, "Note: %s filter = %s\n", GTEST_NAME_, + filter); } if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) { const int32_t shard_index = Int32FromEnvOrDie(kTestShardIndex, -1); - ColoredPrintf(COLOR_YELLOW, - "Note: This is test shard %d of %s.\n", + ColoredPrintf(GTestColor::kYellow, "Note: This is test shard %d of %s.\n", static_cast(shard_index) + 1, internal::posix::GetEnv(kTestTotalShards)); } if (GTEST_FLAG(shuffle)) { - ColoredPrintf(COLOR_YELLOW, + ColoredPrintf(GTestColor::kYellow, "Note: Randomizing tests' orders with a seed of %d .\n", unit_test.random_seed()); } - ColoredPrintf(COLOR_GREEN, "[==========] "); + ColoredPrintf(GTestColor::kGreen, "[==========] "); printf("Running %s from %s.\n", FormatTestCount(unit_test.test_to_run_count()).c_str(), FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str()); @@ -3337,7 +3421,7 @@ void PrettyUnitTestResultPrinter::OnTestIterationStart( void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart( const UnitTest& /*unit_test*/) { - ColoredPrintf(COLOR_GREEN, "[----------] "); + ColoredPrintf(GTestColor::kGreen, "[----------] "); printf("Global test environment set-up.\n"); fflush(stdout); } @@ -3346,7 +3430,7 @@ void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart( void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) { const std::string counts = FormatCountableNoun(test_case.test_to_run_count(), "test", "tests"); - ColoredPrintf(COLOR_GREEN, "[----------] "); + ColoredPrintf(GTestColor::kGreen, "[----------] "); printf("%s from %s", counts.c_str(), test_case.name()); if (test_case.type_param() == nullptr) { printf("\n"); @@ -3360,7 +3444,7 @@ void PrettyUnitTestResultPrinter::OnTestSuiteStart( const TestSuite& test_suite) { const std::string counts = FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests"); - ColoredPrintf(COLOR_GREEN, "[----------] "); + ColoredPrintf(GTestColor::kGreen, "[----------] "); printf("%s from %s", counts.c_str(), test_suite.name()); if (test_suite.type_param() == nullptr) { printf("\n"); @@ -3372,7 +3456,7 @@ void PrettyUnitTestResultPrinter::OnTestSuiteStart( #endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) { - ColoredPrintf(COLOR_GREEN, "[ RUN ] "); + ColoredPrintf(GTestColor::kGreen, "[ RUN ] "); PrintTestName(test_info.test_suite_name(), test_info.name()); printf("\n"); fflush(stdout); @@ -3395,11 +3479,11 @@ void PrettyUnitTestResultPrinter::OnTestPartResult( void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) { if (test_info.result()->Passed()) { - ColoredPrintf(COLOR_GREEN, "[ OK ] "); + ColoredPrintf(GTestColor::kGreen, "[ OK ] "); } else if (test_info.result()->Skipped()) { - ColoredPrintf(COLOR_GREEN, "[ SKIPPED ] "); + ColoredPrintf(GTestColor::kGreen, "[ SKIPPED ] "); } else { - ColoredPrintf(COLOR_RED, "[ FAILED ] "); + ColoredPrintf(GTestColor::kRed, "[ FAILED ] "); } PrintTestName(test_info.test_suite_name(), test_info.name()); if (test_info.result()->Failed()) @@ -3420,7 +3504,7 @@ void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) { const std::string counts = FormatCountableNoun(test_case.test_to_run_count(), "test", "tests"); - ColoredPrintf(COLOR_GREEN, "[----------] "); + ColoredPrintf(GTestColor::kGreen, "[----------] "); printf("%s from %s 
(%s ms total)\n\n", counts.c_str(), test_case.name(), internal::StreamableToString(test_case.elapsed_time()).c_str()); fflush(stdout); @@ -3431,7 +3515,7 @@ void PrettyUnitTestResultPrinter::OnTestSuiteEnd(const TestSuite& test_suite) { const std::string counts = FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests"); - ColoredPrintf(COLOR_GREEN, "[----------] "); + ColoredPrintf(GTestColor::kGreen, "[----------] "); printf("%s from %s (%s ms total)\n\n", counts.c_str(), test_suite.name(), internal::StreamableToString(test_suite.elapsed_time()).c_str()); fflush(stdout); @@ -3440,7 +3524,7 @@ void PrettyUnitTestResultPrinter::OnTestSuiteEnd(const TestSuite& test_suite) { void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart( const UnitTest& /*unit_test*/) { - ColoredPrintf(COLOR_GREEN, "[----------] "); + ColoredPrintf(GTestColor::kGreen, "[----------] "); printf("Global test environment tear-down\n"); fflush(stdout); } @@ -3448,7 +3532,7 @@ void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart( // Internal helper for printing the list of failed tests. void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) { const int failed_test_count = unit_test.failed_test_count(); - ColoredPrintf(COLOR_RED, "[ FAILED ] "); + ColoredPrintf(GTestColor::kRed, "[ FAILED ] "); printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str()); for (int i = 0; i < unit_test.total_test_suite_count(); ++i) { @@ -3461,7 +3545,7 @@ void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) { if (!test_info.should_run() || !test_info.result()->Failed()) { continue; } - ColoredPrintf(COLOR_RED, "[ FAILED ] "); + ColoredPrintf(GTestColor::kRed, "[ FAILED ] "); printf("%s.%s", test_suite.name(), test_info.name()); PrintFullTestCommentIfPresent(test_info); printf("\n"); @@ -3482,7 +3566,7 @@ void PrettyUnitTestResultPrinter::PrintFailedTestSuites( continue; } if (test_suite.ad_hoc_test_result().Failed()) { - ColoredPrintf(COLOR_RED, "[ FAILED ] "); + ColoredPrintf(GTestColor::kRed, "[ FAILED ] "); printf("%s: SetUpTestSuite or TearDownTestSuite\n", test_suite.name()); ++suite_failure_count; } @@ -3510,7 +3594,7 @@ void PrettyUnitTestResultPrinter::PrintSkippedTests(const UnitTest& unit_test) { if (!test_info.should_run() || !test_info.result()->Skipped()) { continue; } - ColoredPrintf(COLOR_GREEN, "[ SKIPPED ] "); + ColoredPrintf(GTestColor::kGreen, "[ SKIPPED ] "); printf("%s.%s", test_suite.name(), test_info.name()); printf("\n"); } @@ -3519,7 +3603,7 @@ void PrettyUnitTestResultPrinter::PrintSkippedTests(const UnitTest& unit_test) { void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, int /*iteration*/) { - ColoredPrintf(COLOR_GREEN, "[==========] "); + ColoredPrintf(GTestColor::kGreen, "[==========] "); printf("%s from %s ran.", FormatTestCount(unit_test.test_to_run_count()).c_str(), FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str()); @@ -3528,12 +3612,12 @@ void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, internal::StreamableToString(unit_test.elapsed_time()).c_str()); } printf("\n"); - ColoredPrintf(COLOR_GREEN, "[ PASSED ] "); + ColoredPrintf(GTestColor::kGreen, "[ PASSED ] "); printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str()); const int skipped_test_count = unit_test.skipped_test_count(); if (skipped_test_count > 0) { - ColoredPrintf(COLOR_GREEN, "[ SKIPPED ] "); + ColoredPrintf(GTestColor::kGreen, "[ SKIPPED ] "); printf("%s, listed 
below:\n", FormatTestCount(skipped_test_count).c_str()); PrintSkippedTests(unit_test); } @@ -3548,10 +3632,8 @@ void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, if (unit_test.Passed()) { printf("\n"); // Add a spacer if no FAILURE banner is displayed. } - ColoredPrintf(COLOR_YELLOW, - " YOU HAVE %d DISABLED %s\n\n", - num_disabled, - num_disabled == 1 ? "TEST" : "TESTS"); + ColoredPrintf(GTestColor::kYellow, " YOU HAVE %d DISABLED %s\n\n", + num_disabled, num_disabled == 1 ? "TEST" : "TESTS"); } // Ensure that Google Test output is printed before, e.g., heapchecker output. fflush(stdout); @@ -3559,6 +3641,110 @@ void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, // End PrettyUnitTestResultPrinter +// This class implements the TestEventListener interface. +// +// Class BriefUnitTestResultPrinter is copyable. +class BriefUnitTestResultPrinter : public TestEventListener { + public: + BriefUnitTestResultPrinter() {} + static void PrintTestName(const char* test_suite, const char* test) { + printf("%s.%s", test_suite, test); + } + + // The following methods override what's in the TestEventListener class. + void OnTestProgramStart(const UnitTest& /*unit_test*/) override {} + void OnTestIterationStart(const UnitTest& /*unit_test*/, + int /*iteration*/) override {} + void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) override {} + void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) override {} +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestCaseStart(const TestCase& /*test_case*/) override {} +#else + void OnTestSuiteStart(const TestSuite& /*test_suite*/) override {} +#endif // OnTestCaseStart + + void OnTestStart(const TestInfo& /*test_info*/) override {} + + void OnTestPartResult(const TestPartResult& result) override; + void OnTestEnd(const TestInfo& test_info) override; +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestCaseEnd(const TestCase& /*test_case*/) override {} +#else + void OnTestSuiteEnd(const TestSuite& /*test_suite*/) override {} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) override {} + void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) override {} + void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override; + void OnTestProgramEnd(const UnitTest& /*unit_test*/) override {} +}; + +// Called after an assertion failure. +void BriefUnitTestResultPrinter::OnTestPartResult( + const TestPartResult& result) { + switch (result.type()) { + // If the test part succeeded, we don't need to do anything. + case TestPartResult::kSuccess: + return; + default: + // Print failure message from the assertion + // (e.g. expected this and got that). 
+ PrintTestPartResult(result); + fflush(stdout); + } +} + +void BriefUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) { + if (test_info.result()->Failed()) { + ColoredPrintf(GTestColor::kRed, "[ FAILED ] "); + PrintTestName(test_info.test_suite_name(), test_info.name()); + PrintFullTestCommentIfPresent(test_info); + + if (GTEST_FLAG(print_time)) { + printf(" (%s ms)\n", + internal::StreamableToString(test_info.result()->elapsed_time()) + .c_str()); + } else { + printf("\n"); + } + fflush(stdout); + } +} + +void BriefUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, + int /*iteration*/) { + ColoredPrintf(GTestColor::kGreen, "[==========] "); + printf("%s from %s ran.", + FormatTestCount(unit_test.test_to_run_count()).c_str(), + FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str()); + if (GTEST_FLAG(print_time)) { + printf(" (%s ms total)", + internal::StreamableToString(unit_test.elapsed_time()).c_str()); + } + printf("\n"); + ColoredPrintf(GTestColor::kGreen, "[ PASSED ] "); + printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str()); + + const int skipped_test_count = unit_test.skipped_test_count(); + if (skipped_test_count > 0) { + ColoredPrintf(GTestColor::kGreen, "[ SKIPPED ] "); + printf("%s.\n", FormatTestCount(skipped_test_count).c_str()); + } + + int num_disabled = unit_test.reportable_disabled_test_count(); + if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) { + if (unit_test.Passed()) { + printf("\n"); // Add a spacer if no FAILURE banner is displayed. + } + ColoredPrintf(GTestColor::kYellow, " YOU HAVE %d DISABLED %s\n\n", + num_disabled, num_disabled == 1 ? "TEST" : "TESTS"); + } + // Ensure that Google Test output is printed before, e.g., heapchecker output. + fflush(stdout); +} + +// End BriefUnitTestResultPrinter + // class TestEventRepeater // // This class forwards events to other event listeners. @@ -3742,6 +3928,16 @@ class XmlUnitTestResultPrinter : public EmptyTestEventListener { // Streams an XML CDATA section, escaping invalid CDATA sequences as needed. static void OutputXmlCDataSection(::std::ostream* stream, const char* data); + // Streams a test suite XML stanza containing the given test result. + // + // Requires: result.Failed() + static void OutputXmlTestSuiteForTestResult(::std::ostream* stream, + const TestResult& result); + + // Streams an XML representation of a TestResult object. + static void OutputXmlTestResult(::std::ostream* stream, + const TestResult& result); + // Streams an XML representation of a TestInfo object. static void OutputXmlTestInfo(::std::ostream* stream, const char* test_suite_name, @@ -3900,6 +4096,10 @@ static bool PortableLocaltime(time_t seconds, struct tm* out) { if (tm_ptr == nullptr) return false; *out = *tm_ptr; return true; +#elif defined(__STDC_LIB_EXT1__) + // Uses localtime_s when available as localtime_r is only available from + // C23 standard. 
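+  // (__STDC_LIB_EXT1__ advertises the C11 Annex K bounds-checked library,
+  // whose localtime_s() returns a struct tm* on success -- hence the
+  // nullptr check below -- unlike the errno_t-returning Microsoft variant.)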
+ return localtime_s(&seconds, out) != nullptr; #else return localtime_r(&seconds, out) != nullptr; #endif @@ -3911,13 +4111,14 @@ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) { struct tm time_struct; if (!PortableLocaltime(static_cast(ms / 1000), &time_struct)) return ""; - // YYYY-MM-DDThh:mm:ss + // YYYY-MM-DDThh:mm:ss.sss return StreamableToString(time_struct.tm_year + 1900) + "-" + String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" + String::FormatIntWidth2(time_struct.tm_mday) + "T" + String::FormatIntWidth2(time_struct.tm_hour) + ":" + String::FormatIntWidth2(time_struct.tm_min) + ":" + - String::FormatIntWidth2(time_struct.tm_sec); + String::FormatIntWidth2(time_struct.tm_sec) + "." + + String::FormatIntWidthN(static_cast(ms % 1000), 3); } // Streams an XML CDATA section, escaping invalid CDATA sequences as needed. @@ -3956,6 +4157,43 @@ void XmlUnitTestResultPrinter::OutputXmlAttribute( *stream << " " << name << "=\"" << EscapeXmlAttribute(value) << "\""; } +// Streams a test suite XML stanza containing the given test result. +void XmlUnitTestResultPrinter::OutputXmlTestSuiteForTestResult( + ::std::ostream* stream, const TestResult& result) { + // Output the boilerplate for a minimal test suite with one test. + *stream << " "; + + // Output the boilerplate for a minimal test case with a single test. + *stream << " \n"; +} + // Prints an XML representation of a TestInfo object. void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream, const char* test_suite_name, @@ -3999,11 +4237,17 @@ void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream, FormatEpochTimeInMillisAsIso8601(result.start_timestamp())); OutputXmlAttribute(stream, kTestsuite, "classname", test_suite_name); + OutputXmlTestResult(stream, result); +} + +void XmlUnitTestResultPrinter::OutputXmlTestResult(::std::ostream* stream, + const TestResult& result) { int failures = 0; + int skips = 0; for (int i = 0; i < result.total_part_count(); ++i) { const TestPartResult& part = result.GetTestPartResult(i); if (part.failed()) { - if (++failures == 1) { + if (++failures == 1 && skips == 0) { *stream << ">\n"; } const std::string location = @@ -4011,18 +4255,31 @@ void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream, part.line_number()); const std::string summary = location + "\n" + part.summary(); *stream << " "; const std::string detail = location + "\n" + part.message(); OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str()); *stream << "\n"; + } else if (part.skipped()) { + if (++skips == 1 && failures == 0) { + *stream << ">\n"; + } + const std::string location = + internal::FormatCompilerIndependentFileLocation(part.file_name(), + part.line_number()); + const std::string summary = location + "\n" + part.summary(); + *stream << " "; + const std::string detail = location + "\n" + part.message(); + OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str()); + *stream << "\n"; } } - if (failures == 0 && result.test_property_count() == 0) { + if (failures == 0 && skips == 0 && result.test_property_count() == 0) { *stream << " />\n"; } else { - if (failures == 0) { + if (failures == 0 && skips == 0) { *stream << ">\n"; } OutputXmlTestProperties(stream, result); @@ -4044,7 +4301,11 @@ void XmlUnitTestResultPrinter::PrintXmlTestSuite(std::ostream* stream, OutputXmlAttribute( stream, kTestsuite, "disabled", StreamableToString(test_suite.reportable_disabled_test_count())); + OutputXmlAttribute(stream, kTestsuite, "skipped", + 
StreamableToString(test_suite.skipped_test_count())); + OutputXmlAttribute(stream, kTestsuite, "errors", "0"); + OutputXmlAttribute(stream, kTestsuite, "time", FormatTimeInMillisAsSeconds(test_suite.elapsed_time())); OutputXmlAttribute( @@ -4095,6 +4356,13 @@ void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream, if (unit_test.GetTestSuite(i)->reportable_test_count() > 0) PrintXmlTestSuite(stream, *unit_test.GetTestSuite(i)); } + + // If there was a test failure outside of one of the test suites (like in a + // test environment) include that in the output. + if (unit_test.ad_hoc_test_result().Failed()) { + OutputXmlTestSuiteForTestResult(stream, unit_test.ad_hoc_test_result()); + } + *stream << "\n"; } @@ -4185,6 +4453,16 @@ class JsonUnitTestResultPrinter : public EmptyTestEventListener { const std::string& indent, bool comma = true); + // Streams a test suite JSON stanza containing the given test result. + // + // Requires: result.Failed() + static void OutputJsonTestSuiteForTestResult(::std::ostream* stream, + const TestResult& result); + + // Streams a JSON representation of a TestResult object. + static void OutputJsonTestResult(::std::ostream* stream, + const TestResult& result); + // Streams a JSON representation of a TestInfo object. static void OutputJsonTestInfo(::std::ostream* stream, const char* test_suite_name, @@ -4335,6 +4613,48 @@ void JsonUnitTestResultPrinter::OutputJsonKey( *stream << ",\n"; } +// Streams a test suite JSON stanza containing the given test result. +void JsonUnitTestResultPrinter::OutputJsonTestSuiteForTestResult( + ::std::ostream* stream, const TestResult& result) { + // Output the boilerplate for a new test suite. + *stream << Indent(4) << "{\n"; + OutputJsonKey(stream, "testsuite", "name", "NonTestSuiteFailure", Indent(6)); + OutputJsonKey(stream, "testsuite", "tests", 1, Indent(6)); + if (!GTEST_FLAG(list_tests)) { + OutputJsonKey(stream, "testsuite", "failures", 1, Indent(6)); + OutputJsonKey(stream, "testsuite", "disabled", 0, Indent(6)); + OutputJsonKey(stream, "testsuite", "skipped", 0, Indent(6)); + OutputJsonKey(stream, "testsuite", "errors", 0, Indent(6)); + OutputJsonKey(stream, "testsuite", "time", + FormatTimeInMillisAsDuration(result.elapsed_time()), + Indent(6)); + OutputJsonKey(stream, "testsuite", "timestamp", + FormatEpochTimeInMillisAsRFC3339(result.start_timestamp()), + Indent(6)); + } + *stream << Indent(6) << "\"testsuite\": [\n"; + + // Output the boilerplate for a new test case. + *stream << Indent(8) << "{\n"; + OutputJsonKey(stream, "testcase", "name", "", Indent(10)); + OutputJsonKey(stream, "testcase", "status", "RUN", Indent(10)); + OutputJsonKey(stream, "testcase", "result", "COMPLETED", Indent(10)); + OutputJsonKey(stream, "testcase", "timestamp", + FormatEpochTimeInMillisAsRFC3339(result.start_timestamp()), + Indent(10)); + OutputJsonKey(stream, "testcase", "time", + FormatTimeInMillisAsDuration(result.elapsed_time()), + Indent(10)); + OutputJsonKey(stream, "testcase", "classname", "", Indent(10), false); + *stream << TestPropertiesAsJson(result, Indent(10)); + + // Output the actual test result. + OutputJsonTestResult(stream, result); + + // Finish the test suite. + *stream << "\n" << Indent(6) << "]\n" << Indent(4) << "}"; +} + // Prints a JSON representation of a TestInfo object. 
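// For reference, the NonTestSuiteFailure stanza assembled above serializes
// roughly as follows (field values are illustrative, not from a real run):
//
//   { "name": "NonTestSuiteFailure", "tests": 1, "failures": 1,
//     "disabled": 0, "skipped": 0, "errors": 0, "time": "0.005s",
//     "timestamp": "2020-01-01T00:00:00Z",
//     "testsuite": [ { "name": "", "status": "RUN", "result": "COMPLETED",
//                      ... } ] }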
void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream* stream, const char* test_suite_name, @@ -4377,6 +4697,13 @@ void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream* stream, false); *stream << TestPropertiesAsJson(result, kIndent); + OutputJsonTestResult(stream, result); +} + +void JsonUnitTestResultPrinter::OutputJsonTestResult(::std::ostream* stream, + const TestResult& result) { + const std::string kIndent = Indent(10); + int failures = 0; for (int i = 0; i < result.total_part_count(); ++i) { const TestPartResult& part = result.GetTestPartResult(i); @@ -4487,6 +4814,12 @@ void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream* stream, } } + // If there was a test failure outside of one of the test suites (like in a + // test environment) include that in the output. + if (unit_test.ad_hoc_test_result().Failed()) { + OutputJsonTestSuiteForTestResult(stream, unit_test.ad_hoc_test_result()); + } + *stream << "\n" << kIndent << "]\n" << "}\n"; } @@ -5309,6 +5642,10 @@ void UnitTestImpl::PostFlagParsingInit() { // to shut down the default XML output before invoking RUN_ALL_TESTS. ConfigureXmlOutput(); + if (GTEST_FLAG(brief)) { + listeners()->SetDefaultResultPrinter(new BriefUnitTestResultPrinter); + } + #if GTEST_CAN_STREAM_RESULTS_ // Configures listeners for streaming test results to the specified server. ConfigureStreamingOutput(); @@ -5354,10 +5691,10 @@ class TestSuiteNameIs { // Arguments: // // test_suite_name: name of the test suite -// type_param: the name of the test suite's type parameter, or NULL if -// this is not a typed or a type-parameterized test suite. -// set_up_tc: pointer to the function that sets up the test suite -// tear_down_tc: pointer to the function that tears down the test suite +// type_param: the name of the test suite's type parameter, or NULL if +// this is not a typed or a type-parameterized test suite. +// set_up_tc: pointer to the function that sets up the test suite +// tear_down_tc: pointer to the function that tears down the test suite TestSuite* UnitTestImpl::GetTestSuite( const char* test_suite_name, const char* type_param, internal::SetUpTestSuiteFunc set_up_tc, @@ -5475,7 +5812,7 @@ bool UnitTestImpl::RunAllTests() { // assertions executed before RUN_ALL_TESTS(). ClearNonAdHocTestResult(); - const TimeInMillis start = GetTimeInMillis(); + Timer timer; // Shuffles test suites and tests if requested. if (has_tests_to_run && GTEST_FLAG(shuffle)) { @@ -5516,6 +5853,21 @@ bool UnitTestImpl::RunAllTests() { for (int test_index = 0; test_index < total_test_suite_count(); test_index++) { GetMutableSuiteCase(test_index)->Run(); + if (GTEST_FLAG(fail_fast) && + GetMutableSuiteCase(test_index)->Failed()) { + for (int j = test_index + 1; j < total_test_suite_count(); j++) { + GetMutableSuiteCase(j)->Skip(); + } + break; + } + } + } else if (Test::HasFatalFailure()) { + // If there was a fatal failure during the global setup then we know we + // aren't going to run any tests. Explicitly mark all of the tests as + // skipped to make this obvious in the output. + for (int test_index = 0; test_index < total_test_suite_count(); + test_index++) { + GetMutableSuiteCase(test_index)->Skip(); } } @@ -5526,7 +5878,7 @@ bool UnitTestImpl::RunAllTests() { repeater->OnEnvironmentsTearDownEnd(*parent_); } - elapsed_time_ = GetTimeInMillis() - start; + elapsed_time_ = timer.Elapsed(); // Tells the unit test event listener that the tests have just finished. 
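// (The fail_fast early exits above are wired to --gtest_fail_fast through
// kFailFast in ParseGoogleTestFlag() further below.)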
repeater->OnTestIterationEnd(*parent_, i); @@ -5554,14 +5906,14 @@ bool UnitTestImpl::RunAllTests() { if (!gtest_is_initialized_before_run_all_tests) { ColoredPrintf( - COLOR_RED, + GTestColor::kRed, "\nIMPORTANT NOTICE - DO NOT IGNORE:\n" "This test program did NOT call " GTEST_INIT_GOOGLE_TEST_NAME_ "() before calling RUN_ALL_TESTS(). This is INVALID. Soon " GTEST_NAME_ " will start to enforce the valid usage. " "Please fix it ASAP, or IT WILL START TO FAIL.\n"); // NOLINT #if GTEST_FOR_GOOGLE_ - ColoredPrintf(COLOR_RED, + ColoredPrintf(GTestColor::kRed, "For more details, see http://wiki/Main/ValidGUnitMain.\n"); #endif // GTEST_FOR_GOOGLE_ } @@ -5578,7 +5930,7 @@ void WriteToShardStatusFileIfNeeded() { if (test_shard_file != nullptr) { FILE* const file = posix::FOpen(test_shard_file, "w"); if (file == nullptr) { - ColoredPrintf(COLOR_RED, + ColoredPrintf(GTestColor::kRed, "Could not write to the test shard status file \"%s\" " "specified by the %s environment variable.\n", test_shard_file, kTestShardStatusFile); @@ -5612,7 +5964,7 @@ bool ShouldShard(const char* total_shards_env, << "Invalid environment variables: you have " << kTestShardIndex << " = " << shard_index << ", but have left " << kTestTotalShards << " unset.\n"; - ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str()); + ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str()); fflush(stdout); exit(EXIT_FAILURE); } else if (total_shards != -1 && shard_index == -1) { @@ -5620,7 +5972,7 @@ bool ShouldShard(const char* total_shards_env, << "Invalid environment variables: you have " << kTestTotalShards << " = " << total_shards << ", but have left " << kTestShardIndex << " unset.\n"; - ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str()); + ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str()); fflush(stdout); exit(EXIT_FAILURE); } else if (shard_index < 0 || shard_index >= total_shards) { @@ -5629,7 +5981,7 @@ bool ShouldShard(const char* total_shards_env, << kTestShardIndex << " < " << kTestTotalShards << ", but you have " << kTestShardIndex << "=" << shard_index << ", " << kTestTotalShards << "=" << total_shards << ".\n"; - ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str()); + ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str()); fflush(stdout); exit(EXIT_FAILURE); } @@ -6019,7 +6371,7 @@ static bool HasGoogleTestFlagPrefix(const char* str) { // @D changes to the default terminal text color. // static void PrintColorEncoded(const char* str) { - GTestColor color = COLOR_DEFAULT; // The current color. + GTestColor color = GTestColor::kDefault; // The current color. // Conceptually, we split the string into segments divided by escape // sequences. Then we print one segment at a time. At the end of @@ -6039,13 +6391,13 @@ static void PrintColorEncoded(const char* str) { if (ch == '@') { ColoredPrintf(color, "@"); } else if (ch == 'D') { - color = COLOR_DEFAULT; + color = GTestColor::kDefault; } else if (ch == 'R') { - color = COLOR_RED; + color = GTestColor::kRed; } else if (ch == 'G') { - color = COLOR_GREEN; + color = GTestColor::kGreen; } else if (ch == 'Y') { - color = COLOR_YELLOW; + color = GTestColor::kYellow; } else { --str; } @@ -6053,98 +6405,126 @@ static void PrintColorEncoded(const char* str) { } static const char kColorEncodedHelpMessage[] = -"This program contains tests written using " GTEST_NAME_ ". 
You can use the\n" -"following command line flags to control its behavior:\n" -"\n" -"Test Selection:\n" -" @G--" GTEST_FLAG_PREFIX_ "list_tests@D\n" -" List the names of all tests instead of running them. The name of\n" -" TEST(Foo, Bar) is \"Foo.Bar\".\n" -" @G--" GTEST_FLAG_PREFIX_ "filter=@YPOSTIVE_PATTERNS" + "This program contains tests written using " GTEST_NAME_ + ". You can use the\n" + "following command line flags to control its behavior:\n" + "\n" + "Test Selection:\n" + " @G--" GTEST_FLAG_PREFIX_ + "list_tests@D\n" + " List the names of all tests instead of running them. The name of\n" + " TEST(Foo, Bar) is \"Foo.Bar\".\n" + " @G--" GTEST_FLAG_PREFIX_ + "filter=@YPOSITIVE_PATTERNS" "[@G-@YNEGATIVE_PATTERNS]@D\n" -" Run only the tests whose name matches one of the positive patterns but\n" -" none of the negative patterns. '?' matches any single character; '*'\n" -" matches any substring; ':' separates two patterns.\n" -" @G--" GTEST_FLAG_PREFIX_ "also_run_disabled_tests@D\n" -" Run all disabled tests too.\n" -"\n" -"Test Execution:\n" -" @G--" GTEST_FLAG_PREFIX_ "repeat=@Y[COUNT]@D\n" -" Run the tests repeatedly; use a negative count to repeat forever.\n" -" @G--" GTEST_FLAG_PREFIX_ "shuffle@D\n" -" Randomize tests' orders on every iteration.\n" -" @G--" GTEST_FLAG_PREFIX_ "random_seed=@Y[NUMBER]@D\n" -" Random number seed to use for shuffling test orders (between 1 and\n" -" 99999, or 0 to use a seed based on the current time).\n" -"\n" -"Test Output:\n" -" @G--" GTEST_FLAG_PREFIX_ "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n" -" Enable/disable colored output. The default is @Gauto@D.\n" -" -@G-" GTEST_FLAG_PREFIX_ "print_time=0@D\n" -" Don't print the elapsed time of each test.\n" -" @G--" GTEST_FLAG_PREFIX_ "output=@Y(@Gjson@Y|@Gxml@Y)[@G:@YDIRECTORY_PATH@G" - GTEST_PATH_SEP_ "@Y|@G:@YFILE_PATH]@D\n" -" Generate a JSON or XML report in the given directory or with the given\n" -" file name. @YFILE_PATH@D defaults to @Gtest_detail.xml@D.\n" + " Run only the tests whose name matches one of the positive patterns " + "but\n" + " none of the negative patterns. '?' matches any single character; " + "'*'\n" + " matches any substring; ':' separates two patterns.\n" + " @G--" GTEST_FLAG_PREFIX_ + "also_run_disabled_tests@D\n" + " Run all disabled tests too.\n" + "\n" + "Test Execution:\n" + " @G--" GTEST_FLAG_PREFIX_ + "repeat=@Y[COUNT]@D\n" + " Run the tests repeatedly; use a negative count to repeat forever.\n" + " @G--" GTEST_FLAG_PREFIX_ + "shuffle@D\n" + " Randomize tests' orders on every iteration.\n" + " @G--" GTEST_FLAG_PREFIX_ + "random_seed=@Y[NUMBER]@D\n" + " Random number seed to use for shuffling test orders (between 1 and\n" + " 99999, or 0 to use a seed based on the current time).\n" + "\n" + "Test Output:\n" + " @G--" GTEST_FLAG_PREFIX_ + "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n" + " Enable/disable colored output. The default is @Gauto@D.\n" + " @G--" GTEST_FLAG_PREFIX_ + "brief=1@D\n" + " Only print test failures.\n" + " @G--" GTEST_FLAG_PREFIX_ + "print_time=0@D\n" + " Don't print the elapsed time of each test.\n" + " @G--" GTEST_FLAG_PREFIX_ + "output=@Y(@Gjson@Y|@Gxml@Y)[@G:@YDIRECTORY_PATH@G" GTEST_PATH_SEP_ + "@Y|@G:@YFILE_PATH]@D\n" + " Generate a JSON or XML report in the given directory or with the " + "given\n" + " file name. 
@YFILE_PATH@D defaults to @Gtest_detail.xml@D.\n" # if GTEST_CAN_STREAM_RESULTS_ -" @G--" GTEST_FLAG_PREFIX_ "stream_result_to=@YHOST@G:@YPORT@D\n" -" Stream test results to the given server.\n" + " @G--" GTEST_FLAG_PREFIX_ + "stream_result_to=@YHOST@G:@YPORT@D\n" + " Stream test results to the given server.\n" # endif // GTEST_CAN_STREAM_RESULTS_ -"\n" -"Assertion Behavior:\n" + "\n" + "Assertion Behavior:\n" # if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS -" @G--" GTEST_FLAG_PREFIX_ "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n" -" Set the default death test style.\n" + " @G--" GTEST_FLAG_PREFIX_ + "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n" + " Set the default death test style.\n" # endif // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS -" @G--" GTEST_FLAG_PREFIX_ "break_on_failure@D\n" -" Turn assertion failures into debugger break-points.\n" -" @G--" GTEST_FLAG_PREFIX_ "throw_on_failure@D\n" -" Turn assertion failures into C++ exceptions for use by an external\n" -" test framework.\n" -" @G--" GTEST_FLAG_PREFIX_ "catch_exceptions=0@D\n" -" Do not report exceptions as test failures. Instead, allow them\n" -" to crash the program or throw a pop-up (on Windows).\n" -"\n" -"Except for @G--" GTEST_FLAG_PREFIX_ "list_tests@D, you can alternatively set " + " @G--" GTEST_FLAG_PREFIX_ + "break_on_failure@D\n" + " Turn assertion failures into debugger break-points.\n" + " @G--" GTEST_FLAG_PREFIX_ + "throw_on_failure@D\n" + " Turn assertion failures into C++ exceptions for use by an external\n" + " test framework.\n" + " @G--" GTEST_FLAG_PREFIX_ + "catch_exceptions=0@D\n" + " Do not report exceptions as test failures. Instead, allow them\n" + " to crash the program or throw a pop-up (on Windows).\n" + "\n" + "Except for @G--" GTEST_FLAG_PREFIX_ + "list_tests@D, you can alternatively set " "the corresponding\n" -"environment variable of a flag (all letters in upper-case). For example, to\n" -"disable colored text output, you can either specify @G--" GTEST_FLAG_PREFIX_ + "environment variable of a flag (all letters in upper-case). For example, " + "to\n" + "disable colored text output, you can either specify " + "@G--" GTEST_FLAG_PREFIX_ "color=no@D or set\n" -"the @G" GTEST_FLAG_PREFIX_UPPER_ "COLOR@D environment variable to @Gno@D.\n" -"\n" -"For more information, please read the " GTEST_NAME_ " documentation at\n" -"@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_ "\n" -"(not one in your own code or tests), please report it to\n" -"@G<" GTEST_DEV_EMAIL_ ">@D.\n"; + "the @G" GTEST_FLAG_PREFIX_UPPER_ + "COLOR@D environment variable to @Gno@D.\n" + "\n" + "For more information, please read the " GTEST_NAME_ + " documentation at\n" + "@G" GTEST_PROJECT_URL_ "@D. 
If you find a bug in " GTEST_NAME_ + "\n" + "(not one in your own code or tests), please report it to\n" + "@G<" GTEST_DEV_EMAIL_ ">@D.\n"; static bool ParseGoogleTestFlag(const char* const arg) { return ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag, &GTEST_FLAG(also_run_disabled_tests)) || - ParseBoolFlag(arg, kBreakOnFailureFlag, - &GTEST_FLAG(break_on_failure)) || - ParseBoolFlag(arg, kCatchExceptionsFlag, - &GTEST_FLAG(catch_exceptions)) || - ParseStringFlag(arg, kColorFlag, &GTEST_FLAG(color)) || - ParseStringFlag(arg, kDeathTestStyleFlag, - &GTEST_FLAG(death_test_style)) || - ParseBoolFlag(arg, kDeathTestUseFork, - &GTEST_FLAG(death_test_use_fork)) || - ParseStringFlag(arg, kFilterFlag, &GTEST_FLAG(filter)) || - ParseStringFlag(arg, kInternalRunDeathTestFlag, - &GTEST_FLAG(internal_run_death_test)) || - ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) || - ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) || - ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) || - ParseBoolFlag(arg, kPrintUTF8Flag, &GTEST_FLAG(print_utf8)) || - ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) || - ParseInt32Flag(arg, kRepeatFlag, &GTEST_FLAG(repeat)) || - ParseBoolFlag(arg, kShuffleFlag, &GTEST_FLAG(shuffle)) || - ParseInt32Flag(arg, kStackTraceDepthFlag, - &GTEST_FLAG(stack_trace_depth)) || - ParseStringFlag(arg, kStreamResultToFlag, - &GTEST_FLAG(stream_result_to)) || - ParseBoolFlag(arg, kThrowOnFailureFlag, - &GTEST_FLAG(throw_on_failure)); + ParseBoolFlag(arg, kBreakOnFailureFlag, + &GTEST_FLAG(break_on_failure)) || + ParseBoolFlag(arg, kCatchExceptionsFlag, + &GTEST_FLAG(catch_exceptions)) || + ParseStringFlag(arg, kColorFlag, &GTEST_FLAG(color)) || + ParseStringFlag(arg, kDeathTestStyleFlag, + &GTEST_FLAG(death_test_style)) || + ParseBoolFlag(arg, kDeathTestUseFork, + &GTEST_FLAG(death_test_use_fork)) || + ParseBoolFlag(arg, kFailFast, &GTEST_FLAG(fail_fast)) || + ParseStringFlag(arg, kFilterFlag, &GTEST_FLAG(filter)) || + ParseStringFlag(arg, kInternalRunDeathTestFlag, + &GTEST_FLAG(internal_run_death_test)) || + ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) || + ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) || + ParseBoolFlag(arg, kBriefFlag, &GTEST_FLAG(brief)) || + ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) || + ParseBoolFlag(arg, kPrintUTF8Flag, &GTEST_FLAG(print_utf8)) || + ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) || + ParseInt32Flag(arg, kRepeatFlag, &GTEST_FLAG(repeat)) || + ParseBoolFlag(arg, kShuffleFlag, &GTEST_FLAG(shuffle)) || + ParseInt32Flag(arg, kStackTraceDepthFlag, + &GTEST_FLAG(stack_trace_depth)) || + ParseStringFlag(arg, kStreamResultToFlag, + &GTEST_FLAG(stream_result_to)) || + ParseBoolFlag(arg, kThrowOnFailureFlag, &GTEST_FLAG(throw_on_failure)); } #if GTEST_USE_OWN_FLAGFILE_FLAG_ @@ -6314,24 +6694,31 @@ void InitGoogleTest() { std::string TempDir() { #if defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_) return GTEST_CUSTOM_TEMPDIR_FUNCTION_(); -#endif - -#if GTEST_OS_WINDOWS_MOBILE +#elif GTEST_OS_WINDOWS_MOBILE return "\\temp\\"; #elif GTEST_OS_WINDOWS const char* temp_dir = internal::posix::GetEnv("TEMP"); - if (temp_dir == nullptr || temp_dir[0] == '\0') + if (temp_dir == nullptr || temp_dir[0] == '\0') { return "\\temp\\"; - else if (temp_dir[strlen(temp_dir) - 1] == '\\') + } else if (temp_dir[strlen(temp_dir) - 1] == '\\') { return temp_dir; - else + } else { return std::string(temp_dir) + "\\"; + } #elif GTEST_OS_LINUX_ANDROID const char* temp_dir = internal::posix::GetEnv("TEST_TMPDIR"); - if (temp_dir == nullptr || temp_dir[0] == '\0') + if
(temp_dir == nullptr || temp_dir[0] == '\0') { return "/data/local/tmp/"; - else + } else { return temp_dir; + } +#elif GTEST_OS_LINUX + const char* temp_dir = internal::posix::GetEnv("TEST_TMPDIR"); + if (temp_dir == nullptr || temp_dir[0] == '\0') { + return "/tmp/"; + } else { + return temp_dir; + } #else return "/tmp/"; #endif // GTEST_OS_WINDOWS_MOBILE From 90749e866308a0667c0d9afcf90244e5c6c95c0a Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 14 Apr 2022 13:08:58 -0700 Subject: [PATCH 257/926] temporal_filter_sse4,cosmetics: fix some typos Change-Id: If8318068a32da52d15c0ba595f80092611f4c847 --- vp9/encoder/x86/temporal_filter_sse4.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vp9/encoder/x86/temporal_filter_sse4.c b/vp9/encoder/x86/temporal_filter_sse4.c index bdbd66051d..87e68fb438 100644 --- a/vp9/encoder/x86/temporal_filter_sse4.c +++ b/vp9/encoder/x86/temporal_filter_sse4.c @@ -460,7 +460,7 @@ static void vp9_apply_temporal_filter_luma( if (block_width == 16) { // Special Case: The blockwidth is 16 and we are operating on a row of 16 - // chroma pixels. In this case, we can't use the usualy left-midle-right + // chroma pixels. In this case, we can't use the usual left-middle-right // pattern. We also don't support splitting now. neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; @@ -553,7 +553,7 @@ static void vp9_apply_temporal_filter_chroma_8( // Loop variable unsigned int h; - // Initilize weight + // Initialize weight if (blk_fw) { weight = _mm_setr_epi16(blk_fw[0], blk_fw[0], blk_fw[0], blk_fw[0], blk_fw[1], blk_fw[1], blk_fw[1], blk_fw[1]); @@ -827,12 +827,12 @@ void vp9_apply_temporal_filter_sse4_1( assert( (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) && "subblock filter weight must be positive"); - assert(blk_fw[0] <= 2 && "sublock filter weight must be less than 2"); + assert(blk_fw[0] <= 2 && "subblock filter weight must be less than 2"); assert( (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) && "subblock filter weight must be less than 2"); - // Precompute the difference sqaured + // Precompute the difference squared for (row = 0; row < block_height; row++) { for (blk_col = 0; blk_col < block_width; blk_col += 16) { store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col, From 45fb0161b0bce849f2c38aba0777b702740ccc92 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 18 Apr 2022 18:56:49 -0700 Subject: [PATCH 258/926] vp9_alloccommon: add missing pointer checks in vp9_free_ref_frame_buffers() and vp9_free_context_buffers(); pool and free_mi may be NULL due to earlier allocation failures Change-Id: I3bd26ea29b3aea6c58f33d5b7f5a280eb6250ec7 --- vp9/common/vp9_alloccommon.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index 5702dca718..faad657a08 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -73,6 +73,8 @@ static void free_seg_map(VP9_COMMON *cm) { void vp9_free_ref_frame_buffers(BufferPool *pool) { int i; + if (!pool) return; + for (i = 0; i < FRAME_BUFFERS; ++i) { if (!pool->frame_bufs[i].released && pool->frame_bufs[i].raw_frame_buffer.data != NULL) { @@ -100,7 +102,7 @@ void vp9_free_postproc_buffers(VP9_COMMON *cm) { } void vp9_free_context_buffers(VP9_COMMON *cm) { - cm->free_mi(cm); + if (cm->free_mi) cm->free_mi(cm); free_seg_map(cm); vpx_free(cm->above_context); cm->above_context = NULL; From 
0ca5af7e24f5a8927016d6932a665acc762639a6 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 18 Apr 2022 18:57:51 -0700 Subject: [PATCH 259/926] vp9_alloc_internal_frame_buffers: fix num buffers assignment avoid setting num_internal_frame_buffers until the allocation is checked, avoiding an invalid access in vp9_free_internal_frame_buffers() Change-Id: I28a544a2553d62a6b5cb7c45bf10591caa4ebab6 --- vp9/common/vp9_frame_buffers.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/vp9/common/vp9_frame_buffers.c b/vp9/common/vp9_frame_buffers.c index a254e79d20..889b809e50 100644 --- a/vp9/common/vp9_frame_buffers.c +++ b/vp9/common/vp9_frame_buffers.c @@ -14,14 +14,17 @@ #include "vpx_mem/vpx_mem.h" int vp9_alloc_internal_frame_buffers(InternalFrameBufferList *list) { + const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; assert(list != NULL); vp9_free_internal_frame_buffers(list); - list->num_internal_frame_buffers = - VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; - list->int_fb = (InternalFrameBuffer *)vpx_calloc( - list->num_internal_frame_buffers, sizeof(*list->int_fb)); - return (list->int_fb == NULL); + list->int_fb = + (InternalFrameBuffer *)vpx_calloc(num_buffers, sizeof(*list->int_fb)); + if (list->int_fb) { + list->num_internal_frame_buffers = num_buffers; + return 0; + } + return -1; } void vp9_free_internal_frame_buffers(InternalFrameBufferList *list) { @@ -35,6 +38,7 @@ void vp9_free_internal_frame_buffers(InternalFrameBufferList *list) { } vpx_free(list->int_fb); list->int_fb = NULL; + list->num_internal_frame_buffers = 0; } int vp9_get_frame_buffer(void *cb_priv, size_t min_size, From f1d42a92bbb98ab4481f85716339a96914369e6a Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 18 Apr 2022 18:59:57 -0700 Subject: [PATCH 260/926] vp9_encoder: check context buffer allocations previously the returns for alloc_context_buffers_ext() and vp9_alloc_context_buffers() were ignored which would result in a NULL access during encoding should they fail Change-Id: Icd76576f3d5f8d57697adc9ae926a3a5be731327 --- vp9/encoder/vp9_encoder.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 84ab80fe3f..ec6a756197 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1379,21 +1379,22 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) { #endif } -static int alloc_context_buffers_ext(VP9_COMP *cpi) { +static void alloc_context_buffers_ext(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; int mi_size = cm->mi_cols * cm->mi_rows; - cpi->mbmi_ext_base = vpx_calloc(mi_size, sizeof(*cpi->mbmi_ext_base)); - if (!cpi->mbmi_ext_base) return 1; - - return 0; + CHECK_MEM_ERROR(cm, cpi->mbmi_ext_base, + vpx_calloc(mi_size, sizeof(*cpi->mbmi_ext_base))); } static void alloc_compressor_data(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; int sb_rows; - vp9_alloc_context_buffers(cm, cm->width, cm->height); + if (vp9_alloc_context_buffers(cm, cm->width, cm->height)) { + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate context buffers"); + } alloc_context_buffers_ext(cpi); From 6ea4ef1d24f84d131e0a4398bf358bfd79bc88c3 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 18 Apr 2022 19:07:39 -0700 Subject: [PATCH 261/926] vp9_dx_iface,init_buffer_callbacks: return on alloc failure use an error code as a jmp target is not currently set in init_decoder() Change-Id: If7798039439f13c739298a8a92a55aaa24e2210c --- 
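Note: vpx_internal_error() only longjmp()s when error->setjmp has been set,
and init_decoder() runs before any jump target is installed, so the call
merely records the error code and returns; the status therefore has to be
propagated by hand. A minimal sketch of the pattern the hunk below adopts,
taken from the new code itself:

    if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers)) {
      vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                         "Failed to initialize internal frame buffers");
      return VPX_CODEC_MEM_ERROR;  /* recorded only; no longjmp occurs */
    }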
vp9/vp9_dx_iface.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index 35ecbaff37..3c42c7dfed 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -201,7 +201,7 @@ static vpx_codec_err_t update_error_state( return error->error_code; } -static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) { +static vpx_codec_err_t init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) { VP9_COMMON *const cm = &ctx->pbi->common; BufferPool *const pool = cm->buffer_pool; @@ -217,12 +217,16 @@ static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) { pool->get_fb_cb = vp9_get_frame_buffer; pool->release_fb_cb = vp9_release_frame_buffer; - if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers)) + if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers)) { vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to initialize internal frame buffers"); + return VPX_CODEC_MEM_ERROR; + } pool->cb_priv = &pool->int_frame_buffers; } + + return VPX_CODEC_OK; } static void set_default_ppflags(vp8_postproc_cfg_t *cfg) { @@ -278,9 +282,7 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { if (!ctx->postproc_cfg_set && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) set_default_ppflags(&ctx->postproc_cfg); - init_buffer_callbacks(ctx); - - return VPX_CODEC_OK; + return init_buffer_callbacks(ctx); } static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx, From 665f6a3065555317e75915af2561cf9b776e50b6 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 18 Apr 2022 19:10:11 -0700 Subject: [PATCH 262/926] webmdec: fix double free when no frames were decoded, for example due to a decoder initialization failure, an orphan buffer pointer from webm_guess_framerate() via webm_read_frame() would have been freed during cleanup Change-Id: I6ea3defdd13dd75427f79c516e207b682391e4fa --- webmdec.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/webmdec.cc b/webmdec.cc index 68c6f4782d..f7671bb641 100644 --- a/webmdec.cc +++ b/webmdec.cc @@ -210,6 +210,8 @@ int webm_guess_framerate(struct WebmInputContext *webm_ctx, vpx_ctx->framerate.denominator = static_cast(webm_ctx->timestamp_ns / 1000); delete[] buffer; + // webm_ctx->buffer is assigned to the buffer pointer in webm_read_frame(). 
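+  // Clearing the stale pointer keeps the caller's cleanup path from freeing
+  // the same allocation a second time after the delete[] above.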
+ webm_ctx->buffer = nullptr; get_first_cluster(webm_ctx); webm_ctx->block = nullptr; From 8da05d39b94ab09ca1993e5c1cb6430f0d3000db Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 19 Apr 2022 17:24:26 -0700 Subject: [PATCH 263/926] vp8_decode: free mt buffers early on resolution change this avoids a desynchronization of mb_rows if an allocation prior to vp8mt_alloc_temp_buffers() fails and the decoder is then destroyed Bug: webm:1759 Change-Id: I75457ef9ceb24c8a8fd213c3690e7c1cf0ec425f --- vp8/vp8_dx_iface.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index ba0714abe5..6d88e5154f 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -371,8 +371,6 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, pc->Width = ctx->si.w; pc->Height = ctx->si.h; { - int prev_mb_rows = pc->mb_rows; - if (setjmp(pbi->common.error.jmp)) { pbi->common.error.setjmp = 0; /* on failure clear the cached resolution to ensure a full @@ -398,6 +396,12 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, "Invalid frame height"); } +#if CONFIG_MULTITHREAD + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) { + vp8mt_de_alloc_temp_buffers(pbi, pc->mb_rows); + } +#endif + if (vp8_alloc_frame_buffers(pc, pc->Width, pc->Height)) { vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffers"); @@ -442,10 +446,8 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, #if CONFIG_MULTITHREAD if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) { - vp8mt_alloc_temp_buffers(pbi, pc->Width, prev_mb_rows); + vp8mt_alloc_temp_buffers(pbi, pc->Width, 0); } -#else - (void)prev_mb_rows; #endif } From f2ef29f746c4cad7a41b3bba5daefba2726eda3a Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 19 Apr 2022 19:26:37 -0700 Subject: [PATCH 264/926] fdct16x16_neon.h,cosmetics: fix include-guard case Change-Id: I593735bb7f88d63f2ddab57484099479c8759a3d --- vpx_dsp/arm/fdct16x16_neon.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vpx_dsp/arm/fdct16x16_neon.h b/vpx_dsp/arm/fdct16x16_neon.h index 8391238991..0dd21153fc 100644 --- a/vpx_dsp/arm/fdct16x16_neon.h +++ b/vpx_dsp/arm/fdct16x16_neon.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_VPX_DSP_ARM_FDCT16x16_NEON_H_ -#define VPX_VPX_DSP_ARM_FDCT16x16_NEON_H_ +#ifndef VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_ +#define VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_ #include @@ -324,4 +324,4 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, &out[11]); } -#endif // VPX_VPX_DSP_ARM_FDCT16x16_NEON_H_ +#endif // VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_ From 2651113a64d2a6892431b843ce35b57621369765 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Mon, 18 Apr 2022 16:17:17 +0800 Subject: [PATCH 265/926] vp9[loongarch]: Optimize vertical/horizontal_8_dual 1. vpx_lpf_vertical_8_dual_lsx 2. 
vpx_lpf_horizontal_8_dual_lsx Bug: webm:1755 Change-Id: I354df02cc215f36b4edf6558af0ff7fd6909deac --- test/lpf_test.cc | 14 +- vpx_dsp/loongarch/loopfilter_8_lsx.c | 270 +++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 3 files changed, 283 insertions(+), 5 deletions(-) diff --git a/test/lpf_test.cc b/test/lpf_test.cc index 833dfb9a89..0bdec77e54 100644 --- a/test/lpf_test.cc +++ b/test/lpf_test.cc @@ -147,7 +147,7 @@ class Loop8Test6Param : public ::testing::TestWithParam { }; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test6Param); -#if HAVE_NEON || HAVE_SSE2 || \ +#if HAVE_NEON || HAVE_SSE2 || (HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH) || \ (HAVE_DSPR2 || HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH) class Loop8Test9Param : public ::testing::TestWithParam { public: @@ -169,7 +169,7 @@ class Loop8Test9Param : public ::testing::TestWithParam { }; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test9Param); #endif // HAVE_NEON || HAVE_SSE2 || (HAVE_DSPR2 || HAVE_MSA && - // (!CONFIG_VP9_HIGHBITDEPTH)) + // (!CONFIG_VP9_HIGHBITDEPTH) || (HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH)) TEST_P(Loop8Test6Param, OperationCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); @@ -281,7 +281,7 @@ TEST_P(Loop8Test6Param, ValueCheck) { << "First failed at test case " << first_failure; } -#if HAVE_NEON || HAVE_SSE2 || \ +#if HAVE_NEON || HAVE_SSE2 || (HAVE_LSX && (!CONFIG_VP9_HIGHBITDEPTH)) || \ (HAVE_DSPR2 || HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)) TEST_P(Loop8Test9Param, OperationCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); @@ -411,6 +411,7 @@ TEST_P(Loop8Test9Param, ValueCheck) { << "First failed at test case " << first_failure; } #endif // HAVE_NEON || HAVE_SSE2 || (HAVE_DSPR2 || HAVE_MSA && + // (!CONFIG_VP9_HIGHBITDEPTH)) || (HAVE_LSX && // (!CONFIG_VP9_HIGHBITDEPTH)) using std::make_tuple; @@ -702,6 +703,13 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&vpx_lpf_vertical_8_lsx, &vpx_lpf_vertical_8_c, 8), make_tuple(&vpx_lpf_vertical_16_dual_lsx, &vpx_lpf_vertical_16_dual_c, 8))); + +INSTANTIATE_TEST_SUITE_P( + LSX, Loop8Test9Param, + ::testing::Values(make_tuple(&vpx_lpf_horizontal_8_dual_lsx, + &vpx_lpf_horizontal_8_dual_c, 8), + make_tuple(&vpx_lpf_vertical_8_dual_lsx, + &vpx_lpf_vertical_8_dual_c, 8))); #endif // HAVE_LSX && (!CONFIG_VP9_HIGHBITDEPTH) } // namespace diff --git a/vpx_dsp/loongarch/loopfilter_8_lsx.c b/vpx_dsp/loongarch/loopfilter_8_lsx.c index facf6f30ec..358e221662 100644 --- a/vpx_dsp/loongarch/loopfilter_8_lsx.c +++ b/vpx_dsp/loongarch/loopfilter_8_lsx.c @@ -83,6 +83,93 @@ void vpx_lpf_horizontal_8_lsx(uint8_t *dst, int32_t stride, } } +void vpx_lpf_horizontal_8_dual_lsx( + uint8_t *dst, int32_t stride, const uint8_t *b_limit0, + const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *b_limit1, + const uint8_t *limit1, const uint8_t *thresh1) { + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + __m128i flat, mask, hev, tmp, thresh, b_limit, limit; + __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h; + __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; + __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; + __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; + __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; + __m128i zero = __lsx_vldi(0); + + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + + DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst, + -stride, p3, p2, p1, p0); + q0 = __lsx_vld(dst, 0); + 
DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); + q3 = __lsx_vldx(dst, stride3); + + thresh = __lsx_vreplgr2vr_b(*thresh0); + tmp = __lsx_vreplgr2vr_b(*thresh1); + thresh = __lsx_vilvl_d(tmp, thresh); + + b_limit = __lsx_vreplgr2vr_b(*b_limit0); + tmp = __lsx_vreplgr2vr_b(*b_limit1); + b_limit = __lsx_vilvl_d(tmp, b_limit); + + limit = __lsx_vreplgr2vr_b(*limit0); + tmp = __lsx_vreplgr2vr_b(*limit1); + limit = __lsx_vilvl_d(tmp, limit); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__lsx_bz_v(flat)) { + __lsx_vst(p1_out, dst - stride2, 0); + __lsx_vst(p0_out, dst - stride, 0); + __lsx_vst(q0_out, dst, 0); + __lsx_vst(q1_out, dst + stride, 0); + } else { + DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, + p1_l, p0_l); + DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, + q2_l, q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_h, p2_h, + p1_h, p0_h); + DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_h, q1_h, + q2_h, q3_h); + VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, + p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); + + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l, + p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l); + DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l, + q1_filt8_l, q2_filt8_l); + + /* store pixel values */ + p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat); + p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat); + p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat); + q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat); + q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat); + q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat); + + __lsx_vst(p2_out, dst - stride3, 0); + __lsx_vst(p1_out, dst - stride2, 0); + __lsx_vst(p0_out, dst - stride, 0); + __lsx_vst(q0_out, dst, 0); + __lsx_vst(q1_out, dst + stride, 0); + __lsx_vst(q2_out, dst + stride2, 0); + } +} + void vpx_lpf_vertical_8_lsx(uint8_t *dst, int32_t stride, const uint8_t *b_limit_ptr, const uint8_t *limit_ptr, @@ -197,3 +284,186 @@ void vpx_lpf_vertical_8_lsx(uint8_t *dst, int32_t stride, __lsx_vstelm_h(vec4, dst, 4, 7); } } + +void vpx_lpf_vertical_8_dual_lsx(uint8_t *dst, int32_t stride, + const uint8_t *b_limit0, const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *b_limit1, const uint8_t *limit1, + const uint8_t *thresh1) { + uint8_t *dst_tmp = dst - 4; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p1_out, p0_out, q0_out, q1_out; + __m128i flat, mask, hev, thresh, b_limit, limit; + __m128i row4, row5, row6, row7, row12, row13, row14, row15; + __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h; + __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; + __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; + __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; + __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i zero = __lsx_vldi(0); + int32_t stride2 = stride << 1; + int32_t stride3 = 
stride2 + stride; + int32_t stride4 = stride2 << 1; + + p0 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p1, p2); + p3 = __lsx_vldx(dst_tmp, stride3); + dst_tmp += stride4; + row4 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6); + row7 = __lsx_vldx(dst_tmp, stride3); + dst_tmp += stride4; + + q3 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q2, q1); + q0 = __lsx_vldx(dst_tmp, stride3); + dst_tmp += stride4; + row12 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14); + row15 = __lsx_vldx(dst_tmp, stride3); + + /* transpose 16x8 matrix into 8x16 */ + LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0, + row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2, + q3); + + thresh = __lsx_vreplgr2vr_b(*thresh0); + vec0 = __lsx_vreplgr2vr_b(*thresh1); + thresh = __lsx_vilvl_d(vec0, thresh); + + b_limit = __lsx_vreplgr2vr_b(*b_limit0); + vec0 = __lsx_vreplgr2vr_b(*b_limit1); + b_limit = __lsx_vilvl_d(vec0, b_limit); + + limit = __lsx_vreplgr2vr_b(*limit0); + vec0 = __lsx_vreplgr2vr_b(*limit1); + limit = __lsx_vilvl_d(vec0, limit); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + /* if flat is zero for all pixels, then no need to calculate other filter */ + if (__lsx_bz_v(flat)) { + DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); + vec2 = __lsx_vilvl_h(vec1, vec0); + vec3 = __lsx_vilvh_h(vec1, vec0); + DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); + vec4 = __lsx_vilvl_h(vec1, vec0); + vec5 = __lsx_vilvh_h(vec1, vec0); + + dst -= 2; + __lsx_vstelm_w(vec2, dst, 0, 0); + __lsx_vstelm_w(vec2, dst + stride, 0, 1); + __lsx_vstelm_w(vec2, dst + stride2, 0, 2); + __lsx_vstelm_w(vec2, dst + stride3, 0, 3); + dst += stride4; + __lsx_vstelm_w(vec3, dst, 0, 0); + __lsx_vstelm_w(vec3, dst + stride, 0, 1); + __lsx_vstelm_w(vec3, dst + stride2, 0, 2); + __lsx_vstelm_w(vec3, dst + stride3, 0, 3); + dst += stride4; + __lsx_vstelm_w(vec4, dst, 0, 0); + __lsx_vstelm_w(vec4, dst + stride, 0, 1); + __lsx_vstelm_w(vec4, dst + stride2, 0, 2); + __lsx_vstelm_w(vec4, dst + stride3, 0, 3); + dst += stride4; + __lsx_vstelm_w(vec5, dst, 0, 0); + __lsx_vstelm_w(vec5, dst + stride, 0, 1); + __lsx_vstelm_w(vec5, dst + stride2, 0, 2); + __lsx_vstelm_w(vec5, dst + stride3, 0, 3); + } else { + DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, + p1_l, p0_l); + DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, + q2_l, q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_h, p2_h, + p1_h, p0_h); + DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_h, q1_h, + q2_h, q3_h); + + /* filter8 */ + VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, + p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); + + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l, + p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l); + 
DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l, + q1_filt8_l, q2_filt8_l); + + /* store pixel values */ + p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat); + p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat); + p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat); + q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat); + q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat); + q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat); + + DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1); + vec3 = __lsx_vilvl_h(vec1, vec0); + vec4 = __lsx_vilvh_h(vec1, vec0); + DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1); + vec6 = __lsx_vilvl_h(vec1, vec0); + vec7 = __lsx_vilvh_h(vec1, vec0); + vec2 = __lsx_vilvl_b(q2, q1); + vec5 = __lsx_vilvh_b(q2, q1); + + dst -= 3; + __lsx_vstelm_w(vec3, dst, 0, 0); + __lsx_vstelm_h(vec2, dst, 4, 0); + dst += stride; + __lsx_vstelm_w(vec3, dst, 0, 1); + __lsx_vstelm_h(vec2, dst, 4, 1); + dst += stride; + __lsx_vstelm_w(vec3, dst, 0, 2); + __lsx_vstelm_h(vec2, dst, 4, 2); + dst += stride; + __lsx_vstelm_w(vec3, dst, 0, 3); + __lsx_vstelm_h(vec2, dst, 4, 3); + dst += stride; + __lsx_vstelm_w(vec4, dst, 0, 0); + __lsx_vstelm_h(vec2, dst, 4, 4); + dst += stride; + __lsx_vstelm_w(vec4, dst, 0, 1); + __lsx_vstelm_h(vec2, dst, 4, 5); + dst += stride; + __lsx_vstelm_w(vec4, dst, 0, 2); + __lsx_vstelm_h(vec2, dst, 4, 6); + dst += stride; + __lsx_vstelm_w(vec4, dst, 0, 3); + __lsx_vstelm_h(vec2, dst, 4, 7); + dst += stride; + __lsx_vstelm_w(vec6, dst, 0, 0); + __lsx_vstelm_h(vec5, dst, 4, 0); + dst += stride; + __lsx_vstelm_w(vec6, dst, 0, 1); + __lsx_vstelm_h(vec5, dst, 4, 1); + dst += stride; + __lsx_vstelm_w(vec6, dst, 0, 2); + __lsx_vstelm_h(vec5, dst, 4, 2); + dst += stride; + __lsx_vstelm_w(vec6, dst, 0, 3); + __lsx_vstelm_h(vec5, dst, 4, 3); + dst += stride; + __lsx_vstelm_w(vec7, dst, 0, 0); + __lsx_vstelm_h(vec5, dst, 4, 4); + dst += stride; + __lsx_vstelm_w(vec7, dst, 0, 1); + __lsx_vstelm_h(vec5, dst, 4, 5); + dst += stride; + __lsx_vstelm_w(vec7, dst, 0, 2); + __lsx_vstelm_h(vec5, dst, 4, 6); + dst += stride; + __lsx_vstelm_w(vec7, dst, 0, 3); + __lsx_vstelm_h(vec5, dst, 4, 7); + } +} diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 7d78dc72ac..d10f3a1408 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -448,7 +448,7 @@ () specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/vpx_lpf_vertical_8_dual sse2 neon dspr2 msa/; +specialize qw/vpx_lpf_vertical_8_dual sse2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/vpx_lpf_vertical_4 sse2 neon dspr2 msa/; @@ -466,7 +466,7 @@ () specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/vpx_lpf_horizontal_8_dual sse2 neon dspr2 msa/; +specialize qw/vpx_lpf_horizontal_8_dual sse2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/vpx_lpf_horizontal_4 sse2 neon dspr2 msa/; From 
608a28e30b7abc62ed415af3dbb3d981e22b8a1c Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Mon, 18 Apr 2022 16:21:04 +0800 Subject: [PATCH 266/926] vp9[loongarch]: Optimize convolve8_avg_vert/convolve_copy 1. vpx_convolve8_avg_vert_lsx 2. vpx_convolve_copy_lsx 3. vpx_idct32x32_135_add_lsx Bug: webm:1755 Change-Id: I6bdfe5836a91a5e361ab869b26641e86c5ebb68d --- test/convolve_test.cc | 4 +- .../loongarch/vpx_convolve8_avg_vert_lsx.c | 918 ++++++++++++++++++ vpx_dsp/loongarch/vpx_convolve_copy_lsx.c | 438 +++++++++ vpx_dsp/vpx_dsp.mk | 2 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 5 +- 5 files changed, 1363 insertions(+), 4 deletions(-) create mode 100644 vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c create mode 100644 vpx_dsp/loongarch/vpx_convolve_copy_lsx.c diff --git a/test/convolve_test.cc b/test/convolve_test.cc index 5189be647a..d569048691 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -1451,9 +1451,9 @@ INSTANTIATE_TEST_SUITE_P(MSA, ConvolveTest, #if HAVE_LSX const ConvolveFunctions convolve8_lsx( - vpx_convolve_copy_c, vpx_convolve_avg_lsx, vpx_convolve8_horiz_lsx, + vpx_convolve_copy_lsx, vpx_convolve_avg_lsx, vpx_convolve8_horiz_lsx, vpx_convolve8_avg_horiz_lsx, vpx_convolve8_vert_lsx, - vpx_convolve8_avg_vert_c, vpx_convolve8_lsx, vpx_convolve8_avg_lsx, + vpx_convolve8_avg_vert_lsx, vpx_convolve8_lsx, vpx_convolve8_avg_lsx, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); diff --git a/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c new file mode 100644 index 0000000000..584f241838 --- /dev/null +++ b/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c @@ -0,0 +1,918 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/vpx_convolve_lsx.h" + +static void common_vt_8t_and_aver_dst_4w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) { + uint32_t loop_cnt = (height >> 2); + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + __m128i reg0, reg1, reg2, reg3, reg4; + __m128i filter0, filter1, filter2, filter3; + __m128i out0, out1; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *src_tmp0 = (uint8_t *)src - src_stride3; + + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + src0 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src1, + src2); + src3 = __lsx_vldx(src_tmp0, src_stride3); + src_tmp0 += src_stride4; + src4 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src5, + src6); + src_tmp0 += src_stride3; + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, tmp0, + tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5); + DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1); + reg2 = __lsx_vilvl_d(tmp5, tmp2); + DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1); + reg2 = __lsx_vxori_b(reg2, 128); + + for (; loop_cnt--;) { + src7 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src8, + src9); + src10 = __lsx_vldx(src_tmp0, src_stride3); + src_tmp0 += src_stride4; + src0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + src1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + src2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + src3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, src0, src1); + src0 = __lsx_vilvl_d(src1, src0); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4); + DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4); + out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, reg3, filter0, filter1, + filter2, filter3); + out1 = FILT_8TAP_DPADD_S_H(reg1, reg2, reg3, reg4, filter0, filter1, + filter2, filter3); + out0 = __lsx_vssrarni_b_h(out1, out0, 7); + out0 = __lsx_vxori_b(out0, 128); + out0 = __lsx_vavgr_bu(out0, src0); + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + reg0 = reg2; + reg1 = reg3; + reg2 = reg4; + src6 = src10; + } +} + +static void common_vt_8t_and_aver_dst_8w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) { + uint32_t loop_cnt = height >> 2; + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5; + __m128i filter0, filter1, filter2, filter3; + __m128i out0, out1, out2, out3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + uint8_t 
*src_tmp0 = (uint8_t *)src - src_stride3; + + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + src0 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src1, + src2); + src3 = __lsx_vldx(src_tmp0, src_stride3); + src_tmp0 += src_stride4; + src4 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src5, + src6); + src_tmp0 += src_stride3; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0, + reg1, reg2, reg3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5); + + for (; loop_cnt--;) { + src7 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src8, + src9); + src10 = __lsx_vldx(src_tmp0, src_stride3); + src_tmp0 += src_stride4; + src0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, src1, src0, src3, src2, src0, src1); + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + tmp0, tmp1, tmp2, tmp3); + out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, tmp0, filter0, filter1, + filter2, filter3); + out1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, tmp1, filter0, filter1, + filter2, filter3); + out2 = FILT_8TAP_DPADD_S_H(reg1, reg2, tmp0, tmp2, filter0, filter1, + filter2, filter3); + out3 = FILT_8TAP_DPADD_S_H(reg4, reg5, tmp1, tmp3, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + DUP2_ARG2(__lsx_vavgr_bu, out0, src0, out1, src1, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + reg0 = reg2; + reg1 = tmp0; + reg2 = tmp2; + reg3 = reg5; + reg4 = tmp1; + reg5 = tmp3; + src6 = src10; + } +} + +static void common_vt_8t_and_aver_dst_16w_mult_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, int32_t width) { + uint8_t *src_tmp; + uint32_t cnt = width >> 4; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filter0, filter1, filter2, filter3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5; + __m128i reg6, reg7, reg8, reg9, reg10, reg11; + __m128i tmp0, tmp1, tmp2, tmp3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + uint8_t *src_tmp0 = (uint8_t *)src - src_stride3; + + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + for (; cnt--;) { + uint32_t loop_cnt = height >> 2; + uint8_t *dst_reg = dst; + + src_tmp = src_tmp0; + src0 = __lsx_vld(src_tmp, 0); + 
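+ /* rows 1..6 loaded below give the seven rows of history the 8-tap vertical filter needs before the first output row of this 16-byte column */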
DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src1, + src2); + src3 = __lsx_vldx(src_tmp, src_stride3); + src_tmp += src_stride4; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src_tmp += src_stride3; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, + reg0, reg1, reg2, reg3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5); + DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1, + reg6, reg7, reg8, reg9); + DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11); + for (; loop_cnt--;) { + src7 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src8, + src9); + src10 = __lsx_vldx(src_tmp, src_stride3); + src_tmp += src_stride4; + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, + src7, src8, src9, src10); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9, + src4, src5, src7, src8); + tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, src0, filter0, filter1, + filter2, filter3); + tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0, filter1, + filter2, filter3); + tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0, filter1, + filter2, filter3); + tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + tmp2 = __lsx_vld(dst_reg, 0); + tmp3 = __lsx_vldx(dst_reg, dst_stride); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1); + __lsx_vst(tmp0, dst_reg, 0); + __lsx_vstx(tmp1, dst_reg, dst_stride); + tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0, filter1, + filter2, filter3); + tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0, filter1, + filter2, filter3); + tmp2 = FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0, filter1, + filter2, filter3); + tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + tmp2 = __lsx_vldx(dst_reg, dst_stride2); + tmp3 = __lsx_vldx(dst_reg, dst_stride3); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1); + __lsx_vstx(tmp0, dst_reg, dst_stride2); + __lsx_vstx(tmp1, dst_reg, dst_stride3); + dst_reg += dst_stride4; + + reg0 = reg2; + reg1 = src0; + reg2 = src2; + reg3 = reg5; + reg4 = src1; + reg5 = src3; + reg6 = reg8; + reg7 = src4; + reg8 = src7; + reg9 = reg11; + reg10 = src5; + reg11 = src8; + src6 = src10; + } + src_tmp0 += 16; + dst += 16; + } +} + +static void common_vt_8t_and_aver_dst_16w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) { + common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride, + filter, height, 16); +} + +static void common_vt_8t_and_aver_dst_32w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) { + common_vt_8t_and_aver_dst_16w_mult_lsx(src, 
src_stride, dst, dst_stride, + filter, height, 32); +} + +static void common_vt_8t_and_aver_dst_64w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) { + common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride, + filter, height, 64); +} + +static void common_vt_2t_and_aver_dst_4x4_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, src4; + __m128i dst0, dst1, dst2, dst3, out, filt0, src2110, src4332; + __m128i src10_r, src32_r, src21_r, src43_r; + __m128i tmp0, tmp1; + uint8_t *dst_tmp = dst; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src4 = __lsx_vld(src, 0); + src += src_stride; + + dst0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst0 = __lsx_vilvl_w(dst1, dst0); + dst1 = __lsx_vilvl_w(dst3, dst2); + dst0 = __lsx_vilvl_d(dst1, dst0); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + DUP2_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, src2110, + src4332); + DUP2_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + out = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vstelm_w(out, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out, dst, 0, 3); + dst += dst_stride; +} + +static void common_vt_2t_and_aver_dst_4x8_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i dst0, dst1, dst2, dst3, dst4; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r; + __m128i src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + __m128i src2110, src4332, src6554, src8776, filt0; + __m128i tmp0, tmp1, tmp2, tmp3; + uint8_t *dst_tmp = dst; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src4 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src7 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src8 = __lsx_vld(src, 0); + + dst0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst0 = __lsx_vilvl_w(dst1, dst0); + dst1 = __lsx_vilvl_w(dst3, dst2); + dst0 = __lsx_vilvl_d(dst1, dst0); + + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst4 = __lsx_vldrepl_w(dst_tmp, 0); + dst1 = 
__lsx_vilvl_w(dst2, dst1); + dst2 = __lsx_vilvl_w(dst4, dst3); + dst1 = __lsx_vilvl_d(dst2, dst1); + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, + src54_r, src65_r, src76_r, src87_r); + DUP4_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, + src87_r, src76_r, src2110, src4332, src6554, src8776); + DUP4_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, src6554, filt0, + src8776, filt0, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp2); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2); + __lsx_vstelm_w(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 3); + dst += dst_stride; + + __lsx_vstelm_w(tmp2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp2, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(tmp2, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(tmp2, dst, 0, 3); +} + +static void common_vt_2t_and_aver_dst_4w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (height == 4) { + common_vt_2t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_vt_2t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_vt_2t_and_aver_dst_8x4_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, src4; + __m128i dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0; + __m128i tmp0, tmp1, tmp2, tmp3; + uint8_t *dst_tmp = dst; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec1); + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp2); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2); + __lsx_vstelm_d(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 1); +} + +static void common_vt_2t_and_aver_dst_8x8mult_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 3); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i dst0, dst1, dst2, dst3, dst4, dst5; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + __m128i tmp0, tmp1, tmp2, tmp3; + uint8_t *dst_tmp = dst; + 
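+ /* each pass below filters eight new rows with the two bilinear taps in filt0 and averages the result into dst */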
int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src5 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src6, src7); + src8 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst4 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst5 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst3, dst2, dst5, dst4, dst2, dst3); + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, + vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, tmp0, tmp1, tmp2, tmp3); + + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp2); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2); + __lsx_vstelm_d(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 1); + dst += dst_stride; + + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp2); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst2, tmp2, dst3, tmp0, tmp2); + __lsx_vstelm_d(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 1); + dst += dst_stride; + + src0 = src8; + } +} + +static void common_vt_2t_and_aver_dst_8w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (height == 4) { + common_vt_2t_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride, filter); + } else { + common_vt_2t_and_aver_dst_8x8mult_lsx(src, src_stride, dst, dst_stride, + filter, height); + } +} + +static void common_vt_2t_and_aver_dst_16w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i tmp0, tmp1; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, 
src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst1); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst2); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst3); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + + src0 = src4; + } +} + +static void common_vt_2t_and_aver_dst_32w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 2); + uint8_t *src_tmp1; + uint8_t *dst_tmp1; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + __m128i tmp0, tmp1; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + filt0 = __lsx_vldrepl_h(filter, 0); + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src5); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + + src_tmp1 = src + 16; + src6 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src7, + src8); + src9 = __lsx_vldx(src_tmp1, src_stride3); + + dst_tmp1 = dst + 16; + dst4 = __lsx_vld(dst_tmp1, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp1, dst_stride, dst_tmp1, dst_stride2, dst5, + dst6); + dst7 = __lsx_vldx(dst_tmp1, dst_stride3); + src += src_stride4; + + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vst(tmp0, dst, 0); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst1); + __lsx_vstx(tmp0, dst, dst_stride); + + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, 
vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst2); + __lsx_vstx(tmp0, dst, dst_stride2); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst3); + __lsx_vstx(tmp0, dst, dst_stride3); + + DUP2_ARG2(__lsx_vilvl_b, src6, src5, src7, src6, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src6, src5, src7, src6, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst4); + __lsx_vst(tmp0, dst, 16); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst5); + dst += dst_stride; + __lsx_vst(tmp0, dst, 16); + + DUP2_ARG2(__lsx_vilvl_b, src8, src7, src9, src8, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src8, src7, src9, src8, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst6); + dst += dst_stride; + __lsx_vst(tmp0, dst, 16); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst7); + dst += dst_stride; + __lsx_vst(tmp0, dst, 16); + dst += dst_stride; + + src0 = src4; + src5 = src9; + } +} + +static void common_vt_2t_and_aver_dst_64w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 1); + int32_t src_stride2 = src_stride << 1; + int32_t dst_stride2 = dst_stride << 1; + uint8_t *src_tmp1; + uint8_t *dst_tmp1; + __m128i src0, src1, src2, src3, src4, src5; + __m128i src6, src7, src8, src9, src10, src11, filt0; + __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i tmp0, tmp1; + + filt0 = __lsx_vldrepl_h(filter, 0); + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src3, src6, + src9); + src += src_stride; + + for (; loop_cnt--;) { + src2 = __lsx_vldx(src, src_stride); + dst1 = __lsx_vldx(dst, dst_stride); + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src1, src4, src7, + src10); + DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, dst0, dst2, dst4, + dst6); + src_tmp1 = (uint8_t *)src + 16; + src5 = __lsx_vldx(src_tmp1, src_stride); + src_tmp1 = src_tmp1 + 16; + src8 = __lsx_vldx(src_tmp1, src_stride); + src_tmp1 = src_tmp1 + 16; + src11 = __lsx_vldx(src_tmp1, src_stride); + + dst_tmp1 = dst + 16; + dst3 = __lsx_vldx(dst_tmp1, dst_stride); + dst_tmp1 = dst + 32; + dst5 = __lsx_vldx(dst_tmp1, dst_stride); + dst_tmp1 = dst + 48; + dst7 = __lsx_vldx(dst_tmp1, dst_stride); + src += src_stride2; + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vst(tmp0, dst, 0); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst1); + __lsx_vstx(tmp0, dst, dst_stride); + + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src5, src4, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src4, src3, src5, src4, 
vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst2); + __lsx_vst(tmp0, dst, 16); + + dst_tmp1 = dst + 16; + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst3); + __lsx_vstx(tmp0, dst_tmp1, dst_stride); + + DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst4); + __lsx_vst(tmp0, dst, 32); + + dst_tmp1 = dst_tmp1 + 16; + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst5); + __lsx_vstx(tmp0, dst_tmp1, dst_stride); + + DUP2_ARG2(__lsx_vilvl_b, src10, src9, src11, src10, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src10, src9, src11, src10, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst6); + __lsx_vst(tmp0, dst, 48); + + dst_tmp1 = dst_tmp1 + 16; + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst7); + __lsx_vstx(tmp0, dst_tmp1, dst_stride); + dst += dst_stride2; + + src0 = src2; + src3 = src5; + src6 = src8; + src9 = src11; + } +} + +void vpx_convolve8_avg_vert_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16_t *const filter_y = filter[y0_q4]; + int8_t cnt, filt_ver[8]; + + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_ver[cnt] = filter_y[cnt]; + } + + if (vpx_get_filter_taps(filter_y) == 2) { + switch (w) { + case 4: + common_vt_2t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 8: + common_vt_2t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 16: + common_vt_2t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 32: + common_vt_2t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 64: + common_vt_2t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + default: + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_vt_8t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + case 8: + common_vt_8t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + case 16: + common_vt_8t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + + break; + case 32: + common_vt_8t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + case 64: + common_vt_8t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; 
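+ /* any other width falls back to the generic C implementation */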
+ default: + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c b/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c new file mode 100644 index 0000000000..398788a43e --- /dev/null +++ b/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c @@ -0,0 +1,438 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <string.h> +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +static void copy_width8_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + if ((height % 12) == 0) { + for (cnt = (height / 12); cnt--;) { + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src += src_stride2; + src7 = __lsx_vldx(src, src_stride); + src += src_stride2; + + __lsx_vstelm_d(src0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src3, dst, 0, 0); + dst += dst_stride; + + __lsx_vstelm_d(src4, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src5, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src6, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src7, dst, 0, 0); + dst += dst_stride; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + __lsx_vstelm_d(src0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src3, dst, 0, 0); + dst += dst_stride; + } + } else if ((height % 8) == 0) { + for (cnt = height >> 3; cnt--;) { + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src += src_stride2; + src7 = __lsx_vldx(src, src_stride); + src += src_stride2; + + __lsx_vstelm_d(src0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src3, dst, 0, 0); + dst += dst_stride; + + __lsx_vstelm_d(src4, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src5, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src6, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src7, dst, 0, 0); + dst += dst_stride; + } + } else if ((height % 4) == 0) { + for (cnt = (height / 4); cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, 
src_stride3); + src += src_stride4; + + __lsx_vstelm_d(src0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src3, dst, 0, 0); + dst += dst_stride; + } + } else if ((height % 2) == 0) { + for (cnt = (height / 2); cnt--;) { + src0 = __lsx_vld(src, 0); + src1 = __lsx_vldx(src, src_stride); + src += src_stride2; + + __lsx_vstelm_d(src0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src1, dst, 0, 0); + dst += dst_stride; + } + } +} + +static void copy_16multx8mult_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height, int32_t width) { + int32_t cnt, loop_cnt; + uint8_t *src_tmp; + uint8_t *dst_tmp; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + for (cnt = (width >> 4); cnt--;) { + src_tmp = (uint8_t *)src; + dst_tmp = dst; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + src0 = __lsx_vld(src_tmp, 0); + DUP4_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src_tmp, + src_stride3, src_tmp, src_stride4, src1, src2, src3, src4); + src_tmp += src_stride4; + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src_tmp += src_stride2; + src7 = __lsx_vldx(src_tmp, src_stride); + src_tmp += src_stride2; + + __lsx_vst(src0, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src1, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src2, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src3, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src4, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src5, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src6, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src7, dst_tmp, 0); + dst_tmp += dst_stride; + } + src += 16; + dst += 16; + } +} + +static void copy_width16_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + if ((height % 12) == 0) { + for (cnt = (height / 12); cnt--;) { + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src += src_stride2; + src7 = __lsx_vldx(src, src_stride); + src += src_stride2; + + __lsx_vst(src0, dst, 0); + dst += dst_stride; + __lsx_vst(src1, dst, 0); + dst += dst_stride; + __lsx_vst(src2, dst, 0); + dst += dst_stride; + __lsx_vst(src3, dst, 0); + dst += dst_stride; + __lsx_vst(src4, dst, 0); + dst += dst_stride; + __lsx_vst(src5, dst, 0); + dst += dst_stride; + __lsx_vst(src6, dst, 0); + dst += dst_stride; + __lsx_vst(src7, dst, 0); + dst += dst_stride; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + __lsx_vst(src0, dst, 0); + dst += dst_stride; + __lsx_vst(src1, dst, 0); + dst += dst_stride; + __lsx_vst(src2, dst, 0); + dst += dst_stride; + __lsx_vst(src3, dst, 0); + dst += dst_stride; + } + } else if ((height % 8) == 0) { + copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, 16); + } else if ((height % 4) == 
0) { + for (cnt = (height >> 2); cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + __lsx_vst(src0, dst, 0); + dst += dst_stride; + __lsx_vst(src1, dst, 0); + dst += dst_stride; + __lsx_vst(src2, dst, 0); + dst += dst_stride; + __lsx_vst(src3, dst, 0); + dst += dst_stride; + } + } +} + +static void copy_width32_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + uint8_t *src_tmp; + uint8_t *dst_tmp; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + if ((height % 12) == 0) { + for (cnt = (height / 12); cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + + src_tmp = (uint8_t *)src + 16; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src7 = __lsx_vldx(src_tmp, src_stride3); + src += src_stride4; + + __lsx_vst(src0, dst, 0); + dst += dst_stride; + __lsx_vst(src1, dst, 0); + dst += dst_stride; + __lsx_vst(src2, dst, 0); + dst += dst_stride; + __lsx_vst(src3, dst, 0); + dst += dst_stride; + + dst_tmp = dst + 16; + __lsx_vst(src4, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src5, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src6, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src7, dst_tmp, 0); + dst_tmp += dst_stride; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + + src_tmp = (uint8_t *)src + 16; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src7 = __lsx_vldx(src_tmp, src_stride3); + src += src_stride4; + + __lsx_vst(src0, dst, 0); + dst += dst_stride; + __lsx_vst(src1, dst, 0); + dst += dst_stride; + __lsx_vst(src2, dst, 0); + dst += dst_stride; + __lsx_vst(src3, dst, 0); + dst += dst_stride; + + dst_tmp = dst + 16; + __lsx_vst(src4, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src5, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src6, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src7, dst_tmp, 0); + dst_tmp += dst_stride; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + + src_tmp = (uint8_t *)src + 16; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src7 = __lsx_vldx(src_tmp, src_stride3); + src += src_stride4; + + __lsx_vst(src0, dst, 0); + dst += dst_stride; + __lsx_vst(src1, dst, 0); + dst += dst_stride; + __lsx_vst(src2, dst, 0); + dst += dst_stride; + __lsx_vst(src3, dst, 0); + dst += dst_stride; + + dst_tmp = dst + 16; + __lsx_vst(src4, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src5, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src6, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src7, dst_tmp, 0); + dst_tmp += dst_stride; + } + } else if ((height % 8) == 0) { + copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, 32); + } else if ((height % 4) == 0) { + for (cnt = (height >> 2); cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, 
src_stride3); + + src_tmp = (uint8_t *)src + 16; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src7 = __lsx_vldx(src_tmp, src_stride3); + src += src_stride4; + + __lsx_vst(src0, dst, 0); + dst += dst_stride; + __lsx_vst(src1, dst, 0); + dst += dst_stride; + __lsx_vst(src2, dst, 0); + dst += dst_stride; + __lsx_vst(src3, dst, 0); + dst += dst_stride; + + dst_tmp = dst + 16; + __lsx_vst(src4, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src5, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src6, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src7, dst_tmp, 0); + dst_tmp += dst_stride; + } + } +} + +static void copy_width64_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, 64); +} + +void vpx_convolve_copy_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, + int32_t w, int32_t h) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + switch (w) { + case 4: { + uint32_t cnt; + __m128i tmp; + for (cnt = h; cnt--;) { + tmp = __lsx_vldrepl_w(src, 0); + __lsx_vstelm_w(tmp, dst, 0, 0); + src += src_stride; + dst += dst_stride; + } + break; + } + case 8: { + copy_width8_lsx(src, src_stride, dst, dst_stride, h); + break; + } + case 16: { + copy_width16_lsx(src, src_stride, dst, dst_stride, h); + break; + } + case 32: { + copy_width32_lsx(src, src_stride, dst, dst_stride, h); + break; + } + case 64: { + copy_width64_lsx(src, src_stride, dst, dst_stride, h); + break; + } + default: { + uint32_t cnt; + for (cnt = h; cnt--;) { + memcpy(dst, src, w); + src += src_stride; + dst += dst_stride; + } + break; + } + } +} diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 3eba23c0af..01886b8821 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -165,11 +165,13 @@ DSP_SRCS-$(HAVE_VSX) += ppc/vpx_convolve_vsx.c # common (lsx) DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_avg_horiz_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_avg_vert_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_horiz_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_vert_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_avg_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_avg_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_copy_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_lsx.h # loop filters diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index d10f3a1408..e5617eea31 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -368,7 +368,7 @@ () # Sub Pixel Filters # add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve_copy neon dspr2 msa sse2 vsx/; +specialize qw/vpx_convolve_copy neon dspr2 msa sse2 vsx lsx/; add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx mmi lsx/; @@ -389,7 +389,7 @@ () specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 
avx2 neon dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; +specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_scaled_2d ssse3 neon msa/; @@ -656,6 +656,7 @@ () specialize qw/vpx_idct32x32_135_add dspr2 msa/; $vpx_idct32x32_135_add_dspr2=vpx_idct32x32_1024_add_dspr2; $vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa; + $vpx_idct32x32_135_add_lsx=vpx_idct32x32_1024_add_lsx; specialize qw/vpx_idct32x32_34_add dspr2 msa lsx/; specialize qw/vpx_idct32x32_1_add dspr2 msa lsx/; specialize qw/vpx_iwht4x4_16_add msa/; From 618739f59f5d3505ff76a1a82eb198bed4ec989d Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Sun, 3 Apr 2022 18:49:52 +0800 Subject: [PATCH 267/926] vp9[loongarch]: Optimize horizontal/vertical_4/dual 1. vpx_lpf_horizontal_4_lsx 2. vpx_lpf_vertical_4_lsx 3. vpx_lpf_horizontal_4_dual_lsx 4. vpx_lpf_vertical_4_dual_lsx Bug: webm:1755 Change-Id: I12e9f27cafd9514b24cfbf2354cc66c7d1238687 --- test/lpf_test.cc | 8 +- vpx_dsp/loongarch/loopfilter_4_lsx.c | 214 +++++++++++++++++++++++++++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 +- 4 files changed, 226 insertions(+), 5 deletions(-) create mode 100644 vpx_dsp/loongarch/loopfilter_4_lsx.c diff --git a/test/lpf_test.cc b/test/lpf_test.cc index 0bdec77e54..4cc99a6db4 100644 --- a/test/lpf_test.cc +++ b/test/lpf_test.cc @@ -697,17 +697,23 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P( LSX, Loop8Test6Param, ::testing::Values( + make_tuple(&vpx_lpf_horizontal_4_lsx, &vpx_lpf_horizontal_4_c, 8), make_tuple(&vpx_lpf_horizontal_8_lsx, &vpx_lpf_horizontal_8_c, 8), make_tuple(&vpx_lpf_horizontal_16_dual_lsx, &vpx_lpf_horizontal_16_dual_c, 8), + make_tuple(&vpx_lpf_vertical_4_lsx, &vpx_lpf_vertical_4_c, 8), make_tuple(&vpx_lpf_vertical_8_lsx, &vpx_lpf_vertical_8_c, 8), make_tuple(&vpx_lpf_vertical_16_dual_lsx, &vpx_lpf_vertical_16_dual_c, 8))); INSTANTIATE_TEST_SUITE_P( LSX, Loop8Test9Param, ::testing::Values(make_tuple(&vpx_lpf_horizontal_4_dual_lsx, + &vpx_lpf_horizontal_4_dual_c, 8), + make_tuple(&vpx_lpf_horizontal_8_dual_lsx, &vpx_lpf_horizontal_8_dual_c, 8), + make_tuple(&vpx_lpf_vertical_4_dual_lsx, + &vpx_lpf_vertical_4_dual_c, 8), make_tuple(&vpx_lpf_vertical_8_dual_lsx, &vpx_lpf_vertical_8_dual_c, 8))); #endif // HAVE_LSX && (!CONFIG_VP9_HIGHBITDEPTH) diff --git a/vpx_dsp/loongarch/loopfilter_4_lsx.c b/vpx_dsp/loongarch/loopfilter_4_lsx.c new file mode 100644 index 0000000000..e8abf0523f --- /dev/null +++ b/vpx_dsp/loongarch/loopfilter_4_lsx.c @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/loopfilter_lsx.h" + +void vpx_lpf_horizontal_4_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + __m128i mask, hev, flat, thresh, b_limit, limit; + __m128i p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out; + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + int32_t pitch4 = pitch2 << 1; + + DUP4_ARG2(__lsx_vldx, src, -pitch4, src, -pitch3, src, -pitch2, src, -pitch, + p3, p2, p1, p0); + q0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch2, q1, q2); + q3 = __lsx_vldx(src, pitch3); + + thresh = __lsx_vreplgr2vr_b(*thresh_ptr); + b_limit = __lsx_vreplgr2vr_b(*b_limit_ptr); + limit = __lsx_vreplgr2vr_b(*limit_ptr); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + __lsx_vstelm_d(p1_out, src - pitch2, 0, 0); + __lsx_vstelm_d(p0_out, src - pitch, 0, 0); + __lsx_vstelm_d(q0_out, src, 0, 0); + __lsx_vstelm_d(q1_out, src + pitch, 0, 0); +} + +void vpx_lpf_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + __m128i mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + int32_t pitch4 = pitch2 << 1; + + DUP4_ARG2(__lsx_vldx, src, -pitch4, src, -pitch3, src, -pitch2, src, -pitch, + p3, p2, p1, p0); + q0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch2, q1, q2); + q3 = __lsx_vldx(src, pitch3); + + thresh0 = __lsx_vreplgr2vr_b(*thresh0_ptr); + thresh1 = __lsx_vreplgr2vr_b(*thresh1_ptr); + thresh0 = __lsx_vilvl_d(thresh1, thresh0); + + b_limit0 = __lsx_vreplgr2vr_b(*b_limit0_ptr); + b_limit1 = __lsx_vreplgr2vr_b(*b_limit1_ptr); + b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0); + + limit0 = __lsx_vreplgr2vr_b(*limit0_ptr); + limit1 = __lsx_vreplgr2vr_b(*limit1_ptr); + limit0 = __lsx_vilvl_d(limit1, limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + + __lsx_vstx(p1, src, -pitch2); + __lsx_vstx(p0, src, -pitch); + __lsx_vst(q0, src, 0); + __lsx_vstx(q1, src, pitch); +} + +void vpx_lpf_vertical_4_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + __m128i mask, hev, flat, limit, thresh, b_limit; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i vec0, vec1, vec2, vec3; + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + int32_t pitch4 = pitch2 << 1; + uint8_t *src_tmp = src - 4; + + p3 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, p2, p1); + p0 = __lsx_vldx(src_tmp, pitch3); + src_tmp += pitch4; + q0 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, q1, q2); + q3 = __lsx_vldx(src_tmp, pitch3); + + thresh = __lsx_vreplgr2vr_b(*thresh_ptr); + b_limit = __lsx_vreplgr2vr_b(*b_limit_ptr); + limit = __lsx_vreplgr2vr_b(*limit_ptr); + + LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, + q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + 
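+ /* filter4 on the transposed rows; the results are interleaved back into 4-byte columns before the element stores below */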
VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, vec0, vec1); + vec2 = __lsx_vilvl_h(vec1, vec0); + vec3 = __lsx_vilvh_h(vec1, vec0); + + src -= 2; + __lsx_vstelm_w(vec2, src, 0, 0); + src += pitch; + __lsx_vstelm_w(vec2, src, 0, 1); + src += pitch; + __lsx_vstelm_w(vec2, src, 0, 2); + src += pitch; + __lsx_vstelm_w(vec2, src, 0, 3); + src += pitch; + + __lsx_vstelm_w(vec3, src, 0, 0); + __lsx_vstelm_w(vec3, src + pitch, 0, 1); + __lsx_vstelm_w(vec3, src + pitch2, 0, 2); + __lsx_vstelm_w(vec3, src + pitch3, 0, 3); +} + +void vpx_lpf_vertical_4_dual_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + __m128i mask, hev, flat; + __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i row0, row1, row2, row3, row4, row5, row6, row7; + __m128i row8, row9, row10, row11, row12, row13, row14, row15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + int32_t pitch4 = pitch2 << 1; + uint8_t *src_tmp = src - 4; + + row0 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row1, row2); + row3 = __lsx_vldx(src_tmp, pitch3); + src_tmp += pitch4; + row4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row5, row6); + row7 = __lsx_vldx(src_tmp, pitch3); + src_tmp += pitch4; + row8 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row9, row10); + row11 = __lsx_vldx(src_tmp, pitch3); + src_tmp += pitch4; + row12 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row13, row14); + row15 = __lsx_vldx(src_tmp, pitch3); + + LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + + thresh0 = __lsx_vreplgr2vr_b(*thresh0_ptr); + thresh1 = __lsx_vreplgr2vr_b(*thresh1_ptr); + thresh0 = __lsx_vilvl_d(thresh1, thresh0); + + b_limit0 = __lsx_vreplgr2vr_b(*b_limit0_ptr); + b_limit1 = __lsx_vreplgr2vr_b(*b_limit1_ptr); + b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0); + + limit0 = __lsx_vreplgr2vr_b(*limit0_ptr); + limit1 = __lsx_vreplgr2vr_b(*limit1_ptr); + limit0 = __lsx_vilvl_d(limit1, limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1); + tmp2 = __lsx_vilvl_h(tmp1, tmp0); + tmp3 = __lsx_vilvh_h(tmp1, tmp0); + DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, tmp0, tmp1); + tmp4 = __lsx_vilvl_h(tmp1, tmp0); + tmp5 = __lsx_vilvh_h(tmp1, tmp0); + + src -= 2; + __lsx_vstelm_w(tmp2, src, 0, 0); + __lsx_vstelm_w(tmp2, src + pitch, 0, 1); + __lsx_vstelm_w(tmp2, src + pitch2, 0, 2); + __lsx_vstelm_w(tmp2, src + pitch3, 0, 3); + src += pitch4; + __lsx_vstelm_w(tmp3, src, 0, 0); + __lsx_vstelm_w(tmp3, src + pitch, 0, 1); + __lsx_vstelm_w(tmp3, src + pitch2, 0, 2); + __lsx_vstelm_w(tmp3, src + pitch3, 0, 3); + src += pitch4; + __lsx_vstelm_w(tmp4, src, 0, 0); + __lsx_vstelm_w(tmp4, src + pitch, 0, 1); + __lsx_vstelm_w(tmp4, src + pitch2, 0, 2); + __lsx_vstelm_w(tmp4, src + pitch3, 0, 3); + src += pitch4; + __lsx_vstelm_w(tmp5, src, 0, 0); + __lsx_vstelm_w(tmp5, src + pitch, 0, 1); + __lsx_vstelm_w(tmp5, src + 
pitch2, 0, 2); + __lsx_vstelm_w(tmp5, src + pitch3, 0, 3); +} diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 01886b8821..ec0c598031 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -203,6 +203,7 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_vert_dspr2.c DSP_SRCS-$(HAVE_LSX) += loongarch/loopfilter_lsx.h DSP_SRCS-$(HAVE_LSX) += loongarch/loopfilter_16_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/loopfilter_8_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/loopfilter_4_lsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_NEON) += arm/highbd_loopfilter_neon.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index e5617eea31..76b00e136c 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -451,10 +451,10 @@ () specialize qw/vpx_lpf_vertical_8_dual sse2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/vpx_lpf_vertical_4 sse2 neon dspr2 msa/; +specialize qw/vpx_lpf_vertical_4 sse2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa/; +specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/vpx_lpf_horizontal_16 sse2 avx2 neon dspr2 msa/; @@ -469,10 +469,10 @@ () specialize qw/vpx_lpf_horizontal_8_dual sse2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/vpx_lpf_horizontal_4 sse2 neon dspr2 msa/; +specialize qw/vpx_lpf_horizontal_4 sse2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/vpx_lpf_horizontal_4_dual sse2 neon dspr2 msa/; +specialize qw/vpx_lpf_horizontal_4_dual sse2 neon dspr2 msa lsx/; } #CONFIG_VP9 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { From 76b7350cee4a4f047c813134dba33594d0b2785b Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Fri, 8 Apr 2022 15:00:33 +0800 Subject: [PATCH 268/926] vp9[loongarch]: Optimize sub_pixel_variance32x32/sad16x16 1. vpx_sad16x16_lsx 2. 
vpx_sub_pixel_variance32x32_lsx Bug: webm:1755 Change-Id: I9926ace710903993ccbb42caef320fa895e90127 --- test/sad_test.cc | 1 + test/variance_test.cc | 3 + vpx_dsp/loongarch/sad_lsx.c | 35 ++- vpx_dsp/loongarch/sub_pixel_variance_lsx.c | 348 +++++++++++++++++++++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 6 files changed, 389 insertions(+), 3 deletions(-) create mode 100644 vpx_dsp/loongarch/sub_pixel_variance_lsx.c diff --git a/test/sad_test.cc b/test/sad_test.cc index aec4cbc380..e4952ba9f7 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1135,6 +1135,7 @@ INSTANTIATE_TEST_SUITE_P(MMI, SADx4Test, ::testing::ValuesIn(x4d_mmi_tests)); const SadMxNParam lsx_tests[] = { SadMxNParam(64, 64, &vpx_sad64x64_lsx), SadMxNParam(32, 32, &vpx_sad32x32_lsx), + SadMxNParam(16, 16, &vpx_sad16x16_lsx), }; INSTANTIATE_TEST_SUITE_P(LSX, SADTest, ::testing::ValuesIn(lsx_tests)); diff --git a/test/variance_test.cc b/test/variance_test.cc index 8060875197..6872ca2710 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1655,5 +1655,8 @@ INSTANTIATE_TEST_SUITE_P( LSX, VpxVarianceTest, ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_lsx), VarianceParams(5, 5, &vpx_variance32x32_lsx))); +INSTANTIATE_TEST_SUITE_P(LSX, VpxSubpelVarianceTest, + ::testing::Values(SubpelVarianceParams( + 5, 5, &vpx_sub_pixel_variance32x32_lsx, 0))); #endif } // namespace diff --git a/vpx_dsp/loongarch/sad_lsx.c b/vpx_dsp/loongarch/sad_lsx.c index 59b268ca1f..cd3f2d46bb 100644 --- a/vpx_dsp/loongarch/sad_lsx.c +++ b/vpx_dsp/loongarch/sad_lsx.c @@ -46,6 +46,33 @@ sum_m; \ }) +static uint32_t sad_16width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt = (height >> 2); + __m128i src0, src1, ref0, ref1, sad_tmp; + __m128i sad = __lsx_vldi(0); + int32_t src_stride2 = src_stride << 1; + int32_t ref_stride2 = ref_stride << 1; + + for (; ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0); + DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1); + src += src_stride2; + ref += ref_stride2; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0); + DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1); + src += src_stride2; + ref += ref_stride2; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + } + return HADD_UH_U32(sad); +} + static uint32_t sad_32width_lsx(const uint8_t *src, int32_t src_stride, const uint8_t *ref, int32_t ref_stride, int32_t height) { @@ -328,6 +355,12 @@ static void sad_64width_x4d_lsx(const uint8_t *src, int32_t src_stride, sad_array[3] = HADD_UW_U32(sad); } +#define VPX_SAD_16xHEIGHT_LSX(height) \ + uint32_t vpx_sad16x##height##_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_16width_lsx(src, src_stride, ref, ref_stride, height); \ + } + #define VPX_SAD_32xHT_LSX(height) \ uint32_t vpx_sad32x##height##_lsx(const uint8_t *src, int32_t src_stride, \ const uint8_t *ref, int32_t ref_stride) { \ @@ -369,7 +402,7 @@ SAD64 SAD32 -#define SAD16 VPX_SAD_16xHTx4D_LSX(16) +#define SAD16 VPX_SAD_16xHEIGHT_LSX(16) VPX_SAD_16xHTx4D_LSX(16) SAD16 diff --git a/vpx_dsp/loongarch/sub_pixel_variance_lsx.c b/vpx_dsp/loongarch/sub_pixel_variance_lsx.c new file mode 100644 index 0000000000..0a0486479a --- /dev/null +++ b/vpx_dsp/loongarch/sub_pixel_variance_lsx.c @@ -0,0 +1,348 @@ +/* + * Copyright (c) 2022 The 
WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_ports/mem.h" +#include "vpx_util/loongson_intrinsics.h" +#include "vpx_dsp/variance.h" + +#define HADD_SW_S32(in0, in1) \ + do { \ + __m128i res0_m; \ + \ + res0_m = __lsx_vhaddw_d_w(in0, in0); \ + res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); \ + in1 = __lsx_vpickve2gr_w(res0_m, 0); \ + } while (0) + +#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift, in2) \ + do { \ + __m128i tmp0_m, tmp1_m; \ + \ + tmp0_m = __lsx_vshuf_b(in1, in0, mask); \ + tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff); \ + in2 = __lsx_vsrari_h(tmp1_m, shift); \ + } while (0) + +#define CALC_MSE_AVG_B(src, ref, var, sub) \ + { \ + __m128i src_l0_m, src_l1_m; \ + __m128i res_l0_m, res_l1_m; \ + \ + src_l0_m = __lsx_vilvl_b(src, ref); \ + src_l1_m = __lsx_vilvh_b(src, ref); \ + DUP2_ARG2(__lsx_vhsubw_hu_bu, src_l0_m, src_l0_m, src_l1_m, src_l1_m, \ + res_l0_m, res_l1_m); \ + var = __lsx_vdp2add_w_h(var, res_l0_m, res_l0_m); \ + var = __lsx_vdp2add_w_h(var, res_l1_m, res_l1_m); \ + sub = __lsx_vadd_h(sub, res_l0_m); \ + sub = __lsx_vadd_h(sub, res_l1_m); \ + } + +static const uint8_t bilinear_filters_lsx[8][2] = { + { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, +}; + +#define VARIANCE_LARGE_WxH(sse, diff, shift) \ + (sse) - (((int64_t)(diff) * (diff)) >> (shift)) + +static uint32_t sub_pixel_sse_diff_16width_h_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i dst0, dst1, dst2, dst3, filt0; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; + __m128i vec, var = __lsx_vldi(0); + __m128i avg = var; + __m128i mask = { 0x0403030202010100, 0x0807070606050504 }; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7); + src += src_stride; + + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, 
out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + src0, src1, src2, src3); + CALC_MSE_AVG_B(src0, dst0, var, avg); + CALC_MSE_AVG_B(src1, dst1, var, avg); + CALC_MSE_AVG_B(src2, dst2, var, avg); + CALC_MSE_AVG_B(src3, dst3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + + return res; +} + +static uint32_t sub_pixel_sse_diff_32width_h_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t sse = 0; + int32_t diff0[2]; + + sse += sub_pixel_sse_diff_16width_h_lsx(src, src_stride, dst, dst_stride, + filter, height, &diff0[0]); + src += 16; + dst += 16; + + sse += sub_pixel_sse_diff_16width_h_lsx(src, src_stride, dst, dst_stride, + filter, height, &diff0[1]); + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_16width_v_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4; + __m128i out0, out1, out2, out3, tmp0, tmp1, filt0, vec; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i var = __lsx_vldi(0); + __m128i avg = var; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + ref0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2); + ref3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + src0 = src4; + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + + return res; +} + +static uint32_t sub_pixel_sse_diff_32width_v_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t sse = 0; + int32_t diff0[2]; + + sse += sub_pixel_sse_diff_16width_v_lsx(src, src_stride, dst, dst_stride, + filter, height, &diff0[0]); + src += 16; + dst += 16; + + sse += sub_pixel_sse_diff_16width_v_lsx(src, src_stride, dst, 
dst_stride, + filter, height, &diff0[1]); + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_16width_hv_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i ref0, ref1, ref2, ref3, filt_hz, filt_vt, vec0, vec1; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1, vec; + __m128i var = __lsx_vldi(0); + __m128i avg = var; + __m128i mask = { 0x0403030202010100, 0x0807070606050504 }; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + + HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out2); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7); + src += src_stride; + + ref0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2); + ref3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + + HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out1); + HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out3); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + src0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out2); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + src1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out1); + HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS, hz_out3); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + src2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS, hz_out2); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + src3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + CALC_MSE_AVG_B(src2, ref2, var, avg); + CALC_MSE_AVG_B(src3, ref3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + + return res; +} + +static uint32_t sub_pixel_sse_diff_32width_hv_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + uint32_t sse = 0; + int32_t diff0[2]; + + sse += 
sub_pixel_sse_diff_16width_hv_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height, + &diff0[0]); + src += 16; + dst += 16; + + sse += sub_pixel_sse_diff_16width_hv_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height, + &diff0[1]); + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10) + +#define VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(wd, ht) \ + uint32_t vpx_sub_pixel_variance##wd##x##ht##_lsx( \ + const uint8_t *src, int32_t src_stride, int32_t x_offset, \ + int32_t y_offset, const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sse) { \ + int32_t diff; \ + uint32_t var; \ + const uint8_t *h_filter = bilinear_filters_lsx[x_offset]; \ + const uint8_t *v_filter = bilinear_filters_lsx[y_offset]; \ + \ + if (y_offset) { \ + if (x_offset) { \ + *sse = sub_pixel_sse_diff_##wd##width_hv_lsx( \ + src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_sse_diff_##wd##width_v_lsx( \ + src, src_stride, ref, ref_stride, v_filter, ht, &diff); \ + } \ + \ + var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } else { \ + if (x_offset) { \ + *sse = sub_pixel_sse_diff_##wd##width_h_lsx( \ + src, src_stride, ref, ref_stride, h_filter, ht, &diff); \ + \ + var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } else { \ + var = vpx_variance##wd##x##ht##_lsx(src, src_stride, ref, ref_stride, \ + sse); \ + } \ + } \ + \ + return var; \ + } + +VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(32, 32) diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index ec0c598031..5c3ffe97d4 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -399,6 +399,7 @@ DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c DSP_SRCS-$(HAVE_LSX) += loongarch/variance_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/sub_pixel_variance_lsx.c DSP_SRCS-$(HAVE_MMI) += mips/variance_mmi.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 76b00e136c..932099243f 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -754,7 +754,7 @@ () specialize qw/vpx_sad16x32 neon msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad16x16 neon msa sse2 vsx mmi/; +specialize qw/vpx_sad16x16 neon msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad16x8 neon msa sse2 vsx mmi/; @@ -1162,7 +1162,7 @@ () specialize qw/vpx_sub_pixel_variance32x64 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa mmi sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa mmi sse2 ssse3 lsx/; add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance32x16 neon msa mmi sse2 ssse3/; From 192c85c4312f84eefc2bcc92b7fa7e8a685c5700 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 25 Apr 2022 11:20:48 -0700 Subject: [PATCH 269/926] add_noise_test.cc: remove stale TODO this was completed in: 0dc69c70f postproc : fix function parameters for noise functions. 
Change-Id: I84f789ca333e9690e70e696d44475dd59339593b
---
 test/add_noise_test.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/add_noise_test.cc b/test/add_noise_test.cc
index 25de4279c2..7dc86e3eb6 100644
--- a/test/add_noise_test.cc
+++ b/test/add_noise_test.cc
@@ -23,7 +23,6 @@ namespace {
 
 static const int kNoiseSize = 3072;
 
-// TODO(jimbankoski): make width and height integers not unsigned.
 typedef void (*AddNoiseFunc)(uint8_t *start, const int8_t *noise,
                              int blackclamp, int whiteclamp, int width,
                              int height, int pitch);

From d18407a171ef7a0108f961c12794ddb32ad5c9ab Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Mon, 25 Apr 2022 15:12:02 -0700
Subject: [PATCH 270/926] register_state_check.h: add compiler barrier around
 ASM_REGISTER_STATE_CHECK()

this helps keep the call ordering consistent, avoiding some code
reordering which may affect the registers being checked

fixes issue with armv7 and multiple versions of gcc:
[ RUN      ] C/AddNoiseTest.CheckNoiseAdded/0
test/register_state_check.h:116: Failure
Expected equality of these values:
  pre_store_[i]
    Which is: 0
  post_store[i]
    Which is: 4618441417868443648

Bug: webm:1760
Change-Id: Ib8bcefd2c4d263f9fc4d4b4d4ffb853fe89d1152
Fixed: webm:1760
---
 test/register_state_check.h | 34 ++++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/test/register_state_check.h b/test/register_state_check.h
index 4366466378..1746240c61 100644
--- a/test/register_state_check.h
+++ b/test/register_state_check.h
@@ -35,6 +35,7 @@
 #ifndef WIN32_LEAN_AND_MEAN
 #define WIN32_LEAN_AND_MEAN
 #endif
+#include <intrin.h>
 #include <windows.h>
 #include <winnt.h>
 
@@ -81,10 +82,13 @@ class RegisterStateCheck {
   CONTEXT pre_context_;
 };
 
-#define ASM_REGISTER_STATE_CHECK(statement) \
-  do {                                      \
-    libvpx_test::RegisterStateCheck reg_check; \
-    statement;                              \
+#define ASM_REGISTER_STATE_CHECK(statement)      \
+  do {                                           \
+    {                                            \
+      libvpx_test::RegisterStateCheck reg_check; \
+      statement;                                 \
+    }                                            \
+    _ReadWriteBarrier();                         \
   } while (false)
 
 }  // namespace libvpx_test
@@ -121,11 +125,22 @@ class RegisterStateCheck {
   int64_t pre_store_[8];
 };
 
+#if defined(__GNUC__)
+#define ASM_REGISTER_STATE_CHECK(statement)      \
+  do {                                           \
+    {                                            \
+      libvpx_test::RegisterStateCheck reg_check; \
+      statement;                                 \
+    }                                            \
+    __asm__ volatile("" ::: "memory");           \
+  } while (false)
+#else
 #define ASM_REGISTER_STATE_CHECK(statement) \
   do {                                      \
     libvpx_test::RegisterStateCheck reg_check; \
     statement;                              \
   } while (false)
+#endif
 
 }  // namespace libvpx_test
 
@@ -169,10 +184,13 @@ class RegisterStateCheckMMX {
   uint16_t pre_fpu_env_[14];
 };
 
-#define API_REGISTER_STATE_CHECK(statement) \
-  do {                                      \
-    libvpx_test::RegisterStateCheckMMX reg_check; \
-    ASM_REGISTER_STATE_CHECK(statement);    \
+#define API_REGISTER_STATE_CHECK(statement)         \
+  do {                                              \
+    {                                               \
+      libvpx_test::RegisterStateCheckMMX reg_check; \
+      ASM_REGISTER_STATE_CHECK(statement);          \
+    }                                               \
+    __asm__ volatile("" ::: "memory");              \
   } while (false)
 
 }  // namespace libvpx_test

From f6de5b51b8338ebd743a465e84d2c4b73cc29082 Mon Sep 17 00:00:00 2001
From: yuanhecai <yuanhecai@loongson.cn>
Date: Tue, 5 Apr 2022 18:17:19 +0800
Subject: [PATCH 271/926] vp9[loongarch]: Optimize fdct/get/variance16x16

1. vpx_fdct16x16_lsx
2. vpx_get16x16var_lsx
3. vpx_variance16x16_lsx
Bug: webm:1755
Change-Id: I27090406dc28cfdca64760fea4bc16ae11b74628
---
 test/dct16x16_test.cc               |   7 +
 test/dct_test.cc                    |  16 ++
 test/variance_test.cc               |   4 +-
 vpx_dsp/loongarch/fwd_txfm_lsx.c    | 258 ++++++++++++++++++++++++++++
 vpx_dsp/loongarch/fwd_txfm_lsx.h    | 166 ++++++++++++++++++
 vpx_dsp/loongarch/txfm_macros_lsx.h |   8 +
 vpx_dsp/loongarch/variance_lsx.c    |  54 +++++-
 vpx_dsp/vpx_dsp.mk                  |   1 +
 vpx_dsp/vpx_dsp_rtcd_defs.pl        |   6 +-
 9 files changed, 514 insertions(+), 6 deletions(-)
 create mode 100644 vpx_dsp/loongarch/fwd_txfm_lsx.c

diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index c04880ec95..06837d809d 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -868,4 +868,11 @@ INSTANTIATE_TEST_SUITE_P(
     ::testing::Values(make_tuple(&vpx_fdct16x16_c,
                                  &vpx_idct16x16_256_add_vsx, 0, VPX_BITS_8)));
 #endif  // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_SUITE_P(LSX, Trans16x16DCT,
+                         ::testing::Values(make_tuple(&vpx_fdct16x16_lsx,
+                                                      &vpx_idct16x16_256_add_c,
+                                                      0, VPX_BITS_8)));
+#endif  // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace

diff --git a/test/dct_test.cc b/test/dct_test.cc
index 20e081a24c..6178f8e2cf 100644
--- a/test/dct_test.cc
+++ b/test/dct_test.cc
@@ -586,6 +586,21 @@ INSTANTIATE_TEST_SUITE_P(VSX, TransDCT,
                                            VPX_BITS_8)));
 #endif  // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH &&
 
+#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
+static const FuncInfo dct_lsx_func_info[2] = {
+  { &fdct_wrapper<vpx_fdct16x16_lsx>, &idct_wrapper<vpx_idct16x16_256_add_c>,
+    16, 1 },
+  { &fdct_wrapper<vpx_fdct32x32_lsx>, &idct_wrapper<vpx_idct32x32_1024_add_c>,
+    32, 1 }
+};
+
+INSTANTIATE_TEST_SUITE_P(
+    LSX, TransDCT,
+    ::testing::Combine(::testing::Range(0, 2),
+                       ::testing::Values(dct_lsx_func_info),
+                       ::testing::Values(0), ::testing::Values(VPX_BITS_8)));
+#endif  // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
+
 #endif  // !CONFIG_EMULATE_HARDWARE
 
 /* -------------------------------------------------------------------------- */
@@ -756,4 +771,5 @@ INSTANTIATE_TEST_SUITE_P(VSX, TransWHT,
                          ::testing::Values(make_tuple(0, &wht_vsx_func_info, 0,
                                                       VPX_BITS_8)));
 #endif  // HAVE_VSX && !CONFIG_EMULATE_HARDWARE
+
 }  // namespace

diff --git a/test/variance_test.cc b/test/variance_test.cc
index 6872ca2710..11983bb8ac 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -1654,7 +1654,9 @@ INSTANTIATE_TEST_SUITE_P(
 INSTANTIATE_TEST_SUITE_P(
     LSX, VpxVarianceTest,
     ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_lsx),
-                      VarianceParams(5, 5, &vpx_variance32x32_lsx)));
+                      VarianceParams(5, 5, &vpx_variance32x32_lsx),
+                      VarianceParams(4, 4, &vpx_variance16x16_lsx)));
+
 INSTANTIATE_TEST_SUITE_P(LSX, VpxSubpelVarianceTest,
                          ::testing::Values(SubpelVarianceParams(
                              5, 5, &vpx_sub_pixel_variance32x32_lsx, 0)));

diff --git a/vpx_dsp/loongarch/fwd_txfm_lsx.c b/vpx_dsp/loongarch/fwd_txfm_lsx.c
new file mode 100644
index 0000000000..03f194b433
--- /dev/null
+++ b/vpx_dsp/loongarch/fwd_txfm_lsx.c
@@ -0,0 +1,258 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/fwd_txfm_lsx.h" + +#if !CONFIG_VP9_HIGHBITDEPTH +void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, + int32_t src_stride) { + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i stp21, stp22, stp23, stp24, stp25, stp26, stp30; + __m128i stp31, stp32, stp33, stp34, stp35, stp36, stp37; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5; + __m128i coeff = { 0x187e3b21d2bf2d41, 0x238e3537e782c4df }; + __m128i coeff1 = { 0x289a317906463fb1, 0x12943d3f1e2b3871 }; + __m128i coeff2 = { 0xed6cd766c78fc04f, 0x0 }; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride4 = src_stride2 << 1; + int32_t src_stride6 = src_stride4 + src_stride2; + int32_t src_stride8 = src_stride4 << 1; + int16_t *input_tmp = (int16_t *)input; + in0 = __lsx_vld(input_tmp, 0); + DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, + input_tmp, src_stride6, input_tmp, src_stride8, in1, in2, in3, in4); + input_tmp += src_stride4; + DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, + input_tmp, src_stride6, input_tmp, src_stride8, in5, in6, in7, in8); + input_tmp += src_stride4; + DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, + input_tmp, src_stride6, input_tmp, src_stride8, in9, in10, in11, + in12); + input_tmp += src_stride4; + DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in13, + in14); + input_tmp += src_stride2; + in15 = __lsx_vldx(input_tmp, src_stride2); + + DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vslli_h, in8, 2, in9, 2, in10, 2, in11, 2, in8, in9, in10, + in11); + DUP4_ARG2(__lsx_vslli_h, in12, 2, in13, 2, in14, 2, in15, 2, in12, in13, in14, + in15); + DUP4_ARG2(__lsx_vadd_h, in0, in15, in1, in14, in2, in13, in3, in12, tmp0, + tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vadd_h, in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, + tmp6, tmp7); + FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, + tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + __lsx_vst(tmp0, tmp_ptr, 0); + __lsx_vst(tmp1, tmp_ptr, 64); + __lsx_vst(tmp2, tmp_ptr, 128); + __lsx_vst(tmp3, tmp_ptr, 192); + __lsx_vst(tmp4, tmp_ptr, 256); + __lsx_vst(tmp5, tmp_ptr, 320); + __lsx_vst(tmp6, tmp_ptr, 384); + __lsx_vst(tmp7, tmp_ptr, 448); + DUP4_ARG2(__lsx_vsub_h, in0, in15, in1, in14, in2, in13, in3, in12, in15, + in14, in13, in12); + DUP4_ARG2(__lsx_vsub_h, in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, + in9, in8); + + tmp_ptr += 16; + + /* stp 1 */ + DUP2_ARG2(__lsx_vilvh_h, in10, in13, in11, in12, vec2, vec4); + DUP2_ARG2(__lsx_vilvl_h, in10, in13, in11, in12, vec3, vec5); + + cnst4 = __lsx_vreplvei_h(coeff, 0); + DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4, stp25); + + cnst5 = __lsx_vreplvei_h(coeff, 1); + cnst5 = __lsx_vpackev_h(cnst5, cnst4); + DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5, stp22); + DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4, stp24); + DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5, stp23); + + /* stp2 */ + LSX_BUTTERFLY_4_H(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33); + LSX_BUTTERFLY_4_H(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34); + DUP2_ARG2(__lsx_vilvh_h, stp36, stp31, stp35, stp32, vec2, vec4); + DUP2_ARG2(__lsx_vilvl_h, stp36, stp31, stp35, stp32, vec3, vec5); + 
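  /* Each DOT_SHIFT_RIGHT_PCK_H below is one butterfly rotation: a widening
+     dot product against a packed pair of DCT cosine constants, then a
+     rounding narrow shift by DCT_CONST_BITS. */
+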
DUP2_ARG2(__lsx_vreplvei_h, coeff, 2, coeff, 3, cnst0, cnst1); + cnst0 = __lsx_vpackev_h(cnst0, cnst1); + DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0, stp26); + + cnst0 = __lsx_vreplvei_h(coeff, 4); + cnst1 = __lsx_vpackev_h(cnst1, cnst0); + DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1, stp21); + + LSX_BUTTERFLY_4_H(stp30, stp37, stp26, stp21, in8, in15, in14, in9); + vec1 = __lsx_vilvl_h(in15, in8); + vec0 = __lsx_vilvh_h(in15, in8); + + DUP2_ARG2(__lsx_vreplvei_h, coeff1, 0, coeff1, 1, cnst0, cnst1); + cnst0 = __lsx_vpackev_h(cnst0, cnst1); + + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 0); + + cnst0 = __lsx_vreplvei_h(coeff2, 0); + cnst0 = __lsx_vpackev_h(cnst1, cnst0); + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 448); + + vec1 = __lsx_vilvl_h(in14, in9); + vec0 = __lsx_vilvh_h(in14, in9); + DUP2_ARG2(__lsx_vreplvei_h, coeff1, 2, coeff1, 3, cnst0, cnst1); + cnst1 = __lsx_vpackev_h(cnst1, cnst0); + + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1, in8); + __lsx_vst(in8, tmp_ptr, 256); + + cnst1 = __lsx_vreplvei_h(coeff2, 2); + cnst0 = __lsx_vpackev_h(cnst0, cnst1); + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 192); + + DUP2_ARG2(__lsx_vreplvei_h, coeff, 2, coeff, 5, cnst0, cnst1); + cnst1 = __lsx_vpackev_h(cnst1, cnst0); + DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1, stp25); + + cnst1 = __lsx_vreplvei_h(coeff, 3); + cnst1 = __lsx_vpackev_h(cnst0, cnst1); + DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1, stp22); + + /* stp4 */ + DUP2_ARG2(__lsx_vadd_h, stp34, stp25, stp33, stp22, in13, in10); + + vec1 = __lsx_vilvl_h(in13, in10); + vec0 = __lsx_vilvh_h(in13, in10); + DUP2_ARG2(__lsx_vreplvei_h, coeff1, 4, coeff1, 5, cnst0, cnst1); + cnst0 = __lsx_vpackev_h(cnst0, cnst1); + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 128); + + cnst0 = __lsx_vreplvei_h(coeff2, 1); + cnst0 = __lsx_vpackev_h(cnst1, cnst0); + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 320); + + DUP2_ARG2(__lsx_vsub_h, stp34, stp25, stp33, stp22, in12, in11); + vec1 = __lsx_vilvl_h(in12, in11); + vec0 = __lsx_vilvh_h(in12, in11); + DUP2_ARG2(__lsx_vreplvei_h, coeff1, 6, coeff1, 7, cnst0, cnst1); + cnst1 = __lsx_vpackev_h(cnst1, cnst0); + + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1, in8); + __lsx_vst(in8, tmp_ptr, 384); + + cnst1 = __lsx_vreplvei_h(coeff2, 3); + cnst0 = __lsx_vpackev_h(cnst0, cnst1); + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 64); +} + +void fdct16x8_1d_row(int16_t *input, int16_t *output) { + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + int16_t *input_tmp = input; + + DUP4_ARG2(__lsx_vld, input, 0, input, 32, input, 64, input, 96, in0, in1, in2, + in3); + DUP4_ARG2(__lsx_vld, input, 128, input, 160, input, 192, input, 224, in4, in5, + in6, in7); + DUP4_ARG2(__lsx_vld, input_tmp, 16, input_tmp, 48, input_tmp, 80, input_tmp, + 112, in8, in9, in10, in11); + DUP4_ARG2(__lsx_vld, input_tmp, 144, input_tmp, 176, input_tmp, 208, + input_tmp, 240, in12, in13, in14, in15); + + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vaddi_hu, in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7); + 
DUP4_ARG2(__lsx_vaddi_hu, in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, + in11); + DUP4_ARG2(__lsx_vaddi_hu, in12, 1, in13, 1, in14, 1, in15, 1, in12, in13, + in14, in15); + + DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vsrai_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vsrai_h, in8, 2, in9, 2, in10, 2, in11, 2, in8, in9, in10, + in11); + DUP4_ARG2(__lsx_vsrai_h, in12, 2, in13, 2, in14, 2, in15, 2, in12, in13, in14, + in15); + LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, + tmp5, tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14, + in15); + __lsx_vst(in8, input, 0); + __lsx_vst(in9, input, 32); + __lsx_vst(in10, input, 64); + __lsx_vst(in11, input, 96); + __lsx_vst(in12, input, 128); + __lsx_vst(in13, input, 160); + __lsx_vst(in14, input, 192); + __lsx_vst(in15, input, 224); + + FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, + tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + DUP4_ARG2(__lsx_vld, input, 0, input, 32, input, 64, input, 96, in8, in9, + in10, in11); + DUP4_ARG2(__lsx_vld, input, 128, input, 160, input, 192, input, 224, in12, + in13, in14, in15); + FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3, + in4, in5, in6, in7); + LSX_TRANSPOSE8x8_H(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0, + tmp1, in1, tmp2, in2, tmp3, in3); + __lsx_vst(tmp0, output, 0); + __lsx_vst(in0, output, 32); + __lsx_vst(tmp1, output, 64); + __lsx_vst(in1, output, 96); + __lsx_vst(tmp2, output, 128); + __lsx_vst(in2, output, 160); + __lsx_vst(tmp3, output, 192); + __lsx_vst(in3, output, 224); + + LSX_TRANSPOSE8x8_H(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4, + tmp5, in5, tmp6, in6, tmp7, in7); + __lsx_vst(tmp4, output, 16); + __lsx_vst(in4, output, 48); + __lsx_vst(tmp5, output, 80); + __lsx_vst(in5, output, 112); + __lsx_vst(tmp6, output, 144); + __lsx_vst(in6, output, 176); + __lsx_vst(tmp7, output, 208); + __lsx_vst(in7, output, 240); +} + +void vpx_fdct16x16_lsx(const int16_t *input, int16_t *output, + int32_t src_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]); + + /* column transform */ + for (i = 0; i < 2; ++i) { + fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride); + } + + /* row transform */ + for (i = 0; i < 2; ++i) { + fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i))); + } +} +#endif // !CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/loongarch/fwd_txfm_lsx.h b/vpx_dsp/loongarch/fwd_txfm_lsx.h index a6f62dbc81..9ed8102269 100644 --- a/vpx_dsp/loongarch/fwd_txfm_lsx.h +++ b/vpx_dsp/loongarch/fwd_txfm_lsx.h @@ -113,4 +113,170 @@ __lsx_vstelm_d(tmp1_m, dst + _stride3, 0, 1); \ } +#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ + __m128i x0_m, x1_m, x2_m, x3_m; \ + __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \ + \ + /* FDCT stage1 */ \ + LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, \ + s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \ + LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ + DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ + DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m); \ + x1_m = __lsx_vpackev_h(x1_m, x0_m); \ + 
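    /* x1_m now packs the {cospi_16_64, -cospi_16_64} constant pair        \
+       (0x2d41 = cospi_16_64); the dot products below rotate the even     \
+       butterfly outputs and round-shift them by DCT_CONST_BITS. */       \
+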
DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m); \ + x2_m = __lsx_vneg_h(x2_m); \ + x2_m = __lsx_vpackev_h(x3_m, x2_m); \ + DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6); \ + \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0); \ + x2_m = __lsx_vreplvei_h(coeff_m, 2); \ + x2_m = __lsx_vpackev_h(x2_m, x3_m); \ + DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2); \ + \ + /* stage2 */ \ + s1_m = __lsx_vilvl_h(s5_m, s6_m); \ + s0_m = __lsx_vilvh_h(s5_m, s6_m); \ + \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m); \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m); \ + \ + /* stage3 */ \ + LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ + \ + /* stage4 */ \ + DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ + DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m); \ + x1_m = __lsx_vpackev_h(x0_m, x1_m); \ + DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m); \ + x2_m = __lsx_vpackev_h(x3_m, x2_m); \ + DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5); \ + \ + x1_m = __lsx_vreplvei_h(coeff_m, 5); \ + x0_m = __lsx_vneg_h(x0_m); \ + x0_m = __lsx_vpackev_h(x1_m, x0_m); \ + DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7); \ + \ + x2_m = __lsx_vreplvei_h(coeff_m, 6); \ + x3_m = __lsx_vneg_h(x3_m); \ + x2_m = __lsx_vpackev_h(x2_m, x3_m); \ + DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \ + } + +#define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6, \ + input7, out1, out3, out5, out7, out9, out11, out13, \ + out15) \ + { \ + __m128i stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \ + __m128i stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \ + __m128i stp36_m, stp37_m, vec0_m, vec1_m; \ + __m128i vec2_m, vec3_m, vec4_m, vec5_m, vec6_m; \ + __m128i cnst0_m, cnst1_m, cnst4_m, cnst5_m; \ + __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e3537e782c4df }; \ + __m128i coeff1_m = { 0x289a317906463fb1, 0x12943d3f1e2b3871 }; \ + __m128i coeff2_m = { 0xed6cd766c78fc04f, 0x0 }; \ + \ + /* stp 1 */ \ + DUP2_ARG2(__lsx_vilvh_h, input2, input5, input3, input4, vec2_m, vec4_m); \ + DUP2_ARG2(__lsx_vilvl_h, input2, input5, input3, input4, vec3_m, vec5_m); \ + \ + cnst4_m = __lsx_vreplvei_h(coeff_m, 0); \ + DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m, stp25_m); \ + \ + cnst5_m = __lsx_vreplvei_h(coeff_m, 1); \ + cnst5_m = __lsx_vpackev_h(cnst5_m, cnst4_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m, stp22_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m, stp24_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m, stp23_m); \ + \ + /* stp2 */ \ + LSX_BUTTERFLY_4_H(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m, \ + stp32_m, stp33_m); \ + LSX_BUTTERFLY_4_H(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m, \ + stp35_m, stp34_m); \ + \ + DUP2_ARG2(__lsx_vilvh_h, stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, \ + vec4_m); \ + DUP2_ARG2(__lsx_vilvl_h, stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, \ + vec5_m); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, cnst0_m, cnst1_m); \ + cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m, stp26_m); \ + \ + cnst0_m = __lsx_vreplvei_h(coeff_m, 4); \ + cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m, stp21_m); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 5, coeff_m, 2, cnst0_m, cnst1_m); \ + cnst1_m = 
__lsx_vpackev_h(cnst0_m, cnst1_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m, stp25_m); \ + \ + cnst0_m = __lsx_vreplvei_h(coeff_m, 3); \ + cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m, stp22_m); \ + \ + /* stp4 */ \ + LSX_BUTTERFLY_4_H(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m, \ + vec4_m, vec5_m); \ + LSX_BUTTERFLY_4_H(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m, \ + stp24_m, stp31_m); \ + \ + vec1_m = __lsx_vilvl_h(vec2_m, vec6_m); \ + vec0_m = __lsx_vilvh_h(vec2_m, vec6_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 0, coeff1_m, 1, cnst0_m, cnst1_m); \ + cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ + \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out1); \ + \ + cnst0_m = __lsx_vreplvei_h(coeff2_m, 0); \ + cnst0_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out15); \ + \ + vec1_m = __lsx_vilvl_h(vec4_m, vec5_m); \ + vec0_m = __lsx_vilvh_h(vec4_m, vec5_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 2, coeff1_m, 3, cnst0_m, cnst1_m); \ + cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m, out9); \ + \ + cnst1_m = __lsx_vreplvei_h(coeff2_m, 2); \ + cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out7); \ + \ + vec1_m = __lsx_vilvl_h(stp23_m, stp21_m); \ + vec0_m = __lsx_vilvh_h(stp23_m, stp21_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 4, coeff1_m, 5, cnst0_m, cnst1_m); \ + cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out5); \ + \ + cnst0_m = __lsx_vreplvei_h(coeff2_m, 1); \ + cnst0_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out11); \ + \ + vec1_m = __lsx_vilvl_h(stp24_m, stp31_m); \ + vec0_m = __lsx_vilvh_h(stp24_m, stp31_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 6, coeff1_m, 7, cnst0_m, cnst1_m); \ + cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m, out13); \ + \ + cnst1_m = __lsx_vreplvei_h(coeff2_m, 3); \ + cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out3); \ + } + +void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, + int32_t src_stride); +void fdct16x8_1d_row(int16_t *input, int16_t *output); #endif // VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_ diff --git a/vpx_dsp/loongarch/txfm_macros_lsx.h b/vpx_dsp/loongarch/txfm_macros_lsx.h index bc6f7dacc9..977f1c2dd0 100644 --- a/vpx_dsp/loongarch/txfm_macros_lsx.h +++ b/vpx_dsp/loongarch/txfm_macros_lsx.h @@ -44,4 +44,12 @@ out1 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \ } +#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2, in3) \ + do { \ + __m128i tp0_m, tp1_m; \ + \ + DUP2_ARG2(__lsx_vdp2_w_h, in0, in2, in1, in2, tp1_m, tp0_m); \ + in3 = __lsx_vssrarni_h_w(tp1_m, tp0_m, DCT_CONST_BITS); \ + } while (0) + #endif // VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_ diff --git a/vpx_dsp/loongarch/variance_lsx.c b/vpx_dsp/loongarch/variance_lsx.c index 8164e98189..8f2ec0563f 100644 --- a/vpx_dsp/loongarch/variance_lsx.c +++ b/vpx_dsp/loongarch/variance_lsx.c @@ -37,9 +37,50 @@ sub = __lsx_vadd_h(sub, res_l1_m); \ } +#define VARIANCE_WxH(sse, diff, shift) \ + (sse) - (((uint32_t)(diff) * (diff)) >> (shift)) + #define VARIANCE_LARGE_WxH(sse, diff, shift) \ (sse) - (((int64_t)(diff) * (diff)) >> (shift)) +static uint32_t sse_diff_16width_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t 
ref_stride, + int32_t height, int32_t *diff) { + int32_t ht_cnt = (height >> 2); + __m128i src, ref, vec; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + + for (; ht_cnt--;) { + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref = __lsx_vld(ref_ptr, 0); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref = __lsx_vld(ref_ptr, 0); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref = __lsx_vld(ref_ptr, 0); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref = __lsx_vld(ref_ptr, 0); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + static uint32_t sse_diff_32width_lsx(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride, int32_t height, int32_t *diff) { @@ -133,8 +174,10 @@ static uint32_t sse_diff_64x64_lsx(const uint8_t *src_ptr, int32_t src_stride, return HADD_SW_S32(var); } -#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10); -#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12); +#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8) + +#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10) +#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12) #define VPX_VARIANCE_WDXHT_LSX(wd, ht) \ uint32_t vpx_variance##wd##x##ht##_lsx( \ @@ -148,6 +191,7 @@ static uint32_t sse_diff_64x64_lsx(const uint8_t *src_ptr, int32_t src_stride, return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ } +VPX_VARIANCE_WDXHT_LSX(16, 16) VPX_VARIANCE_WDXHT_LSX(32, 32) uint32_t vpx_variance64x64_lsx(const uint8_t *src, int32_t src_stride, @@ -159,3 +203,9 @@ uint32_t vpx_variance64x64_lsx(const uint8_t *src, int32_t src_stride, return VARIANCE_64Wx64H(*sse, diff); } + +void vpx_get16x16var_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, uint32_t *sse, + int32_t *sum) { + *sse = sse_diff_16width_lsx(src, src_stride, ref, ref_stride, 16, sum); +} diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 5c3ffe97d4..efb253c68d 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -236,6 +236,7 @@ DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_txfm_lsx.h +DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_txfm_lsx.c ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_MSA) += mips/fwd_dct32x32_msa.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 932099243f..4ad698cabe 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -585,7 +585,7 @@ () specialize qw/vpx_fdct8x8_1 sse2 neon msa/; add_proto qw/void vpx_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct16x16 neon sse2 msa/; + specialize qw/vpx_fdct16x16 neon sse2 msa lsx/; add_proto qw/void vpx_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct16x16_1 sse2 neon msa/; @@ -1099,7 +1099,7 @@ () specialize qw/vpx_variance16x32 sse2 avx2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x16 
sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance16x16 sse2 avx2 neon msa mmi vsx lsx/; add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_variance16x8 sse2 avx2 neon msa mmi vsx/; @@ -1123,7 +1123,7 @@ () # Specialty Variance # add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_get16x16var sse2 avx2 neon msa vsx/; + specialize qw/vpx_get16x16var sse2 avx2 neon msa vsx lsx/; add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; specialize qw/vpx_get8x8var sse2 neon msa vsx/; From 19b45a26c62170c1fb0dfd18a083ddb84ef7e4a4 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 26 Apr 2022 18:42:24 -0700 Subject: [PATCH 272/926] vp9,encode_tiles_buffer_alloc: fix allocation check previously vp9_bitstream_worker_data was checked after it was memset(); this change uses CHECK_MEM_ERROR for consistency to ensure the pointer is checked first Change-Id: I532d0eb0e746dc6b8d694b616eba693c5c0053ac --- vp9/encoder/vp9_bitstream.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 99cc2ee831..75bd097f24 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -963,21 +963,20 @@ void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi) { } } -static int encode_tiles_buffer_alloc(VP9_COMP *const cpi) { +static void encode_tiles_buffer_alloc(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; int i; const size_t worker_data_size = cpi->num_workers * sizeof(*cpi->vp9_bitstream_worker_data); - cpi->vp9_bitstream_worker_data = vpx_memalign(16, worker_data_size); + CHECK_MEM_ERROR(cm, cpi->vp9_bitstream_worker_data, + vpx_memalign(16, worker_data_size)); memset(cpi->vp9_bitstream_worker_data, 0, worker_data_size); - if (!cpi->vp9_bitstream_worker_data) return 1; for (i = 1; i < cpi->num_workers; ++i) { cpi->vp9_bitstream_worker_data[i].dest_size = cpi->oxcf.width * cpi->oxcf.height; - cpi->vp9_bitstream_worker_data[i].dest = - vpx_malloc(cpi->vp9_bitstream_worker_data[i].dest_size); - if (!cpi->vp9_bitstream_worker_data[i].dest) return 1; + CHECK_MEM_ERROR(cm, cpi->vp9_bitstream_worker_data[i].dest, + vpx_malloc(cpi->vp9_bitstream_worker_data[i].dest_size)); } - return 0; } static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) { @@ -992,7 +991,7 @@ static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) { cpi->vp9_bitstream_worker_data[1].dest_size > (cpi->oxcf.width * cpi->oxcf.height)) { vp9_bitstream_encode_tiles_buffer_dealloc(cpi); - if (encode_tiles_buffer_alloc(cpi)) return 0; + encode_tiles_buffer_alloc(cpi); } while (tile_col < tile_cols) { From 1b70db4be90e66fdd0473f34ad7bec69f269edeb Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 26 Apr 2022 19:05:56 -0700 Subject: [PATCH 273/926] vp9: check postproc_state.limits allocs Change-Id: I9d5df96580074375e4847d2e2f60a6a6d56eeea5 --- vp9/common/vp9_postproc.c | 1 + vp9/encoder/vp9_encoder.c | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c index d2c8535b01..96519f0051 100644 --- a/vp9/common/vp9_postproc.c +++ b/vp9/common/vp9_postproc.c @@ -360,6 +360,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, 
YV12_BUFFER_CONFIG *dest, if (!cm->postproc_state.limits) { cm->postproc_state.limits = vpx_calloc(unscaled_width, sizeof(*cm->postproc_state.limits)); + if (!cm->postproc_state.limits) return 1; } } diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index ec6a756197..89b7c8e246 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -3684,9 +3684,9 @@ static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index, case 6: l = 150; break; } if (!cpi->common.postproc_state.limits) { - cpi->common.postproc_state.limits = - vpx_calloc(cpi->un_scaled_source->y_width, - sizeof(*cpi->common.postproc_state.limits)); + CHECK_MEM_ERROR(cm, cpi->common.postproc_state.limits, + vpx_calloc(cpi->un_scaled_source->y_width, + sizeof(*cpi->common.postproc_state.limits))); } vp9_denoise(&cpi->common, cpi->Source, cpi->Source, l, cpi->common.postproc_state.limits); From e93e2ca0e33fb04f54724c4df6526727e7399841 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 26 Apr 2022 19:07:08 -0700 Subject: [PATCH 274/926] vp9_enc_grp_get_next_job: check job queue alloc + reverse conditional order; var == constant is more readable Change-Id: I9f2b4394024c262fd5fe9576a8bf33afe197c050 --- vp9/encoder/vp9_multi_thread.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vp9/encoder/vp9_multi_thread.c b/vp9/encoder/vp9_multi_thread.c index c66c035492..6078f8975c 100644 --- a/vp9/encoder/vp9_multi_thread.c +++ b/vp9/encoder/vp9_multi_thread.c @@ -36,7 +36,7 @@ void *vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt, pthread_mutex_lock(mutex_handle); #endif next = job_queue_hdl->next; - if (NULL != next) { + if (next != NULL) { JobQueue *job_queue = (JobQueue *)next; job_info = &job_queue->job_info; // Update the next job in the queue @@ -84,8 +84,8 @@ void vp9_row_mt_mem_alloc(VP9_COMP *cpi) { multi_thread_ctxt->allocated_tile_rows = tile_rows; multi_thread_ctxt->allocated_vert_unit_rows = jobs_per_tile_col; - multi_thread_ctxt->job_queue = - (JobQueue *)vpx_memalign(32, total_jobs * sizeof(JobQueue)); + CHECK_MEM_ERROR(cm, multi_thread_ctxt->job_queue, + (JobQueue *)vpx_memalign(32, total_jobs * sizeof(JobQueue))); #if CONFIG_MULTITHREAD // Create mutex for each tile From 72fa1d505ed4eaf2660e35347aba768502f268c4 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 26 Apr 2022 19:13:48 -0700 Subject: [PATCH 275/926] vp9_alloc_motion_field_info: check motion_field_array alloc Change-Id: I4ae11242e645feb3b85eaea186f14b3676ae40a8 --- vp9/encoder/vp9_non_greedy_mv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/vp9/encoder/vp9_non_greedy_mv.c b/vp9/encoder/vp9_non_greedy_mv.c index 4679d6c49c..1c0d281495 100644 --- a/vp9/encoder/vp9_non_greedy_mv.c +++ b/vp9/encoder/vp9_non_greedy_mv.c @@ -178,6 +178,7 @@ Status vp9_alloc_motion_field_info(MotionFieldInfo *motion_field_info, motion_field_info->frame_num = frame_num; motion_field_info->motion_field_array = vpx_calloc(frame_num, sizeof(*motion_field_info->motion_field_array)); + if (!motion_field_info->motion_field_array) return STATUS_FAILED; for (frame_idx = 0; frame_idx < frame_num; ++frame_idx) { for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES; From 58fff2f9ef25c13da150cdfb366a351705db5776 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 26 Apr 2022 19:14:44 -0700 Subject: [PATCH 276/926] vp9_speed_features.c: check allocations Change-Id: If3b319c1ce7036c2259440f4eeb2e645bf559f4c --- 
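Note: the allocation-check patches in this series all use the same idiom.
A minimal sketch of that idiom follows, with a hypothetical buffer field;
the CHECK_MEM_ERROR macro itself is the existing one from
vp9/common/vp9_common.h:

  VP9_COMMON *const cm = &cpi->common;
  /* Assign and validate in one step: on failure the macro reports
   * VPX_CODEC_MEM_ERROR through vpx_internal_error(&cm->error, ...)
   * instead of leaving a NULL pointer to be dereferenced later. */
  CHECK_MEM_ERROR(cm, cpi->some_buf, /* some_buf is hypothetical */
                  (uint8_t *)vpx_calloc(n, sizeof(*cpi->some_buf)));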
vp9/encoder/vp9_speed_features.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 7d7b2c3fb4..0431d8a452 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -652,8 +652,10 @@ static void set_rt_speed_feature_framesize_independent(
     if (cpi->content_state_sb_fd == NULL &&
         (!cpi->use_svc ||
          svc->spatial_layer_id == svc->number_spatial_layers - 1)) {
-      cpi->content_state_sb_fd = (uint8_t *)vpx_calloc(
-          (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(uint8_t));
+      CHECK_MEM_ERROR(cm, cpi->content_state_sb_fd,
+                      (uint8_t *)vpx_calloc(
+                          (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
+                          sizeof(uint8_t)));
     }
   }
   if (cpi->oxcf.rc_mode == VPX_CBR && content != VP9E_CONTENT_SCREEN) {
@@ -804,14 +806,17 @@ static void set_rt_speed_feature_framesize_independent(
       sf->partition_search_type = FIXED_PARTITION;
       sf->always_this_block_size = BLOCK_64X64;
     }
-    if (cpi->count_arf_frame_usage == NULL)
-      cpi->count_arf_frame_usage =
+    if (cpi->count_arf_frame_usage == NULL) {
+      CHECK_MEM_ERROR(
+          cm, cpi->count_arf_frame_usage,
           (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
-                                sizeof(*cpi->count_arf_frame_usage));
+                                sizeof(*cpi->count_arf_frame_usage)));
+    }
     if (cpi->count_lastgolden_frame_usage == NULL)
-      cpi->count_lastgolden_frame_usage =
+      CHECK_MEM_ERROR(
+          cm, cpi->count_lastgolden_frame_usage,
           (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
-                                sizeof(*cpi->count_lastgolden_frame_usage));
+                                sizeof(*cpi->count_lastgolden_frame_usage)));
   }
   if (svc->previous_frame_is_intra_only) {
     sf->partition_search_type = FIXED_PARTITION;

From a5ad89018eecef202d4ae5853ecdde843c0a9880 Mon Sep 17 00:00:00 2001
From: James Zern
Date: Tue, 26 Apr 2022 19:15:40 -0700
Subject: [PATCH 277/926] VP9RateControlRTC::Create: check segmentation_map alloc

Change-Id: I17b23915c32accf834def5ab26a8e4e188f9993a
---
 vp9/ratectrl_rtc.cc | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc
index 76ff367c06..f4d7f7e9e7 100644
--- a/vp9/ratectrl_rtc.cc
+++ b/vp9/ratectrl_rtc.cc
@@ -25,7 +25,10 @@ std::unique_ptr<VP9RateControlRTC> VP9RateControlRTC::Create(
                                                 VP9RateControlRTC());
   if (!rc_api) return nullptr;
   rc_api->cpi_ = static_cast<VP9_COMP *>(vpx_memalign(32, sizeof(*cpi_)));
-  if (!rc_api->cpi_) return nullptr;
+  if (!rc_api->cpi_) {
+    rc_api.reset();
+    return nullptr;
+  }
   vp9_zero(*rc_api->cpi_);
   rc_api->InitRateControl(cfg);
@@ -34,6 +37,10 @@ std::unique_ptr<VP9RateControlRTC> VP9RateControlRTC::Create(
   cpi->segmentation_map = static_cast<uint8_t *>(
       vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols,
                  sizeof(*cpi->segmentation_map)));
+  if (!cpi->segmentation_map) {
+    rc_api.reset();
+    return nullptr;
+  }
   cpi->cyclic_refresh =
       vp9_cyclic_refresh_alloc(cpi->common.mi_rows, cpi->common.mi_cols);
   cpi->cyclic_refresh->content_mode = 0;

From b2d57a88086410c8beb3696374764c4e836fe332 Mon Sep 17 00:00:00 2001
From: James Zern
Date: Tue, 26 Apr 2022 19:16:11 -0700
Subject: [PATCH 278/926] simple_encode,init_encoder: check buffer_pool alloc

Change-Id: I54f83733260abf828166400c5fd0c4c7e3ccec2f
---
 vp9/simple_encode.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc
index 1a0ada119f..654699e1b2 100644
--- a/vp9/simple_encode.cc
+++ b/vp9/simple_encode.cc
@@ -110,6 +110,7 @@ static VP9_COMP *init_encoder(const VP9EncoderConfig *oxcf,
                               vpx_img_fmt_t img_fmt) {
   VP9_COMP *cpi;
   BufferPool *buffer_pool =
(BufferPool *)vpx_calloc(1, sizeof(*buffer_pool)); + if (!buffer_pool) return NULL; vp9_initialize_enc(); cpi = vp9_create_compressor(oxcf, buffer_pool); vp9_update_compressor_with_img_fmt(cpi, img_fmt); From e82c5a85c9fcb727a591ffa63fc08bcaf52f9da3 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 26 Apr 2022 22:18:21 -0700 Subject: [PATCH 279/926] vp9_row_mt_alloc_rd_thresh: check alloc Change-Id: I6fb7771d9fa6ec54d81f24a02a289e8b852e7332 --- vp9/encoder/vp9_multi_thread.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_multi_thread.c b/vp9/encoder/vp9_multi_thread.c index 6078f8975c..45659f2a9a 100644 --- a/vp9/encoder/vp9_multi_thread.c +++ b/vp9/encoder/vp9_multi_thread.c @@ -58,9 +58,10 @@ void vp9_row_mt_alloc_rd_thresh(VP9_COMP *const cpi, (mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2) + 1; int i; - this_tile->row_base_thresh_freq_fact = + CHECK_MEM_ERROR( + cm, this_tile->row_base_thresh_freq_fact, (int *)vpx_calloc(sb_rows * BLOCK_SIZES * MAX_MODES, - sizeof(*(this_tile->row_base_thresh_freq_fact))); + sizeof(*(this_tile->row_base_thresh_freq_fact)))); for (i = 0; i < sb_rows * BLOCK_SIZES * MAX_MODES; i++) this_tile->row_base_thresh_freq_fact[i] = RD_THRESH_INIT_FACT; } From c152584107a18a0e240a23279f1f1dfcd80a3acf Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 26 Apr 2022 22:19:05 -0700 Subject: [PATCH 280/926] vp9_get_smooth_motion_field: check alloc Change-Id: I6b19d0169d127f622abf97b3b8590eee957bdc51 --- vp9/encoder/vp9_non_greedy_mv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vp9/encoder/vp9_non_greedy_mv.c b/vp9/encoder/vp9_non_greedy_mv.c index 1c0d281495..d52801c845 100644 --- a/vp9/encoder/vp9_non_greedy_mv.c +++ b/vp9/encoder/vp9_non_greedy_mv.c @@ -423,6 +423,7 @@ void vp9_get_smooth_motion_field(const MV *search_mf, int row, col; int bw = 4 << b_width_log2_lookup[bsize]; int bh = 4 << b_height_log2_lookup[bsize]; + if (!(input && output)) goto fail; // copy search results to input buffer for (idx = 0; idx < rows * cols; ++idx) { input[idx].row = (float)search_mf[idx].row / bh; @@ -451,6 +452,7 @@ void vp9_get_smooth_motion_field(const MV *search_mf, smooth_mf[idx].row = (int)(input[idx].row * bh); smooth_mf[idx].col = (int)(input[idx].col * bw); } +fail: free(input); free(output); } From c3d2df2f2f810b1c1bd9bd6bf0a54d20b4e6dacc Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 26 Apr 2022 22:19:34 -0700 Subject: [PATCH 281/926] fastssim,fs_ctx_init: check alloc Change-Id: Ie087e8be1e943b94327ed520db447a0e3a927738 --- vpx_dsp/fastssim.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vpx_dsp/fastssim.c b/vpx_dsp/fastssim.c index 6ab6f557e2..4d32a02a55 100644 --- a/vpx_dsp/fastssim.c +++ b/vpx_dsp/fastssim.c @@ -47,7 +47,7 @@ struct fs_ctx { unsigned *col_buf; }; -static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) { +static int fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) { unsigned char *data; size_t data_size; int lw; @@ -71,6 +71,7 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) { lh = (lh + 1) >> 1; } data = (unsigned char *)malloc(data_size); + if (!data) return -1; _ctx->level = (fs_level *)data; _ctx->nlevels = _nlevels; data += _nlevels * sizeof(*_ctx->level); @@ -95,6 +96,7 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) { lh = (lh + 1) >> 1; } _ctx->col_buf = (unsigned *)data; + return 0; } static void fs_ctx_clear(fs_ctx *_ctx) { free(_ctx->level); } @@ -456,7 +458,7 @@ static double 
calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst, double ret; int l; ret = 1; - fs_ctx_init(&ctx, _w, _h, FS_NLEVELS); + if (fs_ctx_init(&ctx, _w, _h, FS_NLEVELS)) return 99.0; fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _bd, _shift); for (l = 0; l < FS_NLEVELS - 1; l++) { From 8baaa7b5a3fcea958261bda667a871c163e93bf9 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 26 Apr 2022 22:20:00 -0700 Subject: [PATCH 282/926] y4m_input_open: check allocs Change-Id: I99ee0ef3ab28a22923cb413ccf5935fdc38862be --- y4minput.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/y4minput.c b/y4minput.c index 9a4bdbd7b5..7d3c03a7fc 100644 --- a/y4minput.c +++ b/y4minput.c @@ -1087,9 +1087,15 @@ int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer, y4m_ctx->dst_buf = (unsigned char *)malloc(y4m_ctx->dst_buf_sz); else y4m_ctx->dst_buf = (unsigned char *)malloc(2 * y4m_ctx->dst_buf_sz); + if (!y4m_ctx->dst_buf) return -1; - if (y4m_ctx->aux_buf_sz > 0) + if (y4m_ctx->aux_buf_sz > 0) { y4m_ctx->aux_buf = (unsigned char *)malloc(y4m_ctx->aux_buf_sz); + if (!y4m_ctx->aux_buf) { + free(y4m_ctx->dst_buf); + return -1; + } + } return 0; } From b1ed8e08a21b33c0f5039559113004bee7943dc4 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Thu, 7 Apr 2022 17:51:51 +0800 Subject: [PATCH 283/926] vp9[loongarch]: Optimize sad64x64/32x32_avg,comp_avg_pred 1. vpx_sad64x64_avg_lsx 2. vpx_sad32x32_avg_lsx 3. comp_avg_pred_lsx Bug: webm:1755 Change-Id: I58dabdcdd4265bd6ebd5670db8a132d2e838683f --- test/comp_avg_pred_test.cc | 5 + test/sad_test.cc | 6 ++ vpx_dsp/loongarch/avg_pred_lsx.c | 83 ++++++++++++++ vpx_dsp/loongarch/sad_lsx.c | 180 ++++++++++++++++++++++++++++++- vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +- 6 files changed, 274 insertions(+), 7 deletions(-) create mode 100644 vpx_dsp/loongarch/avg_pred_lsx.c diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc index b9201a20f9..3977a2d0b5 100644 --- a/test/comp_avg_pred_test.cc +++ b/test/comp_avg_pred_test.cc @@ -183,4 +183,9 @@ INSTANTIATE_TEST_SUITE_P(NEON, AvgPredTest, INSTANTIATE_TEST_SUITE_P(VSX, AvgPredTest, ::testing::Values(&vpx_comp_avg_pred_vsx)); #endif // HAVE_VSX + +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P(LSX, AvgPredTest, + ::testing::Values(&vpx_comp_avg_pred_lsx)); +#endif // HAVE_LSX } // namespace diff --git a/test/sad_test.cc b/test/sad_test.cc index e4952ba9f7..12a6206b95 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1139,6 +1139,12 @@ const SadMxNParam lsx_tests[] = { }; INSTANTIATE_TEST_SUITE_P(LSX, SADTest, ::testing::ValuesIn(lsx_tests)); +const SadMxNAvgParam avg_lsx_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_lsx), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_lsx), +}; +INSTANTIATE_TEST_SUITE_P(LSX, SADavgTest, ::testing::ValuesIn(avg_lsx_tests)); + const SadMxNx4Param x4d_lsx_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_lsx), SadMxNx4Param(32, 32, &vpx_sad32x32x4d_lsx), diff --git a/vpx_dsp/loongarch/avg_pred_lsx.c b/vpx_dsp/loongarch/avg_pred_lsx.c new file mode 100644 index 0000000000..482626080a --- /dev/null +++ b/vpx_dsp/loongarch/avg_pred_lsx.c @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+void vpx_comp_avg_pred_lsx(uint8_t *comp_pred, const uint8_t *pred, int width,
+                           int height, const uint8_t *ref, int ref_stride) {
+  // width > 8 || width == 8 || width == 4
+  if (width > 8) {
+    int i, j;
+    for (i = 0; i < height; ++i) {
+      for (j = 0; j < width; j += 16) {
+        __m128i p, r, avg;
+
+        p = __lsx_vld(pred + j, 0);
+        r = __lsx_vld(ref + j, 0);
+        avg = __lsx_vavgr_bu(p, r);
+        __lsx_vst(avg, comp_pred + j, 0);
+      }
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    }
+  } else if (width == 8) {
+    int i = height * width;
+    do {
+      __m128i p, r, r_0, r_1;
+
+      p = __lsx_vld(pred, 0);
+      r_0 = __lsx_vld(ref, 0);
+      ref += ref_stride;
+      r_1 = __lsx_vld(ref, 0);
+      ref += ref_stride;
+      r = __lsx_vilvl_d(r_1, r_0);
+      r = __lsx_vavgr_bu(p, r);
+
+      __lsx_vst(r, comp_pred, 0);
+
+      pred += 16;
+      comp_pred += 16;
+      i -= 16;
+    } while (i);
+  } else {  // width = 4
+    int i = height * width;
+    assert(width == 4);
+    do {
+      __m128i p, r, r_0, r_1, r_2, r_3;
+      p = __lsx_vld(pred, 0);
+
+      if (width == ref_stride) {
+        r = __lsx_vld(ref, 0);
+        ref += 16;
+      } else {
+        r_0 = __lsx_vld(ref, 0);
+        ref += ref_stride;
+        r_1 = __lsx_vld(ref, 0);
+        ref += ref_stride;
+        r_2 = __lsx_vld(ref, 0);
+        ref += ref_stride;
+        r_3 = __lsx_vld(ref, 0);
+        ref += ref_stride;
+        DUP2_ARG2(__lsx_vilvl_w, r_1, r_0, r_3, r_2, r_0, r_2);
+        r = __lsx_vilvl_d(r_2, r_0);
+      }
+      r = __lsx_vavgr_bu(p, r);
+
+      __lsx_vst(r, comp_pred, 0);
+      comp_pred += 16;
+      pred += 16;
+      i -= 16;
+    } while (i);
+  }
+}
diff --git a/vpx_dsp/loongarch/sad_lsx.c b/vpx_dsp/loongarch/sad_lsx.c
index cd3f2d46bb..30464b3661 100644
--- a/vpx_dsp/loongarch/sad_lsx.c
+++ b/vpx_dsp/loongarch/sad_lsx.c
@@ -46,6 +46,17 @@
     sum_m; \
   })

+#define HADD_SW_S32(in) \
+  ({ \
+    __m128i res0_m; \
+    int32_t sum_m; \
+ \
+    res0_m = __lsx_vhaddw_d_w(in, in); \
+    res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); \
+    sum_m = __lsx_vpickve2gr_w(res0_m, 0); \
+    sum_m; \
+  })
+
 static uint32_t sad_16width_lsx(const uint8_t *src, int32_t src_stride,
                                 const uint8_t *ref, int32_t ref_stride,
                                 int32_t height) {
@@ -355,7 +366,150 @@ static void sad_64width_x4d_lsx(const uint8_t *src, int32_t src_stride,
   sad_array[3] = HADD_UW_U32(sad);
 }

-#define VPX_SAD_16xHEIGHT_LSX(height) \
+static uint32_t avgsad_32width_lsx(const uint8_t *src, int32_t src_stride,
+                                   const uint8_t *ref, int32_t ref_stride,
+                                   int32_t height, const uint8_t *sec_pred) {
+  int32_t ht_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+  __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  __m128i comp0, comp1, sad_tmp;
+  __m128i sad = __lsx_vldi(0);
+  uint8_t *src_tmp, *ref_tmp;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t ref_stride2 = ref_stride << 1;
+  int32_t ref_stride3 = ref_stride2 + ref_stride;
+  int32_t ref_stride4 = ref_stride2 << 1;
+
+  for (; ht_cnt--;) {
+    src_tmp = (uint8_t *)src + 16;
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+    src6 = __lsx_vldx(src, src_stride3);
+    src1 = __lsx_vld(src_tmp, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src3,
+              src5);
+    src7 = __lsx_vldx(src_tmp, src_stride3);
+    src += src_stride4;
+
+ ref_tmp = (uint8_t *)ref + 16; + ref0 = __lsx_vld(ref, 0); + DUP2_ARG2(__lsx_vldx, ref, ref_stride, ref, ref_stride2, ref2, ref4); + ref6 = __lsx_vldx(ref, ref_stride3); + ref1 = __lsx_vld(ref_tmp, 0); + DUP2_ARG2(__lsx_vldx, ref_tmp, ref_stride, ref_tmp, ref_stride2, ref3, + ref5); + ref7 = __lsx_vldx(ref_tmp, ref_stride3); + ref += ref_stride4; + + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 32, sec_pred, 64, sec_pred, 96, + pred0, pred2, pred4, pred6); + DUP4_ARG2(__lsx_vld, sec_pred, 16, sec_pred, 48, sec_pred, 80, sec_pred, + 112, pred1, pred3, pred5, pred7); + sec_pred += 128; + + DUP2_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, comp0, comp1); + sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1); + sad = __lsx_vadd_h(sad, sad_tmp); + DUP2_ARG2(__lsx_vavgr_bu, pred2, ref2, pred3, ref3, comp0, comp1); + sad_tmp = SAD_UB2_UH(src2, src3, comp0, comp1); + sad = __lsx_vadd_h(sad, sad_tmp); + DUP2_ARG2(__lsx_vavgr_bu, pred4, ref4, pred5, ref5, comp0, comp1); + sad_tmp = SAD_UB2_UH(src4, src5, comp0, comp1); + sad = __lsx_vadd_h(sad, sad_tmp); + DUP2_ARG2(__lsx_vavgr_bu, pred6, ref6, pred7, ref7, comp0, comp1); + sad_tmp = SAD_UB2_UH(src6, src7, comp0, comp1); + sad = __lsx_vadd_h(sad, sad_tmp); + } + + return HADD_UH_U32(sad); +} + +static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt = (height >> 2); + __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3; + __m128i comp0, comp1, comp2, comp3, pred0, pred1, pred2, pred3; + __m128i sad, sad_tmp; + __m128i sad0 = __lsx_vldi(0); + __m128i sad1 = sad0; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, + ref3, comp0, comp1, comp2, comp3); + sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = SAD_UB2_UH(src2, src3, comp2, comp3); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, + ref3, comp0, comp1, comp2, comp3); + sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = SAD_UB2_UH(src2, src3, comp2, comp3); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, + ref3, comp0, comp1, comp2, comp3); + sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = SAD_UB2_UH(src2, src3, comp2, comp3); + sad1 = __lsx_vadd_h(sad1, 
sad_tmp); + + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, + ref3, comp0, comp1, comp2, comp3); + sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = SAD_UB2_UH(src2, src3, comp2, comp3); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + } + sad = __lsx_vhaddw_wu_hu(sad0, sad0); + sad_tmp = __lsx_vhaddw_wu_hu(sad1, sad1); + sad = __lsx_vadd_w(sad, sad_tmp); + + return HADD_SW_S32(sad); +} + +#define VPX_SAD_16xHT_LSX(height) \ uint32_t vpx_sad16x##height##_lsx(const uint8_t *src, int32_t src_stride, \ const uint8_t *ref, int32_t ref_stride) { \ return sad_16width_lsx(src, src_stride, ref, ref_stride, height); \ @@ -394,15 +548,33 @@ static void sad_64width_x4d_lsx(const uint8_t *src, int32_t src_stride, sad_64width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \ } -#define SAD64 VPX_SAD_64xHT_LSX(64) VPX_SAD_64xHTx4D_LSX(64) +#define VPX_AVGSAD_32xHT_LSX(height) \ + uint32_t vpx_sad32x##height##_avg_lsx( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_32width_lsx(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define VPX_AVGSAD_64xHT_LSX(height) \ + uint32_t vpx_sad64x##height##_avg_lsx( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_64width_lsx(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define SAD64 \ + VPX_SAD_64xHT_LSX(64) VPX_SAD_64xHTx4D_LSX(64) VPX_AVGSAD_64xHT_LSX(64) SAD64 -#define SAD32 VPX_SAD_32xHT_LSX(32) VPX_SAD_32xHTx4D_LSX(32) +#define SAD32 \ + VPX_SAD_32xHT_LSX(32) VPX_SAD_32xHTx4D_LSX(32) VPX_AVGSAD_32xHT_LSX(32) SAD32 -#define SAD16 VPX_SAD_16xHEIGHT_LSX(16) VPX_SAD_16xHTx4D_LSX(16) +#define SAD16 VPX_SAD_16xHT_LSX(16) VPX_SAD_16xHTx4D_LSX(16) SAD16 diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index efb253c68d..ddccfc1f4a 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -401,6 +401,7 @@ DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c DSP_SRCS-$(HAVE_LSX) += loongarch/variance_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/sub_pixel_variance_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/avg_pred_lsx.c DSP_SRCS-$(HAVE_MMI) += mips/variance_mmi.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 4ad698cabe..68d4f86f2f 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -836,7 +836,7 @@ () } # CONFIG_VP9_ENCODER add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad64x64_avg neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad64x64_avg neon avx2 msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vpx_sad64x32_avg neon avx2 msa sse2 vsx mmi/; @@ -845,7 +845,7 @@ () specialize qw/vpx_sad32x64_avg neon avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t 
*ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad32x32_avg neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad32x32_avg neon avx2 msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vpx_sad32x16_avg neon avx2 msa sse2 vsx mmi/; @@ -1147,7 +1147,7 @@ () specialize qw/vpx_get4x4sse_cs neon msa vsx/; add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; - specialize qw/vpx_comp_avg_pred neon sse2 vsx/; + specialize qw/vpx_comp_avg_pred neon sse2 vsx lsx/; # # Subpixel Variance From 1b00ad52630a0379d2df16a4fc7351f4e3d0896e Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Tue, 12 Apr 2022 09:10:27 +0800 Subject: [PATCH 284/926] vp9[loongarch]: Optimize sad8x8/32x64/64x32x4d 1. vpx_sad8x8x4d_lsx 2. vpx_sad32x64x4d_lsx 3. vpx_sad64x32x4d_lsx Bug: webm:1755 Change-Id: I08a2b8717ec8623ffdd4451a04e68fa3a7228668 --- test/sad_test.cc | 3 ++ vpx_dsp/loongarch/sad_lsx.c | 97 ++++++++++++++++++++++++++++++++++-- vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +-- 3 files changed, 99 insertions(+), 7 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 12a6206b95..7ce25343f6 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1147,8 +1147,11 @@ INSTANTIATE_TEST_SUITE_P(LSX, SADavgTest, ::testing::ValuesIn(avg_lsx_tests)); const SadMxNx4Param x4d_lsx_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_lsx), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_lsx), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_lsx), SadMxNx4Param(32, 32, &vpx_sad32x32x4d_lsx), SadMxNx4Param(16, 16, &vpx_sad16x16x4d_lsx), + SadMxNx4Param(8, 8, &vpx_sad8x8x4d_lsx), }; INSTANTIATE_TEST_SUITE_P(LSX, SADx4Test, ::testing::ValuesIn(x4d_lsx_tests)); #endif // HAVE_LSX diff --git a/vpx_dsp/loongarch/sad_lsx.c b/vpx_dsp/loongarch/sad_lsx.c index 30464b3661..4764acbf88 100644 --- a/vpx_dsp/loongarch/sad_lsx.c +++ b/vpx_dsp/loongarch/sad_lsx.c @@ -165,6 +165,81 @@ static uint32_t sad_64width_lsx(const uint8_t *src, int32_t src_stride, return sad; } +static void sad_8width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + int32_t ht_cnt = (height >> 2); + uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + __m128i src0, src1, src2, src3, sad_tmp; + __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + __m128i ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15; + __m128i sad0 = __lsx_vldi(0); + __m128i sad1 = sad0; + __m128i sad2 = sad0; + __m128i sad3 = sad0; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t ref_stride2 = ref_stride << 1; + int32_t ref_stride3 = ref_stride2 + ref_stride; + int32_t ref_stride4 = ref_stride2 << 1; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (; ht_cnt--;) { + src0 = __lsx_vld(src_ptr, 0); + DUP2_ARG2(__lsx_vldx, src_ptr, src_stride, src_ptr, src_stride2, src1, + src2); + src3 = __lsx_vldx(src_ptr, src_stride3); + src_ptr += src_stride4; + ref0 = __lsx_vld(ref0_ptr, 0); + DUP2_ARG2(__lsx_vldx, ref0_ptr, ref_stride, ref0_ptr, ref_stride2, ref1, + ref2); + ref3 = __lsx_vldx(ref0_ptr, ref_stride3); + ref0_ptr += ref_stride4; + ref4 = __lsx_vld(ref1_ptr, 0); + DUP2_ARG2(__lsx_vldx, ref1_ptr, ref_stride, 
ref1_ptr, ref_stride2, ref5, + ref6); + ref7 = __lsx_vldx(ref1_ptr, ref_stride3); + ref1_ptr += ref_stride4; + ref8 = __lsx_vld(ref2_ptr, 0); + DUP2_ARG2(__lsx_vldx, ref2_ptr, ref_stride, ref2_ptr, ref_stride2, ref9, + ref10); + ref11 = __lsx_vldx(ref2_ptr, ref_stride3); + ref2_ptr += ref_stride4; + ref12 = __lsx_vld(ref3_ptr, 0); + DUP2_ARG2(__lsx_vldx, ref3_ptr, ref_stride, ref3_ptr, ref_stride2, ref13, + ref14); + ref15 = __lsx_vldx(ref3_ptr, ref_stride3); + ref3_ptr += ref_stride4; + + DUP2_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, src0, src1); + DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1); + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + + DUP2_ARG2(__lsx_vpickev_d, ref5, ref4, ref7, ref6, ref0, ref1); + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP2_ARG2(__lsx_vpickev_d, ref9, ref8, ref11, ref10, ref0, ref1); + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad2 = __lsx_vadd_h(sad2, sad_tmp); + + DUP2_ARG2(__lsx_vpickev_d, ref13, ref12, ref15, ref14, ref0, ref1); + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad3 = __lsx_vadd_h(sad3, sad_tmp); + } + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + static void sad_16width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *const aref_ptr[], int32_t ref_stride, int32_t height, @@ -527,6 +602,13 @@ static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride, return sad_64width_lsx(src, src_stride, ref, ref_stride, height); \ } +#define VPX_SAD_8xHTx4D_LSX(height) \ + void vpx_sad8x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ + sad_8width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \ + } + #define VPX_SAD_16xHTx4D_LSX(height) \ void vpx_sad16x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \ const uint8_t *const refs[], \ @@ -564,13 +646,15 @@ static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride, second_pred); \ } -#define SAD64 \ - VPX_SAD_64xHT_LSX(64) VPX_SAD_64xHTx4D_LSX(64) VPX_AVGSAD_64xHT_LSX(64) +#define SAD64 \ + VPX_SAD_64xHT_LSX(64) VPX_SAD_64xHTx4D_LSX(64) VPX_SAD_64xHTx4D_LSX(32) \ + VPX_AVGSAD_64xHT_LSX(64) SAD64 -#define SAD32 \ - VPX_SAD_32xHT_LSX(32) VPX_SAD_32xHTx4D_LSX(32) VPX_AVGSAD_32xHT_LSX(32) +#define SAD32 \ + VPX_SAD_32xHT_LSX(32) VPX_SAD_32xHTx4D_LSX(32) VPX_SAD_32xHTx4D_LSX(64) \ + VPX_AVGSAD_32xHT_LSX(32) SAD32 @@ -578,6 +662,11 @@ SAD32 SAD16 +#define SAD8 VPX_SAD_8xHTx4D_LSX(8) + +SAD8 + #undef SAD64 #undef SAD32 #undef SAD16 +#undef SAD8 diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 68d4f86f2f..b441b337b4 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -881,10 +881,10 @@ () specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi lsx/; add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi/; +specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi lsx/; add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi/; +specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi lsx/; 
add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi lsx/; @@ -905,7 +905,7 @@ () specialize qw/vpx_sad8x16x4d neon msa sse2 mmi/; add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad8x8x4d neon msa sse2 mmi/; +specialize qw/vpx_sad8x8x4d neon msa sse2 mmi lsx/; add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad8x4x4d neon msa sse2 mmi/; From 872732b2c90eda09f6db1a21b5eee6dc36e813f3 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 28 Apr 2022 17:45:47 -0700 Subject: [PATCH 285/926] examples: add missing argv_dup alloc checks Change-Id: Ia3080cbf50071d599c7168a20466392a963f101a --- args.c | 1 + examples/vp9_spatial_svc_encoder.c | 4 ++++ vpxdec.c | 9 ++++++++- vpxenc.c | 4 ++++ 4 files changed, 17 insertions(+), 1 deletion(-) diff --git a/args.c b/args.c index 17b615584e..4afb9c021a 100644 --- a/args.c +++ b/args.c @@ -83,6 +83,7 @@ const char *arg_next(struct arg *arg) { char **argv_dup(int argc, const char **argv) { char **new_argv = malloc((argc + 1) * sizeof(*argv)); + if (!new_argv) return NULL; memcpy(new_argv, argv, argc * sizeof(*argv)); new_argv[argc] = NULL; diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c index 455f6c9036..c45edb9ae0 100644 --- a/examples/vp9_spatial_svc_encoder.c +++ b/examples/vp9_spatial_svc_encoder.c @@ -222,6 +222,10 @@ static void parse_command_line(int argc, const char **argv_, // process command line options argv = argv_dup(argc - 1, argv_ + 1); + if (!argv) { + fprintf(stderr, "Error allocating argument list\n"); + exit(EXIT_FAILURE); + } for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { arg.argv_step = 1; diff --git a/vpxdec.c b/vpxdec.c index 363eb1a24b..84cef7dfd4 100644 --- a/vpxdec.c +++ b/vpxdec.c @@ -581,7 +581,10 @@ static int main_loop(int argc, const char **argv_) { /* Parse command line */ exec_name = argv_[0]; argv = argv_dup(argc - 1, argv_ + 1); - + if (!argv) { + fprintf(stderr, "Error allocating argument list\n"); + return EXIT_FAILURE; + } for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { memset(&arg, 0, sizeof(arg)); arg.argv_step = 1; @@ -1123,6 +1126,10 @@ int main(int argc, const char **argv_) { int error = 0; argv = argv_dup(argc - 1, argv_ + 1); + if (!argv) { + fprintf(stderr, "Error allocating argument list\n"); + return EXIT_FAILURE; + } for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { memset(&arg, 0, sizeof(arg)); arg.argv_step = 1; diff --git a/vpxenc.c b/vpxenc.c index b64b6cf441..7eff97b132 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -1703,6 +1703,10 @@ int main(int argc, const char **argv_) { * codec. 
*/ argv = argv_dup(argc - 1, argv_ + 1); + if (!argv) { + fprintf(stderr, "Error allocating argument list\n"); + return EXIT_FAILURE; + } parse_global_config(&global, argv); if (argc < 3) usage_exit(); From 8ac72859e16934df6e598283997d141e9493fc05 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 3 May 2022 10:44:49 -0400 Subject: [PATCH 286/926] vp9 svc sample: set fps from y4m file Change-Id: I082c0409910da4cda5bf852b20ffa11ba5c2ebd6 --- examples/vp9_spatial_svc_encoder.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c index c45edb9ae0..e85dbf8e71 100644 --- a/examples/vp9_spatial_svc_encoder.c +++ b/examples/vp9_spatial_svc_encoder.c @@ -361,6 +361,8 @@ static void parse_command_line(int argc, const char **argv_, if (app_input->input_ctx.file_type == FILE_TYPE_Y4M) { enc_cfg->g_w = app_input->input_ctx.width; enc_cfg->g_h = app_input->input_ctx.height; + enc_cfg->g_timebase.den = app_input->input_ctx.framerate.numerator; + enc_cfg->g_timebase.num = app_input->input_ctx.framerate.denominator; } if (enc_cfg->g_w < 16 || enc_cfg->g_w % 2 || enc_cfg->g_h < 16 || From f3b4c9a8f65fb8f35d0e77d2fa62bcd075bbd738 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 6 May 2022 11:47:06 -0700 Subject: [PATCH 287/926] vp8[cd]x.h: document vpx_codec_vp[89]_[cd]x* + mark the _algo variables as deprecated. this quiets some doxygen warnings Bug: webm:1752 Change-Id: I53b9b796c3d8fef5c713ee4278641198f95b5864 --- vpx/vp8cx.h | 16 ++++++++++++++++ vpx/vp8dx.h | 16 ++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 5665a5f036..f5dc6d1188 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -33,7 +33,15 @@ extern "C" { * This interface provides the capability to encode raw VP8 streams. * @{ */ + +/*!\brief A single instance of the VP8 encoder. + *\deprecated This access mechanism is provided for backwards compatibility; + * prefer vpx_codec_vp8_cx(). + */ extern vpx_codec_iface_t vpx_codec_vp8_cx_algo; + +/*!\brief The interface to the VP8 encoder. + */ extern vpx_codec_iface_t *vpx_codec_vp8_cx(void); /*!@} - end algorithm interface member group*/ @@ -42,7 +50,15 @@ extern vpx_codec_iface_t *vpx_codec_vp8_cx(void); * This interface provides the capability to encode raw VP9 streams. * @{ */ + +/*!\brief A single instance of the VP9 encoder. + *\deprecated This access mechanism is provided for backwards compatibility; + * prefer vpx_codec_vp9_cx(). + */ extern vpx_codec_iface_t vpx_codec_vp9_cx_algo; + +/*!\brief The interface to the VP9 encoder. + */ extern vpx_codec_iface_t *vpx_codec_vp9_cx(void); /*!@} - end algorithm interface member group*/ diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h index 506a8936be..8c13649f4a 100644 --- a/vpx/vp8dx.h +++ b/vpx/vp8dx.h @@ -32,7 +32,15 @@ extern "C" { * This interface provides the capability to decode VP8 streams. * @{ */ + +/*!\brief A single instance of the VP8 decoder. + *\deprecated This access mechanism is provided for backwards compatibility; + * prefer vpx_codec_vp8_dx(). + */ extern vpx_codec_iface_t vpx_codec_vp8_dx_algo; + +/*!\brief The interface to the VP8 decoder. + */ extern vpx_codec_iface_t *vpx_codec_vp8_dx(void); /*!@} - end algorithm interface member group*/ @@ -41,7 +49,15 @@ extern vpx_codec_iface_t *vpx_codec_vp8_dx(void); * This interface provides the capability to decode VP9 streams. * @{ */ + +/*!\brief A single instance of the VP9 decoder. 
+ *\deprecated This access mechanism is provided for backwards compatibility; + * prefer vpx_codec_vp9_dx(). + */ extern vpx_codec_iface_t vpx_codec_vp9_dx_algo; + +/*!\brief The interface to the VP9 decoder. + */ extern vpx_codec_iface_t *vpx_codec_vp9_dx(void); /*!@} - end algorithm interface member group*/ From cb1abee1455ac7e552da271ac64c71d117caaa77 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 6 May 2022 11:55:56 -0700 Subject: [PATCH 288/926] add some missing realloc checks Change-Id: I0fd1e094085c18b1d9a32333e876c2affeb6de23 --- examples/twopass_encoder.c | 1 + test/vp9_ethread_test.cc | 1 + tools/tiny_ssim.c | 4 ++++ 3 files changed, 6 insertions(+) diff --git a/examples/twopass_encoder.c b/examples/twopass_encoder.c index 07ba37dfd0..07a10d9cf3 100644 --- a/examples/twopass_encoder.c +++ b/examples/twopass_encoder.c @@ -84,6 +84,7 @@ static int get_frame_stats(vpx_codec_ctx_t *ctx, const vpx_image_t *img, const uint8_t *const pkt_buf = pkt->data.twopass_stats.buf; const size_t pkt_size = pkt->data.twopass_stats.sz; stats->buf = realloc(stats->buf, stats->sz + pkt_size); + if (!stats->buf) die("Failed to reallocate stats buffer."); memcpy((uint8_t *)stats->buf + stats->sz, pkt_buf, pkt_size); stats->sz += pkt_size; } diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc index 21caf7918c..238366cb60 100644 --- a/test/vp9_ethread_test.cc +++ b/test/vp9_ethread_test.cc @@ -98,6 +98,7 @@ class VPxFirstPassEncoderThreadTest firstpass_stats_.buf = realloc(firstpass_stats_.buf, firstpass_stats_.sz + pkt_size); + ASSERT_NE(firstpass_stats_.buf, nullptr); memcpy((uint8_t *)firstpass_stats_.buf + firstpass_stats_.sz, pkt_buf, pkt_size); firstpass_stats_.sz += pkt_size; diff --git a/tools/tiny_ssim.c b/tools/tiny_ssim.c index 1577970488..8fba814621 100644 --- a/tools/tiny_ssim.c +++ b/tools/tiny_ssim.c @@ -453,6 +453,10 @@ int main(int argc, char *argv[]) { psnry = realloc(psnry, allocated_frames * sizeof(*psnry)); psnru = realloc(psnru, allocated_frames * sizeof(*psnru)); psnrv = realloc(psnrv, allocated_frames * sizeof(*psnrv)); + if (!(ssimy && ssimu && ssimv && psnry && psnru && psnrv)) { + fprintf(stderr, "Error allocating SSIM/PSNR data.\n"); + exit(EXIT_FAILURE); + } } psnr_and_ssim(ssimy[n_frames], psnry[n_frames], y[0], y[1], w, h); psnr_and_ssim(ssimu[n_frames], psnru[n_frames], u[0], u[1], (w + 1) / 2, From 258affdeab68ed59e181368baa46e2f1d077b0ab Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 3 May 2022 12:24:44 +0300 Subject: [PATCH 289/926] [NEON] Optimize vp9_diamond_search_sad() for NEON About 50% improvement in comparison to the C function. I have followed the AVX version with some simplifications. 
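Worth noting: the data layout shared with the AVX version packs each MV's
two int16 components (row, col) into a single 32-bit lane, so one 32-bit
vector compare validates four candidate MVs at once. A scalar sketch of the
idea (int_mv is the real union from vp9/common/vp9_mv.h and clamp() comes
from vpx_dsp_common.h; the fragment itself is illustrative):

  /* Clamp a candidate to the search bounds, then compare the packed 32-bit
   * views: equality means neither component was clamped, i.e. the candidate
   * lies inside the range. The NEON code performs this comparison with
   * vceqq_s32 on four candidates at a time. */
  int_mv cand = pack_int_mv(mv_row, mv_col);
  int_mv clamped;
  clamped.as_mv.row = clamp(cand.as_mv.row, x->mv_limits.row_min, x->mv_limits.row_max);
  clamped.as_mv.col = clamp(cand.as_mv.col, x->mv_limits.col_min, x->mv_limits.col_max);
  const int is_inside = (clamped.as_int == cand.as_int);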
Change-Id: I72ddbdb2fbc5ed8a7f0210703fe05523a37db1c9
---
 vp9/common/vp9_rtcd_defs.pl | 2 +-
 .../arm/neon/vp9_diamond_search_sad_neon.c | 322 ++++++++++++++++++
 vp9/vp9cx.mk | 1 +
 3 files changed, 324 insertions(+), 1 deletion(-)
 create mode 100644 vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c

diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 4da0b6675b..e6b65c96f0 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -175,7 +175,7 @@ ()
 # Motion search
 #
 add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
-specialize qw/vp9_diamond_search_sad avx/;
+specialize qw/vp9_diamond_search_sad avx neon/;

 #
 # Apply temporal filter
diff --git a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
new file mode 100644
index 0000000000..e56733d43e
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __GNUC__
+#define LIKELY(v) __builtin_expect(v, 1)
+#define UNLIKELY(v) __builtin_expect(v, 0)
+#else
+#define LIKELY(v) (v)
+#define UNLIKELY(v) (v)
+#endif
+
+static INLINE int_mv pack_int_mv(int16_t row, int16_t col) {
+  int_mv result;
+  result.as_mv.row = row;
+  result.as_mv.col = col;
+  return result;
+}
+
+static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) {
+  // This is simplified from the C implementation to utilise that
+  // x->nmvjointsadcost[1] == x->nmvjointsadcost[2] and
+  // x->nmvjointsadcost[1] == x->nmvjointsadcost[3]
+  return mv.as_int == 0 ? 0 : 1;
+}
+
+static INLINE int mv_cost(const int_mv mv, const int *joint_cost,
+                          int *const comp_cost[2]) {
+  assert(mv.as_mv.row >= -MV_MAX && mv.as_mv.row < MV_MAX);
+  assert(mv.as_mv.col >= -MV_MAX && mv.as_mv.col < MV_MAX);
+  return joint_cost[get_mv_joint(mv)] + comp_cost[0][mv.as_mv.row] +
+         comp_cost[1][mv.as_mv.col];
+}
+
+static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref,
+                          int sad_per_bit) {
+  const int_mv diff =
+      pack_int_mv(mv.as_mv.row - ref->row, mv.as_mv.col - ref->col);
+  return ROUND_POWER_OF_TWO(
+      (unsigned)mv_cost(diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit,
+      VP9_PROB_COST_SHIFT);
+}
+
+/*****************************************************************************
+ * This function utilizes 3 properties of the cost function lookup tables,  *
+ * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in      *
+ * vp9_encoder.c.
* + * For the joint cost: * + * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] * + * For the component costs: * + * - For all i: mvsadcost[0][i] == mvsadcost[1][i] * + * (Equal costs for both components) * + * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] * + * (Cost function is even) * + * If these do not hold, then this function cannot be used without * + * modification, in which case you can revert to using the C implementation, * + * which does not rely on these properties. * + *****************************************************************************/ +int vp9_diamond_search_sad_neon(const MACROBLOCK *x, + const search_site_config *cfg, MV *ref_mv, + MV *best_mv, int search_param, int sad_per_bit, + int *num00, const vp9_variance_fn_ptr_t *fn_ptr, + const MV *center_mv) { + static const uint32_t data[4] = { 0, 1, 2, 3 }; + const uint32x4_t v_idx_d = vld1q_u32((const uint32_t *)data); + + const int32x4_t zero_s32 = vdupq_n_s32(0); + const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max); + const int16x8_t v_max_mv_w = vreinterpretq_s16_s32(vdupq_n_s32(maxmv.as_int)); + const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min); + const int16x8_t v_min_mv_w = vreinterpretq_s16_s32(vdupq_n_s32(minmv.as_int)); + + const int32x4_t v_spb_d = vdupq_n_s32(sad_per_bit); + + const int32x4_t v_joint_cost_0_d = vdupq_n_s32(x->nmvjointsadcost[0]); + const int32x4_t v_joint_cost_1_d = vdupq_n_s32(x->nmvjointsadcost[1]); + + // search_param determines the length of the initial step and hence the number + // of iterations. + // 0 = initial step (MAX_FIRST_STEP) pel + // 1 = (MAX_FIRST_STEP/2) pel, + // 2 = (MAX_FIRST_STEP/4) pel... + const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param]; + const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param]; + const int tot_steps = cfg->total_steps - search_param; + + const int_mv fcenter_mv = + pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); + const int16x8_t vfcmv = vdupq_n_s16(fcenter_mv.as_int); + + const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); + const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); + + int_mv bmv = pack_int_mv(ref_row, ref_col); + int_mv new_bmv = bmv; + int16x8_t v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int)); + + const int what_stride = x->plane[0].src.stride; + const int in_what_stride = x->e_mbd.plane[0].pre[0].stride; + const uint8_t *const what = x->plane[0].src.buf; + const uint8_t *const in_what = + x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; + + // Work out the start point for the search + const uint8_t *best_address = in_what; + const uint8_t *new_best_address = best_address; +#if defined(__aarch64__) + int64x2_t v_ba_q = vdupq_n_s64((intptr_t)best_address); +#else + int32x4_t v_ba_d = vdupq_n_s32((intptr_t)best_address); +#endif + unsigned int best_sad = INT_MAX; + int i, j, step; + + // Check the prerequisite cost function properties that are easy to check + // in an assert. See the function-level documentation for details on all + // prerequisites. 
+ assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); + assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); + + // Check the starting position + best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride); + best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit); + + *num00 = 0; + + for (i = 0, step = 0; step < tot_steps; step++) { + for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) { + int16x8_t v_diff_mv_w; + int8x16_t v_inside_d; + uint32x4_t v_outside_d; + int32x4_t v_cost_d, v_sad_d; +#if defined(__aarch64__) + int64x2_t v_blocka[2]; +#else + int32x4_t v_blocka[1]; + uint32x2_t horiz_max_0, horiz_max_1; +#endif + + uint32_t horiz_max; + // Compute the candidate motion vectors + const int16x8_t v_ss_mv_w = vld1q_s16((const int16_t *)&ss_mv[i]); + const int16x8_t v_these_mv_w = vaddq_s16(v_bmv_w, v_ss_mv_w); + // Clamp them to the search bounds + int16x8_t v_these_mv_clamp_w = v_these_mv_w; + v_these_mv_clamp_w = vminq_s16(v_these_mv_clamp_w, v_max_mv_w); + v_these_mv_clamp_w = vmaxq_s16(v_these_mv_clamp_w, v_min_mv_w); + // The ones that did not change are inside the search area + v_inside_d = vreinterpretq_s8_u32( + vceqq_s32(vreinterpretq_s32_s16(v_these_mv_clamp_w), + vreinterpretq_s32_s16(v_these_mv_w))); + + // If none of them are inside, then move on +#if defined(__aarch64__) + horiz_max = vmaxvq_u32(vreinterpretq_u32_s8(v_inside_d)); +#else + horiz_max_0 = vmax_u32(vget_low_u32(vreinterpretq_u32_s8(v_inside_d)), + vget_high_u32(vreinterpretq_u32_s8(v_inside_d))); + horiz_max_1 = vpmax_u32(horiz_max_0, horiz_max_0); + vst1_lane_u32(&horiz_max, horiz_max_1, 0); +#endif + if (LIKELY(horiz_max == 0)) { + continue; + } + + // The inverse mask indicates which of the MVs are outside + v_outside_d = + vreinterpretq_u32_s8(veorq_s8(v_inside_d, vdupq_n_s8((int8_t)0xff))); + // Shift right to keep the sign bit clear, we will use this later + // to set the cost to the maximum value. + v_outside_d = vshrq_n_u32(v_outside_d, 1); + + // Compute the difference MV + v_diff_mv_w = vsubq_s16(v_these_mv_clamp_w, vfcmv); + // We utilise the fact that the cost function is even, and use the + // absolute difference. This allows us to use unsigned indexes later + // and reduces cache pressure somewhat as only a half of the table + // is ever referenced. + v_diff_mv_w = vabsq_s16(v_diff_mv_w); + + // Compute the SIMD pointer offsets. 
+ { +#if defined(__aarch64__) // sizeof(intptr_t) == 8 + // Load the offsets + int64x2_t v_bo10_q = vld1q_s64((const int64_t *)&ss_os[i + 0]); + int64x2_t v_bo32_q = vld1q_s64((const int64_t *)&ss_os[i + 2]); + // Set the ones falling outside to zero + v_bo10_q = vandq_s64( + v_bo10_q, + vmovl_s32(vget_low_s32(vreinterpretq_s32_s8(v_inside_d)))); + v_bo32_q = vandq_s64( + v_bo32_q, + vmovl_s32(vget_high_s32(vreinterpretq_s32_s8(v_inside_d)))); + // Compute the candidate addresses + v_blocka[0] = vaddq_s64(v_ba_q, v_bo10_q); + v_blocka[1] = vaddq_s64(v_ba_q, v_bo32_q); +#else // sizeof(intptr_t) == 4 + int32x4_t v_bo_d = vld1q_s32((const int32_t *)&ss_os[i]); + v_bo_d = vandq_s32(v_bo_d, vreinterpretq_s32_s8(v_inside_d)); + v_blocka[0] = vaddq_s32(v_ba_d, v_bo_d); +#endif + } + + fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], + in_what_stride, (uint32_t *)&v_sad_d); + + // Look up the component cost of the residual motion vector + { + uint32_t cost[4]; + int16_t __attribute__((aligned(16))) rowcol[8]; + vst1q_s16(rowcol, v_diff_mv_w); + + // Note: This is a use case for gather instruction + cost[0] = x->nmvsadcost[0][rowcol[0]] + x->nmvsadcost[0][rowcol[1]]; + cost[1] = x->nmvsadcost[0][rowcol[2]] + x->nmvsadcost[0][rowcol[3]]; + cost[2] = x->nmvsadcost[0][rowcol[4]] + x->nmvsadcost[0][rowcol[5]]; + cost[3] = x->nmvsadcost[0][rowcol[6]] + x->nmvsadcost[0][rowcol[7]]; + + v_cost_d = vld1q_s32((int32_t *)cost); + } + + // Now add in the joint cost + { + const uint32x4_t v_sel_d = + vceqq_s32(vreinterpretq_s32_s16(v_diff_mv_w), zero_s32); + const int32x4_t v_joint_cost_d = vreinterpretq_s32_u8( + vbslq_u8(vreinterpretq_u8_u32(v_sel_d), + vreinterpretq_u8_s32(v_joint_cost_0_d), + vreinterpretq_u8_s32(v_joint_cost_1_d))); + v_cost_d = vaddq_s32(v_cost_d, v_joint_cost_d); + } + + // Multiply by sad_per_bit + v_cost_d = vmulq_s32(v_cost_d, v_spb_d); + // ROUND_POWER_OF_TWO(v_cost_d, VP9_PROB_COST_SHIFT) + v_cost_d = + vaddq_s32(v_cost_d, vdupq_n_s32(1 << (VP9_PROB_COST_SHIFT - 1))); + v_cost_d = vshrq_n_s32(v_cost_d, VP9_PROB_COST_SHIFT); + // Add the cost to the sad + v_sad_d = vaddq_s32(v_sad_d, v_cost_d); + + // Make the motion vectors outside the search area have max cost + // by or'ing in the comparison mask, this way the minimum search won't + // pick them. 
+      v_sad_d = vorrq_s32(v_sad_d, vreinterpretq_s32_u32(v_outside_d));
+
+      // Find the minimum value and index horizontally in v_sad_d
+      {
+        uint32_t local_best_sad;
+#if defined(__aarch64__)
+        local_best_sad = vminvq_u32(vreinterpretq_u32_s32(v_sad_d));
+#else
+        uint32x2_t horiz_min_0 =
+            vmin_u32(vget_low_u32(vreinterpretq_u32_s32(v_sad_d)),
+                     vget_high_u32(vreinterpretq_u32_s32(v_sad_d)));
+        uint32x2_t horiz_min_1 = vpmin_u32(horiz_min_0, horiz_min_0);
+        vst1_lane_u32(&local_best_sad, horiz_min_1, 0);
+#endif
+
+        // Update the global minimum if the local minimum is smaller
+        if (LIKELY(local_best_sad < best_sad)) {
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+          uint32_t local_best_idx;
+          const uint32x4_t v_sel_d =
+              vceqq_s32(v_sad_d, vdupq_n_s32(local_best_sad));
+          uint32x4_t v_mask_d = vandq_u32(v_sel_d, v_idx_d);
+          v_mask_d = vbslq_u32(v_sel_d, v_mask_d, vdupq_n_u32(0xffffffff));
+
+#if defined(__aarch64__)
+          local_best_idx = vminvq_u32(v_mask_d);
+#else
+          horiz_min_0 =
+              vmin_u32(vget_low_u32(v_mask_d), vget_high_u32(v_mask_d));
+          horiz_min_1 = vpmin_u32(horiz_min_0, horiz_min_0);
+          vst1_lane_u32(&local_best_idx, horiz_min_1, 0);
+#endif
+
+          new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+          new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];
+
+          best_sad = local_best_sad;
+        }
+      }
+    }
+
+    bmv = new_bmv;
+    best_address = new_best_address;
+
+    v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int));
+#if defined(__aarch64__)
+    v_ba_q = vdupq_n_s64((intptr_t)best_address);
+#else
+    v_ba_d = vdupq_n_s32((intptr_t)best_address);
+#endif
+
+    if (UNLIKELY(best_address == in_what)) {
+      (*num00)++;
+    }
+  }
+
+  *best_mv = bmv.as_mv;
+  return best_sad;
+}
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 92a7fddb9d..c9afd9a347 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -113,6 +113,7 @@ VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_constants.h
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_quantize_avx2.c
 VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_diamond_search_sad_neon.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
 VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_temporal_filter_sse4.c

From a6bff83a603affa2799bbacedc24f9ca8632a5c6 Mon Sep 17 00:00:00 2001
From: Marco Paniconi
Date: Thu, 12 May 2022 11:18:19 -0700
Subject: [PATCH 290/926] vp9-rtc: Fix to interp_filter for segment skip

For the segment skip feature: allow setting mi->interp_filter to BILINEAR
when cm->interp_filter is set to BILINEAR. This can happen at speed 9 when
the segment skip feature is used (e.g., active_maps).

Without this fix the assert can be triggered by active_map_test.cc with
speed 9 included. Updated the test.
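For reference, segment skip is reached from the API side through an active
map, which the encoder maps onto SEG_LVL_SKIP segments. A minimal sketch of
the triggering setup (the flag buffer here is illustrative):

  /* Mark 16x16 regions inactive so the encoder skips them. */
  vpx_active_map_t map = { 0 };
  map.rows = (cfg.g_h + 15) / 16;
  map.cols = (cfg.g_w + 15) / 16;
  map.active_map = region_flags; /* per-block flags: 1 = encode, 0 = skip */
  vpx_codec_control(&codec, VP8E_SET_ACTIVE_MAP, &map);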
Fixes the assert triggered in the issue:

Bug: webm:1762
Change-Id: I462e0bdd966e4f3cb5b7bc746685916ac8808358
---
 test/active_map_test.cc | 3 ++-
 vp9/encoder/vp9_encodeframe.c | 8 ++++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/test/active_map_test.cc b/test/active_map_test.cc
index 9c55f9a8b1..7f41009e0f 100644
--- a/test/active_map_test.cc
+++ b/test/active_map_test.cc
@@ -37,6 +37,7 @@ class ActiveMapTest
                           ::libvpx_test::Encoder *encoder) {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
+      encoder->Control(VP9E_SET_AQ_MODE, 3);
     } else if (video->frame() == 3) {
       vpx_active_map_t map = vpx_active_map_t();
       /* clang-format off */
@@ -87,5 +88,5 @@ TEST_P(ActiveMapTest, Test) {
 VP9_INSTANTIATE_TEST_SUITE(ActiveMapTest,
                            ::testing::Values(::libvpx_test::kRealTime),
-                           ::testing::Range(0, 9));
+                           ::testing::Range(0, 10));
 }  // namespace
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index fc4089865d..9da8f61e32 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1905,13 +1905,17 @@ void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
 }

 static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode,
+                                   INTERP_FILTER interp_filter,
                                    RD_COST *rd_cost, BLOCK_SIZE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *const mi = xd->mi[0];
   INTERP_FILTER filter_ref;

   filter_ref = get_pred_context_switchable_interp(xd);
-  if (filter_ref == SWITCHABLE_FILTERS) filter_ref = EIGHTTAP;
+  if (interp_filter == BILINEAR)
+    filter_ref = BILINEAR;
+  else if (filter_ref == SWITCHABLE_FILTERS)
+    filter_ref = EIGHTTAP;

   mi->sb_type = bsize;
   mi->mode = ZEROMV;
@@ -4682,7 +4686,7 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
     hybrid_search_svc_baseiskey(cpi, x, rd_cost, bsize, ctx, tile_data,
                                 mi_row, mi_col);
   else if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP))
-    set_mode_info_seg_skip(x, cm->tx_mode, rd_cost, bsize);
+    set_mode_info_seg_skip(x, cm->tx_mode, cm->interp_filter, rd_cost, bsize);
   else if (bsize >= BLOCK_8X8) {
     if (cpi->rc.hybrid_intra_scene_change)
       hybrid_search_scene_change(cpi, x, rd_cost, bsize, ctx, tile_data, mi_row,

From 617698706ccff16b43091edb9cf94d2d3eda7c5f Mon Sep 17 00:00:00 2001
From: Jerome Jiang
Date: Thu, 12 May 2022 21:05:24 -0400
Subject: [PATCH 291/926] Add aq mode 0 and 3 to active map test

Bug: webm:1762
Change-Id: Ia827f6686e8d0cdc09f3d07d07dacaa4fcd801ab
---
 test/active_map_test.cc | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/test/active_map_test.cc b/test/active_map_test.cc
index 7f41009e0f..543ec0d358 100644
--- a/test/active_map_test.cc
+++ b/test/active_map_test.cc
@@ -19,7 +19,8 @@ namespace {

 class ActiveMapTest
     : public ::libvpx_test::EncoderTest,
-      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+      public ::libvpx_test::CodecTestWith3Params<libvpx_test::TestMode, int,
+                                                 int> {
 protected:
  static const int kWidth = 208;
  static const int kHeight = 144;
@@ -37,6 +38,7 @@ class ActiveMapTest
                           ::libvpx_test::Encoder *encoder) {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
-      encoder->Control(VP9E_SET_AQ_MODE, 3);
+      encoder->Control(VP9E_SET_AQ_MODE, GET_PARAM(3));
     } else if (video->frame() == 3) {
       vpx_active_map_t map = vpx_active_map_t();
       /* clang-format off */
@@ -88,5 +89,5 @@ TEST_P(ActiveMapTest, Test) {
 VP9_INSTANTIATE_TEST_SUITE(ActiveMapTest,
                            ::testing::Values(::libvpx_test::kRealTime),
-                           ::testing::Range(0, 10));
+                           ::testing::Range(5, 10), ::testing::Values(0, 3));
 }  // namespace

From
0d51bb2fc5e1e5581d8d378aad3ac61b3205b3b7 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Tue, 12 Apr 2022 16:02:55 +0800 Subject: [PATCH 292/926] vp9[loongarch]: Optimize vpx_hadamard_16x16/8x8 1. vpx_hadamard_16x16_lsx 2. vpx_hadamard_8x8_lsx Bug: webm:1755 Change-Id: I3b1e0a2c026c3806b7bbbd191d0edf0e78912af7 --- test/hadamard_test.cc | 7 ++ vpx_dsp/loongarch/avg_lsx.c | 90 +++++++++++++++++++++ vpx_dsp/loongarch/bitdepth_conversion_lsx.h | 48 +++++++++++ vpx_dsp/vpx_dsp.mk | 4 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 +- 5 files changed, 153 insertions(+), 4 deletions(-) create mode 100644 vpx_dsp/loongarch/avg_lsx.c create mode 100644 vpx_dsp/loongarch/bitdepth_conversion_lsx.h diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc index dab945a561..10b1e79c10 100644 --- a/test/hadamard_test.cc +++ b/test/hadamard_test.cc @@ -285,6 +285,13 @@ INSTANTIATE_TEST_SUITE_P( HadamardFuncWithSize(&vpx_hadamard_16x16_vsx, 16))); #endif // HAVE_VSX +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P( + LSX, HadamardLowbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_lsx, 8), + HadamardFuncWithSize(&vpx_hadamard_16x16_lsx, 16))); +#endif // HAVE_LSX + #if CONFIG_VP9_HIGHBITDEPTH class HadamardHighbdTest : public HadamardTestBase { protected: diff --git a/vpx_dsp/loongarch/avg_lsx.c b/vpx_dsp/loongarch/avg_lsx.c new file mode 100644 index 0000000000..750c9de29f --- /dev/null +++ b/vpx_dsp/loongarch/avg_lsx.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/bitdepth_conversion_lsx.h" + +void vpx_hadamard_8x8_lsx(const int16_t *src, ptrdiff_t src_stride, + tran_low_t *dst) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + ptrdiff_t src_stride2 = src_stride << 1; + ptrdiff_t src_stride3 = src_stride2 + src_stride; + ptrdiff_t src_stride4 = src_stride2 << 1; + ptrdiff_t src_stride6 = src_stride3 << 1; + + int16_t *src_tmp = (int16_t *)src; + src0 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride2, src_tmp, src_stride4, src1, src2); + src3 = __lsx_vldx(src_tmp, src_stride6); + src_tmp += src_stride4; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride2, src_tmp, src_stride4, src5, src6); + src7 = __lsx_vldx(src_tmp, src_stride6); + + LSX_BUTTERFLY_8_H(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, + tmp4, tmp6, tmp7, tmp5, tmp3, tmp1); + LSX_BUTTERFLY_8_H(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, + src4, src5, src7, src6, src3, src2); + LSX_BUTTERFLY_8_H(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, + tmp3, tmp4, tmp5, tmp1, tmp6, tmp2); + LSX_TRANSPOSE8x8_H(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + LSX_BUTTERFLY_8_H(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, + tmp4, tmp6, tmp7, tmp5, tmp3, tmp1); + LSX_BUTTERFLY_8_H(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, + src4, src5, src7, src6, src3, src2); + LSX_BUTTERFLY_8_H(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, + tmp3, tmp4, tmp5, tmp1, tmp6, tmp2); + store_tran_low(tmp0, dst, 0); + store_tran_low(tmp1, dst, 8); + store_tran_low(tmp2, dst, 16); + store_tran_low(tmp3, dst, 24); + store_tran_low(tmp4, dst, 32); + store_tran_low(tmp5, dst, 40); + store_tran_low(tmp6, dst, 48); + store_tran_low(tmp7, dst, 56); +} + +void vpx_hadamard_16x16_lsx(const int16_t *src, ptrdiff_t src_stride, + tran_low_t *dst) { + int i; + __m128i a0, a1, a2, a3, b0, b1, b2, b3; + + /* Rearrange 16x16 to 8x32 and remove stride. + * Top left first. */ + vpx_hadamard_8x8_lsx(src + 0 + 0 * src_stride, src_stride, dst + 0); + /* Top right. */ + vpx_hadamard_8x8_lsx(src + 8 + 0 * src_stride, src_stride, dst + 64); + /* Bottom left. */ + vpx_hadamard_8x8_lsx(src + 0 + 8 * src_stride, src_stride, dst + 128); + /* Bottom right. */ + vpx_hadamard_8x8_lsx(src + 8 + 8 * src_stride, src_stride, dst + 192); + + for (i = 0; i < 64; i += 8) { + a0 = load_tran_low(dst); + a1 = load_tran_low(dst + 64); + a2 = load_tran_low(dst + 128); + a3 = load_tran_low(dst + 192); + + LSX_BUTTERFLY_4_H(a0, a2, a3, a1, b0, b2, b3, b1); + DUP4_ARG2(__lsx_vsrai_h, b0, 1, b1, 1, b2, 1, b3, 1, b0, b1, b2, b3); + LSX_BUTTERFLY_4_H(b0, b1, b3, b2, a0, a1, a3, a2); + + store_tran_low(a0, dst, 0); + store_tran_low(a1, dst, 64); + store_tran_low(a2, dst, 128); + store_tran_low(a3, dst, 192); + + dst += 8; + } +} diff --git a/vpx_dsp/loongarch/bitdepth_conversion_lsx.h b/vpx_dsp/loongarch/bitdepth_conversion_lsx.h new file mode 100644 index 0000000000..4834f18fc0 --- /dev/null +++ b/vpx_dsp/loongarch/bitdepth_conversion_lsx.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found
+ * in the file PATENTS.  All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#define load_tran_low(s)                      \
+  ({                                          \
+    __m128i res0_m;                           \
+    __m128i v0_m = __lsx_vld(s, 0);           \
+    __m128i v1_m = __lsx_vld(s + 4, 0);       \
+    res0_m = __lsx_vsrlni_h_w(v0_m, v1_m, 0); \
+    res0_m;                                   \
+  })
+
+#define store_tran_low(v, s, c)     \
+  {                                 \
+    __m128i v0_m, v1_m;             \
+    v1_m = __lsx_vexth_w_h(v);      \
+    v0_m = __lsx_vsllwil_w_h(v, 0); \
+    __lsx_vst(v0_m, s + c, 0);      \
+    __lsx_vst(v1_m, s + c + 4, 0);  \
+  }
+#else
+#define load_tran_low(s)      \
+  ({                          \
+    __m128i res0_m;           \
+    res0_m = __lsx_vld(s, 0); \
+    res0_m;                   \
+  })
+
+#define store_tran_low(v, s, c) __lsx_vst(v, s + c, 0)
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#endif  // VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index ddccfc1f4a..7de8b02055 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -339,6 +339,7 @@ DSP_SRCS-$(HAVE_AVX2) += x86/avg_intrin_avx2.c
 DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c
 DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/avg_lsx.c
 ifeq ($(VPX_ARCH_X86_64),yes)
 DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm
 endif
@@ -439,6 +440,9 @@ DSP_SRCS-$(HAVE_VSX) += ppc/bitdepth_conversion_vsx.h
 DSP_SRCS-$(HAVE_SSE2) += x86/mem_sse2.h
 DSP_SRCS-$(HAVE_SSE2) += x86/transpose_sse2.h
 
+# LSX utilities
+DSP_SRCS-$(HAVE_LSX) += loongarch/bitdepth_conversion_lsx.h
+
 DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
 
 DSP_SRCS-yes += vpx_dsp_rtcd.c
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index b441b337b4..1c88dcdfa8 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -789,10 +789,10 @@ ()
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
-  specialize qw/vpx_hadamard_8x8 sse2 neon vsx/, "$ssse3_x86_64";
+  specialize qw/vpx_hadamard_8x8 sse2 neon vsx lsx/, "$ssse3_x86_64";
 
   add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
-  specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx/;
+  specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx lsx/;
 
   add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
   specialize qw/vpx_hadamard_32x32 sse2 avx2/;
@@ -813,10 +813,10 @@ ()
   specialize qw/vpx_highbd_satd avx2/;
 } else {
   add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
-  specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx/, "$ssse3_x86_64";
+  specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx lsx/, "$ssse3_x86_64";
 
   add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
-  specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx/;
+  specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx lsx/;
 
   add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
   specialize qw/vpx_hadamard_32x32 sse2 avx2/;
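Note: both LSX kernels above use the usual separable Hadamard factorization,
three butterfly stages, an 8x8 transpose, and three more stages, and the
16x16 version then combines four 8x8 results with an extra round of
butterflies and a one-bit right shift, matching the scalar reference in
vpx_dsp/avg.c. A compact sketch of one 8-point 1-D pass follows (hypothetical
helper; the library's own lane ordering differs, but the butterflies are the
same):

  static void hadamard8_1d_ref(const int16_t in[8], int16_t out[8]) {
    /* stage 1 */
    const int16_t a0 = in[0] + in[1], a1 = in[0] - in[1];
    const int16_t a2 = in[2] + in[3], a3 = in[2] - in[3];
    const int16_t a4 = in[4] + in[5], a5 = in[4] - in[5];
    const int16_t a6 = in[6] + in[7], a7 = in[6] - in[7];
    /* stage 2 */
    const int16_t b0 = a0 + a2, b1 = a1 + a3, b2 = a0 - a2, b3 = a1 - a3;
    const int16_t b4 = a4 + a6, b5 = a5 + a7, b6 = a4 - a6, b7 = a5 - a7;
    /* stage 3 */
    out[0] = b0 + b4; out[1] = b1 + b5; out[2] = b2 + b6; out[3] = b3 + b7;
    out[4] = b0 - b4; out[5] = b1 - b5; out[6] = b2 - b6; out[7] = b3 - b7;
  }

From 65d9ac5b5a3dd1c72c15a1fc5bcc004a43ad4c90 Mon Sep 17 00:00:00 2001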
From: yuanhecai
Date: Tue, 12 Apr 2022 21:01:53 +0800
Subject: [PATCH 293/926] vp9[loongarch]: Optimize fdct4x4/8x8_lsx

1. vpx_fdct4x4_lsx
2. vpx_fdct8x8_lsx

Bug: webm:1755
Change-Id: If283fc08f9bedcbecd2c4052adb210f8fe00d4f0
---
 test/dct_test.cc                 |  6 +-
 test/fdct8x8_test.cc             |  7 +++
 vpx_dsp/loongarch/fwd_txfm_lsx.c | 92 +++++++++++++++++++++++++
 vpx_dsp/loongarch/fwd_txfm_lsx.h | 99 ++++++++++++++++++++++++++++++++
 vpx_dsp/vpx_dsp_rtcd_defs.pl     |  4 +-
 5 files changed, 204 insertions(+), 4 deletions(-)

diff --git a/test/dct_test.cc b/test/dct_test.cc
index 6178f8e2cf..2182f87e5e 100644
--- a/test/dct_test.cc
+++ b/test/dct_test.cc
@@ -587,7 +587,9 @@ INSTANTIATE_TEST_SUITE_P(VSX, TransDCT,
 #endif  // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH &&
 
 #if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
-static const FuncInfo dct_lsx_func_info[2] = {
+static const FuncInfo dct_lsx_func_info[4] = {
+  { &fdct_wrapper<vpx_fdct4x4_lsx>, &idct_wrapper<vpx_idct4x4_16_add_c>, 4, 1 },
+  { &fdct_wrapper<vpx_fdct8x8_lsx>, &idct_wrapper<vpx_idct8x8_64_add_c>, 8, 1 },
   { &fdct_wrapper<vpx_fdct16x16_lsx>, &idct_wrapper<vpx_idct16x16_256_add_c>,
     16, 1 },
   { &fdct_wrapper<vpx_fdct32x32_lsx>, &idct_wrapper<vpx_idct32x32_1024_add_c>,
@@ -596,7 +598,7 @@ static const FuncInfo dct_lsx_func_info[2] = {
 
 INSTANTIATE_TEST_SUITE_P(
     LSX, TransDCT,
-    ::testing::Combine(::testing::Range(0, 2),
+    ::testing::Combine(::testing::Range(0, 4),
                        ::testing::Values(dct_lsx_func_info),
                        ::testing::Values(0), ::testing::Values(VPX_BITS_8)));
 #endif  // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 0822666e70..83d1ff1429 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -768,4 +768,11 @@ INSTANTIATE_TEST_SUITE_P(VSX, FwdTrans8x8DCT,
                                       &vpx_idct8x8_64_add_vsx, 0, VPX_BITS_8)));
 #endif  // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_SUITE_P(LSX, FwdTrans8x8DCT,
+                         ::testing::Values(make_tuple(&vpx_fdct8x8_lsx,
+                                                      &vpx_idct8x8_64_add_c, 0,
+                                                      VPX_BITS_8)));
+#endif  // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace
diff --git a/vpx_dsp/loongarch/fwd_txfm_lsx.c b/vpx_dsp/loongarch/fwd_txfm_lsx.c
index 03f194b433..6f2d4d6fee 100644
--- a/vpx_dsp/loongarch/fwd_txfm_lsx.c
+++ b/vpx_dsp/loongarch/fwd_txfm_lsx.c
@@ -11,6 +11,20 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/loongarch/fwd_txfm_lsx.h"
 
+#define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                            \
+    __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3;                            \
+                                                                               \
+    DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1);                \
+    DUP2_ARG2(__lsx_vilvh_h, _in2, _in0, _in3, _in1, _s2, _s3);                \
+    _t0 = __lsx_vilvl_h(_s1, _s0);                                             \
+    _t1 = __lsx_vilvh_h(_s1, _s0);                                             \
+    _t2 = __lsx_vilvl_h(_s3, _s2);                                             \
+    _t3 = __lsx_vilvh_h(_s3, _s2);                                             \
+    DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2);              \
+    DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3);              \
+  }
+
 #if !CONFIG_VP9_HIGHBITDEPTH
 void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
                         int32_t src_stride) {
@@ -240,6 +254,84 @@ void fdct16x8_1d_row(int16_t *input, int16_t *output) {
   __lsx_vst(in7, output, 240);
 }
 
+void vpx_fdct4x4_lsx(const int16_t *input, int16_t *output,
+                     int32_t src_stride) {
+  __m128i in0, in1, in2, in3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t src_stride6 = src_stride4 + src_stride2;
+
+  in0 = __lsx_vld(input, 0);
+  DUP2_ARG2(__lsx_vldx, input, src_stride2, input, src_stride4, in1, in2);
+  in3 = __lsx_vldx(input, src_stride6);
+
+  /* fdct4 pre-process */
+  {
+    __m128i vec, mask;
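+    /* As in the scalar vpx_fdct4x4_c, the inputs were scaled up by 4 bits
+     * above and, when the very first sample is nonzero, 1 is added to it.
+     * The mask built below keeps only lane 0 of the "is nonzero" result so
+     * that the +1 is applied to input[0] alone. */
+    __m128i zero = 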
__lsx_vldi(0); + + mask = __lsx_vinsgr2vr_b(zero, 1, 0); + DUP4_ARG2(__lsx_vslli_h, in0, 4, in1, 4, in2, 4, in3, 4, in0, in1, in2, + in3); + vec = __lsx_vseqi_h(in0, 0); + vec = __lsx_vxori_b(vec, 255); + vec = __lsx_vand_v(mask, vec); + in0 = __lsx_vadd_h(in0, vec); + } + + VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, in0, in2); + __lsx_vst(in0, output, 0); + __lsx_vst(in2, output, 16); +} + +void vpx_fdct8x8_lsx(const int16_t *input, int16_t *output, + int32_t src_stride) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride4 = src_stride2 << 1; + int32_t src_stride6 = src_stride4 + src_stride2; + int16_t *input_tmp = (int16_t *)input; + + in0 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in1, + in2); + in3 = __lsx_vldx(input_tmp, src_stride6); + input_tmp += src_stride4; + in4 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in5, + in6); + in7 = __lsx_vldx(input_tmp, src_stride6); + + DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7); + + VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); + + __lsx_vst(in0, output, 0); + __lsx_vst(in1, output, 16); + __lsx_vst(in2, output, 32); + __lsx_vst(in3, output, 48); + __lsx_vst(in4, output, 64); + __lsx_vst(in5, output, 80); + __lsx_vst(in6, output, 96); + __lsx_vst(in7, output, 112); +} + void vpx_fdct16x16_lsx(const int16_t *input, int16_t *output, int32_t src_stride) { int32_t i; diff --git a/vpx_dsp/loongarch/fwd_txfm_lsx.h b/vpx_dsp/loongarch/fwd_txfm_lsx.h index 9ed8102269..d04427a6ea 100644 --- a/vpx_dsp/loongarch/fwd_txfm_lsx.h +++ b/vpx_dsp/loongarch/fwd_txfm_lsx.h @@ -14,6 +14,105 @@ #include "vpx_dsp/loongarch/txfm_macros_lsx.h" #include "vpx_dsp/txfm_common.h" +#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + __m128i cnst0_m, cnst1_m, cnst2_m, cnst3_m; \ + __m128i vec0_m, vec1_m, vec2_m, vec3_m; \ + __m128i vec4_m, vec5_m, vec6_m, vec7_m; \ + __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x000000000000c4df }; \ + \ + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \ + DUP2_ARG2(__lsx_vilvl_h, vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, cnst0_m, cnst1_m); \ + cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + vec5_m = __lsx_vdp2_w_h(vec0_m, cnst1_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 3, cnst2_m, cnst3_m); \ + cnst2_m = __lsx_vpackev_h(cnst3_m, cnst2_m); \ + vec7_m = __lsx_vdp2_w_h(vec2_m, cnst2_m); \ + \ + vec4_m = __lsx_vdp2_w_h(vec0_m, cnst0_m); \ + cnst2_m = 
__lsx_vreplvei_h(coeff_m, 2); \ + cnst2_m = __lsx_vpackev_h(cnst2_m, cnst3_m); \ + vec6_m = __lsx_vdp2_w_h(vec2_m, cnst2_m); \ + \ + DUP4_ARG3(__lsx_vssrarni_h_w, vec4_m, vec4_m, DCT_CONST_BITS, vec5_m, \ + vec5_m, DCT_CONST_BITS, vec6_m, vec6_m, DCT_CONST_BITS, vec7_m, \ + vec7_m, DCT_CONST_BITS, out0, out2, out1, out3); \ + } + +#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + { \ + __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \ + __m128i s7_m, x0_m, x1_m, x2_m, x3_m; \ + __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \ + \ + /* FDCT stage1 */ \ + LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, \ + s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \ + LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ + DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ + DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m); \ + x1_m = __lsx_vpackev_h(x1_m, x0_m); \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m); \ + x2_m = __lsx_vneg_h(x2_m); \ + x2_m = __lsx_vpackev_h(x3_m, x2_m); \ + DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6); \ + \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0); \ + x2_m = __lsx_vreplvei_h(coeff_m, 2); \ + x2_m = __lsx_vpackev_h(x2_m, x3_m); \ + DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2); \ + \ + /* stage2 */ \ + s1_m = __lsx_vilvl_h(s5_m, s6_m); \ + s0_m = __lsx_vilvh_h(s5_m, s6_m); \ + \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m); \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m); \ + \ + /* stage3 */ \ + LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ + \ + /* stage4 */ \ + DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ + DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m); \ + x1_m = __lsx_vpackev_h(x0_m, x1_m); \ + DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m); \ + x2_m = __lsx_vpackev_h(x3_m, x2_m); \ + DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5); \ + \ + x1_m = __lsx_vreplvei_h(coeff_m, 5); \ + x0_m = __lsx_vneg_h(x0_m); \ + x0_m = __lsx_vpackev_h(x1_m, x0_m); \ + DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7); \ + x2_m = __lsx_vreplvei_h(coeff_m, 6); \ + x3_m = __lsx_vneg_h(x3_m); \ + x2_m = __lsx_vpackev_h(x2_m, x3_m); \ + DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \ + } + +#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \ + { \ + __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + \ + DUP4_ARG2(__lsx_vsrli_h, in0, 15, in1, 15, in2, 15, in3, 15, vec0_m, \ + vec1_m, vec2_m, vec3_m); \ + DUP4_ARG2(__lsx_vsrli_h, in4, 15, in5, 15, in6, 15, in7, 15, vec4_m, \ + vec5_m, vec6_m, vec7_m); \ + DUP4_ARG2(__lsx_vavg_h, vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, \ + in3, in0, in1, in2, in3); \ + DUP4_ARG2(__lsx_vavg_h, vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, \ + in7, in4, in5, in6, in7); \ + } + #define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \ { \ __m128i tp0_m, tp1_m; \ diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 1c88dcdfa8..f17fc3b496 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -573,13 +573,13 @@ () add_proto qw/void vpx_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int 
stride"; } else { add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct4x4 neon sse2 msa/; + specialize qw/vpx_fdct4x4 neon sse2 msa lsx/; add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct4x4_1 sse2 neon/; add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct8x8 sse2 neon msa/, "$ssse3_x86_64"; + specialize qw/vpx_fdct8x8 sse2 neon msa lsx/, "$ssse3_x86_64"; add_proto qw/void vpx_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct8x8_1 sse2 neon msa/; From a44e61db29e510afe35448a0401c436bcfba3ec5 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Wed, 13 Apr 2022 16:46:22 +0800 Subject: [PATCH 294/926] vp9[loongarch]: Optimize avg_variance64x64/variance8x8 1. vpx_variance8x8_lsx 2. vpx_sub_pixel_avg_variance64x64_lsx Bug: webm:1755 Change-Id: I7d68c7f2f5c8d27dc31cfd32298aeefb68f5d560 --- test/variance_test.cc | 7 +- vpx_dsp/loongarch/sub_pixel_variance_lsx.c | 408 ++++++++++++++++++++- vpx_dsp/loongarch/variance_lsx.c | 38 ++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 4 files changed, 451 insertions(+), 6 deletions(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index 11983bb8ac..a11ce25a63 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1655,10 +1655,15 @@ INSTANTIATE_TEST_SUITE_P( LSX, VpxVarianceTest, ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_lsx), VarianceParams(5, 5, &vpx_variance32x32_lsx), - VarianceParams(4, 4, &vpx_variance16x16_lsx))); + VarianceParams(4, 4, &vpx_variance16x16_lsx), + VarianceParams(3, 3, &vpx_variance8x8_lsx))); INSTANTIATE_TEST_SUITE_P(LSX, VpxSubpelVarianceTest, ::testing::Values(SubpelVarianceParams( 5, 5, &vpx_sub_pixel_variance32x32_lsx, 0))); + +INSTANTIATE_TEST_SUITE_P(LSX, VpxSubpelAvgVarianceTest, + ::testing::Values(SubpelAvgVarianceParams( + 6, 6, &vpx_sub_pixel_avg_variance64x64_lsx, 0))); #endif } // namespace diff --git a/vpx_dsp/loongarch/sub_pixel_variance_lsx.c b/vpx_dsp/loongarch/sub_pixel_variance_lsx.c index 0a0486479a..c7d233af82 100644 --- a/vpx_dsp/loongarch/sub_pixel_variance_lsx.c +++ b/vpx_dsp/loongarch/sub_pixel_variance_lsx.c @@ -54,11 +54,76 @@ static const uint8_t bilinear_filters_lsx[8][2] = { #define VARIANCE_LARGE_WxH(sse, diff, shift) \ (sse) - (((int64_t)(diff) * (diff)) >> (shift)) +static uint32_t avg_sse_diff_64x64_lsx(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t *diff) { + int32_t ht_cnt = 32; + uint32_t res; + __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3; + __m128i pred0, pred1, pred2, pred3, vec, vec_tmp; + __m128i avg0, avg1, avg2, avg3; + __m128i var = __lsx_vldi(0); + + avg0 = var; + avg1 = var; + avg2 = var; + avg3 = var; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + src_ptr += src_stride; + DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48, + ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + + DUP4_ARG2(__lsx_vavgr_bu, src0, pred0, src1, pred1, src2, pred2, src3, + pred3, src0, src1, src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, 
var, avg3); + + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + src_ptr += src_stride; + DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48, + ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + + DUP4_ARG2(__lsx_vavgr_bu, src0, pred0, src1, pred1, src2, pred2, src3, + pred3, src0, src1, src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + } + vec = __lsx_vhaddw_w_h(avg0, avg0); + vec_tmp = __lsx_vhaddw_w_h(avg1, avg1); + vec = __lsx_vadd_w(vec, vec_tmp); + vec_tmp = __lsx_vhaddw_w_h(avg2, avg2); + vec = __lsx_vadd_w(vec, vec_tmp); + vec_tmp = __lsx_vhaddw_w_h(avg3, avg3); + vec = __lsx_vadd_w(vec, vec_tmp); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + + return res; +} + static uint32_t sub_pixel_sse_diff_16width_h_lsx( const uint8_t *src, int32_t src_stride, const uint8_t *dst, int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { uint32_t loop_cnt = (height >> 2); - int32_t res; + uint32_t res; __m128i src0, src1, src2, src3, src4, src5, src6, src7; __m128i dst0, dst1, dst2, dst3, filt0; __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; @@ -134,7 +199,7 @@ static uint32_t sub_pixel_sse_diff_16width_v_lsx( const uint8_t *src, int32_t src_stride, const uint8_t *dst, int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { uint32_t loop_cnt = (height >> 2); - int32_t res; + uint32_t res; __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4; __m128i out0, out1, out2, out3, tmp0, tmp1, filt0, vec; __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; @@ -215,7 +280,7 @@ static uint32_t sub_pixel_sse_diff_16width_hv_lsx( int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, int32_t height, int32_t *diff) { uint32_t loop_cnt = (height >> 2); - int32_t res; + uint32_t res; __m128i src0, src1, src2, src3, src4, src5, src6, src7; __m128i ref0, ref1, ref2, ref3, filt_hz, filt_vt, vec0, vec1; __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1, vec; @@ -308,7 +373,309 @@ static uint32_t sub_pixel_sse_diff_32width_hv_lsx( return sse; } +static uint32_t subpel_avg_ssediff_16w_h_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff, int32_t width) { + uint32_t loop_cnt = (height >> 2); + uint32_t res; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; + __m128i pred0, pred1, pred2, pred3, filt0, vec; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; + __m128i mask = { 0x403030202010100, 0x807070606050504 }; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + + filt0 = __lsx_vldrepl_h(filter, 0); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7); + src += src_stride; + + dst0 = __lsx_vld(dst, 0); + dst += dst_stride; + dst1 = __lsx_vld(dst, 0); + dst += dst_stride; + dst2 = __lsx_vld(dst, 0); + dst += 
dst_stride; + dst3 = __lsx_vld(dst, 0); + dst += dst_stride; + + pred0 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred1 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred2 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred3 = __lsx_vld(sec_pred, 0); + sec_pred += width; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vavgr_bu, tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, + pred3, tmp0, tmp1, tmp2, tmp3); + + CALC_MSE_AVG_B(tmp0, dst0, var, avg); + CALC_MSE_AVG_B(tmp1, dst1, var, avg); + CALC_MSE_AVG_B(tmp2, dst2, var, avg); + CALC_MSE_AVG_B(tmp3, dst3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + + return res; +} + +static uint32_t subpel_avg_ssediff_16w_v_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff, int32_t width) { + uint32_t loop_cnt = (height >> 2); + uint32_t res; + __m128i ref0, ref1, ref2, ref3, pred0, pred1, pred2, pred3; + __m128i src0, src1, src2, src3, src4, out0, out1, out2, out3; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i tmp0, tmp1, vec, filt0; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + src += src_stride; + src2 = __lsx_vld(src, 0); + src += src_stride; + src3 = __lsx_vld(src, 0); + src += src_stride; + src4 = __lsx_vld(src, 0); + src += src_stride; + + pred0 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred1 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred2 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred3 = __lsx_vld(sec_pred, 0); + sec_pred += width; + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + src0 = src4; + ref0 = __lsx_vld(dst, 0); + dst += dst_stride; + ref1 = __lsx_vld(dst, 0); + dst += dst_stride; + ref2 = __lsx_vld(dst, 0); + dst += dst_stride; + ref3 = __lsx_vld(dst, 0); + dst += dst_stride; + + DUP4_ARG2(__lsx_vavgr_bu, out0, pred0, out1, pred1, out2, pred2, out3, + pred3, out0, out1, out2, out3); + + CALC_MSE_AVG_B(out0, ref0, var, 
avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t subpel_avg_ssediff_16w_hv_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) { + uint32_t loop_cnt = (height >> 2); + uint32_t res; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i ref0, ref1, ref2, ref3, pred0, pred1, pred2, pred3; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1; + __m128i out0, out1, out2, out3, filt_hz, filt_vt, vec, vec0, vec1; + __m128i mask = { 0x403030202010100, 0x807070606050504 }; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + + HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out2); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7); + src += src_stride; + + pred0 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred1 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred2 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred3 = __lsx_vld(sec_pred, 0); + sec_pred += width; + + HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out1); + HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out3); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out2); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out1); + HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS, hz_out3); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS, hz_out2); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + ref0 = __lsx_vld(dst, 0); + dst += dst_stride; + ref1 = __lsx_vld(dst, 0); + dst += dst_stride; + ref2 = __lsx_vld(dst, 0); + dst += dst_stride; + ref3 = __lsx_vld(dst, 0); + dst += dst_stride; + + DUP4_ARG2(__lsx_vavgr_bu, out0, pred0, out1, pred1, out2, pred2, out3, + pred3, out0, out1, out2, out3); + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + 
CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_h_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_h_lsx(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_v_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_v_lsx(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_hv_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += subpel_avg_ssediff_16w_hv_lsx(src, src_stride, dst, dst_stride, + sec_pred, filter_horiz, filter_vert, + height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + #define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10) +#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12) #define VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(wd, ht) \ uint32_t vpx_sub_pixel_variance##wd##x##ht##_lsx( \ @@ -346,3 +713,38 @@ static uint32_t sub_pixel_sse_diff_32width_hv_lsx( } VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(32, 32) + +#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_LSX(ht) \ + uint32_t vpx_sub_pixel_avg_variance64x##ht##_lsx( \ + const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset, \ + int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride, \ + uint32_t *sse, const uint8_t *sec_pred) { \ + int32_t diff; \ + const uint8_t *h_filter = bilinear_filters_lsx[x_offset]; \ + const uint8_t *v_filter = bilinear_filters_lsx[y_offset]; \ + \ + if (y_offset) { \ + if (x_offset) { \ + *sse = sub_pixel_avg_sse_diff_64width_hv_lsx( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ + v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_avg_sse_diff_64width_v_lsx( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \ + &diff); \ + } \ + } else { \ + if (x_offset) { \ + *sse = sub_pixel_avg_sse_diff_64width_h_lsx( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ + &diff); \ + } else { \ + *sse = avg_sse_diff_64x##ht##_lsx(src_ptr, src_stride, ref_ptr, \ + ref_stride, sec_pred, &diff); \ + } \ + } \ + \ + return VARIANCE_64Wx##ht##H(*sse, diff); \ + } + +VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_LSX(64) diff --git a/vpx_dsp/loongarch/variance_lsx.c 
b/vpx_dsp/loongarch/variance_lsx.c index 8f2ec0563f..5223e0f169 100644 --- a/vpx_dsp/loongarch/variance_lsx.c +++ b/vpx_dsp/loongarch/variance_lsx.c @@ -43,6 +43,42 @@ #define VARIANCE_LARGE_WxH(sse, diff, shift) \ (sse) - (((int64_t)(diff) * (diff)) >> (shift)) +static uint32_t sse_diff_8width_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t ht_cnt = (height >> 2); + __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, vec; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t ref_stride2 = ref_stride << 1; + int32_t ref_stride3 = ref_stride2 + ref_stride; + int32_t ref_stride4 = ref_stride2 << 1; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr + src_stride, 0, + src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1, + src2, src3); + src_ptr += src_stride4; + DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr + ref_stride, 0, + ref_ptr + ref_stride2, 0, ref_ptr + ref_stride3, 0, ref0, ref1, + ref2, ref3); + ref_ptr += ref_stride4; + + DUP4_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, ref1, ref0, ref3, ref2, + src0, src1, ref0, ref1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __lsx_vhaddw_w_h(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + static uint32_t sse_diff_16width_lsx(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride, int32_t height, int32_t *diff) { @@ -174,6 +210,7 @@ static uint32_t sse_diff_64x64_lsx(const uint8_t *src_ptr, int32_t src_stride, return HADD_SW_S32(var); } +#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6); #define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8) #define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10) @@ -191,6 +228,7 @@ static uint32_t sse_diff_64x64_lsx(const uint8_t *src_ptr, int32_t src_stride, return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ } +VPX_VARIANCE_WDXHT_LSX(8, 8) VPX_VARIANCE_WDXHT_LSX(16, 16) VPX_VARIANCE_WDXHT_LSX(32, 32) diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index f17fc3b496..706af97e50 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1108,7 +1108,7 @@ () specialize qw/vpx_variance8x16 sse2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x8 sse2 neon msa mmi vsx/; + specialize qw/vpx_variance8x8 sse2 neon msa mmi vsx lsx/; add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_variance8x4 sse2 neon msa mmi vsx/; @@ -1192,7 +1192,7 @@ () specialize qw/vpx_sub_pixel_variance4x4 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance64x64 neon avx2 msa mmi sse2 ssse3/; + specialize qw/vpx_sub_pixel_avg_variance64x64 neon avx2 msa mmi sse2 ssse3 lsx/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, 
uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_sub_pixel_avg_variance64x32 neon msa mmi sse2 ssse3/; From ca89bed50dbc5fe2abef50c5f36924bb1da6d1f6 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Sun, 15 May 2022 22:15:29 -0700 Subject: [PATCH 295/926] vp9-rtc: Fix to usage of active_maps when aq_mode=0 If aq_mode=0 the segmentation feature may still be used for active_maps, so the condition active_maps.enabled needs to be added in two places regarding segmentation logic in encodeframe.c. Otherwise the active_maps would have no effect. This also resolves why the assert in bug webm:1762 was not triggered when aq_mode=0. Change-Id: Ibd68e9b5c3f81728241a168d3fb3567d6845633d --- vp9/encoder/vp9_encodeframe.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 9da8f61e32..5f08fa6f60 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -217,8 +217,8 @@ static void set_segment_index(VP9_COMP *cpi, MACROBLOCK *const x, int mi_row, break; } - // Set segment index from ROI map if it's enabled. - if (cpi->roi.enabled) + // Set segment index if ROI map or active_map is enabled. + if (cpi->roi.enabled || cpi->active_map.enabled) mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); vp9_init_plane_quantizers(cpi, x); @@ -2499,7 +2499,8 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td, *(xd->mi[0]) = ctx->mic; *(x->mbmi_ext) = ctx->mbmi_ext; - if (seg->enabled && (cpi->oxcf.aq_mode != NO_AQ || cpi->roi.enabled)) { + if (seg->enabled && (cpi->oxcf.aq_mode != NO_AQ || cpi->roi.enabled || + cpi->active_map.enabled)) { // Setting segmentation map for cyclic_refresh. if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->cyclic_refresh->content_mode) { From 8486953e5e0de3cec0332b787aa05a7405e3c207 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Fri, 18 Mar 2022 09:33:53 +0800 Subject: [PATCH 296/926] vp8[loongarch]: Optimize vp8 encoding partial function 1. vp8_short_fdct4x4 2. vp8_regular_quantize_b 3. vp8_block_error 4. vp8_mbblock_error 5. 
vpx_subtract_block Bug: webm:1755 Change-Id: I3dbfc7e3937af74090fc53fb4c9664e6cdda29ef --- test/quantize_test.cc | 7 + test/vp8_fdct4x4_test.cc | 5 + test/vp9_subtract_test.cc | 5 + vp8/common/rtcd_defs.pl | 8 +- vp8/encoder/loongarch/dct_lsx.c | 99 +++++++ vp8/encoder/loongarch/encodeopt_lsx.c | 82 ++++++ vp8/encoder/loongarch/quantize_lsx.c | 145 ++++++++++ vp8/vp8cx.mk | 5 + vpx_dsp/loongarch/subtract_lsx.c | 371 ++++++++++++++++++++++++++ vpx_dsp/vpx_dsp.mk | 2 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- 11 files changed, 726 insertions(+), 5 deletions(-) create mode 100644 vp8/encoder/loongarch/dct_lsx.c create mode 100644 vp8/encoder/loongarch/encodeopt_lsx.c create mode 100644 vp8/encoder/loongarch/quantize_lsx.c create mode 100644 vpx_dsp/loongarch/subtract_lsx.c diff --git a/test/quantize_test.cc b/test/quantize_test.cc index 792b21432e..57309e8102 100644 --- a/test/quantize_test.cc +++ b/test/quantize_test.cc @@ -224,4 +224,11 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&vp8_fast_quantize_b_mmi, &vp8_fast_quantize_b_c), make_tuple(&vp8_regular_quantize_b_mmi, &vp8_regular_quantize_b_c))); #endif // HAVE_MMI + +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P( + LSX, QuantizeTest, + ::testing::Values(make_tuple(&vp8_regular_quantize_b_lsx, + &vp8_regular_quantize_b_c))); +#endif // HAVE_LSX } // namespace diff --git a/test/vp8_fdct4x4_test.cc b/test/vp8_fdct4x4_test.cc index 3e4305be73..1b73a72a01 100644 --- a/test/vp8_fdct4x4_test.cc +++ b/test/vp8_fdct4x4_test.cc @@ -203,4 +203,9 @@ INSTANTIATE_TEST_SUITE_P(MSA, FdctTest, INSTANTIATE_TEST_SUITE_P(MMI, FdctTest, ::testing::Values(vp8_short_fdct4x4_mmi)); #endif // HAVE_MMI + +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P(LSX, FdctTest, + ::testing::Values(vp8_short_fdct4x4_lsx)); +#endif // HAVE_LSX } // namespace diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc index ef8cc207d6..211cc6c7ad 100644 --- a/test/vp9_subtract_test.cc +++ b/test/vp9_subtract_test.cc @@ -152,4 +152,9 @@ INSTANTIATE_TEST_SUITE_P(VSX, VP9SubtractBlockTest, ::testing::Values(vpx_subtract_block_vsx)); #endif +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P(LSX, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_lsx)); +#endif + } // namespace vp9 diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index e4b40fa9ed..4f45d2ab9a 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -184,7 +184,7 @@ () # Forward DCT # add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch"; -specialize qw/vp8_short_fdct4x4 sse2 neon msa mmi/; +specialize qw/vp8_short_fdct4x4 sse2 neon msa mmi lsx/; add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch"; specialize qw/vp8_short_fdct8x4 sse2 neon msa mmi/; @@ -196,7 +196,7 @@ () # Quantizer # add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *"; -specialize qw/vp8_regular_quantize_b sse2 sse4_1 msa mmi/; +specialize qw/vp8_regular_quantize_b sse2 sse4_1 msa mmi lsx/; add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *"; specialize qw/vp8_fast_quantize_b sse2 ssse3 neon msa mmi/; @@ -205,10 +205,10 @@ () # Block subtraction # add_proto qw/int vp8_block_error/, "short *coeff, short *dqcoeff"; -specialize qw/vp8_block_error sse2 msa/; +specialize qw/vp8_block_error sse2 msa lsx/; add_proto qw/int vp8_mbblock_error/, "struct macroblock *mb, int dc"; -specialize qw/vp8_mbblock_error sse2 msa/; +specialize qw/vp8_mbblock_error sse2 msa lsx/; add_proto qw/int vp8_mbuverror/, "struct macroblock *mb"; specialize qw/vp8_mbuverror 
sse2 msa/; diff --git a/vp8/encoder/loongarch/dct_lsx.c b/vp8/encoder/loongarch/dct_lsx.c new file mode 100644 index 0000000000..e090d2360f --- /dev/null +++ b/vp8/encoder/loongarch/dct_lsx.c @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vp8_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +#define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3; \ + \ + DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1); \ + DUP2_ARG2(__lsx_vilvh_h, _in2, _in0, _in3, _in1, _s2, _s3); \ + _t0 = __lsx_vilvl_h(_s1, _s0); \ + _t1 = __lsx_vilvh_h(_s1, _s0); \ + _t2 = __lsx_vilvl_h(_s3, _s2); \ + _t3 = __lsx_vilvh_h(_s3, _s2); \ + DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2); \ + DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3); \ + } + +#define SET_DOTP_VALUES(coeff, val0, val1, val2, const1, const2) \ + { \ + __m128i tmp0_m, tmp1_m, tmp2_m; \ + \ + tmp0_m = __lsx_vreplvei_h(coeff, val0); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff, val1, coeff, val2, tmp1_m, tmp2_m); \ + DUP2_ARG2(__lsx_vpackev_h, tmp1_m, tmp0_m, tmp0_m, tmp2_m, const1, \ + const2); \ + } + +#define RET_1_IF_NZERO_H(_in) \ + ({ \ + __m128i tmp_m; \ + __m128i one_m = __lsx_vldi(0x401); \ + __m128i max_m = __lsx_vldi(0xFF); \ + \ + tmp_m = __lsx_vseqi_h(_in, 0); \ + tmp_m = __lsx_vxor_v(tmp_m, max_m); \ + tmp_m = __lsx_vand_v(tmp_m, one_m); \ + \ + tmp_m; \ + }) + +void vp8_short_fdct4x4_lsx(int16_t *input, int16_t *output, int32_t pitch) { + __m128i in0, in1, in2, in3; + __m128i tmp0, tmp1, tmp2, tmp3, const0, const1; + __m128i coeff = { 0x38a4eb1814e808a9, 0x659061a82ee01d4c }; + __m128i out0, out1, out2, out3; + __m128i zero = __lsx_vldi(0); + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + + in0 = __lsx_vld(input, 0); + DUP2_ARG2(__lsx_vldx, input, pitch, input, pitch2, in1, in2); + in3 = __lsx_vldx(input, pitch3); + + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, tmp0, tmp1, in1, in3); + DUP4_ARG2(__lsx_vslli_h, tmp0, 3, tmp1, 3, in1, 3, in3, 3, tmp0, tmp1, in1, + in3); + in0 = __lsx_vadd_h(tmp0, tmp1); + in2 = __lsx_vsub_h(tmp0, tmp1); + SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1); + tmp0 = __lsx_vilvl_h(in3, in1); + in1 = __lsx_vreplvei_h(coeff, 3); + out0 = __lsx_vpackev_h(zero, in1); + coeff = __lsx_vilvl_h(zero, coeff); + out1 = __lsx_vreplvei_w(coeff, 0); + DUP2_ARG3(__lsx_vdp2add_w_h, out0, tmp0, const0, out1, tmp0, const1, out0, + out1); + DUP2_ARG3(__lsx_vsrani_h_w, out0, out0, 12, out1, out1, 12, in1, in3); + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, tmp0, tmp1, in1, in3); + tmp2 = __lsx_vadd_h(tmp0, tmp1); + tmp3 = __lsx_vsub_h(tmp0, tmp1); + DUP2_ARG2(__lsx_vaddi_hu, tmp2, 7, tmp3, 7, in0, in2); + DUP2_ARG2(__lsx_vsrai_h, in0, 4, in2, 4, in0, in2); + DUP2_ARG2(__lsx_vilvl_h, zero, in0, zero, in2, out0, out2); + tmp1 = RET_1_IF_NZERO_H(in3); + DUP2_ARG2(__lsx_vilvl_h, zero, tmp1, in3, in1, tmp1, tmp0); + DUP2_ARG2(__lsx_vreplvei_w, coeff, 2, coeff, 3, out3, out1); + out3 = 
__lsx_vadd_w(out3, out1); + out1 = __lsx_vreplvei_w(coeff, 1); + DUP2_ARG3(__lsx_vdp2add_w_h, out1, tmp0, const0, out3, tmp0, const1, out1, + out3); + DUP2_ARG2(__lsx_vsrai_w, out1, 16, out3, 16, out1, out3); + out1 = __lsx_vadd_w(out1, tmp1); + DUP2_ARG2(__lsx_vpickev_h, out1, out0, out3, out2, in0, in2); + __lsx_vst(in0, output, 0); + __lsx_vst(in2, output, 16); +} diff --git a/vp8/encoder/loongarch/encodeopt_lsx.c b/vp8/encoder/loongarch/encodeopt_lsx.c new file mode 100644 index 0000000000..4ad4caba60 --- /dev/null +++ b/vp8/encoder/loongarch/encodeopt_lsx.c @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" +#include "vp8/encoder/block.h" + +int32_t vp8_block_error_lsx(int16_t *coeff_ptr, int16_t *dq_coeff_ptr) { + int32_t err = 0; + __m128i dq_coeff0, dq_coeff1, coeff0, coeff1; + __m128i reg0, reg1, reg2, reg3, error; + + DUP4_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, dq_coeff_ptr, 0, + dq_coeff_ptr, 16, coeff0, coeff1, dq_coeff0, dq_coeff1); + DUP2_ARG2(__lsx_vsubwev_w_h, coeff0, dq_coeff0, coeff1, dq_coeff1, reg0, + reg2); + DUP2_ARG2(__lsx_vsubwod_w_h, coeff0, dq_coeff0, coeff1, dq_coeff1, reg1, + reg3); + error = __lsx_vmul_w(reg0, reg0); + DUP2_ARG3(__lsx_vmadd_w, error, reg1, reg1, error, reg2, reg2, error, error); + error = __lsx_vmadd_w(error, reg3, reg3); + error = __lsx_vhaddw_d_w(error, error); + err = __lsx_vpickve2gr_w(error, 0); + err += __lsx_vpickve2gr_w(error, 2); + return err; +} + +int32_t vp8_mbblock_error_lsx(MACROBLOCK *mb, int32_t dc) { + BLOCK *be; + BLOCKD *bd; + int16_t *coeff, *dq_coeff; + int32_t err = 0; + uint32_t loop_cnt; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, error; + __m128i mask0 = __lsx_vldi(0xFF); + __m128i zero = __lsx_vldi(0); + + if (dc == 1) { + mask0 = __lsx_vinsgr2vr_w(mask0, 0, 0); + } + + for (loop_cnt = 0; loop_cnt < 8; loop_cnt++) { + int32_t loop_tmp = loop_cnt << 1; + be = &mb->block[loop_tmp]; + bd = &mb->e_mbd.block[loop_tmp]; + coeff = be->coeff; + dq_coeff = bd->dqcoeff; + DUP4_ARG2(__lsx_vld, coeff, 0, coeff, 16, dq_coeff, 0, dq_coeff, 16, src0, + src1, tmp0, tmp1); + be = &mb->block[loop_tmp + 1]; + bd = &mb->e_mbd.block[loop_tmp + 1]; + coeff = be->coeff; + dq_coeff = bd->dqcoeff; + DUP4_ARG2(__lsx_vld, coeff, 0, coeff, 16, dq_coeff, 0, dq_coeff, 16, src2, + src3, tmp2, tmp3); + DUP4_ARG2(__lsx_vsubwev_w_h, src0, tmp0, src1, tmp1, src2, tmp2, src3, tmp3, + reg0, reg2, reg4, reg6); + DUP4_ARG2(__lsx_vsubwod_w_h, src0, tmp0, src1, tmp1, src2, tmp2, src3, tmp3, + reg1, reg3, reg5, reg7); + DUP2_ARG3(__lsx_vbitsel_v, zero, reg0, mask0, zero, reg4, mask0, reg0, + reg4); + error = __lsx_vmul_w(reg0, reg0); + DUP4_ARG3(__lsx_vmadd_w, error, reg1, reg1, error, reg2, reg2, error, reg3, + reg3, error, reg4, reg4, error, error, error, error); + DUP2_ARG3(__lsx_vmadd_w, error, reg5, reg5, error, reg6, reg6, error, + error); + error = __lsx_vmadd_w(error, reg7, reg7); + error = __lsx_vhaddw_d_w(error, error); + error = __lsx_vhaddw_q_d(error, error); + err += __lsx_vpickve2gr_w(error, 0); + } + return err; +} 
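Note: both kernels above compute the same quantity as the scalar reference
vp8_block_error_c, the sum of squared differences between the original and
the dequantized coefficients, with vp8_mbblock_error_lsx additionally
skipping the DC term of each block when dc == 1. A minimal scalar sketch of
the per-block form (hypothetical helper; a VP8 block has 16 coefficients):

  static int block_error_ref(const short *coeff, const short *dqcoeff,
                             int skip_dc) {
    int i, err = 0;
    for (i = skip_dc; i < 16; ++i) {
      const int d = coeff[i] - dqcoeff[i];
      err += d * d; /* accumulate squared reconstruction error */
    }
    return err;
  }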
diff --git a/vp8/encoder/loongarch/quantize_lsx.c b/vp8/encoder/loongarch/quantize_lsx.c new file mode 100644 index 0000000000..75889192a7 --- /dev/null +++ b/vp8/encoder/loongarch/quantize_lsx.c @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vp8_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" +#include "vp8/encoder/block.h" + +#define BOOST_QUANT1(_in0, _in1, _in2, _ui) \ + { \ + if (boost_temp[0] <= __lsx_vpickve2gr_h(_in0, _ui)) { \ + if (__lsx_vpickve2gr_h(_in1, _ui)) { \ + eob = _ui; \ + boost_temp = zbin_boost; \ + } else { \ + boost_temp++; \ + } \ + } else { \ + _in2 = __lsx_vinsgr2vr_h(_in2, 0, _ui); \ + boost_temp++; \ + } \ + } + +#define BOOST_QUANT2(_in0, _in1, _in2, _ui) \ + { \ + if (boost_temp[0] <= __lsx_vpickve2gr_h(_in0, _ui)) { \ + if (__lsx_vpickve2gr_h(_in1, _ui)) { \ + eob = _ui + 8; \ + boost_temp = zbin_boost; \ + } else { \ + boost_temp++; \ + } \ + } else { \ + _in2 = __lsx_vinsgr2vr_h(_in2, 0, _ui); \ + boost_temp++; \ + } \ + } + +static int8_t exact_regular_quantize_b_lsx( + int16_t *zbin_boost, int16_t *coeff_ptr, int16_t *zbin, int16_t *round, + int16_t *quant, int16_t *quant_shift, int16_t *de_quant, int16_t zbin_oq_in, + int16_t *q_coeff, int16_t *dq_coeff) { + int32_t eob; + int16_t *boost_temp = zbin_boost; + __m128i inv_zig_zag = { 0x0C07040206050100, 0x0F0E0A090D0B0803 }; + __m128i sign_z0, sign_z1, q_coeff0, q_coeff1; + __m128i z_bin0, z_bin1, zbin_o_q, x0, x1, sign_x0, sign_x1, de_quant0, + de_quant1; + __m128i z0, z1, round0, round1, quant0, quant2; + __m128i inv_zig_zag0, inv_zig_zag1; + __m128i zigzag_mask0 = { 0x0008000400010000, 0x0006000300020005 }; + __m128i zigzag_mask1 = { 0x000A000D000C0009, 0X000F000E000B0007 }; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i zero = __lsx_vldi(0); + + zbin_o_q = __lsx_vreplgr2vr_h(zbin_oq_in); + inv_zig_zag0 = __lsx_vilvl_b(zero, inv_zig_zag); + inv_zig_zag1 = __lsx_vilvh_b(zero, inv_zig_zag); + eob = -1; + DUP4_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, round, 0, round, 16, tmp0, + tmp1, tmp2, tmp3); + DUP4_ARG3(__lsx_vshuf_h, zigzag_mask0, tmp1, tmp0, zigzag_mask1, tmp1, tmp0, + zigzag_mask0, tmp3, tmp2, zigzag_mask1, tmp3, tmp2, z0, z1, round0, + round1); + DUP4_ARG2(__lsx_vld, quant, 0, quant, 16, zbin, 0, zbin, 16, tmp0, tmp1, tmp2, + tmp3); + DUP4_ARG3(__lsx_vshuf_h, zigzag_mask0, tmp1, tmp0, zigzag_mask1, tmp1, tmp0, + zigzag_mask0, tmp3, tmp2, zigzag_mask1, tmp3, tmp2, quant0, quant2, + z_bin0, z_bin1); + DUP2_ARG2(__lsx_vsrai_h, z0, 15, z1, 15, sign_z0, sign_z1); + DUP2_ARG2(__lsx_vadda_h, z0, zero, z1, zero, x0, x1); + DUP2_ARG2(__lsx_vsub_h, x0, z_bin0, x1, z_bin1, z_bin0, z_bin1); + DUP2_ARG2(__lsx_vsub_h, z_bin0, zbin_o_q, z_bin1, zbin_o_q, z_bin0, z_bin1); + DUP2_ARG2(__lsx_vmulwev_w_h, quant0, round0, quant2, round1, tmp0, tmp2); + DUP2_ARG2(__lsx_vmulwod_w_h, quant0, round0, quant2, round1, tmp1, tmp3); + DUP2_ARG3(__lsx_vmaddwev_w_h, tmp0, quant0, x0, tmp2, quant2, x1, tmp0, tmp2); + DUP2_ARG3(__lsx_vmaddwod_w_h, tmp1, quant0, x0, tmp3, quant2, x1, tmp1, tmp3); + DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, q_coeff0, q_coeff1); + + DUP2_ARG2(__lsx_vld, quant_shift, 0, quant_shift, 
16, tmp1, tmp3); + DUP2_ARG3(__lsx_vshuf_h, zigzag_mask0, tmp3, tmp1, zigzag_mask1, tmp3, tmp1, + quant0, quant2); + DUP2_ARG2(__lsx_vadd_h, x0, round0, x1, round1, x0, x1); + DUP2_ARG2(__lsx_vmulwev_w_h, quant0, q_coeff0, quant2, q_coeff1, tmp0, tmp2); + DUP2_ARG2(__lsx_vmulwod_w_h, quant0, q_coeff0, quant2, q_coeff1, tmp1, tmp3); + DUP2_ARG3(__lsx_vmaddwev_w_h, tmp0, quant0, x0, tmp2, quant2, x1, tmp0, tmp2); + DUP2_ARG3(__lsx_vmaddwod_w_h, tmp1, quant0, x0, tmp3, quant2, x1, tmp1, tmp3); + DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, x0, x1); + DUP2_ARG2(__lsx_vxor_v, x0, sign_z0, x1, sign_z1, sign_x0, sign_x1); + DUP2_ARG2(__lsx_vsub_h, sign_x0, sign_z0, sign_x1, sign_z1, sign_x0, sign_x1); + + BOOST_QUANT1(z_bin0, x0, sign_x0, 0); + BOOST_QUANT1(z_bin0, x0, sign_x0, 1); + BOOST_QUANT1(z_bin0, x0, sign_x0, 2); + BOOST_QUANT1(z_bin0, x0, sign_x0, 3); + BOOST_QUANT1(z_bin0, x0, sign_x0, 4); + BOOST_QUANT1(z_bin0, x0, sign_x0, 5); + BOOST_QUANT1(z_bin0, x0, sign_x0, 6); + BOOST_QUANT1(z_bin0, x0, sign_x0, 7); + + BOOST_QUANT2(z_bin1, x1, sign_x1, 0); + BOOST_QUANT2(z_bin1, x1, sign_x1, 1); + BOOST_QUANT2(z_bin1, x1, sign_x1, 2); + BOOST_QUANT2(z_bin1, x1, sign_x1, 3); + BOOST_QUANT2(z_bin1, x1, sign_x1, 4); + BOOST_QUANT2(z_bin1, x1, sign_x1, 5); + BOOST_QUANT2(z_bin1, x1, sign_x1, 6); + BOOST_QUANT2(z_bin1, x1, sign_x1, 7); + + DUP2_ARG2(__lsx_vld, de_quant, 0, de_quant, 16, de_quant0, de_quant1); + DUP2_ARG3(__lsx_vshuf_h, inv_zig_zag0, sign_x1, sign_x0, inv_zig_zag1, + sign_x1, sign_x0, q_coeff0, q_coeff1); + DUP2_ARG2(__lsx_vmul_h, de_quant0, q_coeff0, de_quant1, q_coeff1, de_quant0, + de_quant1); + __lsx_vst(q_coeff0, q_coeff, 0); + __lsx_vst(q_coeff1, q_coeff, 16); + __lsx_vst(de_quant0, dq_coeff, 0); + __lsx_vst(de_quant1, dq_coeff, 16); + + return (int8_t)(eob + 1); +} + +void vp8_regular_quantize_b_lsx(BLOCK *b, BLOCKD *d) { + int16_t *zbin_boost_ptr = b->zrun_zbin_boost; + int16_t *coeff_ptr = b->coeff; + int16_t *zbin_ptr = b->zbin; + int16_t *round_ptr = b->round; + int16_t *quant_ptr = b->quant; + int16_t *quant_shift_ptr = b->quant_shift; + int16_t *qcoeff_ptr = d->qcoeff; + int16_t *dqcoeff_ptr = d->dqcoeff; + int16_t *dequant_ptr = d->dequant; + int16_t zbin_oq_value = b->zbin_extra; + + *d->eob = exact_regular_quantize_b_lsx( + zbin_boost_ptr, coeff_ptr, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, dequant_ptr, zbin_oq_value, qcoeff_ptr, dqcoeff_ptr); +} diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index 3a8f8ea45a..5744cbabcc 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -124,4 +124,9 @@ ifeq ($(CONFIG_REALTIME_ONLY),yes) VP8_CX_SRCS_REMOVE-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c endif +# common (loongarch LSX intrinsics) +VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/quantize_lsx.c +VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/dct_lsx.c +VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/encodeopt_lsx.c + VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes)) diff --git a/vpx_dsp/loongarch/subtract_lsx.c b/vpx_dsp/loongarch/subtract_lsx.c new file mode 100644 index 0000000000..943a5c5a9b --- /dev/null +++ b/vpx_dsp/loongarch/subtract_lsx.c @@ -0,0 +1,371 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +static void sub_blk_4x4_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *pred_ptr, int32_t pred_stride, + int16_t *diff_ptr, int32_t diff_stride) { + __m128i src0, src1, src2, src3; + __m128i pred0, pred1, pred2, pred3; + __m128i diff0, diff1; + __m128i reg0, reg1; + int32_t src_stride2 = src_stride << 1; + int32_t pred_stride2 = pred_stride << 1; + int32_t diff_stride2 = diff_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t pred_stride3 = pred_stride2 + pred_stride; + int32_t diff_stride3 = diff_stride2 + diff_stride; + + DUP4_ARG2(__lsx_vldrepl_w, src_ptr, 0, src_ptr + src_stride, 0, + src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1, + src2, src3); + DUP4_ARG2(__lsx_vldrepl_w, pred_ptr, 0, pred_ptr + pred_stride, 0, + pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred0, + pred1, pred2, pred3); + DUP4_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, pred1, pred0, pred3, pred2, + src0, src2, pred0, pred2); + DUP2_ARG2(__lsx_vilvl_d, src2, src0, pred2, pred0, src0, pred0); + reg0 = __lsx_vilvl_b(src0, pred0); + reg1 = __lsx_vilvh_b(src0, pred0); + DUP2_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, diff0, diff1); + __lsx_vstelm_d(diff0, diff_ptr, 0, 0); + __lsx_vstelm_d(diff0, diff_ptr + diff_stride, 0, 1); + __lsx_vstelm_d(diff1, diff_ptr + diff_stride2, 0, 0); + __lsx_vstelm_d(diff1, diff_ptr + diff_stride3, 0, 1); +} + +static void sub_blk_8x8_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *pred_ptr, int32_t pred_stride, + int16_t *diff_ptr, int32_t diff_stride) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + int32_t src_stride2 = src_stride << 1; + int32_t pred_stride2 = pred_stride << 1; + int32_t dst_stride = diff_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t pred_stride3 = pred_stride2 + pred_stride; + int32_t dst_stride2 = dst_stride << 1; + int32_t src_stride4 = src_stride2 << 1; + int32_t pred_stride4 = pred_stride2 << 1; + int32_t dst_stride3 = dst_stride + dst_stride2; + + DUP4_ARG2(__lsx_vldrepl_d, src_ptr, 0, src_ptr + src_stride, 0, + src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1, + src2, src3); + DUP4_ARG2(__lsx_vldrepl_d, pred_ptr, 0, pred_ptr + pred_stride, 0, + pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred0, + pred1, pred2, pred3); + src_ptr += src_stride4; + pred_ptr += pred_stride4; + + DUP4_ARG2(__lsx_vldrepl_d, src_ptr, 0, src_ptr + src_stride, 0, + src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src4, src5, + src6, src7); + DUP4_ARG2(__lsx_vldrepl_d, pred_ptr, 0, pred_ptr + pred_stride, 0, + pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred4, + pred5, pred6, pred7); + + DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7, + src4, src5, src6, src7); + __lsx_vst(src0, diff_ptr, 0); + __lsx_vstx(src1, diff_ptr, dst_stride); + __lsx_vstx(src2, diff_ptr, dst_stride2); 
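Everything in this new file specializes one small operation. As a reference for what each sub_blk_*_lsx kernel must produce, here is the scalar form, written from memory against the contract of the vpx_subtract_block_c fallback that the dispatcher at the end of the file falls back to:

    #include <stdint.h>
    #include <stddef.h>

    /* Scalar reference: residual = source - prediction, widened to int16_t.
     * Each sub_blk_*_lsx kernel computes exactly this for one block size. */
    static void subtract_block_scalar(int rows, int cols, int16_t *diff,
                                      ptrdiff_t diff_stride, const uint8_t *src,
                                      ptrdiff_t src_stride, const uint8_t *pred,
                                      ptrdiff_t pred_stride) {
      int r, c;
      for (r = 0; r < rows; ++r) {
        for (c = 0; c < cols; ++c) diff[c] = (int16_t)(src[c] - pred[c]);
        diff += diff_stride;
        src += src_stride;
        pred += pred_stride;
      }
    }

The vector versions get their speed from doing the widening and the subtract in one step: the interleaves pair source and prediction bytes, and __lsx_vhsubw_hu_bu subtracts adjacent unsigned bytes into signed halfwords.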
+ __lsx_vstx(src3, diff_ptr, dst_stride3); + diff_ptr += dst_stride2; + __lsx_vst(src4, diff_ptr, 0); + __lsx_vstx(src5, diff_ptr, dst_stride); + __lsx_vstx(src6, diff_ptr, dst_stride2); + __lsx_vstx(src7, diff_ptr, dst_stride3); +} + +static void sub_blk_16x16_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int32_t src_stride2 = src_stride << 1; + int32_t pred_stride2 = pred_stride << 1; + int32_t dst_stride = diff_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t pred_stride3 = pred_stride2 + pred_stride; + int32_t dst_stride2 = dst_stride << 1; + int32_t src_stride4 = src_stride2 << 1; + int32_t pred_stride4 = pred_stride2 << 1; + int32_t dst_stride3 = dst_stride + dst_stride2; + int16_t *diff_tmp = diff + 8; + + DUP2_ARG2(__lsx_vld, src, 0, pred, 0, src0, pred0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + DUP4_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred, + pred_stride3, pred, pred_stride4, pred1, pred2, pred3, pred4); + src += src_stride4; + pred += pred_stride4; + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + pred, pred_stride, src5, src6, src7, pred5); + DUP2_ARG2(__lsx_vldx, pred, pred_stride2, pred, pred_stride3, pred6, pred7); + src += src_stride4; + pred += pred_stride4; + DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg0, reg2, reg4, reg6); + DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg1, reg3, reg5, reg7); + DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp0, tmp2, tmp4, tmp6); + DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp1, tmp3, tmp5, tmp7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7, + src4, src5, src6, src7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, + pred0, pred1, pred2, pred3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, tmp7, + pred4, pred5, pred6, pred7); + __lsx_vst(src0, diff, 0); + __lsx_vstx(src2, diff, dst_stride); + __lsx_vstx(src4, diff, dst_stride2); + __lsx_vstx(src6, diff, dst_stride3); + __lsx_vst(src1, diff_tmp, 0); + __lsx_vstx(src3, diff_tmp, dst_stride); + __lsx_vstx(src5, diff_tmp, dst_stride2); + __lsx_vstx(src7, diff_tmp, dst_stride3); + diff += dst_stride2; + diff_tmp += dst_stride2; + __lsx_vst(pred0, diff, 0); + __lsx_vstx(pred2, diff, dst_stride); + __lsx_vstx(pred4, diff, dst_stride2); + __lsx_vstx(pred6, diff, dst_stride3); + __lsx_vst(pred1, diff_tmp, 0); + __lsx_vstx(pred3, diff_tmp, dst_stride); + __lsx_vstx(pred5, diff_tmp, dst_stride2); + __lsx_vstx(pred7, diff_tmp, dst_stride3); + diff += dst_stride2; + diff_tmp += dst_stride2; + DUP2_ARG2(__lsx_vld, src, 0, pred, 0, src0, pred0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + DUP4_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred, + pred_stride3, pred, pred_stride4, pred1, pred2, pred3, 
pred4); + src += src_stride4; + pred += pred_stride4; + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + pred, pred_stride, src5, src6, src7, pred5); + DUP2_ARG2(__lsx_vldx, pred, pred_stride2, pred, pred_stride3, pred6, pred7); + DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg0, reg2, reg4, reg6); + DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg1, reg3, reg5, reg7); + DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp0, tmp2, tmp4, tmp6); + DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp1, tmp3, tmp5, tmp7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7, + src4, src5, src6, src7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, + pred0, pred1, pred2, pred3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, tmp7, + pred4, pred5, pred6, pred7); + __lsx_vst(src0, diff, 0); + __lsx_vstx(src2, diff, dst_stride); + __lsx_vstx(src4, diff, dst_stride2); + __lsx_vstx(src6, diff, dst_stride3); + __lsx_vst(src1, diff_tmp, 0); + __lsx_vstx(src3, diff_tmp, dst_stride); + __lsx_vstx(src5, diff_tmp, dst_stride2); + __lsx_vstx(src7, diff_tmp, dst_stride3); + diff += dst_stride2; + diff_tmp += dst_stride2; + __lsx_vst(pred0, diff, 0); + __lsx_vstx(pred2, diff, dst_stride); + __lsx_vstx(pred4, diff, dst_stride2); + __lsx_vstx(pred6, diff, dst_stride3); + __lsx_vst(pred1, diff_tmp, 0); + __lsx_vstx(pred3, diff_tmp, dst_stride); + __lsx_vstx(pred5, diff_tmp, dst_stride2); + __lsx_vstx(pred7, diff_tmp, dst_stride3); +} + +static void sub_blk_32x32_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + uint32_t loop_cnt; + int32_t src_stride2 = src_stride << 1; + int32_t pred_stride2 = pred_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t pred_stride3 = pred_stride2 + pred_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t pred_stride4 = pred_stride2 << 1; + + for (loop_cnt = 8; loop_cnt--;) { + const uint8_t *src_tmp = src + 16; + const uint8_t *pred_tmp = pred + 16; + DUP4_ARG2(__lsx_vld, src, 0, src_tmp, 0, pred, 0, pred_tmp, 0, src0, src1, + pred0, pred1); + DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src, + src_stride2, src_tmp, src_stride2, src2, src3, src4, src5); + DUP4_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, pred, + pred_stride, pred_tmp, pred_stride, src6, src7, pred2, pred3); + DUP4_ARG2(__lsx_vldx, pred, pred_stride2, pred_tmp, pred_stride2, pred, + pred_stride3, pred_tmp, pred_stride3, pred4, pred5, pred6, pred7); + DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg0, reg2, reg4, reg6); + DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg1, reg3, reg5, reg7); + DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp0, tmp2, tmp4, tmp6); + DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp1, tmp3, tmp5, tmp7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, 
reg0, reg1, reg1, reg2, reg2, reg3, + reg3, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, + reg7, src4, src5, src6, src7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, + tmp3, pred0, pred1, pred2, pred3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, + tmp7, pred4, pred5, pred6, pred7); + src += src_stride4; + pred += pred_stride4; + __lsx_vst(src0, diff, 0); + __lsx_vst(src1, diff, 16); + __lsx_vst(src2, diff, 32); + __lsx_vst(src3, diff, 48); + diff += diff_stride; + __lsx_vst(src4, diff, 0); + __lsx_vst(src5, diff, 16); + __lsx_vst(src6, diff, 32); + __lsx_vst(src7, diff, 48); + diff += diff_stride; + __lsx_vst(pred0, diff, 0); + __lsx_vst(pred1, diff, 16); + __lsx_vst(pred2, diff, 32); + __lsx_vst(pred3, diff, 48); + diff += diff_stride; + __lsx_vst(pred4, diff, 0); + __lsx_vst(pred5, diff, 16); + __lsx_vst(pred6, diff, 32); + __lsx_vst(pred7, diff, 48); + diff += diff_stride; + } +} + +static void sub_blk_64x64_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + uint32_t loop_cnt; + + for (loop_cnt = 32; loop_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + DUP4_ARG2(__lsx_vld, pred, 0, pred, 16, pred, 32, pred, 48, pred0, pred1, + pred2, pred3); + src += src_stride; + pred += pred_stride; + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src4, src5, src6, + src7); + DUP4_ARG2(__lsx_vld, pred, 0, pred, 16, pred, 32, pred, 48, pred4, pred5, + pred6, pred7); + src += src_stride; + pred += pred_stride; + + DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg0, reg2, reg4, reg6); + DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg1, reg3, reg5, reg7); + DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp0, tmp2, tmp4, tmp6); + DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp1, tmp3, tmp5, tmp7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, + reg3, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, + reg7, src4, src5, src6, src7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, + tmp3, pred0, pred1, pred2, pred3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, + tmp7, pred4, pred5, pred6, pred7); + __lsx_vst(src0, diff, 0); + __lsx_vst(src1, diff, 16); + __lsx_vst(src2, diff, 32); + __lsx_vst(src3, diff, 48); + __lsx_vst(src4, diff, 64); + __lsx_vst(src5, diff, 80); + __lsx_vst(src6, diff, 96); + __lsx_vst(src7, diff, 112); + diff += diff_stride; + __lsx_vst(pred0, diff, 0); + __lsx_vst(pred1, diff, 16); + __lsx_vst(pred2, diff, 32); + __lsx_vst(pred3, diff, 48); + __lsx_vst(pred4, diff, 64); + __lsx_vst(pred5, diff, 80); + __lsx_vst(pred6, diff, 96); + __lsx_vst(pred7, diff, 112); + diff += diff_stride; + } +} + +void vpx_subtract_block_lsx(int32_t rows, int32_t cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride) { + if (rows == cols) { + switch (rows) { + case 4: + 
+        sub_blk_4x4_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+                        diff_stride);
+        break;
+      case 8:
+        sub_blk_8x8_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+                        diff_stride);
+        break;
+      case 16:
+        sub_blk_16x16_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+                          diff_stride);
+        break;
+      case 32:
+        sub_blk_32x32_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+                          diff_stride);
+        break;
+      case 64:
+        sub_blk_64x64_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+                          diff_stride);
+        break;
+      default:
+        vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
+                             src_stride, pred_ptr, pred_stride);
+        break;
+    }
+  } else {
+    vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
+                         pred_ptr, pred_stride);
+  }
+}
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 7de8b02055..9d8c94545d 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -382,6 +382,8 @@ DSP_SRCS-$(HAVE_SSE2) += x86/subtract_sse2.asm
 DSP_SRCS-$(HAVE_VSX) += ppc/sad_vsx.c
 DSP_SRCS-$(HAVE_VSX) += ppc/subtract_vsx.c
 
+DSP_SRCS-$(HAVE_LSX) += loongarch/subtract_lsx.c
+
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 706af97e50..1ef99e641d 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -730,7 +730,7 @@ ()
 # Block subtraction
 #
 add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
-specialize qw/vpx_subtract_block neon msa mmi sse2 vsx/;
+specialize qw/vpx_subtract_block neon msa mmi sse2 vsx lsx/;
 
 #
 # Single block SAD
From bfbb79e252b9102ca5ae3ad5ab605254ce6681d2 Mon Sep 17 00:00:00 2001
From: yuanhecai
Date: Wed, 20 Apr 2022 11:13:13 +0800
Subject: [PATCH 297/926] vp8[loongarch]: Optimize sub_pixel_variance8x8/16x16

1. vpx_sub_pixel_variance8x8_lsx
2. vpx_sub_pixel_variance16x16_lsx
3.
vpx_mse16x16_lsx Bug: webm:1755 Change-Id: Iaedd8393c950c13042a0597d0d47b534a2723317 --- test/variance_test.cc | 12 +- vpx_dsp/loongarch/sub_pixel_variance_lsx.c | 212 ++++++++++++++++----- vpx_dsp/loongarch/variance_lsx.c | 102 +++++----- vpx_dsp/loongarch/variance_lsx.h | 62 ++++++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +- 6 files changed, 301 insertions(+), 94 deletions(-) create mode 100644 vpx_dsp/loongarch/variance_lsx.h diff --git a/test/variance_test.cc b/test/variance_test.cc index a11ce25a63..80855052dc 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1651,6 +1651,9 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_MMI #if HAVE_LSX +INSTANTIATE_TEST_SUITE_P(LSX, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_lsx))); + INSTANTIATE_TEST_SUITE_P( LSX, VpxVarianceTest, ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_lsx), @@ -1658,9 +1661,12 @@ INSTANTIATE_TEST_SUITE_P( VarianceParams(4, 4, &vpx_variance16x16_lsx), VarianceParams(3, 3, &vpx_variance8x8_lsx))); -INSTANTIATE_TEST_SUITE_P(LSX, VpxSubpelVarianceTest, - ::testing::Values(SubpelVarianceParams( - 5, 5, &vpx_sub_pixel_variance32x32_lsx, 0))); +INSTANTIATE_TEST_SUITE_P( + LSX, VpxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_lsx, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_lsx, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_lsx, 0))); INSTANTIATE_TEST_SUITE_P(LSX, VpxSubpelAvgVarianceTest, ::testing::Values(SubpelAvgVarianceParams( diff --git a/vpx_dsp/loongarch/sub_pixel_variance_lsx.c b/vpx_dsp/loongarch/sub_pixel_variance_lsx.c index c7d233af82..700793531c 100644 --- a/vpx_dsp/loongarch/sub_pixel_variance_lsx.c +++ b/vpx_dsp/loongarch/sub_pixel_variance_lsx.c @@ -10,47 +10,17 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" -#include "vpx_util/loongson_intrinsics.h" +#include "vpx_dsp/loongarch/variance_lsx.h" #include "vpx_dsp/variance.h" -#define HADD_SW_S32(in0, in1) \ - do { \ - __m128i res0_m; \ - \ - res0_m = __lsx_vhaddw_d_w(in0, in0); \ - res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); \ - in1 = __lsx_vpickve2gr_w(res0_m, 0); \ - } while (0) - -#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift, in2) \ - do { \ - __m128i tmp0_m, tmp1_m; \ - \ - tmp0_m = __lsx_vshuf_b(in1, in0, mask); \ - tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff); \ - in2 = __lsx_vsrari_h(tmp1_m, shift); \ - } while (0) - -#define CALC_MSE_AVG_B(src, ref, var, sub) \ - { \ - __m128i src_l0_m, src_l1_m; \ - __m128i res_l0_m, res_l1_m; \ - \ - src_l0_m = __lsx_vilvl_b(src, ref); \ - src_l1_m = __lsx_vilvh_b(src, ref); \ - DUP2_ARG2(__lsx_vhsubw_hu_bu, src_l0_m, src_l0_m, src_l1_m, src_l1_m, \ - res_l0_m, res_l1_m); \ - var = __lsx_vdp2add_w_h(var, res_l0_m, res_l0_m); \ - var = __lsx_vdp2add_w_h(var, res_l1_m, res_l1_m); \ - sub = __lsx_vadd_h(sub, res_l0_m); \ - sub = __lsx_vadd_h(sub, res_l1_m); \ - } - static const uint8_t bilinear_filters_lsx[8][2] = { { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, }; +#define VARIANCE_WxH(sse, diff, shift) \ + (sse) - (((uint32_t)(diff) * (diff)) >> (shift)) + #define VARIANCE_LARGE_WxH(sse, diff, shift) \ (sse) - (((int64_t)(diff) * (diff)) >> (shift)) @@ -59,8 +29,7 @@ static uint32_t avg_sse_diff_64x64_lsx(const uint8_t *src_ptr, const uint8_t *ref_ptr, int32_t ref_stride, const uint8_t *sec_pred, int32_t *diff) { - int32_t ht_cnt = 32; - uint32_t res; + int32_t res, ht_cnt = 32; __m128i src0, src1, src2, src3, 
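Two conventions recur throughout these variance kernels and are easy to miss in the intrinsics. First, each row of bilinear_filters_lsx is a 2-tap filter for one eighth-pel offset, and the two taps always sum to 128 (1 << FILTER_BITS). Second, VARIANCE_WxH is the usual "mean of squares minus square of the mean" identity with the division replaced by a shift, since the pixel count is a power of two. A self-contained restatement of that identity follows; the 64-bit product mirrors VARIANCE_LARGE_WxH, which avoids 32-bit overflow on the larger blocks.

    #include <stdint.h>

    /* variance = E[d^2] - E[d]^2 over N = 1 << log2_n pixels, given the
     * accumulated sum of squared differences (sse) and sum of differences
     * (sum) produced by reductions such as HADD_SW_S32. */
    static uint32_t variance_from_sse_sum(uint32_t sse, int32_t sum,
                                          int log2_n) {
      return sse - (uint32_t)(((int64_t)sum * sum) >> log2_n);
    }

CALC_MSE_AVG_B feeds both inputs at once: its var accumulator collects d * d per lane and its sub accumulator collects d, which the callers reduce into *diff and the returned sse.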
ref0, ref1, ref2, ref3; __m128i pred0, pred1, pred2, pred3, vec, vec_tmp; __m128i avg0, avg1, avg2, avg3; @@ -119,11 +88,58 @@ static uint32_t avg_sse_diff_64x64_lsx(const uint8_t *src_ptr, return res; } +static uint32_t sub_pixel_sse_diff_8width_h_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3; + __m128i vec0, vec1, vec2, vec3, filt0, out, vec; + __m128i mask = { 0x0403030202010100, 0x0807070606050504 }; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + for (; loop_cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + ref0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2); + ref3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + + DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1); + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, vec0, vec1, vec2, vec3); + DUP4_ARG3(__lsx_vssrarni_bu_h, vec0, vec0, FILTER_BITS, vec1, vec1, + FILTER_BITS, vec2, vec2, FILTER_BITS, vec3, vec3, FILTER_BITS, + src0, src1, src2, src3); + out = __lsx_vpackev_d(src1, src0); + CALC_MSE_AVG_B(out, ref0, var, avg); + out = __lsx_vpackev_d(src3, src2); + CALC_MSE_AVG_B(out, ref1, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + static uint32_t sub_pixel_sse_diff_16width_h_lsx( const uint8_t *src, int32_t src_stride, const uint8_t *dst, int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { uint32_t loop_cnt = (height >> 2); - uint32_t res; + int32_t res; __m128i src0, src1, src2, src3, src4, src5, src6, src7; __m128i dst0, dst1, dst2, dst3, filt0; __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; @@ -172,7 +188,6 @@ static uint32_t sub_pixel_sse_diff_16width_h_lsx( vec = __lsx_vhaddw_w_h(avg, avg); HADD_SW_S32(vec, *diff); HADD_SW_S32(var, res); - return res; } @@ -195,11 +210,59 @@ static uint32_t sub_pixel_sse_diff_32width_h_lsx( return sse; } +static uint32_t sub_pixel_sse_diff_8width_v_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4; + __m128i vec, vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3, filt0; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = 
__lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + ref0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2); + ref3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + + DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + src0 = src4; + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + static uint32_t sub_pixel_sse_diff_16width_v_lsx( const uint8_t *src, int32_t src_stride, const uint8_t *dst, int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { uint32_t loop_cnt = (height >> 2); - uint32_t res; + int32_t res; __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4; __m128i out0, out1, out2, out3, tmp0, tmp1, filt0, vec; __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; @@ -252,7 +315,6 @@ static uint32_t sub_pixel_sse_diff_16width_v_lsx( vec = __lsx_vhaddw_w_h(avg, avg); HADD_SW_S32(vec, *diff); HADD_SW_S32(var, res); - return res; } @@ -275,12 +337,70 @@ static uint32_t sub_pixel_sse_diff_32width_v_lsx( return sse; } +static uint32_t sub_pixel_sse_diff_8width_hv_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4, out0, out1; + __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3, vec, vec0, filt_hz, filt_vt; + __m128i mask = { 0x0403030202010100, 0x0807070606050504 }; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + src0 = __lsx_vld(src, 0); + src += src_stride; + HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src1, ref0); + src += src_stride; + dst += dst_stride; + DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src2, ref1); + src += src_stride; + dst += dst_stride; + DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src3, ref2); + src += src_stride; + dst += dst_stride; + DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src4, ref3); + src += src_stride; + dst += dst_stride; + + DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1); + HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out1); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); + HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); + + HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out1); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); + HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out0); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, out0, out1); + 
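The hv path above composes the two 2-tap passes: HORIZ_2TAP_FILT_UH produces rounded intermediate rows, and the vertical taps blend each pair of consecutive intermediates before CALC_MSE_AVG_B accumulates the error against the reference. Below is a scalar model of that two-pass filtering, as a sketch only: FILTER_BITS is 7 in vpx_dsp and each filter pair sums to 128, edge handling is simplified, and the function assumes height + 1 readable source rows and 9 readable columns.

    #include <stdint.h>

    #define BIL_FILTER_BITS 7 /* vpx_dsp's FILTER_BITS; taps sum to 128 */

    /* Two-pass 2-tap sub-pixel filter for an 8-wide column of rows:
     * horizontal pass into intermediate rows, then a vertical pass across
     * adjacent intermediates. */
    static void bilinear_8wide_hv_scalar(const uint8_t *src, int src_stride,
                                         uint8_t *dst, int dst_stride,
                                         int height, const uint8_t fh[2],
                                         const uint8_t fv[2]) {
      uint16_t mid[2][8]; /* ping-pong intermediate rows */
      int r, c, cur = 0;
      for (c = 0; c < 8; ++c)
        mid[cur][c] = (uint16_t)((src[c] * fh[0] + src[c + 1] * fh[1] +
                                  (1 << (BIL_FILTER_BITS - 1))) >>
                                 BIL_FILTER_BITS);
      for (r = 0; r < height; ++r) {
        const uint8_t *s = src + (r + 1) * src_stride;
        const int nxt = cur ^ 1;
        for (c = 0; c < 8; ++c)
          mid[nxt][c] = (uint16_t)((s[c] * fh[0] + s[c + 1] * fh[1] +
                                    (1 << (BIL_FILTER_BITS - 1))) >>
                                   BIL_FILTER_BITS);
        for (c = 0; c < 8; ++c)
          dst[r * dst_stride + c] =
              (uint8_t)((mid[cur][c] * fv[0] + mid[nxt][c] * fv[1] +
                         (1 << (BIL_FILTER_BITS - 1))) >>
                        BIL_FILTER_BITS);
        cur = nxt;
      }
    }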
CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + static uint32_t sub_pixel_sse_diff_16width_hv_lsx( const uint8_t *src, int32_t src_stride, const uint8_t *dst, int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, int32_t height, int32_t *diff) { uint32_t loop_cnt = (height >> 2); - uint32_t res; + int32_t res; __m128i src0, src1, src2, src3, src4, src5, src6, src7; __m128i ref0, ref1, ref2, ref3, filt_hz, filt_vt, vec0, vec1; __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1, vec; @@ -378,7 +498,7 @@ static uint32_t subpel_avg_ssediff_16w_h_lsx( int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, int32_t height, int32_t *diff, int32_t width) { uint32_t loop_cnt = (height >> 2); - uint32_t res; + int32_t res; __m128i src0, src1, src2, src3, src4, src5, src6, src7; __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; __m128i pred0, pred1, pred2, pred3, filt0, vec; @@ -450,7 +570,7 @@ static uint32_t subpel_avg_ssediff_16w_v_lsx( int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, int32_t height, int32_t *diff, int32_t width) { uint32_t loop_cnt = (height >> 2); - uint32_t res; + int32_t res; __m128i ref0, ref1, ref2, ref3, pred0, pred1, pred2, pred3; __m128i src0, src1, src2, src3, src4, out0, out1, out2, out3; __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; @@ -527,7 +647,7 @@ static uint32_t subpel_avg_ssediff_16w_hv_lsx( int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) { uint32_t loop_cnt = (height >> 2); - uint32_t res; + int32_t res; __m128i src0, src1, src2, src3, src4, src5, src6, src7; __m128i ref0, ref1, ref2, ref3, pred0, pred1, pred2, pred3; __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1; @@ -674,6 +794,8 @@ static uint32_t sub_pixel_avg_sse_diff_64width_hv_lsx( return sse; } +#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6) +#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8) #define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10) #define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12) @@ -712,6 +834,8 @@ static uint32_t sub_pixel_avg_sse_diff_64width_hv_lsx( return var; \ } +VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(8, 8) +VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(16, 16) VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(32, 32) #define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_LSX(ht) \ diff --git a/vpx_dsp/loongarch/variance_lsx.c b/vpx_dsp/loongarch/variance_lsx.c index 5223e0f169..8fad342c71 100644 --- a/vpx_dsp/loongarch/variance_lsx.c +++ b/vpx_dsp/loongarch/variance_lsx.c @@ -9,33 +9,7 @@ */ #include "./vpx_dsp_rtcd.h" -#include "vpx_util/loongson_intrinsics.h" - -#define HADD_SW_S32(in) \ - ({ \ - __m128i res0_m; \ - int32_t sum_m; \ - \ - res0_m = __lsx_vhaddw_d_w(in, in); \ - res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); \ - sum_m = __lsx_vpickve2gr_w(res0_m, 0); \ - sum_m; \ - }) - -#define CALC_MSE_AVG_B(src, ref, var, sub) \ - { \ - __m128i src_l0_m, src_l1_m; \ - __m128i res_l0_m, res_l1_m; \ - \ - src_l0_m = __lsx_vilvl_b(src, ref); \ - src_l1_m = __lsx_vilvh_b(src, ref); \ - DUP2_ARG2(__lsx_vhsubw_hu_bu, src_l0_m, src_l0_m, src_l1_m, src_l1_m, \ - res_l0_m, res_l1_m); \ - var = __lsx_vdp2add_w_h(var, res_l0_m, res_l0_m); \ - var = __lsx_vdp2add_w_h(var, res_l1_m, res_l1_m); \ - sub = __lsx_vadd_h(sub, res_l0_m); \ - sub = __lsx_vadd_h(sub, 
res_l1_m); \ - } +#include "vpx_dsp/loongarch/variance_lsx.h" #define VARIANCE_WxH(sse, diff, shift) \ (sse) - (((uint32_t)(diff) * (diff)) >> (shift)) @@ -46,7 +20,7 @@ static uint32_t sse_diff_8width_lsx(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride, int32_t height, int32_t *diff) { - int32_t ht_cnt = (height >> 2); + int32_t res, ht_cnt = (height >> 2); __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, vec; __m128i avg = __lsx_vldi(0); __m128i var = avg; @@ -74,15 +48,15 @@ static uint32_t sse_diff_8width_lsx(const uint8_t *src_ptr, int32_t src_stride, } vec = __lsx_vhaddw_w_h(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; } static uint32_t sse_diff_16width_lsx(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride, int32_t height, int32_t *diff) { - int32_t ht_cnt = (height >> 2); + int32_t res, ht_cnt = (height >> 2); __m128i src, ref, vec; __m128i avg = __lsx_vldi(0); __m128i var = avg; @@ -112,15 +86,15 @@ static uint32_t sse_diff_16width_lsx(const uint8_t *src_ptr, int32_t src_stride, CALC_MSE_AVG_B(src, ref, var, avg); } vec = __lsx_vhaddw_w_h(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; } static uint32_t sse_diff_32width_lsx(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride, int32_t height, int32_t *diff) { - int32_t ht_cnt = (height >> 2); + int32_t res, ht_cnt = (height >> 2); __m128i avg = __lsx_vldi(0); __m128i src0, src1, ref0, ref1; __m128i vec; @@ -157,15 +131,15 @@ static uint32_t sse_diff_32width_lsx(const uint8_t *src_ptr, int32_t src_stride, } vec = __lsx_vhaddw_w_h(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; } static uint32_t sse_diff_64x64_lsx(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride, int32_t *diff) { - int32_t ht_cnt = 32; + int32_t res, ht_cnt = 32; __m128i avg0 = __lsx_vldi(0); __m128i src0, src1, src2, src3; __m128i ref0, ref1, ref2, ref3; @@ -205,12 +179,12 @@ static uint32_t sse_diff_64x64_lsx(const uint8_t *src_ptr, int32_t src_stride, vec0 = __lsx_vadd_w(vec0, vec1); vec1 = __lsx_vhaddw_w_h(avg3, avg3); vec0 = __lsx_vadd_w(vec0, vec1); - *diff = HADD_SW_S32(vec0); - - return HADD_SW_S32(var); + HADD_SW_S32(vec0, *diff); + HADD_SW_S32(var, res); + return res; } -#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6); +#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6) #define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8) #define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10) @@ -228,6 +202,38 @@ static uint32_t sse_diff_64x64_lsx(const uint8_t *src_ptr, int32_t src_stride, return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ } +static uint32_t sse_16width_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t res, ht_cnt = (height >> 2); + __m128i src, ref; + __m128i var = __lsx_vldi(0); + + for (; ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref); + src_ptr += src_stride; + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref); + src_ptr += src_stride; + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, 
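vpx_mse16x16 differs from the variance entry points only in dropping the sum term: the return value is the raw sum of squared differences that CALC_MSE_B accumulates, with no mean correction. A scalar equivalent of the new sse_16width_lsx helper, generalized over block size for clarity:

    #include <stdint.h>

    /* Scalar model of the MSE helper: accumulate squared differences over
     * a width x height block. vpx_mse16x16 stores this in *sse and also
     * returns it. */
    static uint32_t sse_wxh_scalar(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   int width, int height) {
      uint32_t sse = 0;
      int r, c;
      for (r = 0; r < height; ++r) {
        for (c = 0; c < width; ++c) {
          const int d = src[c] - ref[c];
          sse += (uint32_t)(d * d);
        }
        src += src_stride;
        ref += ref_stride;
      }
      return sse;
    }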
ref); + src_ptr += src_stride; + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref); + src_ptr += src_stride; + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + } + HADD_SW_S32(var, res); + return res; +} + VPX_VARIANCE_WDXHT_LSX(8, 8) VPX_VARIANCE_WDXHT_LSX(16, 16) VPX_VARIANCE_WDXHT_LSX(32, 32) @@ -242,6 +248,14 @@ uint32_t vpx_variance64x64_lsx(const uint8_t *src, int32_t src_stride, return VARIANCE_64Wx64H(*sse, diff); } +uint32_t vpx_mse16x16_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + *sse = sse_16width_lsx(src, src_stride, ref, ref_stride, 16); + + return *sse; +} + void vpx_get16x16var_lsx(const uint8_t *src, int32_t src_stride, const uint8_t *ref, int32_t ref_stride, uint32_t *sse, int32_t *sum) { diff --git a/vpx_dsp/loongarch/variance_lsx.h b/vpx_dsp/loongarch/variance_lsx.h new file mode 100644 index 0000000000..cf9e9890ff --- /dev/null +++ b/vpx_dsp/loongarch/variance_lsx.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_ +#define VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_ + +#include "vpx_util/loongson_intrinsics.h" + +#define HADD_SW_S32(in0, in1) \ + do { \ + __m128i res0_m; \ + \ + res0_m = __lsx_vhaddw_d_w(in0, in0); \ + res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); \ + in1 = __lsx_vpickve2gr_w(res0_m, 0); \ + } while (0) + +#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift, in2) \ + do { \ + __m128i tmp0_m, tmp1_m; \ + \ + tmp0_m = __lsx_vshuf_b(in1, in0, mask); \ + tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff); \ + in2 = __lsx_vsrari_h(tmp1_m, shift); \ + } while (0) + +#define CALC_MSE_B(src, ref, var) \ + do { \ + __m128i src_l0_m, src_l1_m; \ + __m128i res_l0_m, res_l1_m; \ + \ + src_l0_m = __lsx_vilvl_b(src, ref); \ + src_l1_m = __lsx_vilvh_b(src, ref); \ + DUP2_ARG2(__lsx_vhsubw_hu_bu, src_l0_m, src_l0_m, src_l1_m, src_l1_m, \ + res_l0_m, res_l1_m); \ + var = __lsx_vdp2add_w_h(var, res_l0_m, res_l0_m); \ + var = __lsx_vdp2add_w_h(var, res_l1_m, res_l1_m); \ + } while (0) + +#define CALC_MSE_AVG_B(src, ref, var, sub) \ + do { \ + __m128i src_l0_m, src_l1_m; \ + __m128i res_l0_m, res_l1_m; \ + \ + src_l0_m = __lsx_vilvl_b(src, ref); \ + src_l1_m = __lsx_vilvh_b(src, ref); \ + DUP2_ARG2(__lsx_vhsubw_hu_bu, src_l0_m, src_l0_m, src_l1_m, src_l1_m, \ + res_l0_m, res_l1_m); \ + var = __lsx_vdp2add_w_h(var, res_l0_m, res_l0_m); \ + var = __lsx_vdp2add_w_h(var, res_l1_m, res_l1_m); \ + sub = __lsx_vadd_h(sub, res_l0_m); \ + sub = __lsx_vadd_h(sub, res_l1_m); \ + } while (0) + +#endif // VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_ diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 9d8c94545d..4f5a7a1908 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -402,6 +402,7 @@ DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c +DSP_SRCS-$(HAVE_LSX) += loongarch/variance_lsx.h DSP_SRCS-$(HAVE_LSX) += loongarch/variance_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/sub_pixel_variance_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/avg_pred_lsx.c diff --git 
a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 1ef99e641d..23925a4793 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1129,7 +1129,7 @@ () specialize qw/vpx_get8x8var sse2 neon msa vsx/; add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi vsx lsx/; add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_mse16x8 sse2 avx2 msa mmi vsx/; @@ -1171,7 +1171,7 @@ () specialize qw/vpx_sub_pixel_variance16x32 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance16x16 neon msa mmi sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance16x16 neon msa mmi sse2 ssse3 lsx/; add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance16x8 neon msa mmi sse2 ssse3/; @@ -1180,7 +1180,7 @@ () specialize qw/vpx_sub_pixel_variance8x16 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance8x8 neon msa mmi sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance8x8 neon msa mmi sse2 ssse3 lsx/; add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance8x4 neon msa mmi sse2 ssse3/; From 508c0aff89b511d04cbd1e782cc24313fd6ae06b Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Wed, 20 Apr 2022 11:16:55 +0800 Subject: [PATCH 298/926] vp8[loongarch]: Optimize fdct8x4/diamond_search_sad 1. vp8_short_fdct8x4_lsx 2. vp8_diamond_search_sad_lsx 3. 
vpx_sad8x8_lsx Bug: webm:1755 Change-Id: Ic9df84ead2d4fc07ec58e9730d6a12ac2b2d31c1 --- test/sad_test.cc | 1 + vp8/common/rtcd_defs.pl | 5 +-- vp8/encoder/loongarch/dct_lsx.c | 62 +++++++++++++++++++++++++++++++++ vp8/encoder/mcomp.c | 4 +-- vpx_dsp/loongarch/sad_lsx.c | 36 ++++++++++++++++++- vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- 6 files changed, 104 insertions(+), 6 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 7ce25343f6..2506f1adbc 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1136,6 +1136,7 @@ const SadMxNParam lsx_tests[] = { SadMxNParam(64, 64, &vpx_sad64x64_lsx), SadMxNParam(32, 32, &vpx_sad32x32_lsx), SadMxNParam(16, 16, &vpx_sad16x16_lsx), + SadMxNParam(8, 8, &vpx_sad8x8_lsx), }; INSTANTIATE_TEST_SUITE_P(LSX, SADTest, ::testing::ValuesIn(lsx_tests)); diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 4f45d2ab9a..7bc866faaa 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -187,7 +187,7 @@ () specialize qw/vp8_short_fdct4x4 sse2 neon msa mmi lsx/; add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch"; -specialize qw/vp8_short_fdct8x4 sse2 neon msa mmi/; +specialize qw/vp8_short_fdct8x4 sse2 neon msa mmi lsx/; add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch"; specialize qw/vp8_short_walsh4x4 sse2 neon msa mmi/; @@ -222,9 +222,10 @@ () $vp8_refining_search_sad_msa=vp8_refining_search_sadx4; add_proto qw/int vp8_diamond_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"; -specialize qw/vp8_diamond_search_sad sse2 msa/; +specialize qw/vp8_diamond_search_sad sse2 msa lsx/; $vp8_diamond_search_sad_sse2=vp8_diamond_search_sadx4; $vp8_diamond_search_sad_msa=vp8_diamond_search_sadx4; +$vp8_diamond_search_sad_lsx=vp8_diamond_search_sadx4; # # Alt-ref Noise Reduction (ARNR) diff --git a/vp8/encoder/loongarch/dct_lsx.c b/vp8/encoder/loongarch/dct_lsx.c index e090d2360f..a08d4d3f63 100644 --- a/vp8/encoder/loongarch/dct_lsx.c +++ b/vp8/encoder/loongarch/dct_lsx.c @@ -97,3 +97,65 @@ void vp8_short_fdct4x4_lsx(int16_t *input, int16_t *output, int32_t pitch) { __lsx_vst(in0, output, 0); __lsx_vst(in2, output, 16); } + +void vp8_short_fdct8x4_lsx(int16_t *input, int16_t *output, int32_t pitch) { + __m128i in0, in1, in2, in3, temp0, temp1, tmp0, tmp1; + __m128i const0, const1, const2, vec0_w, vec1_w, vec2_w, vec3_w; + __m128i coeff = { 0x38a4eb1814e808a9, 0x659061a82ee01d4c }; + __m128i zero = __lsx_vldi(0); + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + + in0 = __lsx_vld(input, 0); + DUP2_ARG2(__lsx_vldx, input, pitch, input, pitch2, in1, in2); + in3 = __lsx_vldx(input, pitch3); + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, temp1, in1, in3); + DUP4_ARG2(__lsx_vslli_h, temp0, 3, temp1, 3, in1, 3, in3, 3, temp0, temp1, + in1, in3); + in0 = __lsx_vadd_h(temp0, temp1); + in2 = __lsx_vsub_h(temp0, temp1); + SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2); + temp0 = __lsx_vreplvei_h(coeff, 3); + vec1_w = __lsx_vpackev_h(zero, temp0); + coeff = __lsx_vilvh_h(zero, coeff); + vec3_w = __lsx_vreplvei_w(coeff, 0); + tmp1 = __lsx_vilvl_h(in3, in1); + tmp0 = __lsx_vilvh_h(in3, in1); + vec0_w = vec1_w; + vec2_w = vec3_w; + DUP4_ARG3(__lsx_vdp2add_w_h, vec0_w, tmp1, const1, vec1_w, tmp0, const1, + vec2_w, tmp1, const2, 
vec3_w, tmp0, const2, vec0_w, vec1_w, vec2_w, + vec3_w); + DUP2_ARG3(__lsx_vsrani_h_w, vec1_w, vec0_w, 12, vec3_w, vec2_w, 12, in1, in3); + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, temp1, in1, in3); + in0 = __lsx_vadd_h(temp0, temp1); + in0 = __lsx_vaddi_hu(in0, 7); + in2 = __lsx_vsub_h(temp0, temp1); + in2 = __lsx_vaddi_hu(in2, 7); + in0 = __lsx_vsrai_h(in0, 4); + in2 = __lsx_vsrai_h(in2, 4); + DUP2_ARG2(__lsx_vreplvei_w, coeff, 2, coeff, 3, vec3_w, vec1_w); + vec3_w = __lsx_vadd_w(vec3_w, vec1_w); + vec1_w = __lsx_vreplvei_w(coeff, 1); + const0 = RET_1_IF_NZERO_H(in3); + tmp1 = __lsx_vilvl_h(in3, in1); + tmp0 = __lsx_vilvh_h(in3, in1); + vec0_w = vec1_w; + vec2_w = vec3_w; + DUP4_ARG3(__lsx_vdp2add_w_h, vec0_w, tmp1, const1, vec1_w, tmp0, const1, + vec2_w, tmp1, const2, vec3_w, tmp0, const2, vec0_w, vec1_w, vec2_w, + vec3_w); + DUP2_ARG3(__lsx_vsrani_h_w, vec1_w, vec0_w, 16, vec3_w, vec2_w, 16, in1, in3); + in1 = __lsx_vadd_h(in1, const0); + DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, temp0, temp1); + __lsx_vst(temp0, output, 0); + __lsx_vst(temp1, output, 16); + + DUP2_ARG2(__lsx_vpickod_d, in1, in0, in3, in2, in0, in2); + __lsx_vst(in0, output, 32); + __lsx_vst(in2, output, 48); +} diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index 769c2f5589..ae092c66e1 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -1129,7 +1129,7 @@ int vp8_diamond_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); } -#if HAVE_SSE2 || HAVE_MSA +#if HAVE_SSE2 || HAVE_MSA || HAVE_LSX int vp8_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int_mv *best_mv, int search_param, int sad_per_bit, int *num00, vp8_variance_fn_ptr_t *fn_ptr, @@ -1278,7 +1278,7 @@ int vp8_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); } -#endif // HAVE_SSE2 || HAVE_MSA +#endif // HAVE_SSE2 || HAVE_MSA || HAVE_LSX int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int sad_per_bit, int distance, diff --git a/vpx_dsp/loongarch/sad_lsx.c b/vpx_dsp/loongarch/sad_lsx.c index 4764acbf88..46ee557df5 100644 --- a/vpx_dsp/loongarch/sad_lsx.c +++ b/vpx_dsp/loongarch/sad_lsx.c @@ -57,6 +57,34 @@ sum_m; \ }) +static uint32_t sad_8width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, sad_tmp; + __m128i sad = __lsx_vldi(0); + + for (ht_cnt = (height >> 2); ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0); + src += src_stride; + ref += ref_stride; + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src1, ref1); + src += src_stride; + ref += ref_stride; + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src2, ref2); + src += src_stride; + ref += ref_stride; + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src3, ref3); + src += src_stride; + ref += ref_stride; + DUP4_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, ref1, ref0, ref3, ref2, + src0, src1, ref0, ref1); + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + } + return HADD_UH_U32(sad); +} + static uint32_t sad_16width_lsx(const uint8_t *src, int32_t src_stride, const uint8_t *ref, int32_t ref_stride, int32_t height) { @@ -584,6 +612,12 @@ static uint32_t avgsad_64width_lsx(const uint8_t 
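sad_8width_lsx above packs two 8-byte rows into one vector so that each absolute-difference/accumulate step covers two rows at once. The scalar contract it has to match is the plain sum of absolute differences, sketched here; vpx_sad8x8_lsx is this with width = height = 8.

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar sum-of-absolute-differences reference for the SAD kernels. */
    static uint32_t sad_wxh_scalar(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   int width, int height) {
      uint32_t sad = 0;
      int r, c;
      for (r = 0; r < height; ++r) {
        for (c = 0; c < width; ++c) sad += (uint32_t)abs(src[c] - ref[c]);
        src += src_stride;
        ref += ref_stride;
      }
      return sad;
    }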
*src, int32_t src_stride, return HADD_SW_S32(sad); } +#define VPX_SAD_8xHT_LSX(height) \ + uint32_t vpx_sad8x##height##_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_8width_lsx(src, src_stride, ref, ref_stride, height); \ + } + #define VPX_SAD_16xHT_LSX(height) \ uint32_t vpx_sad16x##height##_lsx(const uint8_t *src, int32_t src_stride, \ const uint8_t *ref, int32_t ref_stride) { \ @@ -662,7 +696,7 @@ SAD32 SAD16 -#define SAD8 VPX_SAD_8xHTx4D_LSX(8) +#define SAD8 VPX_SAD_8xHT_LSX(8) VPX_SAD_8xHTx4D_LSX(8) SAD8 diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 23925a4793..e82b487f13 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -763,7 +763,7 @@ () specialize qw/vpx_sad8x16 neon msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad8x8 neon msa sse2 vsx mmi/; +specialize qw/vpx_sad8x8 neon msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad8x4 neon msa sse2 vsx mmi/; From 1c39c625264fa64db1c573cbac1f3a4f24c660d3 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Sun, 24 Apr 2022 10:34:21 +0800 Subject: [PATCH 299/926] vp8[loongarch]: Optimize vp8_sixtap_predict4x4 1. vp8_sixtap_predict4x4 Bug: webm:1755 Change-Id: If7d844496ef2cfe2252f2ef12bb7cded63ad03dd --- vp8/common/loongarch/sixtap_filter_lsx.c | 745 ++++++++++++++++++++++- vp8/common/rtcd_defs.pl | 2 +- 2 files changed, 743 insertions(+), 4 deletions(-) diff --git a/vp8/common/loongarch/sixtap_filter_lsx.c b/vp8/common/loongarch/sixtap_filter_lsx.c index 75fe533d98..a23ed16d2b 100644 --- a/vp8/common/loongarch/sixtap_filter_lsx.c +++ b/vp8/common/loongarch/sixtap_filter_lsx.c @@ -50,9 +50,9 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { __m128i vec0_m, vec1_m, vec2_m; \ __m128i hz_out_m; \ \ - DUP2_ARG3(__lsx_vshuf_b, src0, src1, mask0, src0, src1, mask1, vec0_m, \ + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m, \ vec1_m); \ - vec2_m = __lsx_vshuf_b(src0, src1, mask2); \ + vec2_m = __lsx_vshuf_b(src1, src0, mask2); \ hz_out_m = DPADD_H3(vec0_m, vec1_m, vec2_m, filt_h0, filt_h1, filt_h2); \ \ hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT); \ @@ -61,6 +61,24 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { hz_out_m; \ }) +#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, filt0, filt1, filt2, out0, out1) \ + { \ + __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m; \ + \ + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src3, src2, mask0, vec0_m, \ + vec1_m); \ + DUP2_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, out0, out1); \ + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask1, src3, src2, mask1, vec2_m, \ + vec3_m); \ + DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec2_m, filt1, out1, vec3_m, filt1, \ + out0, out1); \ + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask2, src3, src2, mask2, vec4_m, \ + vec5_m); \ + DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2, \ + out0, out1); \ + } + #define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ mask2, filt0, filt1, filt2, out0, out1, \ out2, out3) \ @@ -104,7 +122,7 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { __m128i vec0_m, vec1_m; \ __m128i hz_out_m; \ \ - DUP2_ARG3(__lsx_vshuf_b, src0, src1, mask0, src0, src1, mask1, vec0_m, \ + 
DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m, \ vec1_m); \ hz_out_m = FILT_4TAP_DPADD_H(vec0_m, vec1_m, filt_h0, filt_h1); \ hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT); \ @@ -113,6 +131,20 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { hz_out_m; \ }) +#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + filt0, filt1, out0, out1) \ + { \ + __m128i vec0_m, vec1_m, vec2_m, vec3_m; \ + \ + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src3, src2, mask0, vec0_m, \ + vec1_m); \ + DUP2_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, out0, out1); \ + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask1, src3, src2, mask1, vec2_m, \ + vec3_m); \ + DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec2_m, filt1, out1, vec3_m, filt1, \ + out0, out1); \ + } + #define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ filt0, filt1, out0, out1, out2, out3) \ ({ \ @@ -133,6 +165,107 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { out3); \ }) +static inline void common_hz_6t_4x4_lsx(uint8_t *RESTRICT src, + int32_t src_stride, + uint8_t *RESTRICT dst, + int32_t dst_stride, + const int8_t *filter) { + __m128i src0, src1, src2, src3, filt0, filt1, filt2; + __m128i mask0, mask1, mask2, out0, out1; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + src -= 2; + + DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); + filt2 = __lsx_vldrepl_h(filter, 4); + + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2); + src3 = __lsx_vldx(src, src_stride_x3); + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, + filt1, filt2, out0, out1); + out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT); + out0 = __lsx_vxori_b(out0, 128); + + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); +} + +static void common_hz_6t_4x8_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter) { + __m128i src0, src1, src2, src3, filt0, filt1, filt2; + __m128i mask0, mask1, mask2, out0, out1, out2, out3; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride_x2 << 1; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + src -= 2; + + DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); + filt2 = __lsx_vldrepl_h(filter, 4); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2); + src3 = __lsx_vldx(src, src_stride_x3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src += src_stride_x4; + HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, + filt1, filt2, out0, out1); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2); + src3 = __lsx_vldx(src, src_stride_x3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, 
src3, mask0, mask1, mask2, filt0, + filt1, filt2, out2, out3); + + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2, + VP8_FILTER_SHIFT, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + + __lsx_vstelm_w(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 3); +} + +static void common_hz_6t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + if (height == 4) { + common_hz_6t_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_hz_6t_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + static void common_hz_6t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, int32_t dst_stride, const int8_t *filter, int32_t height) { @@ -254,6 +387,64 @@ static void common_hz_6t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, } } +static void common_vt_6t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + __m128i src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2; + __m128i out0, out1; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + + DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); + filt2 = __lsx_vldrepl_h(filter, 4); + + DUP2_ARG2(__lsx_vldx, src, -src_stride_x2, src, -src_stride, src0, src1); + src2 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src3, src4); + src += src_stride_x3; + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + DUP2_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, src2110, + src4332); + DUP2_ARG2(__lsx_vxori_b, src2110, 128, src4332, 128, src2110, src4332); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + src5 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src6, src7); + src8 = __lsx_vldx(src, src_stride_x3); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, + src54_r, src65_r, src76_r, src87_r); + DUP2_ARG2(__lsx_vilvl_d, src65_r, src54_r, src87_r, src76_r, src6554, + src8776); + DUP2_ARG2(__lsx_vxori_b, src6554, 128, src8776, 128, src6554, src8776); + out0 = DPADD_H3(src2110, src4332, src6554, filt0, filt1, filt2); + out1 = DPADD_H3(src4332, src6554, src8776, filt0, filt1, filt2); + + out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT); + out0 = __lsx_vxori_b(out0, 128); + + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + + src2110 = src6554; + src4332 = src8776; + src4 = src8; + } +} + static void common_vt_6t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, int32_t 
dst_stride, const int8_t *filter, int32_t height) { @@ -395,6 +586,92 @@ static void common_vt_6t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, } } +static void common_hv_6ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, tmp0, tmp1; + __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + __m128i hz_out7, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + src -= 2; + + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, + filt_hz1); + filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4); + DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, + filt_vt1); + filt_vt2 = __lsx_vldrepl_h(filter_vert, 4); + + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + + DUP2_ARG2(__lsx_vldx, src, -src_stride_x2, src, -src_stride, src0, src1); + src2 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src3, src4); + src += src_stride_x3; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src4 = __lsx_vxori_b(src4, 128); + + hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out2 = HORIZ_6TAP_FILT(src2, src3, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); + hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + src5 = __lsx_vld(src, 0); + src6 = __lsx_vldx(src, src_stride); + src += src_stride_x2; + + DUP2_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src5, src6); + hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff); + + src7 = __lsx_vld(src, 0); + src8 = __lsx_vldx(src, src_stride); + src += src_stride_x2; + + DUP2_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src7, src8); + hz_out7 = HORIZ_6TAP_FILT(src7, src8, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out6 = __lsx_vshuf_b(hz_out7, hz_out5, shuff); + + out2 = __lsx_vpackev_b(hz_out5, hz_out4); + tmp0 = DPADD_H3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + + out3 = __lsx_vpackev_b(hz_out7, hz_out6); + tmp1 = DPADD_H3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); + + tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7); + tmp0 = __lsx_vxori_b(tmp0, 128); + __lsx_vstelm_w(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 3); + dst += dst_stride; + + hz_out3 = hz_out7; + out0 = out2; + out1 = out3; + } +} + static void common_hv_6ht_6vt_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, int32_t dst_stride, const int8_t *filter_horiz, @@ -506,6 +783,102 @@ static void common_hv_6ht_6vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, filter_horiz, filter_vert, height); } +static void common_hz_4t_4x4_lsx(uint8_t *RESTRICT 
src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter) { + __m128i src0, src1, src2, src3, filt0, filt1, mask0, mask1; + __m128i out0, out1; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + src -= 1; + + DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2); + src3 = __lsx_vldx(src, src_stride_x3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, + out0, out1); + + out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT); + out0 = __lsx_vxori_b(out0, 128); + + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); +} + +static void common_hz_4t_4x8_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter) { + __m128i src0, src1, src2, src3, filt0, filt1, mask0, mask1; + __m128i out0, out1, out2, out3; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + src -= 1; + + DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2); + src3 = __lsx_vldx(src, src_stride_x3); + src += src_stride_x4; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, + out0, out1); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2); + src3 = __lsx_vldx(src, src_stride_x3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, + out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2, + VP8_FILTER_SHIFT, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + + __lsx_vstelm_w(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 3); +} + +static void common_hz_4t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + if (height == 4) { + common_hz_4t_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_hz_4t_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + static void common_hz_4t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, int32_t dst_stride, const int8_t *filter, int32_t height) { @@ -597,6 +970,55 @@ static void common_hz_4t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, } } 
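+// The vertical 4-tap path below filters a 4-pixel-wide column, producing
+// four output rows per loop iteration. Adjacent source rows are byte
+// interleaved (__lsx_vilvl_b) and two such row pairs are packed into one
+// 128-bit vector (src2110 holds the pairs (0,1) and (1,2)), so a single
+// FILT_4TAP_DPADD_H call yields two output rows of four pixels each.
+// As a rough scalar sketch (illustrative only; the 4-tap window is
+// assumed to span rows r-1 .. r+2 of the source):
+//   sum       = f0*s[r-1][x] + f1*s[r][x] + f2*s[r+1][x] + f3*s[r+2][x]
+//   dst[r][x] = clip_u8((sum + 64) >> 7)  /* VP8_FILTER_SHIFT == 7 */
+// The xor with 128 biases the unsigned pixels into signed range for the
+// signed dot-product instructions and is undone again after packing.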
+static void common_vt_4t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + __m128i src0, src1, src2, src3, src4, src5; + __m128i src10_r, src32_r, src54_r, src21_r, src43_r, src65_r; + __m128i src2110, src4332, filt0, filt1, out0, out1; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + + DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); + DUP2_ARG2(__lsx_vldx, src, -src_stride, src, src_stride, src0, src2); + src1 = __lsx_vld(src, 0); + src += src_stride_x2; + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r); + + src2110 = __lsx_vilvl_d(src21_r, src10_r); + src2110 = __lsx_vxori_b(src2110, 128); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + src3 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src4, src5); + src += src_stride_x3; + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r); + src4332 = __lsx_vilvl_d(src43_r, src32_r); + src4332 = __lsx_vxori_b(src4332, 128); + out0 = FILT_4TAP_DPADD_H(src2110, src4332, filt0, filt1); + + src2 = __lsx_vld(src, 0); + src += src_stride; + DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src54_r, src65_r); + src2110 = __lsx_vilvl_d(src65_r, src54_r); + src2110 = __lsx_vxori_b(src2110, 128); + out1 = FILT_4TAP_DPADD_H(src4332, src2110, filt0, filt1); + out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT); + out0 = __lsx_vxori_b(out0, 128); + + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + } +} + static void common_vt_4t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, int32_t dst_stride, const int8_t *filter, int32_t height) { @@ -719,6 +1141,74 @@ static void common_vt_4t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, } } +static void common_hv_4ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + __m128i src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1; + __m128i mask0, mask1, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + src -= 1; + + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, + filt_hz1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, -src_stride, src, src_stride, src0, src2); + src += src_stride_x2; + + DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); + src2 = __lsx_vxori_b(src2, 128); + hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = HORIZ_4TAP_FILT(src1, src2, mask0, mask1, filt_hz0, filt_hz1); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + + DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, + filt_vt1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + src3 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src4, src5); + src6 = __lsx_vldx(src, 
src_stride_x3); + src += src_stride_x4; + + DUP2_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src3, src4); + hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = __lsx_vshuf_b(hz_out3, hz_out1, shuff); + vec1 = __lsx_vpackev_b(hz_out3, hz_out2); + tmp0 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1); + + DUP2_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src5, src6); + hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1); + hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff); + vec2 = __lsx_vpackev_b(hz_out5, hz_out4); + tmp1 = FILT_4TAP_DPADD_H(vec1, vec2, filt_vt0, filt_vt1); + + tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7); + tmp0 = __lsx_vxori_b(tmp0, 128); + __lsx_vstelm_w(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 3); + dst += dst_stride; + + hz_out1 = hz_out5; + vec0 = vec2; + } +} + static inline void common_hv_4ht_4vt_8w_lsx( uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, @@ -804,6 +1294,82 @@ static void common_hv_4ht_4vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, filter_horiz, filter_vert, height); } +static void common_hv_6ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + __m128i src0, src1, src2, src3, src4, src5, src6; + __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2; + __m128i filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + src -= 2; + + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, + filt_hz1); + filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, -src_stride, src, src_stride, src0, src2); + src += src_stride_x2; + + DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); + src2 = __lsx_vxori_b(src2, 128); + + hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out1 = HORIZ_6TAP_FILT(src1, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + + DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, + filt_vt1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + src3 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src4, src5); + src6 = __lsx_vldx(src, src_stride_x3); + src += src_stride_x4; + DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3, + src4, src5, src6); + + hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out2 = __lsx_vshuf_b(hz_out3, hz_out1, shuff); + vec1 = __lsx_vpackev_b(hz_out3, hz_out2); + tmp0 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1); + + hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff); + vec2 = __lsx_vpackev_b(hz_out5, hz_out4); + tmp1 = 
FILT_4TAP_DPADD_H(vec1, vec2, filt_vt0, filt_vt1); + + DUP2_ARG3(__lsx_vssrarni_b_h, tmp0, tmp0, 7, tmp1, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + + __lsx_vstelm_w(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(tmp1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp1, dst, 0, 1); + dst += dst_stride; + + hz_out1 = hz_out5; + vec0 = vec2; + } +} + static inline void common_hv_6ht_4vt_8w_lsx( uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, @@ -895,6 +1461,82 @@ static void common_hv_6ht_4vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, filter_horiz, filter_vert, height); } +static void common_hv_4ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i filt_hz0, filt_hz1, filt_vt0, filt_vt1, filt_vt2, mask0, mask1; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + __m128i hz_out7, tmp0, tmp1, out0, out1, out2, out3; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + + src -= 1; + + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, + filt_hz1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + DUP4_ARG2(__lsx_vldx, src, -src_stride_x2, src, -src_stride, src, src_stride, + src, src_stride_x2, src0, src1, src3, src4); + src2 = __lsx_vld(src, 0); + src += src_stride_x3; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src4 = __lsx_vxori_b(src4, 128); + hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = HORIZ_4TAP_FILT(src2, src3, mask0, mask1, filt_hz0, filt_hz1); + hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1); + + DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, + filt_vt1); + filt_vt2 = __lsx_vldrepl_h(filter_vert, 4); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + src5 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src6, src7); + src8 = __lsx_vldx(src, src_stride_x3); + DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5, + src6, src7, src8); + src += src_stride_x4; + + hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1); + hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff); + out2 = __lsx_vpackev_b(hz_out5, hz_out4); + tmp0 = DPADD_H3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + + hz_out7 = HORIZ_4TAP_FILT(src7, src8, mask0, mask1, filt_hz0, filt_hz1); + hz_out6 = __lsx_vshuf_b(hz_out7, hz_out5, shuff); + out3 = __lsx_vpackev_b(hz_out7, hz_out6); + tmp1 = DPADD_H3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); + + tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7); + tmp0 = __lsx_vxori_b(tmp0, 128); + __lsx_vstelm_w(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 3); + 
dst += dst_stride; + + hz_out3 = hz_out7; + out0 = out2; + out1 = out3; + } +} + static inline void common_hv_4ht_6vt_8w_lsx( uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, @@ -1000,6 +1642,103 @@ typedef void (*PVp8SixtapPredictFunc2)(uint8_t *RESTRICT src, int32_t dst_stride, const int8_t *filter, int32_t height); +void vp8_sixtap_predict4x4_lsx(uint8_t *RESTRICT src, int32_t src_stride, + int32_t xoffset, int32_t yoffset, + uint8_t *RESTRICT dst, int32_t dst_stride) { + const int8_t *h_filter = vp8_subpel_filters_lsx[xoffset - 1]; + const int8_t *v_filter = vp8_subpel_filters_lsx[yoffset - 1]; + + static PVp8SixtapPredictFunc1 Predict4x4Funcs1[4] = { + common_hv_6ht_6vt_4w_lsx, + common_hv_6ht_4vt_4w_lsx, + common_hv_4ht_6vt_4w_lsx, + common_hv_4ht_4vt_4w_lsx, + }; + + static PVp8SixtapPredictFunc2 Predict4x4Funcs2[4] = { common_vt_6t_4w_lsx, + common_vt_4t_4w_lsx, + common_hz_6t_4w_lsx, + common_hz_4t_4w_lsx }; + if (yoffset < 8 && xoffset < 8) { + if (yoffset) { + if (xoffset) { + switch (xoffset & 1) { + case 0: + switch (yoffset & 1) { + case 0: + Predict4x4Funcs1[0](src, src_stride, dst, dst_stride, h_filter, + v_filter, 4); + break; + case 1: + Predict4x4Funcs1[1](src, src_stride, dst, dst_stride, h_filter, + v_filter + 1, 4); + break; + } + break; + + case 1: + switch (yoffset & 1) { + case 0: + Predict4x4Funcs1[2](src, src_stride, dst, dst_stride, + h_filter + 1, v_filter, 4); + break; + + case 1: + Predict4x4Funcs1[3](src, src_stride, dst, dst_stride, + h_filter + 1, v_filter + 1, 4); + break; + } + break; + } + } else { + switch (yoffset & 1) { + case 0: + Predict4x4Funcs2[0](src, src_stride, dst, dst_stride, v_filter, 4); + break; + + case 1: + Predict4x4Funcs2[1](src, src_stride, dst, dst_stride, v_filter + 1, + 4); + break; + } + } + } else { + switch (xoffset) { + case 0: { + __m128i tp0; + tp0 = __lsx_vinsgr2vr_w(tp0, src, 0); + src += src_stride; + tp0 = __lsx_vinsgr2vr_w(tp0, src, 0); + src += src_stride; + tp0 = __lsx_vinsgr2vr_w(tp0, src, 0); + src += src_stride; + tp0 = __lsx_vinsgr2vr_w(tp0, src, 0); + + __lsx_vstelm_w(tp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(tp0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(tp0, dst, 0, 3); + break; + } + case 2: + case 4: + case 6: + Predict4x4Funcs2[2](src, src_stride, dst, dst_stride, h_filter, 4); + break; + } + switch (xoffset & 1) { + case 1: + Predict4x4Funcs2[3](src, src_stride, dst, dst_stride, h_filter + 1, + 4); + break; + } + } + } +} + void vp8_sixtap_predict8x8_lsx(uint8_t *RESTRICT src, int32_t src_stride, int32_t xoffset, int32_t yoffset, uint8_t *RESTRICT dst, int32_t dst_stride) { diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 7bc866faaa..739a612847 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -155,7 +155,7 @@ () specialize qw/vp8_sixtap_predict8x4 sse2 ssse3 neon dspr2 msa mmi/; add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; -specialize qw/vp8_sixtap_predict4x4 mmx ssse3 neon dspr2 msa mmi/; +specialize qw/vp8_sixtap_predict4x4 mmx ssse3 neon dspr2 msa mmi lsx/; add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; specialize qw/vp8_bilinear_predict16x16 sse2 ssse3 neon msa/; From 
17959f9c94be43de57972052ebfdc40870170b0e Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Fri, 6 May 2022 12:17:39 +0800 Subject: [PATCH 300/926] vp9[loongarch]: Optimize vpx_quantize_b/b_32x32 1. vpx_quantize_b_lsx 2. vpx_quantize_b_32x32_lsx Bug: webm:1755 Change-Id: I476c8677a2c2aed7248e088e62c3777c9bed2adb --- test/vp9_quantize_test.cc | 10 ++ vpx_dsp/loongarch/quantize_lsx.c | 249 +++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 4 files changed, 262 insertions(+), 2 deletions(-) create mode 100644 vpx_dsp/loongarch/quantize_lsx.c diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index d54f1bc9cd..ca1062a76f 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -568,6 +568,16 @@ INSTANTIATE_TEST_SUITE_P( VPX_BITS_8, 32, true))); #endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P(LSX, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_lsx, + &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_lsx, + &vpx_quantize_b_32x32_c, + VPX_BITS_8, 32, false))); +#endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH + // Only useful to compare "Speed" test results. INSTANTIATE_TEST_SUITE_P( DISABLED_C, VP9QuantizeTest, diff --git a/vpx_dsp/loongarch/quantize_lsx.c b/vpx_dsp/loongarch/quantize_lsx.c new file mode 100644 index 0000000000..e3fbb9e9e0 --- /dev/null +++ b/vpx_dsp/loongarch/quantize_lsx.c @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +#define CALCULATE_QCOEFF(coeff, coeff_abs, round, quant, shift, cmp_mask) \ + ({ \ + __m128i rounded, qcoeff; \ + \ + rounded = __lsx_vsadd_h(coeff_abs, round); \ + qcoeff = __lsx_vmuh_h(rounded, quant); \ + qcoeff = __lsx_vadd_h(rounded, qcoeff); \ + qcoeff = __lsx_vmuh_h(qcoeff, shift); \ + qcoeff = __lsx_vsigncov_h(coeff, qcoeff); \ + qcoeff = __lsx_vand_v(qcoeff, cmp_mask); \ + \ + qcoeff; \ + }) + +#define CALCULATE_DQCOEFF_AND_STORE(qcoeff, dequant, dqcoeff) \ + { \ + __m128i dqcoeff16 = __lsx_vmul_h(qcoeff, dequant); \ + __lsx_vst(dqcoeff16, dqcoeff, 0); \ + } + +#define CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff, dequant, dqcoeff) \ + { \ + __m128i low, high, dqcoeff32_0, dqcoeff32_1, res; \ + __m128i zero = __lsx_vldi(0); \ + __m128i coeff = __lsx_vabsd_h(qcoeff, zero); \ + \ + __m128i sign_0 = __lsx_vilvl_h(qcoeff, zero); \ + __m128i sign_1 = __lsx_vilvh_h(qcoeff, zero); \ + \ + low = __lsx_vmul_h(coeff, dequant); \ + high = __lsx_vmuh_h(coeff, dequant); \ + dqcoeff32_0 = __lsx_vilvl_h(high, low); \ + dqcoeff32_1 = __lsx_vilvh_h(high, low); \ + \ + dqcoeff32_0 = __lsx_vsrai_w(dqcoeff32_0, 1); \ + dqcoeff32_1 = __lsx_vsrai_w(dqcoeff32_1, 1); \ + dqcoeff32_0 = __lsx_vsigncov_w(sign_0, dqcoeff32_0); \ + dqcoeff32_1 = __lsx_vsigncov_w(sign_1, dqcoeff32_1); \ + res = __lsx_vpickev_h(dqcoeff32_1, dqcoeff32_0); \ + __lsx_vst(res, dqcoeff, 0); \ + } + +#define SCAN_FOR_EOB(coeff0, coeff1, zbin_mask0, zbin_mask1, scan, index, \ + zero) \ + ({ \ + __m128i zero_coeff0 = __lsx_vseq_h(coeff0, zero); \ + __m128i zero_coeff1 = __lsx_vseq_h(coeff1, zero); \ + __m128i scan0 = __lsx_vld(scan + index, 0); \ + __m128i scan1 = __lsx_vld(scan + index + 8, 0); \ + __m128i eob0, eob1, eob_max; \ + \ + scan0 = __lsx_vsub_h(scan0, zbin_mask0); \ + scan1 = __lsx_vsub_h(scan1, zbin_mask1); \ + eob0 = __lsx_vandn_v(zero_coeff0, scan0); \ + eob1 = __lsx_vandn_v(zero_coeff1, scan1); \ + eob_max = __lsx_vmax_h(eob0, eob1); \ + eob_max; \ + }) + +#define ACCUMULATE_EOB(eob) \ + ({ \ + __m128i eob_shuffled; \ + int16_t res_m; \ + \ + eob_shuffled = __lsx_vshuf4i_w(eob, 0xe); \ + eob = __lsx_vmax_h(eob, eob_shuffled); \ + eob_shuffled = __lsx_vshuf4i_h(eob, 0xe); \ + eob = __lsx_vmax_h(eob, eob_shuffled); \ + eob_shuffled = __lsx_vshuf4i_h(eob, 0x1); \ + eob = __lsx_vmax_h(eob, eob_shuffled); \ + res_m = __lsx_vpickve2gr_h(eob, 1); \ + res_m; \ + }) + +#if !CONFIG_VP9_HIGHBITDEPTH +void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, + int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + __m128i zero = __lsx_vldi(0); + int index = 16; + + __m128i zbin, round, quant, dequant, quant_shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i eob, eob0; + + (void)scan; + + zbin = __lsx_vld(zbin_ptr, 0); + round = __lsx_vld(round_ptr, 0); + quant = __lsx_vld(quant_ptr, 0); + dequant = __lsx_vld(dequant_ptr, 0); + quant_shift = __lsx_vld(quant_shift_ptr, 0); + // Handle one DC and first 15 AC. 
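+  // CALCULATE_QCOEFF mirrors the per-lane math of the scalar
+  // vpx_quantize_b_c():
+  //   tmp    = sat16(|coeff| + round)
+  //   qcoeff = sign(coeff) * ((((tmp * quant) >> 16) + tmp) * shift >> 16)
+  // where __lsx_vmuh_h supplies the ">> 16" high halves of the 16x16-bit
+  // products and the zbin mask zeroes lanes with |coeff| < zbin. Each
+  // constant vector carries the DC constant in lane 0 and the AC constant
+  // in lanes 1-7, so __lsx_vilvh_d(v, v) below replicates the all-AC upper
+  // half once the first 16 coefficients have been handled.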
+ DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1); + qcoeff0 = __lsx_vabsd_h(coeff0, zero); + qcoeff1 = __lsx_vabsd_h(coeff1, zero); + + cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0); + zbin = __lsx_vilvh_d(zbin, zbin); + cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); + + qcoeff0 = + CALCULATE_QCOEFF(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + round = __lsx_vilvh_d(round, round); + quant = __lsx_vilvh_d(quant, quant); + quant_shift = __lsx_vilvh_d(quant_shift, quant_shift); + qcoeff1 = + CALCULATE_QCOEFF(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + + __lsx_vst(qcoeff0, qcoeff_ptr, 0); + __lsx_vst(qcoeff1, qcoeff_ptr, 16); + + CALCULATE_DQCOEFF_AND_STORE(qcoeff0, dequant, dqcoeff_ptr); + dequant = __lsx_vilvh_d(dequant, dequant); + CALCULATE_DQCOEFF_AND_STORE(qcoeff1, dequant, dqcoeff_ptr + 8); + + eob = SCAN_FOR_EOB(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + // AC only loop. + while (index < n_coeffs) { + coeff0 = __lsx_vld(coeff_ptr + index, 0); + coeff1 = __lsx_vld(coeff_ptr + index + 8, 0); + + qcoeff0 = __lsx_vabsd_h(coeff0, zero); + qcoeff1 = __lsx_vabsd_h(coeff1, zero); + + cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0); + cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); + + qcoeff0 = + CALCULATE_QCOEFF(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + qcoeff1 = + CALCULATE_QCOEFF(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + + __lsx_vst(qcoeff0, qcoeff_ptr + index, 0); + __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0); + + CALCULATE_DQCOEFF_AND_STORE(qcoeff0, dequant, dqcoeff_ptr + index); + CALCULATE_DQCOEFF_AND_STORE(qcoeff1, dequant, dqcoeff_ptr + index + 8); + + eob0 = SCAN_FOR_EOB(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index, + zero); + eob = __lsx_vmax_h(eob, eob0); + + index += 16; + } + + *eob_ptr = ACCUMULATE_EOB(eob); +} + +void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + __m128i zero = __lsx_vldi(0); + int index; + + __m128i zbin, round, quant, dequant, quant_shift; + __m128i coeff0, coeff1, qcoeff0, qcoeff1, cmp_mask0, cmp_mask1; + __m128i eob = zero, eob0; + + (void)scan; + (void)n_coeffs; + + zbin = __lsx_vld(zbin_ptr, 0); + zbin = __lsx_vsrari_h(zbin, 1); + round = __lsx_vld(round_ptr, 0); + round = __lsx_vsrari_h(round, 1); + + quant = __lsx_vld(quant_ptr, 0); + dequant = __lsx_vld(dequant_ptr, 0); + quant_shift = __lsx_vld(quant_shift_ptr, 0); + quant_shift = __lsx_vslli_h(quant_shift, 1); + // Handle one DC and first 15 AC. 
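+  // The 32x32 quantizer works at half precision relative to the code
+  // above: zbin and round were pre-halved with rounding
+  // (__lsx_vsrari_h(x, 1)) and quant_shift doubled (__lsx_vslli_h(x, 1)),
+  // turning the final ">> 16" high-half multiply into the ">> 15" used by
+  // the scalar vpx_quantize_b_32x32_c(). CALCULATE_DQCOEFF_AND_STORE_32x32
+  // likewise widens |qcoeff| * dequant to 32 bits, halves it, and restores
+  // the sign, matching the scalar "dqcoeff = (qcoeff * dequant) / 2".
+  // n_coeffs is unused because a 32x32 block always has 32 * 32 = 1024
+  // coefficients, which is hard-coded in the loop bound below.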
+ DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1); + qcoeff0 = __lsx_vabsd_h(coeff0, zero); + qcoeff1 = __lsx_vabsd_h(coeff1, zero); + + cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0); + // remove DC from zbin + zbin = __lsx_vilvh_d(zbin, zbin); + cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); + + qcoeff0 = + CALCULATE_QCOEFF(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + // remove DC in quant_shift, quant, quant_shift + round = __lsx_vilvh_d(round, round); + quant = __lsx_vilvh_d(quant, quant); + quant_shift = __lsx_vilvh_d(quant_shift, quant_shift); + qcoeff1 = + CALCULATE_QCOEFF(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + __lsx_vst(qcoeff0, qcoeff_ptr, 0); + __lsx_vst(qcoeff1, qcoeff_ptr, 16); + + CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff0, dequant, dqcoeff_ptr); + dequant = __lsx_vilvh_d(dequant, dequant); + CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff1, dequant, dqcoeff_ptr + 8); + eob = SCAN_FOR_EOB(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + // AC only loop. + for (index = 16; index < 32 * 32; index += 16) { + coeff0 = __lsx_vld(coeff_ptr + index, 0); + coeff1 = __lsx_vld(coeff_ptr + index + 8, 0); + + qcoeff0 = __lsx_vabsd_h(coeff0, zero); + qcoeff1 = __lsx_vabsd_h(coeff1, zero); + + cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0); + cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); + + qcoeff0 = + CALCULATE_QCOEFF(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + qcoeff1 = + CALCULATE_QCOEFF(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + __lsx_vst(qcoeff0, qcoeff_ptr + index, 0); + __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0); + + CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff0, dequant, dqcoeff_ptr + index); + CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff1, dequant, + dqcoeff_ptr + 8 + index); + eob0 = SCAN_FOR_EOB(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index, + zero); + eob = __lsx_vmax_h(eob, eob0); + } + + *eob_ptr = ACCUMULATE_EOB(eob); +} +#endif // !CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 4f5a7a1908..13999af04d 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -328,6 +328,7 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.h DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c DSP_SRCS-$(HAVE_VSX) += ppc/quantize_vsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/quantize_lsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c endif diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index e82b487f13..d3c668f9ae 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -711,10 +711,10 @@ () # if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_quantize_b neon sse2 ssse3 avx vsx/; + specialize qw/vpx_quantize_b neon sse2 ssse3 avx vsx lsx/; add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_quantize_b_32x32 neon ssse3 avx vsx/; 
+ specialize qw/vpx_quantize_b_32x32 neon ssse3 avx vsx lsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; From 63378a94f996304e2784ecd6584e70cf487991e9 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Mon, 9 May 2022 14:39:05 +0800 Subject: [PATCH 301/926] loongarch: Reduce the number of instructions Replace some redundant instructions to improve the efficiency of the program. 1. txfm_macros_lsx.h 2. vpx_convolve8_avg_lsx.c 3. vpx_convolve8_horiz_lsx.c 4. vpx_convolve8_lsx.c 5. vpx_convolve8_vert_lsx.c 6. vpx_convolve_copy_lsx.c 7. vpx_convolve_lsx.h Bug: webm:1755 Change-Id: I9b7fdf6900338a26f9b1775609ad387648684f3d --- vpx_dsp/loongarch/sad_lsx.c | 2 +- vpx_dsp/loongarch/txfm_macros_lsx.h | 53 ++--- vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c | 49 ++-- vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c | 161 ++++++------- vpx_dsp/loongarch/vpx_convolve8_lsx.c | 110 ++++----- vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c | 243 ++++++++------------ vpx_dsp/loongarch/vpx_convolve_copy_lsx.c | 1 - vpx_dsp/loongarch/vpx_convolve_lsx.h | 25 +- 8 files changed, 271 insertions(+), 373 deletions(-) diff --git a/vpx_dsp/loongarch/sad_lsx.c b/vpx_dsp/loongarch/sad_lsx.c index 46ee557df5..5eaebfb518 100644 --- a/vpx_dsp/loongarch/sad_lsx.c +++ b/vpx_dsp/loongarch/sad_lsx.c @@ -198,7 +198,7 @@ static void sad_8width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride, int32_t ref_stride, int32_t height, uint32_t *sad_array) { int32_t ht_cnt = (height >> 2); - uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; __m128i src0, src1, src2, src3, sad_tmp; __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; __m128i ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15; diff --git a/vpx_dsp/loongarch/txfm_macros_lsx.h b/vpx_dsp/loongarch/txfm_macros_lsx.h index 977f1c2dd0..bd514831bf 100644 --- a/vpx_dsp/loongarch/txfm_macros_lsx.h +++ b/vpx_dsp/loongarch/txfm_macros_lsx.h @@ -13,36 +13,29 @@ #include "vpx_util/loongson_intrinsics.h" -#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \ - { \ - __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m; \ - __m128i k0_m, k1_m, k2_m, k3_m; \ - __m128i zero = __lsx_vldi(0); \ - \ - k0_m = __lsx_vreplgr2vr_h(cnst0); \ - k1_m = __lsx_vreplgr2vr_h(cnst1); \ - k2_m = __lsx_vpackev_h(k1_m, k0_m); \ - k0_m = __lsx_vpackev_h(zero, k0_m); \ - k1_m = __lsx_vpackev_h(k1_m, zero); \ - \ - s5_m = __lsx_vilvl_h(reg1, reg0); \ - s4_m = __lsx_vilvh_h(reg1, reg0); \ - s3_m = __lsx_vilvl_h(reg0, reg1); \ - s2_m = __lsx_vilvh_h(reg0, reg1); \ - \ - s1_m = __lsx_vdp2_w_h(s5_m, k0_m); \ - s0_m = __lsx_vdp2_w_h(s4_m, k0_m); \ - k3_m = __lsx_vdp2_w_h(s5_m, k1_m); \ - s1_m = __lsx_vsub_w(s1_m, k3_m); \ - k3_m = __lsx_vdp2_w_h(s4_m, k1_m); \ - s0_m = __lsx_vsub_w(s0_m, k3_m); \ - \ - out0 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \ - \ - s1_m = __lsx_vdp2_w_h(s3_m, k2_m); \ - s0_m = __lsx_vdp2_w_h(s2_m, k2_m); \ - out1 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \ - } +#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \ + do { \ + __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m; \ + __m128i k0_m, k1_m, k2_m, k3_m; \ + \ + k0_m = __lsx_vreplgr2vr_h(cnst0); \ + k1_m = 
__lsx_vreplgr2vr_h(cnst1); \ + k2_m = __lsx_vpackev_h(k1_m, k0_m); \ + \ + DUP2_ARG2(__lsx_vilvl_h, reg1, reg0, reg0, reg1, s5_m, s3_m); \ + DUP2_ARG2(__lsx_vilvh_h, reg1, reg0, reg0, reg1, s4_m, s2_m); \ + \ + DUP2_ARG2(__lsx_vmulwev_w_h, s5_m, k0_m, s4_m, k0_m, s1_m, s0_m); \ + k3_m = __lsx_vmulwod_w_h(s5_m, k1_m); \ + s1_m = __lsx_vsub_w(s1_m, k3_m); \ + k3_m = __lsx_vmulwod_w_h(s4_m, k1_m); \ + s0_m = __lsx_vsub_w(s0_m, k3_m); \ + \ + out0 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \ + \ + DUP2_ARG2(__lsx_vdp2_w_h, s3_m, k2_m, s2_m, k2_m, s1_m, s0_m); \ + out1 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \ + } while (0) #define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2, in3) \ do { \ diff --git a/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c index 2b983552b6..54fcd6c571 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c @@ -278,7 +278,7 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_lsx( __m128i src0, src1, src2, src3, src4, mask; __m128i filt_hz, filt_vt, vec0, vec1; __m128i dst0, dst1, dst2, dst3; - __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, out; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1; __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; int32_t src_stride2 = src_stride << 1; @@ -311,13 +311,12 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_lsx( dst1 = __lsx_vilvl_w(dst3, dst2); dst0 = __lsx_vilvl_d(dst1, dst0); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - out = __lsx_vpickev_b(tmp1, tmp0); - out = __lsx_vavgr_bu(out, dst0); - __lsx_vstelm_w(out, dst, 0, 0); - __lsx_vstelm_w(out, dst + dst_stride, 0, 1); - __lsx_vstelm_w(out, dst + dst_stride2, 0, 2); - __lsx_vstelm_w(out, dst + dst_stride3, 0, 3); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vstelm_w(tmp0, dst, 0, 0); + __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2); + __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3); } static void common_hv_2ht_2vt_and_aver_dst_4x8_lsx( @@ -386,9 +385,8 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_lsx( hz_out4, hz_out7, hz_out6, vec0, vec1, vec2, vec3); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3, filt_vt, tmp0, tmp1, tmp2, tmp3); - DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, - FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); - DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, res0, res1); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, res0, res1); DUP2_ARG2(__lsx_vavgr_bu, res0, dst0, res1, dst1, res0, res1); __lsx_vstelm_w(res0, dst, 0, 0); @@ -467,10 +465,9 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_lsx( hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); vec3 = __lsx_vpackev_b(hz_out0, hz_out1); tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt); - - DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, - FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); - PCKEV_AVG_ST4_D(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp1); + AVG_ST4_D(tmp0, tmp1, dst0, dst1, dst, dst_stride); } static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( @@ -513,8 +510,6 @@ static void 
common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( vec0 = __lsx_vpackev_b(hz_out0, hz_out1); tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); @@ -522,8 +517,8 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); vec0 = __lsx_vpackev_b(hz_out0, hz_out1); tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); - - DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp1); dst0 = __lsx_vldrepl_d(dst_tmp, 0); dst_tmp += dst_stride; @@ -534,7 +529,7 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( dst3 = __lsx_vldrepl_d(dst_tmp, 0); dst_tmp += dst_stride; DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); - PCKEV_AVG_ST4_D(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); + AVG_ST4_D(tmp0, tmp1, dst0, dst1, dst, dst_stride); dst += dst_stride; } } @@ -597,8 +592,7 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp3 = __lsx_vpickev_b(tmp1, tmp0); + tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); tmp3 = __lsx_vavgr_bu(tmp3, dst0); __lsx_vst(tmp3, dst, 0); @@ -606,8 +600,7 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp3 = __lsx_vpickev_b(tmp1, tmp0); + tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); tmp3 = __lsx_vavgr_bu(tmp3, dst1); __lsx_vstx(tmp3, dst, dst_stride); @@ -615,8 +608,7 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp3 = __lsx_vpickev_b(tmp1, tmp0); + tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); tmp3 = __lsx_vavgr_bu(tmp3, dst2); __lsx_vstx(tmp3, dst, dst_stride2); @@ -624,8 +616,7 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp3 = __lsx_vpickev_b(tmp1, tmp0); + tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); tmp3 = __lsx_vavgr_bu(tmp3, dst3); __lsx_vstx(tmp3, dst, dst_stride3); dst += dst_stride4; @@ -642,8 +633,6 @@ static void common_hv_2ht_2vt_and_aver_dst_32w_lsx( common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height); - src += 16; - dst += 16; } static void 
common_hv_2ht_2vt_and_aver_dst_64w_lsx( diff --git a/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c index 5d67d65274..2c6459a978 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c @@ -338,8 +338,7 @@ static void common_hz_2t_4x4_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { __m128i src0, src1, src2, src3, mask; - __m128i filt0, vec0, vec1, res0, res1; - __m128i vec2, vec3; + __m128i filt0, vec0, vec1, vec2, vec3, res0, res1; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride + src_stride2; @@ -355,8 +354,8 @@ static void common_hz_2t_4x4_lsx(const uint8_t *src, int32_t src_stride, src3 = __lsx_vldx(src, src_stride3); DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, vec3); - DUP2_ARG2(__lsx_vsrari_h, vec2, FILTER_BITS, vec3, FILTER_BITS, vec2, vec3); - DUP2_ARG2(__lsx_vpickev_b, vec2, vec2, vec3, vec3, res0, res1); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec2, vec2, FILTER_BITS, vec3, vec3, + FILTER_BITS, res0, res1); __lsx_vstelm_w(res0, dst, 0, 0); __lsx_vstelm_w(res0, dst + dst_stride, 0, 1); @@ -367,10 +366,9 @@ static void common_hz_2t_4x4_lsx(const uint8_t *src, int32_t src_stride, static void common_hz_2t_4x8_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { - __m128i vec0, vec1, vec2, vec3, filt0; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; - __m128i res0, res1, res2, res3; - __m128i vec4, vec5, vec6, vec7; + __m128i res0, res1, res2, res3, filt0; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride + src_stride2; int32_t src_stride4 = src_stride2 << 1; @@ -396,10 +394,10 @@ static void common_hz_2t_4x8_lsx(const uint8_t *src, int32_t src_stride, src7, src6, mask, vec0, vec1, vec2, vec3); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, vec4, vec5, vec6, vec7); - DUP4_ARG2(__lsx_vsrari_h, vec4, FILTER_BITS, vec5, FILTER_BITS, vec6, - FILTER_BITS, vec7, FILTER_BITS, vec4, vec5, vec6, vec7); - DUP4_ARG2(__lsx_vpickev_b, vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, - res0, res1, res2, res3); + DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5, + FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, res0, + res1, res2, res3); + __lsx_vstelm_w(res0, dst, 0, 0); dst += dst_stride; __lsx_vstelm_w(res0, dst, 0, 1); @@ -451,14 +449,13 @@ static void common_hz_2t_8x4_lsx(const uint8_t *src, int32_t src_stride, src3, src3, mask, vec0, vec1, vec2, vec3); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, vec0, vec1, vec2, vec3); - DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2, - FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3); - DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, src0, src1); - - __lsx_vstelm_d(src0, dst, 0, 0); - __lsx_vstelm_d(src0, dst + dst_stride, 0, 1); - __lsx_vstelm_d(src1, dst + dst_stride2, 0, 0); - __lsx_vstelm_d(src1, dst + dst_stride3, 0, 1); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec1); + + __lsx_vstelm_d(vec0, dst, 0, 0); + __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(vec1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(vec1, dst + dst_stride3, 0, 1); } static void common_hz_2t_8x8mult_lsx(const uint8_t 
*src, int32_t src_stride, @@ -490,15 +487,9 @@ static void common_hz_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, src3, src3, mask, vec0, vec1, vec2, vec3); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, vec0, vec1, vec2, vec3); - DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2, - FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3); - - src0 = __lsx_vld(src, 0); - DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); - src3 = __lsx_vldx(src, src_stride3); - src += src_stride4; + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, out0, out1); - DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, out0, out1); __lsx_vstelm_d(out0, dst, 0, 0); dst += dst_stride; __lsx_vstelm_d(out0, dst, 0, 1); @@ -508,13 +499,17 @@ static void common_hz_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, __lsx_vstelm_d(out1, dst, 0, 1); dst += dst_stride; + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask, src3, src3, mask, vec0, vec1, vec2, vec3); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, vec0, vec1, vec2, vec3); - DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2, - FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3); - DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, out0, out1); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, out0, out1); __lsx_vstelm_d(out0, dst, 0, 0); dst += dst_stride; @@ -537,27 +532,25 @@ static void common_hz_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, mask, src3, src3, mask, vec0, vec1, vec2, vec3); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, vec0, vec1, vec2, vec3); - DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2, - FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3); - - src0 = __lsx_vld(src, 0); - DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); - src3 = __lsx_vldx(src, src_stride3); - src += src_stride4; + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, out0, out1); - DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, out0, out1); __lsx_vstelm_d(out0, dst, 0, 0); __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1); + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask, src3, src3, mask, vec0, vec1, vec2, vec3); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, vec0, vec1, vec2, vec3); - DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2, - FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3); - DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, out0, out1); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, out0, out1); __lsx_vstelm_d(out0, dst_tmp1, 0, 0); __lsx_vstelm_d(out0, dst_tmp1 + dst_stride, 0, 1); @@ -582,7 +575,7 @@ static void common_hz_2t_16w_lsx(const uint8_t *src, int32_t src_stride, uint32_t loop_cnt = (height >> 2) - 1; __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; 
__m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - __m128i out0, out1, out2, out3, out4, out5, out6, out7, tmp; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride2 + src_stride; @@ -609,22 +602,17 @@ static void common_hz_2t_16w_lsx(const uint8_t *src, int32_t src_stride, out0, out1, out2, out3); DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0, out4, out5, out6, out7); - DUP4_ARG2(__lsx_vsrari_h, out0, FILTER_BITS, out1, FILTER_BITS, out2, - FILTER_BITS, out3, FILTER_BITS, out0, out1, out2, out3); - DUP4_ARG2(__lsx_vsrari_h, out4, FILTER_BITS, out5, FILTER_BITS, out6, - FILTER_BITS, out7, FILTER_BITS, out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, out0, + out1, out2, out3); - tmp = __lsx_vpickev_b(out1, out0); - __lsx_vst(tmp, dst, 0); + __lsx_vst(out0, dst, 0); dst += dst_stride; - tmp = __lsx_vpickev_b(out3, out2); - __lsx_vst(tmp, dst, 0); + __lsx_vst(out1, dst, 0); dst += dst_stride; - tmp = __lsx_vpickev_b(out5, out4); - __lsx_vst(tmp, dst, 0); + __lsx_vst(out2, dst, 0); dst += dst_stride; - tmp = __lsx_vpickev_b(out7, out6); - __lsx_vst(tmp, dst, 0); + __lsx_vst(out3, dst, 0); dst += dst_stride; for (; loop_cnt--;) { @@ -648,22 +636,17 @@ static void common_hz_2t_16w_lsx(const uint8_t *src, int32_t src_stride, filt0, out0, out1, out2, out3); DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0, out4, out5, out6, out7); - DUP4_ARG2(__lsx_vsrari_h, out0, FILTER_BITS, out1, FILTER_BITS, out2, - FILTER_BITS, out3, FILTER_BITS, out0, out1, out2, out3); - DUP4_ARG2(__lsx_vsrari_h, out4, FILTER_BITS, out5, FILTER_BITS, out6, - FILTER_BITS, out7, FILTER_BITS, out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + out0, out1, out2, out3); - tmp = __lsx_vpickev_b(out1, out0); - __lsx_vst(tmp, dst, 0); + __lsx_vst(out0, dst, 0); dst += dst_stride; - tmp = __lsx_vpickev_b(out3, out2); - __lsx_vst(tmp, dst, 0); + __lsx_vst(out1, dst, 0); dst += dst_stride; - tmp = __lsx_vpickev_b(out5, out4); - __lsx_vst(tmp, dst, 0); + __lsx_vst(out2, dst, 0); dst += dst_stride; - tmp = __lsx_vpickev_b(out7, out6); - __lsx_vst(tmp, dst, 0); + __lsx_vst(out3, dst, 0); dst += dst_stride; } } @@ -674,7 +657,7 @@ static void common_hz_2t_32w_lsx(const uint8_t *src, int32_t src_stride, uint32_t loop_cnt = (height >> 1); __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - __m128i out0, out1, out2, out3, out4, out5, out6, out7, tmp; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; mask = __lsx_vld(mc_filt_mask_arr, 0); @@ -699,21 +682,16 @@ static void common_hz_2t_32w_lsx(const uint8_t *src, int32_t src_stride, filt0, out0, out1, out2, out3); DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0, out4, out5, out6, out7); - DUP4_ARG2(__lsx_vsrari_h, out0, FILTER_BITS, out1, FILTER_BITS, out2, - FILTER_BITS, out3, FILTER_BITS, out0, out1, out2, out3); - DUP4_ARG2(__lsx_vsrari_h, out4, FILTER_BITS, out5, FILTER_BITS, out6, - FILTER_BITS, out7, FILTER_BITS, out4, out5, out6, out7); - - tmp = __lsx_vpickev_b(out1, out0); - __lsx_vst(tmp, dst, 0); - tmp = __lsx_vpickev_b(out3, out2); - 
__lsx_vst(tmp, dst, 16); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + out0, out1, out2, out3); + + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); dst += dst_stride; - tmp = __lsx_vpickev_b(out5, out4); - __lsx_vst(tmp, dst, 0); - tmp = __lsx_vpickev_b(out7, out6); - __lsx_vst(tmp, dst, 16); + __lsx_vst(out2, dst, 0); + __lsx_vst(out3, dst, 16); dst += dst_stride; } } @@ -724,7 +702,7 @@ static void common_hz_2t_64w_lsx(const uint8_t *src, int32_t src_stride, uint32_t loop_cnt = height; __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - __m128i out0, out1, out2, out3, out4, out5, out6, out7, tmp; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; mask = __lsx_vld(mc_filt_mask_arr, 0); @@ -749,19 +727,14 @@ static void common_hz_2t_64w_lsx(const uint8_t *src, int32_t src_stride, filt0, out0, out1, out2, out3); DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0, out4, out5, out6, out7); - DUP4_ARG2(__lsx_vsrari_h, out0, FILTER_BITS, out1, FILTER_BITS, out2, - FILTER_BITS, out3, FILTER_BITS, out0, out1, out2, out3); - DUP4_ARG2(__lsx_vsrari_h, out4, FILTER_BITS, out5, FILTER_BITS, out6, - FILTER_BITS, out7, FILTER_BITS, out4, out5, out6, out7); - - tmp = __lsx_vpickev_b(out1, out0); - __lsx_vst(tmp, dst, 0); - tmp = __lsx_vpickev_b(out3, out2); - __lsx_vst(tmp, dst, 16); - tmp = __lsx_vpickev_b(out5, out4); - __lsx_vst(tmp, dst, 32); - tmp = __lsx_vpickev_b(out7, out6); - __lsx_vst(tmp, dst, 48); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + out0, out1, out2, out3); + + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); + __lsx_vst(out2, dst, 32); + __lsx_vst(out3, dst, 48); dst += dst_stride; } } diff --git a/vpx_dsp/loongarch/vpx_convolve8_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_lsx.c index 894c137203..73583abb98 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_lsx.c @@ -248,7 +248,7 @@ static void common_hv_2ht_2vt_4x4_lsx(const uint8_t *src, int32_t src_stride, int8_t *filter_horiz, int8_t *filter_vert) { __m128i src0, src1, src2, src3, src4, mask; - __m128i filt_vt, filt_hz, vec0, vec1, res0, res1; + __m128i filt_vt, filt_hz, vec0, vec1; __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1; __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; @@ -276,13 +276,13 @@ static void common_hv_2ht_2vt_4x4_lsx(const uint8_t *src, int32_t src_stride, DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - DUP2_ARG2(__lsx_vpickev_b, tmp0, tmp0, tmp1, tmp1, res0, res1); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp0, tmp0, FILTER_BITS, tmp1, tmp1, + FILTER_BITS, tmp0, tmp1); - __lsx_vstelm_w(res0, dst, 0, 0); - __lsx_vstelm_w(res0, dst + dst_stride, 0, 1); - __lsx_vstelm_w(res1, dst + dst_stride2, 0, 0); - __lsx_vstelm_w(res1, dst + dst_stride3, 0, 1); + __lsx_vstelm_w(tmp0, dst, 0, 0); + __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(tmp1, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(tmp1, dst + dst_stride3, 0, 1); } static void common_hv_2ht_2vt_4x8_lsx(const uint8_t *src, int32_t src_stride, @@ -290,7 +290,6 @@ 
static void common_hv_2ht_2vt_4x8_lsx(const uint8_t *src, int32_t src_stride, int8_t *filter_horiz, int8_t *filter_vert) { __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; - __m128i res0, res1, res2, res3; __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3; __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; __m128i hz_out7, hz_out8, vec4, vec5, vec6, vec7; @@ -331,20 +330,19 @@ static void common_hv_2ht_2vt_4x8_lsx(const uint8_t *src, int32_t src_stride, hz_out4, hz_out7, hz_out6, vec0, vec1, vec2, vec3); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3, filt_vt, vec4, vec5, vec6, vec7); - DUP4_ARG2(__lsx_vsrari_h, vec4, FILTER_BITS, vec5, FILTER_BITS, vec6, - FILTER_BITS, vec7, FILTER_BITS, vec4, vec5, vec6, vec7); - DUP4_ARG2(__lsx_vpickev_b, vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, - res0, res1, res2, res3); - - __lsx_vstelm_w(res0, dst, 0, 0); - __lsx_vstelm_w(res0, dst + dst_stride, 0, 1); - __lsx_vstelm_w(res1, dst + dst_stride2, 0, 0); - __lsx_vstelm_w(res1, dst + dst_stride3, 0, 1); + DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5, + FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, vec4, + vec5, vec6, vec7); + + __lsx_vstelm_w(vec4, dst, 0, 0); + __lsx_vstelm_w(vec4, dst + dst_stride, 0, 1); + __lsx_vstelm_w(vec5, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(vec5, dst + dst_stride3, 0, 1); dst += dst_stride4; - __lsx_vstelm_w(res2, dst, 0, 0); - __lsx_vstelm_w(res2, dst + dst_stride, 0, 1); - __lsx_vstelm_w(res3, dst + dst_stride2, 0, 0); - __lsx_vstelm_w(res3, dst + dst_stride3, 0, 1); + __lsx_vstelm_w(vec6, dst, 0, 0); + __lsx_vstelm_w(vec6, dst + dst_stride, 0, 1); + __lsx_vstelm_w(vec7, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(vec7, dst + dst_stride3, 0, 1); } static void common_hv_2ht_2vt_4w_lsx(const uint8_t *src, int32_t src_stride, @@ -364,7 +362,7 @@ static void common_hv_2ht_2vt_8x4_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert) { - __m128i src0, src1, src2, src3, src4, mask, out0, out1; + __m128i src0, src1, src2, src3, src4, mask; __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3; __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; @@ -401,14 +399,13 @@ static void common_hv_2ht_2vt_8x4_lsx(const uint8_t *src, int32_t src_stride, vec3 = __lsx_vpackev_b(hz_out0, hz_out1); tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt); - DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, - FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); - DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, out0, out1); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp1); - __lsx_vstelm_d(out0, dst, 0, 0); - __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); - __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); - __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1); + __lsx_vstelm_d(tmp0, dst, 0, 0); + __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(tmp1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(tmp1, dst + dst_stride3, 0, 1); } static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src, @@ -417,9 +414,9 @@ static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { uint32_t loop_cnt = (height >> 3); - __m128i src0, src1, src2, src3, src4, mask, out0, out1; + __m128i src0, src1, src2, src3, src4, mask; __m128i filt_hz, filt_vt, vec0; - __m128i hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 
tmp8; + __m128i hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride2 + src_stride; @@ -449,8 +446,6 @@ static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src, vec0 = __lsx_vpackev_b(hz_out0, hz_out1); tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); - DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2); - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); @@ -463,43 +458,44 @@ static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src, vec0 = __lsx_vpackev_b(hz_out0, hz_out1); tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt); - DUP2_ARG2(__lsx_vsrari_h, tmp3, FILTER_BITS, tmp4, FILTER_BITS, tmp3, tmp4); - DUP2_ARG2(__lsx_vpickev_b, tmp2, tmp1, tmp4, tmp3, out0, out1); - __lsx_vstelm_d(out0, dst, 0, 0); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp2, tmp1, FILTER_BITS, tmp4, tmp3, + FILTER_BITS, tmp1, tmp2); + + __lsx_vstelm_d(tmp1, dst, 0, 0); dst += dst_stride; - __lsx_vstelm_d(out0, dst, 0, 1); + __lsx_vstelm_d(tmp1, dst, 0, 1); dst += dst_stride; - __lsx_vstelm_d(out1, dst, 0, 0); + __lsx_vstelm_d(tmp2, dst, 0, 0); dst += dst_stride; - __lsx_vstelm_d(out1, dst, 0, 1); + __lsx_vstelm_d(tmp2, dst, 0, 1); dst += dst_stride; hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); - tmp5 = __lsx_vdp2_h_bu(vec0, filt_vt); + tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); vec0 = __lsx_vpackev_b(hz_out0, hz_out1); - tmp6 = __lsx_vdp2_h_bu(vec0, filt_vt); + tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); - tmp7 = __lsx_vdp2_h_bu(vec0, filt_vt); + tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); vec0 = __lsx_vpackev_b(hz_out0, hz_out1); - tmp8 = __lsx_vdp2_h_bu(vec0, filt_vt); + tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt); - DUP4_ARG2(__lsx_vsrari_h, tmp5, FILTER_BITS, tmp6, FILTER_BITS, tmp7, - FILTER_BITS, tmp8, FILTER_BITS, tmp5, tmp6, tmp7, tmp8); - DUP2_ARG2(__lsx_vpickev_b, tmp6, tmp5, tmp8, tmp7, out0, out1); - __lsx_vstelm_d(out0, dst, 0, 0); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp2, tmp1, FILTER_BITS, tmp4, tmp3, + FILTER_BITS, tmp1, tmp2); + + __lsx_vstelm_d(tmp1, dst, 0, 0); dst += dst_stride; - __lsx_vstelm_d(out0, dst, 0, 1); + __lsx_vstelm_d(tmp1, dst, 0, 1); dst += dst_stride; - __lsx_vstelm_d(out1, dst, 0, 0); + __lsx_vstelm_d(tmp2, dst, 0, 0); dst += dst_stride; - __lsx_vstelm_d(out1, dst, 0, 1); + __lsx_vstelm_d(tmp2, dst, 0, 1); dst += dst_stride; } } @@ -554,8 +550,7 @@ static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride, hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); - DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2); - tmp = __lsx_vpickev_b(tmp2, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); __lsx_vst(tmp, dst, 0); dst += dst_stride; @@ -563,8 +558,7 @@ static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride, hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, 
vec1, filt_vt, tmp1, tmp2); - DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2); - tmp = __lsx_vpickev_b(tmp2, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); __lsx_vst(tmp, dst, 0); dst += dst_stride; @@ -572,8 +566,7 @@ static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride, hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); - DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2); - tmp = __lsx_vpickev_b(tmp2, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); __lsx_vst(tmp, dst, 0); dst += dst_stride; @@ -581,8 +574,7 @@ static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride, hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); - DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2); - tmp = __lsx_vpickev_b(tmp2, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); __lsx_vst(tmp, dst, 0); dst += dst_stride; } @@ -599,8 +591,6 @@ static void common_hv_2ht_2vt_32w_lsx(const uint8_t *src, int32_t src_stride, common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height); - src += 16; - dst += 16; } static void common_hv_2ht_2vt_64w_lsx(const uint8_t *src, int32_t src_stride, diff --git a/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c index c0bb10f3b7..7e3a95b2fd 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c @@ -361,13 +361,12 @@ static void common_vt_2t_4x4_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { __m128i src0, src1, src2, src3, src4; - __m128i src10_l, src32_l, src21_l, src43_l, src2110, src4332; + __m128i vec0, vec1, vec2, vec3, vec4, vec5; __m128i filt0, tmp0, tmp1; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride2 + src_stride; int32_t src_stride4 = src_stride2 << 1; - int32_t dst_stride2 = dst_stride << 1; int32_t dst_stride3 = dst_stride2 + dst_stride; @@ -378,37 +377,33 @@ static void common_vt_2t_4x4_lsx(const uint8_t *src, int32_t src_stride, src, src_stride4, src1, src2, src3, src4); src += (src_stride4 + src_stride); - DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, - src10_l, src21_l, src32_l, src43_l); - DUP2_ARG2(__lsx_vilvl_d, src21_l, src10_l, src43_l, src32_l, src2110, - src4332); - DUP2_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - src2110 = __lsx_vpickev_b(tmp1, tmp0); - - __lsx_vstelm_w(src2110, dst, 0, 0); - __lsx_vstelm_w(src2110, dst + dst_stride, 0, 1); - __lsx_vstelm_w(src2110, dst + dst_stride2, 0, 2); - __lsx_vstelm_w(src2110, dst + dst_stride3, 0, 3); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0, + vec1, vec2, vec3); + DUP2_ARG2(__lsx_vilvl_d, vec1, vec0, vec3, vec2, vec4, vec5); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + __lsx_vstelm_w(tmp0, dst, 0, 0); + __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2); + 
__lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3); } static void common_vt_2t_4x8_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; - __m128i src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; - __m128i src65_l, src87_l, src2110, src4332, src6554, src8776; + __m128i vec0, vec1, vec2, vec3, vec4, vec5; + __m128i vec6, vec7, vec8, vec9, vec10, vec11; __m128i tmp0, tmp1, tmp2, tmp3; __m128i filt0; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride2 + src_stride; int32_t src_stride4 = src_stride2 << 1; - int32_t dst_stride2 = dst_stride << 1; int32_t dst_stride3 = dst_stride2 + dst_stride; int32_t dst_stride4 = dst_stride2 << 1; - uint8_t *dst_tmp1 = dst + dst_stride4; filt0 = __lsx_vldrepl_h(filter, 0); @@ -420,27 +415,27 @@ static void common_vt_2t_4x8_lsx(const uint8_t *src, int32_t src_stride, src, src_stride4, src5, src6, src7, src8); src += (src_stride4 + src_stride); - DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, - src10_l, src21_l, src32_l, src43_l); - DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, - src54_l, src65_l, src76_l, src87_l); - DUP4_ARG2(__lsx_vilvl_d, src21_l, src10_l, src43_l, src32_l, src65_l, src54_l, - src87_l, src76_l, src2110, src4332, src6554, src8776); - DUP4_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, src6554, filt0, - src8776, filt0, tmp0, tmp1, tmp2, tmp3); - DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, - FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); - DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, src2110, src4332); - - __lsx_vstelm_w(src2110, dst, 0, 0); - __lsx_vstelm_w(src2110, dst + dst_stride, 0, 1); - __lsx_vstelm_w(src2110, dst + dst_stride2, 0, 2); - __lsx_vstelm_w(src2110, dst + dst_stride3, 0, 3); - - __lsx_vstelm_w(src4332, dst_tmp1, 0, 0); - __lsx_vstelm_w(src4332, dst_tmp1 + dst_stride, 0, 1); - __lsx_vstelm_w(src4332, dst_tmp1 + dst_stride2, 0, 2); - __lsx_vstelm_w(src4332, dst_tmp1 + dst_stride3, 0, 3); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0, + vec1, vec2, vec3); + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, vec4, + vec5, vec6, vec7); + DUP4_ARG2(__lsx_vilvl_d, vec1, vec0, vec3, vec2, vec5, vec4, vec7, vec6, vec8, + vec9, vec10, vec11); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec8, filt0, vec9, filt0, vec10, filt0, vec11, + filt0, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp1); + + __lsx_vstelm_w(tmp0, dst, 0, 0); + __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2); + __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3); + + __lsx_vstelm_w(tmp1, dst_tmp1, 0, 0); + __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride, 0, 1); + __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride2, 0, 2); + __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride3, 0, 3); } static void common_vt_2t_4w_lsx(const uint8_t *src, int32_t src_stride, @@ -457,17 +452,14 @@ static void common_vt_2t_8x4_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { __m128i src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0; - __m128i out0, out1; - __m128i tmp0, tmp1, tmp2, tmp3; + __m128i out0, out1, tmp0, tmp1, tmp2, tmp3; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride2 + src_stride; int32_t src_stride4 = src_stride2 << 1; - int32_t dst_stride2 
= dst_stride << 1; int32_t dst_stride3 = dst_stride2 + dst_stride; - /* rearranging filter_y */ filt0 = __lsx_vldrepl_h(filter, 0); src0 = __lsx_vld(src, 0); @@ -478,9 +470,8 @@ static void common_vt_2t_8x4_lsx(const uint8_t *src, int32_t src_stride, vec1, vec2, vec3); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, tmp0, tmp1, tmp2, tmp3); - DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, - FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); - DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, out0, out1); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, out0, out1); __lsx_vstelm_d(out0, dst, 0, 0); __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); @@ -494,13 +485,11 @@ static void common_vt_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, uint32_t loop_cnt = (height >> 3); __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; - __m128i out0, out1; - __m128i tmp0, tmp1, tmp2, tmp3; + __m128i out0, out1, tmp0, tmp1, tmp2, tmp3; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride2 + src_stride; int32_t src_stride4 = src_stride2 << 1; - int32_t dst_stride2 = dst_stride << 1; int32_t dst_stride3 = dst_stride2 + dst_stride; int32_t dst_stride4 = dst_stride2 << 1; @@ -525,9 +514,9 @@ static void common_vt_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, vec4, vec5, vec6, vec7); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, tmp0, tmp1, tmp2, tmp3); - DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, - FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); - DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, out0, out1); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); @@ -536,9 +525,9 @@ static void common_vt_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0, tmp0, tmp1, tmp2, tmp3); - DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, - FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); - DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, out0, out1); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); @@ -559,29 +548,17 @@ static void common_vt_2t_8w_lsx(const uint8_t *src, int32_t src_stride, } } -static void common_vt_2t_16w_lsx(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int y0_q4, - int y_step_q4, int w, int height) { +static void common_vt_2t_16w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { uint32_t loop_cnt = (height >> 2); - __m128i src0, src1, src2, src3, src4; + __m128i src0, src1, src2, src3, src4, tmp, tmp0, tmp1; __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride2 + src_stride; int32_t src_stride4 = src_stride2 << 1; - const int16_t *const filter_y = filter[y0_q4]; - int8_t cnt, filt_ver[8]; - - assert(y_step_q4 == 16); - 
assert(((const int32_t *)filter_y)[1] != 0x800000); - - for (cnt = 8; cnt--;) { - filt_ver[cnt] = filter_y[cnt]; - } - - filt0 = __lsx_vldrepl_h(&filt_ver[3], 0); + filt0 = __lsx_vldrepl_h(filter, 0); src0 = __lsx_vld(src, 0); src += src_stride; @@ -595,29 +572,25 @@ static void common_vt_2t_16w_lsx(const uint8_t *src, ptrdiff_t src_stride, DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp4 = __lsx_vpickev_b(tmp1, tmp0); - __lsx_vst(tmp4, dst, 0); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 0); dst += dst_stride; DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); - DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3); - DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); - tmp4 = __lsx_vpickev_b(tmp3, tmp2); - __lsx_vst(tmp4, dst, 0); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 0); dst += dst_stride; DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp4 = __lsx_vpickev_b(tmp1, tmp0); - __lsx_vst(tmp4, dst, 0); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 0); dst += dst_stride; - DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp2, tmp3); - DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); - tmp4 = __lsx_vpickev_b(tmp3, tmp2); - __lsx_vst(tmp4, dst, 0); + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 0); dst += dst_stride; src0 = src4; @@ -630,20 +603,18 @@ static void common_vt_2t_32w_lsx(const uint8_t *src, int32_t src_stride, uint32_t loop_cnt = (height >> 2); __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4; + __m128i tmp, tmp0, tmp1; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride2 + src_stride; int32_t src_stride4 = src_stride2 << 1; - int32_t dst_stride2 = dst_stride << 1; int32_t dst_stride3 = dst_stride2 + dst_stride; - uint8_t *src_tmp; + filt0 = __lsx_vldrepl_h(filter, 0); - src0 = __lsx_vld(src, 0); - src5 = __lsx_vld(src, 16); + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src5); src += src_stride; src_tmp = src + 16; @@ -658,53 +629,45 @@ static void common_vt_2t_32w_lsx(const uint8_t *src, int32_t src_stride, src_tmp += src_stride4; DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp4 = __lsx_vpickev_b(tmp1, tmp0); - __lsx_vst(tmp4, dst, 0); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 0); - DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3); - DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); - tmp4 = __lsx_vpickev_b(tmp3, tmp2); - __lsx_vstx(tmp4, dst, dst_stride); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vstx(tmp, dst, dst_stride); DUP2_ARG2(__lsx_vilvl_b, src3, 
src2, src4, src3, vec4, vec6); DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp4 = __lsx_vpickev_b(tmp1, tmp0); - __lsx_vstx(tmp4, dst, dst_stride2); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vstx(tmp, dst, dst_stride2); - DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp2, tmp3); - DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); - tmp4 = __lsx_vpickev_b(tmp3, tmp2); - __lsx_vstx(tmp4, dst, dst_stride3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vstx(tmp, dst, dst_stride3); DUP2_ARG2(__lsx_vilvl_b, src6, src5, src7, src6, vec0, vec2); DUP2_ARG2(__lsx_vilvh_b, src6, src5, src7, src6, vec1, vec3); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp4 = __lsx_vpickev_b(tmp1, tmp0); - __lsx_vst(tmp4, dst, 16); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 16); - DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3); - DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); dst += dst_stride; - tmp4 = __lsx_vpickev_b(tmp3, tmp2); - __lsx_vst(tmp4, dst, 16); + __lsx_vst(tmp, dst, 16); DUP2_ARG2(__lsx_vilvl_b, src8, src7, src9, src8, vec4, vec6); DUP2_ARG2(__lsx_vilvh_b, src8, src7, src9, src8, vec5, vec7); DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); dst += dst_stride; - tmp4 = __lsx_vpickev_b(tmp1, tmp0); - __lsx_vst(tmp4, dst, 16); + __lsx_vst(tmp, dst, 16); - DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp2, tmp3); - DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); dst += dst_stride; - tmp4 = __lsx_vpickev_b(tmp3, tmp2); - __lsx_vst(tmp4, dst, 16); + __lsx_vst(tmp, dst, 16); dst += dst_stride; @@ -719,7 +682,7 @@ static void common_vt_2t_64w_lsx(const uint8_t *src, int32_t src_stride, uint32_t loop_cnt = (height >> 1); __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; __m128i src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; - __m128i tmp, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i tmp, tmp0, tmp1; int32_t src_stride2 = src_stride << 1; int32_t dst_stride2 = dst_stride << 1; @@ -743,49 +706,41 @@ static void common_vt_2t_64w_lsx(const uint8_t *src, int32_t src_stride, DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp = __lsx_vpickev_b(tmp1, tmp0); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); __lsx_vst(tmp, dst, 0); - DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3); - DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); - tmp = __lsx_vpickev_b(tmp3, tmp2); + DUP2_ARG2(__lsx_vdp2_h_bu, 
vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); __lsx_vst(tmp, dst_tmp1, 0); DUP2_ARG2(__lsx_vilvl_b, src4, src3, src5, src4, vec4, vec6); DUP2_ARG2(__lsx_vilvh_b, src4, src3, src5, src4, vec5, vec7); - DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp4, tmp5); - DUP2_ARG2(__lsx_vsrari_h, tmp4, FILTER_BITS, tmp5, FILTER_BITS, tmp4, tmp5); - tmp = __lsx_vpickev_b(tmp5, tmp4); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); __lsx_vst(tmp, dst, 16); - DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp6, tmp7); - DUP2_ARG2(__lsx_vsrari_h, tmp6, FILTER_BITS, tmp7, FILTER_BITS, tmp6, tmp7); - tmp = __lsx_vpickev_b(tmp7, tmp6); + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); __lsx_vst(tmp, dst_tmp1, 16); DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, vec0, vec2); DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, vec1, vec3); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp = __lsx_vpickev_b(tmp1, tmp0); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); __lsx_vst(tmp, dst, 32); - DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3); - DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); - tmp = __lsx_vpickev_b(tmp3, tmp2); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); __lsx_vst(tmp, dst_tmp1, 32); DUP2_ARG2(__lsx_vilvl_b, src10, src9, src11, src10, vec4, vec6); DUP2_ARG2(__lsx_vilvh_b, src10, src9, src11, src10, vec5, vec7); - DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp4, tmp5); - DUP2_ARG2(__lsx_vsrari_h, tmp4, FILTER_BITS, tmp5, FILTER_BITS, tmp4, tmp5); - tmp = __lsx_vpickev_b(tmp5, tmp4); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); __lsx_vst(tmp, dst, 48); - DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp6, tmp7); - DUP2_ARG2(__lsx_vsrari_h, tmp6, FILTER_BITS, tmp7, FILTER_BITS, tmp6, tmp7); - tmp = __lsx_vpickev_b(tmp7, tmp6); + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); __lsx_vst(tmp, dst_tmp1, 48); dst += dst_stride2; dst_tmp1 += dst_stride2; @@ -823,8 +778,8 @@ void vpx_convolve8_vert_lsx(const uint8_t *src, ptrdiff_t src_stride, &filt_ver[3], h); break; case 16: - common_vt_2t_16w_lsx(src, src_stride, dst, dst_stride, filter, y0_q4, - y_step_q4, w, h); + common_vt_2t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); break; case 32: common_vt_2t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, diff --git a/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c b/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c index 398788a43e..53dc7097ed 100644 --- a/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c @@ -15,7 +15,6 @@ static void copy_width8_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height) { int32_t cnt; - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; __m128i src0, src1, src2, src3, src4, src5, src6, src7; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride2 + src_stride; diff --git a/vpx_dsp/loongarch/vpx_convolve_lsx.h b/vpx_dsp/loongarch/vpx_convolve_lsx.h index 
d319bc4f7d..2428407f2b 100644
--- a/vpx_dsp/loongarch/vpx_convolve_lsx.h
+++ b/vpx_dsp/loongarch/vpx_convolve_lsx.h
@@ -125,19 +125,18 @@
 tmp1_m; \
 })
 
-#define PCKEV_AVG_ST4_D(in0, in1, in2, in3, dst0, dst1, pdst, stride) \
-  { \
-    __m128i tmp0_m, tmp1_m; \
- \
-    DUP2_ARG2(__lsx_vpickev_b, in1, in0, in3, in2, tmp0_m, tmp1_m); \
-    DUP2_ARG2(__lsx_vavgr_bu, tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \
-    __lsx_vstelm_d(tmp0_m, pdst, 0, 0); \
-    pdst += stride; \
-    __lsx_vstelm_d(tmp0_m, pdst, 0, 1); \
-    pdst += stride; \
-    __lsx_vstelm_d(tmp1_m, pdst, 0, 0); \
-    pdst += stride; \
-    __lsx_vstelm_d(tmp1_m, pdst, 0, 1); \
+#define AVG_ST4_D(in0, in1, dst0, dst1, pdst, stride) \
+  { \
+    __m128i tmp0_m, tmp1_m; \
+ \
+    DUP2_ARG2(__lsx_vavgr_bu, in0, dst0, in1, dst1, tmp0_m, tmp1_m); \
+    __lsx_vstelm_d(tmp0_m, pdst, 0, 0); \
+    pdst += stride; \
+    __lsx_vstelm_d(tmp0_m, pdst, 0, 1); \
+    pdst += stride; \
+    __lsx_vstelm_d(tmp1_m, pdst, 0, 0); \
+    pdst += stride; \
+    __lsx_vstelm_d(tmp1_m, pdst, 0, 1); \
   }
 
 #endif  // VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_

From f92c451e6c03685e28217f2080cc52a994938664 Mon Sep 17 00:00:00 2001
From: yuanhecai
Date: Tue, 17 May 2022 19:06:04 +0800
Subject: [PATCH 302/926] loongarch: Modify the representation of macros

Some multi-statement macros have been changed to the
"#define ... do { ... } while (0)" form; the remaining value-returning
macros are converted to "static INLINE ..." functions.

Bug: webm:1755
Change-Id: I445ac0c543f12df38f086b479394b111058367d0
---
 vp8/common/loongarch/idct_lsx.c | 39 +--
 vp8/common/loongarch/loopfilter_filters_lsx.c | 16 +-
 vp8/common/loongarch/sixtap_filter_lsx.c | 322 +++++++++---------
 vpx_dsp/loongarch/bitdepth_conversion_lsx.h | 43 +--
 vpx_dsp/loongarch/fwd_dct32x32_lsx.c | 4 +-
 vpx_dsp/loongarch/fwd_txfm_lsx.c | 4 +-
 vpx_dsp/loongarch/fwd_txfm_lsx.h | 40 +--
 vpx_dsp/loongarch/idct32x32_lsx.c | 4 +-
 vpx_dsp/loongarch/loopfilter_16_lsx.c | 8 +-
 vpx_dsp/loongarch/loopfilter_lsx.h | 20 +-
 vpx_dsp/loongarch/quantize_lsx.c | 192 ++++++-----
 vpx_dsp/loongarch/sad_lsx.c | 231 +++++++------
 vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c | 102 +++---
 .../loongarch/vpx_convolve8_avg_vert_lsx.c | 28 +-
 vpx_dsp/loongarch/vpx_convolve8_lsx.c | 110 +++---
 vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c | 44 +--
 vpx_dsp/loongarch/vpx_convolve_lsx.h | 100 +++---
 17 files changed, 654 insertions(+), 653 deletions(-)
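The two conversions described above can be seen in a minimal standalone
C sketch; it is not taken from the patch, SWAP_INT and add3 are
hypothetical names, and plain "static inline" stands in for the INLINE
portability macro used in the tree:

#include <stdio.h>

/* A multi-statement macro wrapped in do { ... } while (0) expands to a
 * single statement, so an unbraced "if (cond) MACRO(); else ..." still
 * parses correctly; a bare { ... } block plus the trailing ';' would
 * terminate the if and orphan the else. */
#define SWAP_INT(a, b) \
  do {                 \
    int tmp_m = (a);   \
    (a) = (b);         \
    (b) = tmp_m;       \
  } while (0)

/* A value-returning macro written as a GNU statement expression,
 * e.g. #define ADD3(a, b, c) ({ ... }), is a compiler extension; the
 * same body as a static inline function gets type checking, real
 * scoping, and easier debugging at no runtime cost. */
static inline int add3(int in0, int in1, int in2) {
  int out0_m = in0 + in1;
  return out0_m + in2;
}

int main(void) {
  int x = 1, y = 2;
  if (x < y)
    SWAP_INT(x, y); /* safe: the macro expands to one statement */
  else
    x = y;
  printf("%d %d %d\n", x, y, add3(x, y, 4)); /* prints: 2 1 7 */
  return 0;
}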
diff --git a/vp8/common/loongarch/idct_lsx.c b/vp8/common/loongarch/idct_lsx.c
index 679019ff63..eee871eec4 100644
--- a/vp8/common/loongarch/idct_lsx.c
+++ b/vp8/common/loongarch/idct_lsx.c
@@ -16,47 +16,44 @@ static const int32_t cospi8sqrt2minus1 = 20091;
 static const int32_t sinpi8sqrt2 = 35468;
 
 #define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
-  { \
+  do { \
     __m128i tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
 \
     DUP2_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, tmp0_m, tmp1_m); \
     DUP2_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, tmp2_m, tmp3_m); \
     DUP2_ARG2(__lsx_vilvl_w, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \
     DUP2_ARG2(__lsx_vilvh_w, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \
-  }
+  } while (0)
 
 #define TRANSPOSE_TWO_4x4_H(in0, in1, in2, in3, out0, out1, out2, out3) \
-  { \
+  do { \
     __m128i s4_m, s5_m, s6_m, s7_m; \
 \
     TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, s4_m, s5_m, s6_m, s7_m); \
     DUP2_ARG2(__lsx_vilvl_d, s6_m, s4_m, s7_m, s5_m, out0, out2); \
     out1 = __lsx_vilvh_d(s6_m, s4_m); \
     out3 = __lsx_vilvh_d(s7_m, s5_m); \
-  }
+  } while (0)
 
-#define EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in) \
-  ({ \
-    __m128i out_m; \
+#define EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in0, in1) \
+  do { \
     __m128i zero_m = __lsx_vldi(0); \
     __m128i tmp1_m, tmp2_m; \
     __m128i sinpi8_sqrt2_m = __lsx_vreplgr2vr_w(sinpi8sqrt2); \
 \
-    tmp1_m = __lsx_vilvl_h(in, zero_m); \
-    tmp2_m = __lsx_vilvh_h(in, zero_m); \
+    tmp1_m = __lsx_vilvl_h(in0, zero_m); \
+    tmp2_m = __lsx_vilvh_h(in0, zero_m); \
     tmp1_m = __lsx_vsrai_w(tmp1_m, 16); \
     tmp2_m = __lsx_vsrai_w(tmp2_m, 16); \
     tmp1_m = __lsx_vmul_w(tmp1_m, sinpi8_sqrt2_m); \
     tmp1_m = __lsx_vsrai_w(tmp1_m, 16); \
     tmp2_m = __lsx_vmul_w(tmp2_m, sinpi8_sqrt2_m); \
     tmp2_m = __lsx_vsrai_w(tmp2_m, 16); \
-    out_m = __lsx_vpickev_h(tmp2_m, tmp1_m); \
-    \
-    out_m; \
-  })
+    in1 = __lsx_vpickev_h(tmp2_m, tmp1_m); \
+  } while (0)
 
 #define VP8_IDCT_1D_H(in0, in1, in2, in3, out0, out1, out2, out3) \
-  { \
+  do { \
     __m128i a1_m, b1_m, c1_m, d1_m; \
     __m128i c_tmp1_m, c_tmp2_m; \
     __m128i d_tmp1_m, d_tmp2_m; \
@@ -65,7 +62,7 @@ static const int32_t sinpi8sqrt2 = 35468;
     const_cospi8sqrt2minus1_m = __lsx_vreplgr2vr_h(cospi8sqrt2minus1); \
     a1_m = __lsx_vadd_h(in0, in2); \
     b1_m = __lsx_vsub_h(in0, in2); \
-    c_tmp1_m = EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in1); \
+    EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in1, c_tmp1_m); \
 \
     c_tmp2_m = __lsx_vmuh_h(in3, const_cospi8sqrt2minus1_m); \
     c_tmp2_m = __lsx_vslli_h(c_tmp2_m, 1); \
@@ -77,13 +74,13 @@ static const int32_t sinpi8sqrt2 = 35468;
     d_tmp1_m = __lsx_vslli_h(d_tmp1_m, 1); \
     d_tmp1_m = __lsx_vsrai_h(d_tmp1_m, 1); \
     d_tmp1_m = __lsx_vadd_h(in1, d_tmp1_m); \
-    d_tmp2_m = EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in3); \
+    EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in3, d_tmp2_m); \
     d1_m = __lsx_vadd_h(d_tmp1_m, d_tmp2_m); \
     LSX_BUTTERFLY_4_H(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \
-  }
+  } while (0)
 
 #define VP8_IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) \
-  { \
+  do { \
     __m128i a1_m, b1_m, c1_m, d1_m; \
     __m128i c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m; \
     __m128i const_cospi8sqrt2minus1_m, sinpi8_sqrt2_m; \
@@ -105,13 +102,13 @@ static const int32_t sinpi8sqrt2 = 35468;
     d_tmp2_m = __lsx_vsrai_w(d_tmp2_m, 16); \
     d1_m = __lsx_vadd_w(d_tmp1_m, d_tmp2_m); \
     LSX_BUTTERFLY_4_W(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \
-  }
+  } while (0)
 
 #define UNPCK_SH_SW(in, out0, out1) \
-  { \
+  do { \
     out0 = __lsx_vsllwil_w_h(in, 0); \
     out1 = __lsx_vexth_w_h(in); \
-  }
+  } while (0)
 
 static void idct4x4_addconst_lsx(int16_t in_dc, uint8_t *pred,
                                  int32_t pred_stride, uint8_t *dest,
diff --git a/vp8/common/loongarch/loopfilter_filters_lsx.c b/vp8/common/loongarch/loopfilter_filters_lsx.c
index a3ac76d258..f743ec0c50 100644
--- a/vp8/common/loongarch/loopfilter_filters_lsx.c
+++ b/vp8/common/loongarch/loopfilter_filters_lsx.c
@@ -14,7 +14,7 @@
 #include "vpx_util/loongson_intrinsics.h"
 
 #define VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev) \
-  { \
+  do { \
     __m128i p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \
     const __m128i cnst4b = __lsx_vldi(4); \
     const __m128i cnst3b = __lsx_vldi(3); \
@@ -46,10 +46,10 @@
     q1 = __lsx_vxori_b(q1_m, 0x80); \
     p1_m = __lsx_vsadd_b(p1_m, filt); \
     p1 = __lsx_vxori_b(p1_m, 0x80); \
-  }
+  } while (0)
 
 #define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \
-  { \
+  do { \
     __m128i p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \
     __m128i u, filt, t1, t2, filt_sign, q0_sub_p0; \
     __m128i filt_r, filt_l; \
@@ -113,12 +113,12 @@
     p0_m = __lsx_vsadd_b(p0_m, u); \
     q0 = __lsx_vxori_b(q0_m, 0x80); \
     p0 = __lsx_vxori_b(p0_m, 0x80); \
-  }
+  } while (0)
 
 #define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
                      limit_in, b_limit_in, thresh_in, hev_out, mask_out, \
                      flat_out) \
-  { \
+  do { \
    __m128i p3_asub_p2_m,
p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ __m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ \ @@ -143,13 +143,13 @@ mask_out = __lsx_vmax_bu(q2_asub_q1_m, mask_out); \ mask_out = __lsx_vslt_bu(limit_in, mask_out); \ mask_out = __lsx_vxori_b(mask_out, 0xff); \ - } + } while (0) #define VP8_ST6x1_B(in0, in0_idx, in1, in1_idx, pdst, stride) \ - { \ + do { \ __lsx_vstelm_w(in0, pdst, 0, in0_idx); \ __lsx_vstelm_h(in1, pdst + stride, 0, in1_idx); \ - } + } while (0) static void loop_filter_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch, const uint8_t *b_limit0_ptr, diff --git a/vp8/common/loongarch/sixtap_filter_lsx.c b/vp8/common/loongarch/sixtap_filter_lsx.c index a23ed16d2b..cd7ba54746 100644 --- a/vp8/common/loongarch/sixtap_filter_lsx.c +++ b/vp8/common/loongarch/sixtap_filter_lsx.c @@ -33,37 +33,61 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 }; -#define DPADD_H3(in0, in1, in2, coeff0, coeff1, coeff2) \ - ({ \ - __m128i out0_m; \ - \ - out0_m = __lsx_vdp2_h_b(in0, coeff0); \ - out0_m = __lsx_vdp2add_h_b(out0_m, in1, coeff1); \ - out0_m = __lsx_vdp2add_h_b(out0_m, in2, coeff2); \ - \ - out0_m; \ - }) - -#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_h0, filt_h1, \ - filt_h2) \ - ({ \ - __m128i vec0_m, vec1_m, vec2_m; \ - __m128i hz_out_m; \ - \ - DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m, \ - vec1_m); \ - vec2_m = __lsx_vshuf_b(src1, src0, mask2); \ - hz_out_m = DPADD_H3(vec0_m, vec1_m, vec2_m, filt_h0, filt_h1, filt_h2); \ - \ - hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT); \ - hz_out_m = __lsx_vsat_h(hz_out_m, 7); \ - \ - hz_out_m; \ - }) +static INLINE __m128i dpadd_h3(__m128i in0, __m128i in1, __m128i in2, + __m128i coeff0, __m128i coeff1, __m128i coeff2) { + __m128i out0_m; + + out0_m = __lsx_vdp2_h_b(in0, coeff0); + out0_m = __lsx_vdp2add_h_b(out0_m, in1, coeff1); + out0_m = __lsx_vdp2add_h_b(out0_m, in2, coeff2); + + return out0_m; +} + +static INLINE __m128i horiz_6tap_filt(__m128i src0, __m128i src1, __m128i mask0, + __m128i mask1, __m128i mask2, + __m128i filt_h0, __m128i filt_h1, + __m128i filt_h2) { + __m128i vec0_m, vec1_m, vec2_m; + __m128i hz_out_m; + + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m, + vec1_m); + vec2_m = __lsx_vshuf_b(src1, src0, mask2); + hz_out_m = dpadd_h3(vec0_m, vec1_m, vec2_m, filt_h0, filt_h1, filt_h2); + hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT); + hz_out_m = __lsx_vsat_h(hz_out_m, 7); + + return hz_out_m; +} + +static INLINE __m128i filt_4tap_dpadd_h(__m128i vec0, __m128i vec1, + __m128i filt0, __m128i filt1) { + __m128i tmp_m; + + tmp_m = __lsx_vdp2_h_b(vec0, filt0); + tmp_m = __lsx_vdp2add_h_b(tmp_m, vec1, filt1); + + return tmp_m; +} + +static INLINE __m128i horiz_4tap_filt(__m128i src0, __m128i src1, __m128i mask0, + __m128i mask1, __m128i filt_h0, + __m128i filt_h1) { + __m128i vec0_m, vec1_m, hz_out_m; + + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m, + vec1_m); + hz_out_m = filt_4tap_dpadd_h(vec0_m, vec1_m, filt_h0, filt_h1); + hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT); + hz_out_m = __lsx_vsat_h(hz_out_m, 7); + + return hz_out_m; +} #define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ mask2, filt0, filt1, filt2, out0, out1) \ - { \ + do { \ __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m; \ \ DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src3, src2, mask0, vec0_m, \ @@ -77,12 +101,12 @@ static const 
uint8_t vp8_mc_filt_mask_arr[16 * 3] = { vec5_m); \ DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2, \ out0, out1); \ - } + } while (0) #define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ mask2, filt0, filt1, filt2, out0, out1, \ out2, out3) \ - ({ \ + do { \ __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ \ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, vec0_m, \ @@ -105,35 +129,11 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2, \ out2, vec6_m, filt2, out3, vec7_m, filt2, out0, out1, out2, \ out3); \ - }) - -#define FILT_4TAP_DPADD_H(vec0, vec1, filt0, filt1) \ - ({ \ - __m128i tmp0; \ - \ - tmp0 = __lsx_vdp2_h_b(vec0, filt0); \ - tmp0 = __lsx_vdp2add_h_b(tmp0, vec1, filt1); \ - \ - tmp0; \ - }) - -#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1) \ - ({ \ - __m128i vec0_m, vec1_m; \ - __m128i hz_out_m; \ - \ - DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m, \ - vec1_m); \ - hz_out_m = FILT_4TAP_DPADD_H(vec0_m, vec1_m, filt_h0, filt_h1); \ - hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT); \ - hz_out_m = __lsx_vsat_h(hz_out_m, 7); \ - \ - hz_out_m; \ - }) + } while (0) #define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ filt0, filt1, out0, out1) \ - { \ + do { \ __m128i vec0_m, vec1_m, vec2_m, vec3_m; \ \ DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src3, src2, mask0, vec0_m, \ @@ -143,11 +143,11 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { vec3_m); \ DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec2_m, filt1, out1, vec3_m, filt1, \ out0, out1); \ - } + } while (0) #define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ filt0, filt1, out0, out1, out2, out3) \ - ({ \ + do { \ __m128i vec0_m, vec1_m, vec2_m, vec3_m; \ \ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, vec0_m, \ @@ -163,7 +163,7 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec0_m, filt1, out1, vec1_m, filt1, \ out2, vec2_m, filt1, out3, vec3_m, filt1, out0, out1, out2, \ out3); \ - }) + } while (0) static inline void common_hz_6t_4x4_lsx(uint8_t *RESTRICT src, int32_t src_stride, @@ -424,8 +424,8 @@ static void common_vt_6t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, DUP2_ARG2(__lsx_vilvl_d, src65_r, src54_r, src87_r, src76_r, src6554, src8776); DUP2_ARG2(__lsx_vxori_b, src6554, 128, src8776, 128, src6554, src8776); - out0 = DPADD_H3(src2110, src4332, src6554, filt0, filt1, filt2); - out1 = DPADD_H3(src4332, src6554, src8776, filt0, filt1, filt2); + out0 = dpadd_h3(src2110, src4332, src6554, filt0, filt1, filt2); + out1 = dpadd_h3(src4332, src6554, src8776, filt0, filt1, filt2); out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT); out0 = __lsx_vxori_b(out0, 128); @@ -487,10 +487,10 @@ static void common_vt_6t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, DUP4_ARG2(__lsx_vilvl_b, src7, src4, src8, src7, src9, src8, src10, src9, src76_r, src87_r, src98_r, src109_r); - out0_r = DPADD_H3(src10_r, src32_r, src76_r, filt0, filt1, filt2); - out1_r = DPADD_H3(src21_r, src43_r, src87_r, filt0, filt1, filt2); - out2_r = DPADD_H3(src32_r, src76_r, src98_r, filt0, filt1, filt2); - out3_r = DPADD_H3(src43_r, src87_r, src109_r, filt0, filt1, filt2); + out0_r = dpadd_h3(src10_r, src32_r, src76_r, filt0, filt1, filt2); + out1_r = dpadd_h3(src21_r, src43_r, src87_r, filt0, filt1, filt2); + out2_r = dpadd_h3(src32_r, 
src76_r, src98_r, filt0, filt1, filt2); + out3_r = dpadd_h3(src43_r, src87_r, src109_r, filt0, filt1, filt2); DUP2_ARG3(__lsx_vssrarni_b_h, out1_r, out0_r, VP8_FILTER_SHIFT, out3_r, out2_r, VP8_FILTER_SHIFT, tmp0, tmp1); DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); @@ -555,14 +555,14 @@ static void common_vt_6t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, src54_r, src65_r, src76_r, src87_r); DUP4_ARG2(__lsx_vilvh_b, src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l, src76_l, src87_l); - out0_r = DPADD_H3(src10_r, src32_r, src54_r, filt0, filt1, filt2); - out1_r = DPADD_H3(src21_r, src43_r, src65_r, filt0, filt1, filt2); - out2_r = DPADD_H3(src32_r, src54_r, src76_r, filt0, filt1, filt2); - out3_r = DPADD_H3(src43_r, src65_r, src87_r, filt0, filt1, filt2); - out0_l = DPADD_H3(src10_l, src32_l, src54_l, filt0, filt1, filt2); - out1_l = DPADD_H3(src21_l, src43_l, src65_l, filt0, filt1, filt2); - out2_l = DPADD_H3(src32_l, src54_l, src76_l, filt0, filt1, filt2); - out3_l = DPADD_H3(src43_l, src65_l, src87_l, filt0, filt1, filt2); + out0_r = dpadd_h3(src10_r, src32_r, src54_r, filt0, filt1, filt2); + out1_r = dpadd_h3(src21_r, src43_r, src65_r, filt0, filt1, filt2); + out2_r = dpadd_h3(src32_r, src54_r, src76_r, filt0, filt1, filt2); + out3_r = dpadd_h3(src43_r, src65_r, src87_r, filt0, filt1, filt2); + out0_l = dpadd_h3(src10_l, src32_l, src54_l, filt0, filt1, filt2); + out1_l = dpadd_h3(src21_l, src43_l, src65_l, filt0, filt1, filt2); + out2_l = dpadd_h3(src32_l, src54_l, src76_l, filt0, filt1, filt2); + out3_l = dpadd_h3(src43_l, src65_l, src87_l, filt0, filt1, filt2); DUP4_ARG3(__lsx_vssrarni_b_h, out0_l, out0_r, VP8_FILTER_SHIFT, out1_l, out1_r, VP8_FILTER_SHIFT, out2_l, out2_r, VP8_FILTER_SHIFT, out3_l, out3_r, VP8_FILTER_SHIFT, tmp0, tmp1, tmp2, tmp3); @@ -621,12 +621,12 @@ static void common_hv_6ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, src1, src2, src3); src4 = __lsx_vxori_b(src4, 128); - hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out0 = horiz_6tap_filt(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); - hz_out2 = HORIZ_6TAP_FILT(src2, src3, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out2 = horiz_6tap_filt(src2, src3, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); - hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out3 = horiz_6tap_filt(src3, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1); @@ -636,7 +636,7 @@ static void common_hv_6ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, src += src_stride_x2; DUP2_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src5, src6); - hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0, + hz_out5 = horiz_6tap_filt(src5, src6, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff); @@ -645,15 +645,15 @@ static void common_hv_6ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, src += src_stride_x2; DUP2_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src7, src8); - hz_out7 = HORIZ_6TAP_FILT(src7, src8, mask0, mask1, mask2, filt_hz0, + hz_out7 = horiz_6tap_filt(src7, src8, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); hz_out6 = __lsx_vshuf_b(hz_out7, hz_out5, shuff); out2 = __lsx_vpackev_b(hz_out5, hz_out4); - tmp0 = DPADD_H3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + tmp0 = dpadd_h3(out0, out1, 
out2, filt_vt0, filt_vt1, filt_vt2); out3 = __lsx_vpackev_b(hz_out7, hz_out6); - tmp1 = DPADD_H3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); + tmp1 = dpadd_h3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7); tmp0 = __lsx_vxori_b(tmp0, 128); @@ -710,15 +710,15 @@ static void common_hv_6ht_6vt_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, src1, src2, src3); src4 = __lsx_vxori_b(src4, 128); - hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out0 = horiz_6tap_filt(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); - hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out1 = horiz_6tap_filt(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); - hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out2 = horiz_6tap_filt(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); - hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out3 = horiz_6tap_filt(src3, src3, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); - hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out4 = horiz_6tap_filt(src4, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); filt = __lsx_vld(filter_vert, 0); DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1); @@ -734,25 +734,25 @@ static void common_hv_6ht_6vt_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5, src6, src7, src8); - hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0, + hz_out5 = horiz_6tap_filt(src5, src5, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); out2 = __lsx_vpackev_b(hz_out5, hz_out4); - tmp0 = DPADD_H3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); - hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0, + hz_out6 = horiz_6tap_filt(src6, src6, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); out5 = __lsx_vpackev_b(hz_out6, hz_out5); - tmp1 = DPADD_H3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); + tmp1 = dpadd_h3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); - hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0, + hz_out7 = horiz_6tap_filt(src7, src7, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); out7 = __lsx_vpackev_b(hz_out7, hz_out6); - tmp2 = DPADD_H3(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2); + tmp2 = dpadd_h3(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2); - hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0, + hz_out8 = horiz_6tap_filt(src8, src8, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); out6 = __lsx_vpackev_b(hz_out8, hz_out7); - tmp3 = DPADD_H3(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2); + tmp3 = dpadd_h3(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2); DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, VP8_FILTER_SHIFT, tmp3, tmp2, VP8_FILTER_SHIFT, vec0, vec1); @@ -997,14 +997,14 @@ static void common_vt_4t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r); src4332 = __lsx_vilvl_d(src43_r, src32_r); src4332 = __lsx_vxori_b(src4332, 128); - out0 = FILT_4TAP_DPADD_H(src2110, src4332, filt0, filt1); + out0 = filt_4tap_dpadd_h(src2110, src4332, filt0, filt1); src2 = __lsx_vld(src, 0); src += src_stride; DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src54_r, src65_r); src2110 = 
__lsx_vilvl_d(src65_r, src54_r); src2110 = __lsx_vxori_b(src2110, 128); - out1 = FILT_4TAP_DPADD_H(src4332, src2110, filt0, filt1); + out1 = filt_4tap_dpadd_h(src4332, src2110, filt0, filt1); out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT); out0 = __lsx_vxori_b(out0, 128); @@ -1055,10 +1055,10 @@ static void common_vt_4t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, src8, src9, src10); DUP4_ARG2(__lsx_vilvl_b, src7, src2, src8, src7, src9, src8, src10, src9, src72_r, src87_r, src98_r, src109_r); - out0_r = FILT_4TAP_DPADD_H(src10_r, src72_r, filt0, filt1); - out1_r = FILT_4TAP_DPADD_H(src21_r, src87_r, filt0, filt1); - out2_r = FILT_4TAP_DPADD_H(src72_r, src98_r, filt0, filt1); - out3_r = FILT_4TAP_DPADD_H(src87_r, src109_r, filt0, filt1); + out0_r = filt_4tap_dpadd_h(src10_r, src72_r, filt0, filt1); + out1_r = filt_4tap_dpadd_h(src21_r, src87_r, filt0, filt1); + out2_r = filt_4tap_dpadd_h(src72_r, src98_r, filt0, filt1); + out3_r = filt_4tap_dpadd_h(src87_r, src109_r, filt0, filt1); DUP2_ARG3(__lsx_vssrarni_b_h, out1_r, out0_r, VP8_FILTER_SHIFT, out3_r, out2_r, VP8_FILTER_SHIFT, tmp0, tmp1); DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); @@ -1114,14 +1114,14 @@ static void common_vt_4t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, src32_r, src43_r, src54_r, src65_r); DUP4_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src5, src4, src6, src5, src32_l, src43_l, src54_l, src65_l); - out0_r = FILT_4TAP_DPADD_H(src10_r, src32_r, filt0, filt1); - out1_r = FILT_4TAP_DPADD_H(src21_r, src43_r, filt0, filt1); - out2_r = FILT_4TAP_DPADD_H(src32_r, src54_r, filt0, filt1); - out3_r = FILT_4TAP_DPADD_H(src43_r, src65_r, filt0, filt1); - out0_l = FILT_4TAP_DPADD_H(src10_l, src32_l, filt0, filt1); - out1_l = FILT_4TAP_DPADD_H(src21_l, src43_l, filt0, filt1); - out2_l = FILT_4TAP_DPADD_H(src32_l, src54_l, filt0, filt1); - out3_l = FILT_4TAP_DPADD_H(src43_l, src65_l, filt0, filt1); + out0_r = filt_4tap_dpadd_h(src10_r, src32_r, filt0, filt1); + out1_r = filt_4tap_dpadd_h(src21_r, src43_r, filt0, filt1); + out2_r = filt_4tap_dpadd_h(src32_r, src54_r, filt0, filt1); + out3_r = filt_4tap_dpadd_h(src43_r, src65_r, filt0, filt1); + out0_l = filt_4tap_dpadd_h(src10_l, src32_l, filt0, filt1); + out1_l = filt_4tap_dpadd_h(src21_l, src43_l, filt0, filt1); + out2_l = filt_4tap_dpadd_h(src32_l, src54_l, filt0, filt1); + out3_l = filt_4tap_dpadd_h(src43_l, src65_l, filt0, filt1); DUP4_ARG3(__lsx_vssrarni_b_h, out0_l, out0_r, VP8_FILTER_SHIFT, out1_l, out1_r, VP8_FILTER_SHIFT, out2_l, out2_r, VP8_FILTER_SHIFT, out3_l, out3_r, VP8_FILTER_SHIFT, tmp0, tmp1, tmp2, tmp3); @@ -1168,8 +1168,8 @@ static void common_hv_4ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); src2 = __lsx_vxori_b(src2, 128); - hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1); - hz_out1 = HORIZ_4TAP_FILT(src1, src2, mask0, mask1, filt_hz0, filt_hz1); + hz_out0 = horiz_4tap_filt(src0, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = horiz_4tap_filt(src1, src2, mask0, mask1, filt_hz0, filt_hz1); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, @@ -1182,16 +1182,16 @@ static void common_hv_4ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, src += src_stride_x4; DUP2_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src3, src4); - hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1); + hz_out3 = horiz_4tap_filt(src3, src4, mask0, mask1, filt_hz0, filt_hz1); 
hz_out2 = __lsx_vshuf_b(hz_out3, hz_out1, shuff); vec1 = __lsx_vpackev_b(hz_out3, hz_out2); - tmp0 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1); + tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1); DUP2_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src5, src6); - hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1); + hz_out5 = horiz_4tap_filt(src5, src6, mask0, mask1, filt_hz0, filt_hz1); hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff); vec2 = __lsx_vpackev_b(hz_out5, hz_out4); - tmp1 = FILT_4TAP_DPADD_H(vec1, vec2, filt_vt0, filt_vt1); + tmp1 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1); tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7); tmp0 = __lsx_vxori_b(tmp0, 128); @@ -1239,9 +1239,9 @@ static inline void common_hv_4ht_4vt_8w_lsx( DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); src2 = __lsx_vxori_b(src2, 128); - hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1); - hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1); - hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1); + hz_out0 = horiz_4tap_filt(src0, src0, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = horiz_4tap_filt(src1, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = horiz_4tap_filt(src2, src2, mask0, mask1, filt_hz0, filt_hz1); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2); filt = __lsx_vld(filter_vert, 0); @@ -1254,21 +1254,21 @@ static inline void common_hv_4ht_4vt_8w_lsx( DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3, src4, src5, src6); - hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1); + hz_out3 = horiz_4tap_filt(src3, src3, mask0, mask1, filt_hz0, filt_hz1); vec1 = __lsx_vpackev_b(hz_out3, hz_out2); - tmp0 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1); + tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1); - hz_out0 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1); + hz_out0 = horiz_4tap_filt(src4, src4, mask0, mask1, filt_hz0, filt_hz1); vec3 = __lsx_vpackev_b(hz_out0, hz_out3); - tmp1 = FILT_4TAP_DPADD_H(vec2, vec3, filt_vt0, filt_vt1); + tmp1 = filt_4tap_dpadd_h(vec2, vec3, filt_vt0, filt_vt1); - hz_out1 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = horiz_4tap_filt(src5, src5, mask0, mask1, filt_hz0, filt_hz1); vec4 = __lsx_vpackev_b(hz_out1, hz_out0); - tmp2 = FILT_4TAP_DPADD_H(vec1, vec4, filt_vt0, filt_vt1); + tmp2 = filt_4tap_dpadd_h(vec1, vec4, filt_vt0, filt_vt1); - hz_out2 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = horiz_4tap_filt(src6, src6, mask0, mask1, filt_hz0, filt_hz1); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec0, vec1); - tmp3 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1); + tmp3 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1); DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1); DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); @@ -1324,9 +1324,9 @@ static void common_hv_6ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); src2 = __lsx_vxori_b(src2, 128); - hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out0 = horiz_6tap_filt(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); - hz_out1 = HORIZ_6TAP_FILT(src1, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out1 = horiz_6tap_filt(src1, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); vec0 = 
__lsx_vpackev_b(hz_out1, hz_out0); @@ -1341,17 +1341,17 @@ static void common_hv_6ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3, src4, src5, src6); - hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0, + hz_out3 = horiz_6tap_filt(src3, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); hz_out2 = __lsx_vshuf_b(hz_out3, hz_out1, shuff); vec1 = __lsx_vpackev_b(hz_out3, hz_out2); - tmp0 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1); + tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1); - hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0, + hz_out5 = horiz_6tap_filt(src5, src6, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff); vec2 = __lsx_vpackev_b(hz_out5, hz_out4); - tmp1 = FILT_4TAP_DPADD_H(vec1, vec2, filt_vt0, filt_vt1); + tmp1 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1); DUP2_ARG3(__lsx_vssrarni_b_h, tmp0, tmp0, 7, tmp1, tmp1, 7, tmp0, tmp1); DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); @@ -1402,11 +1402,11 @@ static inline void common_hv_6ht_4vt_8w_lsx( DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); src2 = __lsx_vxori_b(src2, 128); - hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out0 = horiz_6tap_filt(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); - hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out1 = horiz_6tap_filt(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); - hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out2 = horiz_6tap_filt(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2); @@ -1420,25 +1420,25 @@ static inline void common_hv_6ht_4vt_8w_lsx( DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3, src4, src5, src6); - hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0, + hz_out3 = horiz_6tap_filt(src3, src3, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); vec1 = __lsx_vpackev_b(hz_out3, hz_out2); - tmp0 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1); + tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1); - hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0, + hz_out0 = horiz_6tap_filt(src4, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); vec3 = __lsx_vpackev_b(hz_out0, hz_out3); - tmp1 = FILT_4TAP_DPADD_H(vec2, vec3, filt_vt0, filt_vt1); + tmp1 = filt_4tap_dpadd_h(vec2, vec3, filt_vt0, filt_vt1); - hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0, + hz_out1 = horiz_6tap_filt(src5, src5, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); - tmp2 = FILT_4TAP_DPADD_H(vec1, vec0, filt_vt0, filt_vt1); + tmp2 = filt_4tap_dpadd_h(vec1, vec0, filt_vt0, filt_vt1); - hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0, + hz_out2 = horiz_6tap_filt(src6, src6, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec1, vec2); - tmp3 = FILT_4TAP_DPADD_H(vec1, vec2, filt_vt0, filt_vt1); + tmp3 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1); DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1); DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); @@ -1492,9 +1492,9 @@ static void 
common_hv_4ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, src1, src2, src3); src4 = __lsx_vxori_b(src4, 128); - hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1); - hz_out2 = HORIZ_4TAP_FILT(src2, src3, mask0, mask1, filt_hz0, filt_hz1); - hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1); + hz_out0 = horiz_4tap_filt(src0, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = horiz_4tap_filt(src2, src3, mask0, mask1, filt_hz0, filt_hz1); + hz_out3 = horiz_4tap_filt(src3, src4, mask0, mask1, filt_hz0, filt_hz1); hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1); @@ -1510,15 +1510,15 @@ static void common_hv_4ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, src6, src7, src8); src += src_stride_x4; - hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1); + hz_out5 = horiz_4tap_filt(src5, src6, mask0, mask1, filt_hz0, filt_hz1); hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff); out2 = __lsx_vpackev_b(hz_out5, hz_out4); - tmp0 = DPADD_H3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); - hz_out7 = HORIZ_4TAP_FILT(src7, src8, mask0, mask1, filt_hz0, filt_hz1); + hz_out7 = horiz_4tap_filt(src7, src8, mask0, mask1, filt_hz0, filt_hz1); hz_out6 = __lsx_vshuf_b(hz_out7, hz_out5, shuff); out3 = __lsx_vpackev_b(hz_out7, hz_out6); - tmp1 = DPADD_H3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); + tmp1 = dpadd_h3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7); tmp0 = __lsx_vxori_b(tmp0, 128); @@ -1571,11 +1571,11 @@ static inline void common_hv_4ht_6vt_8w_lsx( DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, src1, src2, src3); src4 = __lsx_vxori_b(src4, 128); - hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1); - hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1); - hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1); - hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1); - hz_out4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1); + hz_out0 = horiz_4tap_filt(src0, src0, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = horiz_4tap_filt(src1, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = horiz_4tap_filt(src2, src2, mask0, mask1, filt_hz0, filt_hz1); + hz_out3 = horiz_4tap_filt(src3, src3, mask0, mask1, filt_hz0, filt_hz1); + hz_out4 = horiz_4tap_filt(src4, src4, mask0, mask1, filt_hz0, filt_hz1); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1); DUP2_ARG2(__lsx_vpackev_b, hz_out2, hz_out1, hz_out4, hz_out3, out3, out4); @@ -1590,21 +1590,21 @@ static inline void common_hv_4ht_6vt_8w_lsx( DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5, src6, src7, src8); - hz_out5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1); + hz_out5 = horiz_4tap_filt(src5, src5, mask0, mask1, filt_hz0, filt_hz1); out2 = __lsx_vpackev_b(hz_out5, hz_out4); - tmp0 = DPADD_H3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); - hz_out6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1); + hz_out6 = horiz_4tap_filt(src6, src6, mask0, mask1, filt_hz0, filt_hz1); out5 = __lsx_vpackev_b(hz_out6, hz_out5); - tmp1 = DPADD_H3(out3, out4, 
out5, filt_vt0, filt_vt1, filt_vt2); + tmp1 = dpadd_h3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); - hz_out7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1); + hz_out7 = horiz_4tap_filt(src7, src7, mask0, mask1, filt_hz0, filt_hz1); out6 = __lsx_vpackev_b(hz_out7, hz_out6); - tmp2 = DPADD_H3(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2); + tmp2 = dpadd_h3(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2); - hz_out8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1); + hz_out8 = horiz_4tap_filt(src8, src8, mask0, mask1, filt_hz0, filt_hz1); out7 = __lsx_vpackev_b(hz_out8, hz_out7); - tmp3 = DPADD_H3(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2); + tmp3 = dpadd_h3(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2); DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, vec0, vec1); DUP2_ARG2(__lsx_vxori_b, vec0, 128, vec1, 128, vec0, vec1); __lsx_vstelm_d(vec0, dst, 0, 0); diff --git a/vpx_dsp/loongarch/bitdepth_conversion_lsx.h b/vpx_dsp/loongarch/bitdepth_conversion_lsx.h index 4834f18fc0..b0db1e99c5 100644 --- a/vpx_dsp/loongarch/bitdepth_conversion_lsx.h +++ b/vpx_dsp/loongarch/bitdepth_conversion_lsx.h @@ -16,33 +16,26 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_util/loongson_intrinsics.h" +static INLINE __m128i load_tran_low(const tran_low_t *s) { #if CONFIG_VP9_HIGHBITDEPTH -#define load_tran_low(s) \ - ({ \ - __m128i res0_m; \ - __m128i v0_m = __lsx_vld(s, 0); \ - __m128i v1_m = __lsx_vld(s + 4, 0); \ - res0_m = __lsx_vsrlni_h_w(v0_m, v1_m, 0); \ - res0_m; \ - }) - -#define store_tran_low(v, s, c) \ - { \ - __m128i v0_m, v1_m; \ - v1_m = __lsx_vexth_w_h(v); \ - v0_m = __lsx_vsllwil_w_h(v, 0); \ - __lsx_vst(v0_m, s + c, 0); \ - __lsx_vst(v1_m, s + c + 4, 0); \ - } + __m128i v0_m = __lsx_vld(s, 0); + __m128i v1_m = __lsx_vld(s + 4, 0); + return __lsx_vsrlni_h_w(v0_m, v1_m, 0); #else -#define load_tran_low(s) \ - ({ \ - __m128i res0_m; \ - res0_m = __lsx_vld(s, 0); \ - res0_m; \ - }) + return __lsx_vld(s, 0); +#endif +} -#define store_tran_low(v, s, c) __lsx_vst(v, s + c, 0) -#endif // CONFIG_VP9_HIGHBITDEPTH +static INLINE void store_tran_low(__m128i v, tran_low_t *s, int32_t c) { +#if CONFIG_VP9_HIGHBITDEPTH + __m128i v0_m, v1_m; + v1_m = __lsx_vexth_w_h(v); + v0_m = __lsx_vsllwil_w_h(v, 0); + __lsx_vst(v0_m, s + c, 0); + __lsx_vst(v1_m, s + c + 4, 0); +#else + __lsx_vst(v, s + c, 0); +#endif +} #endif // VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_ diff --git a/vpx_dsp/loongarch/fwd_dct32x32_lsx.c b/vpx_dsp/loongarch/fwd_dct32x32_lsx.c index e5c301b2c1..9bb3877212 100644 --- a/vpx_dsp/loongarch/fwd_dct32x32_lsx.c +++ b/vpx_dsp/loongarch/fwd_dct32x32_lsx.c @@ -13,10 +13,10 @@ #include "vpx_dsp/fwd_txfm.h" #define UNPCK_SH_SW(in, out0, out1) \ - { \ + do { \ out0 = __lsx_vsllwil_w_h(in, 0); \ out1 = __lsx_vexth_w_h(in); \ - } + } while (0) static void fdct8x32_1d_column_load_butterfly(const int16_t *input, int32_t src_stride, diff --git a/vpx_dsp/loongarch/fwd_txfm_lsx.c b/vpx_dsp/loongarch/fwd_txfm_lsx.c index 6f2d4d6fee..508532b9d8 100644 --- a/vpx_dsp/loongarch/fwd_txfm_lsx.c +++ b/vpx_dsp/loongarch/fwd_txfm_lsx.c @@ -12,7 +12,7 @@ #include "vpx_dsp/loongarch/fwd_txfm_lsx.h" #define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ - { \ + do { \ __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3; \ \ DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1); \ @@ -23,7 +23,7 @@ _t3 = __lsx_vilvh_h(_s3, _s2); \ DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2); \ DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, 
_t3, _t1, _out1, _out3); \ - } + } while (0) #if !CONFIG_VP9_HIGHBITDEPTH void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, diff --git a/vpx_dsp/loongarch/fwd_txfm_lsx.h b/vpx_dsp/loongarch/fwd_txfm_lsx.h index d04427a6ea..4a9fce9a3d 100644 --- a/vpx_dsp/loongarch/fwd_txfm_lsx.h +++ b/vpx_dsp/loongarch/fwd_txfm_lsx.h @@ -15,7 +15,7 @@ #include "vpx_dsp/txfm_common.h" #define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ + do { \ __m128i cnst0_m, cnst1_m, cnst2_m, cnst3_m; \ __m128i vec0_m, vec1_m, vec2_m, vec3_m; \ __m128i vec4_m, vec5_m, vec6_m, vec7_m; \ @@ -38,11 +38,11 @@ DUP4_ARG3(__lsx_vssrarni_h_w, vec4_m, vec4_m, DCT_CONST_BITS, vec5_m, \ vec5_m, DCT_CONST_BITS, vec6_m, vec6_m, DCT_CONST_BITS, vec7_m, \ vec7_m, DCT_CONST_BITS, out0, out2, out1, out3); \ - } + } while (0) #define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ out3, out4, out5, out6, out7) \ - { \ + do { \ __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \ __m128i s7_m, x0_m, x1_m, x2_m, x3_m; \ __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \ @@ -97,10 +97,10 @@ x3_m = __lsx_vneg_h(x3_m); \ x2_m = __lsx_vpackev_h(x2_m, x3_m); \ DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \ - } + } while (0) #define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \ - { \ + do { \ __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ \ DUP4_ARG2(__lsx_vsrli_h, in0, 15, in1, 15, in2, 15, in3, 15, vec0_m, \ @@ -111,10 +111,10 @@ in3, in0, in1, in2, in3); \ DUP4_ARG2(__lsx_vavg_h, vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, \ in7, in4, in5, in6, in7); \ - } + } while (0) #define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \ - { \ + do { \ __m128i tp0_m, tp1_m; \ __m128i one = __lsx_vreplgr2vr_h(1); \ \ @@ -130,10 +130,10 @@ vec1 = __lsx_vadd_h(vec1, tp1_m); \ vec0 = __lsx_vsrai_h(vec0, 2); \ vec1 = __lsx_vsrai_h(vec1, 2); \ - } + } while (0) #define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \ - { \ + do { \ __m128i tp0_m, tp1_m; \ __m128i one_m = __lsx_vldi(0x401); \ \ @@ -147,10 +147,10 @@ vec1 = __lsx_vadd_h(vec1, tp1_m); \ vec0 = __lsx_vsrai_h(vec0, 2); \ vec1 = __lsx_vsrai_h(vec1, 2); \ - } + } while (0) #define FDCT32_POSTPROC_NEG_W(vec) \ - { \ + do { \ __m128i temp_m; \ __m128i one_m = __lsx_vreplgr2vr_w(1); \ \ @@ -159,11 +159,11 @@ temp_m = __lsx_vand_v(one_m, temp_m); \ vec = __lsx_vadd_w(vec, temp_m); \ vec = __lsx_vsrai_w(vec, 2); \ - } + } while (0) #define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right, \ const0, const1, out0, out1, out2, out3) \ - { \ + do { \ __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ __m128i tp0_m, tp1_m, tp2_m, tp3_m, _tmp0, _tmp1; \ __m128i k0_m = __lsx_vreplgr2vr_w((int32_t)const0); \ @@ -188,11 +188,11 @@ DUP2_ARG2(__lsx_vdp2_d_w, s6_m, k0_m, s7_m, k0_m, tp2_m, tp3_m); \ DUP2_ARG3(__lsx_vssrarni_w_d, tp0_m, tp1_m, DCT_CONST_BITS, tp2_m, tp3_m, \ DCT_CONST_BITS, out2, out3); \ - } + } while (0) #define VP9_ADDBLK_ST8x4_UB(dst, _stride, _stride2, _stride3, in0, in1, in2, \ in3) \ - { \ + do { \ __m128i dst0_m, dst1_m, dst2_m, dst3_m; \ __m128i tmp0_m, tmp1_m; \ __m128i res0_m, res1_m, res2_m, res3_m; \ @@ -210,11 +210,11 @@ __lsx_vstelm_d(tmp0_m, dst + _stride, 0, 1); \ __lsx_vstelm_d(tmp1_m, dst + _stride2, 0, 0); \ __lsx_vstelm_d(tmp1_m, dst + _stride3, 0, 1); \ - } + } while (0) #define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ out2, out3, out4, out5, out6, out7) \ - { \ + do { \ __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ __m128i x0_m, x1_m, 
x2_m, x3_m; \ __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \ @@ -270,12 +270,12 @@ x3_m = __lsx_vneg_h(x3_m); \ x2_m = __lsx_vpackev_h(x2_m, x3_m); \ DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \ - } + } while (0) #define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6, \ input7, out1, out3, out5, out7, out9, out11, out13, \ out15) \ - { \ + do { \ __m128i stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \ __m128i stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \ __m128i stp36_m, stp37_m, vec0_m, vec1_m; \ @@ -373,7 +373,7 @@ cnst1_m = __lsx_vreplvei_h(coeff2_m, 3); \ cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out3); \ - } + } while (0) void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, int32_t src_stride); diff --git a/vpx_dsp/loongarch/idct32x32_lsx.c b/vpx_dsp/loongarch/idct32x32_lsx.c index d6890c28e1..ec07f57d90 100644 --- a/vpx_dsp/loongarch/idct32x32_lsx.c +++ b/vpx_dsp/loongarch/idct32x32_lsx.c @@ -12,10 +12,10 @@ #include "vpx_dsp/loongarch/fwd_txfm_lsx.h" #define UNPCK_UB_SH(_in, _out0, _out1) \ - { \ + do { \ _out0 = __lsx_vsllwil_hu_bu(_in, 0); \ _out1 = __lsx_vexth_hu_bu(_in); \ - } + } while (0) static void idct32x8_row_transpose_store(const int16_t *input, int16_t *tmp_buf) { diff --git a/vpx_dsp/loongarch/loopfilter_16_lsx.c b/vpx_dsp/loongarch/loopfilter_16_lsx.c index cbaefcd6e0..539817777d 100644 --- a/vpx_dsp/loongarch/loopfilter_16_lsx.c +++ b/vpx_dsp/loongarch/loopfilter_16_lsx.c @@ -15,7 +15,7 @@ #define LSX_LD_8(_src, _stride, _stride2, _stride3, _stride4, _in0, _in1, \ _in2, _in3, _in4, _in5, _in6, _in7) \ - { \ + do { \ _in0 = __lsx_vld(_src, 0); \ _in1 = __lsx_vldx(_src, _stride); \ _in2 = __lsx_vldx(_src, _stride2); \ @@ -25,11 +25,11 @@ _in5 = __lsx_vldx(_src, _stride); \ _in6 = __lsx_vldx(_src, _stride2); \ _in7 = __lsx_vldx(_src, _stride3); \ - } + } while (0) #define LSX_ST_8(_dst0, _dst1, _dst2, _dst3, _dst4, _dst5, _dst6, _dst7, _dst, \ _stride, _stride2, _stride3, _stride4) \ - { \ + do { \ __lsx_vst(_dst0, _dst, 0); \ __lsx_vstx(_dst1, _dst, _stride); \ __lsx_vstx(_dst2, _dst, _stride2); \ @@ -39,7 +39,7 @@ __lsx_vstx(_dst5, _dst, _stride); \ __lsx_vstx(_dst6, _dst, _stride2); \ __lsx_vstx(_dst7, _dst, _stride3); \ - } + } while (0) static int32_t hz_lpf_t4_and_t8_16w(uint8_t *dst, int32_t stride, uint8_t *filter48, diff --git a/vpx_dsp/loongarch/loopfilter_lsx.h b/vpx_dsp/loongarch/loopfilter_lsx.h index 53e15fe6d5..1c43836503 100644 --- a/vpx_dsp/loongarch/loopfilter_lsx.h +++ b/vpx_dsp/loongarch/loopfilter_lsx.h @@ -16,7 +16,7 @@ #define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ limit_in, b_limit_in, thresh_in, hev_out, mask_out, \ flat_out) \ - { \ + do { \ __m128i p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ __m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ \ @@ -47,10 +47,10 @@ \ mask_out = __lsx_vslt_bu(limit_in, mask_out); \ mask_out = __lsx_vxori_b(mask_out, 0xff); \ - } + } while (0) #define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \ - { \ + do { \ __m128i p2_asub_p0, q2_asub_q0, p3_asub_p0, q3_asub_q0; \ __m128i flat4_tmp = __lsx_vldi(1); \ \ @@ -64,11 +64,11 @@ flat_out = __lsx_vslt_bu(flat4_tmp, flat_out); \ flat_out = __lsx_vxori_b(flat_out, 0xff); \ flat_out = flat_out & (mask); \ - } + } while (0) #define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \ q6_in, q7_in, flat_in, flat2_out) \ - { \ + do { \ __m128i flat5_tmp = 
__lsx_vldi(1); \ __m128i p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0; \ __m128i p6_asub_p0, q6_asub_q0, p7_asub_p0, q7_asub_q0; \ @@ -87,11 +87,11 @@ flat2_out = __lsx_vslt_bu(flat5_tmp, flat2_out); \ flat2_out = __lsx_vxori_b(flat2_out, 0xff); \ flat2_out = flat2_out & flat_in; \ - } + } while (0) #define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask, hev, p1_out, \ p0_out, q0_out, q1_out) \ - { \ + do { \ __m128i p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \ const __m128i cnst4b = __lsx_vldi(4); \ const __m128i cnst3b = __lsx_vldi(3); \ @@ -118,12 +118,12 @@ q1_m = __lsx_vssub_b(q1_m, filt); \ p1_m = __lsx_vsadd_b(p1_m, filt); \ DUP2_ARG2(__lsx_vxori_b, q1_m, 0x80, p1_m, 0x80, q1_out, p1_out); \ - } + } while (0) #define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \ q1_filt8_out, q2_filt8_out) \ - { \ + do { \ __m128i tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \ \ tmp_filt8_2 = __lsx_vadd_h(p2_in, p1_in); \ @@ -162,6 +162,6 @@ tmp_filt8_0 = __lsx_vadd_h(q1_in, q3_in); \ tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_1); \ q1_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \ - } + } while (0) #endif // VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_ diff --git a/vpx_dsp/loongarch/quantize_lsx.c b/vpx_dsp/loongarch/quantize_lsx.c index e3fbb9e9e0..2fc33b06b7 100644 --- a/vpx_dsp/loongarch/quantize_lsx.c +++ b/vpx_dsp/loongarch/quantize_lsx.c @@ -12,79 +12,83 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_util/loongson_intrinsics.h" -#define CALCULATE_QCOEFF(coeff, coeff_abs, round, quant, shift, cmp_mask) \ - ({ \ - __m128i rounded, qcoeff; \ - \ - rounded = __lsx_vsadd_h(coeff_abs, round); \ - qcoeff = __lsx_vmuh_h(rounded, quant); \ - qcoeff = __lsx_vadd_h(rounded, qcoeff); \ - qcoeff = __lsx_vmuh_h(qcoeff, shift); \ - qcoeff = __lsx_vsigncov_h(coeff, qcoeff); \ - qcoeff = __lsx_vand_v(qcoeff, cmp_mask); \ - \ - qcoeff; \ - }) - -#define CALCULATE_DQCOEFF_AND_STORE(qcoeff, dequant, dqcoeff) \ - { \ - __m128i dqcoeff16 = __lsx_vmul_h(qcoeff, dequant); \ - __lsx_vst(dqcoeff16, dqcoeff, 0); \ - } +static INLINE __m128i calculate_qcoeff(__m128i coeff, __m128i coeff_abs, + __m128i round, __m128i quant, + __m128i shift, __m128i cmp_mask) { + __m128i rounded, qcoeff; + + rounded = __lsx_vsadd_h(coeff_abs, round); + qcoeff = __lsx_vmuh_h(rounded, quant); + qcoeff = __lsx_vadd_h(rounded, qcoeff); + qcoeff = __lsx_vmuh_h(qcoeff, shift); + qcoeff = __lsx_vsigncov_h(coeff, qcoeff); + qcoeff = __lsx_vand_v(qcoeff, cmp_mask); + + return qcoeff; +} -#define CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff, dequant, dqcoeff) \ - { \ - __m128i low, high, dqcoeff32_0, dqcoeff32_1, res; \ - __m128i zero = __lsx_vldi(0); \ - __m128i coeff = __lsx_vabsd_h(qcoeff, zero); \ - \ - __m128i sign_0 = __lsx_vilvl_h(qcoeff, zero); \ - __m128i sign_1 = __lsx_vilvh_h(qcoeff, zero); \ - \ - low = __lsx_vmul_h(coeff, dequant); \ - high = __lsx_vmuh_h(coeff, dequant); \ - dqcoeff32_0 = __lsx_vilvl_h(high, low); \ - dqcoeff32_1 = __lsx_vilvh_h(high, low); \ - \ - dqcoeff32_0 = __lsx_vsrai_w(dqcoeff32_0, 1); \ - dqcoeff32_1 = __lsx_vsrai_w(dqcoeff32_1, 1); \ - dqcoeff32_0 = __lsx_vsigncov_w(sign_0, dqcoeff32_0); \ - dqcoeff32_1 = __lsx_vsigncov_w(sign_1, dqcoeff32_1); \ - res = __lsx_vpickev_h(dqcoeff32_1, dqcoeff32_0); \ - __lsx_vst(res, dqcoeff, 0); \ - } +static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant, + int16_t *dqcoeff) { + __m128i dqcoeff16 = __lsx_vmul_h(qcoeff, dequant); + __lsx_vst(dqcoeff16, dqcoeff, 0); +} + 
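/* A scalar sketch of the rounding the 32x32 variant below implements
 * (dequant_half_scalar is a hypothetical name, not part of the patch):
 * an arithmetic right shift of a negative product rounds toward negative
 * infinity, while the C reference truncates toward zero, so the vector
 * code un-signs before the shift and re-signs afterwards. */
#include <stdint.h>
#include <stdlib.h>

static int16_t dequant_half_scalar(int16_t qcoeff, int16_t dequant) {
  /* abs() first, so >> 1 equals / 2 on the non-negative product. */
  const int32_t halved = (abs(qcoeff) * dequant) >> 1;
  return (int16_t)(qcoeff < 0 ? -halved : halved); /* restore the sign */
}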
+static INLINE void calculate_dqcoeff_and_store_32x32(__m128i qcoeff, + __m128i dequant, + int16_t *dqcoeff) { + // Un-sign to bias rounding like C. + __m128i low, high, dqcoeff32_0, dqcoeff32_1, res; + __m128i zero = __lsx_vldi(0); + __m128i coeff = __lsx_vabsd_h(qcoeff, zero); + + const __m128i sign_0 = __lsx_vilvl_h(qcoeff, zero); + const __m128i sign_1 = __lsx_vilvh_h(qcoeff, zero); + + low = __lsx_vmul_h(coeff, dequant); + high = __lsx_vmuh_h(coeff, dequant); + dqcoeff32_0 = __lsx_vilvl_h(high, low); + dqcoeff32_1 = __lsx_vilvh_h(high, low); + + // "Divide" by 2. + dqcoeff32_0 = __lsx_vsrai_w(dqcoeff32_0, 1); + dqcoeff32_1 = __lsx_vsrai_w(dqcoeff32_1, 1); + dqcoeff32_0 = __lsx_vsigncov_w(sign_0, dqcoeff32_0); + dqcoeff32_1 = __lsx_vsigncov_w(sign_1, dqcoeff32_1); + res = __lsx_vpickev_h(dqcoeff32_1, dqcoeff32_0); + __lsx_vst(res, dqcoeff, 0); +} + +static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1, + __m128i zbin_mask0, __m128i zbin_mask1, + const int16_t *scan, int index, + __m128i zero) { + const __m128i zero_coeff0 = __lsx_vseq_h(coeff0, zero); + const __m128i zero_coeff1 = __lsx_vseq_h(coeff1, zero); + __m128i scan0 = __lsx_vld(scan + index, 0); + __m128i scan1 = __lsx_vld(scan + index + 8, 0); + __m128i eob0, eob1; + + scan0 = __lsx_vsub_h(scan0, zbin_mask0); + scan1 = __lsx_vsub_h(scan1, zbin_mask1); + eob0 = __lsx_vandn_v(zero_coeff0, scan0); + eob1 = __lsx_vandn_v(zero_coeff1, scan1); + return __lsx_vmax_h(eob0, eob1); +} -#define SCAN_FOR_EOB(coeff0, coeff1, zbin_mask0, zbin_mask1, scan, index, \ - zero) \ - ({ \ - __m128i zero_coeff0 = __lsx_vseq_h(coeff0, zero); \ - __m128i zero_coeff1 = __lsx_vseq_h(coeff1, zero); \ - __m128i scan0 = __lsx_vld(scan + index, 0); \ - __m128i scan1 = __lsx_vld(scan + index + 8, 0); \ - __m128i eob0, eob1, eob_max; \ - \ - scan0 = __lsx_vsub_h(scan0, zbin_mask0); \ - scan1 = __lsx_vsub_h(scan1, zbin_mask1); \ - eob0 = __lsx_vandn_v(zero_coeff0, scan0); \ - eob1 = __lsx_vandn_v(zero_coeff1, scan1); \ - eob_max = __lsx_vmax_h(eob0, eob1); \ - eob_max; \ - }) - -#define ACCUMULATE_EOB(eob) \ - ({ \ - __m128i eob_shuffled; \ - int16_t res_m; \ - \ - eob_shuffled = __lsx_vshuf4i_w(eob, 0xe); \ - eob = __lsx_vmax_h(eob, eob_shuffled); \ - eob_shuffled = __lsx_vshuf4i_h(eob, 0xe); \ - eob = __lsx_vmax_h(eob, eob_shuffled); \ - eob_shuffled = __lsx_vshuf4i_h(eob, 0x1); \ - eob = __lsx_vmax_h(eob, eob_shuffled); \ - res_m = __lsx_vpickve2gr_h(eob, 1); \ - res_m; \ - }) +static INLINE int16_t accumulate_eob(__m128i eob) { + __m128i eob_shuffled; + int16_t res_m; + + eob_shuffled = __lsx_vshuf4i_w(eob, 0xe); + eob = __lsx_vmax_h(eob, eob_shuffled); + eob_shuffled = __lsx_vshuf4i_h(eob, 0xe); + eob = __lsx_vmax_h(eob, eob_shuffled); + eob_shuffled = __lsx_vshuf4i_h(eob, 0x1); + eob = __lsx_vmax_h(eob, eob_shuffled); + res_m = __lsx_vpickve2gr_h(eob, 1); + + return res_m; +} #if !CONFIG_VP9_HIGHBITDEPTH void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, @@ -120,21 +124,21 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); qcoeff0 = - CALCULATE_QCOEFF(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); round = __lsx_vilvh_d(round, round); quant = __lsx_vilvh_d(quant, quant); quant_shift = __lsx_vilvh_d(quant_shift, quant_shift); qcoeff1 = - CALCULATE_QCOEFF(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); 
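/* In scalar form, the calculate_qcoeff() calls above compute the standard
 * vpx_quantize_b arithmetic in 16-bit fixed point; quantize_coeff_scalar
 * is a hypothetical name used only for this sketch. */
#include <stdint.h>
#include <stdlib.h>

static int16_t quantize_coeff_scalar(int16_t coeff, int16_t zbin,
                                     int16_t round, int16_t quant,
                                     int16_t quant_shift) {
  const int32_t abs_coeff = abs(coeff);
  int32_t q;
  if (abs_coeff < zbin) return 0;        /* the cmp_mask zero-bin test */
  q = abs_coeff + round;                 /* __lsx_vsadd_h (saturating) */
  q = ((q * quant) >> 16) + q;           /* __lsx_vmuh_h keeps the high half */
  q = (q * quant_shift) >> 16;
  return (int16_t)(coeff < 0 ? -q : q);  /* __lsx_vsigncov_h */
}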
__lsx_vst(qcoeff0, qcoeff_ptr, 0); __lsx_vst(qcoeff1, qcoeff_ptr, 16); - CALCULATE_DQCOEFF_AND_STORE(qcoeff0, dequant, dqcoeff_ptr); + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr); dequant = __lsx_vilvh_d(dequant, dequant); - CALCULATE_DQCOEFF_AND_STORE(qcoeff1, dequant, dqcoeff_ptr + 8); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = SCAN_FOR_EOB(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); // AC only loop. while (index < n_coeffs) { coeff0 = __lsx_vld(coeff_ptr + index, 0); @@ -147,24 +151,24 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); qcoeff0 = - CALCULATE_QCOEFF(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); qcoeff1 = - CALCULATE_QCOEFF(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); __lsx_vst(qcoeff0, qcoeff_ptr + index, 0); __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0); - CALCULATE_DQCOEFF_AND_STORE(qcoeff0, dequant, dqcoeff_ptr + index); - CALCULATE_DQCOEFF_AND_STORE(qcoeff1, dequant, dqcoeff_ptr + index + 8); + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - eob0 = SCAN_FOR_EOB(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index, + eob0 = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index, zero); eob = __lsx_vmax_h(eob, eob0); index += 16; } - *eob_ptr = ACCUMULATE_EOB(eob); + *eob_ptr = accumulate_eob(eob); } void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, @@ -204,20 +208,20 @@ void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); qcoeff0 = - CALCULATE_QCOEFF(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); // remove DC in quant_shift, quant, quant_shift round = __lsx_vilvh_d(round, round); quant = __lsx_vilvh_d(quant, quant); quant_shift = __lsx_vilvh_d(quant_shift, quant_shift); qcoeff1 = - CALCULATE_QCOEFF(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); __lsx_vst(qcoeff0, qcoeff_ptr, 0); __lsx_vst(qcoeff1, qcoeff_ptr, 16); - CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff0, dequant, dqcoeff_ptr); + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr); dequant = __lsx_vilvh_d(dequant, dequant); - CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = SCAN_FOR_EOB(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, dqcoeff_ptr + 8); + eob = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); // AC only loop. 
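/* The AC loop below repeats the same end-of-block bookkeeping per 16
 * coefficients. What the scan_for_eob()/accumulate_eob() pair tracks, in
 * scalar form (eob_scalar is a hypothetical name; this assumes the usual
 * libvpx convention that eob is the last nonzero index in scan order,
 * plus one): */
#include <stdint.h>

static int16_t eob_scalar(const int16_t *qcoeff, const int16_t *iscan,
                          int n_coeffs) {
  int16_t eob = 0;
  int i;
  for (i = 0; i < n_coeffs; ++i) {
    if (qcoeff[i] != 0 && (int16_t)(iscan[i] + 1) > eob)
      eob = (int16_t)(iscan[i] + 1);
  }
  return eob;
}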
for (index = 16; index < 32 * 32; index += 16) { coeff0 = __lsx_vld(coeff_ptr + index, 0); @@ -230,20 +234,20 @@ void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); qcoeff0 = - CALCULATE_QCOEFF(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); qcoeff1 = - CALCULATE_QCOEFF(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); __lsx_vst(qcoeff0, qcoeff_ptr + index, 0); __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0); - CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff0, dequant, dqcoeff_ptr + index); - CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff1, dequant, + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr + index); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, dqcoeff_ptr + 8 + index); - eob0 = SCAN_FOR_EOB(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index, + eob0 = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index, zero); eob = __lsx_vmax_h(eob, eob0); } - *eob_ptr = ACCUMULATE_EOB(eob); + *eob_ptr = accumulate_eob(eob); } -#endif // !CONFIG_VP9_HIGHBITDEPTH +#endif diff --git a/vpx_dsp/loongarch/sad_lsx.c b/vpx_dsp/loongarch/sad_lsx.c index 5eaebfb518..b6fbedb0d0 100644 --- a/vpx_dsp/loongarch/sad_lsx.c +++ b/vpx_dsp/loongarch/sad_lsx.c @@ -8,59 +8,63 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_util/loongson_intrinsics.h" -#define SAD_UB2_UH(in0, in1, ref0, ref1) \ - ({ \ - __m128i diff0_m, diff1_m, sad_m0; \ - __m128i sad_m = __lsx_vldi(0); \ - \ - diff0_m = __lsx_vabsd_bu(in0, ref0); \ - diff1_m = __lsx_vabsd_bu(in1, ref1); \ - \ - sad_m0 = __lsx_vhaddw_hu_bu(diff0_m, diff0_m); \ - sad_m = __lsx_vadd_h(sad_m, sad_m0); \ - sad_m0 = __lsx_vhaddw_hu_bu(diff1_m, diff1_m); \ - sad_m = __lsx_vadd_h(sad_m, sad_m0); \ - \ - sad_m; \ - }) - -#define HADD_UW_U32(in) \ - ({ \ - __m128i res0_m; \ - uint32_t sum_m; \ - res0_m = __lsx_vhaddw_du_wu(in, in); \ - res0_m = __lsx_vhaddw_qu_du(res0_m, res0_m); \ - sum_m = __lsx_vpickve2gr_w(res0_m, 0); \ - sum_m; \ - }) - -#define HADD_UH_U32(in) \ - ({ \ - __m128i res_m; \ - uint32_t sum_m; \ - res_m = __lsx_vhaddw_wu_hu(in, in); \ - sum_m = HADD_UW_U32(res_m); \ - sum_m; \ - }) - -#define HADD_SW_S32(in) \ - ({ \ - __m128i res0_m; \ - int32_t sum_m; \ - \ - res0_m = __lsx_vhaddw_d_w(in, in); \ - res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); \ - sum_m = __lsx_vpickve2gr_w(res0_m, 0); \ - sum_m; \ - }) +static INLINE __m128i sad_ub2_uh(__m128i in0, __m128i in1, __m128i ref0, + __m128i ref1) { + __m128i diff0_m, diff1_m, sad_m0; + __m128i sad_m = __lsx_vldi(0); + + diff0_m = __lsx_vabsd_bu(in0, ref0); + diff1_m = __lsx_vabsd_bu(in1, ref1); + + sad_m0 = __lsx_vhaddw_hu_bu(diff0_m, diff0_m); + sad_m = __lsx_vadd_h(sad_m, sad_m0); + sad_m0 = __lsx_vhaddw_hu_bu(diff1_m, diff1_m); + sad_m = __lsx_vadd_h(sad_m, sad_m0); + + return sad_m; +} + +static INLINE uint32_t hadd_uw_u32(__m128i in) { + __m128i res0_m; + uint32_t sum_m; + + res0_m = __lsx_vhaddw_du_wu(in, in); + res0_m = __lsx_vhaddw_qu_du(res0_m, res0_m); + sum_m = __lsx_vpickve2gr_w(res0_m, 0); + + return sum_m; +} + +static INLINE uint32_t hadd_uh_u32(__m128i in) { + __m128i res_m; + uint32_t sum_m; + + res_m = __lsx_vhaddw_wu_hu(in, in); + sum_m = hadd_uw_u32(res_m); + + return sum_m; +} + +static INLINE int32_t hadd_sw_s32(__m128i in) { + __m128i res0_m; + int32_t 
sum_m; + + res0_m = __lsx_vhaddw_d_w(in, in); + res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); + sum_m = __lsx_vpickve2gr_w(res0_m, 0); + + return sum_m; +} static uint32_t sad_8width_lsx(const uint8_t *src, int32_t src_stride, const uint8_t *ref, int32_t ref_stride, int32_t height) { int32_t ht_cnt; + uint32_t res; __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, sad_tmp; __m128i sad = __lsx_vldi(0); @@ -79,16 +83,18 @@ static uint32_t sad_8width_lsx(const uint8_t *src, int32_t src_stride, ref += ref_stride; DUP4_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, ref0, ref1); - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad = __lsx_vadd_h(sad, sad_tmp); } - return HADD_UH_U32(sad); + res = hadd_uh_u32(sad); + return res; } static uint32_t sad_16width_lsx(const uint8_t *src, int32_t src_stride, const uint8_t *ref, int32_t ref_stride, int32_t height) { int32_t ht_cnt = (height >> 2); + uint32_t res; __m128i src0, src1, ref0, ref1, sad_tmp; __m128i sad = __lsx_vldi(0); int32_t src_stride2 = src_stride << 1; @@ -99,23 +105,26 @@ static uint32_t sad_16width_lsx(const uint8_t *src, int32_t src_stride, DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1); src += src_stride2; ref += ref_stride2; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad = __lsx_vadd_h(sad, sad_tmp); DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0); DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1); src += src_stride2; ref += ref_stride2; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad = __lsx_vadd_h(sad, sad_tmp); } - return HADD_UH_U32(sad); + + res = hadd_uh_u32(sad); + return res; } static uint32_t sad_32width_lsx(const uint8_t *src, int32_t src_stride, const uint8_t *ref, int32_t ref_stride, int32_t height) { int32_t ht_cnt = (height >> 2); + uint32_t res; __m128i src0, src1, ref0, ref1; __m128i sad_tmp; __m128i sad = __lsx_vldi(0); @@ -125,31 +134,32 @@ static uint32_t sad_32width_lsx(const uint8_t *src, int32_t src_stride, src += src_stride; DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); ref += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad = __lsx_vadd_h(sad, sad_tmp); DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); src += src_stride; DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); ref += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad = __lsx_vadd_h(sad, sad_tmp); DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); src += src_stride; DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); ref += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad = __lsx_vadd_h(sad, sad_tmp); DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); src += src_stride; DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); ref += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad = __lsx_vadd_h(sad, sad_tmp); } - return HADD_UH_U32(sad); + res = hadd_uh_u32(sad); + return res; } static uint32_t sad_64width_lsx(const uint8_t *src, int32_t src_stride, @@ -170,9 +180,9 @@ static uint32_t sad_64width_lsx(const uint8_t *src, int32_t src_stride, DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, ref3); ref += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, 
ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad0 = __lsx_vadd_h(sad0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); sad1 = __lsx_vadd_h(sad1, sad_tmp); DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, @@ -181,14 +191,14 @@ static uint32_t sad_64width_lsx(const uint8_t *src, int32_t src_stride, DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, ref3); ref += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad0 = __lsx_vadd_h(sad0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); sad1 = __lsx_vadd_h(sad1, sad_tmp); } - sad = HADD_UH_U32(sad0); - sad += HADD_UH_U32(sad1); + sad = hadd_uh_u32(sad0); + sad += hadd_uh_u32(sad1); return sad; } @@ -247,25 +257,25 @@ static void sad_8width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride, DUP2_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, src0, src1); DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1); - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad0 = __lsx_vadd_h(sad0, sad_tmp); DUP2_ARG2(__lsx_vpickev_d, ref5, ref4, ref7, ref6, ref0, ref1); - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad1 = __lsx_vadd_h(sad1, sad_tmp); DUP2_ARG2(__lsx_vpickev_d, ref9, ref8, ref11, ref10, ref0, ref1); - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad2 = __lsx_vadd_h(sad2, sad_tmp); DUP2_ARG2(__lsx_vpickev_d, ref13, ref12, ref15, ref14, ref0, ref1); - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad3 = __lsx_vadd_h(sad3, sad_tmp); } - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); + sad_array[0] = hadd_uh_u32(sad0); + sad_array[1] = hadd_uh_u32(sad1); + sad_array[2] = hadd_uh_u32(sad2); + sad_array[3] = hadd_uh_u32(sad3); } static void sad_16width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride, @@ -334,10 +344,10 @@ static void sad_16width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride, sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); sad3 = __lsx_vadd_h(sad3, sad_tmp); } - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); + sad_array[0] = hadd_uh_u32(sad0); + sad_array[1] = hadd_uh_u32(sad1); + sad_array[2] = hadd_uh_u32(sad2); + sad_array[3] = hadd_uh_u32(sad3); } static void sad_32width_x4d_lsx(const uint8_t *src, int32_t src_stride, @@ -363,28 +373,28 @@ static void sad_32width_x4d_lsx(const uint8_t *src, int32_t src_stride, DUP2_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0, ref1); ref0_ptr += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad0 = __lsx_vadd_h(sad0, sad_tmp); DUP2_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref0, ref1); ref1_ptr += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad1 = __lsx_vadd_h(sad1, sad_tmp); DUP2_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref0, ref1); ref2_ptr += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad2 = __lsx_vadd_h(sad2, sad_tmp); DUP2_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref0, ref1); ref3_ptr 
+= ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad3 = __lsx_vadd_h(sad3, sad_tmp); } - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); + sad_array[0] = hadd_uh_u32(sad0); + sad_array[1] = hadd_uh_u32(sad1); + sad_array[2] = hadd_uh_u32(sad2); + sad_array[3] = hadd_uh_u32(sad3); } static void sad_64width_x4d_lsx(const uint8_t *src, int32_t src_stride, @@ -419,60 +429,60 @@ static void sad_64width_x4d_lsx(const uint8_t *src, int32_t src_stride, DUP4_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0_ptr, 32, ref0_ptr, 48, ref0, ref1, ref2, ref3); ref0_ptr += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad0_0 = __lsx_vadd_h(sad0_0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); sad0_1 = __lsx_vadd_h(sad0_1, sad_tmp); DUP4_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref1_ptr, 32, ref1_ptr, 48, ref0, ref1, ref2, ref3); ref1_ptr += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad1_0 = __lsx_vadd_h(sad1_0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); sad1_1 = __lsx_vadd_h(sad1_1, sad_tmp); DUP4_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref2_ptr, 32, ref2_ptr, 48, ref0, ref1, ref2, ref3); ref2_ptr += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad2_0 = __lsx_vadd_h(sad2_0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); sad2_1 = __lsx_vadd_h(sad2_1, sad_tmp); DUP4_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref3_ptr, 32, ref3_ptr, 48, ref0, ref1, ref2, ref3); ref3_ptr += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad3_0 = __lsx_vadd_h(sad3_0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); sad3_1 = __lsx_vadd_h(sad3_1, sad_tmp); } sad = __lsx_vhaddw_wu_hu(sad0_0, sad0_0); sad_tmp = __lsx_vhaddw_wu_hu(sad0_1, sad0_1); sad = __lsx_vadd_w(sad, sad_tmp); - sad_array[0] = HADD_UW_U32(sad); + sad_array[0] = hadd_uw_u32(sad); sad = __lsx_vhaddw_wu_hu(sad1_0, sad1_0); sad_tmp = __lsx_vhaddw_wu_hu(sad1_1, sad1_1); sad = __lsx_vadd_w(sad, sad_tmp); - sad_array[1] = HADD_UW_U32(sad); + sad_array[1] = hadd_uw_u32(sad); sad = __lsx_vhaddw_wu_hu(sad2_0, sad2_0); sad_tmp = __lsx_vhaddw_wu_hu(sad2_1, sad2_1); sad = __lsx_vadd_w(sad, sad_tmp); - sad_array[2] = HADD_UW_U32(sad); + sad_array[2] = hadd_uw_u32(sad); sad = __lsx_vhaddw_wu_hu(sad3_0, sad3_0); sad_tmp = __lsx_vhaddw_wu_hu(sad3_1, sad3_1); sad = __lsx_vadd_w(sad, sad_tmp); - sad_array[3] = HADD_UW_U32(sad); + sad_array[3] = hadd_uw_u32(sad); } static uint32_t avgsad_32width_lsx(const uint8_t *src, int32_t src_stride, const uint8_t *ref, int32_t ref_stride, int32_t height, const uint8_t *sec_pred) { - int32_t ht_cnt = (height >> 2); + int32_t res, ht_cnt = (height >> 2); __m128i src0, src1, src2, src3, src4, src5, src6, src7; __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; @@ -514,26 +524,26 @@ static uint32_t avgsad_32width_lsx(const uint8_t *src, int32_t src_stride, sec_pred += 128; DUP2_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, comp0, comp1); - 
sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); sad = __lsx_vadd_h(sad, sad_tmp); DUP2_ARG2(__lsx_vavgr_bu, pred2, ref2, pred3, ref3, comp0, comp1); - sad_tmp = SAD_UB2_UH(src2, src3, comp0, comp1); + sad_tmp = sad_ub2_uh(src2, src3, comp0, comp1); sad = __lsx_vadd_h(sad, sad_tmp); DUP2_ARG2(__lsx_vavgr_bu, pred4, ref4, pred5, ref5, comp0, comp1); - sad_tmp = SAD_UB2_UH(src4, src5, comp0, comp1); + sad_tmp = sad_ub2_uh(src4, src5, comp0, comp1); sad = __lsx_vadd_h(sad, sad_tmp); DUP2_ARG2(__lsx_vavgr_bu, pred6, ref6, pred7, ref7, comp0, comp1); - sad_tmp = SAD_UB2_UH(src6, src7, comp0, comp1); + sad_tmp = sad_ub2_uh(src6, src7, comp0, comp1); sad = __lsx_vadd_h(sad, sad_tmp); } - - return HADD_UH_U32(sad); + res = hadd_uh_u32(sad); + return res; } static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride, const uint8_t *ref, int32_t ref_stride, int32_t height, const uint8_t *sec_pred) { - int32_t ht_cnt = (height >> 2); + int32_t res, ht_cnt = (height >> 2); __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3; __m128i comp0, comp1, comp2, comp3, pred0, pred1, pred2, pred3; __m128i sad, sad_tmp; @@ -552,9 +562,9 @@ static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride, sec_pred += 64; DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, comp1, comp2, comp3); - sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); sad0 = __lsx_vadd_h(sad0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, comp2, comp3); + sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3); sad1 = __lsx_vadd_h(sad1, sad_tmp); DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, @@ -568,9 +578,9 @@ static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride, sec_pred += 64; DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, comp1, comp2, comp3); - sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); sad0 = __lsx_vadd_h(sad0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, comp2, comp3); + sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3); sad1 = __lsx_vadd_h(sad1, sad_tmp); DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, @@ -584,9 +594,9 @@ static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride, sec_pred += 64; DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, comp1, comp2, comp3); - sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); sad0 = __lsx_vadd_h(sad0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, comp2, comp3); + sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3); sad1 = __lsx_vadd_h(sad1, sad_tmp); DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, @@ -600,16 +610,17 @@ static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride, sec_pred += 64; DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, comp1, comp2, comp3); - sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); sad0 = __lsx_vadd_h(sad0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, comp2, comp3); + sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3); sad1 = __lsx_vadd_h(sad1, sad_tmp); } sad = __lsx_vhaddw_wu_hu(sad0, sad0); sad_tmp = __lsx_vhaddw_wu_hu(sad1, sad1); sad = __lsx_vadd_w(sad, sad_tmp); - return HADD_SW_S32(sad); + res = hadd_sw_s32(sad); + return res; } 
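/* Every SAD kernel in this file reduces through the same two steps:
 * sad_ub2_uh() folds absolute byte differences into 16-bit lanes, then
 * hadd_uh_u32()/hadd_sw_s32() collapse those lanes to a scalar. A scalar
 * model of one such step over two 16-byte rows (sad_two_rows_scalar is a
 * hypothetical name used only for this sketch): */
#include <stdint.h>
#include <stdlib.h>

static uint32_t sad_two_rows_scalar(const uint8_t *s0, const uint8_t *r0,
                                    const uint8_t *s1, const uint8_t *r1) {
  uint32_t sad = 0;
  int i;
  for (i = 0; i < 16; ++i) { /* one __m128i holds 16 bytes */
    sad += (uint32_t)abs(s0[i] - r0[i]);
    sad += (uint32_t)abs(s1[i] - r1[i]);
  }
  return sad;
}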
#define VPX_SAD_8xHT_LSX(height) \ diff --git a/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c index 54fcd6c571..d1abf622ad 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c @@ -57,13 +57,13 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_lsx( DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); src6 = __lsx_vxori_b(src6, 128); - tmp0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, + tmp0 = horiz_8tap_filt(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - tmp2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, + tmp2 = horiz_8tap_filt(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - tmp4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, + tmp4 = horiz_8tap_filt(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - tmp5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, + tmp5 = horiz_8tap_filt(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3); DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, @@ -87,17 +87,17 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_lsx( src2 = __lsx_vilvl_d(src3, src2); DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, src8, src9, src10); - tmp3 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, + tmp3 = horiz_8tap_filt(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff); tmp4 = __lsx_vpackev_b(tmp3, tmp4); - out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1, + out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - src1 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, + src1 = horiz_8tap_filt(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); src0 = __lsx_vshuf_b(src1, tmp3, shuff); src0 = __lsx_vpackev_b(src1, src0); - out1 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1, + out1 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1, filt_vt2, filt_vt3); out0 = __lsx_vssrarni_b_h(out1, out0, FILTER_BITS); out0 = __lsx_vxori_b(out0, 128); @@ -152,19 +152,19 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_lsx( DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); src6 = __lsx_vxori_b(src6, 128); - src0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, + src0 = horiz_8tap_filt(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, + src1 = horiz_8tap_filt(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, + src2 = horiz_8tap_filt(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, + src3 = horiz_8tap_filt(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, + src4 = horiz_8tap_filt(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src5 = HORIZ_8TAP_FILT(src5, src5, 
mask0, mask1, mask2, mask3, filt_hz0, + src5 = horiz_8tap_filt(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, + src6 = horiz_8tap_filt(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, @@ -181,25 +181,25 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_lsx( DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, src8, src9, src10); - src7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, + src7 = horiz_8tap_filt(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); tmp3 = __lsx_vpackev_b(src7, src6); - out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1, + out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - src8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, + src8 = horiz_8tap_filt(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); src0 = __lsx_vpackev_b(src8, src7); - out1 = FILT_8TAP_DPADD_S_H(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1, + out1 = filt_8tap_dpadd_s_h(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - src9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, + src9 = horiz_8tap_filt(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); src1 = __lsx_vpackev_b(src9, src8); - src3 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1, + src3 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - src10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, filt_hz0, + src10 = horiz_8tap_filt(src10, src10, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); src2 = __lsx_vpackev_b(src10, src9); - src4 = FILT_8TAP_DPADD_S_H(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1, + src4 = filt_8tap_dpadd_s_h(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1, filt_vt2, filt_vt3); DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, FILTER_BITS, src4, src3, FILTER_BITS, out0, out1); @@ -296,9 +296,9 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_lsx( DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, src, src_stride4, src1, src2, src3, src4); - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); - hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz); + hz_out4 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); @@ -348,11 +348,11 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_lsx( src, src_stride4, src5, src6, src7, src8); src += src_stride4; - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); - hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS); - hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS); - hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src2, 
src3, mask, filt_hz); + hz_out4 = horiz_2tap_filt_uh(src4, src5, mask, filt_hz); + hz_out6 = horiz_2tap_filt_uh(src6, src7, mask, filt_hz); + hz_out8 = horiz_2tap_filt_uh(src8, src8, mask, filt_hz); DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff, hz_out1, hz_out3); hz_out5 = __lsx_vshuf_b(hz_out6, hz_out4, shuff); @@ -449,20 +449,20 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_lsx( dst_tmp += dst_stride; dst3 = __lsx_vldrepl_d(dst_tmp, 0); DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); vec1 = __lsx_vpackev_b(hz_out0, hz_out1); tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt); - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); vec2 = __lsx_vpackev_b(hz_out1, hz_out0); tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); vec3 = __lsx_vpackev_b(hz_out0, hz_out1); tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt); DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, @@ -494,7 +494,7 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( src0 = __lsx_vld(src, 0); src += src_stride; - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); for (; loop_cnt--;) { src1 = __lsx_vld(src, 0); @@ -502,19 +502,19 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( src4 = __lsx_vldx(src, src_stride3); src += src_stride4; - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out0, hz_out1); tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out0, hz_out1); tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, @@ -571,8 +571,8 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); src += src_stride; - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); for (; loop_cnt--;) { src0 = __lsx_vld(src, 0); @@ -588,32 +588,32 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); dst3 = __lsx_vldx(dst, 
dst_stride3); - hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out3 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); tmp3 = __lsx_vavgr_bu(tmp3, dst0); __lsx_vst(tmp3, dst, 0); - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); tmp3 = __lsx_vavgr_bu(tmp3, dst1); __lsx_vstx(tmp3, dst, dst_stride); - hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + hz_out3 = horiz_2tap_filt_uh(src5, src5, mask, filt_hz); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); tmp3 = __lsx_vavgr_bu(tmp3, dst2); __lsx_vstx(tmp3, dst, dst_stride2); - hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src6, src6, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src7, src7, mask, filt_hz); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); diff --git a/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c index 584f241838..5c6413df44 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c @@ -68,9 +68,9 @@ static void common_vt_8t_and_aver_dst_4w_lsx(const uint8_t *src, tmp0, tmp1, tmp2, tmp3); DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4); DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4); - out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, reg3, filter0, filter1, + out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, reg3, filter0, filter1, filter2, filter3); - out1 = FILT_8TAP_DPADD_S_H(reg1, reg2, reg3, reg4, filter0, filter1, + out1 = filt_8tap_dpadd_s_h(reg1, reg2, reg3, reg4, filter0, filter1, filter2, filter3); out0 = __lsx_vssrarni_b_h(out1, out0, 7); out0 = __lsx_vxori_b(out0, 128); @@ -146,13 +146,13 @@ static void common_vt_8t_and_aver_dst_8w_lsx(const uint8_t *src, src8, src9, src10); DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, tmp0, tmp1, tmp2, tmp3); - out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, tmp0, filter0, filter1, + out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, tmp0, filter0, filter1, filter2, filter3); - out1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, tmp1, filter0, filter1, + out1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, tmp1, filter0, filter1, filter2, filter3); - out2 = FILT_8TAP_DPADD_S_H(reg1, reg2, tmp0, tmp2, filter0, filter1, + out2 = 
filt_8tap_dpadd_s_h(reg1, reg2, tmp0, tmp2, filter0, filter1, filter2, filter3); - out3 = FILT_8TAP_DPADD_S_H(reg4, reg5, tmp1, tmp3, filter0, filter1, + out3 = filt_8tap_dpadd_s_h(reg4, reg5, tmp1, tmp3, filter0, filter1, filter2, filter3); DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); @@ -231,13 +231,13 @@ static void common_vt_8t_and_aver_dst_16w_mult_lsx( src0, src1, src2, src3); DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9, src4, src5, src7, src8); - tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, src0, filter0, filter1, + tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1, filter2, filter3); - tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0, filter1, + tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1, filter2, filter3); - tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0, filter1, + tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1, filter2, filter3); - tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0, filter1, + tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1, filter2, filter3); DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); @@ -246,13 +246,13 @@ static void common_vt_8t_and_aver_dst_16w_mult_lsx( DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1); __lsx_vst(tmp0, dst_reg, 0); __lsx_vstx(tmp1, dst_reg, dst_stride); - tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0, filter1, + tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1, filter2, filter3); - tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0, filter1, + tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1, filter2, filter3); - tmp2 = FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0, filter1, + tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1, filter2, filter3); - tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0, filter1, + tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1, filter2, filter3); DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); diff --git a/vpx_dsp/loongarch/vpx_convolve8_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_lsx.c index 73583abb98..9f5cd6cfe9 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_lsx.c @@ -54,13 +54,13 @@ static void common_hv_8ht_8vt_4w_lsx(const uint8_t *src, int32_t src_stride, DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); src6 = __lsx_vxori_b(src6, 128); - tmp0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, + tmp0 = horiz_8tap_filt(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - tmp2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, + tmp2 = horiz_8tap_filt(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - tmp4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, + tmp4 = horiz_8tap_filt(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - tmp5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, + tmp5 = horiz_8tap_filt(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3); 
DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, @@ -73,17 +73,17 @@ static void common_hv_8ht_8vt_4w_lsx(const uint8_t *src, int32_t src_stride, src += src_stride; DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, src8, src9, src10); - tmp3 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, + tmp3 = horiz_8tap_filt(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff); tmp4 = __lsx_vpackev_b(tmp3, tmp4); - out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1, + out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - src1 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, + src1 = horiz_8tap_filt(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); src0 = __lsx_vshuf_b(src1, tmp3, shuff); src0 = __lsx_vpackev_b(src1, src0); - out1 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1, + out1 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1, filt_vt2, filt_vt3); out0 = __lsx_vssrarni_b_h(out1, out0, 7); out0 = __lsx_vxori_b(out0, 128); @@ -135,19 +135,19 @@ static void common_hv_8ht_8vt_8w_lsx(const uint8_t *src, int32_t src_stride, DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); src6 = __lsx_vxori_b(src6, 128); - src0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, + src0 = horiz_8tap_filt(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, + src1 = horiz_8tap_filt(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, + src2 = horiz_8tap_filt(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, + src3 = horiz_8tap_filt(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, + src4 = horiz_8tap_filt(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, + src5 = horiz_8tap_filt(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, + src6 = horiz_8tap_filt(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, @@ -161,25 +161,25 @@ static void common_hv_8ht_8vt_8w_lsx(const uint8_t *src, int32_t src_stride, src += src_stride; DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, src8, src9, src10); - src7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, + src7 = horiz_8tap_filt(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); tmp3 = __lsx_vpackev_b(src7, src6); - out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1, + out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - src8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, + src8 = horiz_8tap_filt(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); src0 = 
__lsx_vpackev_b(src8, src7); - out1 = FILT_8TAP_DPADD_S_H(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1, + out1 = filt_8tap_dpadd_s_h(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - src9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, + src9 = horiz_8tap_filt(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); src1 = __lsx_vpackev_b(src9, src8); - src3 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1, + src3 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - src10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, filt_hz0, + src10 = horiz_8tap_filt(src10, src10, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); src2 = __lsx_vpackev_b(src10, src9); - src4 = FILT_8TAP_DPADD_S_H(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1, + src4 = filt_8tap_dpadd_s_h(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1, filt_vt2, filt_vt3); DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1); DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); @@ -267,9 +267,9 @@ static void common_hv_2ht_2vt_4x4_lsx(const uint8_t *src, int32_t src_stride, src0 = __lsx_vld(src, 0); DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, src, src_stride4, src1, src2, src3, src4); - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); - hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz); + hz_out4 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2); @@ -316,11 +316,11 @@ static void common_hv_2ht_2vt_4x8_lsx(const uint8_t *src, int32_t src_stride, src, src_stride4, src5, src6, src7, src8); src += src_stride4; - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); - hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS); - hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS); - hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz); + hz_out4 = horiz_2tap_filt_uh(src4, src5, mask, filt_hz); + hz_out6 = horiz_2tap_filt_uh(src6, src7, mask, filt_hz); + hz_out8 = horiz_2tap_filt_uh(src8, src8, mask, filt_hz); DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff, hz_out1, hz_out3); @@ -382,20 +382,20 @@ static void common_hv_2ht_2vt_8x4_lsx(const uint8_t *src, int32_t src_stride, DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, src, src_stride4, src1, src2, src3, src4); - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); vec1 = __lsx_vpackev_b(hz_out0, hz_out1); tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt); - hz_out1 = 
HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); vec2 = __lsx_vpackev_b(hz_out1, hz_out0); tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); vec3 = __lsx_vpackev_b(hz_out0, hz_out1); tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt); @@ -430,7 +430,7 @@ static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src, src0 = __lsx_vld(src, 0); src += src_stride; - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); for (; loop_cnt--;) { src1 = __lsx_vld(src, 0); @@ -438,19 +438,19 @@ static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src, src4 = __lsx_vldx(src, src_stride3); src += src_stride4; - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out0, hz_out1); tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); src1 = __lsx_vld(src, 0); DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); src4 = __lsx_vldx(src, src_stride3); @@ -470,19 +470,19 @@ static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src, __lsx_vstelm_d(tmp2, dst, 0, 1); dst += dst_stride; - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out0, hz_out1); tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out0, hz_out1); tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt); @@ -534,8 +534,8 @@ static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride, DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); src += src_stride; - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); for (; loop_cnt--;) { uint8_t *src_tmp0 = src + 8; @@ -546,32 +546,32 @@ static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride, DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp0, src_stride3, src6, src7); src += src_stride4; - hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 
FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out3 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); __lsx_vst(tmp, dst, 0); dst += dst_stride; - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); __lsx_vst(tmp, dst, 0); dst += dst_stride; - hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + hz_out3 = horiz_2tap_filt_uh(src5, src5, mask, filt_hz); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); __lsx_vst(tmp, dst, 0); dst += dst_stride; - hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src6, src6, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src7, src7, mask, filt_hz); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); diff --git a/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c index 7e3a95b2fd..6022e43c83 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c @@ -52,9 +52,9 @@ static void common_vt_8t_4w_lsx(const uint8_t *src, int32_t src_stride, tmp0, tmp1, tmp2, tmp3); DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4); DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4); - out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, reg3, filter0, filter1, + out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, reg3, filter0, filter1, filter2, filter3); - out1 = FILT_8TAP_DPADD_S_H(reg1, reg2, reg3, reg4, filter0, filter1, + out1 = filt_8tap_dpadd_s_h(reg1, reg2, reg3, reg4, filter0, filter1, filter2, filter3); out0 = __lsx_vssrarni_b_h(out1, out0, 7); out0 = __lsx_vxori_b(out0, 128); @@ -116,13 +116,13 @@ static void common_vt_8t_8w_lsx(const uint8_t *src, int32_t src_stride, src8, src9, src10); DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, tmp0, tmp1, tmp2, tmp3); - out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, tmp0, filter0, filter1, + out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, tmp0, filter0, filter1, filter2, filter3); - out1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, tmp1, filter0, filter1, + out1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, tmp1, filter0, filter1, filter2, filter3); - out2 = FILT_8TAP_DPADD_S_H(reg1, reg2, tmp0, tmp2, filter0, filter1, + out2 = filt_8tap_dpadd_s_h(reg1, reg2, tmp0, tmp2, filter0, filter1, filter2, filter3); - out3 = FILT_8TAP_DPADD_S_H(reg4, reg5, tmp1, tmp3, filter0, filter1, + out3 = filt_8tap_dpadd_s_h(reg4, reg5, tmp1, tmp3, filter0, filter1, filter2, 
filter3); DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); @@ -192,13 +192,13 @@ static void common_vt_8t_16w_lsx(const uint8_t *src, int32_t src_stride, src0, src1, src2, src3); DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9, src4, src5, src7, src8); - tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, src0, filter0, filter1, + tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1, filter2, filter3); - tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0, filter1, + tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1, filter2, filter3); - tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0, filter1, + tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1, filter2, filter3); - tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0, filter1, + tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1, filter2, filter3); DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); @@ -206,13 +206,13 @@ static void common_vt_8t_16w_lsx(const uint8_t *src, int32_t src_stride, dst += dst_stride; __lsx_vst(tmp1, dst, 0); dst += dst_stride; - tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0, filter1, + tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1, filter2, filter3); - tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0, filter1, + tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1, filter2, filter3); - tmp2 = FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0, filter1, + tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1, filter2, filter3); - tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0, filter1, + tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1, filter2, filter3); DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); @@ -298,25 +298,25 @@ static void common_vt_8t_16w_mult_lsx(const uint8_t *src, int32_t src_stride, src0, src1, src2, src3); DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9, src4, src5, src7, src8); - tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, src0, filter0, filter1, + tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1, filter2, filter3); - tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0, filter1, + tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1, filter2, filter3); - tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0, filter1, + tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1, filter2, filter3); - tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0, filter1, + tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1, filter2, filter3); DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); __lsx_vst(tmp0, dst_tmp, 0); __lsx_vstx(tmp1, dst_tmp, dst_stride); - tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0, filter1, + tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1, filter2, filter3); - tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0, filter1, + tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1, filter2, filter3); - tmp2 = FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0, filter1, + tmp2 
= filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1, filter2, filter3); - tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0, filter1, + tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1, filter2, filter3); DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); diff --git a/vpx_dsp/loongarch/vpx_convolve_lsx.h b/vpx_dsp/loongarch/vpx_convolve_lsx.h index 2428407f2b..d886b00198 100644 --- a/vpx_dsp/loongarch/vpx_convolve_lsx.h +++ b/vpx_dsp/loongarch/vpx_convolve_lsx.h @@ -11,11 +11,50 @@ #ifndef VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_ #define VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_ -#include "vpx_util/loongson_intrinsics.h" +#include "./vpx_config.h" #include "vpx_dsp/vpx_filter.h" +#include "vpx_util/loongson_intrinsics.h" + +static INLINE __m128i filt_8tap_dpadd_s_h(__m128i _reg0, __m128i _reg1, + __m128i _reg2, __m128i _reg3, + __m128i _filter0, __m128i _filter1, + __m128i _filter2, __m128i _filter3) { + __m128i _vec0, _vec1; + + _vec0 = __lsx_vdp2_h_b(_reg0, _filter0); + _vec0 = __lsx_vdp2add_h_b(_vec0, _reg1, _filter1); + _vec1 = __lsx_vdp2_h_b(_reg2, _filter2); + _vec1 = __lsx_vdp2add_h_b(_vec1, _reg3, _filter3); + return __lsx_vsadd_h(_vec0, _vec1); +} + +static INLINE __m128i horiz_8tap_filt(__m128i _src0, __m128i _src1, + __m128i _mask0, __m128i _mask1, + __m128i _mask2, __m128i _mask3, + __m128i _filt_h0, __m128i _filt_h1, + __m128i _filt_h2, __m128i _filt_h3) { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + __m128i _out; + + DUP4_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src1, _src0, _mask1, _src1, + _src0, _mask2, _src1, _src0, _mask3, _tmp0, _tmp1, _tmp2, _tmp3); + _out = filt_8tap_dpadd_s_h(_tmp0, _tmp1, _tmp2, _tmp3, _filt_h0, _filt_h1, + _filt_h2, _filt_h3); + _out = __lsx_vsrari_h(_out, FILTER_BITS); + return __lsx_vsat_h(_out, 7); +} + +static INLINE __m128i horiz_2tap_filt_uh(__m128i in0, __m128i in1, __m128i mask, + __m128i coeff) { + __m128i tmp0_m, tmp1_m; + + tmp0_m = __lsx_vshuf_b(in1, in0, mask); + tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff); + return __lsx_vsrari_h(tmp1_m, FILTER_BITS); +} #define LSX_LD_4(_src, _stride, _src0, _src1, _src2, _src3) \ - { \ + do { \ _src0 = __lsx_vld(_src, 0); \ _src += _stride; \ _src1 = __lsx_vld(_src, 0); \ @@ -23,43 +62,12 @@ _src2 = __lsx_vld(_src, 0); \ _src += _stride; \ _src3 = __lsx_vld(_src, 0); \ - } - -#define FILT_8TAP_DPADD_S_H(_reg0, _reg1, _reg2, _reg3, _filter0, _filter1, \ - _filter2, _filter3) \ - ({ \ - __m128i _vec0, _vec1; \ - \ - _vec0 = __lsx_vdp2_h_b(_reg0, _filter0); \ - _vec0 = __lsx_vdp2add_h_b(_vec0, _reg1, _filter1); \ - _vec1 = __lsx_vdp2_h_b(_reg2, _filter2); \ - _vec1 = __lsx_vdp2add_h_b(_vec1, _reg3, _filter3); \ - _vec0 = __lsx_vsadd_h(_vec0, _vec1); \ - \ - _vec0; \ - }) - -#define HORIZ_8TAP_FILT(_src0, _src1, _mask0, _mask1, _mask2, _mask3, \ - _filt_h0, _filt_h1, _filt_h2, _filt_h3) \ - ({ \ - __m128i _tmp0, _tmp1, _tmp2, _tmp3; \ - __m128i _out; \ - \ - DUP4_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src1, _src0, _mask1, \ - _src1, _src0, _mask2, _src1, _src0, _mask3, _tmp0, _tmp1, _tmp2, \ - _tmp3); \ - _out = FILT_8TAP_DPADD_S_H(_tmp0, _tmp1, _tmp2, _tmp3, _filt_h0, _filt_h1, \ - _filt_h2, _filt_h3); \ - _out = __lsx_vsrari_h(_out, FILTER_BITS); \ - _out = __lsx_vsat_h(_out, 7); \ - \ - _out; \ - }) + } while (0) #define HORIZ_8TAP_4WID_4VECS_FILT(_src0, _src1, _src2, _src3, _mask0, _mask1, \ _mask2, _mask3, _filter0, _filter1, \ _filter2, _filter3, _out0, _out1) \ - { \ + do { \ 
__m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \ __m128i _reg0, _reg1, _reg2, _reg3; \ \ @@ -78,12 +86,12 @@ DUP2_ARG3(__lsx_vdp2add_h_b, _reg2, _tmp6, _filter3, _reg3, _tmp7, \ _filter3, _reg2, _reg3); \ DUP2_ARG2(__lsx_vsadd_h, _reg0, _reg2, _reg1, _reg3, _out0, _out1); \ - } + } while (0) #define HORIZ_8TAP_8WID_4VECS_FILT( \ _src0, _src1, _src2, _src3, _mask0, _mask1, _mask2, _mask3, _filter0, \ _filter1, _filter2, _filter3, _out0, _out1, _out2, _out3) \ - { \ + do { \ __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \ __m128i _reg0, _reg1, _reg2, _reg3, _reg4, _reg5, _reg6, _reg7; \ \ @@ -111,22 +119,10 @@ _reg5, _reg6, _reg7); \ DUP4_ARG2(__lsx_vsadd_h, _reg0, _reg4, _reg1, _reg5, _reg2, _reg6, _reg3, \ _reg7, _out0, _out1, _out2, _out3); \ - } - -#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \ - ({ \ - __m128i tmp0_m; \ - __m128i tmp1_m; \ - \ - tmp0_m = __lsx_vshuf_b(in1, in0, mask); \ - tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff); \ - tmp1_m = __lsx_vsrari_h(tmp1_m, shift); \ - \ - tmp1_m; \ - }) + } while (0) #define AVG_ST4_D(in0, in1, dst0, dst1, pdst, stride) \ - { \ + do { \ __m128i tmp0_m, tmp1_m; \ \ DUP2_ARG2(__lsx_vavgr_bu, in0, dst0, in1, dst1, tmp0_m, tmp1_m); \ @@ -137,6 +133,6 @@ __lsx_vstelm_d(tmp1_m, pdst, 0, 0); \ pdst += stride; \ __lsx_vstelm_d(tmp1_m, pdst, 0, 1); \ - } + } while (0) #endif // VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_ From b163db1a6a3e9c5e544a2c8e43b7a0d299d2b8c6 Mon Sep 17 00:00:00 2001 From: James Zern Date: Sun, 15 May 2022 15:39:15 -0700 Subject: [PATCH 303/926] tools/*.py: update to python3 only lint-hunks.py is tested as part of the presubmit; the rest may need further changes as they're used. Bug: b/229626362 Change-Id: I2fd6e96deab8d892d34527e484ea65e3df86d162 --- tools/diff.py | 2 +- tools/intersect-diffs.py | 4 ++-- tools/lint-hunks.py | 36 +++++++++++++++++++----------------- tools/wrap-commit-msg.py | 2 +- 4 files changed, 23 insertions(+), 21 deletions(-) diff --git a/tools/diff.py b/tools/diff.py index a96c7db851..860a6b051b 100644 --- a/tools/diff.py +++ b/tools/diff.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 ## Copyright (c) 2012 The WebM project authors. All Rights Reserved. ## ## Use of this source code is governed by a BSD-style license diff --git a/tools/intersect-diffs.py b/tools/intersect-diffs.py index 4dbafa90b7..590e687b47 100755 --- a/tools/intersect-diffs.py +++ b/tools/intersect-diffs.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 ## Copyright (c) 2012 The WebM project authors. All Rights Reserved. ## ## Use of this source code is governed by a BSD-style license @@ -69,7 +69,7 @@ def main(): break if out_hunks: - print FormatDiffHunks(out_hunks) + print(FormatDiffHunks(out_hunks)) sys.exit(1) if __name__ == "__main__": diff --git a/tools/lint-hunks.py b/tools/lint-hunks.py index 30d3249193..0a94afebb9 100755 --- a/tools/lint-hunks.py +++ b/tools/lint-hunks.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 ## Copyright (c) 2012 The WebM project authors. All Rights Reserved. 
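## Editorial aside, not part of the patch: the Python 3 conversions in
## this commit are a small mechanical set of rewrites; a minimal runnable
## sketch of the syntax changes involved:
import io
import sys
try:
    raise ValueError("demo")
except ValueError as err:            # py2 wrote: except ValueError, err
    print(err, file=sys.stderr)      # py2 wrote: print >>sys.stderr, err
buf = io.StringIO("in-memory text")  # py2 used: StringIO.StringIO(...)
for key, val in {"a": 1}.items():    # py2 used: dict.iteritems()
    print(key, val)                  # py2 used: the print statement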
## ## Use of this source code is governed by a BSD-style license @@ -10,7 +10,7 @@ """Performs style checking on each diff hunk.""" import getopt import os -import StringIO +import io import subprocess import sys @@ -63,21 +63,21 @@ def main(argv=None): try: try: opts, args = getopt.getopt(argv[1:], SHORT_OPTIONS, LONG_OPTIONS) - except getopt.error, msg: + except getopt.error as msg: raise Usage(msg) # process options for o, _ in opts: if o in ("-h", "--help"): - print __doc__ + print(__doc__) sys.exit(0) if args and len(args) > 1: - print __doc__ + print(__doc__) sys.exit(0) # Find the fully qualified path to the root of the tree - tl = Subprocess(TOPLEVEL_CMD, stdout=subprocess.PIPE) + tl = Subprocess(TOPLEVEL_CMD, stdout=subprocess.PIPE, text=True) tl = tl.communicate()[0].strip() # See if we're working on the index or not. @@ -93,9 +93,9 @@ def main(argv=None): # Get a list of all affected lines file_affected_line_map = {} - p = Subprocess(diff_cmd, stdout=subprocess.PIPE) + p = Subprocess(diff_cmd, stdout=subprocess.PIPE, text=True) stdout = p.communicate()[0] - for hunk in diff.ParseDiffHunks(StringIO.StringIO(stdout)): + for hunk in diff.ParseDiffHunks(io.StringIO(stdout)): filename = hunk.right.filename[2:] if filename not in file_affected_line_map: file_affected_line_map[filename] = set() @@ -103,7 +103,7 @@ def main(argv=None): # Run each affected file through cpplint lint_failed = False - for filename, affected_lines in file_affected_line_map.iteritems(): + for filename, affected_lines in file_affected_line_map.items(): if filename.split(".")[-1] not in ("c", "h", "cc"): continue if filename.startswith("third_party"): @@ -112,14 +112,16 @@ def main(argv=None): if args: # File contents come from git show_cmd = SHOW_CMD + [args[0] + ":" + filename] - show = Subprocess(show_cmd, stdout=subprocess.PIPE) + show = Subprocess(show_cmd, stdout=subprocess.PIPE, text=True) lint = Subprocess(cpplint_cmd, expected_returncode=(0, 1), - stdin=show.stdout, stderr=subprocess.PIPE) + stdin=show.stdout, stderr=subprocess.PIPE, + text=True) lint_out = lint.communicate()[1] else: # File contents come from the working tree lint = Subprocess(cpplint_cmd, expected_returncode=(0, 1), - stdin=subprocess.PIPE, stderr=subprocess.PIPE) + stdin=subprocess.PIPE, stderr=subprocess.PIPE, + text=True) stdin = open(os.path.join(tl, filename)).read() lint_out = lint.communicate(stdin)[1] @@ -129,17 +131,17 @@ def main(argv=None): continue warning_line_num = int(fields[1]) if warning_line_num in affected_lines: - print "%s:%d:%s"%(filename, warning_line_num, - ":".join(fields[2:])) + print("%s:%d:%s"%(filename, warning_line_num, + ":".join(fields[2:]))) lint_failed = True # Set exit code if any relevant lint errors seen if lint_failed: return 1 - except Usage, err: - print >>sys.stderr, err - print >>sys.stderr, "for help use --help" + except Usage as err: + print(err, file=sys.stderr) + print("for help use --help", file=sys.stderr) return 2 if __name__ == "__main__": diff --git a/tools/wrap-commit-msg.py b/tools/wrap-commit-msg.py index d5b4b046b1..ba3fa58732 100755 --- a/tools/wrap-commit-msg.py +++ b/tools/wrap-commit-msg.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 ## Copyright (c) 2012 The WebM project authors. All Rights Reserved. 
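## Editorial aside, not part of the patch: the text=True argument added to
## the Subprocess() calls in lint-hunks.py above makes py3 pipes carry str
## rather than bytes, so communicate() output can still feed the
## string-based hunk parser; a minimal sketch (POSIX echo assumed):
import subprocess
p = subprocess.Popen(["echo", "hello"], stdout=subprocess.PIPE, text=True)
out = p.communicate()[0]  # str under text=True; bytes without it
assert out.strip() == "hello"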
## ## Use of this source code is governed by a BSD-style license From c0cee345a36592d162a8c55349a517b84ef5810c Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 18 May 2022 19:00:56 -0700 Subject: [PATCH 304/926] y4m_test: check temp file ptr GetTempOutFile() and TempOutFile::file() may return null if the open fails Change-Id: Ib3ee9b592140d30d12aecefa7dfc5f569fa28a34 --- test/y4m_test.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/y4m_test.cc b/test/y4m_test.cc index 8272263f66..89c6552c5d 100644 --- a/test/y4m_test.cc +++ b/test/y4m_test.cc @@ -196,6 +196,7 @@ static const char kY4MRegularHeader[] = TEST(Y4MHeaderTest, RegularHeader) { libvpx_test::TempOutFile f; + ASSERT_NE(f.file(), nullptr); fwrite(kY4MRegularHeader, 1, sizeof(kY4MRegularHeader), f.file()); fflush(f.file()); EXPECT_EQ(0, fseek(f.file(), 0, 0)); @@ -222,6 +223,7 @@ static const char kY4MLongHeader[] = TEST(Y4MHeaderTest, LongHeader) { libvpx_test::TempOutFile f; + ASSERT_NE(f.file(), nullptr); fwrite(kY4MLongHeader, 1, sizeof(kY4MLongHeader), f.file()); fflush(f.file()); EXPECT_EQ(fseek(f.file(), 0, 0), 0); From 44874ab879455941c977910daba1b80788f243da Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Wed, 25 May 2022 09:42:35 +0800 Subject: [PATCH 305/926] loongarch: Remove redundant code Simplify architecture support code and remove redundant code to improve efficiency. Bug: webm:1755 Change-Id: I03bc251aca115b0379fe19907abd165e0876355b --- build/make/rtcd.pl | 15 +- vp8/common/loongarch/loopfilter_filters_lsx.c | 12 +- vpx_dsp/loongarch/loopfilter_16_lsx.c | 180 +++++----- vpx_dsp/loongarch/loopfilter_4_lsx.c | 36 +- vpx_dsp/loongarch/loopfilter_8_lsx.c | 315 +++++++++--------- 5 files changed, 262 insertions(+), 296 deletions(-) diff --git a/build/make/rtcd.pl b/build/make/rtcd.pl index 8ed776add8..9c97268426 100755 --- a/build/make/rtcd.pl +++ b/build/make/rtcd.pl @@ -494,20 +494,7 @@ () @ALL_ARCHS = filter(qw/vsx/); ppc; } elsif ($opts{arch} =~ /loongarch/ ) { - @ALL_ARCHS = filter("$opts{arch}"); - open CONFIG_FILE, $opts{config} or - die "Error opening config file '$opts{config}': $!\n"; - while () { - if (/HAVE_LSX=yes/) { - @ALL_ARCHS = filter("$opts{arch}", qw/lsx/); - last; - } - if (/HAVE_LASX=yes/) { - @ALL_ARCHS = filter("$opts{arch}", qw/lasx/); - last; - } - } - close CONFIG_FILE; + @ALL_ARCHS = filter(qw/lsx lasx/); loongarch; } else { unoptimized; diff --git a/vp8/common/loongarch/loopfilter_filters_lsx.c b/vp8/common/loongarch/loopfilter_filters_lsx.c index f743ec0c50..79c3ea6dbb 100644 --- a/vp8/common/loongarch/loopfilter_filters_lsx.c +++ b/vp8/common/loongarch/loopfilter_filters_lsx.c @@ -172,16 +172,16 @@ static void loop_filter_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch, DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch_x2, q1, q2); q3 = __lsx_vldx(src, pitch_x3); - thresh0 = __lsx_vreplgr2vr_b(*thresh0_ptr); - thresh1 = __lsx_vreplgr2vr_b(*thresh1_ptr); + thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0); + thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0); thresh0 = __lsx_vilvl_d(thresh1, thresh0); - b_limit0 = __lsx_vreplgr2vr_b(*b_limit0_ptr); - b_limit1 = __lsx_vreplgr2vr_b(*b_limit1_ptr); + b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0); + b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0); b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0); - limit0 = __lsx_vreplgr2vr_b(*limit0_ptr); - limit1 = __lsx_vreplgr2vr_b(*limit1_ptr); + limit0 = __lsx_vldrepl_b(limit0_ptr, 0); + limit1 = __lsx_vldrepl_b(limit1_ptr, 0); limit0 = __lsx_vilvl_d(limit1, limit0); LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, 
b_limit0, thresh0, hev, diff --git a/vpx_dsp/loongarch/loopfilter_16_lsx.c b/vpx_dsp/loongarch/loopfilter_16_lsx.c index 539817777d..0503df9966 100644 --- a/vpx_dsp/loongarch/loopfilter_16_lsx.c +++ b/vpx_dsp/loongarch/loopfilter_16_lsx.c @@ -55,7 +55,6 @@ static int32_t hz_lpf_t4_and_t8_16w(uint8_t *dst, int32_t stride, __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; - __m128i zero = __lsx_vldi(0); int32_t stride2 = stride << 1; int32_t stride3 = stride2 + stride; @@ -69,9 +68,9 @@ static int32_t hz_lpf_t4_and_t8_16w(uint8_t *dst, int32_t stride, DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); q3 = __lsx_vldx(dst, stride3); - thresh = __lsx_vreplgr2vr_b(*thresh_ptr); - b_limit = __lsx_vreplgr2vr_b(*b_limit_ptr); - limit = __lsx_vreplgr2vr_b(*limit_ptr); + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); /* mask and hev */ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, mask, flat); @@ -87,18 +86,16 @@ static int32_t hz_lpf_t4_and_t8_16w(uint8_t *dst, int32_t stride, return 1; } - DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, - p1_l, p0_l); - DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, - q2_l, q3_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l, + p0_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l, + q3_l); VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); - DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_h, p2_h, - p1_h, p0_h); - DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_h, q1_h, - q2_h, q3_h); + DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h); + DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h); VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); @@ -135,7 +132,6 @@ static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { uint8_t *dst_tmp1 = dst + stride4; __m128i flat, flat2, filter8; - __m128i zero = __lsx_vldi(0); __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; __m128i out_h, out_l; v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in; @@ -180,16 +176,15 @@ static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { } else { dst = dst_tmp0 - stride3; - p7_l_in = (v8u16)__lsx_vilvl_b(zero, p7); - p6_l_in = (v8u16)__lsx_vilvl_b(zero, p6); - p5_l_in = (v8u16)__lsx_vilvl_b(zero, p5); - p4_l_in = (v8u16)__lsx_vilvl_b(zero, p4); - p3_l_in = (v8u16)__lsx_vilvl_b(zero, p3); - p2_l_in = (v8u16)__lsx_vilvl_b(zero, p2); - p1_l_in = (v8u16)__lsx_vilvl_b(zero, p1); - p0_l_in = (v8u16)__lsx_vilvl_b(zero, p0); - - q0_l_in = (v8u16)__lsx_vilvl_b(zero, q0); + p7_l_in = (v8u16)__lsx_vsllwil_hu_bu(p7, 0); + p6_l_in = (v8u16)__lsx_vsllwil_hu_bu(p6, 0); + p5_l_in = (v8u16)__lsx_vsllwil_hu_bu(p5, 0); + p4_l_in = (v8u16)__lsx_vsllwil_hu_bu(p4, 0); + p3_l_in = (v8u16)__lsx_vsllwil_hu_bu(p3, 0); + p2_l_in = (v8u16)__lsx_vsllwil_hu_bu(p2, 0); + p1_l_in = (v8u16)__lsx_vsllwil_hu_bu(p1, 0); + p0_l_in = (v8u16)__lsx_vsllwil_hu_bu(p0, 0); + q0_l_in = (v8u16)__lsx_vsllwil_hu_bu(q0, 0); tmp0_l = p7_l_in << 3; tmp0_l -= p7_l_in; @@ -205,16 +200,15 @@ static void hz_lpf_t16_16w(uint8_t *dst, int32_t 
stride, uint8_t *filter48) { out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - p7_h_in = (v8u16)__lsx_vilvh_b(zero, p7); - p6_h_in = (v8u16)__lsx_vilvh_b(zero, p6); - p5_h_in = (v8u16)__lsx_vilvh_b(zero, p5); - p4_h_in = (v8u16)__lsx_vilvh_b(zero, p4); - - p3_h_in = (v8u16)__lsx_vilvh_b(zero, p3); - p2_h_in = (v8u16)__lsx_vilvh_b(zero, p2); - p1_h_in = (v8u16)__lsx_vilvh_b(zero, p1); - p0_h_in = (v8u16)__lsx_vilvh_b(zero, p0); - q0_h_in = (v8u16)__lsx_vilvh_b(zero, q0); + p7_h_in = (v8u16)__lsx_vexth_hu_bu(p7); + p6_h_in = (v8u16)__lsx_vexth_hu_bu(p6); + p5_h_in = (v8u16)__lsx_vexth_hu_bu(p5); + p4_h_in = (v8u16)__lsx_vexth_hu_bu(p4); + p3_h_in = (v8u16)__lsx_vexth_hu_bu(p3); + p2_h_in = (v8u16)__lsx_vexth_hu_bu(p2); + p1_h_in = (v8u16)__lsx_vexth_hu_bu(p1); + p0_h_in = (v8u16)__lsx_vexth_hu_bu(p0); + q0_h_in = (v8u16)__lsx_vexth_hu_bu(q0); tmp0_h = p7_h_in << 3; tmp0_h -= p7_h_in; @@ -236,14 +230,14 @@ static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { dst += stride; /* p5 */ - q1_l_in = (v8u16)__lsx_vilvl_b(zero, q1); + q1_l_in = (v8u16)__lsx_vsllwil_hu_bu(q1, 0); tmp0_l = p5_l_in - p6_l_in; tmp0_l += q1_l_in; tmp0_l -= p7_l_in; tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - q1_h_in = (v8u16)__lsx_vilvh_b(zero, q1); + q1_h_in = (v8u16)__lsx_vexth_hu_bu(q1); tmp0_h = p5_h_in - p6_h_in; tmp0_h += q1_h_in; tmp0_h -= p7_h_in; @@ -256,14 +250,14 @@ static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { dst += stride; /* p4 */ - q2_l_in = (v8u16)__lsx_vilvl_b(zero, q2); + q2_l_in = (v8u16)__lsx_vsllwil_hu_bu(q2, 0); tmp0_l = p4_l_in - p5_l_in; tmp0_l += q2_l_in; tmp0_l -= p7_l_in; tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - q2_h_in = (v8u16)__lsx_vilvh_b(zero, q2); + q2_h_in = (v8u16)__lsx_vexth_hu_bu(q2); tmp0_h = p4_h_in - p5_h_in; tmp0_h += q2_h_in; tmp0_h -= p7_h_in; @@ -276,14 +270,14 @@ static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { dst += stride; /* p3 */ - q3_l_in = (v8u16)__lsx_vilvl_b(zero, q3); + q3_l_in = (v8u16)__lsx_vsllwil_hu_bu(q3, 0); tmp0_l = p3_l_in - p4_l_in; tmp0_l += q3_l_in; tmp0_l -= p7_l_in; tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - q3_h_in = (v8u16)__lsx_vilvh_b(zero, q3); + q3_h_in = (v8u16)__lsx_vexth_hu_bu(q3); tmp0_h = p3_h_in - p4_h_in; tmp0_h += q3_h_in; tmp0_h -= p7_h_in; @@ -296,7 +290,7 @@ static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { dst += stride; /* p2 */ - q4_l_in = (v8u16)__lsx_vilvl_b(zero, q4); + q4_l_in = (v8u16)__lsx_vsllwil_hu_bu(q4, 0); filter8 = __lsx_vld(filter48, 0); tmp0_l = p2_l_in - p3_l_in; tmp0_l += q4_l_in; @@ -304,7 +298,7 @@ static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - q4_h_in = (v8u16)__lsx_vilvh_b(zero, q4); + q4_h_in = (v8u16)__lsx_vexth_hu_bu(q4); tmp0_h = p2_h_in - p3_h_in; tmp0_h += q4_h_in; tmp0_h -= p7_h_in; @@ -317,7 +311,7 @@ static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { dst += stride; /* p1 */ - q5_l_in = (v8u16)__lsx_vilvl_b(zero, q5); + q5_l_in = (v8u16)__lsx_vsllwil_hu_bu(q5, 0); filter8 = __lsx_vld(filter48, 16); tmp0_l = p1_l_in - p2_l_in; tmp0_l += q5_l_in; @@ -325,7 +319,7 @@ static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - q5_h_in = (v8u16)__lsx_vilvh_b(zero, q5); + q5_h_in = (v8u16)__lsx_vexth_hu_bu(q5); tmp0_h = p1_h_in - p2_h_in; 
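/* Editorial aside, not part of the patch: each output of the wide (flat2)
 * filter above and below is produced from a running sum -- tmp1_l/tmp1_h
 * hold the previous 16-weight total, and each step folds in one incoming
 * sample (q5 for the p1 output), drops one p7 tap, and shifts weight
 * between adjacent taps before the __lsx_vsrari_h(..., 4) rounding, so
 * each output costs a few adds/subs instead of a full re-sum. */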
tmp0_h += q5_h_in; tmp0_h -= p7_h_in; @@ -338,7 +332,7 @@ static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { dst += stride; /* p0 */ - q6_l_in = (v8u16)__lsx_vilvl_b(zero, q6); + q6_l_in = (v8u16)__lsx_vsllwil_hu_bu(q6, 0); filter8 = __lsx_vld(filter48, 32); tmp0_l = p0_l_in - p1_l_in; tmp0_l += q6_l_in; @@ -346,7 +340,7 @@ static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - q6_h_in = (v8u16)__lsx_vilvh_b(zero, q6); + q6_h_in = (v8u16)__lsx_vexth_hu_bu(q6); tmp0_h = p0_h_in - p1_h_in; tmp0_h += q6_h_in; tmp0_h -= p7_h_in; @@ -359,7 +353,7 @@ static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { dst += stride; /* q0 */ - q7_l_in = (v8u16)__lsx_vilvl_b(zero, q7); + q7_l_in = (v8u16)__lsx_vsllwil_hu_bu(q7, 0); filter8 = __lsx_vld(filter48, 48); tmp0_l = q7_l_in - p0_l_in; tmp0_l += q0_l_in; @@ -367,7 +361,7 @@ static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - q7_h_in = (v8u16)__lsx_vilvh_b(zero, q7); + q7_h_in = (v8u16)__lsx_vexth_hu_bu(q7); tmp0_h = q7_h_in - p0_h_in; tmp0_h += q0_h_in; tmp0_h -= p7_h_in; @@ -534,9 +528,9 @@ static void mb_lpf_horizontal_edge(uint8_t *dst, int32_t stride, DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); q3 = __lsx_vldx(dst, stride3); - thresh = __lsx_vreplgr2vr_b(*thresh_ptr); - b_limit = __lsx_vreplgr2vr_b(*b_limit_ptr); - limit = __lsx_vreplgr2vr_b(*limit_ptr); + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); /* filter_mask* */ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, @@ -850,15 +844,14 @@ static int32_t vt_lpf_t4_and_t8_16w(uint8_t *dst, uint8_t *filter48, __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; __m128i vec0, vec1, vec2, vec3, vec4, vec5; - __m128i zero = __lsx_vldi(0); /* load vector elements */ DUP4_ARG2(__lsx_vld, dst, -64, dst, -48, dst, -32, dst, -16, p3, p2, p1, p0); DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3); - thresh = __lsx_vreplgr2vr_b(*thresh_ptr); - b_limit = __lsx_vreplgr2vr_b(*b_limit_ptr); - limit = __lsx_vreplgr2vr_b(*limit_ptr); + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); /* mask and hev */ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, @@ -901,16 +894,14 @@ static int32_t vt_lpf_t4_and_t8_16w(uint8_t *dst, uint8_t *filter48, return 1; } - DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, - p1_l, p0_l); - DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, - q2_l, q3_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l, + p0_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l, + q3_l); VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); - DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_h, p2_h, - p1_h, p0_h); - DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_h, q1_h, - q2_h, q3_h); + DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h); + DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h); VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, 
q3_h, p2_filt8_h, p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); @@ -942,7 +933,6 @@ static int32_t vt_lpf_t4_and_t8_16w(uint8_t *dst, uint8_t *filter48, static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride, uint8_t *filter48) { - __m128i zero = __lsx_vldi(0); __m128i flat, flat2, filter8; __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; __m128i out_l, out_h; @@ -1038,15 +1028,15 @@ static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride, dst -= 7 * 16; - p7_l_in = (v8u16)__lsx_vilvl_b(zero, p7); - p6_l_in = (v8u16)__lsx_vilvl_b(zero, p6); - p5_l_in = (v8u16)__lsx_vilvl_b(zero, p5); - p4_l_in = (v8u16)__lsx_vilvl_b(zero, p4); - p3_l_in = (v8u16)__lsx_vilvl_b(zero, p3); - p2_l_in = (v8u16)__lsx_vilvl_b(zero, p2); - p1_l_in = (v8u16)__lsx_vilvl_b(zero, p1); - p0_l_in = (v8u16)__lsx_vilvl_b(zero, p0); - q0_l_in = (v8u16)__lsx_vilvl_b(zero, q0); + p7_l_in = (v8u16)__lsx_vsllwil_hu_bu(p7, 0); + p6_l_in = (v8u16)__lsx_vsllwil_hu_bu(p6, 0); + p5_l_in = (v8u16)__lsx_vsllwil_hu_bu(p5, 0); + p4_l_in = (v8u16)__lsx_vsllwil_hu_bu(p4, 0); + p3_l_in = (v8u16)__lsx_vsllwil_hu_bu(p3, 0); + p2_l_in = (v8u16)__lsx_vsllwil_hu_bu(p2, 0); + p1_l_in = (v8u16)__lsx_vsllwil_hu_bu(p1, 0); + p0_l_in = (v8u16)__lsx_vsllwil_hu_bu(p0, 0); + q0_l_in = (v8u16)__lsx_vsllwil_hu_bu(q0, 0); tmp0_l = p7_l_in << 3; tmp0_l -= p7_l_in; @@ -1060,15 +1050,15 @@ static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride, tmp1_l += p0_l_in; tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - p7_h_in = (v8u16)__lsx_vilvh_b(zero, p7); - p6_h_in = (v8u16)__lsx_vilvh_b(zero, p6); - p5_h_in = (v8u16)__lsx_vilvh_b(zero, p5); - p4_h_in = (v8u16)__lsx_vilvh_b(zero, p4); - p3_h_in = (v8u16)__lsx_vilvh_b(zero, p3); - p2_h_in = (v8u16)__lsx_vilvh_b(zero, p2); - p1_h_in = (v8u16)__lsx_vilvh_b(zero, p1); - p0_h_in = (v8u16)__lsx_vilvh_b(zero, p0); - q0_h_in = (v8u16)__lsx_vilvh_b(zero, q0); + p7_h_in = (v8u16)__lsx_vexth_hu_bu(p7); + p6_h_in = (v8u16)__lsx_vexth_hu_bu(p6); + p5_h_in = (v8u16)__lsx_vexth_hu_bu(p5); + p4_h_in = (v8u16)__lsx_vexth_hu_bu(p4); + p3_h_in = (v8u16)__lsx_vexth_hu_bu(p3); + p2_h_in = (v8u16)__lsx_vexth_hu_bu(p2); + p1_h_in = (v8u16)__lsx_vexth_hu_bu(p1); + p0_h_in = (v8u16)__lsx_vexth_hu_bu(p0); + q0_h_in = (v8u16)__lsx_vexth_hu_bu(q0); tmp0_h = p7_h_in << 3; tmp0_h -= p7_h_in; @@ -1088,13 +1078,13 @@ static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride, __lsx_vst(p6, dst, 0); /* p5 */ - q1_l_in = (v8u16)__lsx_vilvl_b(zero, q1); + q1_l_in = (v8u16)__lsx_vsllwil_hu_bu(q1, 0); tmp0_l = p5_l_in - p6_l_in; tmp0_l += q1_l_in; tmp0_l -= p7_l_in; tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - q1_h_in = (v8u16)__lsx_vilvh_b(zero, q1); + q1_h_in = (v8u16)__lsx_vexth_hu_bu(q1); tmp0_h = p5_h_in - p6_h_in; tmp0_h += q1_h_in; tmp0_h -= p7_h_in; @@ -1105,13 +1095,13 @@ static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride, __lsx_vst(p5, dst, 16); /* p4 */ - q2_l_in = (v8u16)__lsx_vilvl_b(zero, q2); + q2_l_in = (v8u16)__lsx_vsllwil_hu_bu(q2, 0); tmp0_l = p4_l_in - p5_l_in; tmp0_l += q2_l_in; tmp0_l -= p7_l_in; tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - q2_h_in = (v8u16)__lsx_vilvh_b(zero, q2); + q2_h_in = (v8u16)__lsx_vexth_hu_bu(q2); tmp0_h = p4_h_in - p5_h_in; tmp0_h += q2_h_in; tmp0_h -= p7_h_in; @@ -1122,13 +1112,13 @@ static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride, __lsx_vst(p4, dst, 16 * 2); /* p3 */ - 
q3_l_in = (v8u16)__lsx_vilvl_b(zero, q3); + q3_l_in = (v8u16)__lsx_vsllwil_hu_bu(q3, 0); tmp0_l = p3_l_in - p4_l_in; tmp0_l += q3_l_in; tmp0_l -= p7_l_in; tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - q3_h_in = (v8u16)__lsx_vilvh_b(zero, q3); + q3_h_in = (v8u16)__lsx_vexth_hu_bu(q3); tmp0_h = p3_h_in - p4_h_in; tmp0_h += q3_h_in; tmp0_h -= p7_h_in; @@ -1139,14 +1129,14 @@ static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride, __lsx_vst(p3, dst, 16 * 3); /* p2 */ - q4_l_in = (v8u16)__lsx_vilvl_b(zero, q4); + q4_l_in = (v8u16)__lsx_vsllwil_hu_bu(q4, 0); filter8 = __lsx_vld(filter48, 0); tmp0_l = p2_l_in - p3_l_in; tmp0_l += q4_l_in; tmp0_l -= p7_l_in; tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - q4_h_in = (v8u16)__lsx_vilvh_b(zero, q4); + q4_h_in = (v8u16)__lsx_vexth_hu_bu(q4); tmp0_h = p2_h_in - p3_h_in; tmp0_h += q4_h_in; tmp0_h -= p7_h_in; @@ -1157,14 +1147,14 @@ static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride, __lsx_vst(filter8, dst, 16 * 4); /* p1 */ - q5_l_in = (v8u16)__lsx_vilvl_b(zero, q5); + q5_l_in = (v8u16)__lsx_vsllwil_hu_bu(q5, 0); filter8 = __lsx_vld(filter48, 16); tmp0_l = p1_l_in - p2_l_in; tmp0_l += q5_l_in; tmp0_l -= p7_l_in; tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - q5_h_in = (v8u16)__lsx_vilvh_b(zero, q5); + q5_h_in = (v8u16)__lsx_vexth_hu_bu(q5); tmp0_h = p1_h_in - p2_h_in; tmp0_h += q5_h_in; tmp0_h -= p7_h_in; @@ -1175,14 +1165,14 @@ static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride, __lsx_vst(filter8, dst, 16 * 5); /* p0 */ - q6_l_in = (v8u16)__lsx_vilvl_b(zero, q6); + q6_l_in = (v8u16)__lsx_vsllwil_hu_bu(q6, 0); filter8 = __lsx_vld(filter48, 32); tmp0_l = p0_l_in - p1_l_in; tmp0_l += q6_l_in; tmp0_l -= p7_l_in; tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - q6_h_in = (v8u16)__lsx_vilvh_b(zero, q6); + q6_h_in = (v8u16)__lsx_vexth_hu_bu(q6); tmp0_h = p0_h_in - p1_h_in; tmp0_h += q6_h_in; tmp0_h -= p7_h_in; @@ -1193,14 +1183,14 @@ static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride, __lsx_vst(filter8, dst, 16 * 6); /* q0 */ - q7_l_in = (v8u16)__lsx_vilvl_b(zero, q7); + q7_l_in = (v8u16)__lsx_vsllwil_hu_bu(q7, 0); filter8 = __lsx_vld(filter48, 48); tmp0_l = q7_l_in - p0_l_in; tmp0_l += q0_l_in; tmp0_l -= p7_l_in; tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - q7_h_in = (v8u16)__lsx_vilvh_b(zero, q7); + q7_h_in = (v8u16)__lsx_vexth_hu_bu(q7); tmp0_h = q7_h_in - p0_h_in; tmp0_h += q0_h_in; tmp0_h -= p7_h_in; diff --git a/vpx_dsp/loongarch/loopfilter_4_lsx.c b/vpx_dsp/loongarch/loopfilter_4_lsx.c index e8abf0523f..9300b5c5ae 100644 --- a/vpx_dsp/loongarch/loopfilter_4_lsx.c +++ b/vpx_dsp/loongarch/loopfilter_4_lsx.c @@ -27,9 +27,9 @@ void vpx_lpf_horizontal_4_lsx(uint8_t *src, int32_t pitch, DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch2, q1, q2); q3 = __lsx_vldx(src, pitch3); - thresh = __lsx_vreplgr2vr_b(*thresh_ptr); - b_limit = __lsx_vreplgr2vr_b(*b_limit_ptr); - limit = __lsx_vreplgr2vr_b(*limit_ptr); + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, mask, flat); @@ -60,16 +60,16 @@ void vpx_lpf_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch, DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch2, q1, q2); q3 = __lsx_vldx(src, pitch3); - thresh0 = __lsx_vreplgr2vr_b(*thresh0_ptr); - thresh1 = __lsx_vreplgr2vr_b(*thresh1_ptr); + 
thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0); + thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0); thresh0 = __lsx_vilvl_d(thresh1, thresh0); - b_limit0 = __lsx_vreplgr2vr_b(*b_limit0_ptr); - b_limit1 = __lsx_vreplgr2vr_b(*b_limit1_ptr); + b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0); + b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0); b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0); - limit0 = __lsx_vreplgr2vr_b(*limit0_ptr); - limit1 = __lsx_vreplgr2vr_b(*limit1_ptr); + limit0 = __lsx_vldrepl_b(limit0_ptr, 0); + limit1 = __lsx_vldrepl_b(limit1_ptr, 0); limit0 = __lsx_vilvl_d(limit1, limit0); LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, @@ -102,9 +102,9 @@ void vpx_lpf_vertical_4_lsx(uint8_t *src, int32_t pitch, DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, q1, q2); q3 = __lsx_vldx(src_tmp, pitch3); - thresh = __lsx_vreplgr2vr_b(*thresh_ptr); - b_limit = __lsx_vreplgr2vr_b(*b_limit_ptr); - limit = __lsx_vreplgr2vr_b(*limit_ptr); + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, q3); @@ -169,16 +169,16 @@ void vpx_lpf_vertical_4_dual_lsx(uint8_t *src, int32_t pitch, row9, row10, row11, row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2, q3); - thresh0 = __lsx_vreplgr2vr_b(*thresh0_ptr); - thresh1 = __lsx_vreplgr2vr_b(*thresh1_ptr); + thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0); + thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0); thresh0 = __lsx_vilvl_d(thresh1, thresh0); - b_limit0 = __lsx_vreplgr2vr_b(*b_limit0_ptr); - b_limit1 = __lsx_vreplgr2vr_b(*b_limit1_ptr); + b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0); + b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0); b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0); - limit0 = __lsx_vreplgr2vr_b(*limit0_ptr); - limit1 = __lsx_vreplgr2vr_b(*limit1_ptr); + limit0 = __lsx_vldrepl_b(limit0_ptr, 0); + limit1 = __lsx_vldrepl_b(limit1_ptr, 0); limit0 = __lsx_vilvl_d(limit1, limit0); LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, diff --git a/vpx_dsp/loongarch/loopfilter_8_lsx.c b/vpx_dsp/loongarch/loopfilter_8_lsx.c index 358e221662..00219ba71d 100644 --- a/vpx_dsp/loongarch/loopfilter_8_lsx.c +++ b/vpx_dsp/loongarch/loopfilter_8_lsx.c @@ -17,11 +17,10 @@ void vpx_lpf_horizontal_8_lsx(uint8_t *dst, int32_t stride, const uint8_t *thresh_ptr) { __m128i mask, hev, flat, thresh, b_limit, limit; __m128i p3, p2, p1, p0, q3, q2, q1, q0; - __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + __m128i p2_out, p1_out, p0_out, q0_out, q1_out; __m128i p2_filter8, p1_filter8, p0_filter8; __m128i q0_filter8, q1_filter8, q2_filter8; __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l; - __m128i zero = __lsx_vldi(0); int32_t stride2 = stride << 1; int32_t stride3 = stride2 + stride; @@ -34,16 +33,16 @@ void vpx_lpf_horizontal_8_lsx(uint8_t *dst, int32_t stride, DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); q3 = __lsx_vldx(dst, stride3); - thresh = __lsx_vreplgr2vr_b(*thresh_ptr); - b_limit = __lsx_vreplgr2vr_b(*b_limit_ptr); - limit = __lsx_vreplgr2vr_b(*limit_ptr); + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, mask, flat); VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); - flat = __lsx_vilvl_d(zero, flat); + flat = 
__lsx_vilvl_d(flat, flat); if (__lsx_bz_v(flat)) { __lsx_vstelm_d(p1_out, dst - stride2, 0, 0); @@ -51,35 +50,35 @@ void vpx_lpf_horizontal_8_lsx(uint8_t *dst, int32_t stride, __lsx_vstelm_d(q0_out, dst, 0, 0); __lsx_vstelm_d(q1_out, dst + stride, 0, 0); } else { - DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, - p1_l, p0_l); - DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, - q2_l, q3_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l, + p0_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l, + q3_l); VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); - /* convert 16 bit output data into 8 bit */ - DUP4_ARG2(__lsx_vpickev_b, zero, p2_filter8, zero, p1_filter8, zero, - p0_filter8, zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8, - q0_filter8); - DUP2_ARG2(__lsx_vpickev_b, zero, q1_filter8, zero, q2_filter8, q1_filter8, - q2_filter8); - DUP4_ARG3(__lsx_vbitsel_v, p2, p2_filter8, flat, p1_out, p1_filter8, flat, - p0_out, p0_filter8, flat, q0_out, q0_filter8, flat, p2_out, - p1_out, p0_out, q0_out); - DUP2_ARG3(__lsx_vbitsel_v, q1_out, q1_filter8, flat, q2, q2_filter8, flat, - q1_out, q2_out); + DUP2_ARG2(__lsx_vpickev_b, p1_filter8, p2_filter8, q0_filter8, p0_filter8, + p1_filter8, q0_filter8); + q2_filter8 = __lsx_vpickev_b(q2_filter8, q1_filter8); + + p2 = __lsx_vilvl_d(p1_out, p2); + p0_out = __lsx_vilvl_d(q0_out, p0_out); + q1_out = __lsx_vilvl_d(q2, q1_out); + + DUP2_ARG3(__lsx_vbitsel_v, p2, p1_filter8, flat, p0_out, q0_filter8, flat, + p2_out, p1_out); + p0_out = __lsx_vbitsel_v(q1_out, q2_filter8, flat); dst -= stride3; __lsx_vstelm_d(p2_out, dst, 0, 0); - __lsx_vstelm_d(p1_out, dst + stride, 0, 0); - __lsx_vstelm_d(p0_out, dst + stride2, 0, 0); - __lsx_vstelm_d(q0_out, dst + stride3, 0, 0); + __lsx_vstelm_d(p2_out, dst + stride, 0, 1); + __lsx_vstelm_d(p1_out, dst + stride2, 0, 0); + __lsx_vstelm_d(p1_out, dst + stride3, 0, 1); dst += stride4; - __lsx_vstelm_d(q1_out, dst, 0, 0); + __lsx_vstelm_d(p0_out, dst, 0, 0); dst += stride; - __lsx_vstelm_d(q2_out, dst, 0, 0); + __lsx_vstelm_d(p0_out, dst, 0, 1); } } @@ -89,14 +88,13 @@ void vpx_lpf_horizontal_8_dual_lsx( const uint8_t *limit1, const uint8_t *thresh1) { __m128i p3, p2, p1, p0, q3, q2, q1, q0; __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; - __m128i flat, mask, hev, tmp, thresh, b_limit, limit; + __m128i flat, mask, hev, thresh, b_limit, limit; __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h; __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; - __m128i zero = __lsx_vldi(0); int32_t stride2 = stride << 1; int32_t stride3 = stride2 + stride; @@ -108,17 +106,17 @@ void vpx_lpf_horizontal_8_dual_lsx( DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); q3 = __lsx_vldx(dst, stride3); - thresh = __lsx_vreplgr2vr_b(*thresh0); - tmp = __lsx_vreplgr2vr_b(*thresh1); - thresh = __lsx_vilvl_d(tmp, thresh); + thresh = __lsx_vldrepl_b(thresh0, 0); + p2_out = __lsx_vldrepl_b(thresh1, 0); + thresh = __lsx_vilvl_d(p2_out, thresh); - b_limit = __lsx_vreplgr2vr_b(*b_limit0); - tmp = __lsx_vreplgr2vr_b(*b_limit1); - b_limit = __lsx_vilvl_d(tmp, b_limit); + b_limit = __lsx_vldrepl_b(b_limit0, 0); + p2_out = __lsx_vldrepl_b(b_limit1, 0); + b_limit 
= __lsx_vilvl_d(p2_out, b_limit); - limit = __lsx_vreplgr2vr_b(*limit0); - tmp = __lsx_vreplgr2vr_b(*limit1); - limit = __lsx_vilvl_d(tmp, limit); + limit = __lsx_vldrepl_b(limit0, 0); + p2_out = __lsx_vldrepl_b(limit1, 0); + limit = __lsx_vilvl_d(p2_out, limit); /* mask and hev */ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, @@ -132,17 +130,15 @@ void vpx_lpf_horizontal_8_dual_lsx( __lsx_vst(q0_out, dst, 0); __lsx_vst(q1_out, dst + stride, 0); } else { - DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, - p1_l, p0_l); - DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, - q2_l, q3_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l, + p0_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l, + q3_l); VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); - DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_h, p2_h, - p1_h, p0_h); - DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_h, q1_h, - q2_h, q3_h); + DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h); + DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h); VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); @@ -180,7 +176,6 @@ void vpx_lpf_vertical_8_lsx(uint8_t *dst, int32_t stride, __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; - __m128i vec0, vec1, vec2, vec3, vec4; __m128i zero = __lsx_vldi(0); int32_t stride2 = stride << 1; @@ -200,9 +195,9 @@ void vpx_lpf_vertical_8_lsx(uint8_t *dst, int32_t stride, LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, q3); - thresh = __lsx_vreplgr2vr_b(*thresh_ptr); - b_limit = __lsx_vreplgr2vr_b(*b_limit_ptr); - limit = __lsx_vreplgr2vr_b(*limit_ptr); + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); /* mask and hev */ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, @@ -217,20 +212,20 @@ void vpx_lpf_vertical_8_lsx(uint8_t *dst, int32_t stride, /* if flat is zero for all pixels, then no need to calculate other filter */ if (__lsx_bz_v(flat)) { /* Store 4 pixels p1-_q1 */ - DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); - vec2 = __lsx_vilvl_h(vec1, vec0); - vec3 = __lsx_vilvh_h(vec1, vec0); + DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, p0, p1); + p2 = __lsx_vilvl_h(p1, p0); + p3 = __lsx_vilvh_h(p1, p0); dst -= 2; - __lsx_vstelm_w(vec2, dst, 0, 0); - __lsx_vstelm_w(vec2, dst + stride, 0, 1); - __lsx_vstelm_w(vec2, dst + stride2, 0, 2); - __lsx_vstelm_w(vec2, dst + stride3, 0, 3); + __lsx_vstelm_w(p2, dst, 0, 0); + __lsx_vstelm_w(p2, dst + stride, 0, 1); + __lsx_vstelm_w(p2, dst + stride2, 0, 2); + __lsx_vstelm_w(p2, dst + stride3, 0, 3); dst += stride4; - __lsx_vstelm_w(vec3, dst, 0, 0); - __lsx_vstelm_w(vec3, dst + stride, 0, 1); - __lsx_vstelm_w(vec3, dst + stride2, 0, 2); - __lsx_vstelm_w(vec3, dst + stride3, 0, 3); + __lsx_vstelm_w(p3, dst, 0, 0); + __lsx_vstelm_w(p3, dst + stride, 0, 1); + __lsx_vstelm_w(p3, dst + stride2, 0, 2); + __lsx_vstelm_w(p3, dst + stride3, 0, 3); } else { DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); 
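/* Editorial aside, not part of the patch: the recurring replacements in
 * this commit trade two-step idioms for single LSX instructions; a
 * minimal sketch of both patterns, assuming a uint8_t *ptr, an __m128i x,
 * and zero = __lsx_vldi(0):
 */
__m128i v, lo, hi;
v = __lsx_vreplgr2vr_b(*ptr); /* before: scalar load + GPR broadcast */
v = __lsx_vldrepl_b(ptr, 0);  /* after: one load-and-replicate op */
lo = __lsx_vilvl_b(zero, x);    /* before: zero-extend via interleave */
lo = __lsx_vsllwil_hu_bu(x, 0); /* after: dedicated widening, low half */
hi = __lsx_vilvh_b(zero, x); /* before: high half via interleave */
hi = __lsx_vexth_hu_bu(x);   /* after: dedicated widening, high half */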
@@ -253,35 +248,34 @@ void vpx_lpf_vertical_8_lsx(uint8_t *dst, int32_t stride, q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat); /* Store 6 pixels p2-_q2 */ - DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1); - vec2 = __lsx_vilvl_h(vec1, vec0); - vec3 = __lsx_vilvh_h(vec1, vec0); - vec4 = __lsx_vilvl_b(q2, q1); - + DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, p3, q3); + p1 = __lsx_vilvl_h(q3, p3); + p2 = __lsx_vilvh_h(q3, p3); + p3 = __lsx_vilvl_b(q2, q1); dst -= 3; - __lsx_vstelm_w(vec2, dst, 0, 0); - __lsx_vstelm_h(vec4, dst, 4, 0); + __lsx_vstelm_w(p1, dst, 0, 0); + __lsx_vstelm_h(p3, dst, 4, 0); dst += stride; - __lsx_vstelm_w(vec2, dst, 0, 1); - __lsx_vstelm_h(vec4, dst, 4, 1); + __lsx_vstelm_w(p1, dst, 0, 1); + __lsx_vstelm_h(p3, dst, 4, 1); dst += stride; - __lsx_vstelm_w(vec2, dst, 0, 2); - __lsx_vstelm_h(vec4, dst, 4, 2); + __lsx_vstelm_w(p1, dst, 0, 2); + __lsx_vstelm_h(p3, dst, 4, 2); dst += stride; - __lsx_vstelm_w(vec2, dst, 0, 3); - __lsx_vstelm_h(vec4, dst, 4, 3); + __lsx_vstelm_w(p1, dst, 0, 3); + __lsx_vstelm_h(p3, dst, 4, 3); dst += stride; - __lsx_vstelm_w(vec3, dst, 0, 0); - __lsx_vstelm_h(vec4, dst, 4, 4); + __lsx_vstelm_w(p2, dst, 0, 0); + __lsx_vstelm_h(p3, dst, 4, 4); dst += stride; - __lsx_vstelm_w(vec3, dst, 0, 1); - __lsx_vstelm_h(vec4, dst, 4, 5); + __lsx_vstelm_w(p2, dst, 0, 1); + __lsx_vstelm_h(p3, dst, 4, 5); dst += stride; - __lsx_vstelm_w(vec3, dst, 0, 2); - __lsx_vstelm_h(vec4, dst, 4, 6); + __lsx_vstelm_w(p2, dst, 0, 2); + __lsx_vstelm_h(p3, dst, 4, 6); dst += stride; - __lsx_vstelm_w(vec3, dst, 0, 3); - __lsx_vstelm_h(vec4, dst, 4, 7); + __lsx_vstelm_w(p2, dst, 0, 3); + __lsx_vstelm_h(p3, dst, 4, 7); } } @@ -301,8 +295,6 @@ void vpx_lpf_vertical_8_dual_lsx(uint8_t *dst, int32_t stride, __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; - __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - __m128i zero = __lsx_vldi(0); int32_t stride2 = stride << 1; int32_t stride3 = stride2 + stride; int32_t stride4 = stride2 << 1; @@ -329,17 +321,17 @@ void vpx_lpf_vertical_8_dual_lsx(uint8_t *dst, int32_t stride, row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2, q3); - thresh = __lsx_vreplgr2vr_b(*thresh0); - vec0 = __lsx_vreplgr2vr_b(*thresh1); - thresh = __lsx_vilvl_d(vec0, thresh); + thresh = __lsx_vldrepl_b(thresh0, 0); + p1_out = __lsx_vldrepl_b(thresh1, 0); + thresh = __lsx_vilvl_d(p1_out, thresh); - b_limit = __lsx_vreplgr2vr_b(*b_limit0); - vec0 = __lsx_vreplgr2vr_b(*b_limit1); - b_limit = __lsx_vilvl_d(vec0, b_limit); + b_limit = __lsx_vldrepl_b(b_limit0, 0); + p1_out = __lsx_vldrepl_b(b_limit1, 0); + b_limit = __lsx_vilvl_d(p1_out, b_limit); - limit = __lsx_vreplgr2vr_b(*limit0); - vec0 = __lsx_vreplgr2vr_b(*limit1); - limit = __lsx_vilvl_d(vec0, limit); + limit = __lsx_vldrepl_b(limit0, 0); + p1_out = __lsx_vldrepl_b(limit1, 0); + limit = __lsx_vilvl_d(p1_out, limit); /* mask and hev */ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, @@ -350,44 +342,41 @@ void vpx_lpf_vertical_8_dual_lsx(uint8_t *dst, int32_t stride, VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); /* if flat is zero for all pixels, then no need to calculate other filter */ if (__lsx_bz_v(flat)) { - DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); - vec2 = __lsx_vilvl_h(vec1, vec0); - vec3 = __lsx_vilvh_h(vec1, vec0); - DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); - vec4 = __lsx_vilvl_h(vec1, vec0); - 
vec5 = __lsx_vilvh_h(vec1, vec0); - + DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, p0, p1); + p2 = __lsx_vilvl_h(p1, p0); + p3 = __lsx_vilvh_h(p1, p0); + DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, p0, p1); + q2 = __lsx_vilvl_h(p1, p0); + q3 = __lsx_vilvh_h(p1, p0); dst -= 2; - __lsx_vstelm_w(vec2, dst, 0, 0); - __lsx_vstelm_w(vec2, dst + stride, 0, 1); - __lsx_vstelm_w(vec2, dst + stride2, 0, 2); - __lsx_vstelm_w(vec2, dst + stride3, 0, 3); + __lsx_vstelm_w(p2, dst, 0, 0); + __lsx_vstelm_w(p2, dst + stride, 0, 1); + __lsx_vstelm_w(p2, dst + stride2, 0, 2); + __lsx_vstelm_w(p2, dst + stride3, 0, 3); dst += stride4; - __lsx_vstelm_w(vec3, dst, 0, 0); - __lsx_vstelm_w(vec3, dst + stride, 0, 1); - __lsx_vstelm_w(vec3, dst + stride2, 0, 2); - __lsx_vstelm_w(vec3, dst + stride3, 0, 3); + __lsx_vstelm_w(p3, dst, 0, 0); + __lsx_vstelm_w(p3, dst + stride, 0, 1); + __lsx_vstelm_w(p3, dst + stride2, 0, 2); + __lsx_vstelm_w(p3, dst + stride3, 0, 3); dst += stride4; - __lsx_vstelm_w(vec4, dst, 0, 0); - __lsx_vstelm_w(vec4, dst + stride, 0, 1); - __lsx_vstelm_w(vec4, dst + stride2, 0, 2); - __lsx_vstelm_w(vec4, dst + stride3, 0, 3); + __lsx_vstelm_w(q2, dst, 0, 0); + __lsx_vstelm_w(q2, dst + stride, 0, 1); + __lsx_vstelm_w(q2, dst + stride2, 0, 2); + __lsx_vstelm_w(q2, dst + stride3, 0, 3); dst += stride4; - __lsx_vstelm_w(vec5, dst, 0, 0); - __lsx_vstelm_w(vec5, dst + stride, 0, 1); - __lsx_vstelm_w(vec5, dst + stride2, 0, 2); - __lsx_vstelm_w(vec5, dst + stride3, 0, 3); + __lsx_vstelm_w(q3, dst, 0, 0); + __lsx_vstelm_w(q3, dst + stride, 0, 1); + __lsx_vstelm_w(q3, dst + stride2, 0, 2); + __lsx_vstelm_w(q3, dst + stride3, 0, 3); } else { - DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, - p1_l, p0_l); - DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, - q2_l, q3_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l, + p0_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l, + q3_l); VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); - DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_h, p2_h, - p1_h, p0_h); - DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_h, q1_h, - q2_h, q3_h); + DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h); + DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h); /* filter8 */ VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, @@ -408,62 +397,62 @@ void vpx_lpf_vertical_8_dual_lsx(uint8_t *dst, int32_t stride, q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat); q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat); - DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1); - vec3 = __lsx_vilvl_h(vec1, vec0); - vec4 = __lsx_vilvh_h(vec1, vec0); - DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1); - vec6 = __lsx_vilvl_h(vec1, vec0); - vec7 = __lsx_vilvh_h(vec1, vec0); - vec2 = __lsx_vilvl_b(q2, q1); - vec5 = __lsx_vilvh_b(q2, q1); + DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, p3, q3); + p2_filt8_l = __lsx_vilvl_h(q3, p3); + p2_filt8_h = __lsx_vilvh_h(q3, p3); + DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, p3, q3); + p0_filt8_l = __lsx_vilvl_h(q3, p3); + p0_filt8_h = __lsx_vilvh_h(q3, p3); + q1_filt8_l = __lsx_vilvl_b(q2, q1); + q1_filt8_h = __lsx_vilvh_b(q2, q1); dst -= 3; - __lsx_vstelm_w(vec3, dst, 0, 0); - __lsx_vstelm_h(vec2, dst, 4, 0); + __lsx_vstelm_w(p2_filt8_l, dst, 0, 0); + 
__lsx_vstelm_h(q1_filt8_l, dst, 4, 0);
     dst += stride;
-    __lsx_vstelm_w(vec3, dst, 0, 1);
-    __lsx_vstelm_h(vec2, dst, 4, 1);
+    __lsx_vstelm_w(p2_filt8_l, dst, 0, 1);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 1);
     dst += stride;
-    __lsx_vstelm_w(vec3, dst, 0, 2);
-    __lsx_vstelm_h(vec2, dst, 4, 2);
+    __lsx_vstelm_w(p2_filt8_l, dst, 0, 2);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 2);
     dst += stride;
-    __lsx_vstelm_w(vec3, dst, 0, 3);
-    __lsx_vstelm_h(vec2, dst, 4, 3);
+    __lsx_vstelm_w(p2_filt8_l, dst, 0, 3);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 3);
     dst += stride;
-    __lsx_vstelm_w(vec4, dst, 0, 0);
-    __lsx_vstelm_h(vec2, dst, 4, 4);
+    __lsx_vstelm_w(p2_filt8_h, dst, 0, 0);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 4);
     dst += stride;
-    __lsx_vstelm_w(vec4, dst, 0, 1);
-    __lsx_vstelm_h(vec2, dst, 4, 5);
+    __lsx_vstelm_w(p2_filt8_h, dst, 0, 1);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 5);
     dst += stride;
-    __lsx_vstelm_w(vec4, dst, 0, 2);
-    __lsx_vstelm_h(vec2, dst, 4, 6);
+    __lsx_vstelm_w(p2_filt8_h, dst, 0, 2);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 6);
     dst += stride;
-    __lsx_vstelm_w(vec4, dst, 0, 3);
-    __lsx_vstelm_h(vec2, dst, 4, 7);
+    __lsx_vstelm_w(p2_filt8_h, dst, 0, 3);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 7);
     dst += stride;
-    __lsx_vstelm_w(vec6, dst, 0, 0);
-    __lsx_vstelm_h(vec5, dst, 4, 0);
+    __lsx_vstelm_w(p0_filt8_l, dst, 0, 0);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 0);
     dst += stride;
-    __lsx_vstelm_w(vec6, dst, 0, 1);
-    __lsx_vstelm_h(vec5, dst, 4, 1);
+    __lsx_vstelm_w(p0_filt8_l, dst, 0, 1);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 1);
     dst += stride;
-    __lsx_vstelm_w(vec6, dst, 0, 2);
-    __lsx_vstelm_h(vec5, dst, 4, 2);
+    __lsx_vstelm_w(p0_filt8_l, dst, 0, 2);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 2);
     dst += stride;
-    __lsx_vstelm_w(vec6, dst, 0, 3);
-    __lsx_vstelm_h(vec5, dst, 4, 3);
+    __lsx_vstelm_w(p0_filt8_l, dst, 0, 3);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 3);
     dst += stride;
-    __lsx_vstelm_w(vec7, dst, 0, 0);
-    __lsx_vstelm_h(vec5, dst, 4, 4);
+    __lsx_vstelm_w(p0_filt8_h, dst, 0, 0);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 4);
     dst += stride;
-    __lsx_vstelm_w(vec7, dst, 0, 1);
-    __lsx_vstelm_h(vec5, dst, 4, 5);
+    __lsx_vstelm_w(p0_filt8_h, dst, 0, 1);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 5);
     dst += stride;
-    __lsx_vstelm_w(vec7, dst, 0, 2);
-    __lsx_vstelm_h(vec5, dst, 4, 6);
+    __lsx_vstelm_w(p0_filt8_h, dst, 0, 2);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 6);
     dst += stride;
-    __lsx_vstelm_w(vec7, dst, 0, 3);
-    __lsx_vstelm_h(vec5, dst, 4, 7);
+    __lsx_vstelm_w(p0_filt8_h, dst, 0, 3);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 7);
   }
 }

From 58919dd7f1c96faffc89f3875267690bc2bbd237 Mon Sep 17 00:00:00 2001
From: James Zern
Date: Fri, 20 May 2022 19:15:47 -0700
Subject: [PATCH 306/926] GetTempOutFile(): use testing::TempDir() rather than
 tmpfile()

This allows setting the path with TEST_TMPDIR and provides a valid
default for Android.
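For orientation before the diff, a condensed, hedged sketch of the pattern this change adopts: derive a unique file name under testing::TempDir() and wrap the descriptor in a stdio stream. OpenTempOutFile is an illustrative name only; the real helper, GetTempOutFile() below, additionally keeps a separate _WIN32 path.

#include <cstdio>
#include <cstdlib>  // mkstemp() is declared here on POSIX systems
#include <string>
#include "third_party/googletest/src/include/gtest/gtest.h"

// Sketch only: create a unique output file under testing::TempDir(),
// which honors TEST_TMPDIR in sufficiently recent gtest releases.
static FILE *OpenTempOutFile(std::string *file_name) {
  std::string dir = testing::TempDir();
  if (dir.empty()) return nullptr;
  if (dir[dir.size() - 1] != '/') dir += '/';  // older gtest may omit the '/'
  std::string path = dir + "libvpxtest.XXXXXX";
  const int fd = mkstemp(&path[0]);  // replaces XXXXXX and opens the file
  if (fd == -1) return nullptr;
  *file_name = path;
  return fdopen(fd, "wb+");  // hand back a stdio stream over the descriptor
}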
Change-Id: Iecb26f381b6a6ec97da62cfa0b7200f427440a2f
---
 test/video_source.h | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/test/video_source.h b/test/video_source.h
index 7a2dbe7ef7..349e3de37c 100644
--- a/test/video_source.h
+++ b/test/video_source.h
@@ -20,8 +20,14 @@
 #endif
 #include <cstdio>
 #include <cstdlib>
+#include <cstring>
+#include <memory>
 #include <string>
+
 #include "test/acm_random.h"
+#if !defined(_WIN32)
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#endif
 #include "vpx/vpx_encoder.h"

 namespace libvpx_test {
@@ -72,7 +78,23 @@ static FILE *GetTempOutFile(std::string *file_name) {
     }
     return NULL;
 #else
-  return tmpfile();
+  std::string temp_dir = testing::TempDir();
+  if (temp_dir.empty()) return NULL;
+  // Versions of testing::TempDir() prior to release-1.11.0-214-g5e6a5336 may
+  // use the value of an environment variable without checking for a trailing
+  // path delimiter.
+  if (temp_dir[temp_dir.size() - 1] != '/') temp_dir += '/';
+  const char name_template[] = "libvpxtest.XXXXXX";
+  std::unique_ptr<char[]> temp_file_name(
+      new char[temp_dir.size() + sizeof(name_template)]);
+  if (temp_file_name == nullptr) return NULL;
+  memcpy(temp_file_name.get(), temp_dir.data(), temp_dir.size());
+  memcpy(temp_file_name.get() + temp_dir.size(), name_template,
+         sizeof(name_template));
+  const int fd = mkstemp(temp_file_name.get());
+  if (fd == -1) return NULL;
+  *file_name = temp_file_name.get();
+  return fdopen(fd, "wb+");
 #endif
 }

From 9f1329f8ac88ea5d7c6ae5d6a57221c36cf85ac8 Mon Sep 17 00:00:00 2001
From: Jerome Jiang
Date: Wed, 25 May 2022 23:39:42 +0000
Subject: [PATCH 307/926] Revert "[NEON] Optimize vp9_diamond_search_sad() for
 NEON"

This reverts commit 258affdeab68ed59e181368baa46e2f1d077b0ab.

Reason for revert: Not bitexact with the C version.

Original change's description:
> [NEON] Optimize vp9_diamond_search_sad() for NEON
>
> About 50% improvement in comparison to the C function.
> I have followed the AVX version with some simplifications.
>
> Change-Id: I72ddbdb2fbc5ed8a7f0210703fe05523a37db1c9

Change-Id: I5c210b3dfe1f6dec525da857dd8c83946be566fc
---
 vp9/common/vp9_rtcd_defs.pl                  |   2 +-
 .../arm/neon/vp9_diamond_search_sad_neon.c   | 322 ------------------
 vp9/vp9cx.mk                                 |   1 -
 3 files changed, 1 insertion(+), 324 deletions(-)
 delete mode 100644 vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c

diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index e6b65c96f0..4da0b6675b 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -175,7 +175,7 @@ ()
 # Motion search
 #
 add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
-specialize qw/vp9_diamond_search_sad avx neon/;
+specialize qw/vp9_diamond_search_sad avx/;

 #
 # Apply temporal filter
diff --git a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
deleted file mode 100644
index e56733d43e..0000000000
--- a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
+++ /dev/null
@@ -1,322 +0,0 @@
-/*
- *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-#include <assert.h>
-
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vp9/encoder/vp9_encoder.h"
-#include "vpx_ports/mem.h"
-
-#ifdef __GNUC__
-#define LIKELY(v) __builtin_expect(v, 1)
-#define UNLIKELY(v) __builtin_expect(v, 0)
-#else
-#define LIKELY(v) (v)
-#define UNLIKELY(v) (v)
-#endif
-
-static INLINE int_mv pack_int_mv(int16_t row, int16_t col) {
-  int_mv result;
-  result.as_mv.row = row;
-  result.as_mv.col = col;
-  return result;
-}
-
-static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) {
-  // This is simplified from the C implementation to utilise that
-  // x->nmvjointsadcost[1] == x->nmvjointsadcost[2] and
-  // x->nmvjointsadcost[1] == x->nmvjointsadcost[3]
-  return mv.as_int == 0 ? 0 : 1;
-}
-
-static INLINE int mv_cost(const int_mv mv, const int *joint_cost,
-                          int *const comp_cost[2]) {
-  assert(mv.as_mv.row >= -MV_MAX && mv.as_mv.row < MV_MAX);
-  assert(mv.as_mv.col >= -MV_MAX && mv.as_mv.col < MV_MAX);
-  return joint_cost[get_mv_joint(mv)] + comp_cost[0][mv.as_mv.row] +
-         comp_cost[1][mv.as_mv.col];
-}
-
-static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref,
-                          int sad_per_bit) {
-  const int_mv diff =
-      pack_int_mv(mv.as_mv.row - ref->row, mv.as_mv.col - ref->col);
-  return ROUND_POWER_OF_TWO(
-      (unsigned)mv_cost(diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit,
-      VP9_PROB_COST_SHIFT);
-}
-
-/*****************************************************************************
- * This function utilizes 3 properties of the cost function lookup tables,  *
- * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in      *
- * vp9_encoder.c.                                                           *
- * For the joint cost:                                                      *
- *   - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]          *
- * For the component costs:                                                 *
- *   - For all i: mvsadcost[0][i] == mvsadcost[1][i]                        *
- *         (Equal costs for both components)                                *
- *   - For all i: mvsadcost[0][i] == mvsadcost[0][-i]                       *
- *         (Cost function is even)                                          *
- * If these do not hold, then this function cannot be used without          *
- * modification, in which case you can revert to using the C implementation,*
- * which does not rely on these properties.                                 *
- *****************************************************************************/
-int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
-                                const search_site_config *cfg, MV *ref_mv,
-                                MV *best_mv, int search_param, int sad_per_bit,
-                                int *num00, const vp9_variance_fn_ptr_t *fn_ptr,
-                                const MV *center_mv) {
-  static const uint32_t data[4] = { 0, 1, 2, 3 };
-  const uint32x4_t v_idx_d = vld1q_u32((const uint32_t *)data);
-
-  const int32x4_t zero_s32 = vdupq_n_s32(0);
-  const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max);
-  const int16x8_t v_max_mv_w = vreinterpretq_s16_s32(vdupq_n_s32(maxmv.as_int));
-  const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min);
-  const int16x8_t v_min_mv_w = vreinterpretq_s16_s32(vdupq_n_s32(minmv.as_int));
-
-  const int32x4_t v_spb_d = vdupq_n_s32(sad_per_bit);
-
-  const int32x4_t v_joint_cost_0_d = vdupq_n_s32(x->nmvjointsadcost[0]);
-  const int32x4_t v_joint_cost_1_d = vdupq_n_s32(x->nmvjointsadcost[1]);
-
-  // search_param determines the length of the initial step and hence the
-  // number of iterations.
-  // 0 = initial step (MAX_FIRST_STEP) pel
-  // 1 = (MAX_FIRST_STEP/2) pel,
-  // 2 = (MAX_FIRST_STEP/4) pel...
- const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param]; - const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param]; - const int tot_steps = cfg->total_steps - search_param; - - const int_mv fcenter_mv = - pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); - const int16x8_t vfcmv = vdupq_n_s16(fcenter_mv.as_int); - - const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); - const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); - - int_mv bmv = pack_int_mv(ref_row, ref_col); - int_mv new_bmv = bmv; - int16x8_t v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int)); - - const int what_stride = x->plane[0].src.stride; - const int in_what_stride = x->e_mbd.plane[0].pre[0].stride; - const uint8_t *const what = x->plane[0].src.buf; - const uint8_t *const in_what = - x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; - - // Work out the start point for the search - const uint8_t *best_address = in_what; - const uint8_t *new_best_address = best_address; -#if defined(__aarch64__) - int64x2_t v_ba_q = vdupq_n_s64((intptr_t)best_address); -#else - int32x4_t v_ba_d = vdupq_n_s32((intptr_t)best_address); -#endif - unsigned int best_sad = INT_MAX; - int i, j, step; - - // Check the prerequisite cost function properties that are easy to check - // in an assert. See the function-level documentation for details on all - // prerequisites. - assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); - assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); - - // Check the starting position - best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride); - best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit); - - *num00 = 0; - - for (i = 0, step = 0; step < tot_steps; step++) { - for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) { - int16x8_t v_diff_mv_w; - int8x16_t v_inside_d; - uint32x4_t v_outside_d; - int32x4_t v_cost_d, v_sad_d; -#if defined(__aarch64__) - int64x2_t v_blocka[2]; -#else - int32x4_t v_blocka[1]; - uint32x2_t horiz_max_0, horiz_max_1; -#endif - - uint32_t horiz_max; - // Compute the candidate motion vectors - const int16x8_t v_ss_mv_w = vld1q_s16((const int16_t *)&ss_mv[i]); - const int16x8_t v_these_mv_w = vaddq_s16(v_bmv_w, v_ss_mv_w); - // Clamp them to the search bounds - int16x8_t v_these_mv_clamp_w = v_these_mv_w; - v_these_mv_clamp_w = vminq_s16(v_these_mv_clamp_w, v_max_mv_w); - v_these_mv_clamp_w = vmaxq_s16(v_these_mv_clamp_w, v_min_mv_w); - // The ones that did not change are inside the search area - v_inside_d = vreinterpretq_s8_u32( - vceqq_s32(vreinterpretq_s32_s16(v_these_mv_clamp_w), - vreinterpretq_s32_s16(v_these_mv_w))); - - // If none of them are inside, then move on -#if defined(__aarch64__) - horiz_max = vmaxvq_u32(vreinterpretq_u32_s8(v_inside_d)); -#else - horiz_max_0 = vmax_u32(vget_low_u32(vreinterpretq_u32_s8(v_inside_d)), - vget_high_u32(vreinterpretq_u32_s8(v_inside_d))); - horiz_max_1 = vpmax_u32(horiz_max_0, horiz_max_0); - vst1_lane_u32(&horiz_max, horiz_max_1, 0); -#endif - if (LIKELY(horiz_max == 0)) { - continue; - } - - // The inverse mask indicates which of the MVs are outside - v_outside_d = - vreinterpretq_u32_s8(veorq_s8(v_inside_d, vdupq_n_s8((int8_t)0xff))); - // Shift right to keep the sign bit clear, we will use this later - // to set the cost to the maximum value. 
- v_outside_d = vshrq_n_u32(v_outside_d, 1); - - // Compute the difference MV - v_diff_mv_w = vsubq_s16(v_these_mv_clamp_w, vfcmv); - // We utilise the fact that the cost function is even, and use the - // absolute difference. This allows us to use unsigned indexes later - // and reduces cache pressure somewhat as only a half of the table - // is ever referenced. - v_diff_mv_w = vabsq_s16(v_diff_mv_w); - - // Compute the SIMD pointer offsets. - { -#if defined(__aarch64__) // sizeof(intptr_t) == 8 - // Load the offsets - int64x2_t v_bo10_q = vld1q_s64((const int64_t *)&ss_os[i + 0]); - int64x2_t v_bo32_q = vld1q_s64((const int64_t *)&ss_os[i + 2]); - // Set the ones falling outside to zero - v_bo10_q = vandq_s64( - v_bo10_q, - vmovl_s32(vget_low_s32(vreinterpretq_s32_s8(v_inside_d)))); - v_bo32_q = vandq_s64( - v_bo32_q, - vmovl_s32(vget_high_s32(vreinterpretq_s32_s8(v_inside_d)))); - // Compute the candidate addresses - v_blocka[0] = vaddq_s64(v_ba_q, v_bo10_q); - v_blocka[1] = vaddq_s64(v_ba_q, v_bo32_q); -#else // sizeof(intptr_t) == 4 - int32x4_t v_bo_d = vld1q_s32((const int32_t *)&ss_os[i]); - v_bo_d = vandq_s32(v_bo_d, vreinterpretq_s32_s8(v_inside_d)); - v_blocka[0] = vaddq_s32(v_ba_d, v_bo_d); -#endif - } - - fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], - in_what_stride, (uint32_t *)&v_sad_d); - - // Look up the component cost of the residual motion vector - { - uint32_t cost[4]; - int16_t __attribute__((aligned(16))) rowcol[8]; - vst1q_s16(rowcol, v_diff_mv_w); - - // Note: This is a use case for gather instruction - cost[0] = x->nmvsadcost[0][rowcol[0]] + x->nmvsadcost[0][rowcol[1]]; - cost[1] = x->nmvsadcost[0][rowcol[2]] + x->nmvsadcost[0][rowcol[3]]; - cost[2] = x->nmvsadcost[0][rowcol[4]] + x->nmvsadcost[0][rowcol[5]]; - cost[3] = x->nmvsadcost[0][rowcol[6]] + x->nmvsadcost[0][rowcol[7]]; - - v_cost_d = vld1q_s32((int32_t *)cost); - } - - // Now add in the joint cost - { - const uint32x4_t v_sel_d = - vceqq_s32(vreinterpretq_s32_s16(v_diff_mv_w), zero_s32); - const int32x4_t v_joint_cost_d = vreinterpretq_s32_u8( - vbslq_u8(vreinterpretq_u8_u32(v_sel_d), - vreinterpretq_u8_s32(v_joint_cost_0_d), - vreinterpretq_u8_s32(v_joint_cost_1_d))); - v_cost_d = vaddq_s32(v_cost_d, v_joint_cost_d); - } - - // Multiply by sad_per_bit - v_cost_d = vmulq_s32(v_cost_d, v_spb_d); - // ROUND_POWER_OF_TWO(v_cost_d, VP9_PROB_COST_SHIFT) - v_cost_d = - vaddq_s32(v_cost_d, vdupq_n_s32(1 << (VP9_PROB_COST_SHIFT - 1))); - v_cost_d = vshrq_n_s32(v_cost_d, VP9_PROB_COST_SHIFT); - // Add the cost to the sad - v_sad_d = vaddq_s32(v_sad_d, v_cost_d); - - // Make the motion vectors outside the search area have max cost - // by or'ing in the comparison mask, this way the minimum search won't - // pick them. 
- v_sad_d = vorrq_s32(v_sad_d, vreinterpretq_s32_u32(v_outside_d)); - - // Find the minimum value and index horizontally in v_sad_d - { - uint32_t local_best_sad; -#if defined(__aarch64__) - local_best_sad = vminvq_u32(vreinterpretq_u32_s32(v_sad_d)); -#else - uint32x2_t horiz_min_0 = - vmin_u32(vget_low_u32(vreinterpretq_u32_s32(v_sad_d)), - vget_high_u32(vreinterpretq_u32_s32(v_sad_d))); - uint32x2_t horiz_min_1 = vpmin_u32(horiz_min_0, horiz_min_0); - vst1_lane_u32(&local_best_sad, horiz_min_1, 0); -#endif - - // Update the global minimum if the local minimum is smaller - if (LIKELY(local_best_sad < best_sad)) { -#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#endif - uint32_t local_best_idx; - const uint32x4_t v_sel_d = - vceqq_s32(v_sad_d, vdupq_n_s32(local_best_sad)); - uint32x4_t v_mask_d = vandq_u32(v_sel_d, v_idx_d); - v_mask_d = vbslq_u32(v_sel_d, v_mask_d, vdupq_n_u32(0xffffffff)); - -#if defined(__aarch64__) - local_best_idx = vminvq_u32(v_mask_d); -#else - horiz_min_0 = - vmin_u32(vget_low_u32(v_mask_d), vget_high_u32(v_mask_d)); - horiz_min_1 = vpmin_u32(horiz_min_0, horiz_min_0); - vst1_lane_u32(&local_best_idx, horiz_min_1, 0); -#endif - - new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx]; -#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) -#pragma GCC diagnostic pop -#endif - new_best_address = ((const uint8_t **)v_blocka)[local_best_idx]; - - best_sad = local_best_sad; - } - } - } - - bmv = new_bmv; - best_address = new_best_address; - - v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int)); -#if defined(__aarch64__) - v_ba_q = vdupq_n_s64((intptr_t)best_address); -#else - v_ba_d = vdupq_n_s32((intptr_t)best_address); -#endif - - if (UNLIKELY(best_address == in_what)) { - (*num00)++; - } - } - - *best_mv = bmv.as_mv; - return best_sad; -} diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index c9afd9a347..92a7fddb9d 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -113,7 +113,6 @@ VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_constants.h VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_quantize_avx2.c VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c -VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_diamond_search_sad_neon.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_temporal_filter_sse4.c From 4832bcff20d34ed25a771f3704cfe0a046dbd0f9 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Fri, 13 May 2022 10:59:06 -0700 Subject: [PATCH 308/926] L2E: Add control type for the external rate control API Two control types are defined: QP and GOP control. Now the API only supports the QP model. 
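To make the new field concrete, a hedged sketch of client-side registration follows. The my_* identifiers are hypothetical placeholders for client callbacks, not part of libvpx; the control name VP9E_SET_EXTERNAL_RATE_CONTROL and the struct fields come from the API itself, and the test diff below wires up real callbacks the same way.

// Sketch: select QP mode when registering an external rate control model.
// All my_* names are assumed client code; rc_type is the field added here.
vpx_rc_funcs_t rc_funcs = {};
rc_funcs.rc_type = VPX_RC_QP;  // VPX_RC_GOP is defined but not yet acted on
rc_funcs.create_model = my_create_model;
rc_funcs.send_firstpass_stats = my_send_firstpass_stats;
rc_funcs.get_encodeframe_decision = my_get_encodeframe_decision;
rc_funcs.update_encodeframe_result = my_update_encodeframe_result;
rc_funcs.delete_model = my_delete_model;
rc_funcs.priv = my_priv_data;  // opaque pointer handed back to create_model
encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);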
Change-Id: Ib3a712964b9d2282c93993ee56e0558e4795fb46
---
 test/vp9_ext_ratectrl_test.cc  |  1 +
 vp9/encoder/vp9_ext_ratectrl.c |  4 ++--
 vpx/vpx_ext_ratectrl.h         | 14 +++++++++++++-
 3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc
index 60a350b84e..f6ce778456 100644
--- a/test/vp9_ext_ratectrl_test.cc
+++ b/test/vp9_ext_ratectrl_test.cc
@@ -176,6 +176,7 @@ class ExtRateCtrlTest : public ::libvpx_test::EncoderTest,
                           ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       vpx_rc_funcs_t rc_funcs;
+      rc_funcs.rc_type = VPX_RC_QP;
       rc_funcs.create_model = rc_create_model;
       rc_funcs.send_firstpass_stats = rc_send_firstpass_stats;
       rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision;
diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c
index 9f0098ab5a..67f58329cc 100644
--- a/vp9/encoder/vp9_ext_ratectrl.c
+++ b/vp9/encoder/vp9_ext_ratectrl.c
@@ -143,7 +143,7 @@ vpx_codec_err_t vp9_extrc_get_encodeframe_decision(
   if (ext_ratectrl == NULL) {
     return VPX_CODEC_INVALID_PARAM;
   }
-  if (ext_ratectrl->ready) {
+  if (ext_ratectrl->ready && ext_ratectrl->funcs.rc_type == VPX_RC_QP) {
     vpx_rc_status_t rc_status;
     vpx_rc_encodeframe_info_t encode_frame_info;
     encode_frame_info.show_index = show_index;
@@ -172,7 +172,7 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result(
   if (ext_ratectrl == NULL) {
     return VPX_CODEC_INVALID_PARAM;
   }
-  if (ext_ratectrl->ready) {
+  if (ext_ratectrl->ready && ext_ratectrl->funcs.rc_type == VPX_RC_QP) {
     PSNR_STATS psnr;
     vpx_rc_status_t rc_status;
     vpx_rc_encodeframe_result_t encode_frame_result;
diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h
index a193e55953..5b426b6cf0 100644
--- a/vpx/vpx_ext_ratectrl.h
+++ b/vpx/vpx_ext_ratectrl.h
@@ -25,7 +25,15 @@ extern "C" {
  * types, removing or reassigning enums, adding/removing/rearranging
  * fields to structures.
  */
-#define VPX_EXT_RATECTRL_ABI_VERSION (1)
+#define VPX_EXT_RATECTRL_ABI_VERSION (2)
+
+/*!\brief The control type of the inference API.
+ * In VPX_RC_QP mode, the external rate control model determines the
+ * quantization parameter (QP) for each frame.
+ * In VPX_RC_GOP mode, the external rate control model determines the
+ * group of pictures (GOP) of the video sequence.
+ */
+typedef enum vpx_rc_type { VPX_RC_QP = 0, VPX_RC_GOP = 1 } vpx_rc_type_t;

 /*!\brief Abstract rate control model handler
  *
@@ -327,6 +335,10 @@ typedef vpx_rc_status_t (*vpx_rc_delete_model_cb_fn_t)(
  * VP9E_SET_EXTERNAL_RATE_CONTROL.
  */
 typedef struct vpx_rc_funcs {
+  /*!
+   * The rate control type of this API.
+   */
+  vpx_rc_type_t rc_type;
   /*!
    * Create an external rate control model.
    */

From 3e7685cf621af3e876274f7be9fef83e3d35de3d Mon Sep 17 00:00:00 2001
From: Cheng Chen
Date: Fri, 13 May 2022 13:42:28 -0700
Subject: [PATCH 309/926] L2E: Add vp9 GOP decision helper function

Add a helper function to call the external rate control model. The
helper function is placed in the function where vp9 determines GOP
decisions. The helper function passes frame information, including the
current frame's show index, coding index, etc., to the external rate
control model, and then receives GOP decisions. The received GOP
decision overwrites the default one only when the external rate control
model has been activated via the codec control. The decision must
satisfy a few constraints, for example, it must be no smaller than
min_gf_interval and no larger than max_gf_interval; otherwise, an error
is returned.

Unit tests are added to test the new functionality.
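Before the diff, a minimal sketch of what such a callback might look like, modeled on the fixed-GOP-size toy rate control the new unit test exercises; kMyGopSize and my_get_gop_decision are illustrative names, not part of the patch.

#include "vpx/vpx_ext_ratectrl.h"

// Sketch: cut fixed-length GOPs, clamped to the next key frame; request an
// alt-ref only when a full-length GOP fits within the encoder's constraints.
static vpx_rc_status_t my_get_gop_decision(vpx_rc_model_t model,
                                           const vpx_rc_gop_info_t *gop_info,
                                           vpx_rc_gop_decision_t *gop_decision) {
  const int kMyGopSize = 9;  // illustrative fixed GOP length
  (void)model;               // a real model would consult its own state here
  gop_decision->gop_coding_frames = gop_info->frames_to_key < kMyGopSize
                                        ? gop_info->frames_to_key
                                        : kMyGopSize;
  // The helper rejects alt-ref GOPs shorter than min_gf_interval or not
  // covered by the lookahead, so honor those bounds up front.
  gop_decision->use_alt_ref = gop_info->allow_alt_ref &&
                              gop_decision->gop_coding_frames == kMyGopSize &&
                              kMyGopSize >= gop_info->min_gf_interval &&
                              kMyGopSize < gop_info->lag_in_frames;
  return VPX_RC_OK;
}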
Change-Id: Id129b4e1a91c844ee5c356a7801c862b1130a3d8 --- test/vp9_ext_ratectrl_test.cc | 147 ++++++++++++++++++++++++++++++++- vp9/encoder/vp9_encoder.c | 8 +- vp9/encoder/vp9_ext_ratectrl.c | 33 +++++++- vp9/encoder/vp9_ext_ratectrl.h | 4 + vp9/encoder/vp9_firstpass.c | 32 +++++++ vp9/encoder/vp9_ratectrl.h | 4 + vpx/vpx_ext_ratectrl.h | 45 +++++++++- 7 files changed, 267 insertions(+), 6 deletions(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index f6ce778456..e3e7afbf42 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -17,24 +17,43 @@ #include "test/yuv_video_source.h" #include "third_party/googletest/src/include/gtest/gtest.h" #include "vpx/vpx_ext_ratectrl.h" +#include "vpx_dsp/vpx_dsp_common.h" namespace { constexpr int kModelMagicNumber = 51396; constexpr uintptr_t PrivMagicNumber = 5566; constexpr int kFrameNum = 5; +constexpr int kFrameNumGOP = 30; constexpr int kLosslessCodingIndex = 2; +constexpr int kFixedGOPSize = 9; +// The range check in vp9_cx_iface.c shows that the max +// lag in buffer is MAX_LAG_BUFFERS (25): +// RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS); +constexpr int kMaxLagInFrames = 25; +constexpr int kDefaultMinGfInterval = 4; +constexpr int kDefaultMaxGfInterval = 16; +// The two pass rate control does not respect the input +// min_gf_interval and max_gf_interval. +// See function "get_active_gf_inverval_range". +// The numbers below are from manual inspection. +constexpr int kReadMinGfInterval = 5; +constexpr int kReadMaxGfInterval = 13; struct ToyRateCtrl { int magic_number; int coding_index; + + int gop_id; + int frames_since_key; + int show_index; }; vpx_rc_status_t rc_create_model(void *priv, const vpx_rc_config_t *ratectrl_config, vpx_rc_model_t *rate_ctrl_model_pt) { ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl; - EXPECT_NE(toy_rate_ctrl, nullptr); + if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR; toy_rate_ctrl->magic_number = kModelMagicNumber; toy_rate_ctrl->coding_index = -1; *rate_ctrl_model_pt = toy_rate_ctrl; @@ -48,6 +67,27 @@ vpx_rc_status_t rc_create_model(void *priv, return VPX_RC_OK; } +vpx_rc_status_t rc_create_model_gop(void *priv, + const vpx_rc_config_t *ratectrl_config, + vpx_rc_model_t *rate_ctrl_model_pt) { + ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl; + if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR; + toy_rate_ctrl->magic_number = kModelMagicNumber; + toy_rate_ctrl->gop_id = 0; + toy_rate_ctrl->frames_since_key = 0; + toy_rate_ctrl->show_index = 0; + toy_rate_ctrl->coding_index = 0; + *rate_ctrl_model_pt = toy_rate_ctrl; + EXPECT_EQ(priv, reinterpret_cast(PrivMagicNumber)); + EXPECT_EQ(ratectrl_config->frame_width, 640); + EXPECT_EQ(ratectrl_config->frame_height, 360); + EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNumGOP); + EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 4000); + EXPECT_EQ(ratectrl_config->frame_rate_num, 30); + EXPECT_EQ(ratectrl_config->frame_rate_den, 1); + return VPX_RC_OK; +} + vpx_rc_status_t rc_send_firstpass_stats( vpx_rc_model_t rate_ctrl_model, const vpx_rc_firstpass_stats_t *first_pass_stats) { @@ -61,6 +101,19 @@ vpx_rc_status_t rc_send_firstpass_stats( return VPX_RC_OK; } +vpx_rc_status_t rc_send_firstpass_stats_gop( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_firstpass_stats_t *first_pass_stats) { + const ToyRateCtrl *toy_rate_ctrl = + static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_EQ(first_pass_stats->num_frames, kFrameNumGOP); 
+ for (int i = 0; i < first_pass_stats->num_frames; ++i) { + EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i); + } + return VPX_RC_OK; +} + vpx_rc_status_t rc_get_encodeframe_decision( vpx_rc_model_t rate_ctrl_model, const vpx_rc_encodeframe_info_t *encode_frame_info, @@ -133,6 +186,41 @@ vpx_rc_status_t rc_get_encodeframe_decision( return VPX_RC_OK; } +vpx_rc_status_t rc_get_gop_decision(vpx_rc_model_t rate_ctrl_model, + const vpx_rc_gop_info_t *gop_info, + vpx_rc_gop_decision_t *gop_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames); + EXPECT_EQ(gop_info->min_gf_interval, kReadMinGfInterval); + EXPECT_EQ(gop_info->max_gf_interval, kReadMaxGfInterval); + EXPECT_EQ(gop_info->allow_alt_ref, 1); + if (gop_info->is_key_frame) { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); + EXPECT_EQ(gop_info->frames_since_key, 0); + EXPECT_EQ(gop_info->gop_id, 0); + toy_rate_ctrl->gop_id = 0; + toy_rate_ctrl->frames_since_key = 0; + } else { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1); + } + EXPECT_EQ(gop_info->gop_id, toy_rate_ctrl->gop_id); + EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key); + EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index); + EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index); + + gop_decision->gop_coding_frames = + VPXMIN(kFixedGOPSize, gop_info->frames_to_key); + gop_decision->use_alt_ref = gop_decision->gop_coding_frames == kFixedGOPSize; + toy_rate_ctrl->frames_since_key += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + toy_rate_ctrl->show_index += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + toy_rate_ctrl->coding_index += gop_decision->gop_coding_frames; + ++toy_rate_ctrl->gop_id; + return VPX_RC_OK; +} + vpx_rc_status_t rc_update_encodeframe_result( vpx_rc_model_t rate_ctrl_model, const vpx_rc_encodeframe_result_t *encode_frame_result) { @@ -153,6 +241,18 @@ vpx_rc_status_t rc_update_encodeframe_result( return VPX_RC_OK; } +vpx_rc_status_t rc_update_encodeframe_result_gop( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_result_t *encode_frame_result) { + const ToyRateCtrl *toy_rate_ctrl = + static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + + const int64_t ref_pixel_count = 640 * 360 * 3 / 2; + EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count); + return VPX_RC_OK; +} + vpx_rc_status_t rc_delete_model(vpx_rc_model_t rate_ctrl_model) { ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); @@ -200,4 +300,49 @@ TEST_F(ExtRateCtrlTest, EncodeTest) { ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); } +class ExtRateCtrlTestGOP : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + ExtRateCtrlTestGOP() : EncoderTest(&::libvpx_test::kVP9) {} + + ~ExtRateCtrlTestGOP() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kTwoPassGood); + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); + encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); + + vpx_rc_funcs_t rc_funcs; + rc_funcs.rc_type = VPX_RC_GOP; + rc_funcs.create_model = rc_create_model_gop; + rc_funcs.send_firstpass_stats = 
rc_send_firstpass_stats_gop; + rc_funcs.get_gop_decision = rc_get_gop_decision; + rc_funcs.update_encodeframe_result = rc_update_encodeframe_result_gop; + rc_funcs.delete_model = rc_delete_model; + rc_funcs.priv = reinterpret_cast(PrivMagicNumber); + encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); + } + } +}; + +TEST_F(ExtRateCtrlTestGOP, EncodeTest) { + cfg_.rc_target_bitrate = 4000; + cfg_.g_lag_in_frames = kMaxLagInFrames; + cfg_.rc_end_usage = VPX_VBR; + + std::unique_ptr video; + video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( + "noisy_clip_640_360.y4m", VPX_IMG_FMT_I420, 640, 360, 30, 1, 0, + kFrameNumGOP)); + + ASSERT_NE(video.get(), nullptr); + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); +} + } // namespace diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 89b7c8e246..6d807b8abf 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4488,7 +4488,8 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest } } #endif // CONFIG_RATE_CTRL - if (cpi->ext_ratectrl.ready && !ext_rc_recode) { + if (cpi->ext_ratectrl.ready && !ext_rc_recode && + cpi->ext_ratectrl.funcs.rc_type == VPX_RC_QP) { vpx_codec_err_t codec_status; const GF_GROUP *gf_group = &cpi->twopass.gf_group; vpx_rc_encodeframe_decision_t encode_frame_decision; @@ -4548,7 +4549,8 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1; } - if (cpi->ext_ratectrl.ready) { + if (cpi->ext_ratectrl.ready && + cpi->ext_ratectrl.funcs.rc_type == VPX_RC_QP) { last_q_attempt = q; // In general, for the external rate control, we take the qindex provided // as input and encode the frame with this qindex faithfully. However, @@ -5590,7 +5592,7 @@ static void encode_frame_to_data_rate( // build the bitstream vp9_pack_bitstream(cpi, dest, size); - { + if (cpi->ext_ratectrl.ready) { const RefCntBuffer *coded_frame_buf = get_ref_cnt_buffer(cm, cm->new_fb_idx); vpx_codec_err_t codec_status = vp9_extrc_update_encodeframe_result( diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index 67f58329cc..48c90913ee 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -172,7 +172,7 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result( if (ext_ratectrl == NULL) { return VPX_CODEC_INVALID_PARAM; } - if (ext_ratectrl->ready && ext_ratectrl->funcs.rc_type == VPX_RC_QP) { + if (ext_ratectrl->ready) { PSNR_STATS psnr; vpx_rc_status_t rc_status; vpx_rc_encodeframe_result_t encode_frame_result; @@ -198,3 +198,34 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result( } return VPX_CODEC_OK; } + +vpx_codec_err_t vp9_extrc_get_gop_decision( + EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info, + vpx_rc_gop_decision_t *gop_decision) { + if (ext_ratectrl == NULL) { + return VPX_CODEC_INVALID_PARAM; + } + if (ext_ratectrl->ready && ext_ratectrl->funcs.rc_type == VPX_RC_GOP) { + vpx_rc_status_t rc_status; + rc_status = ext_ratectrl->funcs.get_gop_decision(ext_ratectrl->model, + gop_info, gop_decision); + if (gop_decision->use_alt_ref) { + const int arf_constraint = + gop_decision->gop_coding_frames >= gop_info->min_gf_interval && + gop_decision->gop_coding_frames < gop_info->lag_in_frames; + if (!arf_constraint || !gop_info->allow_alt_ref) return VPX_CODEC_ERROR; + } + // TODO(chengchen): Take min and max gf interval from the model + // and overwrite libvpx's decision so that we can get rid + // of one of the checks 
here. + if (gop_decision->gop_coding_frames > gop_info->frames_to_key || + gop_decision->gop_coding_frames - gop_decision->use_alt_ref > + gop_info->max_gf_interval) { + return VPX_CODEC_ERROR; + } + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } + } + return VPX_CODEC_OK; +} diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index 74fd68b96d..b46b776b91 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -45,4 +45,8 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result( const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth, uint32_t input_bit_depth, const int actual_encoding_qindex); +vpx_codec_err_t vp9_extrc_get_gop_decision( + EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info, + vpx_rc_gop_decision_t *gop_decision); + #endif // VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_ diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 67302ed035..6e1f797f4f 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -2714,6 +2714,9 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { // frame in which case it will already have been done. if (is_key_frame == 0) { vp9_zero(twopass->gf_group); + ++rc->gop_id; + } else { + rc->gop_id = 0; } vpx_clear_system_state(); @@ -2751,6 +2754,35 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { } } #endif + // If the external rate control model for GOP is used, the gop decisions + // are overwritten. Specifically, |gop_coding_frames| and |use_alt_ref| + // will be overwritten. + if (cpi->ext_ratectrl.ready && + cpi->ext_ratectrl.funcs.rc_type == VPX_RC_GOP) { + vpx_codec_err_t codec_status; + vpx_rc_gop_decision_t gop_decision; + vpx_rc_gop_info_t gop_info; + gop_info.min_gf_interval = active_gf_interval.min; + gop_info.max_gf_interval = active_gf_interval.max; + gop_info.allow_alt_ref = allow_alt_ref; + gop_info.is_key_frame = is_key_frame; + gop_info.last_gop_use_alt_ref = rc->source_alt_ref_active; + gop_info.frames_since_key = rc->frames_since_key; + gop_info.frames_to_key = rc->frames_to_key; + gop_info.lag_in_frames = cpi->oxcf.lag_in_frames; + gop_info.show_index = cm->current_video_frame; + gop_info.coding_index = cm->current_frame_coding_index; + gop_info.gop_id = rc->gop_id; + + codec_status = vp9_extrc_get_gop_decision(&cpi->ext_ratectrl, &gop_info, + &gop_decision); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_get_gop_decision() failed"); + } + gop_coding_frames = gop_decision.gop_coding_frames; + use_alt_ref = gop_decision.use_alt_ref; + } // Was the group length constrained by the requirement for a new KF? rc->constrained_gf_group = (gop_coding_frames >= rc->frames_to_key) ? 1 : 0; diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index 83a12cde73..42547d1a60 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -211,6 +211,10 @@ typedef struct { // Flag to constrain golden frame interval on key frame frequency for 1 pass // VBR. int constrain_gf_key_freq_onepass_vbr; + + // The id of the current GOP. Start from zero. + // When a key frame is inserted, it resets to zero. 
+  int gop_id;
 } RATE_CONTROL;

 struct VP9_COMP;

diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h
index 5b426b6cf0..e2c475a591 100644
--- a/vpx/vpx_ext_ratectrl.h
+++ b/vpx/vpx_ext_ratectrl.h
@@ -25,7 +25,7 @@ extern "C" {
  * types, removing or reassigning enums, adding/removing/rearranging
  * fields to structures.
  */
-#define VPX_EXT_RATECTRL_ABI_VERSION (2)
+#define VPX_EXT_RATECTRL_ABI_VERSION (3)

 /*!\brief The control type of the inference API.
  * In VPX_RC_QP mode, the external rate control model determines the
  * quantization parameter (QP) for each frame.
  * In VPX_RC_GOP mode, the external rate control model determines the
  * group of pictures (GOP) of the video sequence.
  */
 typedef enum vpx_rc_type { VPX_RC_QP = 0, VPX_RC_GOP = 1 } vpx_rc_type_t;
@@ -266,6 +266,32 @@ typedef struct vpx_rc_config {
   int frame_rate_den; /**< denominator of frame rate */
 } vpx_rc_config_t;

+/*!\brief Information passed to the external rate control model to
+ * help make GOP decisions.
+ */
+typedef struct vpx_rc_gop_info {
+  int min_gf_interval;      /**< minimum allowed gf interval */
+  int max_gf_interval;      /**< maximum allowed gf interval */
+  int allow_alt_ref;        /**< whether to allow the use of alt ref */
+  int is_key_frame;         /**< is the current frame a key frame */
+  int last_gop_use_alt_ref; /**< does the last gop use alt ref or not */
+  int frames_since_key;     /**< current frame distance to the last keyframe */
+  int frames_to_key;        /**< current frame distance to the next keyframe */
+  int lag_in_frames;        /**< number of lookahead source frames */
+  int show_index;           /**< display index of this frame, starts from zero */
+  int coding_index;         /**< coding index of this frame, starts from zero */
+  int gop_id;               /**< the id of the current gop, starts from zero,
+                                 resets to zero when a keyframe is set */
+} vpx_rc_gop_info_t;
+
+/*!\brief The decision made by the external rate control model to set the
+ * group of pictures.
+ */
+typedef struct vpx_rc_gop_decision {
+  int gop_coding_frames; /**< The number of frames of this GOP */
+  int use_alt_ref;       /**< Whether to use alt ref for this GOP */
+} vpx_rc_gop_decision_t;
+
 /*!\brief Create an external rate control model callback prototype
  *
  * This callback is invoked by the encoder to create an external rate control
@@ -318,6 +344,19 @@ typedef vpx_rc_status_t (*vpx_rc_update_encodeframe_result_cb_fn_t)(
     vpx_rc_model_t rate_ctrl_model,
     const vpx_rc_encodeframe_result_t *encode_frame_result);

+/*!\brief Get the GOP structure from the external rate control model.
+ *
+ * This callback is invoked by the encoder to get GOP decisions from
+ * the external rate control model.
+ *
+ * \param[in]  rate_ctrl_model  rate control model
+ * \param[in]  gop_info         information collected from the encoder
+ * \param[out] gop_decision     GOP decision from the model
+ */
+typedef vpx_rc_status_t (*vpx_rc_get_gop_decision_cb_fn_t)(
+    vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info,
+    vpx_rc_gop_decision_t *gop_decision);
+
 /*!\brief Delete the external rate control model callback prototype
  *
  * This callback is invoked by the encoder to delete the external rate control
@@ -355,6 +394,10 @@ typedef struct vpx_rc_funcs {
    * Update encodeframe result to the external rate control model.
    */
   vpx_rc_update_encodeframe_result_cb_fn_t update_encodeframe_result;
+  /*!
+   * Get GOP decisions from the external rate control model.
+   */
+  vpx_rc_get_gop_decision_cb_fn_t get_gop_decision;
   /*!
    * Delete the external rate control model.
*/ From c304ec38d05040b74de4aacada62c4a336714341 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 27 May 2022 19:36:47 -0700 Subject: [PATCH 310/926] test/*: normalize use of nullptr this is preferred over NULL in C++11 Change-Id: Ic48ddcc6dfb8975a57f6713549ad04d93db21415 --- test/buffer.h | 14 +++++++------- test/codec_factory.h | 16 ++++++++-------- test/decode_test_driver.h | 2 +- test/encode_test_driver.h | 2 +- test/ivf_video_source.h | 14 +++++++------- test/register_state_check.h | 2 +- test/video_source.h | 22 ++++++++++++---------- test/webm_video_source.h | 14 ++++++++------ test/y4m_test.cc | 4 ++-- test/y4m_video_source.h | 16 ++++++++-------- test/yuv_video_source.h | 16 +++++++++------- 11 files changed, 64 insertions(+), 58 deletions(-) diff --git a/test/buffer.h b/test/buffer.h index b003d2f0d0..023939cedf 100644 --- a/test/buffer.h +++ b/test/buffer.h @@ -31,7 +31,7 @@ class Buffer { : width_(width), height_(height), top_padding_(top_padding), left_padding_(left_padding), right_padding_(right_padding), bottom_padding_(bottom_padding), alignment_(0), padding_value_(0), - stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(NULL) {} + stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(nullptr) {} Buffer(int width, int height, int top_padding, int left_padding, int right_padding, int bottom_padding, unsigned int alignment) @@ -39,19 +39,19 @@ class Buffer { left_padding_(left_padding), right_padding_(right_padding), bottom_padding_(bottom_padding), alignment_(alignment), padding_value_(0), stride_(0), raw_size_(0), num_elements_(0), - raw_buffer_(NULL) {} + raw_buffer_(nullptr) {} Buffer(int width, int height, int padding) : width_(width), height_(height), top_padding_(padding), left_padding_(padding), right_padding_(padding), bottom_padding_(padding), alignment_(0), padding_value_(0), stride_(0), - raw_size_(0), num_elements_(0), raw_buffer_(NULL) {} + raw_size_(0), num_elements_(0), raw_buffer_(nullptr) {} Buffer(int width, int height, int padding, unsigned int alignment) : width_(width), height_(height), top_padding_(padding), left_padding_(padding), right_padding_(padding), bottom_padding_(padding), alignment_(alignment), padding_value_(0), - stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(NULL) {} + stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(nullptr) {} ~Buffer() { if (alignment_) { @@ -103,7 +103,7 @@ class Buffer { bool CheckValues(const Buffer &a) const; bool Init() { - if (raw_buffer_ != NULL) return false; + if (raw_buffer_ != nullptr) return false; EXPECT_GT(width_, 0); EXPECT_GT(height_, 0); EXPECT_GE(top_padding_, 0); @@ -126,7 +126,7 @@ class Buffer { } else { raw_buffer_ = new (std::nothrow) T[num_elements_]; } - EXPECT_TRUE(raw_buffer_ != NULL); + EXPECT_NE(raw_buffer_, nullptr); SetPadding(std::numeric_limits::max()); return !::testing::Test::HasFailure(); } @@ -150,7 +150,7 @@ class Buffer { template T *Buffer::TopLeftPixel() const { - if (!raw_buffer_) return NULL; + if (!raw_buffer_) return nullptr; return raw_buffer_ + (top_padding_ * stride_) + left_padding_; } diff --git a/test/codec_factory.h b/test/codec_factory.h index 77ce49de9f..96092610c6 100644 --- a/test/codec_factory.h +++ b/test/codec_factory.h @@ -88,7 +88,7 @@ class VP8Decoder : public Decoder { #if CONFIG_VP8_DECODER return &vpx_codec_vp8_dx_algo; #else - return NULL; + return nullptr; #endif } }; @@ -104,7 +104,7 @@ class VP8Encoder : public Encoder { #if CONFIG_VP8_ENCODER return &vpx_codec_vp8_cx_algo; #else - return NULL; + return nullptr; #endif } }; @@ 
-124,7 +124,7 @@ class VP8CodecFactory : public CodecFactory { #else (void)cfg; (void)flags; - return NULL; + return nullptr; #endif } @@ -139,7 +139,7 @@ class VP8CodecFactory : public CodecFactory { (void)deadline; (void)init_flags; (void)stats; - return NULL; + return nullptr; #endif } @@ -184,7 +184,7 @@ class VP9Decoder : public Decoder { #if CONFIG_VP9_DECODER return &vpx_codec_vp9_dx_algo; #else - return NULL; + return nullptr; #endif } }; @@ -200,7 +200,7 @@ class VP9Encoder : public Encoder { #if CONFIG_VP9_ENCODER return &vpx_codec_vp9_cx_algo; #else - return NULL; + return nullptr; #endif } }; @@ -220,7 +220,7 @@ class VP9CodecFactory : public CodecFactory { #else (void)cfg; (void)flags; - return NULL; + return nullptr; #endif } @@ -235,7 +235,7 @@ class VP9CodecFactory : public CodecFactory { (void)deadline; (void)init_flags; (void)stats; - return NULL; + return nullptr; #endif } diff --git a/test/decode_test_driver.h b/test/decode_test_driver.h index 04876cdd7c..f446ab4664 100644 --- a/test/decode_test_driver.h +++ b/test/decode_test_driver.h @@ -24,7 +24,7 @@ class CompressedVideoSource; class DxDataIterator { public: explicit DxDataIterator(vpx_codec_ctx_t *decoder) - : decoder_(decoder), iter_(NULL) {} + : decoder_(decoder), iter_(nullptr) {} const vpx_image_t *Next() { return vpx_codec_get_frame(decoder_, &iter_); } diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index 38c61952eb..7085945f6a 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -49,7 +49,7 @@ enum TestMode { class CxDataIterator { public: explicit CxDataIterator(vpx_codec_ctx_t *encoder) - : encoder_(encoder), iter_(NULL) {} + : encoder_(encoder), iter_(nullptr) {} const vpx_codec_cx_pkt_t *Next() { return vpx_codec_get_cx_data(encoder_, &iter_); diff --git a/test/ivf_video_source.h b/test/ivf_video_source.h index 22c05ecde9..a8ac4f154c 100644 --- a/test/ivf_video_source.h +++ b/test/ivf_video_source.h @@ -29,8 +29,9 @@ static unsigned int MemGetLe32(const uint8_t *mem) { class IVFVideoSource : public CompressedVideoSource { public: explicit IVFVideoSource(const std::string &file_name) - : file_name_(file_name), input_file_(NULL), compressed_frame_buf_(NULL), - frame_sz_(0), frame_(0), end_of_file_(false) {} + : file_name_(file_name), input_file_(nullptr), + compressed_frame_buf_(nullptr), frame_sz_(0), frame_(0), + end_of_file_(false) {} virtual ~IVFVideoSource() { delete[] compressed_frame_buf_; @@ -41,13 +42,12 @@ class IVFVideoSource : public CompressedVideoSource { virtual void Init() { // Allocate a buffer for read in the compressed video frame. compressed_frame_buf_ = new uint8_t[libvpx_test::kCodeBufferSize]; - ASSERT_TRUE(compressed_frame_buf_ != NULL) - << "Allocate frame buffer failed"; + ASSERT_NE(compressed_frame_buf_, nullptr) << "Allocate frame buffer failed"; } virtual void Begin() { input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_ != NULL) + ASSERT_NE(input_file_, nullptr) << "Input file open failed. Filename: " << file_name_; // Read file header @@ -68,7 +68,7 @@ class IVFVideoSource : public CompressedVideoSource { } void FillFrame() { - ASSERT_TRUE(input_file_ != NULL); + ASSERT_NE(input_file_, nullptr); uint8_t frame_hdr[kIvfFrameHdrSize]; // Check frame header and read a frame from input_file. if (fread(frame_hdr, 1, kIvfFrameHdrSize, input_file_) != @@ -87,7 +87,7 @@ class IVFVideoSource : public CompressedVideoSource { } virtual const uint8_t *cxdata() const { - return end_of_file_ ? 
NULL : compressed_frame_buf_; + return end_of_file_ ? nullptr : compressed_frame_buf_; } virtual size_t frame_size() const { return frame_sz_; } virtual unsigned int frame_number() const { return frame_; } diff --git a/test/register_state_check.h b/test/register_state_check.h index 1746240c61..0b837dd042 100644 --- a/test/register_state_check.h +++ b/test/register_state_check.h @@ -56,7 +56,7 @@ class RegisterStateCheck { private: static bool StoreRegisters(CONTEXT *const context) { const HANDLE this_thread = GetCurrentThread(); - EXPECT_TRUE(this_thread != NULL); + EXPECT_NE(this_thread, nullptr); context->ContextFlags = CONTEXT_FLOATING_POINT; const bool context_saved = GetThreadContext(this_thread, context) == TRUE; EXPECT_TRUE(context_saved) << "GetLastError: " << GetLastError(); diff --git a/test/video_source.h b/test/video_source.h index 349e3de37c..a10ff6fb09 100644 --- a/test/video_source.h +++ b/test/video_source.h @@ -42,7 +42,7 @@ namespace libvpx_test { // A simple function to encapsulate cross platform retrieval of test data path static std::string GetDataPath() { const char *const data_path = getenv("LIBVPX_TEST_DATA_PATH"); - if (data_path == NULL) { + if (data_path == nullptr) { #ifdef LIBVPX_TEST_DATA_PATH // In some environments, we cannot set environment variables // Instead, we set the data path by using a preprocessor symbol @@ -76,10 +76,10 @@ static FILE *GetTempOutFile(std::string *file_name) { return fopen(fname, "wb+"); } } - return NULL; + return nullptr; #else std::string temp_dir = testing::TempDir(); - if (temp_dir.empty()) return NULL; + if (temp_dir.empty()) return nullptr; // Versions of testing::TempDir() prior to release-1.11.0-214-g5e6a5336 may // use the value of an environment variable without checking for a trailing // path delimiter. @@ -87,12 +87,12 @@ static FILE *GetTempOutFile(std::string *file_name) { const char name_template[] = "libvpxtest.XXXXXX"; std::unique_ptr temp_file_name( new char[temp_dir.size() + sizeof(name_template)]); - if (temp_file_name == nullptr) return NULL; + if (temp_file_name == nullptr) return nullptr; memcpy(temp_file_name.get(), temp_dir.data(), temp_dir.size()); memcpy(temp_file_name.get() + temp_dir.size(), name_template, sizeof(name_template)); const int fd = mkstemp(temp_file_name.get()); - if (fd == -1) return NULL; + if (fd == -1) return nullptr; *file_name = temp_file_name.get(); return fdopen(fd, "wb+"); #endif @@ -114,7 +114,7 @@ class TempOutFile { void CloseFile() { if (file_) { fclose(file_); - file_ = NULL; + file_ = nullptr; } } FILE *file_; @@ -133,7 +133,7 @@ class VideoSource { // Advance the cursor to the next frame virtual void Next() = 0; - // Get the current video frame, or NULL on End-Of-Stream. + // Get the current video frame, or nullptr on End-Of-Stream. virtual vpx_image_t *img() const = 0; // Get the presentation timestamp of the current frame. @@ -155,7 +155,7 @@ class VideoSource { class DummyVideoSource : public VideoSource { public: DummyVideoSource() - : img_(NULL), limit_(100), width_(80), height_(64), + : img_(nullptr), limit_(100), width_(80), height_(64), format_(VPX_IMG_FMT_I420) { ReallocImage(); } @@ -172,7 +172,9 @@ class DummyVideoSource : public VideoSource { FillFrame(); } - virtual vpx_image_t *img() const { return (frame_ < limit_) ? img_ : NULL; } + virtual vpx_image_t *img() const { + return (frame_ < limit_) ? img_ : nullptr; + } // Models a stream where Timebase = 1/FPS, so pts == frame. 
virtual vpx_codec_pts_t pts() const { return frame_; } @@ -212,7 +214,7 @@ class DummyVideoSource : public VideoSource { void ReallocImage() { vpx_img_free(img_); - img_ = vpx_img_alloc(NULL, format_, width_, height_, 32); + img_ = vpx_img_alloc(nullptr, format_, width_, height_, 32); ASSERT_NE(img_, nullptr); raw_sz_ = ((img_->w + 31) & ~31u) * img_->h * img_->bps / 8; } diff --git a/test/webm_video_source.h b/test/webm_video_source.h index 6f55f7db7c..d245926298 100644 --- a/test/webm_video_source.h +++ b/test/webm_video_source.h @@ -26,11 +26,11 @@ class WebMVideoSource : public CompressedVideoSource { public: explicit WebMVideoSource(const std::string &file_name) : file_name_(file_name), vpx_ctx_(new VpxInputContext()), - webm_ctx_(new WebmInputContext()), buf_(NULL), buf_sz_(0), frame_(0), + webm_ctx_(new WebmInputContext()), buf_(nullptr), buf_sz_(0), frame_(0), end_of_file_(false) {} virtual ~WebMVideoSource() { - if (vpx_ctx_->file != NULL) fclose(vpx_ctx_->file); + if (vpx_ctx_->file != nullptr) fclose(vpx_ctx_->file); webm_free(webm_ctx_); delete vpx_ctx_; delete webm_ctx_; @@ -40,7 +40,7 @@ class WebMVideoSource : public CompressedVideoSource { virtual void Begin() { vpx_ctx_->file = OpenTestDataFile(file_name_); - ASSERT_TRUE(vpx_ctx_->file != NULL) + ASSERT_NE(vpx_ctx_->file, nullptr) << "Input file open failed. Filename: " << file_name_; ASSERT_EQ(file_is_webm(webm_ctx_, vpx_ctx_), 1) << "file is not WebM"; @@ -54,7 +54,7 @@ class WebMVideoSource : public CompressedVideoSource { } void FillFrame() { - ASSERT_TRUE(vpx_ctx_->file != NULL); + ASSERT_NE(vpx_ctx_->file, nullptr); const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_); ASSERT_GE(status, 0) << "webm_read_frame failed"; if (status == 1) { @@ -63,7 +63,7 @@ class WebMVideoSource : public CompressedVideoSource { } void SeekToNextKeyFrame() { - ASSERT_TRUE(vpx_ctx_->file != NULL); + ASSERT_NE(vpx_ctx_->file, nullptr); do { const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_); ASSERT_GE(status, 0) << "webm_read_frame failed"; @@ -74,7 +74,9 @@ class WebMVideoSource : public CompressedVideoSource { } while (!webm_ctx_->is_key_frame && !end_of_file_); } - virtual const uint8_t *cxdata() const { return end_of_file_ ? NULL : buf_; } + virtual const uint8_t *cxdata() const { + return end_of_file_ ? 
nullptr : buf_; + } virtual size_t frame_size() const { return buf_sz_; } virtual unsigned int frame_number() const { return frame_; } diff --git a/test/y4m_test.cc b/test/y4m_test.cc index 89c6552c5d..32f2cd51d3 100644 --- a/test/y4m_test.cc +++ b/test/y4m_test.cc @@ -202,7 +202,7 @@ TEST(Y4MHeaderTest, RegularHeader) { EXPECT_EQ(0, fseek(f.file(), 0, 0)); y4m_input y4m; - EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/NULL, + EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/nullptr, /*num_skip=*/0, /*only_420=*/0), 0); EXPECT_EQ(y4m.pic_w, 4); @@ -229,7 +229,7 @@ TEST(Y4MHeaderTest, LongHeader) { EXPECT_EQ(fseek(f.file(), 0, 0), 0); y4m_input y4m; - EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/NULL, + EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/nullptr, /*num_skip=*/0, /*only_420=*/0), 0); EXPECT_EQ(y4m.pic_w, 4); diff --git a/test/y4m_video_source.h b/test/y4m_video_source.h index 89aa2a44fc..71fbf31931 100644 --- a/test/y4m_video_source.h +++ b/test/y4m_video_source.h @@ -23,7 +23,7 @@ namespace libvpx_test { class Y4mVideoSource : public VideoSource { public: Y4mVideoSource(const std::string &file_name, unsigned int start, int limit) - : file_name_(file_name), input_file_(NULL), img_(new vpx_image_t()), + : file_name_(file_name), input_file_(nullptr), img_(new vpx_image_t()), start_(start), limit_(limit), frame_(0), framerate_numerator_(0), framerate_denominator_(0), y4m_() {} @@ -35,13 +35,13 @@ class Y4mVideoSource : public VideoSource { virtual void OpenSource() { CloseSource(); input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_ != NULL) + ASSERT_NE(input_file_, nullptr) << "Input file open failed. Filename: " << file_name_; } virtual void ReadSourceToStart() { - ASSERT_TRUE(input_file_ != NULL); - ASSERT_FALSE(y4m_input_open(&y4m_, input_file_, NULL, 0, 0)); + ASSERT_NE(input_file_, nullptr); + ASSERT_FALSE(y4m_input_open(&y4m_, input_file_, nullptr, 0, 0)); framerate_numerator_ = y4m_.fps_n; framerate_denominator_ = y4m_.fps_d; frame_ = 0; @@ -62,7 +62,7 @@ class Y4mVideoSource : public VideoSource { } virtual vpx_image_t *img() const { - return (frame_ < limit_) ? img_.get() : NULL; + return (frame_ < limit_) ? img_.get() : nullptr; } // Models a stream where Timebase = 1/FPS, so pts == frame. @@ -80,7 +80,7 @@ class Y4mVideoSource : public VideoSource { virtual unsigned int limit() const { return limit_; } virtual void FillFrame() { - ASSERT_TRUE(input_file_ != NULL); + ASSERT_NE(input_file_, nullptr); // Read a frame from input_file. 
y4m_input_fetch_frame(&y4m_, input_file_, img_.get()); } @@ -101,9 +101,9 @@ class Y4mVideoSource : public VideoSource { void CloseSource() { y4m_input_close(&y4m_); y4m_ = y4m_input(); - if (input_file_ != NULL) { + if (input_file_ != nullptr) { fclose(input_file_); - input_file_ = NULL; + input_file_ = nullptr; } } diff --git a/test/yuv_video_source.h b/test/yuv_video_source.h index 383ab8f1b1..51948c0efb 100644 --- a/test/yuv_video_source.h +++ b/test/yuv_video_source.h @@ -27,8 +27,8 @@ class YUVVideoSource : public VideoSource { YUVVideoSource(const std::string &file_name, vpx_img_fmt format, unsigned int width, unsigned int height, int rate_numerator, int rate_denominator, unsigned int start, int limit) - : file_name_(file_name), input_file_(NULL), img_(NULL), start_(start), - limit_(limit), frame_(0), width_(0), height_(0), + : file_name_(file_name), input_file_(nullptr), img_(nullptr), + start_(start), limit_(limit), frame_(0), width_(0), height_(0), format_(VPX_IMG_FMT_NONE), framerate_numerator_(rate_numerator), framerate_denominator_(rate_denominator) { // This initializes format_, raw_size_, width_, height_ and allocates img. @@ -43,7 +43,7 @@ class YUVVideoSource : public VideoSource { virtual void Begin() { if (input_file_) fclose(input_file_); input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_ != NULL) + ASSERT_NE(input_file_, nullptr) << "Input file open failed. Filename: " << file_name_; if (start_) { fseek(input_file_, static_cast(raw_size_) * start_, SEEK_SET); @@ -58,7 +58,9 @@ class YUVVideoSource : public VideoSource { FillFrame(); } - virtual vpx_image_t *img() const { return (frame_ < limit_) ? img_ : NULL; } + virtual vpx_image_t *img() const { + return (frame_ < limit_) ? img_ : nullptr; + } // Models a stream where Timebase = 1/FPS, so pts == frame. virtual vpx_codec_pts_t pts() const { return frame_; } @@ -78,8 +80,8 @@ class YUVVideoSource : public VideoSource { vpx_img_fmt format) { if (width != width_ || height != height_ || format != format_) { vpx_img_free(img_); - img_ = vpx_img_alloc(NULL, format, width, height, 1); - ASSERT_TRUE(img_ != NULL); + img_ = vpx_img_alloc(nullptr, format, width, height, 1); + ASSERT_NE(img_, nullptr); width_ = width; height_ = height; format_ = format; @@ -99,7 +101,7 @@ class YUVVideoSource : public VideoSource { } virtual void FillFrame() { - ASSERT_TRUE(input_file_ != NULL); + ASSERT_NE(input_file_, nullptr); // Read a frame from input_file. if (fread(img_->img_data, raw_size_, 1, input_file_) == 0) { limit_ = frame_; From d353916ab5e614595b6e816b2baa9d532e06c9bc Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 26 May 2022 19:38:01 -0700 Subject: [PATCH 311/926] libs.doxy_template: remove some obsolete variables - COLS_IN_ALPHA_INDEX this was unused given ALPHABETICAL_INDEX = NO - PERL_PATH / MSCGEN_PATH these were unused quiets warnings with doxygen 1.9.1: warning: Tag 'COLS_IN_ALPHA_INDEX' at line 1110 of file 'doxyfile' has become obsolete. warning: Tag 'PERL_PATH' at line 1105 of file 'doxyfile' has become obsolete. 
warning: Tag 'MSCGEN_PATH' at line 1126 of file 'doxyfile' has become obsolete Change-Id: I6229311afaa3318a3f9bcaf40fafcc5ea71ae271 --- libs.doxy_template | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/libs.doxy_template b/libs.doxy_template index 1eacc8fe2d..73e1b43c72 100644 --- a/libs.doxy_template +++ b/libs.doxy_template @@ -654,12 +654,6 @@ VERBATIM_HEADERS = YES ALPHABETICAL_INDEX = NO -# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then -# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns -# in which this list will be split (can be a number in the range [1..20]) - -COLS_IN_ALPHA_INDEX = 5 - # In case all classes in a project start with a common prefix, all # classes will be put under the same header in the alphabetical index. # The IGNORE_PREFIX tag can be used to specify one or more prefixes that @@ -1099,11 +1093,6 @@ ALLEXTERNALS = NO EXTERNAL_GROUPS = YES -# The PERL_PATH should be the absolute path and name of the perl script -# interpreter (i.e. the result of `which perl'). - -PERL_PATH = /usr/bin/perl - #--------------------------------------------------------------------------- # Configuration options related to the dot tool #--------------------------------------------------------------------------- @@ -1117,14 +1106,6 @@ PERL_PATH = /usr/bin/perl CLASS_DIAGRAMS = YES -# You can define message sequence charts within doxygen comments using the \msc -# command. Doxygen will then run the mscgen tool (see http://www.mcternan.me.uk/mscgen/) to -# produce the chart and insert it in the documentation. The MSCGEN_PATH tag allows you to -# specify the directory where the mscgen tool resides. If left empty the tool is assumed to -# be found in the default search path. - -MSCGEN_PATH = - # If set to YES, the inheritance and collaboration graphs will hide # inheritance and usage relations if the target is undocumented # or is not a class. From 8f56e1c074712ffa937dc48a14d4b01e378a170f Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 28 May 2022 15:26:29 -0700 Subject: [PATCH 312/926] resize_test: add TODO for test failure DISABLED_TestExternalResizeSmallerWidthBiggerSize was added for webm:1642, but never fixed Bug: webm:1642 Change-Id: I0fa368a44dda550241ea997068c58eaff551233c --- test/resize_test.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/resize_test.cc b/test/resize_test.cc index c57170ff9b..a71b2acb0c 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -578,6 +578,8 @@ TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) { } } +// TODO(https://crbug.com/webm/1642): This causes a segfault in +// init_encode_frame_mb_context(). 
TEST_P(ResizeRealtimeTest, DISABLED_TestExternalResizeSmallerWidthBiggerSize) { ResizingVideoSource video; video.flag_codec_ = true; From 9d279c88c3e8873c114298d69e919bfef45a1dab Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 28 May 2022 15:25:49 -0700 Subject: [PATCH 313/926] resize_test: add TODO for ResizeTest instantiation for VP9 this should match VP8 and use ONE_PASS_TEST_MODES, but currently the code will produce integer sanitizer warnings and may segfault under certain conditions Bug: webm:1767,webm:1768 Change-Id: I6482ff1862f19716fde3d57522591bc61d76a84f --- test/resize_test.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/resize_test.cc b/test/resize_test.cc index a71b2acb0c..ccee614070 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -796,6 +796,9 @@ TEST_P(ResizeCspTest, TestResizeCspWorks) { } VP8_INSTANTIATE_TEST_SUITE(ResizeTest, ONE_PASS_TEST_MODES); +// TODO(https://crbug.com/webm/1767,https://crbug.com/webm/1768): VP9 should +// use ONE_PASS_TEST_MODES for the ResizeTest instantiation after integer +// sanitizer warnings and segfault are fixed. VP9_INSTANTIATE_TEST_SUITE(ResizeTest, ::testing::Values(::libvpx_test::kRealTime)); VP9_INSTANTIATE_TEST_SUITE(ResizeInternalTest, From 365eebc147627ae83ec8b36077198d8cfb5e0128 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 27 May 2022 21:50:11 -0700 Subject: [PATCH 314/926] vp8e_set_config: setjmp before calling vp8_change_config vp8_change_config may call vp8_alloc_compressor_data which expects failures detected by CHECK_MEM_ERROR to not return. Change-Id: Ib7fbf4af904bd9b539402bb61c8f87855eef2ad6 --- vp8/vp8_cx_iface.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 21fed0e8ed..340f3e6638 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -473,14 +473,23 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx, ERROR("Cannot increase lag_in_frames"); res = validate_config(ctx, cfg, &ctx->vp8_cfg, 0); + if (res != VPX_CODEC_OK) return res; - if (!res) { - ctx->cfg = *cfg; - set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); - vp8_change_config(ctx->cpi, &ctx->oxcf); + if (setjmp(ctx->cpi->common.error.jmp)) { + const vpx_codec_err_t codec_err = + update_error_state(ctx, &ctx->cpi->common.error); + ctx->cpi->common.error.setjmp = 0; + vpx_clear_system_state(); + assert(codec_err != VPX_CODEC_OK); + return codec_err; } - return res; + ctx->cpi->common.error.setjmp = 1; + ctx->cfg = *cfg; + set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); + vp8_change_config(ctx->cpi, &ctx->oxcf); + ctx->cpi->common.error.setjmp = 0; + return VPX_CODEC_OK; } static vpx_codec_err_t get_quantizer(vpx_codec_alg_priv_t *ctx, va_list args) { From 3997d9bc6286ba075879353b87678986cdbfa347 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 27 May 2022 21:50:11 -0700 Subject: [PATCH 315/926] vp9e_set_config: setjmp before calling vp9_change_config vp9_change_config may call functions that perform allocations which expect failures detected by CHECK_MEM_ERROR to not return. 
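For background, this change and its VP8 counterpart (the previous patch) rely
on libvpx's convention that allocation failures inside the encoder are
reported by longjmp()ing to a jump buffer held in the codec's error state
rather than by returning an error code, so a caller must arm that buffer with
setjmp() before entering vp8_change_config()/vp9_change_config(). The sketch
below uses made-up names (error_info, alloc_or_die, set_config), not the real
libvpx types; it only shows the shape of the pattern. It also shows why a
local modified between setjmp() and a possible longjmp() must be declared
volatile (as force_key is in the vp9 diff that follows): without the
qualifier its value is indeterminate after the jump.

    #include <setjmp.h>
    #include <stdlib.h>

    struct error_info {
      jmp_buf jmp;      /* longjmp() target for fatal errors */
      int setjmp_armed; /* nonzero once jmp is valid */
    };

    /* Callee convention: on allocation failure, unwind instead of
     * returning, mirroring what CHECK_MEM_ERROR expects. */
    static void *alloc_or_die(struct error_info *err, size_t n) {
      void *p = malloc(n);
      if (p == NULL) {
        if (err->setjmp_armed) longjmp(err->jmp, 1); /* does not return */
        abort(); /* no handler armed */
      }
      return p;
    }

    static void change_config(struct error_info *err) {
      void *buf = alloc_or_die(err, (size_t)1 << 20); /* may longjmp() */
      free(buf);
    }

    /* Caller convention: arm the jump buffer before the risky call. */
    static int set_config(struct error_info *err) {
      volatile int force_key = 0; /* written after setjmp(), so volatile */
      if (setjmp(err->jmp)) { /* re-entered when a callee longjmp()s */
        err->setjmp_armed = 0;
        /* force_key may be read here only because it is volatile */
        return -1;
      }
      err->setjmp_armed = 1;
      force_key = 1;
      change_config(err); /* failures unwind to the setjmp() above */
      err->setjmp_armed = 0;
      return 0;
    }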
Change-Id: I1dd1eca9c661ed157d51b4a6a77fc9f88236d794 --- vp9/vp9_cx_iface.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index b809ab3e6f..63d8f44878 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -780,7 +780,7 @@ static vpx_codec_err_t set_twopass_params_from_config( static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg) { vpx_codec_err_t res; - int force_key = 0; + volatile int force_key = 0; if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) { if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS) @@ -799,19 +799,28 @@ static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, ERROR("Cannot increase lag_in_frames"); res = validate_config(ctx, cfg, &ctx->extra_cfg); + if (res != VPX_CODEC_OK) return res; - if (res == VPX_CODEC_OK) { - ctx->cfg = *cfg; - set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); - set_twopass_params_from_config(&ctx->cfg, ctx->cpi); - // On profile change, request a key frame - force_key |= ctx->cpi->common.profile != ctx->oxcf.profile; - vp9_change_config(ctx->cpi, &ctx->oxcf); + if (setjmp(ctx->cpi->common.error.jmp)) { + const vpx_codec_err_t codec_err = + update_error_state(ctx, &ctx->cpi->common.error); + ctx->cpi->common.error.setjmp = 0; + vpx_clear_system_state(); + assert(codec_err != VPX_CODEC_OK); + return codec_err; } + ctx->cfg = *cfg; + set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); + set_twopass_params_from_config(&ctx->cfg, ctx->cpi); + // On profile change, request a key frame + force_key |= ctx->cpi->common.profile != ctx->oxcf.profile; + vp9_change_config(ctx->cpi, &ctx->oxcf); + if (force_key) ctx->next_frame_flags |= VPX_EFLAG_FORCE_KF; - return res; + ctx->cpi->common.error.setjmp = 0; + return VPX_CODEC_OK; } static vpx_codec_err_t ctrl_get_quantizer(vpx_codec_alg_priv_t *ctx, From 6549e76307631de7e37459fceb23b4eee4573620 Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 28 May 2022 15:24:37 -0700 Subject: [PATCH 316/926] vp9_change_config: check vp9_alloc_loop_filter return Change-Id: I4cba67a5ab192d1cf1dbfb5c039a93a4952b071e --- vp9/encoder/vp9_encoder.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 6d807b8abf..a511aa7645 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2057,7 +2057,10 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { cpi->external_resize = 0; } else if (cm->mi_alloc_size == new_mi_size && (cpi->oxcf.width > last_w || cpi->oxcf.height > last_h)) { - vp9_alloc_loop_filter(cm); + if (vp9_alloc_loop_filter(cm)) { + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate loop filter data"); + } } } From 3dc6aa01bacc9818d4ccc0ee0f1b691ae0ec0315 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 1 Jun 2022 18:55:10 -0700 Subject: [PATCH 317/926] vp9,encoder: fix some integer sanitizer warnings the issues fixed in this change are related to implicit conversions between int / unsigned int: vp9/encoder/vp9_segmentation.c:42:36: runtime error: implicit conversion from type 'int' of value -9 (32-bit, signed) to type 'unsigned int' changed the value to 4294967287 (32-bit, unsigned) vpx_dsp/x86/sum_squares_sse2.c:36:52: runtime error: implicit conversion from type 'unsigned int' of value 4294967295 (32-bit, unsigned) to type 'int' changed the value to -1 (32-bit, signed) 
vpx_dsp/x86/sum_squares_sse2.c:36:67: runtime error: implicit conversion from type 'unsigned int' of value 4294967295 (32-bit, unsigned) to type 'int' changed the value to -1 (32-bit, signed) vp9/encoder/x86/vp9_diamond_search_sad_avx.c:81:45: runtime error: implicit conversion from type 'uint32_t' (aka 'unsigned int') of value 4290576316 (32-bit, unsigned) to type 'int' changed the value to -4390980 (32-bit, signed) vp9/encoder/vp9_rdopt.c:3472:31: runtime error: implicit conversion from type 'int' of value -1024 (32-bit, signed) to type 'uint16_t' (aka 'unsigned short') changed the value to 64512 (16-bit, unsigned) unsigned is forced for masks and int is used with intel intrinsics Bug: webm:1767 Change-Id: Icfa4179e13bc98a36ac29586b60d65819d3ce9ee Fixed: webm:1767 --- test/resize_test.cc | 5 ++--- vp9/encoder/vp9_rdopt.c | 2 +- vp9/encoder/vp9_segmentation.c | 2 +- vp9/encoder/x86/vp9_diamond_search_sad_avx.c | 10 +++++----- vpx_dsp/x86/sum_squares_sse2.c | 2 +- 5 files changed, 10 insertions(+), 11 deletions(-) diff --git a/test/resize_test.cc b/test/resize_test.cc index ccee614070..212ff46975 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -796,9 +796,8 @@ TEST_P(ResizeCspTest, TestResizeCspWorks) { } VP8_INSTANTIATE_TEST_SUITE(ResizeTest, ONE_PASS_TEST_MODES); -// TODO(https://crbug.com/webm/1767,https://crbug.com/webm/1768): VP9 should -// use ONE_PASS_TEST_MODES for the ResizeTest instantiation after integer -// sanitizer warnings and segfault are fixed. +// TODO(https://crbug.com/webm/1768): VP9 should use ONE_PASS_TEST_MODES for +// the ResizeTest instantiation after segfault is fixed. VP9_INSTANTIATE_TEST_SUITE(ResizeTest, ::testing::Values(::libvpx_test::kRealTime)); VP9_INSTANTIATE_TEST_SUITE(ResizeInternalTest, diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 0171a05720..3b574ef172 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -3470,7 +3470,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, } mode_skip_mask[INTRA_FRAME] |= - ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]); + (uint16_t) ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]); for (i = 0; i <= LAST_NEW_MV_INDEX; ++i) mode_threshold[i] = 0; diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c index a163297e6e..d75488a8e6 100644 --- a/vp9/encoder/vp9_segmentation.c +++ b/vp9/encoder/vp9_segmentation.c @@ -39,7 +39,7 @@ void vp9_set_segment_data(struct segmentation *seg, signed char *feature_data, } void vp9_disable_segfeature(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id) { - seg->feature_mask[segment_id] &= ~(1 << feature_id); + seg->feature_mask[segment_id] &= ~(1u << feature_id); } void vp9_clear_segdata(struct segmentation *seg, int segment_id, diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c index fcf50eb2a7..0e04a2f41f 100644 --- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c +++ b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c @@ -76,9 +76,9 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, int *num00, const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv) { const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max); - const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int); + const __m128i v_max_mv_w = _mm_set1_epi32((int)maxmv.as_int); const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min); - const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int); + const __m128i v_min_mv_w = 
_mm_set1_epi32((int)minmv.as_int); const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit); @@ -96,14 +96,14 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); - const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int); + const __m128i vfcmv = _mm_set1_epi32((int)fcenter_mv.as_int); const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); int_mv bmv = pack_int_mv(ref_row, ref_col); int_mv new_bmv = bmv; - __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int); + __m128i v_bmv_w = _mm_set1_epi32((int)bmv.as_int); const int what_stride = x->plane[0].src.stride; const int in_what_stride = x->e_mbd.plane[0].pre[0].stride; @@ -300,7 +300,7 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, bmv = new_bmv; best_address = new_best_address; - v_bmv_w = _mm_set1_epi32(bmv.as_int); + v_bmv_w = _mm_set1_epi32((int)bmv.as_int); #if VPX_ARCH_X86_64 v_ba_q = _mm_set1_epi64x((intptr_t)best_address); #else diff --git a/vpx_dsp/x86/sum_squares_sse2.c b/vpx_dsp/x86/sum_squares_sse2.c index 14f3b35c01..df6514b2c4 100644 --- a/vpx_dsp/x86/sum_squares_sse2.c +++ b/vpx_dsp/x86/sum_squares_sse2.c @@ -33,7 +33,7 @@ uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) { } else { // Generic case int r = size; - const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); + const __m128i v_zext_mask_q = _mm_set_epi32(0, -1, 0, -1); __m128i v_acc_q = _mm_setzero_si128(); assert(size % 8 == 0); From 4e7c56332de65eb191cef5644c010e309302a993 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Tue, 31 May 2022 11:07:15 -0700 Subject: [PATCH 318/926] L2E: Return error when GOP model is not set - Return error instead of OK when GOP model is not set. - Update descriptions for a few variables. 
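As context for the validation this patch tightens in
vp9_extrc_get_gop_decision() (see the diff below), the model side can satisfy
the encoder's checks with a callback as small as the following sketch. It is
illustrative only: the fixed 16-frame target is an arbitrary choice for the
example, not something this patch prescribes, and a real model would pick GOP
boundaries from the first pass stats. Type and field names follow
vpx/vpx_ext_ratectrl.h.

    #include "vpx/vpx_ext_ratectrl.h"

    /* Illustrative only: request fixed-length GOPs with no alt-ref frame,
     * clamped to the limits the encoder enforces. */
    static vpx_rc_status_t trivial_get_gop_decision(
        vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info,
        vpx_rc_gop_decision_t *gop_decision) {
      int gop_len = 16; /* arbitrary target for the sketch */
      (void)rate_ctrl_model;
      if (gop_len > gop_info->frames_to_key) gop_len = gop_info->frames_to_key;
      if (gop_len > gop_info->max_gf_interval) {
        gop_len = gop_info->max_gf_interval;
      }
      if (gop_len < 1) gop_len = 1;
      gop_decision->gop_coding_frames = gop_len;
      gop_decision->use_alt_ref = 0; /* sidesteps the alt-ref constraints */
      return VPX_RC_OK;
    }

With use_alt_ref left at zero, only the frames_to_key and max_gf_interval
checks on the encoder side apply, which is why the sketch clamps against just
those two fields.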
Change-Id: I213f6b7085c487507c3935e7ce615e807f4474cc --- test/vp9_ext_ratectrl_test.cc | 12 +++---- vp9/encoder/vp9_ext_ratectrl.c | 43 +++++++++++------------ vp9/encoder/vp9_firstpass.c | 6 ++-- vp9/encoder/vp9_ratectrl.h | 4 +-- vpx/vpx_ext_ratectrl.h | 64 ++++++++++++++++++++++++++-------- 5 files changed, 82 insertions(+), 47 deletions(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index e3e7afbf42..1289e2db88 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -44,7 +44,7 @@ struct ToyRateCtrl { int magic_number; int coding_index; - int gop_id; + int gop_index; int frames_since_key; int show_index; }; @@ -73,7 +73,7 @@ vpx_rc_status_t rc_create_model_gop(void *priv, ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl; if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR; toy_rate_ctrl->magic_number = kModelMagicNumber; - toy_rate_ctrl->gop_id = 0; + toy_rate_ctrl->gop_index = 0; toy_rate_ctrl->frames_since_key = 0; toy_rate_ctrl->show_index = 0; toy_rate_ctrl->coding_index = 0; @@ -198,13 +198,13 @@ vpx_rc_status_t rc_get_gop_decision(vpx_rc_model_t rate_ctrl_model, if (gop_info->is_key_frame) { EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); EXPECT_EQ(gop_info->frames_since_key, 0); - EXPECT_EQ(gop_info->gop_id, 0); - toy_rate_ctrl->gop_id = 0; + EXPECT_EQ(gop_info->gop_index, 0); + toy_rate_ctrl->gop_index = 0; toy_rate_ctrl->frames_since_key = 0; } else { EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1); } - EXPECT_EQ(gop_info->gop_id, toy_rate_ctrl->gop_id); + EXPECT_EQ(gop_info->gop_index, toy_rate_ctrl->gop_index); EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key); EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index); EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index); @@ -217,7 +217,7 @@ vpx_rc_status_t rc_get_gop_decision(vpx_rc_model_t rate_ctrl_model, toy_rate_ctrl->show_index += gop_decision->gop_coding_frames - gop_decision->use_alt_ref; toy_rate_ctrl->coding_index += gop_decision->gop_coding_frames; - ++toy_rate_ctrl->gop_id; + ++toy_rate_ctrl->gop_index; return VPX_RC_OK; } diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index 48c90913ee..ba57b86f60 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -202,30 +202,29 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result( vpx_codec_err_t vp9_extrc_get_gop_decision( EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info, vpx_rc_gop_decision_t *gop_decision) { - if (ext_ratectrl == NULL) { + vpx_rc_status_t rc_status; + if (ext_ratectrl == NULL || !ext_ratectrl->ready || + ext_ratectrl->funcs.rc_type != VPX_RC_GOP) { return VPX_CODEC_INVALID_PARAM; } - if (ext_ratectrl->ready && ext_ratectrl->funcs.rc_type == VPX_RC_GOP) { - vpx_rc_status_t rc_status; - rc_status = ext_ratectrl->funcs.get_gop_decision(ext_ratectrl->model, - gop_info, gop_decision); - if (gop_decision->use_alt_ref) { - const int arf_constraint = - gop_decision->gop_coding_frames >= gop_info->min_gf_interval && - gop_decision->gop_coding_frames < gop_info->lag_in_frames; - if (!arf_constraint || !gop_info->allow_alt_ref) return VPX_CODEC_ERROR; - } - // TODO(chengchen): Take min and max gf interval from the model - // and overwrite libvpx's decision so that we can get rid - // of one of the checks here. 
- if (gop_decision->gop_coding_frames > gop_info->frames_to_key || - gop_decision->gop_coding_frames - gop_decision->use_alt_ref > - gop_info->max_gf_interval) { - return VPX_CODEC_ERROR; - } - if (rc_status == VPX_RC_ERROR) { - return VPX_CODEC_ERROR; - } + rc_status = ext_ratectrl->funcs.get_gop_decision(ext_ratectrl->model, + gop_info, gop_decision); + if (gop_decision->use_alt_ref) { + const int arf_constraint = + gop_decision->gop_coding_frames >= gop_info->min_gf_interval && + gop_decision->gop_coding_frames < gop_info->lag_in_frames; + if (!arf_constraint || !gop_info->allow_alt_ref) return VPX_CODEC_ERROR; + } + // TODO(chengchen): Take min and max gf interval from the model + // and overwrite libvpx's decision so that we can get rid + // of one of the checks here. + if (gop_decision->gop_coding_frames > gop_info->frames_to_key || + gop_decision->gop_coding_frames - gop_decision->use_alt_ref > + gop_info->max_gf_interval) { + return VPX_CODEC_ERROR; + } + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; } return VPX_CODEC_OK; } diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 6e1f797f4f..2b3c174693 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -2714,9 +2714,9 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { // frame in which case it will already have been done. if (is_key_frame == 0) { vp9_zero(twopass->gf_group); - ++rc->gop_id; + ++rc->gop_index; } else { - rc->gop_id = 0; + rc->gop_index = 0; } vpx_clear_system_state(); @@ -2772,7 +2772,7 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { gop_info.lag_in_frames = cpi->oxcf.lag_in_frames; gop_info.show_index = cm->current_video_frame; gop_info.coding_index = cm->current_frame_coding_index; - gop_info.gop_id = rc->gop_id; + gop_info.gop_index = rc->gop_index; codec_status = vp9_extrc_get_gop_decision(&cpi->ext_ratectrl, &gop_info, &gop_decision); diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index 42547d1a60..48a21bd1cd 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -212,9 +212,9 @@ typedef struct { // VBR. int constrain_gf_key_freq_onepass_vbr; - // The id of the current GOP. Start from zero. + // The index of the current GOP. Start from zero. // When a key frame is inserted, it resets to zero. - int gop_id; + int gop_index; } RATE_CONTROL; struct VP9_COMP; diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index e2c475a591..db9af444c4 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -25,7 +25,7 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures. */ -#define VPX_EXT_RATECTRL_ABI_VERSION (3) +#define VPX_EXT_RATECTRL_ABI_VERSION (4) /*!\brief The control type of the inference API. * In VPX_RC_QP mode, the external rate control model determines the @@ -33,7 +33,7 @@ extern "C" { * In VPX_RC_GOP mode, the external rate control model determines the * group of picture (GOP) of the video sequence. */ -typedef enum vpx_rc_type { VPX_RC_QP = 0, VPX_RC_GOP = 1 } vpx_rc_type_t; +typedef enum vpx_rc_type { VPX_RC_QP = 1, VPX_RC_GOP = 2 } vpx_rc_type_t; /*!\brief Abstract rate control model handler * @@ -270,18 +270,54 @@ typedef struct vpx_rc_config { * help make GOP decisions. 
*/ typedef struct vpx_rc_gop_info { - int min_gf_interval; /**< mininum allowed gf interval */ - int max_gf_interval; /**< maximum allowed gf interval */ - int allow_alt_ref; /**< whether to allow the use of alt ref */ - int is_key_frame; /**< is the current frame a key frame */ - int last_gop_use_alt_ref; /**< does the last gop use alt ref or not */ - int frames_since_key; /**< current frame distance to the last keyframe */ - int frames_to_key; /**< current frame distance to the next keyframe */ - int lag_in_frames; /**< number of lookahead source frames */ - int show_index; /**< display index of this frame, starts from zero*/ - int coding_index; /**< coding index of this frame, starts from zero*/ - int gop_id; /**< the id of the current gop, starts from zero, resets to zero - when a keyframe is set*/ + /*! + * Minimum allowed gf interval, fixed for the whole clip. + */ + int min_gf_interval; + /*! + * Maximum allowed gf interval, fixed for the whole clip. + */ + int max_gf_interval; + /*! + * Whether to allow the use of alt ref, can be changed per gop. + */ + int allow_alt_ref; + /*! + * Is the current frame a key frame. + */ + int is_key_frame; + /*! + * Does the previous gop use alt ref or not. + */ + int last_gop_use_alt_ref; + /*! + * Current frame distance to the last keyframe, e.g., if Nth frame is a key, + * then the value of the N+1 th frame is 1. + */ + int frames_since_key; + /*! + * Current frame distance to the next keyframe, e.g. if Nth frame is a key, + * then the value of frame N - 1 is 1. + */ + int frames_to_key; + /*! + * Number of lookahead source frames. + */ + int lag_in_frames; + /*! + * Display index (temporal stamp) of this frame in the whole clip, + * starts from zero. + */ + int show_index; + /*! + * Coding index of this frame in the whole clip, starts from zero. + */ + int coding_index; + /*! + * The index of the current gop, starts from zero, resets to zero + * when a keyframe is set. 
+ */ + int gop_index; } vpx_rc_gop_info_t; /*!\brief The decision made by the external rate control model to set the From 36c9b2d6900f7decbf0f6d775f01c23248b714b4 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 2 Jun 2022 16:34:21 -0700 Subject: [PATCH 319/926] .gitignore: add android studio / vscode folders Change-Id: I039a96bc33f55d9ba8bca9f9f6b69135659d2351 --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 5f26835386..99eeb92d0b 100644 --- a/.gitignore +++ b/.gitignore @@ -7,8 +7,10 @@ *.o *~ .cproject +.idea .project .settings +.vscode /*-*.mk /*.asm /*.doxy From abfca783ed175c037dc53ce62ada06e1f3319bd8 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 27 May 2022 19:36:47 -0700 Subject: [PATCH 320/926] test/*: normalize use of nullptr this is preferred over NULL in C++11 Change-Id: Ic48ddcc6dfb8975a57f6713549ad04d93db21415 (cherry picked from commit c304ec38d05040b74de4aacada62c4a336714341) --- test/buffer.h | 14 +++++++------- test/codec_factory.h | 16 ++++++++-------- test/decode_test_driver.h | 2 +- test/encode_test_driver.h | 2 +- test/ivf_video_source.h | 14 +++++++------- test/register_state_check.h | 2 +- test/video_source.h | 22 ++++++++++++---------- test/webm_video_source.h | 14 ++++++++------ test/y4m_test.cc | 4 ++-- test/y4m_video_source.h | 16 ++++++++-------- test/yuv_video_source.h | 16 +++++++++------- 11 files changed, 64 insertions(+), 58 deletions(-) diff --git a/test/buffer.h b/test/buffer.h index b003d2f0d0..023939cedf 100644 --- a/test/buffer.h +++ b/test/buffer.h @@ -31,7 +31,7 @@ class Buffer { : width_(width), height_(height), top_padding_(top_padding), left_padding_(left_padding), right_padding_(right_padding), bottom_padding_(bottom_padding), alignment_(0), padding_value_(0), - stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(NULL) {} + stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(nullptr) {} Buffer(int width, int height, int top_padding, int left_padding, int right_padding, int bottom_padding, unsigned int alignment) @@ -39,19 +39,19 @@ class Buffer { left_padding_(left_padding), right_padding_(right_padding), bottom_padding_(bottom_padding), alignment_(alignment), padding_value_(0), stride_(0), raw_size_(0), num_elements_(0), - raw_buffer_(NULL) {} + raw_buffer_(nullptr) {} Buffer(int width, int height, int padding) : width_(width), height_(height), top_padding_(padding), left_padding_(padding), right_padding_(padding), bottom_padding_(padding), alignment_(0), padding_value_(0), stride_(0), - raw_size_(0), num_elements_(0), raw_buffer_(NULL) {} + raw_size_(0), num_elements_(0), raw_buffer_(nullptr) {} Buffer(int width, int height, int padding, unsigned int alignment) : width_(width), height_(height), top_padding_(padding), left_padding_(padding), right_padding_(padding), bottom_padding_(padding), alignment_(alignment), padding_value_(0), - stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(NULL) {} + stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(nullptr) {} ~Buffer() { if (alignment_) { @@ -103,7 +103,7 @@ class Buffer { bool CheckValues(const Buffer &a) const; bool Init() { - if (raw_buffer_ != NULL) return false; + if (raw_buffer_ != nullptr) return false; EXPECT_GT(width_, 0); EXPECT_GT(height_, 0); EXPECT_GE(top_padding_, 0); @@ -126,7 +126,7 @@ class Buffer { } else { raw_buffer_ = new (std::nothrow) T[num_elements_]; } - EXPECT_TRUE(raw_buffer_ != NULL); + EXPECT_NE(raw_buffer_, nullptr); SetPadding(std::numeric_limits::max()); return 
!::testing::Test::HasFailure(); } @@ -150,7 +150,7 @@ class Buffer { template T *Buffer::TopLeftPixel() const { - if (!raw_buffer_) return NULL; + if (!raw_buffer_) return nullptr; return raw_buffer_ + (top_padding_ * stride_) + left_padding_; } diff --git a/test/codec_factory.h b/test/codec_factory.h index 77ce49de9f..96092610c6 100644 --- a/test/codec_factory.h +++ b/test/codec_factory.h @@ -88,7 +88,7 @@ class VP8Decoder : public Decoder { #if CONFIG_VP8_DECODER return &vpx_codec_vp8_dx_algo; #else - return NULL; + return nullptr; #endif } }; @@ -104,7 +104,7 @@ class VP8Encoder : public Encoder { #if CONFIG_VP8_ENCODER return &vpx_codec_vp8_cx_algo; #else - return NULL; + return nullptr; #endif } }; @@ -124,7 +124,7 @@ class VP8CodecFactory : public CodecFactory { #else (void)cfg; (void)flags; - return NULL; + return nullptr; #endif } @@ -139,7 +139,7 @@ class VP8CodecFactory : public CodecFactory { (void)deadline; (void)init_flags; (void)stats; - return NULL; + return nullptr; #endif } @@ -184,7 +184,7 @@ class VP9Decoder : public Decoder { #if CONFIG_VP9_DECODER return &vpx_codec_vp9_dx_algo; #else - return NULL; + return nullptr; #endif } }; @@ -200,7 +200,7 @@ class VP9Encoder : public Encoder { #if CONFIG_VP9_ENCODER return &vpx_codec_vp9_cx_algo; #else - return NULL; + return nullptr; #endif } }; @@ -220,7 +220,7 @@ class VP9CodecFactory : public CodecFactory { #else (void)cfg; (void)flags; - return NULL; + return nullptr; #endif } @@ -235,7 +235,7 @@ class VP9CodecFactory : public CodecFactory { (void)deadline; (void)init_flags; (void)stats; - return NULL; + return nullptr; #endif } diff --git a/test/decode_test_driver.h b/test/decode_test_driver.h index 04876cdd7c..f446ab4664 100644 --- a/test/decode_test_driver.h +++ b/test/decode_test_driver.h @@ -24,7 +24,7 @@ class CompressedVideoSource; class DxDataIterator { public: explicit DxDataIterator(vpx_codec_ctx_t *decoder) - : decoder_(decoder), iter_(NULL) {} + : decoder_(decoder), iter_(nullptr) {} const vpx_image_t *Next() { return vpx_codec_get_frame(decoder_, &iter_); } diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index 38c61952eb..7085945f6a 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -49,7 +49,7 @@ enum TestMode { class CxDataIterator { public: explicit CxDataIterator(vpx_codec_ctx_t *encoder) - : encoder_(encoder), iter_(NULL) {} + : encoder_(encoder), iter_(nullptr) {} const vpx_codec_cx_pkt_t *Next() { return vpx_codec_get_cx_data(encoder_, &iter_); diff --git a/test/ivf_video_source.h b/test/ivf_video_source.h index 22c05ecde9..a8ac4f154c 100644 --- a/test/ivf_video_source.h +++ b/test/ivf_video_source.h @@ -29,8 +29,9 @@ static unsigned int MemGetLe32(const uint8_t *mem) { class IVFVideoSource : public CompressedVideoSource { public: explicit IVFVideoSource(const std::string &file_name) - : file_name_(file_name), input_file_(NULL), compressed_frame_buf_(NULL), - frame_sz_(0), frame_(0), end_of_file_(false) {} + : file_name_(file_name), input_file_(nullptr), + compressed_frame_buf_(nullptr), frame_sz_(0), frame_(0), + end_of_file_(false) {} virtual ~IVFVideoSource() { delete[] compressed_frame_buf_; @@ -41,13 +42,12 @@ class IVFVideoSource : public CompressedVideoSource { virtual void Init() { // Allocate a buffer for read in the compressed video frame. 
compressed_frame_buf_ = new uint8_t[libvpx_test::kCodeBufferSize]; - ASSERT_TRUE(compressed_frame_buf_ != NULL) - << "Allocate frame buffer failed"; + ASSERT_NE(compressed_frame_buf_, nullptr) << "Allocate frame buffer failed"; } virtual void Begin() { input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_ != NULL) + ASSERT_NE(input_file_, nullptr) << "Input file open failed. Filename: " << file_name_; // Read file header @@ -68,7 +68,7 @@ class IVFVideoSource : public CompressedVideoSource { } void FillFrame() { - ASSERT_TRUE(input_file_ != NULL); + ASSERT_NE(input_file_, nullptr); uint8_t frame_hdr[kIvfFrameHdrSize]; // Check frame header and read a frame from input_file. if (fread(frame_hdr, 1, kIvfFrameHdrSize, input_file_) != @@ -87,7 +87,7 @@ class IVFVideoSource : public CompressedVideoSource { } virtual const uint8_t *cxdata() const { - return end_of_file_ ? NULL : compressed_frame_buf_; + return end_of_file_ ? nullptr : compressed_frame_buf_; } virtual size_t frame_size() const { return frame_sz_; } virtual unsigned int frame_number() const { return frame_; } diff --git a/test/register_state_check.h b/test/register_state_check.h index 1746240c61..0b837dd042 100644 --- a/test/register_state_check.h +++ b/test/register_state_check.h @@ -56,7 +56,7 @@ class RegisterStateCheck { private: static bool StoreRegisters(CONTEXT *const context) { const HANDLE this_thread = GetCurrentThread(); - EXPECT_TRUE(this_thread != NULL); + EXPECT_NE(this_thread, nullptr); context->ContextFlags = CONTEXT_FLOATING_POINT; const bool context_saved = GetThreadContext(this_thread, context) == TRUE; EXPECT_TRUE(context_saved) << "GetLastError: " << GetLastError(); diff --git a/test/video_source.h b/test/video_source.h index 349e3de37c..a10ff6fb09 100644 --- a/test/video_source.h +++ b/test/video_source.h @@ -42,7 +42,7 @@ namespace libvpx_test { // A simple function to encapsulate cross platform retrieval of test data path static std::string GetDataPath() { const char *const data_path = getenv("LIBVPX_TEST_DATA_PATH"); - if (data_path == NULL) { + if (data_path == nullptr) { #ifdef LIBVPX_TEST_DATA_PATH // In some environments, we cannot set environment variables // Instead, we set the data path by using a preprocessor symbol @@ -76,10 +76,10 @@ static FILE *GetTempOutFile(std::string *file_name) { return fopen(fname, "wb+"); } } - return NULL; + return nullptr; #else std::string temp_dir = testing::TempDir(); - if (temp_dir.empty()) return NULL; + if (temp_dir.empty()) return nullptr; // Versions of testing::TempDir() prior to release-1.11.0-214-g5e6a5336 may // use the value of an environment variable without checking for a trailing // path delimiter. 
@@ -87,12 +87,12 @@ static FILE *GetTempOutFile(std::string *file_name) { const char name_template[] = "libvpxtest.XXXXXX"; std::unique_ptr temp_file_name( new char[temp_dir.size() + sizeof(name_template)]); - if (temp_file_name == nullptr) return NULL; + if (temp_file_name == nullptr) return nullptr; memcpy(temp_file_name.get(), temp_dir.data(), temp_dir.size()); memcpy(temp_file_name.get() + temp_dir.size(), name_template, sizeof(name_template)); const int fd = mkstemp(temp_file_name.get()); - if (fd == -1) return NULL; + if (fd == -1) return nullptr; *file_name = temp_file_name.get(); return fdopen(fd, "wb+"); #endif @@ -114,7 +114,7 @@ class TempOutFile { void CloseFile() { if (file_) { fclose(file_); - file_ = NULL; + file_ = nullptr; } } FILE *file_; @@ -133,7 +133,7 @@ class VideoSource { // Advance the cursor to the next frame virtual void Next() = 0; - // Get the current video frame, or NULL on End-Of-Stream. + // Get the current video frame, or nullptr on End-Of-Stream. virtual vpx_image_t *img() const = 0; // Get the presentation timestamp of the current frame. @@ -155,7 +155,7 @@ class VideoSource { class DummyVideoSource : public VideoSource { public: DummyVideoSource() - : img_(NULL), limit_(100), width_(80), height_(64), + : img_(nullptr), limit_(100), width_(80), height_(64), format_(VPX_IMG_FMT_I420) { ReallocImage(); } @@ -172,7 +172,9 @@ class DummyVideoSource : public VideoSource { FillFrame(); } - virtual vpx_image_t *img() const { return (frame_ < limit_) ? img_ : NULL; } + virtual vpx_image_t *img() const { + return (frame_ < limit_) ? img_ : nullptr; + } // Models a stream where Timebase = 1/FPS, so pts == frame. virtual vpx_codec_pts_t pts() const { return frame_; } @@ -212,7 +214,7 @@ class DummyVideoSource : public VideoSource { void ReallocImage() { vpx_img_free(img_); - img_ = vpx_img_alloc(NULL, format_, width_, height_, 32); + img_ = vpx_img_alloc(nullptr, format_, width_, height_, 32); ASSERT_NE(img_, nullptr); raw_sz_ = ((img_->w + 31) & ~31u) * img_->h * img_->bps / 8; } diff --git a/test/webm_video_source.h b/test/webm_video_source.h index 6f55f7db7c..d245926298 100644 --- a/test/webm_video_source.h +++ b/test/webm_video_source.h @@ -26,11 +26,11 @@ class WebMVideoSource : public CompressedVideoSource { public: explicit WebMVideoSource(const std::string &file_name) : file_name_(file_name), vpx_ctx_(new VpxInputContext()), - webm_ctx_(new WebmInputContext()), buf_(NULL), buf_sz_(0), frame_(0), + webm_ctx_(new WebmInputContext()), buf_(nullptr), buf_sz_(0), frame_(0), end_of_file_(false) {} virtual ~WebMVideoSource() { - if (vpx_ctx_->file != NULL) fclose(vpx_ctx_->file); + if (vpx_ctx_->file != nullptr) fclose(vpx_ctx_->file); webm_free(webm_ctx_); delete vpx_ctx_; delete webm_ctx_; @@ -40,7 +40,7 @@ class WebMVideoSource : public CompressedVideoSource { virtual void Begin() { vpx_ctx_->file = OpenTestDataFile(file_name_); - ASSERT_TRUE(vpx_ctx_->file != NULL) + ASSERT_NE(vpx_ctx_->file, nullptr) << "Input file open failed. 
Filename: " << file_name_; ASSERT_EQ(file_is_webm(webm_ctx_, vpx_ctx_), 1) << "file is not WebM"; @@ -54,7 +54,7 @@ class WebMVideoSource : public CompressedVideoSource { } void FillFrame() { - ASSERT_TRUE(vpx_ctx_->file != NULL); + ASSERT_NE(vpx_ctx_->file, nullptr); const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_); ASSERT_GE(status, 0) << "webm_read_frame failed"; if (status == 1) { @@ -63,7 +63,7 @@ class WebMVideoSource : public CompressedVideoSource { } void SeekToNextKeyFrame() { - ASSERT_TRUE(vpx_ctx_->file != NULL); + ASSERT_NE(vpx_ctx_->file, nullptr); do { const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_); ASSERT_GE(status, 0) << "webm_read_frame failed"; @@ -74,7 +74,9 @@ class WebMVideoSource : public CompressedVideoSource { } while (!webm_ctx_->is_key_frame && !end_of_file_); } - virtual const uint8_t *cxdata() const { return end_of_file_ ? NULL : buf_; } + virtual const uint8_t *cxdata() const { + return end_of_file_ ? nullptr : buf_; + } virtual size_t frame_size() const { return buf_sz_; } virtual unsigned int frame_number() const { return frame_; } diff --git a/test/y4m_test.cc b/test/y4m_test.cc index 89c6552c5d..32f2cd51d3 100644 --- a/test/y4m_test.cc +++ b/test/y4m_test.cc @@ -202,7 +202,7 @@ TEST(Y4MHeaderTest, RegularHeader) { EXPECT_EQ(0, fseek(f.file(), 0, 0)); y4m_input y4m; - EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/NULL, + EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/nullptr, /*num_skip=*/0, /*only_420=*/0), 0); EXPECT_EQ(y4m.pic_w, 4); @@ -229,7 +229,7 @@ TEST(Y4MHeaderTest, LongHeader) { EXPECT_EQ(fseek(f.file(), 0, 0), 0); y4m_input y4m; - EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/NULL, + EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/nullptr, /*num_skip=*/0, /*only_420=*/0), 0); EXPECT_EQ(y4m.pic_w, 4); diff --git a/test/y4m_video_source.h b/test/y4m_video_source.h index 89aa2a44fc..71fbf31931 100644 --- a/test/y4m_video_source.h +++ b/test/y4m_video_source.h @@ -23,7 +23,7 @@ namespace libvpx_test { class Y4mVideoSource : public VideoSource { public: Y4mVideoSource(const std::string &file_name, unsigned int start, int limit) - : file_name_(file_name), input_file_(NULL), img_(new vpx_image_t()), + : file_name_(file_name), input_file_(nullptr), img_(new vpx_image_t()), start_(start), limit_(limit), frame_(0), framerate_numerator_(0), framerate_denominator_(0), y4m_() {} @@ -35,13 +35,13 @@ class Y4mVideoSource : public VideoSource { virtual void OpenSource() { CloseSource(); input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_ != NULL) + ASSERT_NE(input_file_, nullptr) << "Input file open failed. Filename: " << file_name_; } virtual void ReadSourceToStart() { - ASSERT_TRUE(input_file_ != NULL); - ASSERT_FALSE(y4m_input_open(&y4m_, input_file_, NULL, 0, 0)); + ASSERT_NE(input_file_, nullptr); + ASSERT_FALSE(y4m_input_open(&y4m_, input_file_, nullptr, 0, 0)); framerate_numerator_ = y4m_.fps_n; framerate_denominator_ = y4m_.fps_d; frame_ = 0; @@ -62,7 +62,7 @@ class Y4mVideoSource : public VideoSource { } virtual vpx_image_t *img() const { - return (frame_ < limit_) ? img_.get() : NULL; + return (frame_ < limit_) ? img_.get() : nullptr; } // Models a stream where Timebase = 1/FPS, so pts == frame. @@ -80,7 +80,7 @@ class Y4mVideoSource : public VideoSource { virtual unsigned int limit() const { return limit_; } virtual void FillFrame() { - ASSERT_TRUE(input_file_ != NULL); + ASSERT_NE(input_file_, nullptr); // Read a frame from input_file. 
y4m_input_fetch_frame(&y4m_, input_file_, img_.get());
  }

@@ -101,9 +101,9 @@ class Y4mVideoSource : public VideoSource {
   void CloseSource() {
     y4m_input_close(&y4m_);
     y4m_ = y4m_input();
-    if (input_file_ != NULL) {
+    if (input_file_ != nullptr) {
       fclose(input_file_);
-      input_file_ = NULL;
+      input_file_ = nullptr;
     }
   }

diff --git a/test/yuv_video_source.h b/test/yuv_video_source.h
index 383ab8f1b1..51948c0efb 100644
--- a/test/yuv_video_source.h
+++ b/test/yuv_video_source.h
@@ -27,8 +27,8 @@ class YUVVideoSource : public VideoSource {
   YUVVideoSource(const std::string &file_name, vpx_img_fmt format,
                  unsigned int width, unsigned int height, int rate_numerator,
                  int rate_denominator, unsigned int start, int limit)
-      : file_name_(file_name), input_file_(NULL), img_(NULL), start_(start),
-        limit_(limit), frame_(0), width_(0), height_(0),
+      : file_name_(file_name), input_file_(nullptr), img_(nullptr),
+        start_(start), limit_(limit), frame_(0), width_(0), height_(0),
         format_(VPX_IMG_FMT_NONE), framerate_numerator_(rate_numerator),
         framerate_denominator_(rate_denominator) {
     // This initializes format_, raw_size_, width_, height_ and allocates img.
@@ -43,7 +43,7 @@ class YUVVideoSource : public VideoSource {
   virtual void Begin() {
     if (input_file_) fclose(input_file_);
     input_file_ = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(input_file_ != NULL)
+    ASSERT_NE(input_file_, nullptr)
         << "Input file open failed. Filename: " << file_name_;
     if (start_) {
       fseek(input_file_, static_cast(raw_size_) * start_, SEEK_SET);
@@ -58,7 +58,9 @@
     FillFrame();
   }

-  virtual vpx_image_t *img() const { return (frame_ < limit_) ? img_ : NULL; }
+  virtual vpx_image_t *img() const {
+    return (frame_ < limit_) ? img_ : nullptr;
+  }

   // Models a stream where Timebase = 1/FPS, so pts == frame.
   virtual vpx_codec_pts_t pts() const { return frame_; }
@@ -78,8 +80,8 @@
                        vpx_img_fmt format) {
     if (width != width_ || height != height_ || format != format_) {
       vpx_img_free(img_);
-      img_ = vpx_img_alloc(NULL, format, width, height, 1);
-      ASSERT_TRUE(img_ != NULL);
+      img_ = vpx_img_alloc(nullptr, format, width, height, 1);
+      ASSERT_NE(img_, nullptr);
       width_ = width;
       height_ = height;
       format_ = format;
@@ -99,7 +101,7 @@
   }

   virtual void FillFrame() {
-    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_NE(input_file_, nullptr);
     // Read a frame from input_file.
     if (fread(img_->img_data, raw_size_, 1, input_file_) == 0) {
       limit_ = frame_;

From 9546c699fb06a8e0f0c92be92159034a28005ac2 Mon Sep 17 00:00:00 2001
From: James Zern
Date: Thu, 2 Jun 2022 19:38:03 -0700
Subject: [PATCH 321/926] libs.mk,build/make/Makefile: make test targets ordinary rules

this fixes a regression introduced in make 4.2 and still present in 4.3
that causes double colon rules to be serialized, which breaks the
sharding done by the test and test-no-data-check rules.

these targets only define one set of rules, so ordinary rules work,
unlike clean. install may be another candidate, but that's left for a
follow up.
Change-Id: I9f074eca2ad266eeca6e31aae2e9f31eec8680e0 Tested: make 3.81, 4.1, 4.2, 4.2.1, 4.3 --- build/make/Makefile | 14 +++++++------- libs.mk | 8 ++++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/build/make/Makefile b/build/make/Makefile index b7a873cc81..5c38c18e57 100644 --- a/build/make/Makefile +++ b/build/make/Makefile @@ -21,9 +21,9 @@ all: .DEFAULT clean:: .DEFAULT exampletest: .DEFAULT install:: .DEFAULT -test:: .DEFAULT -test-no-data-check:: .DEFAULT -testdata:: .DEFAULT +test: .DEFAULT +test-no-data-check: .DEFAULT +testdata: .DEFAULT utiltest: .DEFAULT exampletest-no-data-check utiltest-no-data-check: .DEFAULT test_%: .DEFAULT ; @@ -111,13 +111,13 @@ exampletest: .PHONY: install install:: .PHONY: test -test:: +test: .PHONY: testdata -testdata:: +testdata: .PHONY: utiltest utiltest: .PHONY: test-no-data-check exampletest-no-data-check utiltest-no-data-check -test-no-data-check:: +test-no-data-check: exampletest-no-data-check utiltest-no-data-check: # Force to realign stack always on OS/2 @@ -465,6 +465,6 @@ INSTALL_TARGETS += .install-docs .install-srcs .install-libs .install-bins all: $(BUILD_TARGETS) install:: $(INSTALL_TARGETS) dist: $(INSTALL_TARGETS) -test:: +test: .SUFFIXES: # Delete default suffix rules diff --git a/libs.mk b/libs.mk index b59bb45e1e..b87f8bf8b0 100644 --- a/libs.mk +++ b/libs.mk @@ -536,7 +536,7 @@ $(LIBVPX_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1 esac \ ) -testdata:: $(LIBVPX_TEST_DATA) +testdata: $(LIBVPX_TEST_DATA) $(qexec)[ -x "$$(which sha1sum)" ] && sha1sum=sha1sum;\ [ -x "$$(which shasum)" ] && sha1sum=shasum;\ [ -x "$$(which sha1)" ] && sha1sum=sha1;\ @@ -709,15 +709,15 @@ INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(TEST_INTRA_PRED_SPEED_SRCS) INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(RC_INTERFACE_TEST_SRCS) define test_shard_template -test:: test_shard.$(1) -test-no-data-check:: test_shard_ndc.$(1) +test: test_shard.$(1) +test-no-data-check: test_shard_ndc.$(1) test_shard.$(1) test_shard_ndc.$(1): $(LIBVPX_TEST_BIN) @set -e; \ export GTEST_SHARD_INDEX=$(1); \ export GTEST_TOTAL_SHARDS=$(2); \ $(LIBVPX_TEST_BIN) test_shard.$(1): testdata -.PHONY: test_shard.$(1) +.PHONY: test_shard.$(1) test_shard_ndc.$(1) endef NUM_SHARDS := 10 From 386f25be5353978c28866e442f844f6bd2a1537e Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 27 May 2022 21:50:11 -0700 Subject: [PATCH 322/926] vp8e_set_config: setjmp before calling vp8_change_config vp8_change_config may call vp8_alloc_compressor_data which expects failures detected by CHECK_MEM_ERROR to not return. 
Change-Id: Ib7fbf4af904bd9b539402bb61c8f87855eef2ad6 (cherry picked from commit 365eebc147627ae83ec8b36077198d8cfb5e0128) --- vp8/vp8_cx_iface.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 21fed0e8ed..340f3e6638 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -473,14 +473,23 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx, ERROR("Cannot increase lag_in_frames"); res = validate_config(ctx, cfg, &ctx->vp8_cfg, 0); + if (res != VPX_CODEC_OK) return res; - if (!res) { - ctx->cfg = *cfg; - set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); - vp8_change_config(ctx->cpi, &ctx->oxcf); + if (setjmp(ctx->cpi->common.error.jmp)) { + const vpx_codec_err_t codec_err = + update_error_state(ctx, &ctx->cpi->common.error); + ctx->cpi->common.error.setjmp = 0; + vpx_clear_system_state(); + assert(codec_err != VPX_CODEC_OK); + return codec_err; } - return res; + ctx->cpi->common.error.setjmp = 1; + ctx->cfg = *cfg; + set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); + vp8_change_config(ctx->cpi, &ctx->oxcf); + ctx->cpi->common.error.setjmp = 0; + return VPX_CODEC_OK; } static vpx_codec_err_t get_quantizer(vpx_codec_alg_priv_t *ctx, va_list args) { From 6e7f6363965b959f026956903fcd2aeb55698110 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 27 May 2022 21:50:11 -0700 Subject: [PATCH 323/926] vp9e_set_config: setjmp before calling vp9_change_config vp9_change_config may call functions that perform allocations which expect failures detected by CHECK_MEM_ERROR to not return. Change-Id: I1dd1eca9c661ed157d51b4a6a77fc9f88236d794 (cherry picked from commit 3997d9bc6286ba075879353b87678986cdbfa347) --- vp9/vp9_cx_iface.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index b809ab3e6f..63d8f44878 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -780,7 +780,7 @@ static vpx_codec_err_t set_twopass_params_from_config( static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg) { vpx_codec_err_t res; - int force_key = 0; + volatile int force_key = 0; if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) { if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS) @@ -799,19 +799,28 @@ static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, ERROR("Cannot increase lag_in_frames"); res = validate_config(ctx, cfg, &ctx->extra_cfg); + if (res != VPX_CODEC_OK) return res; - if (res == VPX_CODEC_OK) { - ctx->cfg = *cfg; - set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); - set_twopass_params_from_config(&ctx->cfg, ctx->cpi); - // On profile change, request a key frame - force_key |= ctx->cpi->common.profile != ctx->oxcf.profile; - vp9_change_config(ctx->cpi, &ctx->oxcf); + if (setjmp(ctx->cpi->common.error.jmp)) { + const vpx_codec_err_t codec_err = + update_error_state(ctx, &ctx->cpi->common.error); + ctx->cpi->common.error.setjmp = 0; + vpx_clear_system_state(); + assert(codec_err != VPX_CODEC_OK); + return codec_err; } + ctx->cfg = *cfg; + set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); + set_twopass_params_from_config(&ctx->cfg, ctx->cpi); + // On profile change, request a key frame + force_key |= ctx->cpi->common.profile != ctx->oxcf.profile; + vp9_change_config(ctx->cpi, &ctx->oxcf); + if (force_key) ctx->next_frame_flags |= VPX_EFLAG_FORCE_KF; - return res; + ctx->cpi->common.error.setjmp = 0; + 
return VPX_CODEC_OK;
}

static vpx_codec_err_t ctrl_get_quantizer(vpx_codec_alg_priv_t *ctx,

From db754a45328c03dcb27414727b35d61294eb172c Mon Sep 17 00:00:00 2001
From: James Zern
Date: Sat, 28 May 2022 15:24:37 -0700
Subject: [PATCH 324/926] vp9_change_config: check vp9_alloc_loop_filter return

Change-Id: I4cba67a5ab192d1cf1dbfb5c039a93a4952b071e
(cherry picked from commit 6549e76307631de7e37459fceb23b4eee4573620)
---
 vp9/encoder/vp9_encoder.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 89b7c8e246..d3f4d1ea81 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -2057,7 +2057,10 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
       cpi->external_resize = 0;
     } else if (cm->mi_alloc_size == new_mi_size &&
                (cpi->oxcf.width > last_w || cpi->oxcf.height > last_h)) {
-      vp9_alloc_loop_filter(cm);
+      if (vp9_alloc_loop_filter(cm)) {
+        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate loop filter data");
+      }
     }
   }

From 3c5529e313b3a08cfdcd393efbab3df04aab6074 Mon Sep 17 00:00:00 2001
From: Cheng Chen
Date: Fri, 3 Jun 2022 15:08:48 -0700
Subject: [PATCH 325/926] L2E: Use bit mask to represent control type

The bit mask allows us to easily add an additional control mode in which
both the QP and GOP are controlled by an external model.

Change-Id: I49f676f622a6e70feb2a39dc97a4e5050b7f4760
---
 vp9/encoder/vp9_encoder.c      | 4 ++--
 vp9/encoder/vp9_ext_ratectrl.c | 4 ++--
 vp9/encoder/vp9_firstpass.c    | 2 +-
 vpx/vpx_ext_ratectrl.h         | 8 +++++++-
 4 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index a511aa7645..2fb88a9f70 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -4492,7 +4492,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
     }
 #endif  // CONFIG_RATE_CTRL
     if (cpi->ext_ratectrl.ready && !ext_rc_recode &&
-        cpi->ext_ratectrl.funcs.rc_type == VPX_RC_QP) {
+        (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0) {
       vpx_codec_err_t codec_status;
       const GF_GROUP *gf_group = &cpi->twopass.gf_group;
       vpx_rc_encodeframe_decision_t encode_frame_decision;
@@ -4553,7 +4553,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
     }

     if (cpi->ext_ratectrl.ready &&
-        cpi->ext_ratectrl.funcs.rc_type == VPX_RC_QP) {
+        (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0) {
       last_q_attempt = q;
       // In general, for the external rate control, we take the qindex provided
       // as input and encode the frame with this qindex faithfully.
However, diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index ba57b86f60..d5b60b02a6 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -143,7 +143,7 @@ vpx_codec_err_t vp9_extrc_get_encodeframe_decision( if (ext_ratectrl == NULL) { return VPX_CODEC_INVALID_PARAM; } - if (ext_ratectrl->ready && ext_ratectrl->funcs.rc_type == VPX_RC_QP) { + if (ext_ratectrl->ready && (ext_ratectrl->funcs.rc_type & VPX_RC_QP) != 0) { vpx_rc_status_t rc_status; vpx_rc_encodeframe_info_t encode_frame_info; encode_frame_info.show_index = show_index; @@ -204,7 +204,7 @@ vpx_codec_err_t vp9_extrc_get_gop_decision( vpx_rc_gop_decision_t *gop_decision) { vpx_rc_status_t rc_status; if (ext_ratectrl == NULL || !ext_ratectrl->ready || - ext_ratectrl->funcs.rc_type != VPX_RC_GOP) { + (ext_ratectrl->funcs.rc_type & VPX_RC_GOP) == 0) { return VPX_CODEC_INVALID_PARAM; } rc_status = ext_ratectrl->funcs.get_gop_decision(ext_ratectrl->model, diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 2b3c174693..9ed59e8ead 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -2758,7 +2758,7 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { // are overwritten. Specifically, |gop_coding_frames| and |use_alt_ref| // will be overwritten. if (cpi->ext_ratectrl.ready && - cpi->ext_ratectrl.funcs.rc_type == VPX_RC_GOP) { + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0) { vpx_codec_err_t codec_status; vpx_rc_gop_decision_t gop_decision; vpx_rc_gop_info_t gop_info; diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index db9af444c4..c3feac55e4 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -32,8 +32,14 @@ extern "C" { * quantization parameter (QP) for each frame. * In VPX_RC_GOP mode, the external rate control model determines the * group of picture (GOP) of the video sequence. + * In VPX_RC_GOP_QP mode, the external rate control model determines + * both the QP and the GOP. */ -typedef enum vpx_rc_type { VPX_RC_QP = 1, VPX_RC_GOP = 2 } vpx_rc_type_t; +typedef enum vpx_rc_type { + VPX_RC_QP = 1 << 0, + VPX_RC_GOP = 1 << 1, + VPX_RC_GOP_QP = VPX_RC_QP | VPX_RC_GOP +} vpx_rc_type_t; /*!\brief Abstract rate control model handler * From b21634073652cb7e73e52168dc2fa9c1b9a30adf Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Mon, 6 Jun 2022 14:53:46 -0700 Subject: [PATCH 326/926] L2E: send first pass stats before gop decisions This change lets the encoder send first pass stats before gop decisions so that external models can make use of them.
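For illustration, a minimal model-side sketch of the callback that now receives these stats before the first GOP decision, patterned on the ToyRateCtrl used in vp9_ext_ratectrl_test.cc. The aggregation below is hypothetical, and the vpx_rc_firstpass_stats_t layout assumed here (a num_frames count plus a frame_stats array with per-frame fields such as coded_error) should be checked against vpx/vpx_ext_ratectrl.h at this revision:

    static vpx_rc_status_t rc_send_firstpass_stats(
        vpx_rc_model_t rate_ctrl_model,
        const vpx_rc_firstpass_stats_t *first_pass_stats) {
      ToyRateCtrl *toy_rate_ctrl = (ToyRateCtrl *)rate_ctrl_model;
      double total_coded_error = 0.0;
      int i;
      if (toy_rate_ctrl->magic_number != kModelMagicNumber) return VPX_RC_ERROR;
      /* The whole clip's stats are available here, so per-GOP budgets can be
       * planned before get_gop_decision() is ever called. */
      for (i = 0; i < first_pass_stats->num_frames; ++i) {
        total_coded_error += first_pass_stats->frame_stats[i].coded_error;
      }
      (void)total_coded_error; /* a real model would keep this around */
      return VPX_RC_OK;
    }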
Change-Id: Iafc7eddab93aa77ceaf8e1f7663a52b27d94af80 --- vp9/encoder/vp9_encoder.c | 10 ---------- vp9/encoder/vp9_firstpass.c | 10 ++++++++++ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 2fb88a9f70..87f369d3bb 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -5802,16 +5802,6 @@ static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, unsigned int *frame_flags, ENCODE_FRAME_RESULT *encode_frame_result) { cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; - - if (cpi->common.current_frame_coding_index == 0) { - VP9_COMMON *cm = &cpi->common; - const vpx_codec_err_t codec_status = vp9_extrc_send_firstpass_stats( - &cpi->ext_ratectrl, &cpi->twopass.first_pass_info); - if (codec_status != VPX_CODEC_OK) { - vpx_internal_error(&cm->error, codec_status, - "vp9_extrc_send_firstpass_stats() failed"); - } - } #if CONFIG_MISMATCH_DEBUG mismatch_move_frame_idx_w(); #endif diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 9ed59e8ead..44a4cec0f9 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -3493,6 +3493,16 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { FIRSTPASS_STATS this_frame; const int show_idx = cm->current_video_frame; + if (cpi->common.current_frame_coding_index == 0) { + VP9_COMMON *cm = &cpi->common; + const vpx_codec_err_t codec_status = vp9_extrc_send_firstpass_stats( + &cpi->ext_ratectrl, &cpi->twopass.first_pass_info); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_send_firstpass_stats() failed"); + } + } + if (!twopass->stats_in) return; // Configure image size specific vizier parameters From 7bb4bd36122e55236148190625507efc90ec5b75 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Mon, 6 Jun 2022 15:46:41 -0700 Subject: [PATCH 327/926] L2E: rename 'gop_index' to 'gop_global_index' 'gop_index' has already been used in vpx_rc_encodeframe_info_t, which represents the frame index inside the current group of picture (gop). We therefore use 'gop_global_index' to represent the index of the current gop to avoid duplicate names. 
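As a sketch of the distinction, patterned on the test's rc_get_gop_decision() below: gop_index (kept in vpx_rc_encodeframe_info_t) numbers the frames within one GOP, while gop_global_index numbers the GOPs themselves across the clip:

    /* Per-GOP callback: resets at a key frame, advances once per GOP. */
    if (gop_info->is_key_frame) toy_rate_ctrl->gop_global_index = 0;
    /* ... fill in gop_decision ... */
    ++toy_rate_ctrl->gop_global_index;

    /* Per-frame callback, by contrast: encode_frame_info->gop_index
     * restarts at 0 for the first coded frame of every GOP. */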
Change-Id: I3eb8987dd878f650649b013e0036e23d0846b5f0 --- test/vp9_ext_ratectrl_test.cc | 12 ++++++------ vp9/encoder/vp9_firstpass.c | 6 +++--- vp9/encoder/vp9_ratectrl.h | 2 +- vpx/vpx_ext_ratectrl.h | 4 ++-- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index 1289e2db88..a3af4e98f0 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -44,7 +44,7 @@ struct ToyRateCtrl { int magic_number; int coding_index; - int gop_index; + int gop_global_index; int frames_since_key; int show_index; }; @@ -73,7 +73,7 @@ vpx_rc_status_t rc_create_model_gop(void *priv, ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl; if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR; toy_rate_ctrl->magic_number = kModelMagicNumber; - toy_rate_ctrl->gop_index = 0; + toy_rate_ctrl->gop_global_index = 0; toy_rate_ctrl->frames_since_key = 0; toy_rate_ctrl->show_index = 0; toy_rate_ctrl->coding_index = 0; @@ -198,13 +198,13 @@ vpx_rc_status_t rc_get_gop_decision(vpx_rc_model_t rate_ctrl_model, if (gop_info->is_key_frame) { EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); EXPECT_EQ(gop_info->frames_since_key, 0); - EXPECT_EQ(gop_info->gop_index, 0); - toy_rate_ctrl->gop_index = 0; + EXPECT_EQ(gop_info->gop_global_index, 0); + toy_rate_ctrl->gop_global_index = 0; toy_rate_ctrl->frames_since_key = 0; } else { EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1); } - EXPECT_EQ(gop_info->gop_index, toy_rate_ctrl->gop_index); + EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index); EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key); EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index); EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index); @@ -217,7 +217,7 @@ vpx_rc_status_t rc_get_gop_decision(vpx_rc_model_t rate_ctrl_model, toy_rate_ctrl->show_index += gop_decision->gop_coding_frames - gop_decision->use_alt_ref; toy_rate_ctrl->coding_index += gop_decision->gop_coding_frames; - ++toy_rate_ctrl->gop_index; + ++toy_rate_ctrl->gop_global_index; return VPX_RC_OK; } diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 44a4cec0f9..e121ac80e1 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -2714,9 +2714,9 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { // frame in which case it will already have been done. if (is_key_frame == 0) { vp9_zero(twopass->gf_group); - ++rc->gop_index; + ++rc->gop_global_index; } else { - rc->gop_index = 0; + rc->gop_global_index = 0; } vpx_clear_system_state(); @@ -2772,7 +2772,7 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { gop_info.lag_in_frames = cpi->oxcf.lag_in_frames; gop_info.show_index = cm->current_video_frame; gop_info.coding_index = cm->current_frame_coding_index; - gop_info.gop_index = rc->gop_index; + gop_info.gop_global_index = rc->gop_global_index; codec_status = vp9_extrc_get_gop_decision(&cpi->ext_ratectrl, &gop_info, &gop_decision); diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index 48a21bd1cd..96a8fd3f1d 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -214,7 +214,7 @@ typedef struct { // The index of the current GOP. Start from zero. // When a key frame is inserted, it resets to zero. 
- int gop_index; + int gop_global_index; } RATE_CONTROL; struct VP9_COMP; diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index c3feac55e4..6e41abaf40 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -81,7 +81,7 @@ typedef struct vpx_rc_encodeframe_info { int show_index; /**< display index, starts from zero*/ int coding_index; /**< coding index, starts from zero*/ /*! - * index in group of picture, starts from zero. + * index of the current frame in this group of picture, starts from zero. */ int gop_index; int ref_frame_coding_indexes[3]; /**< three reference frames' coding indices*/ @@ -323,7 +323,7 @@ typedef struct vpx_rc_gop_info { * The index of the current gop, starts from zero, resets to zero * when a keyframe is set. */ - int gop_index; + int gop_global_index; } vpx_rc_gop_info_t; /*!\brief The decision made by the external rate control model to set the From 7b1b9f7cd23e085d97c26ed026d2c817d78a14d6 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Mon, 6 Jun 2022 16:46:55 -0700 Subject: [PATCH 328/926] L2E: Use libvpx's default q in case of invalid external value If the external model recommends an invalid q value, we use the default q selected by libvpx's rate control strategy. We update the test so that when the external model wants to control GOP decision, it could get per frame information and just recommend an invalid q. Change-Id: I69be4b0ee0800e7ab0706d305242bb87f001b1f7 --- test/vp9_ext_ratectrl_test.cc | 79 ++++++++++++++++++++++++++++++++++- vp9/encoder/vp9_encoder.c | 8 +++- vpx/vpx_ext_ratectrl.h | 9 ++++ 3 files changed, 92 insertions(+), 4 deletions(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index a3af4e98f0..66d4233766 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -186,6 +186,81 @@ vpx_rc_status_t rc_get_encodeframe_decision( return VPX_RC_OK; } +vpx_rc_status_t rc_get_encodeframe_decision_gop( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_info_t *encode_frame_info, + vpx_rc_encodeframe_decision_t *frame_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_LT(encode_frame_info->show_index, kFrameNumGOP); + EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); + + if (encode_frame_info->coding_index == 0) { + EXPECT_EQ(encode_frame_info->show_index, 0); + EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, 0 /*kFrameTypeKey*/); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + } + + if (encode_frame_info->coding_index == 1) { + EXPECT_EQ(encode_frame_info->show_index, 1); + EXPECT_EQ(encode_frame_info->gop_index, 1); + EXPECT_EQ(encode_frame_info->frame_type, 1 /*kFrameTypeInter*/); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 1); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], + 0); // kRefFrameTypeLast + } + + if (encode_frame_info->coding_index == 2) { + EXPECT_EQ(encode_frame_info->show_index, 2); + EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, 0 
/*kFrameTypeKey*/); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + } + + if (encode_frame_info->coding_index == 3 || + encode_frame_info->coding_index == 12 || + encode_frame_info->coding_index == 21) { + EXPECT_EQ(encode_frame_info->frame_type, 2 /*kFrameTypeAltRef*/); + EXPECT_EQ(encode_frame_info->gop_index, 1); + } + + if (encode_frame_info->coding_index == 11 || + encode_frame_info->coding_index == 20 || + encode_frame_info->coding_index == 29) { + EXPECT_EQ(encode_frame_info->frame_type, 3 /*kFrameTypeOverlay*/); + EXPECT_EQ(encode_frame_info->gop_index, 0); + } + + if (encode_frame_info->coding_index >= 30) { + EXPECT_EQ(encode_frame_info->frame_type, 1 /*kFrameTypeInter*/); + } + + // When the model recommends an invalid q, valid range [0, 255], + // the encoder will ignore it and use the default q selected + // by libvpx rate control strategy. + frame_decision->q_index = VPX_DEFAULT_Q; + frame_decision->max_frame_size = 0; + + toy_rate_ctrl->coding_index += 1; + return VPX_RC_OK; +} + vpx_rc_status_t rc_get_gop_decision(vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info, vpx_rc_gop_decision_t *gop_decision) { @@ -216,7 +291,6 @@ vpx_rc_status_t rc_get_gop_decision(vpx_rc_model_t rate_ctrl_model, gop_decision->gop_coding_frames - gop_decision->use_alt_ref; toy_rate_ctrl->show_index += gop_decision->gop_coding_frames - gop_decision->use_alt_ref; - toy_rate_ctrl->coding_index += gop_decision->gop_coding_frames; ++toy_rate_ctrl->gop_global_index; return VPX_RC_OK; } @@ -319,9 +393,10 @@ class ExtRateCtrlTestGOP : public ::libvpx_test::EncoderTest, encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); vpx_rc_funcs_t rc_funcs; - rc_funcs.rc_type = VPX_RC_GOP; + rc_funcs.rc_type = VPX_RC_GOP_QP; rc_funcs.create_model = rc_create_model_gop; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop; + rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop; rc_funcs.get_gop_decision = rc_get_gop_decision; rc_funcs.update_encodeframe_result = rc_update_encodeframe_result_gop; rc_funcs.delete_model = rc_delete_model; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 87f369d3bb..85bd706629 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4510,8 +4510,12 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest vpx_internal_error(&cm->error, codec_status, "vp9_extrc_get_encodeframe_decision() failed"); } - q = encode_frame_decision.q_index; - ext_rc_max_frame_size = encode_frame_decision.max_frame_size; + // If the external model recommends a reserved value, we use + // libvpx's default q. + if (encode_frame_decision.q_index != VPX_DEFAULT_Q) { + q = encode_frame_decision.q_index; + ext_rc_max_frame_size = encode_frame_decision.max_frame_size; + } } vp9_set_quantizer(cpi, q); diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index 6e41abaf40..b57148c69b 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -48,11 +48,20 @@ typedef enum vpx_rc_type { */ typedef void *vpx_rc_model_t; +/*!\brief A reserved value for the q index. + * If the external rate control model returns this value, + * the encoder will use the default q selected by libvpx's rate control + * system. 
+ */ +#define VPX_DEFAULT_Q -1 + /*!\brief Encode frame decision made by the external rate control model * * The encoder will receive the decision from the external rate control model * through get_encodeframe_decision() defined in vpx_rc_funcs_t. * + * If q_index = VPX_DEFAULT_Q, the encoder will use libvpx's default q. + * * If max_frame_size = 0, the encoding ignores max frame size limit. * If max_frame_size = -1, the encoding uses VP9's max frame size as the limit. * If the encoded frame size is larger than max_frame_size, the frame is From 46bfeed2c9a7e52c8d1624f9e388af137e02ff19 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Fri, 10 Jun 2022 13:52:31 -0700 Subject: [PATCH 329/926] Convert EncoderTest::last_pts_ to a local variable Convert the data member EncoderTest::last_pts_ to a local variable in the EncoderTest::RunLoop() and VP9FrameSizeTestsLarge::RunLoop() methods. EncoderTest::last_pts_ is only used in these two methods, and these two methods first set EncoderTest::last_pts_ to 0 before using it. So EncoderTest::last_pts_ is effectively a local variable in these two methods. Note that several subclasses of EncoderTest declare their own last_pts_ data member and use it to calculate the data rate. Apparently their own last_pts_ data member hides the same-named data member in the base class. Although this is allowed by C++, this is very confusing. Change-Id: I55ce1cf8cc62e07333d8a902d65b46343a3d5881 --- test/encode_test_driver.cc | 6 +++--- test/encode_test_driver.h | 4 +--- test/frame_size_tests.cc | 6 +++--- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc index 1ce39eaeff..9ca15ae4d3 100644 --- a/test/encode_test_driver.cc +++ b/test/encode_test_driver.cc @@ -169,7 +169,7 @@ void EncoderTest::RunLoop(VideoSource *video) { ASSERT_TRUE(passes_ == 1 || passes_ == 2); for (unsigned int pass = 0; pass < passes_; pass++) { - last_pts_ = 0; + vpx_codec_pts_t last_pts = 0; if (passes_ == 1) { cfg_.g_pass = VPX_RC_ONE_PASS; @@ -225,8 +225,8 @@ void EncoderTest::RunLoop(VideoSource *video) { has_dxdata = true; } - ASSERT_GE(pkt->data.frame.pts, last_pts_); - last_pts_ = pkt->data.frame.pts; + ASSERT_GE(pkt->data.frame.pts, last_pts); + last_pts = pkt->data.frame.pts; FramePktHook(pkt); break; diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index 7085945f6a..f6bb841d8c 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -206,8 +206,7 @@ class Encoder { class EncoderTest { protected: explicit EncoderTest(const CodecFactory *codec) - : codec_(codec), abort_(false), init_flags_(0), frame_flags_(0), - last_pts_(0) { + : codec_(codec), abort_(false), init_flags_(0), frame_flags_(0) { // Default to 1 thread. 
cfg_.g_threads = 1; } @@ -291,7 +290,6 @@ class EncoderTest { TwopassStatsStore stats_; unsigned long init_flags_; unsigned long frame_flags_; - vpx_codec_pts_t last_pts_; }; } // namespace libvpx_test diff --git a/test/frame_size_tests.cc b/test/frame_size_tests.cc index d85c193e0b..8a0eb71ba0 100644 --- a/test/frame_size_tests.cc +++ b/test/frame_size_tests.cc @@ -111,7 +111,7 @@ class VP9FrameSizeTestsLarge : public ::libvpx_test::EncoderTest, ASSERT_TRUE(passes_ == 1 || passes_ == 2); for (unsigned int pass = 0; pass < passes_; pass++) { - last_pts_ = 0; + vpx_codec_pts_t last_pts = 0; if (passes_ == 1) { cfg_.g_pass = VPX_RC_ONE_PASS; @@ -144,8 +144,8 @@ class VP9FrameSizeTestsLarge : public ::libvpx_test::EncoderTest, again = true; switch (pkt->kind) { case VPX_CODEC_CX_FRAME_PKT: - ASSERT_GE(pkt->data.frame.pts, last_pts_); - last_pts_ = pkt->data.frame.pts; + ASSERT_GE(pkt->data.frame.pts, last_pts); + last_pts = pkt->data.frame.pts; FramePktHook(pkt); break; From 013ec5722ce88bebcdcf32b1496fcca413199336 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 13 Jun 2022 16:29:31 -0400 Subject: [PATCH 330/926] Restore backward compatibility This CL breaks the backward compatibility: 1365e7e1a vp9-svc: Remove VP9E_SET_TEMPORAL_LAYERING_MODE Forcing the value of the next element Bug: webm:1752 Change-Id: I83c774b3aa6cca25f2f14995590fb20c0a1668d4 --- vpx/vp8cx.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index f5dc6d1188..a61238cb10 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -516,7 +516,7 @@ enum vp8e_enc_control_id { * * Supported in codecs: VP9 */ - VP9E_SET_MIN_GF_INTERVAL, + VP9E_SET_MIN_GF_INTERVAL = 48, /*!\brief Codec control function to set minimum interval between GF/ARF frames * From 878266136bfcd8a9132cd60091c7d35943348dbc Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 13 Jun 2022 16:36:07 -0400 Subject: [PATCH 331/926] Update AUTHORS Bug: webm:1752 Change-Id: I08b4100a0e8c003cd9a7bdaf72926c268e02d53c --- AUTHORS | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/AUTHORS b/AUTHORS index 174cc59ee7..fffda63360 100644 --- a/AUTHORS +++ b/AUTHORS @@ -68,11 +68,13 @@ Guillermo Ballester Valor Hangyu Kuang Hanno Böck Han Shen +Hao Chen Harish Mahendrakar Henrik Lundin Hien Ho Hirokazu Honda Hui Su +Ilya Kurdyukov Ivan Krasin Ivan Maltz Jacek Caban @@ -91,9 +93,11 @@ Jeff Petkau Jeremy Leconte Jerome Jiang Jia Jia +Jianhui Dai Jian Zhou Jim Bankoski jinbo +Jin Bo Jingning Han Joel Fernandes Joey Parrish @@ -111,6 +115,7 @@ Justin Clift Justin Lebar Kaustubh Raste KO Myung-Hun +Konstantinos Margaritis Kyle Siefring Lawrence Velázquez Linfeng Zhang @@ -118,6 +123,7 @@ Liu Peng Lou Quillio Luca Barbato Luc Trudeau +Lu Wang Makoto Kato Mans Rullgard Marco Paniconi @@ -131,6 +137,7 @@ Michael Kohler Mike Frysinger Mike Hommey Mikhal Shemer +Mikko Koivisto Min Chen Minghai Shang Min Ye @@ -206,6 +213,7 @@ xiwei gu Yaowu Xu Yi Luo Yongzhe Wang +yuanhecai Yue Chen Yun Liu Yunqing Wang From 027d710a6e6490e84b209985ba49ec01b02482a8 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 13 Jun 2022 16:29:31 -0400 Subject: [PATCH 332/926] Restore backward compatibility This CL breaks the backward compatibility: 1365e7e1a vp9-svc: Remove VP9E_SET_TEMPORAL_LAYERING_MODE Forcing the value of the next element Bug: webm:1752 Change-Id: I83c774b3aa6cca25f2f14995590fb20c0a1668d4 (cherry picked from commit 013ec5722ce88bebcdcf32b1496fcca413199336) --- vpx/vp8cx.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/vpx/vp8cx.h b/vpx/vp8cx.h index f5dc6d1188..a61238cb10 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -516,7 +516,7 @@ enum vp8e_enc_control_id { * * Supported in codecs: VP9 */ - VP9E_SET_MIN_GF_INTERVAL, + VP9E_SET_MIN_GF_INTERVAL = 48, /*!\brief Codec control function to set minimum interval between GF/ARF frames * From 95d196fdf45edf57015cc18635c52b30a93522fd Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 16 Jun 2022 18:22:52 -0700 Subject: [PATCH 333/926] vp9_cx_iface: set default cpu_used=5 w/CONFIG_REALTIME_ONLY this avoids a crash if cpu-used is not explicitly set as there are some (unnecessary) checks against use_nonrd_pick_mode which would cause encoding to be skipped if the old default of 0 were used Bug: webm:1773 Change-Id: I62fba5fb51d8afa422689b7de3f03e8f7570e50b Fixed: webm:1773 --- test/realtime_test.cc | 24 +++++++++++++++++------- vp9/vp9_cx_iface.c | 6 +++++- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/test/realtime_test.cc b/test/realtime_test.cc index 853b942824..c5de2dcb35 100644 --- a/test/realtime_test.cc +++ b/test/realtime_test.cc @@ -35,17 +35,19 @@ class RealtimeTest } void BeginPassHook(unsigned int /*pass*/) override { +#if !CONFIG_REALTIME_ONLY // TODO(tomfinegan): We're changing the pass value here to make sure // we get frames when real time mode is combined with |g_pass| set to // VPX_RC_FIRST_PASS. This is necessary because EncoderTest::RunLoop() sets // the pass value based on the mode passed into EncoderTest::SetMode(), // which overrides the one specified in SetUp() above. cfg_.g_pass = VPX_RC_FIRST_PASS; +#endif } void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) override { - if (video->frame() == 0) { + if (video->frame() == 0 && set_cpu_used_) { encoder->Control(VP8E_SET_CPUUSED, 8); } } @@ -70,15 +72,23 @@ class RealtimeTest ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } + void TestEncode() { + ::libvpx_test::RandomVideoSource video; + video.SetSize(kVideoSourceWidth, kVideoSourceHeight); + video.set_limit(kFramesToEncode); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + EXPECT_EQ(kFramesToEncode, frame_packets_); + } + int frame_packets_; + bool set_cpu_used_ = true; }; -TEST_P(RealtimeTest, RealtimeFirstPassProducesFrames) { - ::libvpx_test::RandomVideoSource video; - video.SetSize(kVideoSourceWidth, kVideoSourceHeight); - video.set_limit(kFramesToEncode); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - EXPECT_EQ(kFramesToEncode, frame_packets_); +TEST_P(RealtimeTest, RealtimeFirstPassProducesFrames) { TestEncode(); } + +TEST_P(RealtimeTest, RealtimeDefaultCpuUsed) { + set_cpu_used_ = false; + TestEncode(); } TEST_P(RealtimeTest, IntegerOverflow) { TestIntegerOverflow(2048, 2048); } diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 63d8f44878..05ac9e1691 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -65,7 +65,11 @@ typedef struct vp9_extracfg { } vp9_extracfg; static struct vp9_extracfg default_extra_cfg = { - 0, // cpu_used +#if CONFIG_REALTIME_ONLY + 5, // cpu_used +#else + 0, // cpu_used +#endif 1, // enable_auto_alt_ref 0, // noise_sensitivity 0, // sharpness From 08b86d76224453ef9cbab4b10a48617715d9a14e Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 16 Jun 2022 18:43:44 -0700 Subject: [PATCH 334/926] vp9_encode_sb_row: remove a branch w/CONFIG_REALTIME_ONLY replace the check on use_nonrd_pick_mode with an assert. this is only a start, there are many branches that could be removed that check mode == REALTIME, etc. with this configuration. 
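A sketch of the follow-up the message suggests, compiling out a mode check the same way; the speed-feature helpers here are hypothetical, while cpi->oxcf.mode and the REALTIME enum value are real:

    #if CONFIG_REALTIME_ONLY
      /* Only the real-time path exists in this configuration; keep the
       * invariant visible with an assert instead of a branch. */
      assert(cpi->oxcf.mode == REALTIME);
      set_rt_speed_features(cpi); /* hypothetical helper */
    #else
      if (cpi->oxcf.mode == REALTIME)
        set_rt_speed_features(cpi); /* hypothetical helper */
      else
        set_good_speed_features(cpi); /* hypothetical helper */
    #endif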
Bug: webm:1773 Change-Id: I38cf9f83e7c085eb8e87d5cf6db7dc75359b611b --- vp9/encoder/vp9_encodeframe.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 5f08fa6f60..a9f392bf51 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -5856,9 +5856,12 @@ void vp9_encode_sb_row(VP9_COMP *cpi, ThreadData *td, int tile_row, get_start_tok(cpi, tile_row, tile_col, mi_row, &tok); cpi->tplist[tile_row][tile_col][tile_sb_row].start = tok; +#if CONFIG_REALTIME_ONLY + assert(cpi->sf.use_nonrd_pick_mode); + encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok); +#else if (cpi->sf.use_nonrd_pick_mode) encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok); -#if !CONFIG_REALTIME_ONLY else encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok); #endif From 03d4c6fed9ce058ff27cdf523275d301073f6651 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 17 Jun 2022 14:14:10 -0400 Subject: [PATCH 335/926] Update CHANGELOG and version info A stale codec control was removed, but compatibility was restored. New codec control was added. Bump *current* and *age*, and keep *revision* as 0. Bug: webm:1752 Bug: webm:1757 Change-Id: I76179f129a10c06d897b5c62462808ed9b9c2923 --- CHANGELOG | 29 +++++++++++++++++++++++++++++ libs.mk | 2 +- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/CHANGELOG b/CHANGELOG index ea2fc9d81c..f061751ae3 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,32 @@ +2022-06-17 v1.12.0 "Torrent Duck" + This release adds optimizations for Loongarch, adds support for vp8 in the + real-time rate control library, upgrades GoogleTest to v1.11.0, updates + libwebm to libwebm-1.0.0.28-20-g206d268, and includes numerous bug fixes. + + - Upgrading: + This release is ABI compatible with the previous release. + + vp8 support in the real-time rate control library. + New codec control VP8E_SET_RTC_EXTERNAL_RATECTRL is added. + + Configure support for darwin21 is added. + + GoogleTest is upgraded to v1.11.0. + + libwebm is updated to libwebm-1.0.0.28-20-g206d268. + + - Enhancement: + Numerous improvements on checking memory allocations. + Optimizations for Loongarch. + Code clean-up. + + - Bug fixes: + Fix to a crash related to {vp8/vp9}_set_roi_map. + Fix to compiling failure with -Wformat-nonliteral. + Fix to integer overflow with vp9 with high resolution content. + Fix to AddNoiseTest failure with ARMv7. + Fix to libvpx Null-dereference READ in vp8. 
+ 2021-09-27 v1.11.0 "Smew Duck" This maintenance release adds support for VBR mode in VP9 rate control interface, new codec controls to get quantization parameters and loop filter diff --git a/libs.mk b/libs.mk index b59bb45e1e..00e49a19d7 100644 --- a/libs.mk +++ b/libs.mk @@ -313,7 +313,7 @@ $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS) # SO_VERSION_* then follow the rules in the link to detemine the new version # (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1 SO_VERSION_MAJOR := 7 -SO_VERSION_MINOR := 0 +SO_VERSION_MINOR := 1 SO_VERSION_PATCH := 0 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS)) LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib From 5df4da402675f2d829acfb006d7cd27dd387d776 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 21 Jun 2022 14:54:16 -0400 Subject: [PATCH 336/926] Update CHANGELOG for L2E Bug: webm:1752 Change-Id: I5335e0360501503d5c162be4bbdef3ad73151e9f --- CHANGELOG | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index f061751ae3..cd4e8ba43a 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -15,6 +15,9 @@ libwebm is updated to libwebm-1.0.0.28-20-g206d268. + Allow SimpleEncode environment to take target level as input to match + the level conformance in vp9. + - Enhancement: Numerous improvements on checking memory allocations. Optimizations for Loongarch. From 158468202510a5d70ec1f0e9f4231c9ccacda8cd Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 16 Jun 2022 18:22:52 -0700 Subject: [PATCH 337/926] vp9_cx_iface: set default cpu_used=5 w/CONFIG_REALTIME_ONLY this avoids a crash if cpu-used is not explicitly set as there are some (unnecessary) checks against use_nonrd_pick_mode which would cause encoding to be skipped if the old default of 0 were used Bug: webm:1773 Change-Id: I62fba5fb51d8afa422689b7de3f03e8f7570e50b Fixed: webm:1773 (cherry picked from commit 95d196fdf45edf57015cc18635c52b30a93522fd) --- test/realtime_test.cc | 24 +++++++++++++++++------- vp9/vp9_cx_iface.c | 6 +++++- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/test/realtime_test.cc b/test/realtime_test.cc index 853b942824..c5de2dcb35 100644 --- a/test/realtime_test.cc +++ b/test/realtime_test.cc @@ -35,17 +35,19 @@ class RealtimeTest } void BeginPassHook(unsigned int /*pass*/) override { +#if !CONFIG_REALTIME_ONLY // TODO(tomfinegan): We're changing the pass value here to make sure // we get frames when real time mode is combined with |g_pass| set to // VPX_RC_FIRST_PASS. This is necessary because EncoderTest::RunLoop() sets // the pass value based on the mode passed into EncoderTest::SetMode(), // which overrides the one specified in SetUp() above. 
cfg_.g_pass = VPX_RC_FIRST_PASS; +#endif } void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) override { - if (video->frame() == 0) { + if (video->frame() == 0 && set_cpu_used_) { encoder->Control(VP8E_SET_CPUUSED, 8); } } @@ -70,15 +72,23 @@ class RealtimeTest ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } + void TestEncode() { + ::libvpx_test::RandomVideoSource video; + video.SetSize(kVideoSourceWidth, kVideoSourceHeight); + video.set_limit(kFramesToEncode); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + EXPECT_EQ(kFramesToEncode, frame_packets_); + } + int frame_packets_; + bool set_cpu_used_ = true; }; -TEST_P(RealtimeTest, RealtimeFirstPassProducesFrames) { - ::libvpx_test::RandomVideoSource video; - video.SetSize(kVideoSourceWidth, kVideoSourceHeight); - video.set_limit(kFramesToEncode); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - EXPECT_EQ(kFramesToEncode, frame_packets_); +TEST_P(RealtimeTest, RealtimeFirstPassProducesFrames) { TestEncode(); } + +TEST_P(RealtimeTest, RealtimeDefaultCpuUsed) { + set_cpu_used_ = false; + TestEncode(); } TEST_P(RealtimeTest, IntegerOverflow) { TestIntegerOverflow(2048, 2048); } diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 63d8f44878..05ac9e1691 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -65,7 +65,11 @@ typedef struct vp9_extracfg { } vp9_extracfg; static struct vp9_extracfg default_extra_cfg = { - 0, // cpu_used +#if CONFIG_REALTIME_ONLY + 5, // cpu_used +#else + 0, // cpu_used +#endif 1, // enable_auto_alt_ref 0, // noise_sensitivity 0, // sharpness From 10178e6161c9126a6178eadad122309f8372fb0d Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 16 Jun 2022 18:43:44 -0700 Subject: [PATCH 338/926] vp9_encode_sb_row: remove a branch w/CONFIG_REALTIME_ONLY replace the check on use_nonrd_pick_mode with an assert. this is only a start, there are many branches that could be removed that check mode == REALTIME, etc. with this configuration. Bug: webm:1773 Change-Id: I38cf9f83e7c085eb8e87d5cf6db7dc75359b611b (cherry picked from commit 08b86d76224453ef9cbab4b10a48617715d9a14e) --- vp9/encoder/vp9_encodeframe.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 5f08fa6f60..a9f392bf51 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -5856,9 +5856,12 @@ void vp9_encode_sb_row(VP9_COMP *cpi, ThreadData *td, int tile_row, get_start_tok(cpi, tile_row, tile_col, mi_row, &tok); cpi->tplist[tile_row][tile_col][tile_sb_row].start = tok; +#if CONFIG_REALTIME_ONLY + assert(cpi->sf.use_nonrd_pick_mode); + encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok); +#else if (cpi->sf.use_nonrd_pick_mode) encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok); -#if !CONFIG_REALTIME_ONLY else encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok); #endif From ec58d55c3af91f9db2511fb872bdc19868cbed92 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Fri, 24 Jun 2022 11:57:42 -0700 Subject: [PATCH 339/926] L2E: Distinguish fixed and active gf_interval min/max_gf_interval is fixed and can be passed from the command line. It must satisfy the level constraints. active_min/max_gf_interval might be changing based on min/max_gf_interval. It is determined per GOP. 
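A minimal sketch of how an external model might honor both ranges when choosing a GOP length; preferred_gop_length() is a hypothetical helper, and whether the alt-ref frame counts toward the interval is glossed over here:

    static int clamp_int(int v, int lo, int hi) {
      return v < lo ? lo : (v > hi ? hi : v);
    }

    /* Inside get_gop_decision(): min/max_gf_interval bound the whole clip;
     * active_min/max_gf_interval bound this particular GOP. */
    gop_decision->gop_coding_frames =
        clamp_int(preferred_gop_length(toy_rate_ctrl),
                  gop_info->active_min_gf_interval,
                  gop_info->active_max_gf_interval);
    gop_decision->use_alt_ref = gop_info->allow_alt_ref;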
Change-Id: If456c691c97a8b4c946859c05cedd39ca7defa9c --- test/vp9_ext_ratectrl_test.cc | 9 +++++---- vp9/encoder/vp9_firstpass.c | 6 ++++-- vpx/vpx_ext_ratectrl.h | 16 +++++++++++++++- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index 66d4233766..68703b7e94 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -33,8 +33,7 @@ constexpr int kFixedGOPSize = 9; constexpr int kMaxLagInFrames = 25; constexpr int kDefaultMinGfInterval = 4; constexpr int kDefaultMaxGfInterval = 16; -// The two pass rate control does not respect the input -// min_gf_interval and max_gf_interval. +// The active gf interval might change for each GOP // See function "get_active_gf_inverval_range". // The numbers below are from manual inspection. constexpr int kReadMinGfInterval = 5; @@ -267,8 +266,10 @@ vpx_rc_status_t rc_get_gop_decision(vpx_rc_model_t rate_ctrl_model, ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames); - EXPECT_EQ(gop_info->min_gf_interval, kReadMinGfInterval); - EXPECT_EQ(gop_info->max_gf_interval, kReadMaxGfInterval); + EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval); + EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval); + EXPECT_EQ(gop_info->active_min_gf_interval, kReadMinGfInterval); + EXPECT_EQ(gop_info->active_max_gf_interval, kReadMaxGfInterval); EXPECT_EQ(gop_info->allow_alt_ref, 1); if (gop_info->is_key_frame) { EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index e121ac80e1..4682cc0030 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -2762,8 +2762,10 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { vpx_codec_err_t codec_status; vpx_rc_gop_decision_t gop_decision; vpx_rc_gop_info_t gop_info; - gop_info.min_gf_interval = active_gf_interval.min; - gop_info.max_gf_interval = active_gf_interval.max; + gop_info.min_gf_interval = rc->min_gf_interval; + gop_info.max_gf_interval = rc->max_gf_interval; + gop_info.active_min_gf_interval = active_gf_interval.min; + gop_info.active_max_gf_interval = active_gf_interval.max; gop_info.allow_alt_ref = allow_alt_ref; gop_info.is_key_frame = is_key_frame; gop_info.last_gop_use_alt_ref = rc->source_alt_ref_active; diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index b57148c69b..c3309b0f26 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -25,7 +25,7 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures. */ -#define VPX_EXT_RATECTRL_ABI_VERSION (4) +#define VPX_EXT_RATECTRL_ABI_VERSION (5) /*!\brief The control type of the inference API. * In VPX_RC_QP mode, the external rate control model determines the @@ -287,12 +287,26 @@ typedef struct vpx_rc_config { typedef struct vpx_rc_gop_info { /*! * Minimum allowed gf interval, fixed for the whole clip. + * Note that it will be modified to match vp9's level constraints + * in the encoder. + * The level constraint is defined in vp9_encoder.c: + * const Vp9LevelSpec vp9_level_defs[VP9_LEVELS]. */ int min_gf_interval; /*! * Maximum allowed gf interval, fixed for the whole clip. */ int max_gf_interval; + /*! + * Minimum allowed gf interval for the current GOP, determined + * by the encoder. + */ + int active_min_gf_interval; + /*! 
+ * Maximum allowed gf interval for the current GOP, determined + * by the encoder. + */ + int active_max_gf_interval; /*! * Whether to allow the use of alt ref, can be changed per gop. */ From 03265cd42b3783532de72f2ded5436652e6f5ce3 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 28 Jun 2022 14:38:49 -0400 Subject: [PATCH 340/926] Replace date with version and release from README CHANGELOG has the date. Bug: webm:1752 Change-Id: I2888ce2afed8619f043eee1e9ca23bdf9d75e607 --- README | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README b/README index a083ebf90e..477a145ba3 100644 --- a/README +++ b/README @@ -1,4 +1,4 @@ -README - 08 March 2021 +v1.12.0 Torrent Duck Welcome to the WebM VP8/VP9 Codec SDK! From b355ab504667c352d96ab70bcb92165b8fc32813 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 28 Jun 2022 10:24:57 -0400 Subject: [PATCH 341/926] Add vp8_ prefix for quantize_lsx.c The name duplicates vpx_dsp/loongarch/quantize_lsx.c, which makes the Chromium update script fail. Bug: webm:1755 Change-Id: Ifb956c2292d909496eb2b9e1833993f1b021b07e --- vp8/encoder/loongarch/{quantize_lsx.c => vp8_quantize_lsx.c} | 0 vp8/vp8cx.mk | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename vp8/encoder/loongarch/{quantize_lsx.c => vp8_quantize_lsx.c} (100%) diff --git a/vp8/encoder/loongarch/quantize_lsx.c b/vp8/encoder/loongarch/vp8_quantize_lsx.c similarity index 100% rename from vp8/encoder/loongarch/quantize_lsx.c rename to vp8/encoder/loongarch/vp8_quantize_lsx.c diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index 5744cbabcc..b4b3fda9ea 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -125,8 +125,8 @@ VP8_CX_SRCS_REMOVE-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c endif # common (loongarch LSX intrinsics) -VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/quantize_lsx.c VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/dct_lsx.c VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/encodeopt_lsx.c +VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/vp8_quantize_lsx.c VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes)) From 896b59f44d63a789c9e34c394e9380323e538692 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Tue, 28 Jun 2022 11:22:48 -0700 Subject: [PATCH 342/926] rtc-svc: Fix to make SVC work for Profile 1 Added datarate unittest for 4:4:4 and 4:2:2 input, for spatial and temporal layers. A fix is needed in vp9_set_size_literal(): the sampling_x/y should be passed into update_initial_width(), otherwise sampling_x/y = 1/1 (4:2:0) was forced. vp9_set_size_literal() is only called by the svc and on dynamic resize. Fix issue with the normative optimized scaler: UV width/height was assumed to be 1/2 of Y, for the ssse3 and neon code. Also fix the assert for the scaled width/height: if the scaled width/height is odd, it should be incremented by 1 (made even).
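The rounding rule the updated test encodes, as a standalone sketch; svc_layer_size() is a hypothetical helper mirroring the per-layer arithmetic in svc_datarate_test.cc:

    /* Each spatial layer scales the top layer by num/den; odd results are
     * incremented to the next even value. */
    static void svc_layer_size(unsigned int top_w, unsigned int top_h,
                               int num, int den, unsigned int *w,
                               unsigned int *h) {
      *w = top_w * num / den;
      *h = top_h * num / den;
      if (*w % 2 != 0) *w += 1;
      if (*h % 2 != 0) *h += 1;
    }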
Change-Id: I3a2e40effa53c505f44ef05aaa3132e1b7f57dd5 --- test/svc_datarate_test.cc | 81 +++++++++++++++++++-- vp9/encoder/arm/neon/vp9_frame_scale_neon.c | 4 +- vp9/encoder/vp9_encoder.c | 6 +- vp9/encoder/x86/vp9_frame_scale_ssse3.c | 4 +- 4 files changed, 82 insertions(+), 13 deletions(-) diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc index 291cb01280..f3f76a0d33 100644 --- a/test/svc_datarate_test.cc +++ b/test/svc_datarate_test.cc @@ -548,13 +548,16 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { } if (!single_layer_resize_) { - ASSERT_EQ(pkt->data.frame.width[sl], - top_sl_width_ * svc_params_.scaling_factor_num[sl] / - svc_params_.scaling_factor_den[sl]); - - ASSERT_EQ(pkt->data.frame.height[sl], - top_sl_height_ * svc_params_.scaling_factor_num[sl] / - svc_params_.scaling_factor_den[sl]); + unsigned int scaled_width = top_sl_width_ * + svc_params_.scaling_factor_num[sl] / + svc_params_.scaling_factor_den[sl]; + if (scaled_width % 2 != 0) scaled_width += 1; + ASSERT_EQ(pkt->data.frame.width[sl], scaled_width); + unsigned int scaled_height = top_sl_height_ * + svc_params_.scaling_factor_num[sl] / + svc_params_.scaling_factor_den[sl]; + if (scaled_height % 2 != 0) scaled_height += 1; + ASSERT_EQ(pkt->data.frame.height[sl], scaled_height); } else if (superframe_count_ > 0) { if (pkt->data.frame.width[sl] < prev_frame_width[sl] && pkt->data.frame.height[sl] < prev_frame_height[sl]) @@ -678,6 +681,70 @@ class DatarateOnePassCbrSvcSingleBR } }; +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3 +// temporal layers, for 4:4:4 Profile 1. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TL444Profile1) { + SetSvcConfig(3, 3); + ::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140); + cfg_.g_profile = 1; + cfg_.g_bit_depth = VPX_BITS_8; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.kf_max_dist = 9999; + + top_sl_width_ = 352; + top_sl_height_ = 288; + cfg_.rc_target_bitrate = 500; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3 +// temporal layers, for 4:2:2 Profile 1. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2SL3TL422Profile1) { + SetSvcConfig(2, 3); + ::libvpx_test::Y4mVideoSource video("park_joy_90p_8_422.y4m", 0, 20); + cfg_.g_profile = 1; + cfg_.g_bit_depth = VPX_BITS_8; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.kf_max_dist = 9999; + + top_sl_width_ = 160; + top_sl_height_ = 90; + cfg_.rc_target_bitrate = 500; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Use large under/over shoot thresholds as this is a very short clip, + // so not good for testing rate-targeting. 
+ CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.5, + 1.7); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + // Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 1 // temporal layer, with screen content mode on and same speed setting for all // layers. diff --git a/vp9/encoder/arm/neon/vp9_frame_scale_neon.c b/vp9/encoder/arm/neon/vp9_frame_scale_neon.c index e46f789bac..69b8cfffd7 100644 --- a/vp9/encoder/arm/neon/vp9_frame_scale_neon.c +++ b/vp9/encoder/arm/neon/vp9_frame_scale_neon.c @@ -710,8 +710,8 @@ void vp9_scale_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src, const int src_h = src->y_crop_height; const int dst_w = dst->y_crop_width; const int dst_h = dst->y_crop_height; - const int dst_uv_w = dst_w / 2; - const int dst_uv_h = dst_h / 2; + const int dst_uv_w = dst->uv_crop_width; + const int dst_uv_h = dst->uv_crop_height; int scaled = 0; // phase_scaler is usually 0 or 8. diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 85bd706629..371779e772 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -8155,9 +8155,11 @@ int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width, unsigned int height) { VP9_COMMON *cm = &cpi->common; #if CONFIG_VP9_HIGHBITDEPTH - update_initial_width(cpi, cm->use_highbitdepth, 1, 1); + update_initial_width(cpi, cm->use_highbitdepth, cpi->common.subsampling_x, + cpi->common.subsampling_y); #else - update_initial_width(cpi, 0, 1, 1); + update_initial_width(cpi, 0, cpi->common.subsampling_x, + cpi->common.subsampling_y); #endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_TEMPORAL_DENOISING diff --git a/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/vp9/encoder/x86/vp9_frame_scale_ssse3.c index 7685e7bc3e..bf0e8b121f 100644 --- a/vp9/encoder/x86/vp9_frame_scale_ssse3.c +++ b/vp9/encoder/x86/vp9_frame_scale_ssse3.c @@ -754,8 +754,8 @@ void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, const int src_h = src->y_crop_height; const int dst_w = dst->y_crop_width; const int dst_h = dst->y_crop_height; - const int dst_uv_w = dst_w / 2; - const int dst_uv_h = dst_h / 2; + const int dst_uv_w = dst->uv_crop_width; + const int dst_uv_h = dst->uv_crop_height; int scaled = 0; // phase_scaler is usually 0 or 8. From 711bef67400f096416cb1ba7f6560e533871490f Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Wed, 29 Jun 2022 10:35:36 -0700 Subject: [PATCH 343/926] rtc: Add svc test for profile 2 10/12 bit Add TODO to fix the superframe parser for 10/12 bit. Change-Id: Ib76c4daa0ff2f516510829ead6a397c89abba2f3 --- test/svc_datarate_test.cc | 82 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc index f3f76a0d33..51e90e776c 100644 --- a/test/svc_datarate_test.cc +++ b/test/svc_datarate_test.cc @@ -745,6 +745,88 @@ TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2SL3TL422Profile1) { #endif } +#if CONFIG_VP9_HIGHBITDEPTH +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3 +// temporal layers, for Profle 2 10bit. 
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TL10bitProfile2) { + SetSvcConfig(3, 3); + ::libvpx_test::Y4mVideoSource video("park_joy_90p_10_420_20f.y4m", 0, 20); + cfg_.g_profile = 2; + cfg_.g_bit_depth = VPX_BITS_10; + cfg_.g_input_bit_depth = VPX_BITS_10; + if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.kf_max_dist = 9999; + + top_sl_width_ = 160; + top_sl_height_ = 90; + cfg_.rc_target_bitrate = 500; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // TODO(marpan/jianj): Comment out the rate-target checking for now + // as superframe parsing to get frame size needs to be fixed for + // high bitdepth. + /* + // Use large under/over shoot thresholds as this is a very short clip, + // so not good for testing rate-targeting. + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.5, + 1.7); + */ +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3 +// temporal layers, for Profle 2 12bit. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TL12bitProfile2) { + SetSvcConfig(3, 3); + ::libvpx_test::Y4mVideoSource video("park_joy_90p_12_420_20f.y4m", 0, 20); + cfg_.g_profile = 2; + cfg_.g_bit_depth = VPX_BITS_12; + cfg_.g_input_bit_depth = VPX_BITS_12; + if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.kf_max_dist = 9999; + + top_sl_width_ = 160; + top_sl_height_ = 90; + cfg_.rc_target_bitrate = 500; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // TODO(marpan/jianj): Comment out the rate-target checking for now + // as superframe parsing to get frame size needs to be fixed for + // high bitdepth. + /* + // Use large under/over shoot thresholds as this is a very short clip, + // so not good for testing rate-targeting. + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.5, + 1.7); + */ +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} +#endif + // Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 1 // temporal layer, with screen content mode on and same speed setting for all // layers. From dbac8e01e05ad3d1b47887b1ac864339115aa721 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 30 Jun 2022 10:53:15 -0400 Subject: [PATCH 344/926] ABI compatibility to CHANGELOG for prev releases. Bug: webm:1757 Change-Id: I19576aa0bc065045dcb0eaf770ae5b0d9ac9d684 --- CHANGELOG | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG b/CHANGELOG index cd4e8ba43a..4f5dcbd44e 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -36,6 +36,7 @@ levels, and includes several improvements to NEON and numerous bug fixes. - Upgrading: + This release is ABI incompatible with the previous release. 
New codec control is added to get quantization parameters and loop filter levels. @@ -61,6 +62,7 @@ well as numerous bug fixes. - Upgrading: + This release is ABI incompatible with the previous release. New codec control is added to disable loopfilter for VP9. New encoder control is added to disable feature to increase Q on overshoot @@ -91,6 +93,7 @@ well as incremental improvements. - Upgrading: + This release is ABI compatible with the previous release. NV12 support is added to this release. A new interface is added for VP9 rate control. The new library libvp9rc.a must be linked by applications. @@ -114,12 +117,14 @@ This release collects incremental improvements to many aspects of the library. - Upgrading: + This release is ABI compatible with the previous release. ARCH_* defines have been removed in favor of VPX_ARCH_*. 2019-07-15 v1.8.1 "Orpington Duck" This release collects incremental improvements to many aspects of the library. - Upgrading: + This release is ABI incompatible with the previous release. VP8E_SET_CPUUSED now accepts values up to 9 for vp9. VPX_CTRL_VP9E_SET_MAX_INTER_BITRATE_PCT had a spelling fix (was VP8E). The --sdk-path option has been removed. If you were using it to build for @@ -138,7 +143,8 @@ This release focused on encoding performance for realtime and VOD use cases. - Upgrading: - This adds and improves several vp9 controls. Most are related to SVC: + This release is ABI incompatible with the previous release. This adds and + improves several vp9 controls. Most are related to SVC: VP9E_SET_SVC_FRAME_DROP_LAYER: - Frame dropping in SVC. VP9E_SET_SVC_INTER_LAYER_PRED: From 5b530fc962bcb8a51bbf03f5fbc2912f21b86e70 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 30 Jun 2022 13:48:56 -0400 Subject: [PATCH 345/926] Fix bug with smaller width bigger size Bug: webm:1642 Change-Id: I831b7701495eebeeff6bdc0b570f737bb6d536c6 --- test/resize_test.cc | 11 +++-------- vp9/common/vp9_alloccommon.c | 15 ++++++--------- vp9/encoder/vp9_encoder.c | 27 +++++++++++++++++++++++++-- 3 files changed, 34 insertions(+), 19 deletions(-) diff --git a/test/resize_test.cc b/test/resize_test.cc index 212ff46975..1e5e166f7c 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -101,11 +101,8 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, *h = initial_h; return; } - if (frame < 100) { - *w = initial_w * 7 / 10; - *h = initial_h * 16 / 10; - return; - } + *w = initial_w * 7 / 10; + *h = initial_h * 16 / 10; return; } if (frame < 10) { @@ -578,9 +575,7 @@ TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) { } } -// TODO(https://crbug.com/webm/1642): This causes a segfault in -// init_encode_frame_mb_context(). -TEST_P(ResizeRealtimeTest, DISABLED_TestExternalResizeSmallerWidthBiggerSize) { +TEST_P(ResizeRealtimeTest, TestExternalResizeSmallerWidthBiggerSize) { ResizingVideoSource video; video.flag_codec_ = true; video.smaller_width_larger_size_ = true; diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index faad657a08..c27fe6477b 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -132,15 +132,6 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) { if (cm->mi_alloc_size < new_mi_size) { cm->free_mi(cm); if (cm->alloc_mi(cm, new_mi_size)) goto fail; - } - - if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) { - // Create the segmentation map structure and set to 0. 
- free_seg_map(cm); - if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) goto fail; - } - - if (cm->above_context_alloc_cols < cm->mi_cols) { vpx_free(cm->above_context); cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc( 2 * mi_cols_aligned_to_sb(cm->mi_cols) * MAX_MB_PLANE, @@ -154,6 +145,12 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) { cm->above_context_alloc_cols = cm->mi_cols; } + if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) { + // Create the segmentation map structure and set to 0. + free_seg_map(cm); + if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) goto fail; + } + if (vp9_alloc_loop_filter(cm)) goto fail; return 0; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 371779e772..d58e8a3123 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1972,6 +1972,17 @@ static void alloc_copy_partition_data(VP9_COMP *cpi) { } } +static void free_copy_partition_data(VP9_COMP *cpi) { + vpx_free(cpi->prev_partition); + cpi->prev_partition = NULL; + vpx_free(cpi->prev_segment_id); + cpi->prev_segment_id = NULL; + vpx_free(cpi->prev_variance_low); + cpi->prev_variance_low = NULL; + vpx_free(cpi->copied_frame_cnt); + cpi->copied_frame_cnt = NULL; +} + void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; @@ -2051,6 +2062,8 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows); if (cm->mi_alloc_size < new_mi_size) { vp9_free_context_buffers(cm); + vp9_free_pc_tree(&cpi->td); + vpx_free(cpi->mbmi_ext_base); alloc_compressor_data(cpi); realloc_segmentation_maps(cpi); cpi->initial_width = cpi->initial_height = 0; @@ -2069,8 +2082,18 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { update_frame_size(cpi); if (last_w != cpi->oxcf.width || last_h != cpi->oxcf.height) { - memset(cpi->consec_zero_mv, 0, - cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv)); + vpx_free(cpi->consec_zero_mv); + CHECK_MEM_ERROR( + cm, cpi->consec_zero_mv, + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->consec_zero_mv))); + + vpx_free(cpi->skin_map); + CHECK_MEM_ERROR( + cm, cpi->skin_map, + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0]))); + + free_copy_partition_data(cpi); + alloc_copy_partition_data(cpi); if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_reset_resize(cpi); rc->rc_1_frame = 0; From 933b6b90a583b593efd8acb644603ab189226309 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 6 Jul 2022 15:06:51 -0700 Subject: [PATCH 346/926] Revert "Fix bug with smaller width bigger size" This reverts commit 5b530fc962bcb8a51bbf03f5fbc2912f21b86e70. This fixes memory related fuzzer failures in the decoder. 
Bug: webm:1642 Bug: oss-fuzz:48609 Bug: oss-fuzz:48629 Bug: oss-fuzz:48632 Bug: oss-fuzz:48638 Bug: oss-fuzz:48639 Bug: oss-fuzz:48651 Bug: oss-fuzz:48657 Bug: oss-fuzz:48659 Bug: oss-fuzz:48660 Bug: oss-fuzz:48661 Bug: oss-fuzz:48680 Bug: oss-fuzz:48686 Bug: oss-fuzz:48697 Bug: oss-fuzz:48706 Bug: oss-fuzz:48712 Bug: oss-fuzz:48717 Bug: oss-fuzz:48728 Bug: oss-fuzz:48732 Bug: oss-fuzz:48780 Bug: oss-fuzz:48781 Bug: oss-fuzz:48782 Bug: oss-fuzz:48785 Change-Id: I67a8539a3083f00eec1164fef5c6a8bc209f91fc --- test/resize_test.cc | 11 ++++++++--- vp9/common/vp9_alloccommon.c | 15 +++++++++------ vp9/encoder/vp9_encoder.c | 27 ++------------------------- 3 files changed, 19 insertions(+), 34 deletions(-) diff --git a/test/resize_test.cc b/test/resize_test.cc index 1e5e166f7c..212ff46975 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -101,8 +101,11 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, *h = initial_h; return; } - *w = initial_w * 7 / 10; - *h = initial_h * 16 / 10; + if (frame < 100) { + *w = initial_w * 7 / 10; + *h = initial_h * 16 / 10; + return; + } return; } if (frame < 10) { @@ -575,7 +578,9 @@ TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) { } } -TEST_P(ResizeRealtimeTest, TestExternalResizeSmallerWidthBiggerSize) { +// TODO(https://crbug.com/webm/1642): This causes a segfault in +// init_encode_frame_mb_context(). +TEST_P(ResizeRealtimeTest, DISABLED_TestExternalResizeSmallerWidthBiggerSize) { ResizingVideoSource video; video.flag_codec_ = true; video.smaller_width_larger_size_ = true; diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index c27fe6477b..faad657a08 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -132,6 +132,15 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) { if (cm->mi_alloc_size < new_mi_size) { cm->free_mi(cm); if (cm->alloc_mi(cm, new_mi_size)) goto fail; + } + + if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) { + // Create the segmentation map structure and set to 0. + free_seg_map(cm); + if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) goto fail; + } + + if (cm->above_context_alloc_cols < cm->mi_cols) { vpx_free(cm->above_context); cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc( 2 * mi_cols_aligned_to_sb(cm->mi_cols) * MAX_MB_PLANE, @@ -145,12 +154,6 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) { cm->above_context_alloc_cols = cm->mi_cols; } - if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) { - // Create the segmentation map structure and set to 0. 
- free_seg_map(cm); - if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) goto fail; - } - if (vp9_alloc_loop_filter(cm)) goto fail; return 0; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index d58e8a3123..371779e772 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1972,17 +1972,6 @@ static void alloc_copy_partition_data(VP9_COMP *cpi) { } } -static void free_copy_partition_data(VP9_COMP *cpi) { - vpx_free(cpi->prev_partition); - cpi->prev_partition = NULL; - vpx_free(cpi->prev_segment_id); - cpi->prev_segment_id = NULL; - vpx_free(cpi->prev_variance_low); - cpi->prev_variance_low = NULL; - vpx_free(cpi->copied_frame_cnt); - cpi->copied_frame_cnt = NULL; -} - void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; @@ -2062,8 +2051,6 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows); if (cm->mi_alloc_size < new_mi_size) { vp9_free_context_buffers(cm); - vp9_free_pc_tree(&cpi->td); - vpx_free(cpi->mbmi_ext_base); alloc_compressor_data(cpi); realloc_segmentation_maps(cpi); cpi->initial_width = cpi->initial_height = 0; @@ -2082,18 +2069,8 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { update_frame_size(cpi); if (last_w != cpi->oxcf.width || last_h != cpi->oxcf.height) { - vpx_free(cpi->consec_zero_mv); - CHECK_MEM_ERROR( - cm, cpi->consec_zero_mv, - vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->consec_zero_mv))); - - vpx_free(cpi->skin_map); - CHECK_MEM_ERROR( - cm, cpi->skin_map, - vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0]))); - - free_copy_partition_data(cpi); - alloc_copy_partition_data(cpi); + memset(cpi->consec_zero_mv, 0, + cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv)); if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_reset_resize(cpi); rc->rc_1_frame = 0; From ba56eafb5742e5c28b7a99b5442698e9a3a61683 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Thu, 7 Jul 2022 06:34:11 -0700 Subject: [PATCH 347/926] VPX: Add quantize speed test for ref vs opt. Bug: b/237714063 Change-Id: I4304ba8d976fed3613e28442983b04a9cfc15b79 --- test/vp9_quantize_test.cc | 226 ++++++++++++++++++++++---------------- 1 file changed, 134 insertions(+), 92 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index ca1062a76f..b14a20cfcc 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -67,6 +67,45 @@ void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, fn(coeff, count, round, quant, qcoeff, dqcoeff, dequant, eob, scan, iscan); } +void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, + int16_t *quant, int16_t *quant_shift, + int16_t *dequant, int16_t *round_fp, + int16_t *quant_fp) { + // Max when q == 0. Otherwise, it is 48 for Y and 42 for U/V. + constexpr int kMaxQRoundingFactorFp = 64; + + for (int j = 0; j < 2; j++) { + // The range is 4 to 1828 in the VP9 tables. + const int qlookup = rnd->RandRange(1825) + 4; + round_fp[j] = (kMaxQRoundingFactorFp * qlookup) >> 7; + quant_fp[j] = (1 << 16) / qlookup; + + // Values determined by deconstructing vp9_init_quantizer(). + // zbin may be up to 1143 for 8 and 10 bit Y values, or 1200 for 12 bit Y + // values or U/V values of any bit depth. This is because y_delta is not + // factored into the vp9_ac_quant() call. 
+ zbin[j] = rnd->RandRange(1200); + + // round may be up to 685 for Y values or 914 for U/V. + round[j] = rnd->RandRange(914); + // quant ranges from 1 to -32703 + quant[j] = static_cast<int16_t>(rnd->RandRange(32704)) - 32703; + // quant_shift goes up to 1 << 16. + quant_shift[j] = rnd->RandRange(16384); + // dequant maxes out at 1828 for all cases. + dequant[j] = rnd->RandRange(1828); + } + for (int j = 2; j < 8; j++) { + zbin[j] = zbin[1]; + round_fp[j] = round_fp[1]; + quant_fp[j] = quant_fp[1]; + round[j] = round[1]; + quant[j] = quant[1]; + quant_shift[j] = quant_shift[1]; + dequant[j] = dequant[1]; + } +} + class VP9QuantizeBase : public AbstractBench { public: VP9QuantizeBase(vpx_bit_depth_t bit_depth, int max_size, bool is_fp) @@ -148,6 +187,7 @@ class VP9QuantizeTest : public VP9QuantizeBase, protected: virtual void Run(); + void Speed(bool is_median); const QuantizeFunc quantize_op_; const QuantizeFunc ref_quantize_op_; }; @@ -159,6 +199,98 @@ void VP9QuantizeTest::Run() { scan_->iscan); } +void VP9QuantizeTest::Speed(bool is_median) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + ASSERT_TRUE(coeff_.Init()); + ASSERT_TRUE(qcoeff_.Init()); + ASSERT_TRUE(dqcoeff_.Init()); + TX_SIZE starting_sz, ending_sz; + + if (max_size_ == 16) { + starting_sz = TX_4X4; + ending_sz = TX_16X16; + } else { + starting_sz = TX_32X32; + ending_sz = TX_32X32; + } + + for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) { + // zbin > coeff, zbin < coeff. + for (int i = 0; i < 2; ++i) { + // TX_TYPE defines the scan order. That is not relevant to the speed test. + // Pick the first one. + const TX_TYPE tx_type = DCT_DCT; + count_ = (4 << sz) * (4 << sz); + scan_ = &vp9_scan_orders[sz][tx_type]; + + GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, + quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, + quant_fp_ptr_); + + if (i == 0) { + // When |coeff values| are less than zbin the results are 0. + int threshold = 100; + if (max_size_ == 32) { + // For 32x32, the threshold is halved. Double it to keep the values + // from clearing it. + threshold = 200; + } + for (int j = 0; j < 8; ++j) zbin_ptr_[j] = threshold; + coeff_.Set(&rnd, -99, 99); + } else if (i == 1) { + for (int j = 0; j < 8; ++j) zbin_ptr_[j] = 50; + coeff_.Set(&rnd, -500, 500); + } + if (is_median) { + RunNTimes(10000000 / count_); + const char *type = + (i == 0) ?
"Bypass calculations " : "Full calculations "; + char block_size[16]; + snprintf(block_size, sizeof(block_size), "%dx%d", 4 << sz, 4 << sz); + char title[100]; + snprintf(title, sizeof(title), "%25s %8s ", type, block_size); + PrintMedian(title); + } else { + Buffer ref_qcoeff = + Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_qcoeff.Init()); + Buffer ref_dqcoeff = + Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_dqcoeff.Init()); + uint16_t ref_eob = 0; + + const int kNumTests = 5000000; + vpx_usec_timer timer, simd_timer; + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, + q_ptr_, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), + ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, + scan_->scan, scan_->iscan); + } + vpx_usec_timer_mark(&timer); + + vpx_usec_timer_start(&simd_timer); + for (int n = 0; n < kNumTests; ++n) { + quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, + quant_shift_ptr_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, + scan_->scan, scan_->iscan); + } + vpx_usec_timer_mark(&simd_timer); + + const int elapsed_time = + static_cast(vpx_usec_timer_elapsed(&timer)); + const int simd_elapsed_time = + static_cast(vpx_usec_timer_elapsed(&simd_timer)); + printf("c_time = %d \t simd_time = %d \t Gain = %f \n", elapsed_time, + simd_elapsed_time, ((float)elapsed_time / simd_elapsed_time)); + } + } + } +} + // This quantizer compares the AC coefficients to the quantization step size to // determine if further multiplication operations are needed. // Based on vp9_quantize_fp_sse2(). @@ -254,45 +386,6 @@ void quantize_fp_32x32_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 1); } -void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, - int16_t *quant, int16_t *quant_shift, - int16_t *dequant, int16_t *round_fp, - int16_t *quant_fp) { - // Max when q == 0. Otherwise, it is 48 for Y and 42 for U/V. - const int max_qrounding_factor_fp = 64; - - for (int j = 0; j < 2; j++) { - // The range is 4 to 1828 in the VP9 tables. - const int qlookup = rnd->RandRange(1825) + 4; - round_fp[j] = (max_qrounding_factor_fp * qlookup) >> 7; - quant_fp[j] = (1 << 16) / qlookup; - - // Values determined by deconstructing vp9_init_quantizer(). - // zbin may be up to 1143 for 8 and 10 bit Y values, or 1200 for 12 bit Y - // values or U/V values of any bit depth. This is because y_delta is not - // factored into the vp9_ac_quant() call. - zbin[j] = rnd->RandRange(1200); - - // round may be up to 685 for Y values or 914 for U/V. - round[j] = rnd->RandRange(914); - // quant ranges from 1 to -32703 - quant[j] = static_cast(rnd->RandRange(32704)) - 32703; - // quant_shift goes up to 1 << 16. - quant_shift[j] = rnd->RandRange(16384); - // dequant maxes out at 1828 for all cases. 
- dequant[j] = rnd->RandRange(1828); - } - for (int j = 2; j < 8; j++) { - zbin[j] = zbin[1]; - round_fp[j] = round_fp[1]; - quant_fp[j] = quant_fp[1]; - round[j] = round[1]; - quant[j] = quant[1]; - quant_shift[j] = quant_shift[1]; - dequant[j] = dequant[1]; - } -} - TEST_P(VP9QuantizeTest, OperationCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); ASSERT_TRUE(coeff_.Init()); @@ -403,60 +496,9 @@ TEST_P(VP9QuantizeTest, EOBCheck) { } } -TEST_P(VP9QuantizeTest, DISABLED_Speed) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - ASSERT_TRUE(coeff_.Init()); - ASSERT_TRUE(qcoeff_.Init()); - ASSERT_TRUE(dqcoeff_.Init()); - TX_SIZE starting_sz, ending_sz; - - if (max_size_ == 16) { - starting_sz = TX_4X4; - ending_sz = TX_16X16; - } else { - starting_sz = TX_32X32; - ending_sz = TX_32X32; - } +TEST_P(VP9QuantizeTest, DISABLED_Speed) { Speed(false); } - for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) { - // zbin > coeff, zbin < coeff. - for (int i = 0; i < 2; ++i) { - // TX_TYPE defines the scan order. That is not relevant to the speed test. - // Pick the first one. - const TX_TYPE tx_type = DCT_DCT; - count_ = (4 << sz) * (4 << sz); - scan_ = &vp9_scan_orders[sz][tx_type]; - - GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, - quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, - quant_fp_ptr_); - - if (i == 0) { - // When |coeff values| are less than zbin the results are 0. - int threshold = 100; - if (max_size_ == 32) { - // For 32x32, the threshold is halved. Double it to keep the values - // from clearing it. - threshold = 200; - } - for (int j = 0; j < 8; ++j) zbin_ptr_[j] = threshold; - coeff_.Set(&rnd, -99, 99); - } else if (i == 1) { - for (int j = 0; j < 8; ++j) zbin_ptr_[j] = 50; - coeff_.Set(&rnd, -500, 500); - } - - RunNTimes(10000000 / count_); - const char *type = - (i == 0) ? "Bypass calculations " : "Full calculations "; - char block_size[16]; - snprintf(block_size, sizeof(block_size), "%dx%d", 4 << sz, 4 << sz); - char title[100]; - snprintf(title, sizeof(title), "%25s %8s ", type, block_size); - PrintMedian(title); - } - } -} +TEST_P(VP9QuantizeTest, DISABLED_SpeedMedian) { Speed(true); } using std::make_tuple; From 8f4d1890cbecbbec47c8baeba53e89a2b37ae3a8 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 6 Jul 2022 08:42:54 +0000 Subject: [PATCH 348/926] Revert "Revert "[NEON] Optimize vp9_diamond_search_sad() for NEON"" This reverts commit 9f1329f8ac88ea5d7c6ae5d6a57221c36cf85ac8 and fixes a dumb mistake in evaluation of vfcmv. Used vdupq_n_s16, instead of vdupq_n_s32. 
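To make the fixed mistake concrete: int_mv packs a 16-bit row and a 16-bit col into one 32-bit word, so broadcasting a motion vector across eight int16 lanes has to duplicate the whole 32-bit value and then reinterpret the lanes; vdupq_n_s16 would truncate to the low 16 bits and replicate a single component into every lane. A minimal sketch of the distinction (not part of the patch; the union below abbreviates vp9's actual int_mv layout):

    #include <arm_neon.h>
    #include <stdint.h>

    typedef union {
      uint32_t as_int;
      struct { int16_t row, col; } as_mv; /* abbreviated int_mv layout */
    } int_mv_sketch;

    static int16x8_t broadcast_mv(int_mv_sketch mv) {
      /* Wrong: vdupq_n_s16((int16_t)mv.as_int) keeps only the low half.
       * Right: dup the packed 32-bit word, then view it as 8 x s16 so each
       * adjacent lane pair holds one (row, col) candidate. */
      return vreinterpretq_s16_s32(vdupq_n_s32((int32_t)mv.as_int));
    }

The corrected evaluation of vfcmv is actually applied in patch 351 below.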
Change-Id: Ie236c878c166405c49bc0f93f6d63a6715534a0a --- vp9/common/vp9_rtcd_defs.pl | 2 +- .../arm/neon/vp9_diamond_search_sad_neon.c | 322 ++++++++++++++++++ vp9/vp9cx.mk | 1 + 3 files changed, 324 insertions(+), 1 deletion(-) create mode 100644 vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 4da0b6675b..e6b65c96f0 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -175,7 +175,7 @@ () # Motion search # add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; -specialize qw/vp9_diamond_search_sad avx/; +specialize qw/vp9_diamond_search_sad avx neon/; # # Apply temporal filter diff --git a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c new file mode 100644 index 0000000000..e56733d43e --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "vpx_dsp/vpx_dsp_common.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vpx_ports/mem.h" + +#ifdef __GNUC__ +#define LIKELY(v) __builtin_expect(v, 1) +#define UNLIKELY(v) __builtin_expect(v, 0) +#else +#define LIKELY(v) (v) +#define UNLIKELY(v) (v) +#endif + +static INLINE int_mv pack_int_mv(int16_t row, int16_t col) { + int_mv result; + result.as_mv.row = row; + result.as_mv.col = col; + return result; +} + +static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) { + // This is simplified from the C implementation to utilise that + // x->nmvjointsadcost[1] == x->nmvjointsadcost[2] and + // x->nmvjointsadcost[1] == x->nmvjointsadcost[3] + return mv.as_int == 0 ? 0 : 1; +} + +static INLINE int mv_cost(const int_mv mv, const int *joint_cost, + int *const comp_cost[2]) { + assert(mv.as_mv.row >= -MV_MAX && mv.as_mv.row < MV_MAX); + assert(mv.as_mv.col >= -MV_MAX && mv.as_mv.col < MV_MAX); + return joint_cost[get_mv_joint(mv)] + comp_cost[0][mv.as_mv.row] + + comp_cost[1][mv.as_mv.col]; +} + +static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, + int sad_per_bit) { + const int_mv diff = + pack_int_mv(mv.as_mv.row - ref->row, mv.as_mv.col - ref->col); + return ROUND_POWER_OF_TWO( + (unsigned)mv_cost(diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, + VP9_PROB_COST_SHIFT); +} + +/***************************************************************************** + * This function utilizes 3 properties of the cost function lookup tables, * + * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in * + * vp9_encoder.c.
* + * For the joint cost: * + * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] * + * For the component costs: * + * - For all i: mvsadcost[0][i] == mvsadcost[1][i] * + * (Equal costs for both components) * + * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] * + * (Cost function is even) * + * If these do not hold, then this function cannot be used without * + * modification, in which case you can revert to using the C implementation, * + * which does not rely on these properties. * + *****************************************************************************/ +int vp9_diamond_search_sad_neon(const MACROBLOCK *x, + const search_site_config *cfg, MV *ref_mv, + MV *best_mv, int search_param, int sad_per_bit, + int *num00, const vp9_variance_fn_ptr_t *fn_ptr, + const MV *center_mv) { + static const uint32_t data[4] = { 0, 1, 2, 3 }; + const uint32x4_t v_idx_d = vld1q_u32((const uint32_t *)data); + + const int32x4_t zero_s32 = vdupq_n_s32(0); + const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max); + const int16x8_t v_max_mv_w = vreinterpretq_s16_s32(vdupq_n_s32(maxmv.as_int)); + const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min); + const int16x8_t v_min_mv_w = vreinterpretq_s16_s32(vdupq_n_s32(minmv.as_int)); + + const int32x4_t v_spb_d = vdupq_n_s32(sad_per_bit); + + const int32x4_t v_joint_cost_0_d = vdupq_n_s32(x->nmvjointsadcost[0]); + const int32x4_t v_joint_cost_1_d = vdupq_n_s32(x->nmvjointsadcost[1]); + + // search_param determines the length of the initial step and hence the number + // of iterations. + // 0 = initial step (MAX_FIRST_STEP) pel + // 1 = (MAX_FIRST_STEP/2) pel, + // 2 = (MAX_FIRST_STEP/4) pel... + const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param]; + const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param]; + const int tot_steps = cfg->total_steps - search_param; + + const int_mv fcenter_mv = + pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); + const int16x8_t vfcmv = vdupq_n_s16(fcenter_mv.as_int); + + const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); + const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); + + int_mv bmv = pack_int_mv(ref_row, ref_col); + int_mv new_bmv = bmv; + int16x8_t v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int)); + + const int what_stride = x->plane[0].src.stride; + const int in_what_stride = x->e_mbd.plane[0].pre[0].stride; + const uint8_t *const what = x->plane[0].src.buf; + const uint8_t *const in_what = + x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; + + // Work out the start point for the search + const uint8_t *best_address = in_what; + const uint8_t *new_best_address = best_address; +#if defined(__aarch64__) + int64x2_t v_ba_q = vdupq_n_s64((intptr_t)best_address); +#else + int32x4_t v_ba_d = vdupq_n_s32((intptr_t)best_address); +#endif + unsigned int best_sad = INT_MAX; + int i, j, step; + + // Check the prerequisite cost function properties that are easy to check + // in an assert. See the function-level documentation for details on all + // prerequisites. 
+ assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); + assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); + + // Check the starting position + best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride); + best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit); + + *num00 = 0; + + for (i = 0, step = 0; step < tot_steps; step++) { + for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) { + int16x8_t v_diff_mv_w; + int8x16_t v_inside_d; + uint32x4_t v_outside_d; + int32x4_t v_cost_d, v_sad_d; +#if defined(__aarch64__) + int64x2_t v_blocka[2]; +#else + int32x4_t v_blocka[1]; + uint32x2_t horiz_max_0, horiz_max_1; +#endif + + uint32_t horiz_max; + // Compute the candidate motion vectors + const int16x8_t v_ss_mv_w = vld1q_s16((const int16_t *)&ss_mv[i]); + const int16x8_t v_these_mv_w = vaddq_s16(v_bmv_w, v_ss_mv_w); + // Clamp them to the search bounds + int16x8_t v_these_mv_clamp_w = v_these_mv_w; + v_these_mv_clamp_w = vminq_s16(v_these_mv_clamp_w, v_max_mv_w); + v_these_mv_clamp_w = vmaxq_s16(v_these_mv_clamp_w, v_min_mv_w); + // The ones that did not change are inside the search area + v_inside_d = vreinterpretq_s8_u32( + vceqq_s32(vreinterpretq_s32_s16(v_these_mv_clamp_w), + vreinterpretq_s32_s16(v_these_mv_w))); + + // If none of them are inside, then move on +#if defined(__aarch64__) + horiz_max = vmaxvq_u32(vreinterpretq_u32_s8(v_inside_d)); +#else + horiz_max_0 = vmax_u32(vget_low_u32(vreinterpretq_u32_s8(v_inside_d)), + vget_high_u32(vreinterpretq_u32_s8(v_inside_d))); + horiz_max_1 = vpmax_u32(horiz_max_0, horiz_max_0); + vst1_lane_u32(&horiz_max, horiz_max_1, 0); +#endif + if (LIKELY(horiz_max == 0)) { + continue; + } + + // The inverse mask indicates which of the MVs are outside + v_outside_d = + vreinterpretq_u32_s8(veorq_s8(v_inside_d, vdupq_n_s8((int8_t)0xff))); + // Shift right to keep the sign bit clear, we will use this later + // to set the cost to the maximum value. + v_outside_d = vshrq_n_u32(v_outside_d, 1); + + // Compute the difference MV + v_diff_mv_w = vsubq_s16(v_these_mv_clamp_w, vfcmv); + // We utilise the fact that the cost function is even, and use the + // absolute difference. This allows us to use unsigned indexes later + // and reduces cache pressure somewhat as only a half of the table + // is ever referenced. + v_diff_mv_w = vabsq_s16(v_diff_mv_w); + + // Compute the SIMD pointer offsets. 
+ { +#if defined(__aarch64__) // sizeof(intptr_t) == 8 + // Load the offsets + int64x2_t v_bo10_q = vld1q_s64((const int64_t *)&ss_os[i + 0]); + int64x2_t v_bo32_q = vld1q_s64((const int64_t *)&ss_os[i + 2]); + // Set the ones falling outside to zero + v_bo10_q = vandq_s64( + v_bo10_q, + vmovl_s32(vget_low_s32(vreinterpretq_s32_s8(v_inside_d)))); + v_bo32_q = vandq_s64( + v_bo32_q, + vmovl_s32(vget_high_s32(vreinterpretq_s32_s8(v_inside_d)))); + // Compute the candidate addresses + v_blocka[0] = vaddq_s64(v_ba_q, v_bo10_q); + v_blocka[1] = vaddq_s64(v_ba_q, v_bo32_q); +#else // sizeof(intptr_t) == 4 + int32x4_t v_bo_d = vld1q_s32((const int32_t *)&ss_os[i]); + v_bo_d = vandq_s32(v_bo_d, vreinterpretq_s32_s8(v_inside_d)); + v_blocka[0] = vaddq_s32(v_ba_d, v_bo_d); +#endif + } + + fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], + in_what_stride, (uint32_t *)&v_sad_d); + + // Look up the component cost of the residual motion vector + { + uint32_t cost[4]; + int16_t __attribute__((aligned(16))) rowcol[8]; + vst1q_s16(rowcol, v_diff_mv_w); + + // Note: This is a use case for gather instruction + cost[0] = x->nmvsadcost[0][rowcol[0]] + x->nmvsadcost[0][rowcol[1]]; + cost[1] = x->nmvsadcost[0][rowcol[2]] + x->nmvsadcost[0][rowcol[3]]; + cost[2] = x->nmvsadcost[0][rowcol[4]] + x->nmvsadcost[0][rowcol[5]]; + cost[3] = x->nmvsadcost[0][rowcol[6]] + x->nmvsadcost[0][rowcol[7]]; + + v_cost_d = vld1q_s32((int32_t *)cost); + } + + // Now add in the joint cost + { + const uint32x4_t v_sel_d = + vceqq_s32(vreinterpretq_s32_s16(v_diff_mv_w), zero_s32); + const int32x4_t v_joint_cost_d = vreinterpretq_s32_u8( + vbslq_u8(vreinterpretq_u8_u32(v_sel_d), + vreinterpretq_u8_s32(v_joint_cost_0_d), + vreinterpretq_u8_s32(v_joint_cost_1_d))); + v_cost_d = vaddq_s32(v_cost_d, v_joint_cost_d); + } + + // Multiply by sad_per_bit + v_cost_d = vmulq_s32(v_cost_d, v_spb_d); + // ROUND_POWER_OF_TWO(v_cost_d, VP9_PROB_COST_SHIFT) + v_cost_d = + vaddq_s32(v_cost_d, vdupq_n_s32(1 << (VP9_PROB_COST_SHIFT - 1))); + v_cost_d = vshrq_n_s32(v_cost_d, VP9_PROB_COST_SHIFT); + // Add the cost to the sad + v_sad_d = vaddq_s32(v_sad_d, v_cost_d); + + // Make the motion vectors outside the search area have max cost + // by or'ing in the comparison mask, this way the minimum search won't + // pick them. 
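+ // (After the vshrq_n_u32 by 1 above, v_outside_d lanes are either 0 or + // 0x7fffffff, so the or leaves in-range SADs untouched and saturates + // out-of-range ones to INT32_MAX with the sign bit still clear.)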
+ v_sad_d = vorrq_s32(v_sad_d, vreinterpretq_s32_u32(v_outside_d)); + + // Find the minimum value and index horizontally in v_sad_d + { + uint32_t local_best_sad; +#if defined(__aarch64__) + local_best_sad = vminvq_u32(vreinterpretq_u32_s32(v_sad_d)); +#else + uint32x2_t horiz_min_0 = + vmin_u32(vget_low_u32(vreinterpretq_u32_s32(v_sad_d)), + vget_high_u32(vreinterpretq_u32_s32(v_sad_d))); + uint32x2_t horiz_min_1 = vpmin_u32(horiz_min_0, horiz_min_0); + vst1_lane_u32(&local_best_sad, horiz_min_1, 0); +#endif + + // Update the global minimum if the local minimum is smaller + if (LIKELY(local_best_sad < best_sad)) { +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + uint32_t local_best_idx; + const uint32x4_t v_sel_d = + vceqq_s32(v_sad_d, vdupq_n_s32(local_best_sad)); + uint32x4_t v_mask_d = vandq_u32(v_sel_d, v_idx_d); + v_mask_d = vbslq_u32(v_sel_d, v_mask_d, vdupq_n_u32(0xffffffff)); + +#if defined(__aarch64__) + local_best_idx = vminvq_u32(v_mask_d); +#else + horiz_min_0 = + vmin_u32(vget_low_u32(v_mask_d), vget_high_u32(v_mask_d)); + horiz_min_1 = vpmin_u32(horiz_min_0, horiz_min_0); + vst1_lane_u32(&local_best_idx, horiz_min_1, 0); +#endif + + new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx]; +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic pop +#endif + new_best_address = ((const uint8_t **)v_blocka)[local_best_idx]; + + best_sad = local_best_sad; + } + } + } + + bmv = new_bmv; + best_address = new_best_address; + + v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int)); +#if defined(__aarch64__) + v_ba_q = vdupq_n_s64((intptr_t)best_address); +#else + v_ba_d = vdupq_n_s32((intptr_t)best_address); +#endif + + if (UNLIKELY(best_address == in_what)) { + (*num00)++; + } + } + + *best_mv = bmv.as_mv; + return best_sad; +} diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 92a7fddb9d..c9afd9a347 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -113,6 +113,7 @@ VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_constants.h VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_quantize_avx2.c VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_diamond_search_sad_neon.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_temporal_filter_sse4.c From 873aab02adb54370c062726de056ea4f1888cb0e Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 9 Jul 2022 14:49:54 -0700 Subject: [PATCH 349/926] vp8_macros_msa.h: avoid shadowing variables in defines this avoids a warning with certain versions of gcc; observed with: mipsisa32r6el-linux-gnu-gcc (Debian 10.2.1-6) 10.2.1 20210110 Change-Id: I8999f487a79a9d53133816d572054b2423330bcf --- vp8/common/mips/msa/vp8_macros_msa.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vp8/common/mips/msa/vp8_macros_msa.h b/vp8/common/mips/msa/vp8_macros_msa.h index ddc881a7fc..fde22f537c 100644 --- a/vp8/common/mips/msa/vp8_macros_msa.h +++ b/vp8/common/mips/msa/vp8_macros_msa.h @@ -69,12 +69,12 @@ #else // !(__mips == 64) #define LD(psrc) \ ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + const uint8_t *psrc_ld = (const uint8_t *)(psrc); \ uint32_t val0_m, val1_m; \ uint64_t val_m = 0; \ \ - val0_m = LW(psrc_m); \ - val1_m = LW(psrc_m + 4); \ + val0_m 
= LW(psrc_ld); \ + val1_m = LW(psrc_ld + 4); \ \ val_m = (uint64_t)(val1_m); \ val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ From e2603ead67947cd534e0a593422bae6427451ad6 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Wed, 6 Jul 2022 08:51:52 -0700 Subject: [PATCH 350/926] VPX: Add vpx_quantize_b_avx2(). Up to 1.58x faster than vpx_quantize_b_avx() depending on the size. Bug: b/237714063 Change-Id: I595a6bb32ebee63f69f27b5a15322fdeae1bf70e --- test/vp9_quantize_test.cc | 23 +++-- vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- vpx_dsp/x86/quantize_avx2.c | 185 +++++++++++++++++++++++++++++++++++ 4 files changed, 201 insertions(+), 10 deletions(-) create mode 100644 vpx_dsp/x86/quantize_avx2.c diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index b14a20cfcc..c7ce13ff22 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -241,14 +241,16 @@ void VP9QuantizeTest::Speed(bool is_median) { for (int j = 0; j < 8; ++j) zbin_ptr_[j] = 50; coeff_.Set(&rnd, -500, 500); } + + const char *type = + (i == 0) ? "Bypass calculations " : "Full calculations "; + char block_size[16]; + snprintf(block_size, sizeof(block_size), "%dx%d", 4 << sz, 4 << sz); + char title[100]; + snprintf(title, sizeof(title), "%25s %8s ", type, block_size); + if (is_median) { RunNTimes(10000000 / count_); - const char *type = - (i == 0) ? "Bypass calculations " : "Full calculations "; - char block_size[16]; - snprintf(block_size, sizeof(block_size), "%dx%d", 4 << sz, 4 << sz); - char title[100]; - snprintf(title, sizeof(title), "%25s %8s ", type, block_size); PrintMedian(title); } else { Buffer<tran_low_t> ref_qcoeff = @@ -284,8 +286,9 @@ void VP9QuantizeTest::Speed(bool is_median) { static_cast<int>(vpx_usec_timer_elapsed(&timer)); const int simd_elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&simd_timer)); - printf("c_time = %d \t simd_time = %d \t Gain = %f \n", elapsed_time, - simd_elapsed_time, ((float)elapsed_time / simd_elapsed_time)); + printf("%s c_time = %d \t simd_time = %d \t Gain = %f \n", title, + elapsed_time, simd_elapsed_time, + ((float)elapsed_time / simd_elapsed_time)); } } } @@ -575,7 +578,9 @@ INSTANTIATE_TEST_SUITE_P( AVX2, VP9QuantizeTest, ::testing::Values(make_tuple(&QuantFPWrapper<vp9_quantize_fp_avx2>, &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, - 16, true))); + 16, true), + make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, + VPX_BITS_8, 16, false))); #endif // HAVE_AVX2 #if HAVE_NEON diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 13999af04d..8e0100c310 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -326,6 +326,7 @@ DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.h DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.c DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.h DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c +DSP_SRCS-$(HAVE_AVX2) += x86/quantize_avx2.c DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c DSP_SRCS-$(HAVE_VSX) += ppc/quantize_vsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/quantize_lsx.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index d3c668f9ae..7ecd3ac90f 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -711,7 +711,7 @@ () # if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; -
specialize qw/vpx_quantize_b neon sse2 ssse3 avx vsx lsx/; + specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/; add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx vsx lsx/; diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c new file mode 100644 index 0000000000..e1c6e944ce --- /dev/null +++ b/vpx_dsp/x86/quantize_avx2.c @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <immintrin.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +static VPX_FORCE_INLINE void load_b_values_avx2( + const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr, + __m256i *round, const int16_t *quant_ptr, __m256i *quant, + const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr, + __m256i *shift) { + *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr)); + *zbin = _mm256_permute4x64_epi64(*zbin, 0x54); + // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when + // calculating the zbin mask. (See quantize_b_logscale{0,1,2}_16) + *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1)); + + *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)); + *round = _mm256_permute4x64_epi64(*round, 0x54); + + *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)); + *quant = _mm256_permute4x64_epi64(*quant, 0x54); + *dequant = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); + *dequant = _mm256_permute4x64_epi64(*dequant, 0x54); + *shift = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr)); + *shift = _mm256_permute4x64_epi64(*shift, 0x54); +} + +static VPX_FORCE_INLINE __m256i +load_coefficients_avx2(const tran_low_t *coeff_ptr) { +#if CONFIG_VP9_HIGHBITDEPTH + // typedef int32_t tran_low_t; + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(coeff_ptr + 8)); + return _mm256_packs_epi32(coeff1, coeff2); +#else + // typedef int16_t tran_low_t; + return _mm256_loadu_si256((const __m256i *)coeff_ptr); +#endif +} + +static VPX_FORCE_INLINE void store_coefficients_avx2(__m256i coeff_vals, + tran_low_t *coeff_ptr) { +#if CONFIG_VP9_HIGHBITDEPTH + // typedef int32_t tran_low_t; + __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15); + __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign); + __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign); + _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals_lo); + _mm256_storeu_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi); +#else + // typedef int16_t tran_low_t; + _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals); +#endif +} + +static VPX_FORCE_INLINE __m256i +quantize_b_16(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, __m256i *v_quant,
__m256i *v_dequant, + __m256i *v_round, __m256i *v_zbin, __m256i *v_quant_shift) { + const __m256i v_coeff = load_coefficients_avx2(coeff_ptr); + const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff); + const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin); + + if (_mm256_movemask_epi8(v_zbin_mask) == 0) { + _mm256_storeu_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256()); + _mm256_storeu_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256()); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256()); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256()); +#endif // CONFIG_VP9_HIGHBITDEPTH + return _mm256_setzero_si256(); + } + { + // tmp = v_zbin_mask ? (int64_t)abs_coeff + log_scaled_round : 0 + const __m256i v_tmp_rnd = + _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask); + + const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant); + const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd); + const __m256i v_tmp32 = _mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift); + const __m256i v_nz_mask = + _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256()); + const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff); +#if CONFIG_VP9_HIGHBITDEPTH + const __m256i low = _mm256_mullo_epi16(v_qcoeff, *v_dequant); + const __m256i high = _mm256_mulhi_epi16(v_qcoeff, *v_dequant); + + const __m256i v_dqcoeff_lo = _mm256_unpacklo_epi16(low, high); + const __m256i v_dqcoeff_hi = _mm256_unpackhi_epi16(low, high); +#else + const __m256i v_dqcoeff = _mm256_mullo_epi16(v_qcoeff, *v_dequant); +#endif + + store_coefficients_avx2(v_qcoeff, qcoeff_ptr); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo); + _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi); +#else + store_coefficients_avx2(v_dqcoeff, dqcoeff_ptr); +#endif + return v_nz_mask; + } +} + +static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan, + __m256i v_eobmax, + __m256i v_mask) { + const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); +#if CONFIG_VP9_HIGHBITDEPTH + // typedef int32_t tran_low_t; + const __m256i v_iscan_perm = _mm256_permute4x64_epi64(v_iscan, 0xD8); + const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan_perm, v_mask); +#else + // typedef int16_t tran_low_t; + const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan, v_mask); +#endif + const __m256i v_nz_iscan = _mm256_and_si256(v_iscan_plus1, v_mask); + return _mm256_max_epi16(v_eobmax, v_nz_iscan); +} + +static VPX_FORCE_INLINE int16_t accumulate_eob256(__m256i eob256) { + const __m128i eob_lo = _mm256_castsi256_si128(eob256); + const __m128i eob_hi = _mm256_extractf128_si256(eob256, 1); + __m128i eob = _mm_max_epi16(eob_lo, eob_hi); + __m128i eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + return _mm_extract_epi16(eob, 1); +} + +void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift, v_nz_mask; + __m256i v_eobmax = 
_mm256_set1_epi16(0); + intptr_t count; + (void)scan; + + load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr, + &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr, + &v_quant_shift); + // Do DC and first 15 AC. + v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, + &v_dequant, &v_round, &v_zbin, &v_quant_shift); + + v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask); + + v_round = _mm256_unpackhi_epi64(v_round, v_round); + v_quant = _mm256_unpackhi_epi64(v_quant, v_quant); + v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant); + v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift); + v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin); + + for (count = n_coeffs - 16; count > 0; count -= 16) { + coeff_ptr += 16; + qcoeff_ptr += 16; + dqcoeff_ptr += 16; + iscan += 16; + v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, + &v_dequant, &v_round, &v_zbin, &v_quant_shift); + + v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask); + } + + *eob_ptr = accumulate_eob256(v_eobmax); +} From cc8236f1d281701afbb05b298769dc6de41500d8 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 13 Jul 2022 16:54:30 +0000 Subject: [PATCH 351/926] Actually include the fix for commit 8f4d1890c. Change-Id: I6780f610151f2e092da525ff064d4b69f74fa61b --- vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c index e56733d43e..33753f77b0 100644 --- a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c +++ b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c @@ -99,7 +99,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); - const int16x8_t vfcmv = vdupq_n_s16(fcenter_mv.as_int); + const int16x8_t vfcmv = vreinterpretq_s16_s32(vdupq_n_s32(fcenter_mv.as_int)); const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); From 168b312774166958897f727196a59ee8ad423e78 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 13 Jul 2022 21:54:22 -0700 Subject: [PATCH 352/926] vpxenc: fix --disable-loopfilter help alignment Change-Id: I34444e6437ca0e735d6db07bf98bfa4741ad2c01 --- vpxenc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vpxenc.c b/vpxenc.c index 7eff97b132..61672acadd 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -524,9 +524,12 @@ static const arg_def_t row_mt = static const arg_def_t disable_loopfilter = ARG_DEF(NULL, "disable-loopfilter", 1, - "Control Loopfilter in VP9\n" + "Control Loopfilter in VP9:\n" + " " "0: Loopfilter on for all frames (default)\n" + " " "1: Loopfilter off for non reference frames\n" + " " "2: Loopfilter off for all frames"); #endif From 68d9e7aa2f281a4be6b8f3efb24b67b2bfd1f67d Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Thu, 14 Jul 2022 11:41:11 -0700 Subject: [PATCH 353/926] L2E: Update the description of allow_alt_ref It is fixed per each encoding and can not be changed per GOP. 
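As an illustration of how an external model consumes this flag, here is a minimal sketch of a GOP decision callback; the callback shape and field names follow vpx/vpx_ext_ratectrl.h, but the interval arithmetic is illustrative only, not taken from this patch:

    static vpx_rc_status_t sketch_get_gop_decision(
        vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info,
        vpx_rc_gop_decision_t *gop_decision) {
      (void)rate_ctrl_model;
      /* allow_alt_ref is fixed for the whole encode; a model may only
       * request an alt ref when the encoder permits it. */
      gop_decision->use_alt_ref = gop_info->allow_alt_ref;
      /* A GOP that codes an alt ref carries one extra coding frame. */
      gop_decision->gop_coding_frames =
          gop_info->active_max_gf_interval +
          (gop_decision->use_alt_ref ? 1 : 0);
      return VPX_RC_OK;
    }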
Change-Id: I5905b712437142f2274bfa674ceef6093495457f --- vpx/vpx_ext_ratectrl.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index c3309b0f26..b6c950d87e 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -308,7 +308,9 @@ typedef struct vpx_rc_gop_info { */ int active_max_gf_interval; /*! - * Whether to allow the use of alt ref, can be changed per gop. + * Whether to allow the use of alt ref, determined by the encoder. + * It is fixed for the entire encode. + * See function "is_altref_enabled" in vp9_encoder.h. */ int allow_alt_ref; /*! From a5ead0427c21ef15b12f9b582e860b77f714c622 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 18 Jul 2022 18:56:03 -0700 Subject: [PATCH 354/926] vpx_int_pro_row_c: add an assert for height this quiets a static analysis warning with clang 11: vpx_dsp/avg.c:353:15: warning: Assigned value is garbage or undefined [core.uninitialized.Assign] hbuf[idx] /= norm_factor; ^ ~~~~~~~~~~~ the same fix was applied in libaom: 1ad0889bc aom_int_pro_row_c: add an assert for height Bug: b/229626362 Change-Id: Ic8a249f866b33b02ec9f378581e51ac104d97169 --- vpx_dsp/avg.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vpx_dsp/avg.c b/vpx_dsp/avg.c index 1c45e8a73d..9540154074 100644 --- a/vpx_dsp/avg.c +++ b/vpx_dsp/avg.c @@ -7,6 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ + +#include <assert.h> #include <stdlib.h> + #include "./vpx_dsp_rtcd.h" @@ -344,6 +346,7 @@ void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height) { int idx; const int norm_factor = height >> 1; + assert(height >= 2); for (idx = 0; idx < 16; ++idx) { int i; hbuf[idx] = 0; From 53dd1e8e785ea42fa88499dbfd0c2c9dcd055833 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 18 Jul 2022 19:00:49 -0700 Subject: [PATCH 355/926] avg_intrin_{sse2,avg2}: rm dead store in hadamard_8x8 this quiets a couple static analysis warnings with clang 11: vpx_dsp/x86/avg_intrin_sse2.c:278:45: warning: Although the value stored to 'src_diff' is used in the enclosing expression, the value is never actually read from 'src_diff' [deadcode.DeadStores] src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); ^ ~~~~~~~~~~ vpx_dsp/x86/avg_intrin_avx2.c:307:49: warning: Although the value stored to 'src_diff' is used in the enclosing expression, the value is never actually read from 'src_diff' [deadcode.DeadStores] src[7] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); ^ ~~~~~~~~~~ Bug: b/229626362 Change-Id: I4b0201bd39775885df0afc03fa5da70910b9dad6 --- vpx_dsp/x86/avg_intrin_avx2.c | 2 +- vpx_dsp/x86/avg_intrin_sse2.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c index 3f4f577a21..d93e6ccae5 100644 --- a/vpx_dsp/x86/avg_intrin_avx2.c +++ b/vpx_dsp/x86/avg_intrin_avx2.c @@ -304,7 +304,7 @@ static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride, src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); - src[7] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[7] = _mm256_loadu_si256((const __m256i *)(src_diff + src_stride)); hadamard_col8x2_avx2(src, 0); hadamard_col8x2_avx2(src, 1); diff --git
a/vpx_dsp/x86/avg_intrin_sse2.c b/vpx_dsp/x86/avg_intrin_sse2.c index 9da2f34c9b..0c4919f6d8 100644 --- a/vpx_dsp/x86/avg_intrin_sse2.c +++ b/vpx_dsp/x86/avg_intrin_sse2.c @@ -275,7 +275,7 @@ static INLINE void hadamard_8x8_sse2(const int16_t *src_diff, src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[7] = _mm_load_si128((const __m128i *)(src_diff + src_stride)); hadamard_col8_sse2(src, 0); hadamard_col8_sse2(src, 1); From 414b4f05124b27c512a63c78c3057934850bc941 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Tue, 12 Jul 2022 13:22:35 -0700 Subject: [PATCH 356/926] VPX: Add vpx_quantize_b_32x32_avx2(). Up to 1.36x faster than vpx_quantize_b_32x32_avx() for full calculations. Up to 1.29x faster for VP9_HIGHBITDEPTH builds. Bug: b/237714063 Change-Id: I97aa6a18d4dc2f3187b76800f91bbba7be447ef1 --- test/vp9_quantize_test.cc | 5 +- vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- vpx_dsp/x86/quantize_avx2.c | 117 ++++++++++++++++++++++++++++++++++- 3 files changed, 119 insertions(+), 5 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index c7ce13ff22..3e0dd77396 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -580,7 +580,10 @@ INSTANTIATE_TEST_SUITE_P( &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true), make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, - VPX_BITS_8, 16, false))); + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_avx2, + &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, + false))); #endif // HAVE_AVX2 #if HAVE_NEON diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 7ecd3ac90f..beb594f9f3 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -714,7 +714,7 @@ () specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/; add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_quantize_b_32x32 neon ssse3 avx vsx lsx/; + specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/; diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c index e1c6e944ce..6fd5174876 100644 --- a/vpx_dsp/x86/quantize_avx2.c +++ b/vpx_dsp/x86/quantize_avx2.c @@ -18,15 +18,25 @@ static VPX_FORCE_INLINE void load_b_values_avx2( const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr, __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr, - __m256i *shift) { + __m256i *shift, int log_scale) { *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr)); *zbin = _mm256_permute4x64_epi64(*zbin, 0x54); + if (log_scale > 0) { + const __m256i rnd = _mm256_set1_epi16((int16_t)(1 <<
(log_scale - 1))); + *zbin = _mm256_add_epi16(*zbin, rnd); + *zbin = _mm256_srai_epi16(*zbin, log_scale); + } // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when // calculating the zbin mask. (See quantize_b_logscale{0,1,2}_16) *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1)); *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)); *round = _mm256_permute4x64_epi64(*round, 0x54); + if (log_scale > 0) { + const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1))); + *round = _mm256_add_epi16(*round, rnd); + *round = _mm256_srai_epi16(*round, log_scale); + } *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)); *quant = _mm256_permute4x64_epi64(*quant, 0x54); @@ -151,13 +161,13 @@ void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift, v_nz_mask; - __m256i v_eobmax = _mm256_set1_epi16(0); + __m256i v_eobmax = _mm256_setzero_si256(); intptr_t count; (void)scan; load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr, &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr, - &v_quant_shift); + &v_quant_shift, 0); // Do DC and first 15 AC. v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, &v_dequant, &v_round, &v_zbin, &v_quant_shift); @@ -183,3 +193,104 @@ void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob256(v_eobmax); } + +static VPX_FORCE_INLINE __m256i quantize_b_32x32_16( + const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *iscan, __m256i *v_quant, + __m256i *v_dequant, __m256i *v_round, __m256i *v_zbin, + __m256i *v_quant_shift, __m256i *v_eobmax) { + const __m256i v_coeff = load_coefficients_avx2(coeff_ptr); + const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff); + const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin); + + if (_mm256_movemask_epi8(v_zbin_mask) == 0) { + _mm256_store_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256()); + _mm256_store_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256()); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256()); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256()); +#endif + return *v_eobmax; + } + { + // tmp = v_zbin_mask ? 
(int64_t)abs_coeff + round : 0 + const __m256i v_tmp_rnd = + _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask); + // tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + // quant_shift_ptr[rc != 0]) >> 15); + const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant); + const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd); + const __m256i v_tmp32_hi = + _mm256_slli_epi16(_mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift), 1); + const __m256i v_tmp32_lo = + _mm256_srli_epi16(_mm256_mullo_epi16(v_tmp32_b, *v_quant_shift), 15); + const __m256i v_tmp32 = _mm256_or_si256(v_tmp32_hi, v_tmp32_lo); + const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff); + const __m256i v_sign_lo = + _mm256_unpacklo_epi16(_mm256_setzero_si256(), v_coeff); + const __m256i v_sign_hi = + _mm256_unpackhi_epi16(_mm256_setzero_si256(), v_coeff); + const __m256i low = _mm256_mullo_epi16(v_tmp32, *v_dequant); + const __m256i high = _mm256_mulhi_epi16(v_tmp32, *v_dequant); + const __m256i v_dqcoeff_lo = _mm256_sign_epi32( + _mm256_srli_epi32(_mm256_unpacklo_epi16(low, high), 1), v_sign_lo); + const __m256i v_dqcoeff_hi = _mm256_sign_epi32( + _mm256_srli_epi32(_mm256_unpackhi_epi16(low, high), 1), v_sign_hi); + const __m256i v_nz_mask = + _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256()); + + store_coefficients_avx2(v_qcoeff, qcoeff_ptr); + +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo); + _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi); +#else + store_coefficients_avx2(_mm256_packs_epi32(v_dqcoeff_lo, v_dqcoeff_hi), + dqcoeff_ptr); +#endif + + return get_max_lane_eob(iscan, *v_eobmax, v_nz_mask); + } +} + +void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; + __m256i v_eobmax = _mm256_setzero_si256(); + intptr_t count; + (void)n_coeffs; + (void)scan; + + load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr, + &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr, + &v_quant_shift, 1); + + // Do DC and first 15 AC. + v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan, + &v_quant, &v_dequant, &v_round, &v_zbin, + &v_quant_shift, &v_eobmax); + + v_round = _mm256_unpackhi_epi64(v_round, v_round); + v_quant = _mm256_unpackhi_epi64(v_quant, v_quant); + v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant); + v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift); + v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin); + + for (count = (32 * 32) - 16; count > 0; count -= 16) { + coeff_ptr += 16; + qcoeff_ptr += 16; + dqcoeff_ptr += 16; + iscan += 16; + v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan, + &v_quant, &v_dequant, &v_round, &v_zbin, + &v_quant_shift, &v_eobmax); + } + + *eob_ptr = accumulate_eob256(v_eobmax); +} From cc8610e18917c5ef654684a2ae7c3a5dee26640e Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 19 Jul 2022 16:30:59 -0700 Subject: [PATCH 357/926] encode_api_test: quiet static analysis warning in ConfigChangeThreadCount(); initialize cfg as the static analyzer can assume AlwaysTrue() within EXPECT_NO_FATAL_FAILURE may return false causing InitCodec() not to be called. 
test/encode_api_test.cc|321 col 3| warning: 1st function call argument is an uninitialized value [core.CallAndMessage] video.SetSize(cfg.g_w, cfg.g_h); Bug: b/229626362 Change-Id: I54899ed0a207ca685416bed3a0e9c9644668e163 --- test/encode_api_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 6f61c77502..08159148bd 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -336,7 +336,7 @@ TEST(EncodeAPI, ConfigChangeThreadCount) { for (const auto *iface : kCodecIfaces) { SCOPED_TRACE(vpx_codec_iface_name(iface)); for (int i = 0; i < (IsVP9(iface) ? 2 : 1); ++i) { - vpx_codec_enc_cfg_t cfg; + vpx_codec_enc_cfg_t cfg = {}; struct Encoder { ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); } vpx_codec_ctx_t ctx = {}; From a36d42f8bd5942de9b2ddf7855f6fac3369d5a7a Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 19 Jul 2022 16:32:30 -0700 Subject: [PATCH 358/926] pp_filter_test: quiet static analysis warning in CheckLowFilterOutput(); use std::unique_ptr to avoid spurious memory leak warning: test/pp_filter_test.cc|466 col 3| warning: Potential leak of memory pointed to by 'expected_output' [cplusplus.NewDeleteLeaks] ASSERT_NE(expected_output, nullptr); Bug: b/229626362 Change-Id: Ie9e06c9b9442ffa134e514d2aee70841d19c8ecb --- test/pp_filter_test.cc | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc index 775f7f36a3..27d5ffa907 100644 --- a/test/pp_filter_test.cc +++ b/test/pp_filter_test.cc @@ -7,7 +7,11 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ + #include <limits.h> + +#include <memory> + #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "test/acm_random.h" @@ -458,14 +462,13 @@ TEST_P(VpxMbPostProcDownTest, CheckLowFilterOutput) { SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride()); - unsigned char *expected_output = new unsigned char[rows_ * cols_]; + std::unique_ptr<unsigned char[]> expected_output( + new unsigned char[rows_ * cols_]); ASSERT_NE(expected_output, nullptr); - SetRows(expected_output, rows_, cols_, cols_); + SetRows(expected_output.get(), rows_, cols_, cols_); RunFilterLevel(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride(), q2mbl(0), - expected_output); - - delete[] expected_output; + expected_output.get()); } TEST_P(VpxMbPostProcDownTest, CheckCvsAssembly) { From 59b27f758c137c2a2d4c11c9feb9a8875f58d096 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 19 Jul 2022 17:16:05 -0700 Subject: [PATCH 359/926] avg_intrin_avx2: rm dead store in highbd_hadamard_8x8 missed in: 53dd1e8e7 avg_intrin_{sse2,avg2}: rm dead store in hadamard_8x8 Change-Id: I378e4a388ceb193a4cfee4d9d317fc62fcc4b39e --- vpx_dsp/x86/avg_intrin_avx2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c index d93e6ccae5..b2e01319d3 100644 --- a/vpx_dsp/x86/avg_intrin_avx2.c +++ b/vpx_dsp/x86/avg_intrin_avx2.c @@ -104,7 +104,7 @@ void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride, src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); - src16[7] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[7] = _mm_loadu_si128((const __m128i *)(src_diff + src_stride)); src32[0] =
_mm256_cvtepi16_epi32(src16[0]); src32[1] = _mm256_cvtepi16_epi32(src16[1]); From 4e504233f8e603abdd4b39395c8717668009a865 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Wed, 20 Jul 2022 15:35:33 -0700 Subject: [PATCH 360/926] L2E: Add more unit tests for GOP API Add unit tests for a 4 frame video, which could be considered as a corner case. Three different GOP settings are tested and verified as valid. (1). The first GOP has 3 coding frames, no alt ref. The second GOP has 1 coding frame, no alt ref. The numer of coding frames is 4. Their frame types are: keyframe, inter_frame, inter_frame, golden_frame. (2). The first GOP has 4 coding frames, use alt ref. The second GOP has 1 coding frame, which is the overlay of the first GOP's alt ref frame. The numer of coding frames is 5. Their types are: keyframe, alt_ref, inter_frame, inter_frame, overlay_frame. (3). Only one GOP with 4 coding frames, do not use alt ref. The numer of coding frames is 4. Their types are: keyframe, inter_frame, inter_frame, inter_frame. Change-Id: I4079ff5065da79834b363b1e1976f65efed3f91f --- test/test.mk | 1 + test/vp9_ext_ratectrl_test.cc | 534 +++++++++++++++++++++++++++++++--- 2 files changed, 496 insertions(+), 39 deletions(-) diff --git a/test/test.mk b/test/test.mk index 6df4572904..f60d8f823f 100644 --- a/test/test.mk +++ b/test/test.mk @@ -59,6 +59,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.h LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_end_to_end_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += timestamp_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ext_ratectrl_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += ../vp9/simple_encode.h LIBVPX_TEST_SRCS-yes += decode_test_driver.cc LIBVPX_TEST_SRCS-yes += decode_test_driver.h diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index 68703b7e94..c954495dff 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -16,6 +16,7 @@ #include "test/util.h" #include "test/yuv_video_source.h" #include "third_party/googletest/src/include/gtest/gtest.h" +#include "vp9/simple_encode.h" #include "vpx/vpx_ext_ratectrl.h" #include "vpx_dsp/vpx_dsp_common.h" @@ -25,6 +26,7 @@ constexpr int kModelMagicNumber = 51396; constexpr uintptr_t PrivMagicNumber = 5566; constexpr int kFrameNum = 5; constexpr int kFrameNumGOP = 30; +constexpr int kFrameNumGOPShort = 4; constexpr int kLosslessCodingIndex = 2; constexpr int kFixedGOPSize = 9; // The range check in vp9_cx_iface.c shows that the max @@ -38,6 +40,7 @@ constexpr int kDefaultMaxGfInterval = 16; // The numbers below are from manual inspection. 
constexpr int kReadMinGfInterval = 5; constexpr int kReadMaxGfInterval = 13; +const char kTestFileName[] = "bus_352x288_420_f20_b8.yuv"; struct ToyRateCtrl { int magic_number; @@ -50,12 +53,12 @@ vpx_rc_status_t rc_create_model(void *priv, const vpx_rc_config_t *ratectrl_config, - vpx_rc_model_t *rate_ctrl_model_pt) { + vpx_rc_model_t *rate_ctrl_model_ptr) { ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl; if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR; toy_rate_ctrl->magic_number = kModelMagicNumber; toy_rate_ctrl->coding_index = -1; - *rate_ctrl_model_pt = toy_rate_ctrl; + *rate_ctrl_model_ptr = toy_rate_ctrl; EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber)); EXPECT_EQ(ratectrl_config->frame_width, 352); EXPECT_EQ(ratectrl_config->frame_height, 288); @@ -68,7 +71,7 @@ vpx_rc_status_t rc_create_model(void *priv, vpx_rc_status_t rc_create_model_gop(void *priv, const vpx_rc_config_t *ratectrl_config, - vpx_rc_model_t *rate_ctrl_model_pt) { + vpx_rc_model_t *rate_ctrl_model_ptr) { ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl; if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR; toy_rate_ctrl->magic_number = kModelMagicNumber; @@ -76,7 +79,7 @@ vpx_rc_status_t rc_create_model_gop(void *priv, toy_rate_ctrl->frames_since_key = 0; toy_rate_ctrl->show_index = 0; toy_rate_ctrl->coding_index = 0; - *rate_ctrl_model_pt = toy_rate_ctrl; + *rate_ctrl_model_ptr = toy_rate_ctrl; EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber)); EXPECT_EQ(ratectrl_config->frame_width, 640); EXPECT_EQ(ratectrl_config->frame_height, 360); @@ -87,6 +90,27 @@ vpx_rc_status_t rc_create_model_gop(void *priv, return VPX_RC_OK; } +vpx_rc_status_t rc_create_model_gop_short( + void *priv, const vpx_rc_config_t *ratectrl_config, + vpx_rc_model_t *rate_ctrl_model_ptr) { + ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl; + if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR; + toy_rate_ctrl->magic_number = kModelMagicNumber; + toy_rate_ctrl->gop_global_index = 0; + toy_rate_ctrl->frames_since_key = 0; + toy_rate_ctrl->show_index = 0; + toy_rate_ctrl->coding_index = 0; + *rate_ctrl_model_ptr = toy_rate_ctrl; + EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber)); + EXPECT_EQ(ratectrl_config->frame_width, 352); + EXPECT_EQ(ratectrl_config->frame_height, 288); + EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNumGOPShort); + EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 500); + EXPECT_EQ(ratectrl_config->frame_rate_num, 30); + EXPECT_EQ(ratectrl_config->frame_rate_den, 1); + return VPX_RC_OK; +} + vpx_rc_status_t rc_send_firstpass_stats( vpx_rc_model_t rate_ctrl_model, const vpx_rc_firstpass_stats_t *first_pass_stats) { @@ -113,6 +137,19 @@ vpx_rc_status_t rc_send_firstpass_stats_gop( return VPX_RC_OK; } +vpx_rc_status_t rc_send_firstpass_stats_gop_short( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_firstpass_stats_t *first_pass_stats) { + const ToyRateCtrl *toy_rate_ctrl = + static_cast<const ToyRateCtrl *>(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_EQ(first_pass_stats->num_frames, kFrameNumGOPShort); + for (int i = 0; i < first_pass_stats->num_frames; ++i) { + EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i); + } + return VPX_RC_OK; +} + vpx_rc_status_t rc_get_encodeframe_decision( vpx_rc_model_t rate_ctrl_model, const vpx_rc_encodeframe_info_t *encode_frame_info, vpx_rc_encodeframe_decision_t *frame_decision) { @@ -128,19 +165,17 @@ vpx_rc_status_t rc_get_encodeframe_decision( if (encode_frame_info->coding_index == 0) { EXPECT_EQ(encode_frame_info->show_index, 0);
EXPECT_EQ(encode_frame_info->gop_index, 0); - EXPECT_EQ(encode_frame_info->frame_type, 0 /*kFrameTypeKey*/); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], 0); // kRefFrameTypeLast EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], 0); // kRefFrameTypePast EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], 0); // kRefFrameTypeFuture - } - - if (encode_frame_info->coding_index == 1) { + } else if (encode_frame_info->coding_index == 1) { EXPECT_EQ(encode_frame_info->show_index, 4); EXPECT_EQ(encode_frame_info->gop_index, 1); - EXPECT_EQ(encode_frame_info->frame_type, 2 /*kFrameTypeAltRef*/); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef); EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], 1); // kRefFrameTypeLast EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], @@ -149,19 +184,15 @@ vpx_rc_status_t rc_get_encodeframe_decision( 0); // kRefFrameTypeFuture EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], 0); // kRefFrameTypeLast - } - - if (encode_frame_info->coding_index >= 2 && - encode_frame_info->coding_index < 5) { + } else if (encode_frame_info->coding_index >= 2 && + encode_frame_info->coding_index < 5) { // In the first group of pictures, coding_index and gop_index are equal. EXPECT_EQ(encode_frame_info->gop_index, encode_frame_info->coding_index); - EXPECT_EQ(encode_frame_info->frame_type, 1 /*kFrameTypeInter*/); - } - - if (encode_frame_info->coding_index == 5) { + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + } else if (encode_frame_info->coding_index == 5) { EXPECT_EQ(encode_frame_info->show_index, 4); EXPECT_EQ(encode_frame_info->gop_index, 0); - EXPECT_EQ(encode_frame_info->frame_type, 3 /*kFrameTypeOverlay*/); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay); EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], 1); // kRefFrameTypeLast EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], @@ -197,19 +228,17 @@ vpx_rc_status_t rc_get_encodeframe_decision_gop( if (encode_frame_info->coding_index == 0) { EXPECT_EQ(encode_frame_info->show_index, 0); EXPECT_EQ(encode_frame_info->gop_index, 0); - EXPECT_EQ(encode_frame_info->frame_type, 0 /*kFrameTypeKey*/); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], 0); // kRefFrameTypeLast EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], 0); // kRefFrameTypePast EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], 0); // kRefFrameTypeFuture - } - - if (encode_frame_info->coding_index == 1) { + } else if (encode_frame_info->coding_index == 1) { EXPECT_EQ(encode_frame_info->show_index, 1); EXPECT_EQ(encode_frame_info->gop_index, 1); - EXPECT_EQ(encode_frame_info->frame_type, 1 /*kFrameTypeInter*/); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], 1); // kRefFrameTypeLast EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], @@ -218,36 +247,198 @@ vpx_rc_status_t rc_get_encodeframe_decision_gop( 0); // kRefFrameTypeFuture EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], 0); // kRefFrameTypeLast - } - - if (encode_frame_info->coding_index == 2) { + } else if (encode_frame_info->coding_index == 2) { EXPECT_EQ(encode_frame_info->show_index, 2); EXPECT_EQ(encode_frame_info->gop_index, 0); - EXPECT_EQ(encode_frame_info->frame_type, 0 /*kFrameTypeKey*/); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); 
EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], 0); // kRefFrameTypeLast EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], 0); // kRefFrameTypePast EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], 0); // kRefFrameTypeFuture + } else if (encode_frame_info->coding_index == 3 || + encode_frame_info->coding_index == 12 || + encode_frame_info->coding_index == 21) { + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef); + EXPECT_EQ(encode_frame_info->gop_index, 1); + } else if (encode_frame_info->coding_index == 11 || + encode_frame_info->coding_index == 20 || + encode_frame_info->coding_index == 29) { + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay); + EXPECT_EQ(encode_frame_info->gop_index, 0); + } else if (encode_frame_info->coding_index >= 30) { + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); } - if (encode_frame_info->coding_index == 3 || - encode_frame_info->coding_index == 12 || - encode_frame_info->coding_index == 21) { - EXPECT_EQ(encode_frame_info->frame_type, 2 /*kFrameTypeAltRef*/); + // When the model recommends an invalid q, valid range [0, 255], + // the encoder will ignore it and use the default q selected + // by libvpx rate control strategy. + frame_decision->q_index = VPX_DEFAULT_Q; + frame_decision->max_frame_size = 0; + + toy_rate_ctrl->coding_index += 1; + return VPX_RC_OK; +} + +vpx_rc_status_t rc_get_encodeframe_decision_gop_short( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_info_t *encode_frame_info, + vpx_rc_encodeframe_decision_t *frame_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort); + EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); + + if (encode_frame_info->coding_index == 0) { + EXPECT_EQ(encode_frame_info->show_index, 0); + EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 1) { + EXPECT_EQ(encode_frame_info->show_index, 1); EXPECT_EQ(encode_frame_info->gop_index, 1); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 1); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 2) { + EXPECT_EQ(encode_frame_info->show_index, 2); + EXPECT_EQ(encode_frame_info->gop_index, 2); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 3) { + EXPECT_EQ(encode_frame_info->show_index, 3); + EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeGolden); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 2); } - if (encode_frame_info->coding_index == 11 || - encode_frame_info->coding_index == 20 || -
encode_frame_info->coding_index == 29) { - EXPECT_EQ(encode_frame_info->frame_type, 3 /*kFrameTypeOverlay*/); + // When the model recommends an invalid q, valid range [0, 255], + // the encoder will ignore it and use the default q selected + // by libvpx rate control strategy. + frame_decision->q_index = VPX_DEFAULT_Q; + frame_decision->max_frame_size = 0; + + toy_rate_ctrl->coding_index += 1; + return VPX_RC_OK; +} + +vpx_rc_status_t rc_get_encodeframe_decision_gop_short_overlay( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_info_t *encode_frame_info, + vpx_rc_encodeframe_decision_t *frame_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort); + EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); + + if (encode_frame_info->coding_index == 0) { + EXPECT_EQ(encode_frame_info->show_index, 0); + EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 1) { + EXPECT_EQ(encode_frame_info->show_index, 3); + EXPECT_EQ(encode_frame_info->gop_index, 1); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 1); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 2) { + EXPECT_EQ(encode_frame_info->show_index, 1); + EXPECT_EQ(encode_frame_info->gop_index, 2); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 3) { + EXPECT_EQ(encode_frame_info->show_index, 2); + EXPECT_EQ(encode_frame_info->gop_index, 3); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 4) { + EXPECT_EQ(encode_frame_info->show_index, 3); EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 2); } - if (encode_frame_info->coding_index >= 30) { - EXPECT_EQ(encode_frame_info->frame_type, 1 /*kFrameTypeInter*/); + // When the model recommends an invalid q, valid range [0, 255], + // the encoder will ignore it and use the default q selected + // by libvpx rate control strategy.
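+ // VPX_DEFAULT_Q is such an out-of-range sentinel, so the assignment below exercises exactly that fallback path.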
+ frame_decision->q_index = VPX_DEFAULT_Q; + frame_decision->max_frame_size = 0; + + toy_rate_ctrl->coding_index += 1; + return VPX_RC_OK; +} + +vpx_rc_status_t rc_get_encodeframe_decision_gop_short_no_arf( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_info_t *encode_frame_info, + vpx_rc_encodeframe_decision_t *frame_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort); + EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); + + if (encode_frame_info->coding_index == 0) { + EXPECT_EQ(encode_frame_info->show_index, 0); + EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 1) { + EXPECT_EQ(encode_frame_info->show_index, 1); + EXPECT_EQ(encode_frame_info->gop_index, 1); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 1); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 2) { + EXPECT_EQ(encode_frame_info->show_index, 2); + EXPECT_EQ(encode_frame_info->gop_index, 2); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 3) { + EXPECT_EQ(encode_frame_info->show_index, 3); + EXPECT_EQ(encode_frame_info->gop_index, 3); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); } // When the model recommends an invalid q, valid range [0, 255], @@ -296,6 +487,117 @@ vpx_rc_status_t rc_get_gop_decision(vpx_rc_model_t rate_ctrl_model, return VPX_RC_OK; } +// Test on a 4 frame video. +// Test a setting of 2 GOPs. +// The first GOP has 3 coding frames, no alt ref. +// The second GOP has 1 coding frame, no alt ref.
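+// The resulting frame types are: keyframe, inter_frame, inter_frame, golden_frame.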
+vpx_rc_status_t rc_get_gop_decision_short(vpx_rc_model_t rate_ctrl_model, + const vpx_rc_gop_info_t *gop_info, + vpx_rc_gop_decision_t *gop_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1); + EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval); + EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval); + EXPECT_EQ(gop_info->allow_alt_ref, 1); + if (gop_info->is_key_frame) { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); + EXPECT_EQ(gop_info->frames_since_key, 0); + EXPECT_EQ(gop_info->gop_global_index, 0); + toy_rate_ctrl->gop_global_index = 0; + toy_rate_ctrl->frames_since_key = 0; + } else { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); + } + EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index); + EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key); + EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index); + EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index); + + gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 3 : 1; + gop_decision->use_alt_ref = 0; + toy_rate_ctrl->frames_since_key += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + toy_rate_ctrl->show_index += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + ++toy_rate_ctrl->gop_global_index; + return VPX_RC_OK; +} + +// Test on a 4 frame video. +// Test a setting of 2 GOPs. +// The first GOP has 4 coding frames. Use alt ref. +// The second GOP only contains the overlay frame of the first GOP's alt ref +// frame. +vpx_rc_status_t rc_get_gop_decision_short_overlay( + vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info, + vpx_rc_gop_decision_t *gop_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1); + EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval); + EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval); + EXPECT_EQ(gop_info->allow_alt_ref, 1); + if (gop_info->is_key_frame) { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); + EXPECT_EQ(gop_info->frames_since_key, 0); + EXPECT_EQ(gop_info->gop_global_index, 0); + toy_rate_ctrl->gop_global_index = 0; + toy_rate_ctrl->frames_since_key = 0; + } else { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1); + } + EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index); + EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key); + EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index); + EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index); + + gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 4 : 1; + gop_decision->use_alt_ref = gop_info->is_key_frame ? 1 : 0; + toy_rate_ctrl->frames_since_key += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + toy_rate_ctrl->show_index += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + ++toy_rate_ctrl->gop_global_index; + return VPX_RC_OK; +} + +// Test on a 4 frame video. +// Test a setting of 1 GOP. +// The GOP has 4 coding frames. Do not use alt ref.
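+// The resulting frame types are: keyframe, inter_frame, inter_frame, inter_frame.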
+vpx_rc_status_t rc_get_gop_decision_short_no_arf( + vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info, + vpx_rc_gop_decision_t *gop_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1); + EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval); + EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval); + EXPECT_EQ(gop_info->allow_alt_ref, 1); + if (gop_info->is_key_frame) { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); + EXPECT_EQ(gop_info->frames_since_key, 0); + EXPECT_EQ(gop_info->gop_global_index, 0); + toy_rate_ctrl->gop_global_index = 0; + toy_rate_ctrl->frames_since_key = 0; + } else { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); + } + EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index); + EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key); + EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index); + EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index); + + gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 4 : 1; + gop_decision->use_alt_ref = 0; + toy_rate_ctrl->frames_since_key += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + toy_rate_ctrl->show_index += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + ++toy_rate_ctrl->gop_global_index; + return VPX_RC_OK; +} + vpx_rc_status_t rc_update_encodeframe_result( vpx_rc_model_t rate_ctrl_model, const vpx_rc_encodeframe_result_t *encode_frame_result) { @@ -328,6 +630,18 @@ vpx_rc_status_t rc_update_encodeframe_result_gop( return VPX_RC_OK; } +vpx_rc_status_t rc_update_encodeframe_result_gop_short( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_result_t *encode_frame_result) { + const ToyRateCtrl *toy_rate_ctrl = + static_cast<const ToyRateCtrl *>(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + + const int64_t ref_pixel_count = 352 * 288 * 3 / 2; + EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count); + return VPX_RC_OK; +} + vpx_rc_status_t rc_delete_model(vpx_rc_model_t rate_ctrl_model) { ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); @@ -371,7 +685,7 @@ TEST_F(ExtRateCtrlTest, EncodeTest) { "bus_352x288_420_f20_b8.yuv", VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNum)); - ASSERT_NE(video.get(), nullptr); + ASSERT_NE(video, nullptr); ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); } @@ -417,7 +731,149 @@ TEST_F(ExtRateCtrlTestGOP, EncodeTest) { "noisy_clip_640_360.y4m", VPX_IMG_FMT_I420, 640, 360, 30, 1, 0, kFrameNumGOP)); - ASSERT_NE(video.get(), nullptr); + ASSERT_NE(video, nullptr); + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); +} + +class ExtRateCtrlTestGOPShort : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam<int> { + protected: + ExtRateCtrlTestGOPShort() : EncoderTest(&::libvpx_test::kVP9) {} + + ~ExtRateCtrlTestGOPShort() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kTwoPassGood); + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); + encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); + + vpx_rc_funcs_t rc_funcs; + rc_funcs.rc_type = VPX_RC_GOP_QP; + rc_funcs.create_model = rc_create_model_gop_short; +
rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; + rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop_short; + rc_funcs.get_gop_decision = rc_get_gop_decision_short; + rc_funcs.update_encodeframe_result = + rc_update_encodeframe_result_gop_short; + rc_funcs.delete_model = rc_delete_model; + rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber); + encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); + } + } +}; + +TEST_F(ExtRateCtrlTestGOPShort, EncodeTest) { + cfg_.rc_target_bitrate = 500; + cfg_.g_lag_in_frames = kMaxLagInFrames - 1; + cfg_.rc_end_usage = VPX_VBR; + + std::unique_ptr<libvpx_test::VideoSource> video; + video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( + kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort)); + + ASSERT_NE(video, nullptr); + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); +} + +class ExtRateCtrlTestGOPShortOverlay + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam<int> { + protected: + ExtRateCtrlTestGOPShortOverlay() : EncoderTest(&::libvpx_test::kVP9) {} + + ~ExtRateCtrlTestGOPShortOverlay() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kTwoPassGood); + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); + encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); + + vpx_rc_funcs_t rc_funcs; + rc_funcs.rc_type = VPX_RC_GOP_QP; + rc_funcs.create_model = rc_create_model_gop_short; + rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; + rc_funcs.get_encodeframe_decision = + rc_get_encodeframe_decision_gop_short_overlay; + rc_funcs.get_gop_decision = rc_get_gop_decision_short_overlay; + rc_funcs.update_encodeframe_result = + rc_update_encodeframe_result_gop_short; + rc_funcs.delete_model = rc_delete_model; + rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber); + encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); + } + } +}; + +TEST_F(ExtRateCtrlTestGOPShortOverlay, EncodeTest) { + cfg_.rc_target_bitrate = 500; + cfg_.g_lag_in_frames = kMaxLagInFrames - 1; + cfg_.rc_end_usage = VPX_VBR; + + std::unique_ptr<libvpx_test::VideoSource> video; + video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( + kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort)); + + ASSERT_NE(video, nullptr); + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); +} + +class ExtRateCtrlTestGOPShortNoARF + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam<int> { + protected: + ExtRateCtrlTestGOPShortNoARF() : EncoderTest(&::libvpx_test::kVP9) {} + + ~ExtRateCtrlTestGOPShortNoARF() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kTwoPassGood); + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); + encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); + + vpx_rc_funcs_t rc_funcs; + rc_funcs.rc_type = VPX_RC_GOP_QP; + rc_funcs.create_model = rc_create_model_gop_short; + rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; + rc_funcs.get_encodeframe_decision = + rc_get_encodeframe_decision_gop_short_no_arf; + rc_funcs.get_gop_decision = rc_get_gop_decision_short_no_arf; + rc_funcs.update_encodeframe_result = + rc_update_encodeframe_result_gop_short; +
rc_funcs.delete_model = rc_delete_model; + rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber); + encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); + } + } +}; + +TEST_F(ExtRateCtrlTestGOPShortNoARF, EncodeTest) { + cfg_.rc_target_bitrate = 500; + cfg_.g_lag_in_frames = kMaxLagInFrames - 1; + cfg_.rc_end_usage = VPX_VBR; + + std::unique_ptr<libvpx_test::VideoSource> video; + video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( + kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort)); + + ASSERT_NE(video, nullptr); ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); } From 90c5493ff5d805676233252be633d2eedd5ceb50 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Wed, 20 Jul 2022 09:51:56 -0700 Subject: [PATCH 361/926] VPX: Add vpx_highbd_quantize_b_avx2(). Up to 3.61x faster than vpx_highbd_quantize_b_sse2() for full calculations. ~2.3% overall encoder improvement for the test clip used. Bug: b/237714063 Change-Id: I23f88d2a7f96aaa4103778372f4f552207f73cee --- test/vp9_quantize_test.cc | 12 ++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 169 ++++++++++++++++++++++ 4 files changed, 183 insertions(+), 1 deletion(-) create mode 100644 vpx_dsp/x86/highbd_quantize_intrin_avx2.c diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 3e0dd77396..ac33d17707 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -574,6 +574,17 @@ INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest, #endif // HAVE_AVX #if VPX_ARCH_X86_64 && HAVE_AVX2 +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + AVX2, VP9QuantizeTest, + ::testing::Values( + make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false))); +#else INSTANTIATE_TEST_SUITE_P( AVX2, VP9QuantizeTest, ::testing::Values(make_tuple(&QuantFPWrapper<vp9_quantize_fp_avx2>, @@ -584,6 +595,7 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&vpx_quantize_b_32x32_avx2, &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, false))); +#endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_AVX2 #if HAVE_NEON diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 8e0100c310..dd667195f5 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -332,6 +332,7 @@ DSP_SRCS-$(HAVE_VSX) += ppc/quantize_vsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/quantize_lsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_quantize_intrin_avx2.c endif # avg diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index beb594f9f3..45fcb559f1 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -718,7 +718,7 @@ () if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_highbd_quantize_b sse2/; + specialize qw/vpx_highbd_quantize_b sse2 avx2/; add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t
*quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b_32x32 sse2/; diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c new file mode 100644 index 0000000000..f4c288cc09 --- /dev/null +++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <immintrin.h> + +#include "./vpx_dsp_rtcd.h" + +static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) { + const __m128i sign = _mm_srai_epi16(*p, 15); + const __m128i dc = _mm_unpacklo_epi16(*p, sign); + const __m128i ac = _mm_unpackhi_epi16(*p, sign); + *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1); +} + +static VPX_FORCE_INLINE void update_qp(__m256i *qp) { + int i; + for (i = 0; i < 5; ++i) { + qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11); + } +} + +static VPX_FORCE_INLINE void init_qp(const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *dequant_ptr, + const int16_t *quant_shift_ptr, + __m256i *qp, int log_scale) { + const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr); + const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr); + const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr); + const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr); + const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr); + init_one_qp(&zbin, &qp[0]); + init_one_qp(&round, &qp[1]); + init_one_qp(&quant, &qp[2]); + init_one_qp(&dequant, &qp[3]); + init_one_qp(&quant_shift, &qp[4]); + if (log_scale > 0) { + const __m256i rnd = _mm256_set1_epi32((int16_t)(1 << (log_scale - 1))); + qp[0] = _mm256_add_epi32(qp[0], rnd); + qp[0] = _mm256_srai_epi32(qp[0], log_scale); + + qp[1] = _mm256_add_epi32(qp[1], rnd); + qp[1] = _mm256_srai_epi32(qp[1], log_scale); + } + // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when + // calculating the zbin mask. + qp[0] = _mm256_sub_epi32(qp[0], _mm256_set1_epi32(1)); +} + +// Note: +// Each 32-bit lane of *x is multiplied by the corresponding lane of *y and +// the product is right shifted by 16. The eight 32-bit results are returned.
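+// Per-lane scalar equivalent (for illustration): +// out[i] = (int32_t)(((int64_t)x[i] * y[i]) >> 16);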
+static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32(const __m256i *x, + const __m256i *y) { + __m256i prod_lo = _mm256_mul_epi32(*x, *y); + __m256i prod_hi = _mm256_srli_epi64(*x, 32); + const __m256i mult_hi = _mm256_srli_epi64(*y, 32); + const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); + prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); + prod_lo = _mm256_srli_epi64(prod_lo, 16); + prod_lo = _mm256_and_si256(prod_lo, mask); + prod_hi = _mm256_srli_epi64(prod_hi, 16); + prod_hi = _mm256_slli_epi64(prod_hi, 32); + return _mm256_or_si256(prod_lo, prod_hi); +} + +static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan_ptr, + __m256i eobmax, + __m256i nz_mask) { + const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask); + const __m256i packed_nz_mask_perm = + _mm256_permute4x64_epi64(packed_nz_mask, 0xD8); + const __m256i iscan = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr)); + const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, packed_nz_mask_perm); + const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, packed_nz_mask_perm); + return _mm256_max_epi16(eobmax, nz_iscan); +} + +// Get the max eob from the lower 128 bits. +static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob) { + __m256i eob_s; + eob_s = _mm256_shuffle_epi32(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 1); + eob = _mm256_max_epi16(eob, eob_s); + return (uint16_t)_mm256_extract_epi16(eob, 0); +} + +static VPX_FORCE_INLINE void quantize(const __m256i *qp, + const tran_low_t *coeff_ptr, + const int16_t *iscan_ptr, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + __m256i *eob) { + const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi32(coeff); + const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]); + + if (_mm256_movemask_epi8(zbin_mask) == 0) { + const __m256i zero = _mm256_setzero_si256(); + _mm256_storeu_si256((__m256i *)qcoeff, zero); + _mm256_storeu_si256((__m256i *)dqcoeff, zero); + return; + } + { + const __m256i tmp_rnd = + _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask); + const __m256i tmp = mm256_mul_shift_epi32(&tmp_rnd, &qp[2]); + const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd); + const __m256i abs_q = mm256_mul_shift_epi32(&tmp2, &qp[4]); + const __m256i abs_dq = _mm256_mullo_epi32(abs_q, qp[3]); + const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); + const __m256i q = _mm256_sign_epi32(abs_q, coeff); + const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); + + _mm256_storeu_si256((__m256i *)qcoeff, q); + _mm256_storeu_si256((__m256i *)dqcoeff, dq); + + *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); + } +} + +void vpx_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const int step = 8; + __m256i eob = _mm256_setzero_si256(); + __m256i qp[5]; + (void)scan; + + init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 0); + + quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + + update_qp(qp); + + while 
(n_coeffs > 0) { quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + } + + *eob_ptr = get_max_eob(eob); +} From 90ef3906a2af36a0f973709302785de92a7e12a1 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Mon, 25 Jul 2022 05:52:27 -0700 Subject: [PATCH 362/926] VPX: Add vpx_highbd_quantize_b_32x32_avx2(). Up to 11.78x faster than vpx_highbd_quantize_b_32x32_sse2() for full calculations. ~1.7% overall encoder improvement for the test clip used. Bug: b/237714063 Change-Id: Ib759056db94d3487239cb2748ffef1184a89ae18 --- test/vp9_quantize_test.cc | 8 ++- vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 86 +++++++++++++++++++++++ 3 files changed, 94 insertions(+), 2 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index ac33d17707..705f3f0077 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -583,7 +583,13 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, VPX_BITS_10, 16, false), make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, - VPX_BITS_12, 16, false))); + VPX_BITS_12, 16, false), + make_tuple(&vpx_highbd_quantize_b_32x32_avx2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_avx2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_avx2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); #else INSTANTIATE_TEST_SUITE_P( AVX2, VP9QuantizeTest, diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 45fcb559f1..e7ad640af8 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -721,7 +721,7 @@ () specialize qw/vpx_highbd_quantize_b sse2 avx2/; add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_highbd_quantize_b_32x32 sse2/; + specialize qw/vpx_highbd_quantize_b_32x32 sse2 avx2/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c index f4c288cc09..ec1110ff8c 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -167,3 +167,89 @@ void vpx_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = get_max_eob(eob); } + +static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x, const __m256i *y, int log_scale) { + __m256i prod_lo = _mm256_mul_epi32(*x, *y); + __m256i prod_hi = _mm256_srli_epi64(*x, 32); + const __m256i mult_hi = _mm256_srli_epi64(*y, 32); + const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); + prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); + prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale); + prod_lo = _mm256_and_si256(prod_lo, mask); + prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale); + prod_hi = _mm256_slli_epi64(prod_hi, 32); + return _mm256_or_si256(prod_lo, prod_hi); +} + +static VPX_FORCE_INLINE void quantize_b_32x32( + const __m256i *qp, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff, tran_low_t *dqcoeff,
__m256i *eob) { + const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi32(coeff); + const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]); + + if (_mm256_movemask_epi8(zbin_mask) == 0) { + const __m256i zero = _mm256_setzero_si256(); + _mm256_storeu_si256((__m256i *)qcoeff, zero); + _mm256_storeu_si256((__m256i *)dqcoeff, zero); + return; + } + + { + const __m256i tmp_rnd = + _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask); + // const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; + const __m256i tmp = mm256_mul_shift_epi32_logscale(&tmp_rnd, &qp[2], 0); + const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd); + // const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp2, &qp[4], 1); + const __m256i abs_dq = + _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, qp[3]), 1); + const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); + const __m256i q = _mm256_sign_epi32(abs_q, coeff); + const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); + + _mm256_storeu_si256((__m256i *)qcoeff, q); + _mm256_storeu_si256((__m256i *)dqcoeff, dq); + + *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); + } +} + +void vpx_highbd_quantize_b_32x32_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const unsigned int step = 8; + __m256i eob = _mm256_setzero_si256(); + __m256i qp[5]; + (void)scan; + + init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 1); + + quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + + update_qp(qp); + + while (n_coeffs > 0) { + quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + } + + *eob_ptr = get_max_eob(eob); +} \ No newline at end of file From ea13f315c9e98a908761fb29042728857d070216 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 26 Jul 2022 18:17:22 -0700 Subject: [PATCH 363/926] highbd_temporal_filter_sse4: remove unused function params this clears warnings under clang-13 of the form: vp9/encoder/x86/highbd_temporal_filter_sse4.c|196 col 63| warning: parameter 'v_pre' set but not used [-Wunused-but-set-parameter] this is the high-bitdepth version of: 73b8aade8 temporal_filter_sse4: remove unused function params Change-Id: I9b2c9bf27c16975e4855df6a2c967da4c8c63a3a --- vp9/encoder/x86/highbd_temporal_filter_sse4.c | 135 ++++++------------ 1 file changed, 42 insertions(+), 93 deletions(-) diff --git a/vp9/encoder/x86/highbd_temporal_filter_sse4.c b/vp9/encoder/x86/highbd_temporal_filter_sse4.c index 4fa24512c5..a7f5117cff 100644 --- a/vp9/encoder/x86/highbd_temporal_filter_sse4.c +++ b/vp9/encoder/x86/highbd_temporal_filter_sse4.c @@ -191,13 +191,11 @@ static INLINE void highbd_read_chroma_dist_row_8( } static void vp9_highbd_apply_temporal_filter_luma_8( - const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, - int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, - int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, - int uv_pre_stride, 
unsigned int block_width, unsigned int block_height, - int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum, - uint16_t *y_count, const uint32_t *y_dist, const uint32_t *u_dist, - const uint32_t *v_dist, const uint32_t *const *neighbors_first, + const uint16_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist, + const uint32_t *const *neighbors_first, const uint32_t *const *neighbors_second, int top_weight, int bottom_weight) { const int rounding = (1 << strength) >> 1; @@ -256,17 +254,12 @@ static void vp9_highbd_apply_temporal_filter_luma_8( highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, y_accum); - y_src += y_src_stride; y_pre += y_pre_stride; y_count += y_pre_stride; y_accum += y_pre_stride; y_dist += DIST_STRIDE; - u_src += uv_src_stride; - u_pre += uv_pre_stride; u_dist += DIST_STRIDE; - v_src += uv_src_stride; - v_pre += uv_pre_stride; v_dist += DIST_STRIDE; // Then all the rows except the last one @@ -300,11 +293,7 @@ static void vp9_highbd_apply_temporal_filter_luma_8( highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, &v_second); - u_src += uv_src_stride; - u_pre += uv_pre_stride; u_dist += DIST_STRIDE; - v_src += uv_src_stride; - v_pre += uv_pre_stride; v_dist += DIST_STRIDE; } @@ -320,7 +309,6 @@ static void vp9_highbd_apply_temporal_filter_luma_8( highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, y_accum); - y_src += y_src_stride; y_pre += y_pre_stride; y_count += y_pre_stride; y_accum += y_pre_stride; @@ -364,13 +352,10 @@ static void vp9_highbd_apply_temporal_filter_luma_8( // Perform temporal filter for the luma component. 
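// (With this change the luma path consumes only the prediction, y_pre, and the precomputed distortion tables y_dist/u_dist/v_dist; the source planes are no longer threaded through.)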
static void vp9_highbd_apply_temporal_filter_luma( - const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, - int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, - int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, - int uv_pre_stride, unsigned int block_width, unsigned int block_height, - int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, - uint32_t *y_accum, uint16_t *y_count, const uint32_t *y_dist, - const uint32_t *u_dist, const uint32_t *v_dist) { + const uint16_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) { unsigned int blk_col = 0, uv_blk_col = 0; const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x; const unsigned int mid_width = block_width >> 1, @@ -384,9 +369,7 @@ static void vp9_highbd_apply_temporal_filter_luma( neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS; neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; vp9_highbd_apply_temporal_filter_luma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, - v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y, + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, bottom_weight); @@ -399,13 +382,10 @@ static void vp9_highbd_apply_temporal_filter_luma( for (; blk_col < mid_width; blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { vp9_highbd_apply_temporal_filter_luma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, - u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step, - block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, - y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, - v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, - bottom_weight); + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); } if (!use_whole_blk) { @@ -417,21 +397,16 @@ static void vp9_highbd_apply_temporal_filter_luma( for (; blk_col < last_width; blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { vp9_highbd_apply_temporal_filter_luma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, - u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step, - block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, - y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, - v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, - bottom_weight); + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); } // Right neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS; vp9_highbd_apply_temporal_filter_luma_8( - y_src + 
blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, - v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y, + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, bottom_weight); @@ -491,13 +466,11 @@ static INLINE void highbd_add_luma_dist_to_8_chroma_mod( // blk_fw as an array of size 4 for the weights for each of the 4 subblocks, // else use top_weight for top half, and bottom weight for bottom half. static void vp9_highbd_apply_temporal_filter_chroma_8( - const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, - int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, - int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, - int uv_pre_stride, unsigned int uv_block_width, - unsigned int uv_block_height, int ss_x, int ss_y, int strength, - uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, - const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist, + const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, + unsigned int uv_block_width, unsigned int uv_block_height, int ss_x, + int ss_y, int strength, uint32_t *u_accum, uint16_t *u_count, + uint32_t *v_accum, uint16_t *v_count, const uint32_t *y_dist, + const uint32_t *u_dist, const uint32_t *v_dist, const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd, int top_weight, int bottom_weight, const int *blk_fw) { const int rounding = (1 << strength) >> 1; @@ -565,10 +538,8 @@ static void vp9_highbd_apply_temporal_filter_chroma_8( highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, v_accum); - u_src += uv_src_stride; u_pre += uv_pre_stride; u_dist += DIST_STRIDE; - v_src += uv_src_stride; v_pre += uv_pre_stride; v_dist += DIST_STRIDE; u_count += uv_pre_stride; @@ -576,8 +547,6 @@ static void vp9_highbd_apply_temporal_filter_chroma_8( v_count += uv_pre_stride; v_accum += uv_pre_stride; - y_src += y_src_stride * (1 + ss_y); - y_pre += y_pre_stride * (1 + ss_y); y_dist += DIST_STRIDE * (1 + ss_y); // Then all the rows except the last one @@ -649,10 +618,8 @@ static void vp9_highbd_apply_temporal_filter_chroma_8( highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, v_accum); - u_src += uv_src_stride; u_pre += uv_pre_stride; u_dist += DIST_STRIDE; - v_src += uv_src_stride; v_pre += uv_pre_stride; v_dist += DIST_STRIDE; u_count += uv_pre_stride; @@ -660,8 +627,6 @@ static void vp9_highbd_apply_temporal_filter_chroma_8( v_count += uv_pre_stride; v_accum += uv_pre_stride; - y_src += y_src_stride * (1 + ss_y); - y_pre += y_pre_stride * (1 + ss_y); y_dist += DIST_STRIDE * (1 + ss_y); } @@ -720,12 +685,10 @@ static void vp9_highbd_apply_temporal_filter_chroma_8( // Perform temporal filter for the chroma components. 
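// (Likewise, the chroma path now takes only u_pre/v_pre plus the shared distortion tables; the luma pointers and the chroma source planes are dropped from its signature.)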
static void vp9_highbd_apply_temporal_filter_chroma( - const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, - int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, - int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, - int uv_pre_stride, unsigned int block_width, unsigned int block_height, - int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, - uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, + unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, + int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) { const unsigned int uv_width = block_width >> ss_x, uv_height = block_height >> ss_y; @@ -755,8 +718,6 @@ static void vp9_highbd_apply_temporal_filter_chroma( if (use_whole_blk) { vp9_highbd_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, @@ -764,8 +725,6 @@ static void vp9_highbd_apply_temporal_filter_chroma( neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); } else { vp9_highbd_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, @@ -789,13 +748,11 @@ static void vp9_highbd_apply_temporal_filter_chroma( } vp9_highbd_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, - v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, - strength, u_accum + uv_blk_col, u_count + uv_blk_col, - v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, - u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd, - top_weight, bottom_weight, NULL); + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, + neighbors_snd, top_weight, bottom_weight, NULL); blk_col += blk_col_step; uv_blk_col += uv_blk_col_step; @@ -812,8 +769,6 @@ static void vp9_highbd_apply_temporal_filter_chroma( for (; uv_blk_col < uv_mid_width; blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { vp9_highbd_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, @@ -830,8 +785,6 @@ static void vp9_highbd_apply_temporal_filter_chroma( for (; uv_blk_col < uv_last_width; blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { vp9_highbd_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + 
blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, @@ -849,13 +802,11 @@ static void vp9_highbd_apply_temporal_filter_chroma( } vp9_highbd_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, - v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, - strength, u_accum + uv_blk_col, u_count + uv_blk_col, - v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, - u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd, - top_weight, bottom_weight, NULL); + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, + neighbors_snd, top_weight, bottom_weight, NULL); } void vp9_highbd_apply_temporal_filter_sse4_1( @@ -929,14 +880,12 @@ void vp9_highbd_apply_temporal_filter_sse4_1( u_dist_ptr = u_dist + 1; v_dist_ptr = v_dist + 1; - vp9_highbd_apply_temporal_filter_luma( - y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride, - u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, - strength, blk_fw, use_whole_blk, y_accum, y_count, y_dist_ptr, u_dist_ptr, - v_dist_ptr); + vp9_highbd_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, + block_height, ss_x, ss_y, strength, + blk_fw, use_whole_blk, y_accum, y_count, + y_dist_ptr, u_dist_ptr, v_dist_ptr); vp9_highbd_apply_temporal_filter_chroma( - y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride, u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, strength, blk_fw, use_whole_blk, u_accum, u_count, v_accum, v_count, y_dist_ptr, u_dist_ptr, v_dist_ptr); From ce484db2110fc0a81f9d4e6c1a09d132e1e2b2b7 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Mon, 25 Jul 2022 10:58:44 -0700 Subject: [PATCH 364/926] VPX: vp9_quantize_fp_avx2() cleanup. No change in performance. Bug: b/237714063 Change-Id: I8ea42759cc4dc57be6a29c23784997cb90ad4090 --- test/vp9_quantize_test.cc | 2 + vp9/encoder/x86/vp9_quantize_avx2.c | 164 ++++++++++++++-------------- 2 files changed, 86 insertions(+), 80 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 705f3f0077..97998eb08b 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -578,6 +578,8 @@ INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest, INSTANTIATE_TEST_SUITE_P( AVX2, VP9QuantizeTest, ::testing::Values( + make_tuple(&QuantFPWrapper<vp9_quantize_fp_avx2>, + &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true), make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, VPX_BITS_8, 16, false), diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c index db18b1a7a4..5d02f4fe85 100644 --- a/vp9/encoder/x86/vp9_quantize_avx2.c +++ b/vp9/encoder/x86/vp9_quantize_avx2.c @@ -18,7 +18,7 @@ #include "vpx_dsp/x86/quantize_sse2.h" // Zero fill 8 positions in the output buffer.
-static INLINE void store_zero_tran_low(tran_low_t *a) { +static VPX_FORCE_INLINE void store_zero_tran_low(tran_low_t *a) { const __m256i zero = _mm256_setzero_si256(); #if CONFIG_VP9_HIGHBITDEPTH _mm256_storeu_si256((__m256i *)(a), zero); @@ -28,22 +28,72 @@ static INLINE void store_zero_tran_low(tran_low_t *a) { #endif } -static INLINE __m256i scan_eob_256(const __m256i *iscan_ptr, - __m256i *coeff256) { - const __m256i iscan = _mm256_loadu_si256(iscan_ptr); - const __m256i zero256 = _mm256_setzero_si256(); +static VPX_FORCE_INLINE void load_fp_values_avx2( + const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr, + __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant) { + *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)); + *round = _mm256_permute4x64_epi64(*round, 0x54); + *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)); + *quant = _mm256_permute4x64_epi64(*quant, 0x54); + *dequant = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); + *dequant = _mm256_permute4x64_epi64(*dequant, 0x54); +} + +static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan, + __m256i v_eobmax, + __m256i v_mask) { + const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); #if CONFIG_VP9_HIGHBITDEPTH - // The _mm256_packs_epi32() in load_tran_low() packs the 64 bit coeff as - // B1 A1 B0 A0. Shuffle to B1 B0 A1 A0 in order to scan eob correctly. - const __m256i _coeff256 = _mm256_permute4x64_epi64(*coeff256, 0xd8); - const __m256i zero_coeff0 = _mm256_cmpeq_epi16(_coeff256, zero256); + // typedef int32_t tran_low_t; + const __m256i v_iscan_perm = _mm256_permute4x64_epi64(v_iscan, 0xD8); + const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan_perm, v_mask); #else - const __m256i zero_coeff0 = _mm256_cmpeq_epi16(*coeff256, zero256); + // typedef int16_t tran_low_t; + const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan, v_mask); #endif - const __m256i nzero_coeff0 = _mm256_cmpeq_epi16(zero_coeff0, zero256); - // Add one to convert from indices to counts - const __m256i iscan_plus_one = _mm256_sub_epi16(iscan, nzero_coeff0); - return _mm256_and_si256(iscan_plus_one, nzero_coeff0); + const __m256i v_nz_iscan = _mm256_and_si256(v_iscan_plus1, v_mask); + return _mm256_max_epi16(v_eobmax, v_nz_iscan); +} + +static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob256) { + const __m256i eob_lo = eob256; + // Copy upper 128 to lower 128 + const __m256i eob_hi = _mm256_permute2x128_si256(eob256, eob256, 0X81); + __m256i eob = _mm256_max_epi16(eob_lo, eob_hi); + __m256i eob_s = _mm256_shuffle_epi32(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 1); + eob = _mm256_max_epi16(eob, eob_s); + return (uint16_t)_mm256_extract_epi16(eob, 0); +} + +static VPX_FORCE_INLINE void quantize_fp_16( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) { + const __m256i coeff = load_tran_low(coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi16(coeff); + const int32_t nzflag = + _mm256_movemask_epi8(_mm256_cmpgt_epi16(abs_coeff, *thr)); + + if (nzflag) { + const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round); + const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant); + const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, 
coeff); + const __m256i dqcoeff = _mm256_mullo_epi16(qcoeff, *dequant); + const __m256i nz_mask = + _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256()); + store_tran_low(qcoeff, qcoeff_ptr); + store_tran_low(dqcoeff, dqcoeff_ptr); + + *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask); + } else { + store_zero_tran_low(qcoeff_ptr); + store_zero_tran_low(dqcoeff_ptr); + } } void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, @@ -51,10 +101,8 @@ void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - __m128i eob; - __m256i round256, quant256, dequant256; - __m256i eob256, thr256; - + __m256i round, quant, dequant, thr; + __m256i eob_max = _mm256_setzero_si256(); (void)scan; coeff_ptr += n_coeffs; @@ -63,74 +111,30 @@ void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dqcoeff_ptr += n_coeffs; n_coeffs = -n_coeffs; - { - __m256i coeff256; - - // Setup global values - { - const __m128i round = _mm_load_si128((const __m128i *)round_ptr); - const __m128i quant = _mm_load_si128((const __m128i *)quant_ptr); - const __m128i dequant = _mm_load_si128((const __m128i *)dequant_ptr); - round256 = _mm256_castsi128_si256(round); - round256 = _mm256_permute4x64_epi64(round256, 0x54); - - quant256 = _mm256_castsi128_si256(quant); - quant256 = _mm256_permute4x64_epi64(quant256, 0x54); - - dequant256 = _mm256_castsi128_si256(dequant); - dequant256 = _mm256_permute4x64_epi64(dequant256, 0x54); - } - - { - __m256i qcoeff256; - __m256i qtmp256; - coeff256 = load_tran_low(coeff_ptr + n_coeffs); - qcoeff256 = _mm256_abs_epi16(coeff256); - qcoeff256 = _mm256_adds_epi16(qcoeff256, round256); - qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256); - qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256); - store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs); - coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256); - store_tran_low(coeff256, dqcoeff_ptr + n_coeffs); - } - - eob256 = scan_eob_256((const __m256i *)(iscan + n_coeffs), &coeff256); - n_coeffs += 8 * 2; - } + // Setup global values + load_fp_values_avx2(round_ptr, &round, quant_ptr, &quant, dequant_ptr, + &dequant); + thr = _mm256_setzero_si256(); - // remove dc constants - dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31); - quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31); - round256 = _mm256_permute2x128_si256(round256, round256, 0x31); + quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); - thr256 = _mm256_srai_epi16(dequant256, 1); + n_coeffs += 8 * 2; + + // remove dc constants + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31); + quant = _mm256_permute2x128_si256(quant, quant, 0x31); + round = _mm256_permute2x128_si256(round, round, 0x31); + thr = _mm256_srai_epi16(dequant, 1); // AC only loop while (n_coeffs < 0) { - __m256i coeff256 = load_tran_low(coeff_ptr + n_coeffs); - __m256i qcoeff256 = _mm256_abs_epi16(coeff256); - int32_t nzflag = - _mm256_movemask_epi8(_mm256_cmpgt_epi16(qcoeff256, thr256)); - - if (nzflag) { - __m256i qtmp256; - qcoeff256 = _mm256_adds_epi16(qcoeff256, round256); - qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256); - qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256); - store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs); - coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256); - 
store_tran_low(coeff256, dqcoeff_ptr + n_coeffs);
-      eob256 = _mm256_max_epi16(
-          eob256, scan_eob_256((const __m256i *)(iscan + n_coeffs), &coeff256));
-    } else {
-      store_zero_tran_low(qcoeff_ptr + n_coeffs);
-      store_zero_tran_low(dqcoeff_ptr + n_coeffs);
-    }
+    quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+                   iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+                   dqcoeff_ptr + n_coeffs, &eob_max);
     n_coeffs += 8 * 2;
   }

-  eob = _mm_max_epi16(_mm256_castsi256_si128(eob256),
-                      _mm256_extracti128_si256(eob256, 1));
-
-  *eob_ptr = accumulate_eob(eob);
+  *eob_ptr = get_max_eob(eob_max);
 }

From 1c0c4d51b474585d05b36d2d70af6b20f507c931 Mon Sep 17 00:00:00 2001
From: James Zern
Date: Tue, 26 Jul 2022 19:08:40 -0700
Subject: [PATCH 365/926] y4m_input_fetch_frame: fix ubsan null/zero offset
 warning

reported under clang-13; use a while loop in file_read() to force a size
check before attempting to read. buf (aux_buf) may be null when no
conversion is necessary.

y4minput.c:29:43: runtime error: applying zero offset to null pointer

Bug: b/229626362
Change-Id: Ia3250d6ff9c325faf48eaa31f4399e20837f8f7b
---
 y4minput.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/y4minput.c b/y4minput.c
index 7d3c03a7fc..745e2f1cd6 100644
--- a/y4minput.c
+++ b/y4minput.c
@@ -21,12 +21,13 @@
 // Reads 'size' bytes from 'file' into 'buf' with some fault tolerance.
 // Returns true on success.
 static int file_read(void *buf, size_t size, FILE *file) {
-  const int kMaxRetries = 5;
-  int retry_count = 0;
-  int file_error;
+  const int kMaxTries = 5;
+  int try_count = 0;
+  int file_error = 0;
   size_t len = 0;
-  do {
+  while (!feof(file) && len < size && try_count < kMaxTries) {
     const size_t n = fread((uint8_t *)buf + len, 1, size - len, file);
+    ++try_count;
     len += n;
     file_error = ferror(file);
     if (file_error) {
@@ -39,13 +40,13 @@ static int file_read(void *buf, size_t size, FILE *file) {
       return 0;
     }
   }
-  } while (!feof(file) && len < size && ++retry_count < kMaxRetries);
+  }

   if (!feof(file) && len != size) {
     fprintf(stderr,
             "Error reading file: %u of %u bytes read,"
-            " error: %d, retries: %d, %d: %s\n",
-            (uint32_t)len, (uint32_t)size, file_error, retry_count, errno,
+            " error: %d, tries: %d, %d: %s\n",
+            (uint32_t)len, (uint32_t)size, file_error, try_count, errno,
             strerror(errno));
   }
   return len == size;

From ed78231aa54c131018c0c9415cf416beac97a698 Mon Sep 17 00:00:00 2001
From: James Zern
Date: Tue, 26 Jul 2022 19:26:23 -0700
Subject: [PATCH 366/926] vp9,decoder_decode: fix ubsan null/zero offset
 warning

reported under clang-13. null data may be passed as a flush; move
data_end after that check

vp9/vp9_dx_iface.c:337:40: runtime error: applying zero offset to null
pointer

Bug: b/229626362
Change-Id: I845726fd6eb6ac7a776e49272c6477a5ad30ffdf
---
 vp9/vp9_dx_iface.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 3c42c7dfed..bdfe217936 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -334,7 +334,6 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
                                       const uint8_t *data, unsigned int data_sz,
                                       void *user_priv, long deadline) {
   const uint8_t *data_start = data;
-  const uint8_t *const data_end = data + data_sz;
   vpx_codec_err_t res;
   uint32_t frame_sizes[8];
   int frame_count;
@@ -362,6 +361,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
   // Decode in serial mode.
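  /* [annotation, not part of the original change] data == NULL is used to
   * signal a flush, and "NULL + data_sz" is exactly the zero-offset-on-null
   * arithmetic the sanitizer flagged; the two branches below therefore each
   * compute data_end only after the null-data check has already passed. */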
if (frame_count > 0) { + const uint8_t *const data_end = data + data_sz; int i; for (i = 0; i < frame_count; ++i) { @@ -379,6 +379,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, data_start += frame_size; } } else { + const uint8_t *const data_end = data + data_sz; while (data_start < data_end) { const uint32_t frame_size = (uint32_t)(data_end - data_start); const vpx_codec_err_t res = From 7dab508cd9c8fc8f0ac44bfb380c31e25919d883 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 27 Jul 2022 13:29:41 -0700 Subject: [PATCH 367/926] encode_test_driver: normalize frame_flags type use vpx_enc_frame_flags_t; this avoids int -> unsigned conversion warnings; reported w/clang -fsanitize=integer: test/error_resilience_test.cc:95:9: runtime error: implicit conversion from type 'int' of value -12845057 (32-bit, signed) to type 'unsigned long' changed the value to 4282122239 (32-bit, unsigned) Bug: b/229626362 Change-Id: I0fc1dbe44a258f397cf1a05347d8cb86ee70b1b8 --- test/encode_test_driver.cc | 5 +++-- test/encode_test_driver.h | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc index 9ca15ae4d3..d3feeee34d 100644 --- a/test/encode_test_driver.cc +++ b/test/encode_test_driver.cc @@ -52,7 +52,8 @@ void Encoder::InitEncoder(VideoSource *video) { } } -void Encoder::EncodeFrame(VideoSource *video, const unsigned long frame_flags) { +void Encoder::EncodeFrame(VideoSource *video, + const vpx_enc_frame_flags_t frame_flags) { if (video->img()) { EncodeFrameInternal(*video, frame_flags); } else { @@ -70,7 +71,7 @@ void Encoder::EncodeFrame(VideoSource *video, const unsigned long frame_flags) { } void Encoder::EncodeFrameInternal(const VideoSource &video, - const unsigned long frame_flags) { + const vpx_enc_frame_flags_t frame_flags) { vpx_codec_err_t res; const vpx_image_t *img = video.img(); diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index f6bb841d8c..b57df85291 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -103,7 +103,7 @@ class Encoder { } // This is a thin wrapper around vpx_codec_encode(), so refer to // vpx_encoder.h for its semantics. 
- void EncodeFrame(VideoSource *video, const unsigned long frame_flags); + void EncodeFrame(VideoSource *video, vpx_enc_frame_flags_t frame_flags); // Convenience wrapper for EncodeFrame() void EncodeFrame(VideoSource *video) { EncodeFrame(video, 0); } @@ -184,7 +184,7 @@ class Encoder { // Encode an image void EncodeFrameInternal(const VideoSource &video, - const unsigned long frame_flags); + vpx_enc_frame_flags_t frame_flags); // Flush the encoder on EOS void Flush(); @@ -289,7 +289,7 @@ class EncoderTest { unsigned long deadline_; TwopassStatsStore stats_; unsigned long init_flags_; - unsigned long frame_flags_; + vpx_enc_frame_flags_t frame_flags_; }; } // namespace libvpx_test From 9763f3c549569df3bad776a02537a97b2d203a3c Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 27 Jul 2022 15:15:25 -0700 Subject: [PATCH 368/926] vp8_find_near_mvs: fix implicit conversion warnings unsigned -> int and vice versa reported by clang -fsanitize=integer vp8/common/findnearmv.c:108:11: runtime error: implicit conversion from type 'uint32_t' (aka 'unsigned int') of value 4294443008 (32-bit, unsigned) to type 'int' changed the value to -524288 (32-bit, signed) vp8/common/findnearmv.c:110:33: runtime error: implicit conversion from type 'int' of value -524288 (32-bit, signed) to type 'uint32_t' (aka 'unsigned int') changed the value to 4294443008 (32-bit, unsigned) Bug: b/229626362 Change-Id: Ic7ce0fd98255ccf9307ac73e9fb6a8189b268214 --- vp8/common/findnearmv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vp8/common/findnearmv.c b/vp8/common/findnearmv.c index 6889fdedde..3b31923621 100644 --- a/vp8/common/findnearmv.c +++ b/vp8/common/findnearmv.c @@ -105,9 +105,9 @@ void vp8_find_near_mvs(MACROBLOCKD *xd, const MODE_INFO *here, int_mv *nearest, tmp = near_mv_ref_cnts[CNT_NEAREST]; near_mv_ref_cnts[CNT_NEAREST] = near_mv_ref_cnts[CNT_NEAR]; near_mv_ref_cnts[CNT_NEAR] = tmp; - tmp = near_mvs[CNT_NEAREST].as_int; + tmp = (int)near_mvs[CNT_NEAREST].as_int; near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int; - near_mvs[CNT_NEAR].as_int = tmp; + near_mvs[CNT_NEAR].as_int = (uint32_t)tmp; } /* Use near_mvs[0] to store the "best" MV */ From b6d06a6e268d945b960660a91e77195c2612642c Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 27 Jul 2022 15:31:00 -0700 Subject: [PATCH 369/926] vp8,read_mb_modes_mv: fix implicit conversion warnings w/clang -fsanitize=integer fixes warnings of the form: implicit conversion from type 'uint32_t' (aka 'unsigned int') of value 4294443008 (32-bit, unsigned) to type 'int' changed the value to -524288 (32-bit, signed) Bug: b/229626362 Change-Id: Ic7c0a2e7b64a1dd6fd5cc64adcd5765318c2a956 --- vp8/decoder/decodemv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vp8/decoder/decodemv.c b/vp8/decoder/decodemv.c index 51817a2cb9..3f459d623f 100644 --- a/vp8/decoder/decodemv.c +++ b/vp8/decoder/decodemv.c @@ -372,9 +372,9 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, tmp = cnt[CNT_NEAREST]; cnt[CNT_NEAREST] = cnt[CNT_NEAR]; cnt[CNT_NEAR] = tmp; - tmp = near_mvs[CNT_NEAREST].as_int; + tmp = (int)near_mvs[CNT_NEAREST].as_int; near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int; - near_mvs[CNT_NEAR].as_int = tmp; + near_mvs[CNT_NEAR].as_int = (uint32_t)tmp; } if (vp8_read(bc, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) { From aecf7ba51af515545d9f26ea9c66f347d6c40814 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 27 Jul 2022 15:41:00 -0700 Subject: [PATCH 370/926] variance_avx2.c: fix implicit conversion 
warnings w/clang -fsanitize=integer fixes warnings of the form: implicit conversion from type 'int' of value -1323 (32-bit, signed) to type 'unsigned int' changed the value to 4294965973 (32-bit, unsigned) Bug: b/229626362 Change-Id: I7291d9bd5cacea0d88d9f4c4624c096764f4a472 --- vpx_dsp/x86/variance_avx2.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/vpx_dsp/x86/variance_avx2.c b/vpx_dsp/x86/variance_avx2.c index 9232acbfbb..35925d5908 100644 --- a/vpx_dsp/x86/variance_avx2.c +++ b/vpx_dsp/x86/variance_avx2.c @@ -590,17 +590,20 @@ static INLINE int sub_pix_var32xh(const uint8_t *src, int src_stride, return sum; } -static unsigned int sub_pixel_variance32xh_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, int height, unsigned int *sse) { +static int sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + int height, unsigned int *sse) { return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride, NULL, 0, 0, height, sse); } -static unsigned int sub_pixel_avg_variance32xh_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, const uint8_t *second_pred, - int second_stride, int height, unsigned int *sse) { +static int sub_pixel_avg_variance32xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, + int second_stride, int height, + unsigned int *sse) { return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride, second_pred, second_stride, 1, height, sse); } From e533d989ea93ee6cd980cc07e077475733694687 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 27 Jul 2022 15:48:24 -0700 Subject: [PATCH 371/926] vp9_filter_block_plane_non420: fix implicit conversion warnings w/clang -fsanitize=integer fixes warnings of the form: implicit conversion from type 'int' of value -2 (32-bit, signed) to type 'unsigned int' changed the value to 4294967294 (32-bit, unsigned) Bug: b/229626362 Change-Id: Id7e13b3d494ccd1a2351db8fab6fdb6a9a771d51 --- vp9/common/vp9_loopfilter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index 95d6029f3b..765cb11726 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -1180,7 +1180,7 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm, } // Disable filtering on the leftmost column - border_mask = ~(mi_col == 0 ? 1 : 0); + border_mask = ~(mi_col == 0 ? 
1u : 0u); #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) { highbd_filter_selectively_vert( From 4667992d8ba51d60045d6fed705635a6455eb4f8 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 27 Jul 2022 15:22:37 -0700 Subject: [PATCH 372/926] x86: normalize type with _mm_cvtsi128_si32 prefer int in most cases w/clang -fsanitize=integer fixes warnings of the form: implicit conversion from type 'int' of value -809931979 (32-bit, signed) to type 'uint32_t' (aka 'unsigned int') changed the value to 3485035317 (32-bit, unsigned) Bug: b/229626362 Change-Id: I0c6604efc188f2660c531eddfc7aa10060637813 --- vp8/encoder/x86/denoising_sse2.c | 2 +- vpx_dsp/x86/avg_intrin_sse2.c | 2 +- vpx_dsp/x86/convolve_avx2.h | 5 ++--- vpx_dsp/x86/mem_sse2.h | 2 +- vpx_dsp/x86/sad_avx2.c | 15 ++++++--------- vpx_dsp/x86/variance_sse2.c | 2 +- vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c | 6 +++--- vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 2 +- vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c | 6 +++--- 9 files changed, 19 insertions(+), 23 deletions(-) diff --git a/vp8/encoder/x86/denoising_sse2.c b/vp8/encoder/x86/denoising_sse2.c index 89cad53356..f35b930169 100644 --- a/vp8/encoder/x86/denoising_sse2.c +++ b/vp8/encoder/x86/denoising_sse2.c @@ -30,7 +30,7 @@ static INLINE unsigned int abs_sum_diff_16x1(__m128i acc_diff) { _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8)); const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4)); - unsigned int sum_diff = abs(_mm_cvtsi128_si32(hgfedcba)); + unsigned int sum_diff = (unsigned int)abs(_mm_cvtsi128_si32(hgfedcba)); return sum_diff; } diff --git a/vpx_dsp/x86/avg_intrin_sse2.c b/vpx_dsp/x86/avg_intrin_sse2.c index 0c4919f6d8..015c11a1f3 100644 --- a/vpx_dsp/x86/avg_intrin_sse2.c +++ b/vpx_dsp/x86/avg_intrin_sse2.c @@ -164,7 +164,7 @@ unsigned int vpx_highbd_avg_8x8_sse2(const uint8_t *s8, int p) { s0 = _mm_add_epi32(s0, s1); s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 8)); s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 4)); - avg = _mm_cvtsi128_si32(s0); + avg = (unsigned int)_mm_cvtsi128_si32(s0); return (avg + 32) >> 6; } diff --git a/vpx_dsp/x86/convolve_avx2.h b/vpx_dsp/x86/convolve_avx2.h index 99bc9637fc..ebee964b18 100644 --- a/vpx_dsp/x86/convolve_avx2.h +++ b/vpx_dsp/x86/convolve_avx2.h @@ -129,9 +129,8 @@ static INLINE void mm256_storeu2_epi64(__m128i *const dst_ptr_1, static INLINE void mm256_storeu2_epi32(__m128i *const dst_ptr_1, __m128i *const dst_ptr_2, const __m256i *const src) { - *((uint32_t *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src)); - *((uint32_t *)(dst_ptr_2)) = - _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1)); + *((int *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src)); + *((int *)(dst_ptr_2)) = _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1)); } static INLINE __m256i mm256_round_epi32(const __m256i *const src, diff --git a/vpx_dsp/x86/mem_sse2.h b/vpx_dsp/x86/mem_sse2.h index 8b6d4d1dd4..6df4773f73 100644 --- a/vpx_dsp/x86/mem_sse2.h +++ b/vpx_dsp/x86/mem_sse2.h @@ -33,7 +33,7 @@ static INLINE __m128i load_unaligned_u32(const void *a) { } static INLINE void store_unaligned_u32(void *const a, const __m128i v) { - const uint32_t val = _mm_cvtsi128_si32(v); + const int val = _mm_cvtsi128_si32(v); memcpy(a, &val, sizeof(val)); } diff --git a/vpx_dsp/x86/sad_avx2.c b/vpx_dsp/x86/sad_avx2.c index 3b48acd510..29bedb0e6e 100644 --- a/vpx_dsp/x86/sad_avx2.c +++ b/vpx_dsp/x86/sad_avx2.c @@ -14,7 +14,7 @@ #define FSAD64_H(h) \ unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int 
src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ - int i, res; \ + int i; \ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ __m256i sum_sad = _mm256_setzero_si256(); \ __m256i sum_sad_h; \ @@ -35,8 +35,7 @@ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ + return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ } #define FSAD32_H(h) \ @@ -92,7 +91,7 @@ FSAD32 unsigned int vpx_sad64x##h##_avg_avx2( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, const uint8_t *second_pred) { \ - int i, res; \ + int i; \ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ __m256i sum_sad = _mm256_setzero_si256(); \ __m256i sum_sad_h; \ @@ -118,15 +117,14 @@ FSAD32 sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ + return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ } #define FSADAVG32_H(h) \ unsigned int vpx_sad32x##h##_avg_avx2( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, const uint8_t *second_pred) { \ - int i, res; \ + int i; \ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ __m256i sum_sad = _mm256_setzero_si256(); \ __m256i sum_sad_h; \ @@ -156,8 +154,7 @@ FSAD32 sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ + return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ } #define FSADAVG64 \ diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c index a67c92aadb..fedc8b84e5 100644 --- a/vpx_dsp/x86/variance_sse2.c +++ b/vpx_dsp/x86/variance_sse2.c @@ -19,7 +19,7 @@ static INLINE unsigned int add32x4_sse2(__m128i val) { val = _mm_add_epi32(val, _mm_srli_si128(val, 8)); val = _mm_add_epi32(val, _mm_srli_si128(val, 4)); - return _mm_cvtsi128_si32(val); + return (unsigned int)_mm_cvtsi128_si32(val); } unsigned int vpx_get_mb_ss_sse2(const int16_t *src_ptr) { diff --git a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c index 0cbd151dc3..21a35ae3c3 100644 --- a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c +++ b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c @@ -485,7 +485,7 @@ static void vpx_filter_block1d4_h4_sse2(const uint8_t *src_ptr, // Saturate and convert to 8-bit words dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128()); - *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first); + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first); src_ptr += src_stride; dst_ptr += dst_stride; @@ -589,8 +589,8 @@ static void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr, res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, reg_zero); // Save only half of the register (8 words) - *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(res_reg_m1012); - *((uint32_t *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(res_reg_0123); + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(res_reg_m1012); + *((int *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(res_reg_0123); // Update the source by two rows src_ptr += src_stride_unrolled; diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 6f2983a4b5..db3c39de0f 
100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -798,7 +798,7 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, // Pack to 8-bits dst = _mm_packus_epi16(dst, _mm_setzero_si128()); - *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst); + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst); } } diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c index ed46d6245d..4ea2752d38 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c @@ -580,7 +580,7 @@ static void vpx_filter_block1d4_h4_ssse3(const uint8_t *src_ptr, // Pack to 8-bits dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128()); - *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first); + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first); src_ptr += src_stride; dst_ptr += dst_stride; @@ -666,8 +666,8 @@ static void vpx_filter_block1d4_v4_ssse3(const uint8_t *src_ptr, reg_1 = _mm_packus_epi16(reg_1, reg_1); // Save the result - *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(reg_0); - *((uint32_t *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(reg_1); + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(reg_0); + *((int *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(reg_1); // Update the source by two rows src_ptr += src_stride_unrolled; From 1ce49998f751c711325d9fdc93d37d61a73465d5 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 27 Jul 2022 18:56:22 -0700 Subject: [PATCH 373/926] vp9_active_[hv]_edge: add missing vpx_clear_system_state this fixes runtime errors with clang -fsanitize=integer in x86 builds: ../vp9/encoder/vp9_rdopt.c:3250:17: runtime error: signed integer overflow: 18 - -2147483648 cannot be represented in type 'int' ../vp9/encoder/vp9_rdopt.c:3277:16: runtime error: signed integer overflow: 26 - -2147483648 cannot be represented in type 'int' Bug: b/229626362 Change-Id: Ic9a5063c840b4fce7056f61362234721add056a6 --- vp9/encoder/vp9_rdopt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 3b574ef172..bfde5ab1a5 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -3242,6 +3242,7 @@ int vp9_active_h_edge(VP9_COMP *cpi, int mi_row, int mi_step) { // For two pass account for any formatting bars detected. if (cpi->oxcf.pass == 2) { TWO_PASS *twopass = &cpi->twopass; + vpx_clear_system_state(); // The inactive region is specified in MBs not mi units. // The image edge is in the following MB row. @@ -3269,6 +3270,7 @@ int vp9_active_v_edge(VP9_COMP *cpi, int mi_col, int mi_step) { // For two pass account for any formatting bars detected. if (cpi->oxcf.pass == 2) { TWO_PASS *twopass = &cpi->twopass; + vpx_clear_system_state(); // The inactive region is specified in MBs not mi units. // The image edge is in the following MB row. 
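
For context on the fix above: the two-pass stats fields read in these
functions are doubles, and the reported "18 - -2147483648" is consistent
with a garbled double being converted to int on x86 -- converting a NaN or
out-of-range value yields the "integer indefinite" encoding, INT_MIN, which
then overflows the following subtraction. vpx_clear_system_state() restores
the FPU state so the doubles are read correctly. A minimal standalone
sketch of that failure mode (illustration only, not libvpx code; the cast
is the undefined step being demonstrated):

#include <limits.h>
#include <math.h>
#include <stdio.h>

int main(void) {
  /* Stand-in for a two-pass stat read while the x87/MMX state is stale. */
  volatile double garbled = nan("");
  /* Undefined in ISO C; x86 conversion instructions produce the
   * "integer indefinite" value, i.e. INT_MIN. */
  const int zone = (int)garbled;
  printf("zone = %d (INT_MIN = %d)\n", zone, INT_MIN);
  /* Any later "edge - zone" then overflows exactly as UBSan reported. */
  return 0;
}
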
From b3536cfafe7c56fa24bfec35e9cacc2082065bd2 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 29 Jul 2022 09:40:53 +0000 Subject: [PATCH 374/926] Provide Arm SDOT optimizations for SAD functions Change-Id: I497ee1c45d1fc4d643cefad7d87e5aaacd77869c --- vpx_dsp/arm/sad_neon.c | 306 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 306 insertions(+) diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index b1509d883a..34870375a3 100644 --- a/vpx_dsp/arm/sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -21,9 +21,16 @@ uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ + (__ARM_FEATURE_DOTPROD == 1) + const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8); + const uint32x4_t dp = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1)); + return horizontal_add_uint32x4(dp); +#else uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8)); abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); return horizontal_add_uint16x8(abs); +#endif } uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, @@ -33,13 +40,36 @@ uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); const uint8x16_t second_pred_u8 = vld1q_u8(second_pred); const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8); +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ + (__ARM_FEATURE_DOTPROD == 1) + const uint8x16_t sad_u8 = vabdq_u8(src_u8, avg); + const uint32x4_t prod = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1)); + return horizontal_add_uint32x4(prod); +#else uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(avg)); abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg)); return horizontal_add_uint16x8(abs); +#endif } uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ + (__ARM_FEATURE_DOTPROD == 1) + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t ref1_u8 = load_unaligned_u8q(ref_ptr, ref_stride); + const uint8x16_t src2_u8 = + load_unaligned_u8q(src_ptr + 4 * src_stride, src_stride); + const uint8x16_t ref2_u8 = + load_unaligned_u8q(ref_ptr + 4 * ref_stride, ref_stride); + const uint8x16_t sad1_u8 = vabdq_u8(src1_u8, ref1_u8); + const uint8x16_t sad2_u8 = vabdq_u8(src2_u8, ref2_u8); + prod = vdotq_u32(prod, sad1_u8, ones); + prod = vdotq_u32(prod, sad2_u8, ones); + return horizontal_add_uint32x4(prod); +#else int i; uint16x8_t abs = vdupq_n_u16(0); for (i = 0; i < 8; i += 4) { @@ -52,11 +82,32 @@ uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, } return horizontal_add_uint16x8(abs); +#endif } uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred) { +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ + (__ARM_FEATURE_DOTPROD == 1) + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t ref1_u8 = load_unaligned_u8q(ref_ptr, ref_stride); + const uint8x16_t src2_u8 = + 
load_unaligned_u8q(src_ptr + 4 * src_stride, src_stride); + const uint8x16_t ref2_u8 = + load_unaligned_u8q(ref_ptr + 4 * ref_stride, ref_stride); + const uint8x16_t second_pred1_u8 = vld1q_u8(second_pred); + const uint8x16_t second_pred2_u8 = vld1q_u8(second_pred + 16); + const uint8x16_t avg1 = vrhaddq_u8(ref1_u8, second_pred1_u8); + const uint8x16_t avg2 = vrhaddq_u8(ref2_u8, second_pred2_u8); + const uint8x16_t sad1_u8 = vabdq_u8(src1_u8, avg1); + const uint8x16_t sad2_u8 = vabdq_u8(src2_u8, avg2); + prod = vdotq_u32(prod, sad1_u8, ones); + prod = vdotq_u32(prod, sad2_u8, ones); + return horizontal_add_uint32x4(prod); +#else int i; uint16x8_t abs = vdupq_n_u16(0); for (i = 0; i < 8; i += 4) { @@ -72,8 +123,66 @@ uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, } return horizontal_add_uint16x8(abs); +#endif } +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ + (__ARM_FEATURE_DOTPROD == 1) +static INLINE uint32x2_t sad8x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const int height) { + int i; + uint32x2_t prod = vdup_n_u32(0); + const uint8x8_t ones = vdup_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x8_t a_u8 = vld1_u8(src_ptr); + const uint8x8_t b_u8 = vld1_u8(ref_ptr); + const uint8x8_t sad_u8 = vabd_u8(a_u8, b_u8); + src_ptr += src_stride; + ref_ptr += ref_stride; + prod = vdot_u32(prod, sad_u8, ones); + } + return prod; +} + +static INLINE uint32x2_t sad8x_avg(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred, + const int height) { + int i; + uint32x2_t prod = vdup_n_u32(0); + const uint8x8_t ones = vdup_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x8_t a_u8 = vld1_u8(src_ptr); + const uint8x8_t b_u8 = vld1_u8(ref_ptr); + const uint8x8_t c_u8 = vld1_u8(second_pred); + const uint8x8_t avg = vrhadd_u8(b_u8, c_u8); + const uint8x8_t sad_u8 = vabd_u8(a_u8, avg); + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 8; + prod = vdot_u32(prod, sad_u8, ones); + } + return prod; +} + +#define SAD8XN(n) \ + uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + const uint32x2_t prod = \ + sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ + return horizontal_add_uint32x2(prod); \ + } \ + \ + uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint32x2_t prod = \ + sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ + return horizontal_add_uint32x2(prod); \ + } + +#else static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -124,11 +233,68 @@ static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride, sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ return horizontal_add_uint16x8(abs); \ } +#endif SAD8XN(4) SAD8XN(8) SAD8XN(16) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ + (__ARM_FEATURE_DOTPROD == 1) +static INLINE uint32x4_t sad16x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const int height) { + int i; + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x16_t src_u8 = vld1q_u8(src_ptr); + const uint8x16_t ref_u8 = vld1q_u8(ref_ptr); + const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8); + 
src_ptr += src_stride; + ref_ptr += ref_stride; + prod = vdotq_u32(prod, sad_u8, ones); + } + return prod; +} + +static INLINE uint32x4_t sad16x_avg(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred, + const int height) { + int i; + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x16_t a_u8 = vld1q_u8(src_ptr); + const uint8x16_t b_u8 = vld1q_u8(ref_ptr); + const uint8x16_t c_u8 = vld1q_u8(second_pred); + const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8); + const uint8x16_t sad_u8 = vabdq_u8(a_u8, avg); + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 16; + prod = vdotq_u32(prod, sad_u8, ones); + } + return prod; +} + +#define SAD16XN(n) \ + uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + const uint32x4_t prod = \ + sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ + return horizontal_add_uint32x4(prod); \ + } \ + \ + uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint32x4_t prod = \ + sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ + return horizontal_add_uint32x4(prod); \ + } +#else static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -182,11 +348,79 @@ static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride, sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ return horizontal_add_uint16x8(abs); \ } +#endif SAD16XN(8) SAD16XN(16) SAD16XN(32) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ + (__ARM_FEATURE_DOTPROD == 1) +static INLINE uint32x4_t sad32x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const int height) { + int i; + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x16_t a_lo = vld1q_u8(src_ptr); + const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); + const uint8x16_t b_lo = vld1q_u8(ref_ptr); + const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); + const uint8x16_t sad_lo_u8 = vabdq_u8(a_lo, b_lo); + const uint8x16_t sad_hi_u8 = vabdq_u8(a_hi, b_hi); + src_ptr += src_stride; + ref_ptr += ref_stride; + prod = vdotq_u32(prod, sad_lo_u8, ones); + prod = vdotq_u32(prod, sad_hi_u8, ones); + } + return prod; +} + +static INLINE uint32x4_t sad32x_avg(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred, + const int height) { + int i; + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x16_t a_lo = vld1q_u8(src_ptr); + const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); + const uint8x16_t b_lo = vld1q_u8(ref_ptr); + const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); + const uint8x16_t c_lo = vld1q_u8(second_pred); + const uint8x16_t c_hi = vld1q_u8(second_pred + 16); + const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo); + const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi); + const uint8x16_t sad_lo_u8 = vabdq_u8(a_lo, avg_lo); + const uint8x16_t sad_hi_u8 = vabdq_u8(a_hi, avg_hi); + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 32; + prod = vdotq_u32(prod, sad_lo_u8, ones); + prod = vdotq_u32(prod, sad_hi_u8, ones); + } + return prod; +} + +#define 
SAD32XN(n) \ + uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + const uint32x4_t prod = \ + sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ + return horizontal_add_uint32x4(prod); \ + } \ + \ + uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint32x4_t prod = \ + sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ + return horizontal_add_uint32x4(prod); \ + } + +#else static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -250,11 +484,82 @@ static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride, sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ return horizontal_add_uint16x8(abs); \ } +#endif SAD32XN(16) SAD32XN(32) SAD32XN(64) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ + (__ARM_FEATURE_DOTPROD == 1) +static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const int height) { + int i; + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x16_t a_0 = vld1q_u8(src_ptr); + const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); + const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); + const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); + const uint8x16_t b_0 = vld1q_u8(ref_ptr); + const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); + const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); + const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); + const uint8x16_t sad_0_u8 = vabdq_u8(a_0, b_0); + const uint8x16_t sad_1_u8 = vabdq_u8(a_1, b_1); + const uint8x16_t sad_2_u8 = vabdq_u8(a_2, b_2); + const uint8x16_t sad_3_u8 = vabdq_u8(a_3, b_3); + src_ptr += src_stride; + ref_ptr += ref_stride; + prod = vdotq_u32(prod, sad_0_u8, ones); + prod = vdotq_u32(prod, sad_1_u8, ones); + prod = vdotq_u32(prod, sad_2_u8, ones); + prod = vdotq_u32(prod, sad_3_u8, ones); + } + return prod; +} + +static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred, + const int height) { + int i; + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x16_t a_0 = vld1q_u8(src_ptr); + const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); + const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); + const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); + const uint8x16_t b_0 = vld1q_u8(ref_ptr); + const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); + const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); + const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); + const uint8x16_t c_0 = vld1q_u8(second_pred); + const uint8x16_t c_1 = vld1q_u8(second_pred + 16); + const uint8x16_t c_2 = vld1q_u8(second_pred + 32); + const uint8x16_t c_3 = vld1q_u8(second_pred + 48); + const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0); + const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1); + const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2); + const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3); + const uint8x16_t sad_0_u8 = vabdq_u8(a_0, avg_0); + const uint8x16_t sad_1_u8 = vabdq_u8(a_1, avg_1); + const uint8x16_t sad_2_u8 = vabdq_u8(a_2, avg_2); + const uint8x16_t sad_3_u8 = vabdq_u8(a_3, avg_3); + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 64; + prod = vdotq_u32(prod, sad_0_u8, ones); + prod = 
vdotq_u32(prod, sad_1_u8, ones); + prod = vdotq_u32(prod, sad_2_u8, ones); + prod = vdotq_u32(prod, sad_3_u8, ones); + } + return prod; +} +#else static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -332,6 +637,7 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, return vpadalq_u16(sum, abs_1); } } +#endif #define SAD64XN(n) \ uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride, \ From 0b3d4114468e8b602ece9b28f2485c60394311ca Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Fri, 29 Jul 2022 15:42:15 -0700 Subject: [PATCH 375/926] Fix off-by-one error of max w/h in validate_config Fix the off-by-one errors of maximum g_w and g_h in validate_config(). Bug: webm:1774 Change-Id: I343783d06c1f53222be2366be79171b214486201 --- vp9/vp9_cx_iface.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 05ac9e1691..588d9d502e 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -170,8 +170,8 @@ static vpx_codec_err_t update_error_state( static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg, const struct vp9_extracfg *extra_cfg) { - RANGE_CHECK(cfg, g_w, 1, 65535); // 16 bits available - RANGE_CHECK(cfg, g_h, 1, 65535); // 16 bits available + RANGE_CHECK(cfg, g_w, 1, 65536); // 16 bits available + RANGE_CHECK(cfg, g_h, 1, 65536); // 16 bits available RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000); RANGE_CHECK(cfg, g_timebase.num, 1, 1000000000); RANGE_CHECK_HI(cfg, g_profile, 3); From 29db7fe9757549efa8df5cc6f3cff0ece7b4b0d1 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Wed, 27 Jul 2022 07:37:44 -0700 Subject: [PATCH 376/926] VPX: Add vp9_quantize_fp_32x32_avx2(). Up to 1.80x faster than vp9_quantize_fp_32x32_ssse3() for full calculations. 
Bug: b/237714063 Change-Id: Ic4ae4724fce7ac85c7a089535b16a999e02f0a10 --- test/vp9_quantize_test.cc | 3 + vp9/common/vp9_rtcd_defs.pl | 2 +- vp9/encoder/x86/vp9_quantize_avx2.c | 121 ++++++++++++++++++++++++++++ 3 files changed, 125 insertions(+), 1 deletion(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 97998eb08b..4bd573b4f3 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -598,6 +598,9 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, + VPX_BITS_8, 32, true), make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, VPX_BITS_8, 16, false), make_tuple(&vpx_quantize_b_32x32_avx2, diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index e6b65c96f0..a4d28f0ff5 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -132,7 +132,7 @@ () specialize qw/vp9_quantize_fp neon sse2 avx2 vsx/, "$ssse3_x86_64"; add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; -specialize qw/vp9_quantize_fp_32x32 neon vsx/, "$ssse3_x86_64"; +specialize qw/vp9_quantize_fp_32x32 neon avx2 vsx/, "$ssse3_x86_64"; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_block_error avx2 sse2/; diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c index 5d02f4fe85..6ded0913a7 100644 --- a/vp9/encoder/x86/vp9_quantize_avx2.c +++ b/vp9/encoder/x86/vp9_quantize_avx2.c @@ -138,3 +138,124 @@ void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = get_max_eob(eob_max); } + +// Enable this flag when matching the optimized code to +// vp9_quantize_fp_32x32_c(). Disabled, the optimized code will match the +// existing ssse3 code and quantize_fp_32x32_nz_c(). 
+// +// #define MATCH_VP9_QUANTIZE_FP_32X32_C + +#ifndef MATCH_VP9_QUANTIZE_FP_32X32_C +static VPX_FORCE_INLINE void quantize_fp_32x32_16_no_nzflag( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) { + const __m256i coeff = load_tran_low(coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi16(coeff); + const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round); + const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant); + const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff); + const __m256i abs_dqcoeff = + _mm256_srli_epi16(_mm256_mullo_epi16(abs_qcoeff, *dequant), 1); + const __m256i dqcoeff = _mm256_sign_epi16(abs_dqcoeff, coeff); + const __m256i nz_mask = + _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256()); + store_tran_low(qcoeff, qcoeff_ptr); + store_tran_low(dqcoeff, dqcoeff_ptr); + + *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask); + (void)thr; +} +#endif + +static VPX_FORCE_INLINE void quantize_fp_32x32_16( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) { + const __m256i coeff = load_tran_low(coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi16(coeff); + const __m256i thr_mask = _mm256_cmpgt_epi16(abs_coeff, *thr); + const int32_t nzflag = _mm256_movemask_epi8(thr_mask); + + if (nzflag) { +#ifdef MATCH_VP9_QUANTIZE_FP_32X32_C + const __m256i tmp_rnd = + _mm256_and_si256(_mm256_adds_epi16(abs_coeff, *round), thr_mask); +#else + const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round); +#endif + const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant); + const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff); + const __m256i abs_dqcoeff = + _mm256_srli_epi16(_mm256_mullo_epi16(abs_qcoeff, *dequant), 1); + const __m256i dqcoeff = _mm256_sign_epi16(abs_dqcoeff, coeff); + const __m256i nz_mask = + _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256()); + store_tran_low(qcoeff, qcoeff_ptr); + store_tran_low(dqcoeff, dqcoeff_ptr); + + *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask); + } else { + store_zero_tran_low(qcoeff_ptr); + store_zero_tran_low(dqcoeff_ptr); + } +} + +void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, + const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + __m256i round, quant, dequant, thr; + __m256i eob_max = _mm256_setzero_si256(); + (void)scan; + + coeff_ptr += n_coeffs; + iscan += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + // Setup global values + load_fp_values_avx2(round_ptr, &round, quant_ptr, &quant, dequant_ptr, + &dequant); + thr = _mm256_srli_epi16(dequant, 2); + quant = _mm256_slli_epi16(quant, 1); + { + const __m256i rnd = _mm256_set1_epi16((int16_t)1); + round = _mm256_add_epi16(round, rnd); + round = _mm256_srai_epi16(round, 1); + } + +#ifdef MATCH_VP9_QUANTIZE_FP_32X32_C + // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when + // calculating the zbin mask. 
+ thr = _mm256_sub_epi16(thr, _mm256_set1_epi16(1)); + quantize_fp_32x32_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); +#else + quantize_fp_32x32_16_no_nzflag( + &round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, iscan + n_coeffs, + qcoeff_ptr + n_coeffs, dqcoeff_ptr + n_coeffs, &eob_max); +#endif + + n_coeffs += 8 * 2; + + // remove dc constants + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31); + quant = _mm256_permute2x128_si256(quant, quant, 0x31); + round = _mm256_permute2x128_si256(round, round, 0x31); + thr = _mm256_permute2x128_si256(thr, thr, 0x31); + + // AC only loop + while (n_coeffs < 0) { + quantize_fp_32x32_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + n_coeffs += 8 * 2; + } + + *eob_ptr = get_max_eob(eob_max); +} From a55e248349a1d549dbb3d988ced7ff93ac20ca0d Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Tue, 2 Aug 2022 11:22:04 -0700 Subject: [PATCH 377/926] VPX: Add vp9_highbd_quantize_fp_avx2(). Up to 5.37x faster than vp9_highbd_quantize_fp_c() for full calculations. ~1.6% overall encoder improvement for the test clip used. Bug: b/237714063 Change-Id: I584fd1f60a3e02f1ded092de98970725fc66c5b8 --- test/vp9_quantize_test.cc | 3 + vp9/common/vp9_rtcd_defs.pl | 1 + vp9/encoder/x86/vp9_quantize_avx2.c | 108 ++++++++++++++++++++++++++++ 3 files changed, 112 insertions(+) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 4bd573b4f3..a2dbfc541d 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -580,6 +580,9 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_12, 16, + true), make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, VPX_BITS_8, 16, false), make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index a4d28f0ff5..b956877d3c 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -196,6 +196,7 @@ () # ENCODEMB INVOKE add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/vp9_highbd_quantize_fp avx2/; add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" ; diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c index 6ded0913a7..bd93e71e8a 100644 --- a/vp9/encoder/x86/vp9_quantize_avx2.c +++ b/vp9/encoder/x86/vp9_quantize_avx2.c @@ -259,3 +259,111 @@ void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = get_max_eob(eob_max); } + +#if CONFIG_VP9_HIGHBITDEPTH +static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x, + const __m256i *y, + int log_scale) { + __m256i prod_lo = _mm256_mul_epi32(*x, *y); + __m256i prod_hi = _mm256_srli_epi64(*x, 32); + const __m256i mult_hi = _mm256_srli_epi64(*y, 32); + const __m256i mask = 
_mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); + prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); + prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale); + prod_lo = _mm256_and_si256(prod_lo, mask); + prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale); + prod_hi = _mm256_slli_epi64(prod_hi, 32); + return _mm256_or_si256(prod_lo, prod_hi); +} + +static VPX_FORCE_INLINE __m256i highbd_init_256(const int16_t *val_ptr) { + const __m128i v = _mm_load_si128((const __m128i *)val_ptr); + const __m128i zero = _mm_setzero_si128(); + const __m128i dc = _mm_unpacklo_epi16(v, zero); + const __m128i ac = _mm_unpackhi_epi16(v, zero); + return _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1); +} + +static VPX_FORCE_INLINE void highbd_load_fp_values( + const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr, + __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant) { + *round = highbd_init_256(round_ptr); + *quant = highbd_init_256(quant_ptr); + *dequant = highbd_init_256(dequant_ptr); +} + +static VPX_FORCE_INLINE __m256i highbd_get_max_lane_eob( + const int16_t *iscan_ptr, __m256i eobmax, __m256i nz_mask) { + const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask); + const __m256i packed_nz_mask_perm = + _mm256_permute4x64_epi64(packed_nz_mask, 0xD8); + const __m256i iscan = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr)); + const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, packed_nz_mask_perm); + const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, packed_nz_mask_perm); + return _mm256_max_epi16(eobmax, nz_iscan); +} + +static VPX_FORCE_INLINE void highbd_quantize_fp( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) { + const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi32(coeff); + const __m256i tmp_rnd = _mm256_add_epi32(abs_coeff, *round); + const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp_rnd, quant, 0); + const __m256i abs_dq = _mm256_mullo_epi32(abs_q, *dequant); + const __m256i q = _mm256_sign_epi32(abs_q, coeff); + const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); + const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); + + _mm256_storeu_si256((__m256i *)qcoeff_ptr, q); + _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dq); + + *eob = highbd_get_max_lane_eob(iscan_ptr, *eob, nz_mask); +} + +void vp9_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, + const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const int step = 8; + __m256i round, quant, dequant; + __m256i eob_max = _mm256_setzero_si256(); + (void)scan; + + coeff_ptr += n_coeffs; + iscan += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + // Setup global values + highbd_load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, + &dequant); + + highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + + n_coeffs += step; + + // remove dc constants + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31); + quant = _mm256_permute2x128_si256(quant, quant, 0x31); + round = _mm256_permute2x128_si256(round, round, 0x31); + + // AC only loop 
+  while (n_coeffs < 0) {
+    highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs,
+                       iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+                       dqcoeff_ptr + n_coeffs, &eob_max);
+    n_coeffs += step;
+  }
+
+  *eob_ptr = get_max_eob(eob_max);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH

From 2e61a623d4d5408c59f58e7bec713d789b27c3ef Mon Sep 17 00:00:00 2001
From: Scott LaVarnway
Date: Wed, 3 Aug 2022 08:01:25 -0700
Subject: [PATCH 378/926] VPX: Add vp9_highbd_quantize_fp_32x32_avx2().

~4x faster than vp9_highbd_quantize_fp_32x32_c()
for full calculations.

Bug: b/237714063
Change-Id: Iff2182b8e7b1ac79811e33080d1f6cac6679382d
---
 test/vp9_quantize_test.cc           |  3 ++
 vp9/common/vp9_rtcd_defs.pl         |  1 +
 vp9/encoder/x86/vp9_quantize_avx2.c | 71 +++++++++++++++++++++++
 3 files changed, 75 insertions(+)

diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc
index a2dbfc541d..b936350236 100644
--- a/test/vp9_quantize_test.cc
+++ b/test/vp9_quantize_test.cc
@@ -583,6 +583,9 @@ INSTANTIATE_TEST_SUITE_P(
         make_tuple(&QuantFPWrapper<vp9_highbd_quantize_fp_avx2>,
                    &QuantFPWrapper<vp9_highbd_quantize_fp_c>, VPX_BITS_12, 16,
                    true),
+        make_tuple(&QuantFPWrapper<vp9_highbd_quantize_fp_32x32_avx2>,
+                   &QuantFPWrapper<vp9_highbd_quantize_fp_32x32_c>, VPX_BITS_12,
+                   32, true),
         make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c,
                    VPX_BITS_8, 16, false),
         make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c,
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index b956877d3c..1b23e8a66f 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -199,6 +199,7 @@ ()
   specialize qw/vp9_highbd_quantize_fp avx2/;

   add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" ;
+  specialize qw/vp9_highbd_quantize_fp_32x32 avx2/;

   # fdct functions
   add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c
index bd93e71e8a..7afeba32cd 100644
--- a/vp9/encoder/x86/vp9_quantize_avx2.c
+++ b/vp9/encoder/x86/vp9_quantize_avx2.c
@@ -366,4 +366,75 @@ void vp9_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,

   *eob_ptr = get_max_eob(eob_max);
 }
+
+static VPX_FORCE_INLINE void highbd_quantize_fp_32x32(
+    const __m256i *round, const __m256i *quant, const __m256i *dequant,
+    const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) {
+  const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+  const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+  const __m256i thr_mask = _mm256_cmpgt_epi32(abs_coeff, *thr);
+  const __m256i tmp_rnd =
+      _mm256_and_si256(_mm256_add_epi32(abs_coeff, *round), thr_mask);
+  const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp_rnd, quant, 0);
+  const __m256i abs_dq =
+      _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, *dequant), 1);
+  const __m256i q = _mm256_sign_epi32(abs_q, coeff);
+  const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+  const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+
+  _mm256_storeu_si256((__m256i *)qcoeff_ptr, q);
+  _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dq);
+
+  *eob = highbd_get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+}
+
+void vp9_highbd_quantize_fp_32x32_avx2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr,
+    const
int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + const int step = 8; + __m256i round, quant, dequant, thr; + __m256i eob_max = _mm256_setzero_si256(); + (void)scan; + + coeff_ptr += n_coeffs; + iscan += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + // Setup global values + highbd_load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, + &dequant); + thr = _mm256_srli_epi32(dequant, 2); + // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when + // calculating the zbin mask. + thr = _mm256_sub_epi32(thr, _mm256_set1_epi32(1)); + quant = _mm256_slli_epi32(quant, 1); + round = _mm256_srai_epi32(_mm256_add_epi32(round, _mm256_set1_epi32(1)), 1); + + highbd_quantize_fp_32x32(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + + n_coeffs += step; + + // remove dc constants + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31); + quant = _mm256_permute2x128_si256(quant, quant, 0x31); + round = _mm256_permute2x128_si256(round, round, 0x31); + thr = _mm256_permute2x128_si256(thr, thr, 0x31); + + // AC only loop + while (n_coeffs < 0) { + highbd_quantize_fp_32x32( + &round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, iscan + n_coeffs, + qcoeff_ptr + n_coeffs, dqcoeff_ptr + n_coeffs, &eob_max); + n_coeffs += step; + } + + *eob_ptr = get_max_eob(eob_max); +} #endif // CONFIG_VP9_HIGHBITDEPTH From c9f049fd9164e0b5b950bdb8ac80186787b5b64c Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Fri, 5 Aug 2022 07:40:26 -0700 Subject: [PATCH 379/926] VPX: Add vpx_subtract_block_avx2(). ~1.3x faster than vpx_subtract_block_sse2(). Based on aom_subtract_block_avx2(). 
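For reference, the operation being vectorized is a plain rows-by-cols
subtraction of the prediction block from the source block, widened to 16
bits. A minimal scalar sketch of the vpx_subtract_block() semantics (the
function name here is illustrative; this is not the shipped C code):

  static void subtract_block_ref(int rows, int cols, int16_t *diff,
                                 ptrdiff_t diff_stride, const uint8_t *src,
                                 ptrdiff_t src_stride, const uint8_t *pred,
                                 ptrdiff_t pred_stride) {
    int r, c;
    for (r = 0; r < rows; ++r) {
      /* 8-bit inputs are widened to 16 bits so negative differences fit */
      for (c = 0; c < cols; ++c) diff[c] = (int16_t)src[c] - (int16_t)pred[c];
      diff += diff_stride;
      src += src_stride;
      pred += pred_stride;
    }
  }

The AVX2 path below handles 16-, 32-, and 64-column blocks directly and
falls back to vpx_subtract_block_sse2() for any other width.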
Bug: b/241580104 Change-Id: I17da036363f213d53c6546c3e858e4c3cba44a5b --- test/vp9_subtract_test.cc | 4 ++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- vpx_dsp/x86/subtract_avx2.c | 96 ++++++++++++++++++++++++++++++++++++ 4 files changed, 102 insertions(+), 1 deletion(-) create mode 100644 vpx_dsp/x86/subtract_avx2.c diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc index 211cc6c7ad..f634a032d8 100644 --- a/test/vp9_subtract_test.cc +++ b/test/vp9_subtract_test.cc @@ -133,6 +133,10 @@ INSTANTIATE_TEST_SUITE_P(C, VP9SubtractBlockTest, INSTANTIATE_TEST_SUITE_P(SSE2, VP9SubtractBlockTest, ::testing::Values(vpx_subtract_block_sse2)); #endif +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_avx2)); +#endif #if HAVE_NEON INSTANTIATE_TEST_SUITE_P(NEON, VP9SubtractBlockTest, ::testing::Values(vpx_subtract_block_neon)); diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index dd667195f5..ffe954832d 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -376,6 +376,7 @@ DSP_SRCS-$(HAVE_MMI) += mips/subtract_mmi.c DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c +DSP_SRCS-$(HAVE_AVX2) += x86/subtract_avx2.c DSP_SRCS-$(HAVE_AVX512) += x86/sad4d_avx512.c DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index e7ad640af8..db211ed8ce 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -730,7 +730,7 @@ () # Block subtraction # add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; -specialize qw/vpx_subtract_block neon msa mmi sse2 vsx lsx/; +specialize qw/vpx_subtract_block neon msa mmi sse2 avx2 vsx lsx/; # # Single block SAD diff --git a/vpx_dsp/x86/subtract_avx2.c b/vpx_dsp/x86/subtract_avx2.c new file mode 100644 index 0000000000..4d259ef5c5 --- /dev/null +++ b/vpx_dsp/x86/subtract_avx2.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <immintrin.h>
+#include <stdint.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static VPX_FORCE_INLINE void subtract32_avx2(int16_t *diff_ptr,
+                                             const uint8_t *src_ptr,
+                                             const uint8_t *pred_ptr) {
+  const __m256i s = _mm256_lddqu_si256((const __m256i *)src_ptr);
+  const __m256i p = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+  const __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s));
+  const __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1));
+  const __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p));
+  const __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1));
+  const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+  const __m256i d_1 = _mm256_sub_epi16(s_1, p_1);
+  _mm256_storeu_si256((__m256i *)diff_ptr, d_0);
+  _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d_1);
+}
+
+static VPX_FORCE_INLINE void subtract_block_16xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  int j;
+  for (j = 0; j < rows; ++j) {
+    const __m128i s = _mm_lddqu_si128((const __m128i *)src_ptr);
+    const __m128i p = _mm_lddqu_si128((const __m128i *)pred_ptr);
+    const __m256i s_0 = _mm256_cvtepu8_epi16(s);
+    const __m256i p_0 = _mm256_cvtepu8_epi16(p);
+    const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+    _mm256_storeu_si256((__m256i *)diff_ptr, d_0);
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    diff_ptr += diff_stride;
+  }
+}
+
+static VPX_FORCE_INLINE void subtract_block_32xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  int j;
+  for (j = 0; j < rows; ++j) {
+    subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    diff_ptr += diff_stride;
+  }
+}
+
+static VPX_FORCE_INLINE void subtract_block_64xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  int j;
+  for (j = 0; j < rows; ++j) {
+    subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+    subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    diff_ptr += diff_stride;
+  }
+}
+
+void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
+                             ptrdiff_t diff_stride, const uint8_t *src_ptr,
+                             ptrdiff_t src_stride, const uint8_t *pred_ptr,
+                             ptrdiff_t pred_stride) {
+  switch (cols) {
+    case 16:
+      subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+                               pred_ptr, pred_stride);
+      break;
+    case 32:
+      subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+                               pred_ptr, pred_stride);
+      break;
+    case 64:
+      subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+                               pred_ptr, pred_stride);
+      break;
+    default:
+      vpx_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr,
+                              src_stride, pred_ptr, pred_stride);
+      break;
+  }
+}

From eaf0b5b47e3abe8a9c5e82bf2cb1528d2ed443fc Mon Sep 17 00:00:00 2001
From: Cheng Chen
Date: Fri, 5 Aug 2022 17:32:12 -0700
Subject: [PATCH 380/926] Fix VP9 auto level

The iteration index was wrong, causing the starting level to be chosen as
"LEVEL_5_2", which is intended for videos of large resolution.
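The pitfall, sketched (enum values shown are illustrative of the layout,
not copied from the header): vp9_level_defs[] is indexed by array position
0..VP9_LEVELS-1, while LEVEL_1 is an enum whose numeric value encodes the
level number and does not start at 0:

  /* illustrative values only */
  typedef enum { LEVEL_1 = 10, LEVEL_1_1 = 11, /* ... */ LEVEL_MAX = 255 } VP9_LEVEL;

  for (i = LEVEL_1; i < LEVEL_MAX; ++i)  /* wrong: starts at index 10 */
  for (i = 0; i < VP9_LEVELS; ++i)       /* correct: walks every entry */

Starting at the enum value skipped the low-level entries, so the first
definition examined was the one at index 10, i.e. LEVEL_5_2.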
Change-Id: Id88836981bdcbd7494bd06193d6a433ac75a6d2e --- vp9/encoder/vp9_ratectrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 0852973914..4f356f9aed 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -2584,7 +2584,7 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi, const uint32_t pic_breadth = VPXMAX(cpi->common.width, cpi->common.height); int i; - for (i = LEVEL_1; i < LEVEL_MAX; ++i) { + for (i = 0; i < VP9_LEVELS; ++i) { if (vp9_level_defs[i].max_luma_picture_size >= pic_size && vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) { if (rc->min_gf_interval <= From 4355a392e6b2dec619dde616028b5a91c8917c0c Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 8 Aug 2022 11:28:27 -0700 Subject: [PATCH 381/926] vp9_cx_iface,encoder_encode: only calc ts when img!=NULL avoid calculating the end timestamp when performing a flush to prevent an implicit conversion warning when applying a non-zero offset to a 0 pts used in that case: vp9/vp9_cx_iface.c:1361:50: runtime error: implicit conversion from type 'vpx_codec_pts_t' (aka 'long') of value -15 (64-bit, signed) to type 'unsigned long' changed the value to 18446744073709551601 (64-bit, unsigned) Bug: b/229626362 Change-Id: I68ba19b7d6de35cc185707dfb6b43406b7165035 --- vp9/vp9_cx_iface.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 588d9d502e..02bd2e579b 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1357,8 +1357,6 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, unsigned int lib_flags = 0; YV12_BUFFER_CONFIG sd; int64_t dst_time_stamp = timebase_units_to_ticks(timestamp_ratio, pts); - int64_t dst_end_time_stamp = - timebase_units_to_ticks(timestamp_ratio, pts + duration); size_t size, cx_data_sz; unsigned char *cx_data; @@ -1369,6 +1367,8 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) cpi->b_calculate_psnr = 1; if (img != NULL) { + const int64_t dst_end_time_stamp = + timebase_units_to_ticks(timestamp_ratio, pts + duration); res = image2yuvconfig(img, &sd); // Store the original flags in to the frame buffer. 
Will extract the
@@ -1405,6 +1405,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
       // compute first pass stats
       if (img) {
         int ret;
+        int64_t dst_end_time_stamp;
         vpx_codec_cx_pkt_t fps_pkt;
         ENCODE_FRAME_RESULT encode_frame_result;
         vp9_init_encode_frame_result(&encode_frame_result);
@@ -1430,6 +1431,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
 #endif  // !CONFIG_REALTIME_ONLY
     } else {
       ENCODE_FRAME_RESULT encode_frame_result;
+      int64_t dst_end_time_stamp;
       vp9_init_encode_frame_result(&encode_frame_result);
       while (cx_data_sz >= ctx->cx_data_sz / 2 &&
              -1 != vp9_get_compressed_data(cpi, &lib_flags, &size, cx_data,

From 3cf0a241569efd53fa9a9bd62d963278106816c6 Mon Sep 17 00:00:00 2001
From: Cheng Chen
Date: Fri, 5 Aug 2022 18:06:08 -0700
Subject: [PATCH 382/926] L2E: Add target level in GOP unit tests

Change-Id: Icecc3031e1052bb5a94f6c5957ec5190aae990ba
---
 test/vp9_ext_ratectrl_test.cc | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc
index c954495dff..6687f7fec5 100644
--- a/test/vp9_ext_ratectrl_test.cc
+++ b/test/vp9_ext_ratectrl_test.cc
@@ -497,7 +497,7 @@ vpx_rc_status_t rc_get_gop_decision_short(vpx_rc_model_t rate_ctrl_model,
   ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
   EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
   EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1);
-  EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
+  EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval + 1);
   EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
   EXPECT_EQ(gop_info->allow_alt_ref, 1);
   if (gop_info->is_key_frame) {
@@ -571,7 +571,7 @@ vpx_rc_status_t rc_get_gop_decision_short_no_arf(
   ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
   EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
   EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1);
-  EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
+  EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval + 1);
   EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
   EXPECT_EQ(gop_info->allow_alt_ref, 1);
   if (gop_info->is_key_frame) {
@@ -752,6 +752,7 @@ class ExtRateCtrlTestGOPShort : public ::libvpx_test::EncoderTest,
     if (video->frame() == 0) {
       encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
       encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
+      encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO);

       vpx_rc_funcs_t rc_funcs;
       rc_funcs.rc_type = VPX_RC_GOP_QP;
@@ -799,6 +800,7 @@ class ExtRateCtrlTestGOPShortOverlay
     if (video->frame() == 0) {
       encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
       encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
+      encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_UNKNOWN);

       vpx_rc_funcs_t rc_funcs;
       rc_funcs.rc_type = VPX_RC_GOP_QP;
@@ -847,6 +849,7 @@ class ExtRateCtrlTestGOPShortNoARF
     if (video->frame() == 0) {
       encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
       encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
+      encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO);

       vpx_rc_funcs_t rc_funcs;
       rc_funcs.rc_type = VPX_RC_GOP_QP;

From ec4aa6d1915a92172539f359ef9e1847ae9ff327 Mon Sep 17 00:00:00 2001
From: Cheng Chen
Date: Mon, 8 Aug 2022 10:12:39 -0700
Subject: [PATCH 383/926] Use level defined min gf interval

Assume the level definition of min_gf_interval is the minimum allowed
gf_interval.
We take this level-conformant min_gf_interval instead of adding 1.

Change-Id: I9c7e62f210c95b356e9716579ee4c19638de8e35
---
 test/vp9_ext_ratectrl_test.cc | 6 +++---
 vp9/encoder/vp9_ratectrl.c    | 3 +--
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc
index 6687f7fec5..16e3248f76 100644
--- a/test/vp9_ext_ratectrl_test.cc
+++ b/test/vp9_ext_ratectrl_test.cc
@@ -497,7 +497,7 @@ vpx_rc_status_t rc_get_gop_decision_short(vpx_rc_model_t rate_ctrl_model,
   ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
   EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
   EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1);
-  EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval + 1);
+  EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
   EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
   EXPECT_EQ(gop_info->allow_alt_ref, 1);
   if (gop_info->is_key_frame) {
@@ -571,7 +571,7 @@ vpx_rc_status_t rc_get_gop_decision_short_no_arf(
   ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
   EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
   EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1);
-  EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval + 1);
+  EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
   EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
   EXPECT_EQ(gop_info->allow_alt_ref, 1);
   if (gop_info->is_key_frame) {
@@ -800,7 +800,7 @@ class ExtRateCtrlTestGOPShortOverlay
     if (video->frame() == 0) {
       encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
       encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
-      encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_UNKNOWN);
+      encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO);

       vpx_rc_funcs_t rc_funcs;
       rc_funcs.rc_type = VPX_RC_GOP_QP;
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 4f356f9aed..1ddf64d41a 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -2589,8 +2589,7 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi,
           vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) {
         if (rc->min_gf_interval <=
             (int)vp9_level_defs[i].min_altref_distance) {
-          rc->min_gf_interval =
-              (int)vp9_level_defs[i].min_altref_distance + 1;
+          rc->min_gf_interval = (int)vp9_level_defs[i].min_altref_distance;
           rc->max_gf_interval =
               VPXMAX(rc->max_gf_interval, rc->min_gf_interval);
         }

From 3c2b21c22e5d12fc790324ac2f78c029ccf694b3 Mon Sep 17 00:00:00 2001
From: Scott LaVarnway
Date: Mon, 8 Aug 2022 15:09:32 -0700
Subject: [PATCH 384/926] VPX: Fix vp9_quantize_fp_avx2() VS build error.

Add a build fix for _mm256_extract_epi16() being undefined.
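For older MSVC the same lane-0 read can be composed from intrinsics that
are available; a sketch of the equivalent fallback (helper name is
hypothetical):

  /* _mm256_extracti128_si256(v, 0) selects the low 128-bit half; the low
   * 16 bits of a 32-bit extract then yield lane 0. */
  static uint16_t extract_lane0_epi16(__m256i v) {
    return (uint16_t)(_mm_cvtsi128_si32(_mm256_extracti128_si256(v, 0)) & 0xffff);
  }

Versions of MSVC before 1910 (VS2017) do not provide
_mm256_extract_epi16(), hence the _MSC_VER guard.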
Bug: b/237714063 Change-Id: I855b1828ce1b6b2b2f063fe097999481881bf074 --- vp9/encoder/x86/vp9_quantize_avx2.c | 4 ++++ vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c index 7afeba32cd..15ce71c5c6 100644 --- a/vp9/encoder/x86/vp9_quantize_avx2.c +++ b/vp9/encoder/x86/vp9_quantize_avx2.c @@ -67,7 +67,11 @@ static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob256) { eob = _mm256_max_epi16(eob, eob_s); eob_s = _mm256_shufflelo_epi16(eob, 1); eob = _mm256_max_epi16(eob, eob_s); +#if defined(_MSC_VER) && (_MSC_VER < 1910) + return _mm_cvtsi128_si32(_mm256_extracti128_si256(eob, 0)) & 0xffff; +#else return (uint16_t)_mm256_extract_epi16(eob, 0); +#endif } static VPX_FORCE_INLINE void quantize_fp_16( diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c index ec1110ff8c..cbc715c046 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -94,7 +94,11 @@ static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob) { eob = _mm256_max_epi16(eob, eob_s); eob_s = _mm256_shufflelo_epi16(eob, 1); eob = _mm256_max_epi16(eob, eob_s); +#if defined(_MSC_VER) && (_MSC_VER < 1910) + return _mm_cvtsi128_si32(_mm256_extracti128_si256(eob, 0)) & 0xffff; +#else return (uint16_t)_mm256_extract_epi16(eob, 0); +#endif } static VPX_FORCE_INLINE void quantize(const __m256i *qp, From 1e07619a0afd63dd746c97766f63cf0a9d9c7e65 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Wed, 10 Aug 2022 05:29:50 -0700 Subject: [PATCH 385/926] VPX: vp9_quantize_fp_neon() cleanup. No change in performance. Bug: b/237714063 Change-Id: I868cda7acb0de840fbc85b23f3e36c50b39c331b --- vp9/encoder/arm/neon/vp9_quantize_neon.c | 167 +++++++++++++---------- 1 file changed, 92 insertions(+), 75 deletions(-) diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c index 236c3176c7..46c772da2e 100644 --- a/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -26,9 +26,8 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/vpx_dsp_common.h" -static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, - const int16x8_t dequant, - tran_low_t *dqcoeff) { +static VPX_FORCE_INLINE void calculate_dqcoeff_and_store( + const int16x8_t qcoeff, const int16x8_t dequant, tran_low_t *dqcoeff) { const int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); const int32x4_t dqcoeff_1 = @@ -42,6 +41,84 @@ static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, #endif // CONFIG_VP9_HIGHBITDEPTH } +static VPX_FORCE_INLINE int16x8_t get_max_lane_eob(const int16_t *iscan_ptr, + int16x8_t v_eobmax, + uint16x8_t v_nz_mask) { + const int16x8_t v_iscan = vld1q_s16(&iscan_ptr[0]); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1)); + const int16x8_t v_nz_iscan = + vbslq_s16(v_nz_mask, vdupq_n_s16(0), v_iscan_plus1); + return vmaxq_s16(v_eobmax, v_nz_iscan); +} + +static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) { +#ifdef __aarch64__ + return (uint16_t)vmaxvq_s16(v_eobmax); +#else + const int16x4_t v_eobmax_3210 = + vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax)); + const int64x1_t v_eobmax_xx32 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); + const int16x4_t v_eobmax_tmp = + vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); + const int64x1_t v_eobmax_xxx3 = + 
vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); + const int16x4_t v_eobmax_final = + vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); + + return (uint16_t)vget_lane_s16(v_eobmax_final, 0); +#endif // __aarch64__ +} + +static VPX_FORCE_INLINE void load_fp_values(const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *dequant_ptr, + int16x8_t *round, int16x8_t *quant, + int16x8_t *dequant) { + *round = vld1q_s16(round_ptr); + *quant = vld1q_s16(quant_ptr); + *dequant = vld1q_s16(dequant_ptr); +} + +static VPX_FORCE_INLINE void update_fp_values(int16x8_t *v_round, + int16x8_t *v_quant, + int16x8_t *v_dequant) { +#ifdef __aarch64__ + *v_round = vdupq_laneq_s16(*v_round, 1); + *v_quant = vdupq_laneq_s16(*v_quant, 1); + *v_dequant = vdupq_laneq_s16(*v_dequant, 1); +#else + *v_round = vdupq_lane_s16(vget_low_s16(*v_round), 1); + *v_quant = vdupq_lane_s16(vget_low_s16(*v_quant), 1); + *v_dequant = vdupq_lane_s16(vget_low_s16(*v_dequant), 1); +#endif +} + +static VPX_FORCE_INLINE void quantize_fp_8( + const int16x8_t *v_round, const int16x8_t *v_quant, + const int16x8_t *v_dequant, const tran_low_t *coeff_ptr, + const int16_t *iscan_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + int16x8_t *v_eobmax) { + const int16x8_t v_zero = vdupq_n_s16(0); + const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_abs = vabsq_s16(v_coeff); + const int16x8_t v_tmp = vqaddq_s16(v_abs, *v_round); + const int32x4_t v_tmp_lo = + vmull_s16(vget_low_s16(v_tmp), vget_low_s16(*v_quant)); + const int32x4_t v_tmp_hi = + vmull_s16(vget_high_s16(v_tmp), vget_high_s16(*v_quant)); + const int16x8_t v_tmp2 = + vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + calculate_dqcoeff_and_store(v_qcoeff, *v_dequant, dqcoeff_ptr); + store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff); + + *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask); +} + void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, @@ -50,84 +127,24 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. 
int i; - const int16x8_t v_zero = vdupq_n_s16(0); - const int16x8_t v_one = vdupq_n_s16(1); - int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); - int16x8_t v_round = vmovq_n_s16(round_ptr[1]); - int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]); - int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]); - + int16x8_t v_eobmax = vdupq_n_s16(-1); + int16x8_t v_round, v_quant, v_dequant; (void)scan; - // adjust for dc - v_round = vsetq_lane_s16(round_ptr[0], v_round, 0); - v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0); - v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0); + load_fp_values(round_ptr, quant_ptr, dequant_ptr, &v_round, &v_quant, + &v_dequant); // process dc and the first seven ac coeffs - { - const int16x8_t v_iscan = vld1q_s16(&iscan[0]); - const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); - const int16x8_t v_abs = vabsq_s16(v_coeff); - const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round); - const int32x4_t v_tmp_lo = - vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); - const int32x4_t v_tmp_hi = - vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); - const int16x8_t v_tmp2 = - vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); - const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); - const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); - const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); - const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); - const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); - calculate_dqcoeff_and_store(v_qcoeff, v_dequant, dqcoeff_ptr); - v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); - store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff); - v_round = vmovq_n_s16(round_ptr[1]); - v_quant = vmovq_n_s16(quant_ptr[1]); - v_dequant = vmovq_n_s16(dequant_ptr[1]); - } + quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr, iscan, qcoeff_ptr, + dqcoeff_ptr, &v_eobmax); + // now process the rest of the ac coeffs + update_fp_values(&v_round, &v_quant, &v_dequant); for (i = 8; i < count; i += 8) { - const int16x8_t v_iscan = vld1q_s16(&iscan[i]); - const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr + i); - const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); - const int16x8_t v_abs = vabsq_s16(v_coeff); - const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round); - const int32x4_t v_tmp_lo = - vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); - const int32x4_t v_tmp_hi = - vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); - const int16x8_t v_tmp2 = - vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); - const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); - const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); - const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); - const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); - const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); - calculate_dqcoeff_and_store(v_qcoeff, v_dequant, dqcoeff_ptr + i); - v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); - store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff); + quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr + i, iscan + i, + qcoeff_ptr + i, dqcoeff_ptr + i, &v_eobmax); } -#ifdef __aarch64__ - *eob_ptr = vmaxvq_s16(v_eobmax_76543210); -#else - { - const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210), - vget_high_s16(v_eobmax_76543210)); - const int64x1_t v_eobmax_xx32 = - 
vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); - const int16x4_t v_eobmax_tmp = - vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); - const int64x1_t v_eobmax_xxx3 = - vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); - const int16x4_t v_eobmax_final = - vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); - - *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0); - } -#endif // __aarch64__ + + *eob_ptr = get_max_eob(v_eobmax); } static INLINE int32x4_t extract_sign_bit(int32x4_t a) { From 33b385ec4e850b2663a0048667c34289e38473d2 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 12 Aug 2022 22:24:39 -0700 Subject: [PATCH 386/926] configure: add -Wc++{14,17,20}-extensions the snapshot of googletest and the test files themselves are targeting c++11 currently; these warnings are supported by recent versions of clang Change-Id: I5d36b3bd4058ba1610f0c8b27cad27aadee85939 --- configure | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/configure b/configure index beea650329..425ab3cb30 100755 --- a/configure +++ b/configure @@ -666,6 +666,11 @@ process_toolchain() { check_add_cxxflags -Wno-psabi fi + # Enforce C++11 compatibility. + check_add_cxxflags -Wc++14-extensions + check_add_cxxflags -Wc++17-extensions + check_add_cxxflags -Wc++20-extensions + # disable some warnings specific to libyuv. check_cxxflags -Wno-missing-declarations \ && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-declarations" From 14b8eaf7da0ceb3dc802cae9cd344b8788154077 Mon Sep 17 00:00:00 2001 From: James Zern Date: Sun, 14 Aug 2022 15:43:36 -0700 Subject: [PATCH 387/926] examples/svc_encodeframe.c: rm empty {}s in switch these have been unnecessary since: 0e97e7049 remove spatial svc experiment Bug: b/229626362 Change-Id: I57528af4dcb9092b752161c8eaba2e2808c29c5f --- examples/svc_encodeframe.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/examples/svc_encodeframe.c b/examples/svc_encodeframe.c index 08bda0e5c9..003096e701 100644 --- a/examples/svc_encodeframe.c +++ b/examples/svc_encodeframe.c @@ -552,11 +552,8 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, iter = NULL; while ((cx_pkt = vpx_codec_get_cx_data(codec_ctx, &iter))) { switch (cx_pkt->kind) { - case VPX_CODEC_PSNR_PKT: { - } - ++si->psnr_pkt_received; - break; - default: { break; } + case VPX_CODEC_PSNR_PKT: ++si->psnr_pkt_received; break; + default: break; } } From 763167aac7e58452dd6db5ce4db6703b88a20dec Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Mon, 15 Aug 2022 06:14:49 -0700 Subject: [PATCH 388/926] VPX: Add vp9_highbd_quantize_fp_neon(). Up to 4.1x faster than vp9_highbd_quantize_fp_c() for full calculations. ~1.3% overall encoder improvement for the test clip used. 
Bug: b/237714063
Change-Id: I8c6466bdbcf1c398b1d8b03cab4165c1d8556b0c
---
 test/vp9_quantize_test.cc                | 11 +++
 vp9/common/vp9_rtcd_defs.pl              |  2 +-
 vp9/encoder/arm/neon/vp9_quantize_neon.c | 85 ++++++++++++++++++++++++
 3 files changed, 97 insertions(+), 1 deletion(-)

diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc
index b936350236..5c75c4b082 100644
--- a/test/vp9_quantize_test.cc
+++ b/test/vp9_quantize_test.cc
@@ -616,6 +616,16 @@ INSTANTIATE_TEST_SUITE_P(
 #endif  // HAVE_AVX2

 #if HAVE_NEON
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+    NEON, VP9QuantizeTest,
+    ::testing::Values(make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>,
+                                 &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8,
+                                 16, true),
+                      make_tuple(&QuantFPWrapper<vp9_highbd_quantize_fp_neon>,
+                                 &QuantFPWrapper<vp9_highbd_quantize_fp_c>,
+                                 VPX_BITS_12, 16, true)));
+#else
 INSTANTIATE_TEST_SUITE_P(
     NEON, VP9QuantizeTest,
     ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c,
@@ -629,6 +639,7 @@ INSTANTIATE_TEST_SUITE_P(
                       make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>,
                                  &QuantFPWrapper<vp9_quantize_fp_32x32_c>,
                                  VPX_BITS_8, 32, true)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_NEON

 #if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 1b23e8a66f..62a597af5a 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -196,7 +196,7 @@ ()
   # ENCODEMB INVOKE

   add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_highbd_quantize_fp avx2/;
+  specialize qw/vp9_highbd_quantize_fp avx2 neon/;

   add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" ;
   specialize qw/vp9_highbd_quantize_fp_32x32 avx2/;
diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c
index 236c3176c7..a085372868 100644
--- a/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -278,3 +278,88 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
 #endif  // __aarch64__
   }
 }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static VPX_FORCE_INLINE uint16x4_t
+highbd_quantize_fp_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+                     tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32,
+                     int32x4_t v_dequant_s32, int32x4_t v_round_s32) {
+  const int32x4_t v_coeff = vld1q_s32(coeff_ptr);
+  const int32x4_t v_coeff_sign =
+      vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0)));
+  const int32x4_t v_abs_coeff = vabsq_s32(v_coeff);
+  const int32x4_t v_tmp = vaddq_s32(v_abs_coeff, v_round_s32);
+  // const int abs_qcoeff = (int)((tmp * quant) >> 16);
+  const int32x4_t v_abs_qcoeff = vqdmulhq_s32(v_tmp, v_quant_s32);
+  // qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+  const int32x4_t v_qcoeff =
+      vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign);
+  const int32x4_t v_abs_dqcoeff = vmulq_s32(v_abs_qcoeff, v_dequant_s32);
+  // dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+  const int32x4_t v_dqcoeff =
+      vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign);
+
+  vst1q_s32(qcoeff_ptr, v_qcoeff);
+  vst1q_s32(dqcoeff_ptr, v_dqcoeff);
+
+  // Packed nz_qcoeff_mask.
Used to find eob. + return vmovn_u32(vceqq_s32(v_abs_qcoeff, vdupq_n_s32(0))); +} + +void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, + const int16_t *round_ptr, + const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const int16x4_t v_zero = vdup_n_s16(0); + const int16x4_t v_quant = vld1_s16(quant_ptr); + const int16x4_t v_dequant = vld1_s16(dequant_ptr); + const int16x4_t v_round = vld1_s16(round_ptr); + int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero); + int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15); + int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero); + uint16x4_t v_mask_lo, v_mask_hi; + int16x8_t v_eobmax = vdupq_n_s16(-1); + + (void)scan; + + // DC and first 3 AC + v_mask_lo = highbd_quantize_fp_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, + v_quant_s32, v_dequant_s32, v_round_s32); + + // overwrite the DC constants with AC constants + v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1); + v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1); + v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1); + + // 4 more AC + v_mask_hi = + highbd_quantize_fp_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, + v_quant_s32, v_dequant_s32, v_round_s32); + + // Find the max lane eob for the first 8 coeffs. + v_eobmax = + get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); + + count -= 8; + do { + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + iscan += 8; + v_mask_lo = highbd_quantize_fp_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, + v_quant_s32, v_dequant_s32, v_round_s32); + v_mask_hi = + highbd_quantize_fp_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, + v_quant_s32, v_dequant_s32, v_round_s32); + // Find the max lane eob for 8 coeffs. + v_eobmax = + get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); + count -= 8; + } while (count); + + *eob_ptr = get_max_eob(v_eobmax); +} +#endif // CONFIG_VP9_HIGHBITDEPTH From c13788bae9f19fe307cef0df9e638ca84bdd4e50 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Fri, 12 Aug 2022 07:15:33 -0700 Subject: [PATCH 389/926] vp9_quantize_fp_32x32_neon() cleanup. No change in performance. Bug: b/237714063 Change-Id: If6ad5fc27de4babe0bfff3fdbb4b7fd99a0544ab --- vp9/encoder/arm/neon/vp9_quantize_neon.c | 161 ++++++++--------------- 1 file changed, 57 insertions(+), 104 deletions(-) diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c index 46c772da2e..ec749d5397 100644 --- a/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -151,52 +151,30 @@ static INLINE int32x4_t extract_sign_bit(int32x4_t a) { return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31)); } -void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, - const int16_t *round_ptr, - const int16_t *quant_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - const int16x8_t one = vdupq_n_s16(1); - const int16x8_t neg_one = vdupq_n_s16(-1); - - // ROUND_POWER_OF_TWO(round_ptr[], 1) - const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); - const int16x8_t quant = vld1q_s16(quant_ptr); - const int16x4_t dequant = vld1_s16(dequant_ptr); - // dequant >> 2 is used similar to zbin as a threshold. 
- const int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2); +static VPX_FORCE_INLINE void quantize_fp_32x32_8( + const int16x8_t *v_round, const int16x8_t *v_quant, + const int16x8_t *v_dequant, const int16x8_t *dequant_thresh, + const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int16x8_t *v_eobmax) { + const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_coeff_abs = vabsq_s16(v_coeff); + const int16x8_t v_thr_mask = + vreinterpretq_s16_u16(vcgeq_s16(v_coeff_abs, *dequant_thresh)); + const int16x8_t v_tmp_rnd = + vandq_s16(vqaddq_s16(v_coeff_abs, *v_round), v_thr_mask); + const int16x8_t v_abs_qcoeff = vqdmulhq_s16(v_tmp_rnd, *v_quant); + const int16x8_t v_qcoeff = + vsubq_s16(veorq_s16(v_abs_qcoeff, v_coeff_sign), v_coeff_sign); + const uint16x8_t v_nz_mask = vceqq_s16(v_abs_qcoeff, vdupq_n_s16(0)); - // Process dc and the first seven ac coeffs. - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); - const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); - const int16x8_t coeff_abs = vabsq_s16(coeff); - const int16x8_t dequant_mask = - vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh)); - - int16x8_t qcoeff = vqaddq_s16(coeff_abs, round); int32x4_t dqcoeff_0, dqcoeff_1; - uint16x8_t eob_max; - (void)scan; - (void)count; - - // coeff * quant_ptr[]) >> 15 - qcoeff = vqdmulhq_s16(qcoeff, quant); - - // Restore sign. - qcoeff = veorq_s16(qcoeff, coeff_sign); - qcoeff = vsubq_s16(qcoeff, coeff_sign); - qcoeff = vandq_s16(qcoeff, dequant_mask); - - // qcoeff * dequant[] / 2 - dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), dequant); - dqcoeff_1 = vmull_n_s16(vget_high_s16(qcoeff), dequant_ptr[1]); - + dqcoeff_0 = vmull_s16(vget_low_s16(v_qcoeff), vget_low_s16(*v_dequant)); + dqcoeff_1 = vmull_s16(vget_high_s16(v_qcoeff), vget_high_s16(*v_dequant)); // Add 1 if negative to round towards zero because the C uses division. dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + #if CONFIG_VP9_HIGHBITDEPTH vst1q_s32(dqcoeff_ptr, vshrq_n_s32(dqcoeff_0, 1)); vst1q_s32(dqcoeff_ptr + 4, vshrq_n_s32(dqcoeff_1, 1)); @@ -205,76 +183,51 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, vshrn_n_s32(dqcoeff_1, 1))); #endif - eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); + store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff); - store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask); +} - iscan += 8; - coeff_ptr += 8; - qcoeff_ptr += 8; - dqcoeff_ptr += 8; +void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, + const int16_t *round_ptr, + const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int16x8_t eob_max = vdupq_n_s16(-1); + // ROUND_POWER_OF_TWO(round_ptr[], 1) + int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); + int16x8_t quant = vld1q_s16(quant_ptr); + int16x8_t dequant = vld1q_s16(dequant_ptr); + // dequant >> 2 is used similar to zbin as a threshold. 
+ int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2); + int i; - { - int i; - const int16x8_t round = vrshrq_n_s16(vmovq_n_s16(round_ptr[1]), 1); - const int16x8_t quant = vmovq_n_s16(quant_ptr[1]); - const int16x8_t dequant_thresh = - vshrq_n_s16(vmovq_n_s16(dequant_ptr[1]), 2); - - // Process the rest of the ac coeffs. - for (i = 8; i < 32 * 32; i += 8) { - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); - const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); - const int16x8_t coeff_abs = vabsq_s16(coeff); - const int16x8_t dequant_mask = - vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh)); - - int16x8_t qcoeff = vqaddq_s16(coeff_abs, round); - int32x4_t dqcoeff_0, dqcoeff_1; - - qcoeff = vqdmulhq_s16(qcoeff, quant); - qcoeff = veorq_s16(qcoeff, coeff_sign); - qcoeff = vsubq_s16(qcoeff, coeff_sign); - qcoeff = vandq_s16(qcoeff, dequant_mask); - - dqcoeff_0 = vmull_n_s16(vget_low_s16(qcoeff), dequant_ptr[1]); - dqcoeff_1 = vmull_n_s16(vget_high_s16(qcoeff), dequant_ptr[1]); - - dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); - dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + (void)scan; + (void)count; -#if CONFIG_VP9_HIGHBITDEPTH - vst1q_s32(dqcoeff_ptr, vshrq_n_s32(dqcoeff_0, 1)); - vst1q_s32(dqcoeff_ptr + 4, vshrq_n_s32(dqcoeff_1, 1)); -#else - store_s16q_to_tran_low( - dqcoeff_ptr, - vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1))); -#endif + // Process dc and the first seven ac coeffs. + quantize_fp_32x32_8(&round, &quant, &dequant, &dequant_thresh, coeff_ptr, + iscan, qcoeff_ptr, dqcoeff_ptr, &eob_max); - eob_max = - vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); + update_fp_values(&round, &quant, &dequant); + dequant_thresh = vdupq_lane_s16(vget_low_s16(dequant_thresh), 1); - store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + iscan += 8; + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; - iscan += 8; - coeff_ptr += 8; - qcoeff_ptr += 8; - dqcoeff_ptr += 8; - } + // Process the rest of the ac coeffs. 
+ for (i = 8; i < 32 * 32; i += 8) { + quantize_fp_32x32_8(&round, &quant, &dequant, &dequant_thresh, coeff_ptr, + iscan, qcoeff_ptr, dqcoeff_ptr, &eob_max); -#ifdef __aarch64__ - *eob_ptr = vmaxvq_u16(eob_max); -#else - { - const uint16x4_t eob_max_0 = - vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); - const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); - const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); - vst1_lane_u16(eob_ptr, eob_max_2, 0); - } -#endif // __aarch64__ + iscan += 8; + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; } + + *eob_ptr = get_max_eob(eob_max); } From fecec6293f622c1df36dbfd131d34956b4b712d8 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 15 Aug 2022 17:26:41 -0700 Subject: [PATCH 390/926] simple_encode.cc: clear -Wextra-semi-stmt warnings fixes warnings of the form: ../vp9/simple_encode.cc:755:48: warning: empty expression statement has no effect; remove unnecessary ';' to silence this warning [-Wextra-semi-stmt] SET_STRUCT_VALUE(config, oxcf, ret, key_freq); Bug: b/229626362 Change-Id: I1c9b0ae9927cdd7c31da000633bcb6e2b8242cd4 --- vp9/simple_encode.cc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index 654699e1b2..f42912d35b 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -744,10 +744,12 @@ static void UpdateGroupOfPicture(const VP9_COMP *cpi, int start_coding_index, } #define SET_STRUCT_VALUE(config, structure, ret, field) \ - if (strcmp(config.name, #field) == 0) { \ - structure->field = atoi(config.value); \ - ret = 1; \ - } + do { \ + if (strcmp(config.name, #field) == 0) { \ + structure->field = atoi(config.value); \ + ret = 1; \ + } \ + } while (false) static void UpdateEncodeConfig(const EncodeConfig &config, VP9EncoderConfig *oxcf) { From 37a3999f5a70e5e88c6f22030ef8bb106990a8d7 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Tue, 16 Aug 2022 06:12:21 -0700 Subject: [PATCH 391/926] Add vp9_highbd_quantize_fp_32x32_neon(). Up to 2.6x faster than vp9_highbd_quantize_fp_32x32_c() for full calculations. 
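Relative to the 16x16 path, the 32x32 ("log_scale == 1") quantizer halves
the rounding term, zeroes coefficients below a dequant-derived threshold,
and halves the dequantized result; in scalar form (a sketch following the
C comments kept in the intrinsics below):

  const int64_t tmp = ((int64_t)abs_coeff << 2) >= dequant_ptr[rc != 0]
                          ? abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1)
                          : 0;
  const int abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> 15);
  const int abs_dqcoeff = (abs_qcoeff * dequant_ptr[rc != 0]) >> 1;

The signs are then restored with the usual (x ^ coeff_sign) - coeff_sign
pattern, as in the 16x16 sketch above.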
Bug: b/237714063
Change-Id: Icfeff2ad4dcd57d0ceb47fe04789710807b9cbad
---
 test/vp9_quantize_test.cc                | 15 ++--
 vp9/common/vp9_rtcd_defs.pl              |  2 +-
 vp9/encoder/arm/neon/vp9_quantize_neon.c | 92 ++++++++++++++++++++++++
 3 files changed, 102 insertions(+), 7 deletions(-)

diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc
index 5c75c4b082..48c8180366 100644
--- a/test/vp9_quantize_test.cc
+++ b/test/vp9_quantize_test.cc
@@ -619,12 +619,15 @@ INSTANTIATE_TEST_SUITE_P(
 #if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_SUITE_P(
     NEON, VP9QuantizeTest,
-    ::testing::Values(make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>,
-                                 &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8,
-                                 16, true),
-                      make_tuple(&QuantFPWrapper<vp9_highbd_quantize_fp_neon>,
-                                 &QuantFPWrapper<vp9_highbd_quantize_fp_c>,
-                                 VPX_BITS_12, 16, true)));
+    ::testing::Values(
+        make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>,
+                   &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true),
+        make_tuple(&QuantFPWrapper<vp9_highbd_quantize_fp_neon>,
+                   &QuantFPWrapper<vp9_highbd_quantize_fp_c>, VPX_BITS_12, 16,
+                   true),
+        make_tuple(&QuantFPWrapper<vp9_highbd_quantize_fp_32x32_neon>,
+                   &QuantFPWrapper<vp9_highbd_quantize_fp_32x32_c>, VPX_BITS_12,
+                   32, true)));
 #else
 INSTANTIATE_TEST_SUITE_P(
     NEON, VP9QuantizeTest,
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 62a597af5a..4290c2380e 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -199,7 +199,7 @@ ()
   specialize qw/vp9_highbd_quantize_fp avx2 neon/;

   add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" ;
-  specialize qw/vp9_highbd_quantize_fp_32x32 avx2/;
+  specialize qw/vp9_highbd_quantize_fp_32x32 avx2 neon/;

   # fdct functions
   add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c
index 96dee5c6c1..945fd522e8 100644
--- a/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -315,4 +315,96 @@ void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,

   *eob_ptr = get_max_eob(v_eobmax);
 }
+
+static VPX_FORCE_INLINE uint16x4_t
+highbd_quantize_fp_32x32_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+                           tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32,
+                           int32x4_t v_dequant_s32, int32x4_t v_round_s32) {
+  const int32x4_t v_coeff = vld1q_s32(coeff_ptr);
+  const int32x4_t v_coeff_sign =
+      vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0)));
+  const int32x4_t v_abs_coeff = vabsq_s32(v_coeff);
+  // ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01])
+  const int32x4_t v_abs_coeff_scaled = vshlq_n_s32(v_abs_coeff, 2);
+  const uint32x4_t v_mask = vcgeq_s32(v_abs_coeff_scaled, v_dequant_s32);
+  // const int64_t tmp = vmask ? (int64_t)abs_coeff + log_scaled_round : 0
+  const int32x4_t v_tmp = vandq_s32(vaddq_s32(v_abs_coeff, v_round_s32),
+                                    vreinterpretq_s32_u32(v_mask));
+  // const int abs_qcoeff = (int)((tmp * quant) >> (16 - log_scale));
+  const int32x4_t v_abs_qcoeff =
+      vqdmulhq_s32(vshlq_n_s32(v_tmp, 1), v_quant_s32);
+  // qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+  const int32x4_t v_qcoeff =
+      vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign);
+  // vshlq_s32 will shift right if shift value is negative.
+ const int32x4_t v_abs_dqcoeff = + vshrq_n_s32(vmulq_s32(v_abs_qcoeff, v_dequant_s32), 1); + // dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + const int32x4_t v_dqcoeff = + vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign); + + vst1q_s32(qcoeff_ptr, v_qcoeff); + vst1q_s32(dqcoeff_ptr, v_dqcoeff); + + // Packed nz_qcoeff_mask. Used to find eob. + return vmovn_u32(vceqq_s32(v_abs_qcoeff, vdupq_n_s32(0))); +} + +void vp9_highbd_quantize_fp_32x32_neon( + const tran_low_t *coeff_ptr, intptr_t count, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + const int16x4_t v_quant = vld1_s16(quant_ptr); + const int16x4_t v_dequant = vld1_s16(dequant_ptr); + const int16x4_t v_zero = vdup_n_s16(0); + const int16x4_t v_round = + vqrdmulh_n_s16(vld1_s16(round_ptr), (int16_t)(1 << 14)); + int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero); + int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15); + int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero); + uint16x4_t v_mask_lo, v_mask_hi; + int16x8_t v_eobmax = vdupq_n_s16(-1); + + (void)scan; + + // DC and first 3 AC + v_mask_lo = + highbd_quantize_fp_32x32_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, + v_quant_s32, v_dequant_s32, v_round_s32); + + // overwrite the DC constants with AC constants + v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1); + v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1); + v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1); + + // 4 more AC + v_mask_hi = + highbd_quantize_fp_32x32_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, + v_quant_s32, v_dequant_s32, v_round_s32); + + // Find the max lane eob for the first 8 coeffs. + v_eobmax = + get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); + + count -= 8; + do { + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + iscan += 8; + v_mask_lo = + highbd_quantize_fp_32x32_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, + v_quant_s32, v_dequant_s32, v_round_s32); + v_mask_hi = highbd_quantize_fp_32x32_4(coeff_ptr + 4, qcoeff_ptr + 4, + dqcoeff_ptr + 4, v_quant_s32, + v_dequant_s32, v_round_s32); + // Find the max lane eob for 8 coeffs. 
+ v_eobmax = + get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); + count -= 8; + } while (count); + + *eob_ptr = get_max_eob(v_eobmax); +} #endif // CONFIG_VP9_HIGHBITDEPTH From d22e5a49e38a1749afdbff167dec81fb5061c5c6 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 16 Aug 2022 13:57:25 -0700 Subject: [PATCH 392/926] configure: add -Wno-pass-failed for libyuv with certain optimization flags or sanitizers enabled some code may fail to vectorize: third_party/libyuv/source/row_common.cc:3178:7: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning] this was observed with integer/undefined sanitizers using clang 11/13 Bug: b/229626362 Change-Id: I01595c641763c4cd4242e02f2cc5cbabfe69d03e --- configure | 2 ++ 1 file changed, 2 insertions(+) diff --git a/configure b/configure index 425ab3cb30..1b850b5e04 100755 --- a/configure +++ b/configure @@ -676,6 +676,8 @@ process_toolchain() { && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-declarations" check_cxxflags -Wno-missing-prototypes \ && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-prototypes" + check_cxxflags -Wno-pass-failed \ + && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-pass-failed" check_cxxflags -Wno-unused-parameter \ && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-unused-parameter" fi From c1fb6c6624072cef4785689d4f3ec02bff52266d Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 16 Aug 2022 16:48:00 -0700 Subject: [PATCH 393/926] variance_sse2.c: add some missing casts quiets integer sanitizer warnings of the form: ../vpx_dsp/x86/variance_sse2.c:100:10: runtime error: implicit conversion from type 'unsigned int' of value 4294966272 (32-bit, unsigned) to type 'int' changed the value to -1024 (32-bit, signed) Bug: b/229626362 Change-Id: I150cc0a6a6b85143c3bf96886686fe3a40897db5 --- vpx_dsp/x86/variance_sse2.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c index fedc8b84e5..d6eb12da1a 100644 --- a/vpx_dsp/x86/variance_sse2.c +++ b/vpx_dsp/x86/variance_sse2.c @@ -85,7 +85,7 @@ static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum, vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); vsum = _mm_unpacklo_epi16(vsum, vsum); vsum = _mm_srai_epi32(vsum, 16); - *sum = add32x4_sse2(vsum); + *sum = (int)add32x4_sse2(vsum); } static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) { @@ -97,7 +97,7 @@ static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) { // Can handle 1024 pixels' diff sum (such as 32x32) static INLINE int sum_final_sse2(const __m128i sum) { const __m128i t = sum_to_32bit_sse2(sum); - return add32x4_sse2(t); + return (int)add32x4_sse2(t); } static INLINE void variance4_sse2(const uint8_t *src_ptr, const int src_stride, @@ -349,7 +349,7 @@ unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); } *sse = add32x4_sse2(vsse); - sum = add32x4_sse2(vsum); + sum = (int)add32x4_sse2(vsum); return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); } @@ -369,7 +369,7 @@ unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); } *sse = add32x4_sse2(vsse); - sum = add32x4_sse2(vsum); + sum = (int)add32x4_sse2(vsum); return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); } @@ 
-389,7 +389,7 @@ unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); } *sse = add32x4_sse2(vsse); - sum = add32x4_sse2(vsum); + sum = (int)add32x4_sse2(vsum); return *sse - (unsigned int)(((int64_t)sum * sum) >> 12); } From d939886809d1bf8428d7bfb515fa8ff544c44fe7 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 16 Aug 2022 16:52:06 -0700 Subject: [PATCH 394/926] load_unaligned_u32: use an int w/_mm_cvtsi32_si128 this matches the type of the function parameter; quiets integer sanitizer warnings of the form: implicit conversion from type 'uint32_t' (aka 'unsigned int') of value 3215646151 (32-bit, unsigned) to type 'int' changed the value to -1079321145 (32-bit, signed) Bug: b/229626362 Change-Id: Ia9a5dc5e1f57cbf4f8f8fa457bb674ef43369d37 --- vpx_dsp/x86/mem_sse2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vpx_dsp/x86/mem_sse2.h b/vpx_dsp/x86/mem_sse2.h index 6df4773f73..031f361a41 100644 --- a/vpx_dsp/x86/mem_sse2.h +++ b/vpx_dsp/x86/mem_sse2.h @@ -27,7 +27,7 @@ static INLINE int32_t loadu_int32(const void *src) { } static INLINE __m128i load_unaligned_u32(const void *a) { - uint32_t val; + int val; memcpy(&val, a, sizeof(val)); return _mm_cvtsi32_si128(val); } From b77b6b68d3e7bf54db7ac2996a6b103cafa7c32c Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 16 Aug 2022 17:28:08 -0700 Subject: [PATCH 395/926] highbd_quantize_intrin_sse2: quiet int sanitizer warnings add a missing cast in ^ operations; quiets warnings of the form: implicit conversion from type 'int' of value -1 (32-bit, signed) to type 'unsigned int' changed the value to 4294967295 (32-bit, unsigned) Bug: b/229626362 Change-Id: I56f74981050b2c9d00bad20e68f1b73ce7454729 --- vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index 4535a0f7a2..1264fbed22 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -82,7 +82,8 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; const uint32_t abs_qcoeff = (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); - qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; + qcoeff_ptr[k] = + (int)(abs_qcoeff ^ (uint32_t)coeff_sign[j]) - coeff_sign[j]; dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; } @@ -143,7 +144,7 @@ void vpx_highbd_quantize_b_32x32_sse2( const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; const uint32_t abs_qcoeff = (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; + qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? 
iscan[idx_arr[i]] : eob; } From cb18d72c306b8400279191940ec0861af338bb5a Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 16 Aug 2022 17:54:14 -0700 Subject: [PATCH 396/926] webmdec,WebmInputContext: make timestamp_ns signed this matches the type returned from libwebm, which uses -1 as an error; quiets integer sanitizer warnings of the form: implicit conversion from type 'long long' of value -1 (64-bit, signed) to type 'uint64_t' (aka 'unsigned long') changed the value to 18446744073709551615 (64-bit, unsigned) Bug: b/229626362 Change-Id: Id3966912f802aee3c0f7852225b55f3057c3e76a --- webmdec.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmdec.h b/webmdec.h index d8618b07d6..6ae7ee16d0 100644 --- a/webmdec.h +++ b/webmdec.h @@ -27,7 +27,7 @@ struct WebmInputContext { const void *block; int block_frame_index; int video_track_index; - uint64_t timestamp_ns; + int64_t timestamp_ns; int is_key_frame; int reached_eos; }; From a76a0228359723fe8b3c522ea0e7c2e2acb26ca8 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 16 Aug 2022 17:46:24 -0700 Subject: [PATCH 397/926] vp8,VP8_COMP: normalize segment_encode_breakout type use unsigned int as the API value is of this type; this quiets some integer sanitizer warnings of the form: implicit conversion from type 'unsigned int' of value 2147483648 (32-bit, unsigned) to type 'int' changed the value to -2147483648 (32-bit, signed) Bug: b/229626362 Change-Id: I3d1ca618bf1b3cd57a5dca65a3067f351c1473f8 --- test/encode_api_test.cc | 4 ++-- vp8/encoder/onyx_int.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 08159148bd..ecdf928343 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -233,8 +233,8 @@ TEST(EncodeAPI, SetRoi) { roi.roi_map = roi_map; // VP8 only. This value isn't range checked. 
roi.static_threshold[1] = 1000; - roi.static_threshold[2] = INT_MIN; - roi.static_threshold[3] = INT_MAX; + roi.static_threshold[2] = UINT_MAX / 2 + 1; + roi.static_threshold[3] = UINT_MAX; for (const auto delta : { -63, -1, 0, 1, 63 }) { for (int i = 0; i < 8; ++i) { diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index 424f51b180..726dcc9466 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -483,7 +483,7 @@ typedef struct VP8_COMP { unsigned char *segmentation_map; signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; - int segment_encode_breakout[MAX_MB_SEGMENTS]; + unsigned int segment_encode_breakout[MAX_MB_SEGMENTS]; unsigned char *active_map; unsigned int active_map_enabled; From 9db0ec67e33fae4cfd066a7cf0fce9f81442c90e Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 16 Aug 2022 21:59:20 -0700 Subject: [PATCH 398/926] vpx_encoder.h: make flag constants unsigned this matches the type for vpx_codec_frame_flags_t and vpx_codec_er_flags_t and quiets int sanitizer warnings of the form: implicit conversion from type 'int' of value -9 (32-bit, signed) to type 'unsigned int' changed the value to 4294967287 (32-bit, unsigned) Bug: b/229626362 Change-Id: Icfc5993250f37cedb300c7032cab28ce4bec1f86 --- vpx/vpx_encoder.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index 21254bb547..e776ec8136 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -115,14 +115,14 @@ typedef int64_t vpx_codec_pts_t; * support frame types that are codec specific (MPEG-1 D-frames for example) */ typedef uint32_t vpx_codec_frame_flags_t; -#define VPX_FRAME_IS_KEY 0x1 /**< frame is the start of a GOP */ +#define VPX_FRAME_IS_KEY 0x1u /**< frame is the start of a GOP */ /*!\brief frame can be dropped without affecting the stream (no future frame * depends on this one) */ -#define VPX_FRAME_IS_DROPPABLE 0x2 +#define VPX_FRAME_IS_DROPPABLE 0x2u /*!\brief frame should be decoded but will not be shown */ -#define VPX_FRAME_IS_INVISIBLE 0x4 +#define VPX_FRAME_IS_INVISIBLE 0x4u /*!\brief this is a fragment of the encoded frame */ -#define VPX_FRAME_IS_FRAGMENT 0x8 +#define VPX_FRAME_IS_FRAGMENT 0x8u /*!\brief Error Resilient flags * @@ -132,12 +132,12 @@ typedef uint32_t vpx_codec_frame_flags_t; */ typedef uint32_t vpx_codec_er_flags_t; /*!\brief Improve resiliency against losses of whole frames */ -#define VPX_ERROR_RESILIENT_DEFAULT 0x1 +#define VPX_ERROR_RESILIENT_DEFAULT 0x1u /*!\brief The frame partitions are independently decodable by the bool decoder, * meaning that partitions can be decoded even though earlier partitions have * been lost. Note that intra prediction is still done over the partition * boundary. */ -#define VPX_ERROR_RESILIENT_PARTITIONS 0x2 +#define VPX_ERROR_RESILIENT_PARTITIONS 0x2u /*!\brief Encoder output packet variants * From 7a0e2bf1bc8c6e4154deaa29c9a3de882fee2000 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Thu, 18 Aug 2022 12:19:08 -0700 Subject: [PATCH 399/926] Fix TEST_P(SADx4Test, DISABLED_Speed) The reference code was being timed instead of the optimized code. 
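In outline, the fix computes the reference values once, before the timer starts, and times only the optimized call (a sketch using the test's existing names; the diff below is the actual change):

    for (int block = 0; block < 4; ++block) {
      reference_sad[block] = ReferenceSAD(GetBlockRefOffset(block));
    }
    vpx_usec_timer_start(&timer);
    for (int i = 0; i < kCountSpeedTestBlock; ++i) {
      SADs(exp_sad);  // time the optimized implementation, not the C reference
    }
    vpx_usec_timer_mark(&timer);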
Change-Id: I67eb08dcda80e20eaa075dc2c91b7e8ef5c0cdfb --- test/sad_test.cc | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 2506f1adbc..960bd499ba 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -517,14 +517,12 @@ TEST_P(SADx4Test, DISABLED_Speed) { uint32_t reference_sad[4]; DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]); vpx_usec_timer timer; - - memset(reference_sad, 0, sizeof(reference_sad)); - SADs(exp_sad); + for (int block = 0; block < 4; ++block) { + reference_sad[block] = ReferenceSAD(GetBlockRefOffset(block)); + } vpx_usec_timer_start(&timer); for (int i = 0; i < kCountSpeedTestBlock; ++i) { - for (int block = 0; block < 4; ++block) { - reference_sad[block] = ReferenceSAD(GetBlockRefOffset(block)); - } + SADs(exp_sad); } vpx_usec_timer_mark(&timer); for (int block = 0; block < 4; ++block) { From df619cb823c99df68ba9cf4ce4cccb101b6080ac Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 18 Aug 2022 11:45:49 -0700 Subject: [PATCH 400/926] loopfilter.c: normalize flat func param type flat/flat2 are stored as int8_t as returned by the filter_mask* functions. this quiets integer sanitizer warnings of the form: vpx_dsp/loopfilter.c:197:28: runtime error: implicit conversion from type 'int8_t' (aka 'signed char') of value -1 (8-bit, signed) to type 'uint8_t' (aka 'unsigned char') changed the value to 255 (8-bit, unsigned) Bug: b/229626362 Change-Id: Iacb6ae052d4cb2b6e0ebccbacf59ece9501d3b5f --- vpx_dsp/loopfilter.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c index 9956028317..d6504aab1f 100644 --- a/vpx_dsp/loopfilter.c +++ b/vpx_dsp/loopfilter.c @@ -159,7 +159,7 @@ void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); } -static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat, +static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat, uint8_t *op3, uint8_t *op2, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1, uint8_t *oq2, uint8_t *oq3) { @@ -232,8 +232,8 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); } -static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat, - uint8_t flat2, uint8_t *op7, uint8_t *op6, +static INLINE void filter16(int8_t mask, uint8_t thresh, int8_t flat, + int8_t flat2, uint8_t *op7, uint8_t *op6, uint8_t *op5, uint8_t *op4, uint8_t *op3, uint8_t *op2, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1, uint8_t *oq2, @@ -505,7 +505,7 @@ void vpx_highbd_lpf_vertical_4_dual_c( bd); } -static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat, +static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat, uint16_t *op3, uint16_t *op2, uint16_t *op1, uint16_t *op0, uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, uint16_t *oq3, int bd) { @@ -584,8 +584,8 @@ void vpx_highbd_lpf_vertical_8_dual_c( bd); } -static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, uint8_t flat, - uint8_t flat2, uint16_t *op7, uint16_t *op6, +static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, int8_t flat, + int8_t flat2, uint16_t *op7, uint16_t *op6, uint16_t *op5, uint16_t *op4, uint16_t *op3, uint16_t *op2, uint16_t *op1, uint16_t *op0, uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, From 2694c7bc92463520d55764566ffebf694a98704a Mon Sep 17 
00:00:00 2001 From: James Zern Date: Thu, 18 Aug 2022 11:56:25 -0700 Subject: [PATCH 401/926] update_thresh_freq_fact_row_mt: normalize param types make source_variance unsigned; this matches update_thresh_freq_fact() and the type of the MACROBLOCK member. quiets integer sanitizer warnings of the form: vp9/encoder/vp9_pickmode.c:2710:58: runtime error: implicit conversion from type 'unsigned int' of value 4294967295 (32-bit, unsigned) to type 'int' changed the value to -1 (32-bit, signed) Bug: b/229626362 Change-Id: I812c6ca914507bf25cad323dea3d91a3a2ea4f1d --- vp9/encoder/vp9_pickmode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 697c589ab3..90dbc427e7 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -1112,7 +1112,7 @@ static INLINE int rd_less_than_thresh_row_mt(int64_t best_rd, int thresh, } static INLINE void update_thresh_freq_fact_row_mt( - VP9_COMP *cpi, TileDataEnc *tile_data, int source_variance, + VP9_COMP *cpi, TileDataEnc *tile_data, unsigned int source_variance, int thresh_freq_fact_idx, MV_REFERENCE_FRAME ref_frame, THR_MODES best_mode_idx, PREDICTION_MODE mode) { THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)]; From c7358d801627ea7509a2a7c0d9fce9b72c14dfed Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 18 Aug 2022 11:57:32 -0700 Subject: [PATCH 402/926] vp9,search_new_mv: descale rather than scale sse this changes from scaling best sse to downscaling base sse in comparisons. this quiets an integer sanitizer warning of the form: vp9/encoder/vp9_pickmode.c:1632:41: runtime error: left shift of 4294967295 by 1 places cannot be represented in type 'unsigned int' Bug: b/229626362 Change-Id: Iee2920474ba700a46177d4514ba6ef7691958069 --- vp9/encoder/vp9_pickmode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 697c589ab3..d8fffe23ee 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -1627,9 +1627,9 @@ static int search_new_mv(VP9_COMP *cpi, MACROBLOCK *x, return -1; // Exit NEWMV search if base_mv_sse is large. - if (sf->base_mv_aggressive && base_mv_sse > (best_sse_sofar << scale)) + if (sf->base_mv_aggressive && (base_mv_sse >> scale) > best_sse_sofar) return -1; - if (base_mv_sse < (best_sse_sofar << 1)) { + if ((base_mv_sse >> 1) < best_sse_sofar) { // Base layer mv is good. // Exit NEWMV search if the base_mv is (0, 0) and sse is low, since // (0, 0) mode is already tested. From 002b6b1ce05c2810cb858188b29aafe785bbc01a Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 17 Aug 2022 19:20:25 -0700 Subject: [PATCH 403/926] compiler_attributes.h: add VPX_NO_UNSIGNED_SHIFT_CHECK and use it on MD5Transform(); this behavior is well defined and is only a warning with -fsanitize=integer, not -fsanitize=undefined. quiets warnings of the form: md5_utils.c:163:3: runtime error: left shift of 143704723 by 7 places cannot be represented in type 'unsigned int' Bug: b/229626362 Change-Id: I60a384b2c2556f5ce71ad8ebce050329aba0b4e4 --- md5_utils.c | 4 ++-- vpx_ports/compiler_attributes.h | 12 +++++++++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/md5_utils.c b/md5_utils.c index c4106525f2..abd8d43c39 100644 --- a/md5_utils.c +++ b/md5_utils.c @@ -151,8 +151,8 @@ void MD5Final(md5byte digest[16], struct MD5Context *ctx) { * reflect the addition of 16 longwords of new data. 
MD5Update blocks * the data and converts bytes into longwords for this routine. */ -VPX_NO_UNSIGNED_OVERFLOW_CHECK void MD5Transform(UWORD32 buf[4], - UWORD32 const in[16]) { +VPX_NO_UNSIGNED_OVERFLOW_CHECK VPX_NO_UNSIGNED_SHIFT_CHECK void MD5Transform( + UWORD32 buf[4], UWORD32 const in[16]) { UWORD32 a, b, c, d; a = buf[0]; diff --git a/vpx_ports/compiler_attributes.h b/vpx_ports/compiler_attributes.h index 354352016c..4b468749b8 100644 --- a/vpx_ports/compiler_attributes.h +++ b/vpx_ports/compiler_attributes.h @@ -29,13 +29,23 @@ #endif // __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__) #if defined(__clang__) && __has_attribute(no_sanitize) +// Both of these have defined behavior and are used in certain operations or +// optimizations thereof. There are cases where an overflow may be unintended, +// however, so use of these attributes should be done with care. #define VPX_NO_UNSIGNED_OVERFLOW_CHECK \ __attribute__((no_sanitize("unsigned-integer-overflow"))) -#endif +#if __clang_major__ >= 12 +#define VPX_NO_UNSIGNED_SHIFT_CHECK \ + __attribute__((no_sanitize("unsigned-shift-base"))) +#endif // __clang__ >= 12 +#endif // __clang__ #ifndef VPX_NO_UNSIGNED_OVERFLOW_CHECK #define VPX_NO_UNSIGNED_OVERFLOW_CHECK #endif +#ifndef VPX_NO_UNSIGNED_SHIFT_CHECK +#define VPX_NO_UNSIGNED_SHIFT_CHECK +#endif //------------------------------------------------------------------------------ // Variable attributes. From b55ef982b0537b4c3d63486e55dcb8fff5fa1d78 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 18 Aug 2022 11:35:06 -0700 Subject: [PATCH 404/926] use VPX_NO_UNSIGNED_SHIFT_CHECK with entropy functions these shift values off the most significant bit as part of the process; vp8_regular_quantize_b_sse4_1 is included here for a special case of mask creation quiets warnings of the form: vp8/decoder/dboolhuff.h:81:11: runtime error: left shift of 2373679303235599696 by 3 places cannot be represented in type 'VP8_BD_VALUE' (aka 'unsigned long') vp8/encoder/bitstream.c:257:18: runtime error: left shift of 2147493041 by 1 places cannot be represented in type 'unsigned int' vp8/encoder/x86/quantize_sse4.c:114:18: runtime error: left shift of 4294967294 by 1 places cannot be represented in type 'unsigned int' vp9/encoder/vp9_pickmode.c:1632:41: runtime error: left shift of 4294967295 by 1 places cannot be represented in type 'unsigned int' Bug: b/229626362 Change-Id: Iabed118b2a094232783e5ad0e586596d874103ca --- vp8/decoder/dboolhuff.h | 4 +++- vp8/encoder/bitstream.c | 5 ++++- vp8/encoder/x86/quantize_sse4.c | 5 ++++- vpx_dsp/bitwriter.h | 5 ++++- 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/vp8/decoder/dboolhuff.h b/vp8/decoder/dboolhuff.h index f2a18f0d90..673b2fbd5d 100644 --- a/vp8/decoder/dboolhuff.h +++ b/vp8/decoder/dboolhuff.h @@ -15,6 +15,7 @@ #include #include "./vpx_config.h" +#include "vpx_ports/compiler_attributes.h" #include "vpx_ports/mem.h" #include "vpx/vp8dx.h" #include "vpx/vpx_integer.h" @@ -50,7 +51,8 @@ int vp8dx_start_decode(BOOL_DECODER *br, const unsigned char *source, void vp8dx_bool_decoder_fill(BOOL_DECODER *br); -static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) { +static VPX_NO_UNSIGNED_SHIFT_CHECK int vp8dx_decode_bool(BOOL_DECODER *br, + int probability) { unsigned int bit = 0; VP8_BD_VALUE value; unsigned int split; diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c index 0e97af5f2e..190b013afd 100644 --- a/vp8/encoder/bitstream.c +++ b/vp8/encoder/bitstream.c @@ -19,6 +19,7 @@ #include #include 
"vpx/vpx_encoder.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/compiler_attributes.h" #include "vpx_ports/system_state.h" #include "bitstream.h" @@ -117,7 +118,9 @@ static void write_split(vp8_writer *bc, int x) { vp8_mbsplit_encodings + x); } -void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount) { +void VPX_NO_UNSIGNED_SHIFT_CHECK vp8_pack_tokens(vp8_writer *w, + const TOKENEXTRA *p, + int xcount) { const TOKENEXTRA *stop = p + xcount; unsigned int split; int shift; diff --git a/vp8/encoder/x86/quantize_sse4.c b/vp8/encoder/x86/quantize_sse4.c index 6d03365fcb..4c2d24cc27 100644 --- a/vp8/encoder/x86/quantize_sse4.c +++ b/vp8/encoder/x86/quantize_sse4.c @@ -13,8 +13,11 @@ #include "./vp8_rtcd.h" #include "vp8/encoder/block.h" #include "vpx_ports/bitops.h" /* get_lsb */ +#include "vpx_ports/compiler_attributes.h" -void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) { +// Unsigned shift overflow is disabled for the use of ~1U << eob with ymask. +VPX_NO_UNSIGNED_SHIFT_CHECK void vp8_regular_quantize_b_sse4_1(BLOCK *b, + BLOCKD *d) { int eob = -1; short *zbin_boost_ptr = b->zrun_zbin_boost; __m128i zbin_boost0 = _mm_load_si128((__m128i *)(zbin_boost_ptr)); diff --git a/vpx_dsp/bitwriter.h b/vpx_dsp/bitwriter.h index 04084af8f2..5f1ee69ec2 100644 --- a/vpx_dsp/bitwriter.h +++ b/vpx_dsp/bitwriter.h @@ -13,6 +13,7 @@ #include +#include "vpx_ports/compiler_attributes.h" #include "vpx_ports/mem.h" #include "vpx_dsp/prob.h" @@ -35,7 +36,9 @@ typedef struct vpx_writer { void vpx_start_encode(vpx_writer *br, uint8_t *source); void vpx_stop_encode(vpx_writer *br); -static INLINE void vpx_write(vpx_writer *br, int bit, int probability) { +static INLINE VPX_NO_UNSIGNED_SHIFT_CHECK void vpx_write(vpx_writer *br, + int bit, + int probability) { unsigned int split; int count = br->count; unsigned int range = br->range; From 595bf7022afd2e7116bbfc0e1ee38c482e6d3811 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 18 Aug 2022 17:51:19 -0700 Subject: [PATCH 405/926] vp9.read_inter_block_mode_info: return on corruption with block sizes < 8x8 previously only the inner loop was aborted. this could cause propagation of invalid motion vectors to scale_mv(). 
this quiets integer sanitizer warnings of the form: vp9/common/vp9_mvref_common.h:239:18: runtime error: implicit conversion from type 'int' of value 32768 (32-bit, signed) to type 'int16_t' (aka 'short') changed the value to -32768 (16-bit, signed) Bug: b/229626362 Change-Id: I58b5a425adf21542cbf4cc4dd5ab3cc5ed008264 --- vp9/decoder/vp9_decodemv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index 8a8d2ad86e..f4bfb785f7 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -755,7 +755,7 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, if (!assign_mv(cm, xd, b_mode, mi->bmi[j].as_mv, best_ref_mvs, best_sub8x8, is_compound, allow_hp, r)) { xd->corrupted |= 1; - break; + return; } if (num_4x4_h == 2) mi->bmi[j + 2] = mi->bmi[j]; From ebf4caa85791f0fb35d7e7dc717d61652f53e6d8 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 12 Aug 2022 17:41:11 +0000 Subject: [PATCH 406/926] [NEON] Added vpx_highbd_quantize_b* functions Total gain for 12-bit encoding: * ~4.8% for best profile * ~6.2% for rt profile Change-Id: I61e646ab7aedf06a25db1365d6d1cf7b05101c21 --- test/vp9_quantize_test.cc | 23 ++- vpx_dsp/arm/highbd_quantize_neon.c | 318 +++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 4 files changed, 337 insertions(+), 9 deletions(-) create mode 100644 vpx_dsp/arm/highbd_quantize_neon.c diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 48c8180366..4ecdd91b06 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -620,14 +620,23 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, ::testing::Values( + make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), + make_tuple(&vpx_highbd_quantize_b_32x32_neon, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_neon, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_neon, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false), make_tuple(&QuantFPWrapper, - &QuantFPWrapper, VPX_BITS_8, 16, true), - make_tuple(&QuantFPWrapper, - &QuantFPWrapper, VPX_BITS_12, 16, - true), - make_tuple(&QuantFPWrapper, - &QuantFPWrapper, VPX_BITS_12, - 32, true))); + &QuantFPWrapper, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 32, + true))); #else INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c new file mode 100644 index 0000000000..4ce432e1f9 --- /dev/null +++ b/vpx_dsp/arm/highbd_quantize_neon.c @@ -0,0 +1,318 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" + +static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store( + const int32x4_t dqcoeff_0, const int32x4_t dqcoeff_1, + tran_low_t *dqcoeff_ptr) { + vst1q_s32(dqcoeff_ptr, dqcoeff_0); + vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1); +} + +static VPX_FORCE_INLINE void highbd_quantize_8_neon( + const int32x4_t coeff_0, const int32x4_t coeff_1, const int32x4_t zbin, + const int32x4_t round, const int32x4_t quant, const int32x4_t quant_shift, + int32x4_t *qcoeff_0, int32x4_t *qcoeff_1) { + // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values + const int32x4_t coeff_0_sign = vshrq_n_s32(coeff_0, 31); + const int32x4_t coeff_1_sign = vshrq_n_s32(coeff_1, 31); + const int32x4_t coeff_0_abs = vabsq_s32(coeff_0); + const int32x4_t coeff_1_abs = vabsq_s32(coeff_1); + + // Calculate 2 masks of elements outside the bin + const int32x4_t zbin_mask_0 = + vreinterpretq_s32_u32(vcgeq_s32(coeff_0_abs, zbin)); + const int32x4_t zbin_mask_1 = vreinterpretq_s32_u32( + vcgeq_s32(coeff_1_abs, vdupq_lane_s32(vget_low_s32(zbin), 1))); + + // Get the rounded values + const int32x4_t rounded_0 = vaddq_s32(coeff_0_abs, round); + const int32x4_t rounded_1 = + vaddq_s32(coeff_1_abs, vdupq_lane_s32(vget_low_s32(round), 1)); + + // (round * (quant << 15) * 2) >> 16 == (round * quant) + int32x4_t qcoeff_tmp_0 = vqdmulhq_s32(rounded_0, quant); + int32x4_t qcoeff_tmp_1 = + vqdmulhq_s32(rounded_1, vdupq_lane_s32(vget_low_s32(quant), 1)); + + // Add rounded values + qcoeff_tmp_0 = vaddq_s32(qcoeff_tmp_0, rounded_0); + qcoeff_tmp_1 = vaddq_s32(qcoeff_tmp_1, rounded_1); + + // (round * (quant_shift << 15) * 2) >> 16 == (round * quant_shift) + qcoeff_tmp_0 = vqdmulhq_s32(qcoeff_tmp_0, quant_shift); + qcoeff_tmp_1 = + vqdmulhq_s32(qcoeff_tmp_1, vdupq_lane_s32(vget_low_s32(quant_shift), 1)); + + // Restore the sign bit.
+ qcoeff_tmp_0 = veorq_s32(qcoeff_tmp_0, coeff_0_sign); + qcoeff_tmp_1 = veorq_s32(qcoeff_tmp_1, coeff_1_sign); + qcoeff_tmp_0 = vsubq_s32(qcoeff_tmp_0, coeff_0_sign); + qcoeff_tmp_1 = vsubq_s32(qcoeff_tmp_1, coeff_1_sign); + + // Only keep the relevant coeffs + *qcoeff_0 = vandq_s32(qcoeff_tmp_0, zbin_mask_0); + *qcoeff_1 = vandq_s32(qcoeff_tmp_1, zbin_mask_1); +} + +static VPX_FORCE_INLINE int16x8_t +highbd_quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int32x4_t zbin, + const int32x4_t round, const int32x4_t quant, + const int32x4_t quant_shift, const int32x4_t dequant) { + int32x4_t qcoeff_0, qcoeff_1, dqcoeff_0, dqcoeff_1; + + // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values + const int32x4_t coeff_0 = vld1q_s32(coeff_ptr); + const int32x4_t coeff_1 = vld1q_s32(coeff_ptr + 4); + highbd_quantize_8_neon(coeff_0, coeff_1, zbin, round, quant, quant_shift, + &qcoeff_0, &qcoeff_1); + + // Store the 32-bit qcoeffs + vst1q_s32(qcoeff_ptr, qcoeff_0); + vst1q_s32(qcoeff_ptr + 4, qcoeff_1); + + // Calculate and store the dqcoeffs + dqcoeff_0 = vmulq_s32(qcoeff_0, dequant); + dqcoeff_1 = vmulq_s32(qcoeff_1, vdupq_lane_s32(vget_low_s32(dequant), 1)); + + highbd_calculate_dqcoeff_and_store(dqcoeff_0, dqcoeff_1, dqcoeff_ptr); + + return vcombine_s16(vmovn_s32(qcoeff_0), vmovn_s32(qcoeff_1)); +} + +void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const int16x8_t one = vdupq_n_s16(1); + const int16x8_t neg_one = vdupq_n_s16(-1); + uint16x8_t eob_max; + + // Only the first element of each vector is DC. + // High half has identical elements, but we can reconstruct it from the low + // half by duplicating the 2nd element. So we only need to pass a 4x32-bit + // vector + int32x4_t zbin = vmovl_s16(vld1_s16(zbin_ptr)); + int32x4_t round = vmovl_s16(vld1_s16(round_ptr)); + // Extend the quant, quant_shift vectors to ones of 32-bit elements + // scale to high-half, so we can use vqdmulhq_s32 + int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15); + int32x4_t quant_shift = vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 15); + int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr)); + + // Process first 8 values which include a dc component. + { + // Add one because the eob does not index from 0. + const uint16x8_t v_iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + + const int16x8_t qcoeff = + highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, + quant, quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); + + __builtin_prefetch(coeff_ptr + 64); + + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + } + + n_coeffs -= 8; + + { + zbin = vdupq_lane_s32(vget_low_s32(zbin), 1); + round = vdupq_lane_s32(vget_low_s32(round), 1); + quant = vdupq_lane_s32(vget_low_s32(quant), 1); + quant_shift = vdupq_lane_s32(vget_low_s32(quant_shift), 1); + dequant = vdupq_lane_s32(vget_low_s32(dequant), 1); + + do { + // Add one because the eob is not its index. 
+ const uint16x8_t v_iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + + const int16x8_t qcoeff = + highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, + round, quant, quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); + + __builtin_prefetch(coeff_ptr + 64); + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + n_coeffs -= 8; + } while (n_coeffs > 0); + } + +#ifdef __aarch64__ + *eob_ptr = vmaxvq_u16(eob_max); +#else + { + const uint16x4_t eob_max_0 = + vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); + const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); + const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); + vst1_lane_u16(eob_ptr, eob_max_2, 0); + } +#endif // __aarch64__ + // Need these here, else the compiler complains about mixing declarations and + // code in C90 + (void)n_coeffs; + (void)scan; +} + +static VPX_FORCE_INLINE int32x4_t extract_sign_bit(int32x4_t a) { + return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31)); +} + +static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store_32x32( + int32x4_t dqcoeff_0, int32x4_t dqcoeff_1, tran_low_t *dqcoeff_ptr) { + // Add 1 if negative to round towards zero because the C uses division. + dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); + dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + + dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1); + dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1); + vst1q_s32(dqcoeff_ptr, dqcoeff_0); + vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1); +} + +static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon( + const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int32x4_t zbin, const int32x4_t round, + const int32x4_t quant, const int32x4_t quant_shift, + const int32x4_t dequant) { + int32x4_t qcoeff_0, qcoeff_1, dqcoeff_0, dqcoeff_1; + + // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values + const int32x4_t coeff_0 = vld1q_s32(coeff_ptr); + const int32x4_t coeff_1 = vld1q_s32(coeff_ptr + 4); + highbd_quantize_8_neon(coeff_0, coeff_1, zbin, round, quant, quant_shift, + &qcoeff_0, &qcoeff_1); + + // Store the 32-bit qcoeffs + vst1q_s32(qcoeff_ptr, qcoeff_0); + vst1q_s32(qcoeff_ptr + 4, qcoeff_1); + + // Calculate and store the dqcoeffs + dqcoeff_0 = vmulq_s32(qcoeff_0, dequant); + dqcoeff_1 = vmulq_s32(qcoeff_1, vdupq_lane_s32(vget_low_s32(dequant), 1)); + + highbd_calculate_dqcoeff_and_store_32x32(dqcoeff_0, dqcoeff_1, dqcoeff_ptr); + + return vcombine_s16(vmovn_s32(qcoeff_0), vmovn_s32(qcoeff_1)); +} + +void vpx_highbd_quantize_b_32x32_neon( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const int16x8_t one = vdupq_n_s16(1); + const int16x8_t neg_one = vdupq_n_s16(-1); + uint16x8_t eob_max; + int i; + + // Only the first element of each vector is DC. + // High half has identical elements, but we can reconstruct it from the low + // half by duplicating the 2nd element. 
So we only need to pass a 4x32-bit + // vector + int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(zbin_ptr)), 1); + int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(round_ptr)), 1); + // Extend the quant, quant_shift vectors to ones of 32-bit elements + // scale to high-half, so we can use vqdmulhq_s32 + int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15); + int32x4_t quant_shift = vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 16); + int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr)); + + // Process first 8 values which include a dc component. + { + // Add one because the eob does not index from 0. + const uint16x8_t v_iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + + const int16x8_t qcoeff = + highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, + round, quant, quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); + + __builtin_prefetch(coeff_ptr + 64); + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + } + + { + zbin = vdupq_lane_s32(vget_low_s32(zbin), 1); + round = vdupq_lane_s32(vget_low_s32(round), 1); + quant = vdupq_lane_s32(vget_low_s32(quant), 1); + quant_shift = vdupq_lane_s32(vget_low_s32(quant_shift), 1); + dequant = vdupq_lane_s32(vget_low_s32(dequant), 1); + + for (i = 1; i < 32 * 32 / 8; ++i) { + // Add one because the eob is not its index. + const uint16x8_t v_iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + + const int16x8_t qcoeff = + highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, + round, quant, quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); + + __builtin_prefetch(coeff_ptr + 64); + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + } + } + +#ifdef __aarch64__ + *eob_ptr = vmaxvq_u16(eob_max); +#else + { + const uint16x4_t eob_max_0 = + vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); + const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); + const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); + vst1_lane_u16(eob_ptr, eob_max_2, 0); + } +#endif // __aarch64__ + // Need these here, else the compiler complains about mixing declarations and + // code in C90 + (void)n_coeffs; + (void)scan; +} diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index ffe954832d..4f17425ccd 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -333,6 +333,7 @@ DSP_SRCS-$(HAVE_LSX) += loongarch/quantize_lsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c DSP_SRCS-$(HAVE_AVX2) += x86/highbd_quantize_intrin_avx2.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_quantize_neon.c endif # avg diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index db211ed8ce..cab74f93e0 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -718,10 +718,10 @@ () if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_highbd_quantize_b sse2 avx2/; + specialize 
qw/vpx_highbd_quantize_b neon sse2 avx2/; add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_highbd_quantize_b_32x32 sse2 avx2/; + specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER From a8980078a133d98319468866580cfc866b0cc5d7 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 22 Aug 2022 10:48:40 -0700 Subject: [PATCH 407/926] highbd_quantize_neon.c: remove unneeded assert.h Change-Id: I041f5fb23b856a2b519669b5bf8a40d3772b4a6e --- vpx_dsp/arm/highbd_quantize_neon.c | 1 - 1 file changed, 1 deletion(-) diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c index 4ce432e1f9..502a9c972d 100644 --- a/vpx_dsp/arm/highbd_quantize_neon.c +++ b/vpx_dsp/arm/highbd_quantize_neon.c @@ -9,7 +9,6 @@ */ #include <arm_neon.h> -#include <assert.h> #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" From d050161f0d627ab118308ab17ce8d0e040116459 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 19 Aug 2022 22:00:42 +0000 Subject: [PATCH 408/926] [NEON] Added vpx_highbd_sad* functions Total gain for 12-bit encoding: * ~7.8% for best profile * ~10% for rt profile Change-Id: I89eda5c4372a5b628c9df84cdeb4c8486fc44789 --- test/sad_test.cc | 118 ++++++++++++++++++ vpx_dsp/arm/highbd_sad_neon.c | 225 ++++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 74 +++++------ 4 files changed, 383 insertions(+), 35 deletions(-) create mode 100644 vpx_dsp/arm/highbd_sad_neon.c diff --git a/test/sad_test.cc b/test/sad_test.cc index 960bd499ba..4fb2af6244 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -727,6 +727,45 @@ const SadMxNParam neon_tests[] = { SadMxNParam(8, 4, &vpx_sad8x4_neon), SadMxNParam(4, 8, &vpx_sad4x8_neon), SadMxNParam(4, 4, &vpx_sad4x4_neon), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNParam(4, 4, &vpx_highbd_sad4x4_neon, 8), + SadMxNParam(4, 8, &vpx_highbd_sad4x8_neon, 8), + SadMxNParam(8, 4, &vpx_highbd_sad8x4_neon, 8), + SadMxNParam(8, 8, &vpx_highbd_sad8x8_neon, 8), + SadMxNParam(8, 16, &vpx_highbd_sad8x16_neon, 8), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_neon, 8), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_neon, 8), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_neon, 8), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_neon, 8), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_neon, 8), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_neon, 8), + SadMxNParam(64, 64, &vpx_highbd_sad64x64_neon, 8), + SadMxNParam(4, 4, &vpx_highbd_sad4x4_neon, 10), + SadMxNParam(4, 8, &vpx_highbd_sad4x8_neon, 10), + SadMxNParam(8, 4, &vpx_highbd_sad8x4_neon, 10), + SadMxNParam(8, 8, &vpx_highbd_sad8x8_neon, 10), + SadMxNParam(8, 16, &vpx_highbd_sad8x16_neon, 10), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_neon, 10), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_neon, 10), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_neon, 10), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_neon, 10), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_neon, 10), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_neon, 10), + SadMxNParam(64, 64, &vpx_highbd_sad64x64_neon, 10), + SadMxNParam(4, 4, &vpx_highbd_sad4x4_neon, 12), + SadMxNParam(4, 8, &vpx_highbd_sad4x8_neon, 12), + SadMxNParam(8, 4, &vpx_highbd_sad8x4_neon, 12), + SadMxNParam(8, 8, 
&vpx_highbd_sad8x8_neon, 12), + SadMxNParam(8, 16, &vpx_highbd_sad8x16_neon, 12), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_neon, 12), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_neon, 12), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_neon, 12), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_neon, 12), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_neon, 12), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_neon, 12), + SadMxNParam(64, 64, &vpx_highbd_sad64x64_neon, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH + }; INSTANTIATE_TEST_SUITE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests)); @@ -744,6 +783,47 @@ const SadMxNAvgParam avg_neon_tests[] = { SadMxNAvgParam(8, 4, &vpx_sad8x4_avg_neon), SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_neon), SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_neon), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_neon, 8), + SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_neon, 8), + SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_neon, 8), + SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_neon, 8), + SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_neon, 8), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_neon, 8), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_neon, 8), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_neon, 8), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_neon, 8), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_neon, 8), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_neon, 8), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_neon, 8), + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_neon, 8), + SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_neon, 10), + SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_neon, 10), + SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_neon, 10), + SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_neon, 10), + SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_neon, 10), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_neon, 10), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_neon, 10), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_neon, 10), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_neon, 10), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_neon, 10), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_neon, 10), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_neon, 10), + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_neon, 10), + SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_neon, 12), + SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_neon, 12), + SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_neon, 12), + SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_neon, 12), + SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_neon, 12), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_neon, 12), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_neon, 12), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_neon, 12), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_neon, 12), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_neon, 12), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_neon, 12), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_neon, 12), + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_neon, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH }; INSTANTIATE_TEST_SUITE_P(NEON, SADavgTest, ::testing::ValuesIn(avg_neon_tests)); @@ -761,6 +841,44 @@ const SadMxNx4Param x4d_neon_tests[] = { SadMxNx4Param(8, 4, &vpx_sad8x4x4d_neon), SadMxNx4Param(4, 8, &vpx_sad4x8x4d_neon), SadMxNx4Param(4, 4, &vpx_sad4x4x4d_neon), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_neon, 8), + SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_neon, 8), + 
SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_neon, 8), + SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_neon, 8), + SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_neon, 8), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_neon, 8), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_neon, 8), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_neon, 8), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_neon, 8), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_neon, 8), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_neon, 8), + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_neon, 8), + SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_neon, 10), + SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_neon, 10), + SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_neon, 10), + SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_neon, 10), + SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_neon, 10), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_neon, 10), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_neon, 10), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_neon, 10), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_neon, 10), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_neon, 10), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_neon, 10), + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_neon, 10), + SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_neon, 12), + SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_neon, 12), + SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_neon, 12), + SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_neon, 12), + SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_neon, 12), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_neon, 12), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_neon, 12), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_neon, 12), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_neon, 12), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_neon, 12), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_neon, 12), + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_neon, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH }; INSTANTIATE_TEST_SUITE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests)); #endif // HAVE_NEON diff --git a/vpx_dsp/arm/highbd_sad_neon.c b/vpx_dsp/arm/highbd_sad_neon.c new file mode 100644 index 0000000000..ecb52ce5a5 --- /dev/null +++ b/vpx_dsp/arm/highbd_sad_neon.c @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static VPX_FORCE_INLINE uint32_t highbd_sad4_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int width, + int height) { + int i, j; + uint32x4_t sum_abs_diff = vdupq_n_u32(0); + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 4) { + const uint16x4_t src_u16 = vld1_u16(src16_ptr + j); + const uint16x4_t ref_u16 = vld1_u16(ref16_ptr + j); + sum_abs_diff = vabal_u16(sum_abs_diff, src_u16, ref_u16); + } + src16_ptr += src_stride; + ref16_ptr += ref_stride; + } + + return horizontal_add_uint32x4(sum_abs_diff); +} + +static VPX_FORCE_INLINE uint32_t highbd_sad8_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int width, + int height) { + int i, j; + uint32x4_t sum_abs_diff = vdupq_n_u32(0); + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 8) { + const uint16x8_t src_u16 = vld1q_u16(src16_ptr + j); + const uint16x8_t ref_u16 = vld1q_u16(ref16_ptr + j); + sum_abs_diff = + vabal_u16(sum_abs_diff, vget_low_u16(src_u16), vget_low_u16(ref_u16)); + sum_abs_diff = vabal_u16(sum_abs_diff, vget_high_u16(src_u16), + vget_high_u16(ref_u16)); + } + src16_ptr += src_stride; + ref16_ptr += ref_stride; + } + + return horizontal_add_uint32x4(sum_abs_diff); +} + +static VPX_FORCE_INLINE uint32_t highbd_sad4_avg_neon( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, const uint8_t *second_pred, int width, int height) { + int i, j; + uint32x4_t sum_abs_diff = vdupq_n_u32(0); + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred); + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 4) { + const uint16x4_t a_u16 = vld1_u16(src16_ptr + j); + const uint16x4_t b_u16 = vld1_u16(ref16_ptr + j); + const uint16x4_t c_u16 = vld1_u16(pred_ptr + j); + const uint16x4_t avg = vrhadd_u16(b_u16, c_u16); + sum_abs_diff = vabal_u16(sum_abs_diff, a_u16, avg); + } + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred_ptr += width; + } + + return horizontal_add_uint32x4(sum_abs_diff); +} + +static VPX_FORCE_INLINE uint32_t highbd_sad8_avg_neon( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, const uint8_t *second_pred, int width, int height) { + int i, j; + uint32x4_t sum_abs_diff = vdupq_n_u32(0); + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred); + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 8) { + const uint16x8_t a_u16 = vld1q_u16(src16_ptr + j); + const uint16x8_t b_u16 = vld1q_u16(ref16_ptr + j); + const uint16x8_t c_u16 = vld1q_u16(pred_ptr + j); + const uint16x8_t avg = vrhaddq_u16(b_u16, c_u16); + sum_abs_diff = + vabal_u16(sum_abs_diff, vget_low_u16(a_u16), vget_low_u16(avg)); + sum_abs_diff = + vabal_u16(sum_abs_diff, vget_high_u16(a_u16), vget_high_u16(avg)); + } + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred_ptr += width; + } + + 
return horizontal_add_uint32x4(sum_abs_diff); +} + +#define highbd_sad4MxN(m, n) \ + unsigned int vpx_highbd_sad##m##x##n##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return highbd_sad4_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ + } + +#define highbd_sadMxN(m, n) \ + unsigned int vpx_highbd_sad##m##x##n##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return highbd_sad8_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ + } + +#define highbd_sad4MxN_avg(m, n) \ + unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + return highbd_sad4_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \ + second_pred, m, n); \ + } + +#define highbd_sadMxN_avg(m, n) \ + unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + return highbd_sad8_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \ + second_pred, m, n); \ + } + +#define highbd_sadMxNx4D(m, n) \ + void vpx_highbd_sad##m##x##n##x4d_neon( \ + const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], int ref_stride, \ + uint32_t sad_array[4]) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = vpx_highbd_sad##m##x##n##_neon(src_ptr, src_stride, \ + ref_array[i], ref_stride); \ + } \ + } + +/* clang-format off */ +// 4x4 +highbd_sad4MxN(4, 4) +highbd_sad4MxN_avg(4, 4) +highbd_sadMxNx4D(4, 4) + +// 4x8 +highbd_sad4MxN(4, 8) +highbd_sad4MxN_avg(4, 8) +highbd_sadMxNx4D(4, 8) + +// 8x4 +highbd_sadMxN(8, 4) +highbd_sadMxN_avg(8, 4) +highbd_sadMxNx4D(8, 4) + +// 8x8 +highbd_sadMxN(8, 8) +highbd_sadMxN_avg(8, 8) +highbd_sadMxNx4D(8, 8) + +// 8x16 +highbd_sadMxN(8, 16) +highbd_sadMxN_avg(8, 16) +highbd_sadMxNx4D(8, 16) + +// 16x8 +highbd_sadMxN(16, 8) +highbd_sadMxN_avg(16, 8) +highbd_sadMxNx4D(16, 8) + +// 16x16 +highbd_sadMxN(16, 16) +highbd_sadMxN_avg(16, 16) +highbd_sadMxNx4D(16, 16) + +// 16x32 +highbd_sadMxN(16, 32) +highbd_sadMxN_avg(16, 32) +highbd_sadMxNx4D(16, 32) + +// 32x16 +highbd_sadMxN(32, 16) +highbd_sadMxN_avg(32, 16) +highbd_sadMxNx4D(32, 16) + +// 32x32 +highbd_sadMxN(32, 32) +highbd_sadMxN_avg(32, 32) +highbd_sadMxNx4D(32, 32) + +// 32x64 +highbd_sadMxN(32, 64) +highbd_sadMxN_avg(32, 64) +highbd_sadMxNx4D(32, 64) + +// 64x32 +highbd_sadMxN(64, 32) +highbd_sadMxN_avg(64, 32) +highbd_sadMxNx4D(64, 32) + +// 64x64 +highbd_sadMxN(64, 64) +highbd_sadMxN_avg(64, 64) +highbd_sadMxNx4D(64, 64) + /* clang-format on */ diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 4f17425ccd..1a03aed526 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -392,6 +392,7 @@ DSP_SRCS-$(HAVE_LSX) += loongarch/subtract_lsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index cab74f93e0..72442ed41f 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -940,41 +940,43 @@ () # Single block SAD # add_proto qw/unsigned int vpx_highbd_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad64x64 sse2/; + specialize 
qw/vpx_highbd_sad64x64 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad64x32 sse2/; + specialize qw/vpx_highbd_sad64x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad32x64 sse2/; + specialize qw/vpx_highbd_sad32x64 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad32x32 sse2/; + specialize qw/vpx_highbd_sad32x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad32x16 sse2/; + specialize qw/vpx_highbd_sad32x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad16x32 sse2/; + specialize qw/vpx_highbd_sad16x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad16x16 sse2/; + specialize qw/vpx_highbd_sad16x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad16x8 sse2/; + specialize qw/vpx_highbd_sad16x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad8x16 sse2/; + specialize qw/vpx_highbd_sad8x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad8x8 sse2/; + specialize qw/vpx_highbd_sad8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad8x4 sse2/; + specialize qw/vpx_highbd_sad8x4 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad4x8 neon/; add_proto qw/unsigned int vpx_highbd_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad4x4 neon/; # # Avg @@ -988,83 +990,85 @@ () add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max"; add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad64x64_avg sse2/; + specialize qw/vpx_highbd_sad64x64_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad64x32_avg sse2/; + specialize qw/vpx_highbd_sad64x32_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad32x64_avg sse2/; + specialize qw/vpx_highbd_sad32x64_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad32x32_avg/, 
"const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad32x32_avg sse2/; + specialize qw/vpx_highbd_sad32x32_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad32x16_avg sse2/; + specialize qw/vpx_highbd_sad32x16_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad16x32_avg sse2/; + specialize qw/vpx_highbd_sad16x32_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad16x16_avg sse2/; + specialize qw/vpx_highbd_sad16x16_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad16x8_avg sse2/; + specialize qw/vpx_highbd_sad16x8_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad8x16_avg sse2/; + specialize qw/vpx_highbd_sad8x16_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad8x8_avg sse2/; + specialize qw/vpx_highbd_sad8x8_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad8x4_avg sse2/; + specialize qw/vpx_highbd_sad8x4_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + specialize qw/vpx_highbd_sad4x8_avg neon/; add_proto qw/unsigned int vpx_highbd_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + specialize qw/vpx_highbd_sad4x4_avg neon/; # # Multi-block SAD, comparing a reference to N independent blocks # add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad64x64x4d sse2/; + specialize qw/vpx_highbd_sad64x64x4d sse2 neon/; add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad64x32x4d sse2/; + specialize qw/vpx_highbd_sad64x32x4d sse2 neon/; add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad32x64x4d sse2/; + specialize qw/vpx_highbd_sad32x64x4d sse2 neon/; add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad32x32x4d sse2/; + specialize qw/vpx_highbd_sad32x32x4d sse2 neon/; add_proto qw/void 
vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad32x16x4d sse2/; + specialize qw/vpx_highbd_sad32x16x4d sse2 neon/; add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad16x32x4d sse2/; + specialize qw/vpx_highbd_sad16x32x4d sse2 neon/; add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad16x16x4d sse2/; + specialize qw/vpx_highbd_sad16x16x4d sse2 neon/; add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad16x8x4d sse2/; + specialize qw/vpx_highbd_sad16x8x4d sse2 neon/; add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad8x16x4d sse2/; + specialize qw/vpx_highbd_sad8x16x4d sse2 neon/; add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad8x8x4d sse2/; + specialize qw/vpx_highbd_sad8x8x4d sse2 neon/; add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad8x4x4d sse2/; + specialize qw/vpx_highbd_sad8x4x4d sse2 neon/; add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad4x8x4d sse2/; + specialize qw/vpx_highbd_sad4x8x4d sse2 neon/; add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad4x4x4d sse2/; + specialize qw/vpx_highbd_sad4x4x4d sse2 neon/; # # Structured Similarity (SSIM) From a6d95698fe79e7427596392c632b78f961861e98 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 22 Aug 2022 19:46:50 +0000 Subject: [PATCH 409/926] [NEON] Add vpx_highbd_subtract_block function Total gain for 12-bit encoding: * ~1% for best and rt profile Change-Id: I4039120dc570baab1ae519a5e38b1acff38d81f0 --- vpx_dsp/arm/subtract_neon.c | 56 ++++++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 1 + 2 files changed, 57 insertions(+) diff --git a/vpx_dsp/arm/subtract_neon.c b/vpx_dsp/arm/subtract_neon.c index 612897e247..2c008e48ab 100644 --- a/vpx_dsp/arm/subtract_neon.c +++ b/vpx_dsp/arm/subtract_neon.c @@ -79,3 +79,59 @@ void vpx_subtract_block_neon(int rows, int cols, int16_t *diff, } while (r); } } + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, + const uint8_t *src8_ptr, + ptrdiff_t src_stride, + const uint8_t *pred8_ptr, + ptrdiff_t pred_stride, int bd) { + int r = rows, c; + uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8_ptr); + (void)bd; + + if (cols >= 16) { + do { + for (c = 0; c < cols; c += 16) { + const uint16x8_t s0 = vld1q_u16(&src[c + 0]); + const uint16x8_t s1 = 
vld1q_u16(&src[c + 8]); + const uint16x8_t p0 = vld1q_u16(&pred[c + 0]); + const uint16x8_t p1 = vld1q_u16(&pred[c + 8]); + const uint16x8_t d0 = vsubq_u16(s0, p0); + const uint16x8_t d1 = vsubq_u16(s1, p1); + vst1q_s16(&diff_ptr[c + 0], vreinterpretq_s16_u16(d0)); + vst1q_s16(&diff_ptr[c + 8], vreinterpretq_s16_u16(d1)); + } + diff_ptr += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + } else if (cols >= 8) { + do { + for (c = 0; c < cols; c += 8) { + const uint16x8_t s = vld1q_u16(&src[c]); + const uint16x8_t p = vld1q_u16(&pred[c]); + const uint16x8_t d0 = vsubq_u16(s, p); + vst1q_s16(&diff_ptr[c], vreinterpretq_s16_u16(d0)); + } + diff_ptr += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + } else if (cols >= 4) { + do { + for (c = 0; c < cols; c += 4) { + const uint16x4_t s = vld1_u16(&src[c]); + const uint16x4_t p = vld1_u16(&pred[c]); + const uint16x4_t v_diff = vsub_u16(s, p); + vst1_s16(&diff_ptr[c], vreinterpret_s16_u16(v_diff)); + } + diff_ptr += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 72442ed41f..6cd46129f0 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -935,6 +935,7 @@ () # Block subtraction # add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd"; + specialize qw/vpx_highbd_subtract_block neon/; # # Single block SAD From a689fe68a3e304df7f6ee3f711de108282da636f Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 22 Aug 2022 19:33:26 -0700 Subject: [PATCH 410/926] vp9_ratectrl_rtc_test: initialize loopfilter_ctrl[] this was added in: 7beafefd1 vp9: Allow for disabling loopfilter per spatial layer but the test doesn't zero initialize its svc_params_ member. 
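(A blanket zero fill in the test constructor would be an alternative sketch of a fix, assuming no svc_params_ field needs a non-zero default:

  memset(&svc_params_, 0, sizeof(svc_params_));

the change below instead gives the new field an explicit value.)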
fixes the use of an uninitialized value, reported by valgrind and integer sanitizer: [ RUN ] VP9/RcInterfaceSvcTest.Svc/0 ==1064682== Conditional jump or move depends on uninitialised value(s) ==1064682== at 0x1C5624: loopfilter_frame (vp9_encoder.c:3285) ==1064682== by 0x1C9B54: encode_frame_to_data_rate (vp9_encoder.c:5595) ==1064682== by 0x1CA2EE: SvcEncode (vp9_encoder.c:5789) ==1064682== by 0x1CEA01: vp9_get_compressed_data (vp9_encoder.c:7891) ==1064682== by 0x185F0E: encoder_encode (vp9_cx_iface.c:1437) ==1064682== by 0x1503BB: vpx_codec_encode (vpx_encoder.c:208) vp9/encoder/vp9_svc_layercontext.c:362:26: runtime error: implicit conversion from type 'int' of value -1 (32-bit, signed) to type 'LOOPFILTER_CONTROL' changed the value to 4294967295 (32-bit, unsigned) #0 0x558925f45377 in vp9_restore_layer_context vp9/encoder/vp9_svc_layercontext.c:362:26 #1 0x558925ef89fd in vp9_get_compressed_data vp9/encoder/vp9_encoder.c:7781:5 #2 0x558925e3ef3e in encoder_encode vp9/vp9_cx_iface.c:1437:20 Bug: b/229626362 Change-Id: I33d244be7752c68b71efa9c62ca45d6b202ec761 --- test/vp9_ratectrl_rtc_test.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc index b09a45bb76..03a58fa926 100644 --- a/test/vp9_ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -271,6 +271,7 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, svc_params_.max_quantizers[i] = 56; svc_params_.min_quantizers[i] = 2; svc_params_.speed_per_layer[i] = 7; + svc_params_.loopfilter_ctrl[i] = LOOPFILTER_ALL; } cfg_.rc_end_usage = VPX_CBR; cfg_.g_lag_in_frames = 0; From daae445b2a467ba701e87aac06bc9c98af31dfdd Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Sat, 20 Aug 2022 19:02:15 +0000 Subject: [PATCH 411/926] [NEON] Improve vpx_quantize_b* functions Slight optimization, prefetch gives a 1% improvement in 1st pass Change-Id: Iba4664964664234666406ab53893e02d481fbe61 --- vpx_dsp/arm/quantize_neon.c | 269 +++++++++++++++++------------------- 1 file changed, 130 insertions(+), 139 deletions(-) diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index bd7818a074..dcdf588cbc 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -17,20 +17,57 @@ static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, const int16x8_t dequant, - tran_low_t *dqcoeff) { + tran_low_t *dqcoeff_ptr) { +#if CONFIG_VP9_HIGHBITDEPTH const int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); const int32x4_t dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); -#if CONFIG_VP9_HIGHBITDEPTH - vst1q_s32(dqcoeff, dqcoeff_0); - vst1q_s32(dqcoeff + 4, dqcoeff_1); + vst1q_s32(dqcoeff_ptr, dqcoeff_0); + vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1); #else - vst1q_s16(dqcoeff, vcombine_s16(vmovn_s32(dqcoeff_0), vmovn_s32(dqcoeff_1))); + vst1q_s16(dqcoeff_ptr, vmulq_s16(qcoeff, dequant)); #endif // CONFIG_VP9_HIGHBITDEPTH } +static INLINE int16x8_t +quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16x8_t zbin, + const int16x8_t round, const int16x8_t quant, + const int16x8_t quant_shift, const int16x8_t dequant) { + // Load coeffs as 8 x 16-bit ints, take sign and abs values + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + // Calculate mask of elements outside the bin + const int16x8_t zbin_mask = 
vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + // Get the rounded values + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16 + qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1); + + // Restore the sign bit. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + // Only keep the relevant coeffs + qcoeff = vandq_s16(qcoeff, zbin_mask); + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + + calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr); + + return qcoeff; +} + void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, @@ -41,106 +78,61 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16x8_t one = vdupq_n_s16(1); const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; - (void)scan; + + // Only the first element of each vector is DC. + int16x8_t zbin = vld1q_s16(zbin_ptr); + int16x8_t round = vld1q_s16(round_ptr); + int16x8_t quant = vld1q_s16(quant_ptr); + int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + int16x8_t dequant = vld1q_s16(dequant_ptr); // Process first 8 values which include a dc component. { - // Only the first element of each vector is DC. - const int16x8_t zbin = vld1q_s16(zbin_ptr); - const int16x8_t round = vld1q_s16(round_ptr); - const int16x8_t quant = vld1q_s16(quant_ptr); - const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); - const int16x8_t dequant = vld1q_s16(dequant_ptr); // Add one because the eob does not index from 0. const uint16x8_t v_iscan = vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); - const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); - const int16x8_t coeff_abs = vabsq_s16(coeff); - - const int16x8_t zbin_mask = - vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); - - const int16x8_t rounded = vqaddq_s16(coeff_abs, round); - - // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 - int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); - - qcoeff = vaddq_s16(qcoeff, rounded); - - // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16 - qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1); - - // Restore the sign bit. - qcoeff = veorq_s16(qcoeff, coeff_sign); - qcoeff = vsubq_s16(qcoeff, coeff_sign); - - qcoeff = vandq_s16(qcoeff, zbin_mask); + const int16x8_t qcoeff = + quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, quant, + quant_shift, dequant); // Set non-zero elements to -1 and use that to extract values for eob. 
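    // vtstq_s16(qcoeff, neg_one) produces an all-ones lane wherever the
    // quantized coefficient is non-zero; masking that against iscan + 1 keeps
    // the 1-based scan position of every surviving coefficient, so the
    // running maximum over these lanes is exactly the end-of-block index.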
eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); + __builtin_prefetch(coeff_ptr + 64); coeff_ptr += 8; iscan += 8; - - store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; - - calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr); dqcoeff_ptr += 8; } n_coeffs -= 8; { - const int16x8_t zbin = vdupq_n_s16(zbin_ptr[1]); - const int16x8_t round = vdupq_n_s16(round_ptr[1]); - const int16x8_t quant = vdupq_n_s16(quant_ptr[1]); - const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]); - const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]); + zbin = vdupq_lane_s16(vget_low_s16(zbin), 1); + round = vdupq_lane_s16(vget_low_s16(round), 1); + quant = vdupq_lane_s16(vget_low_s16(quant), 1); + quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1); + dequant = vdupq_lane_s16(vget_low_s16(dequant), 1); do { // Add one because the eob is not its index. const uint16x8_t v_iscan = vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); - const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); - const int16x8_t coeff_abs = vabsq_s16(coeff); - - const int16x8_t zbin_mask = - vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); - - const int16x8_t rounded = vqaddq_s16(coeff_abs, round); - - // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 - int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); - - qcoeff = vaddq_s16(qcoeff, rounded); - - // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16 - qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1); - - // Restore the sign bit. - qcoeff = veorq_s16(qcoeff, coeff_sign); - qcoeff = vsubq_s16(qcoeff, coeff_sign); - - qcoeff = vandq_s16(qcoeff, zbin_mask); + const int16x8_t qcoeff = + quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, + quant, quant_shift, dequant); // Set non-zero elements to -1 and use that to extract values for eob. 
eob_max = vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); + __builtin_prefetch(coeff_ptr + 64); coeff_ptr += 8; iscan += 8; - - store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; - - calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr); dqcoeff_ptr += 8; - n_coeffs -= 8; } while (n_coeffs > 0); } @@ -156,6 +148,9 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // __aarch64__ + // Need these here, else the compiler complains about mixing declarations and + // code in C90 + (void)scan; } static INLINE int32x4_t extract_sign_bit(int32x4_t a) { @@ -164,7 +159,7 @@ static INLINE int32x4_t extract_sign_bit(int32x4_t a) { static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff, const int16x8_t dequant, - tran_low_t *dqcoeff) { + tran_low_t *dqcoeff_ptr) { int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); int32x4_t dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); @@ -176,14 +171,51 @@ static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff, #if CONFIG_VP9_HIGHBITDEPTH dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1); dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1); - vst1q_s32(dqcoeff, dqcoeff_0); - vst1q_s32(dqcoeff + 4, dqcoeff_1); + vst1q_s32(dqcoeff_ptr, dqcoeff_0); + vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1); #else - vst1q_s16(dqcoeff, + vst1q_s16(dqcoeff_ptr, vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1))); #endif // CONFIG_VP9_HIGHBITDEPTH } +static INLINE int16x8_t +quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16x8_t zbin, + const int16x8_t round, const int16x8_t quant, + const int16x8_t quant_shift, const int16x8_t dequant) { + // Load coeffs as 8 x 16-bit ints, take sign and abs values + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + // Calculate mask of elements outside the bin + const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + // Get the rounded values + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15 + qcoeff = vqdmulhq_s16(qcoeff, quant_shift); + + // Restore the sign bit. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + // Only keep the relevant coeffs + qcoeff = vandq_s16(qcoeff, zbin_mask); + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + + calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr); + + return qcoeff; +} + // Main difference is that zbin values are halved before comparison and dqcoeff // values are divided by 2. zbin is rounded but dqcoeff is not. void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, @@ -198,103 +230,58 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; - (void)scan; - (void)n_coeffs; // Because we will always calculate 32*32. + + // Only the first element of each vector is DC. 
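+  // Loading all eight entries as one vector lets the AC pass below broadcast
+  // lane 1 with vdupq_lane_s16() instead of re-reading the scalar arrays;
+  // vrshrq_n_s16(x, 1) halves zbin/round with rounding, as the 32x32 C
+  // reference requires.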
+ int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1); + int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); + int16x8_t quant = vld1q_s16(quant_ptr); + int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + int16x8_t dequant = vld1q_s16(dequant_ptr); // Process first 8 values which include a dc component. { - // Only the first element of each vector is DC. - const int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1); - const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); - const int16x8_t quant = vld1q_s16(quant_ptr); - const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); - const int16x8_t dequant = vld1q_s16(dequant_ptr); // Add one because the eob does not index from 0. const uint16x8_t v_iscan = vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); - const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); - const int16x8_t coeff_abs = vabsq_s16(coeff); - - const int16x8_t zbin_mask = - vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); - - const int16x8_t rounded = vqaddq_s16(coeff_abs, round); - - // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 - int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); - - qcoeff = vaddq_s16(qcoeff, rounded); - - // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15 - qcoeff = vqdmulhq_s16(qcoeff, quant_shift); - - // Restore the sign bit. - qcoeff = veorq_s16(qcoeff, coeff_sign); - qcoeff = vsubq_s16(qcoeff, coeff_sign); - - qcoeff = vandq_s16(qcoeff, zbin_mask); + const int16x8_t qcoeff = + quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, + quant, quant_shift, dequant); // Set non-zero elements to -1 and use that to extract values for eob. eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); + __builtin_prefetch(coeff_ptr + 64); coeff_ptr += 8; iscan += 8; - - store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; - - calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr); dqcoeff_ptr += 8; } { - const int16x8_t zbin = vrshrq_n_s16(vdupq_n_s16(zbin_ptr[1]), 1); - const int16x8_t round = vrshrq_n_s16(vdupq_n_s16(round_ptr[1]), 1); - const int16x8_t quant = vdupq_n_s16(quant_ptr[1]); - const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]); - const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]); + zbin = vdupq_lane_s16(vget_low_s16(zbin), 1); + round = vdupq_lane_s16(vget_low_s16(round), 1); + quant = vdupq_lane_s16(vget_low_s16(quant), 1); + quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1); + dequant = vdupq_lane_s16(vget_low_s16(dequant), 1); for (i = 1; i < 32 * 32 / 8; ++i) { // Add one because the eob is not its index. const uint16x8_t v_iscan = vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); - const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); - const int16x8_t coeff_abs = vabsq_s16(coeff); - - const int16x8_t zbin_mask = - vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); - - const int16x8_t rounded = vqaddq_s16(coeff_abs, round); - - // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 - int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); - - qcoeff = vaddq_s16(qcoeff, rounded); - - // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15 - qcoeff = vqdmulhq_s16(qcoeff, quant_shift); - - // Restore the sign bit. 
- qcoeff = veorq_s16(qcoeff, coeff_sign); - qcoeff = vsubq_s16(qcoeff, coeff_sign); - - qcoeff = vandq_s16(qcoeff, zbin_mask); + const int16x8_t qcoeff = + quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, + quant, quant_shift, dequant); // Set non-zero elements to -1 and use that to extract values for eob. eob_max = vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); + __builtin_prefetch(coeff_ptr + 64); coeff_ptr += 8; iscan += 8; - - store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; - - calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr); dqcoeff_ptr += 8; } } @@ -310,4 +297,8 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // __aarch64__ + // Need these here, else the compiler complains about mixing declarations and + // code in C90 + (void)n_coeffs; + (void)scan; } From a3c9b9126decd2f9933005cf215107518a4413cb Mon Sep 17 00:00:00 2001 From: clang-format Date: Sat, 13 Aug 2022 10:33:56 -0700 Subject: [PATCH 412/926] .clang-format: update to clang-format-11 only store the deltas from --style Google in the file and reapply using Debian clang-format version 11.1.0-6+build1 Bug: b/229626362 Change-Id: I3e18a2e7c17a90a48405b3cf1b37ebc652aba0db --- .clang-format | 142 +-------------------------- examples/vp9_spatial_svc_encoder.c | 4 +- vp8/common/mips/dspr2/filter_dspr2.c | 12 +-- vp8/common/mips/msa/vp8_macros_msa.h | 18 ++-- vp8/encoder/encodemv.c | 10 +- vp8/encoder/firstpass.c | 31 +++--- vp8/encoder/mcomp.c | 29 +++--- vp8/encoder/onyx_if.c | 9 +- vp8/encoder/rdopt.c | 2 +- vp9/decoder/vp9_decodemv.c | 4 +- vp9/decoder/vp9_detokenize.c | 15 +-- vp9/encoder/vp9_aq_cyclicrefresh.c | 2 +- vp9/encoder/vp9_bitstream.c | 6 +- vp9/encoder/vp9_denoiser.c | 7 +- vp9/encoder/vp9_firstpass.c | 9 +- vp9/encoder/vp9_temporal_filter.c | 12 +-- vpx_dsp/mips/macros_msa.h | 46 ++++----- y4menc.c | 41 ++++---- 18 files changed, 129 insertions(+), 270 deletions(-) diff --git a/.clang-format b/.clang-format index 866b7e2117..a8bc4967c3 100644 --- a/.clang-format +++ b/.clang-format @@ -1,149 +1,9 @@ --- Language: Cpp -# BasedOnStyle: Google -# Generated with clang-format 7.0.1 -AccessModifierOffset: -1 -AlignAfterOpenBracket: Align -AlignConsecutiveAssignments: false -AlignConsecutiveDeclarations: false -AlignEscapedNewlines: Left -AlignOperands: true -AlignTrailingComments: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortBlocksOnASingleLine: false +BasedOnStyle: Google AllowShortCaseLabelsOnASingleLine: true -AllowShortFunctionsOnASingleLine: All -AllowShortIfStatementsOnASingleLine: true -AllowShortLoopsOnASingleLine: true -AlwaysBreakAfterDefinitionReturnType: None -AlwaysBreakAfterReturnType: None -AlwaysBreakBeforeMultilineStrings: true -AlwaysBreakTemplateDeclarations: true -BinPackArguments: true -BinPackParameters: true -BraceWrapping: - AfterClass: false - AfterControlStatement: false - AfterEnum: false - AfterFunction: false - AfterNamespace: false - AfterObjCDeclaration: false - AfterStruct: false - AfterUnion: false - AfterExternBlock: false - BeforeCatch: false - BeforeElse: false - IndentBraces: false - SplitEmptyFunction: true - SplitEmptyRecord: true - SplitEmptyNamespace: true -BreakBeforeBinaryOperators: None -BreakBeforeBraces: Attach -BreakBeforeInheritanceComma: false -BreakInheritanceList: BeforeColon -BreakBeforeTernaryOperators: true -BreakConstructorInitializersBeforeComma: false -BreakConstructorInitializers: BeforeColon 
-BreakAfterJavaFieldAnnotations: false
-BreakStringLiterals: true
-ColumnLimit: 80
-CommentPragmas: '^ IWYU pragma:'
-CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: false
-ConstructorInitializerIndentWidth: 4
-ContinuationIndentWidth: 4
Cpp11BracedListStyle: false
DerivePointerAlignment: false
-DisableFormat: false
-ExperimentalAutoDetectBinPacking: false
-FixNamespaceComments: true
-ForEachMacros: - foreach - Q_FOREACH - BOOST_FOREACH
-IncludeBlocks: Preserve
-IncludeCategories: - Regex: '^<ext/.*\.h>' Priority: 2 - Regex: '^<.*\.h>' Priority: 1 - Regex: '^<.*' Priority: 2 - Regex: '.*' Priority: 3
-IncludeIsMainRegex: '([-_](test|unittest))?$'
-IndentCaseLabels: true
-IndentPPDirectives: None
-IndentWidth: 2
-IndentWrappedFunctionNames: false
-JavaScriptQuotes: Leave
-JavaScriptWrapImports: true
-KeepEmptyLinesAtTheStartOfBlocks: false
-MacroBlockBegin: ''
-MacroBlockEnd: ''
-MaxEmptyLinesToKeep: 1
-NamespaceIndentation: None
-ObjCBinPackProtocolList: Never
-ObjCBlockIndentWidth: 2
-ObjCSpaceAfterProperty: false
-ObjCSpaceBeforeProtocolList: false
-PenaltyBreakAssignment: 2
-PenaltyBreakBeforeFirstCallParameter: 1
-PenaltyBreakComment: 300
-PenaltyBreakFirstLessLess: 120
-PenaltyBreakTemplateDeclaration: 10
-PenaltyBreakString: 1000
-PenaltyExcessCharacter: 1000000
-PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Right
-RawStringFormats: - Language: Cpp Delimiters: - cc - CC - cpp - Cpp - CPP - 'c++' - 'C++' CanonicalDelimiter: '' BasedOnStyle: google - Language: TextProto Delimiters: - pb - PB - proto - PROTO EnclosingFunctions: - EqualsProto - EquivToProto - PARSE_PARTIAL_TEXT_PROTO - PARSE_TEST_PROTO - PARSE_TEXT_PROTO - ParseTextOrDie - ParseTextProtoOrDie CanonicalDelimiter: '' BasedOnStyle: google
-ReflowComments: true
SortIncludes: false
-SortUsingDeclarations: true
-SpaceAfterCStyleCast: false
-SpaceAfterTemplateKeyword: true
-SpaceBeforeAssignmentOperators: true
-SpaceBeforeCpp11BracedList: false
-SpaceBeforeCtorInitializerColon: true
-SpaceBeforeInheritanceColon: true
-SpaceBeforeParens: ControlStatements
-SpaceBeforeRangeBasedForLoopColon: true
-SpaceInEmptyParentheses: false
-SpacesBeforeTrailingComments: 2
-SpacesInAngles: false
-SpacesInContainerLiterals: true
-SpacesInCStyleCastParentheses: false
-SpacesInParentheses: false
-SpacesInSquareBrackets: false
-Standard: Auto
-TabWidth: 8
-UseTab: Never
-...
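The visible effect of the remaining deltas is almost entirely clang-format 11's column-aligned layout for chained conditional operators. A minimal illustration (hypothetical clamp helper, not part of the patch) of the shape the hunks below converge on:

int clamp3(int v, int lo, int hi) {
  return (v < lo)   ? lo
         : (v > hi) ? hi
                    : v;
}

Every hunk that follows is this reflow applied mechanically across the tree.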
- diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c index e85dbf8e71..d287e58319 100644 --- a/examples/vp9_spatial_svc_encoder.c +++ b/examples/vp9_spatial_svc_encoder.c @@ -1146,7 +1146,9 @@ int main(int argc, const char **argv) { cx_pkt->data.twopass_stats.sz); break; } - default: { break; } + default: { + break; + } } #if CONFIG_VP9_DECODER && !SIMULCAST_MODE diff --git a/vp8/common/mips/dspr2/filter_dspr2.c b/vp8/common/mips/dspr2/filter_dspr2.c index e46827b0e4..b9da52084d 100644 --- a/vp8/common/mips/dspr2/filter_dspr2.c +++ b/vp8/common/mips/dspr2/filter_dspr2.c @@ -816,8 +816,8 @@ void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr, : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr) - : [src_pixels_per_line] "r"(src_pixels_per_line), - [output_ptr] "r"(output_ptr)); + : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"( + output_ptr)); __asm__ __volatile__( "ulw %[Temp1], 0(%[src_ptr]) \n\t" @@ -832,8 +832,8 @@ void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr, : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr) - : [src_pixels_per_line] "r"(src_pixels_per_line), - [output_ptr] "r"(output_ptr)); + : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"( + output_ptr)); __asm__ __volatile__( "ulw %[Temp1], 0(%[src_ptr]) \n\t" @@ -848,8 +848,8 @@ void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr, : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr) - : [src_pixels_per_line] "r"(src_pixels_per_line), - [output_ptr] "r"(output_ptr)); + : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"( + output_ptr)); output_ptr += 48; } diff --git a/vp8/common/mips/msa/vp8_macros_msa.h b/vp8/common/mips/msa/vp8_macros_msa.h index fde22f537c..7cb3c98690 100644 --- a/vp8/common/mips/msa/vp8_macros_msa.h +++ b/vp8/common/mips/msa/vp8_macros_msa.h @@ -122,10 +122,11 @@ const uint8_t *psrc_m = (const uint8_t *)(psrc); \ uint32_t val_m; \ \ - asm volatile("lwr %[val_m], 0(%[psrc_m]) \n\t" \ - "lwl %[val_m], 3(%[psrc_m]) \n\t" \ - : [val_m] "=&r"(val_m) \ - : [psrc_m] "r"(psrc_m)); \ + asm volatile( \ + "lwr %[val_m], 0(%[psrc_m]) \n\t" \ + "lwl %[val_m], 3(%[psrc_m]) \n\t" \ + : [val_m] "=&r"(val_m) \ + : [psrc_m] "r"(psrc_m)); \ \ val_m; \ }) @@ -136,10 +137,11 @@ const uint8_t *psrc_m = (const uint8_t *)(psrc); \ uint64_t val_m = 0; \ \ - asm volatile("ldr %[val_m], 0(%[psrc_m]) \n\t" \ - "ldl %[val_m], 7(%[psrc_m]) \n\t" \ - : [val_m] "=&r"(val_m) \ - : [psrc_m] "r"(psrc_m)); \ + asm volatile( \ + "ldr %[val_m], 0(%[psrc_m]) \n\t" \ + "ldl %[val_m], 7(%[psrc_m]) \n\t" \ + : [val_m] "=&r"(val_m) \ + : [psrc_m] "r"(psrc_m)); \ \ val_m; \ }) diff --git a/vp8/encoder/encodemv.c b/vp8/encoder/encodemv.c index c88ea1653e..384bb29389 100644 --- a/vp8/encoder/encodemv.c +++ b/vp8/encoder/encodemv.c @@ -31,17 +31,15 @@ static void encode_mvcomponent(vp8_writer *const w, const int v, vp8_write(w, 1, p[mvpis_short]); - do + do { vp8_write(w, (x >> i) & 1, p[MVPbits + i]); - - while (++i < 3); + } while (++i < 3); i = mvlong_width - 1; /* Skip bit 3, which is sometimes implicit */ - do + do { vp8_write(w, (x >> i) & 1, p[MVPbits + i]); - - while (--i > 3); + } while (--i > 3); if (x & 0xFFF0) vp8_write(w, (x >> 3) & 1, p[MVPbits + 3]); } diff --git a/vp8/encoder/firstpass.c 
b/vp8/encoder/firstpass.c index ed177e3cb6..65d2681c91 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -903,9 +903,9 @@ static double calc_correction_factor(double err_per_mb, double err_devisor, correction_factor = pow(error_term, power_term); /* Clip range */ - correction_factor = (correction_factor < 0.05) - ? 0.05 - : (correction_factor > 5.0) ? 5.0 : correction_factor; + correction_factor = (correction_factor < 0.05) ? 0.05 + : (correction_factor > 5.0) ? 5.0 + : correction_factor; return correction_factor; } @@ -947,11 +947,10 @@ static int estimate_max_q(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats, } cpi->twopass.est_max_qcorrection_factor = - (cpi->twopass.est_max_qcorrection_factor < 0.1) - ? 0.1 - : (cpi->twopass.est_max_qcorrection_factor > 10.0) - ? 10.0 - : cpi->twopass.est_max_qcorrection_factor; + (cpi->twopass.est_max_qcorrection_factor < 0.1) ? 0.1 + : (cpi->twopass.est_max_qcorrection_factor > 10.0) + ? 10.0 + : cpi->twopass.est_max_qcorrection_factor; } /* Corrections for higher compression speed settings @@ -1178,10 +1177,9 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, } else { current_spend_ratio = (double)cpi->long_rolling_actual_bits / (double)cpi->long_rolling_target_bits; - current_spend_ratio = - (current_spend_ratio > 10.0) - ? 10.0 - : (current_spend_ratio < 0.1) ? 0.1 : current_spend_ratio; + current_spend_ratio = (current_spend_ratio > 10.0) ? 10.0 + : (current_spend_ratio < 0.1) ? 0.1 + : current_spend_ratio; } /* Calculate a correction factor based on the quality of prediction in @@ -1968,11 +1966,10 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { } cpi->twopass.gf_group_bits = - (cpi->twopass.gf_group_bits < 0) - ? 0 - : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits) - ? cpi->twopass.kf_group_bits - : cpi->twopass.gf_group_bits; + (cpi->twopass.gf_group_bits < 0) ? 0 + : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits) + ? cpi->twopass.kf_group_bits + : cpi->twopass.gf_group_bits; /* Clip cpi->twopass.gf_group_bits based on user supplied data rate * variability limit (cpi->oxcf.two_pass_vbrmax_section) diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index ae092c66e1..b92e2135e9 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -204,20 +204,21 @@ void vp8_init3smotion_compensation(MACROBLOCK *x, int stride) { /* returns distortion + motion vector cost */ #define ERR(r, c) (MVC(r, c) + DIST(r, c)) /* checks if (r,c) has better score than previous best */ -#define CHECK_BETTER(v, r, c) \ - do { \ - IFMVCV(r, c, \ - { \ - thismse = DIST(r, c); \ - if ((v = (MVC(r, c) + thismse)) < besterr) { \ - besterr = v; \ - br = r; \ - bc = c; \ - *distortion = thismse; \ - *sse1 = sse; \ - } \ - }, \ - v = UINT_MAX;) \ +#define CHECK_BETTER(v, r, c) \ + do { \ + IFMVCV( \ + r, c, \ + { \ + thismse = DIST(r, c); \ + if ((v = (MVC(r, c) + thismse)) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + }, \ + v = UINT_MAX;) \ } while (0) int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index ffb3867dd1..94fb6e256e 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -4202,11 +4202,10 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } /* Clamp cpi->zbin_over_quant */ - cpi->mb.zbin_over_quant = (cpi->mb.zbin_over_quant < zbin_oq_low) - ? 
zbin_oq_low - : (cpi->mb.zbin_over_quant > zbin_oq_high) - ? zbin_oq_high - : cpi->mb.zbin_over_quant; + cpi->mb.zbin_over_quant = + (cpi->mb.zbin_over_quant < zbin_oq_low) ? zbin_oq_low + : (cpi->mb.zbin_over_quant > zbin_oq_high) ? zbin_oq_high + : cpi->mb.zbin_over_quant; Loop = Q != last_q; } else {
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 5821fc7346..bbddacf8f0 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -1608,7 +1608,7 @@ static int evaluate_inter_mode_rd(int mdcounts[4], RATE_DISTORTION *rd, unsigned int q2dc = xd->block[24].dequant[0]; /* If theres is no codeable 2nd order dc or a very small uniform pixel change change */ - if ((sse - var<q2dc * q2dc>> 4) || (sse / 2 > var && sse - var < 64)) { + if ((sse - var < q2dc * q2dc >> 4) || (sse / 2 > var && sse - var < 64)) { /* Check u and v to make sure skip is ok */ unsigned int sse2 = VP8_UVSSE(x); if (sse2 * 2 < threshold) {
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index f4bfb785f7..db3e746639 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -426,7 +426,9 @@ static INLINE int assign_mv(VP9_COMMON *cm, MACROBLOCKD *xd, zero_mv_pair(mv); break; } - default: { return 0; } + default: { + return 0; + } } return ret; }
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index c2e6b3d545..3ed1bd6ffa 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -133,17 +133,18 @@ static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type, int16_t dqv = dq[0]; const uint8_t *const cat6_prob = #if CONFIG_VP9_HIGHBITDEPTH - (xd->bd == VPX_BITS_12) - ? vp9_cat6_prob_high12 - : (xd->bd == VPX_BITS_10) ? vp9_cat6_prob_high12 + 2 : + (xd->bd == VPX_BITS_12) ? vp9_cat6_prob_high12 + : (xd->bd == VPX_BITS_10) ? vp9_cat6_prob_high12 + 2 + : #endif // CONFIG_VP9_HIGHBITDEPTH - vp9_cat6_prob; + vp9_cat6_prob; const int cat6_bits = #if CONFIG_VP9_HIGHBITDEPTH - (xd->bd == VPX_BITS_12) ? 18 - : (xd->bd == VPX_BITS_10) ? 16 : + (xd->bd == VPX_BITS_12) ? 18 + : (xd->bd == VPX_BITS_10) ? 16 + : #endif // CONFIG_VP9_HIGHBITDEPTH - 14; + 14; // Keep value, range, and count as locals. The compiler produces better // results with the locals than using r directly. BD_VALUE value = r->value;
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index e336179e90..90792aebea 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -471,7 +471,7 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { cr->sb_index = i; cr->reduce_refresh = 0; if (cpi->oxcf.content != VP9E_CONTENT_SCREEN) - if (count_sel<(3 * count_tot)>> 2) cr->reduce_refresh = 1; + if (count_sel < (3 * count_tot) >> 2) cr->reduce_refresh = 1; } // Set cyclic refresh parameters.
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 75bd097f24..a84c8b524f 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -134,9 +134,9 @@ static void pack_mb_tokens(vpx_writer *w, TOKENEXTRA **tp, const TOKENEXTRA *p; const vp9_extra_bit *const extra_bits = #if CONFIG_VP9_HIGHBITDEPTH - (bit_depth == VPX_BITS_12) - ? vp9_extra_bits_high12 - : (bit_depth == VPX_BITS_10) ?
vp9_extra_bits_high10 + : vp9_extra_bits; #else vp9_extra_bits; (void)bit_depth; diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c index 2885223b59..77d72396ae 100644 --- a/vp9/encoder/vp9_denoiser.c +++ b/vp9/encoder/vp9_denoiser.c @@ -233,7 +233,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation( frame == ALTREF_FRAME || (frame == GOLDEN_FRAME && use_gf_temporal_ref) || (frame != LAST_FRAME && - ((ctx->zeromv_lastref_sse<(5 * ctx->zeromv_sse)>> 2) || + ((ctx->zeromv_lastref_sse < (5 * ctx->zeromv_sse) >> 2) || denoiser->denoising_level >= kDenHigh))) { frame = LAST_FRAME; ctx->newmv_sse = ctx->zeromv_lastref_sse; @@ -764,8 +764,9 @@ int64_t vp9_scale_acskip_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level, int abs_sumdiff, int temporal_layer_id) { if (noise_level >= kDenLow && abs_sumdiff < 5) - return threshold *= - (noise_level == kDenLow) ? 2 : (temporal_layer_id == 2) ? 10 : 6; + return threshold *= (noise_level == kDenLow) ? 2 + : (temporal_layer_id == 2) ? 10 + : 6; else return threshold; } diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 4682cc0030..e9250e25c0 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -2113,11 +2113,10 @@ static int64_t calculate_total_gf_group_bits(VP9_COMP *cpi, } // Clamp odd edge cases. - total_group_bits = (total_group_bits < 0) - ? 0 - : (total_group_bits > twopass->kf_group_bits) - ? twopass->kf_group_bits - : total_group_bits; + total_group_bits = (total_group_bits < 0) ? 0 + : (total_group_bits > twopass->kf_group_bits) + ? twopass->kf_group_bits + : total_group_bits; // Clip based on user supplied data rate variability limit. if (total_group_bits > (int64_t)max_bits * gop_frames) diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index 701bb89287..8af30c42aa 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -777,16 +777,16 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, // Assign higher weight to matching MB if it's error // score is lower. If not applying MC default behavior // is to weight all MBs equal. - blk_fw[0] = err < (thresh_low << THR_SHIFT) - ? 2 - : err < (thresh_high << THR_SHIFT) ? 1 : 0; + blk_fw[0] = err < (thresh_low << THR_SHIFT) ? 2 + : err < (thresh_high << THR_SHIFT) ? 1 + : 0; blk_fw[1] = blk_fw[2] = blk_fw[3] = blk_fw[0]; } else { use_32x32 = 0; for (k = 0; k < 4; k++) - blk_fw[k] = blk_bestsme[k] < thresh_low - ? 2 - : blk_bestsme[k] < thresh_high ? 1 : 0; + blk_fw[k] = blk_bestsme[k] < thresh_low ? 2 + : blk_bestsme[k] < thresh_high ? 
1 + : 0; } for (k = 0; k < 4; k++) { diff --git a/vpx_dsp/mips/macros_msa.h b/vpx_dsp/mips/macros_msa.h index 3c2f50c790..d54ce53684 100644 --- a/vpx_dsp/mips/macros_msa.h +++ b/vpx_dsp/mips/macros_msa.h @@ -83,31 +83,33 @@ val_lh_m; \ }) -#define LW(psrc) \ - ({ \ - const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \ - uint32_t val_lw_m; \ - \ - __asm__ __volatile__("lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \ - "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \ - : [val_lw_m] "=&r"(val_lw_m) \ - : [psrc_lw_m] "r"(psrc_lw_m)); \ - \ - val_lw_m; \ +#define LW(psrc) \ + ({ \ + const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \ + uint32_t val_lw_m; \ + \ + __asm__ __volatile__( \ + "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \ + "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \ + : [val_lw_m] "=&r"(val_lw_m) \ + : [psrc_lw_m] "r"(psrc_lw_m)); \ + \ + val_lw_m; \ }) #if (__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ - uint64_t val_ld_m = 0; \ - \ - __asm__ __volatile__("ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \ - "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \ - : [val_ld_m] "=&r"(val_ld_m) \ - : [psrc_ld_m] "r"(psrc_ld_m)); \ - \ - val_ld_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ + uint64_t val_ld_m = 0; \ + \ + __asm__ __volatile__( \ + "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \ + "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \ + : [val_ld_m] "=&r"(val_ld_m) \ + : [psrc_ld_m] "r"(psrc_ld_m)); \ + \ + val_ld_m; \ }) #else // !(__mips == 64) #define LD(psrc) \ diff --git a/y4menc.c b/y4menc.c index 02b729e5bb..1877981279 100644 --- a/y4menc.c +++ b/y4menc.c @@ -17,39 +17,34 @@ int y4m_write_file_header(char *buf, size_t len, int width, int height, const char *color; switch (bit_depth) { case 8: - color = fmt == VPX_IMG_FMT_I444 - ? "C444\n" - : fmt == VPX_IMG_FMT_I422 ? "C422\n" : "C420jpeg\n"; + color = fmt == VPX_IMG_FMT_I444 ? "C444\n" + : fmt == VPX_IMG_FMT_I422 ? "C422\n" + : "C420jpeg\n"; break; case 9: - color = fmt == VPX_IMG_FMT_I44416 - ? "C444p9 XYSCSS=444P9\n" - : fmt == VPX_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9\n" - : "C420p9 XYSCSS=420P9\n"; + color = fmt == VPX_IMG_FMT_I44416 ? "C444p9 XYSCSS=444P9\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9\n" + : "C420p9 XYSCSS=420P9\n"; break; case 10: - color = fmt == VPX_IMG_FMT_I44416 - ? "C444p10 XYSCSS=444P10\n" - : fmt == VPX_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10\n" - : "C420p10 XYSCSS=420P10\n"; + color = fmt == VPX_IMG_FMT_I44416 ? "C444p10 XYSCSS=444P10\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10\n" + : "C420p10 XYSCSS=420P10\n"; break; case 12: - color = fmt == VPX_IMG_FMT_I44416 - ? "C444p12 XYSCSS=444P12\n" - : fmt == VPX_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12\n" - : "C420p12 XYSCSS=420P12\n"; + color = fmt == VPX_IMG_FMT_I44416 ? "C444p12 XYSCSS=444P12\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12\n" + : "C420p12 XYSCSS=420P12\n"; break; case 14: - color = fmt == VPX_IMG_FMT_I44416 - ? "C444p14 XYSCSS=444P14\n" - : fmt == VPX_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14\n" - : "C420p14 XYSCSS=420P14\n"; + color = fmt == VPX_IMG_FMT_I44416 ? "C444p14 XYSCSS=444P14\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14\n" + : "C420p14 XYSCSS=420P14\n"; break; case 16: - color = fmt == VPX_IMG_FMT_I44416 - ? "C444p16 XYSCSS=444P16\n" - : fmt == VPX_IMG_FMT_I42216 ? "C422p16 XYSCSS=422P16\n" - : "C420p16 XYSCSS=420P16\n"; + color = fmt == VPX_IMG_FMT_I44416 ? "C444p16 XYSCSS=444P16\n" + : fmt == VPX_IMG_FMT_I42216 ? 
"C422p16 XYSCSS=422P16\n" + : "C420p16 XYSCSS=420P16\n"; break; default: color = NULL; assert(0); } From 2c7657202e839a661997f08c57554523901ad26d Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 24 Aug 2022 15:48:24 -0700 Subject: [PATCH 413/926] vp8_ratectrl_rtc_test.cc: ensure frame_type is initialized this fixes a valgrind failure: ==1095597== Conditional jump or move depends on uninitialised value(s) ==1095597== at 0x12E0CC: (anonymous namespace)::Vp8RcInterfaceTest::PreEncodeFrameHook(libvpx_test::VideoSource*, libvpx_test:: > Encoder*) (vp8_ratectrl_rtc_test.cc:131) ==1095597== by 0x1255A9: libvpx_test::EncoderTest::RunLoop(libvpx_test::VideoSource*) (encode_test_driver.cc:205) Bug: webm:1776 Change-Id: Id3b40f62573ee513e79c74b6315c71b6ecd22c9a Fixed: webm:1776 --- test/vp8_ratectrl_rtc_test.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/vp8_ratectrl_rtc_test.cc b/test/vp8_ratectrl_rtc_test.cc index ad310666e7..7410f3c01d 100644 --- a/test/vp8_ratectrl_rtc_test.cc +++ b/test/vp8_ratectrl_rtc_test.cc @@ -127,8 +127,7 @@ class Vp8RcInterfaceTest encoder->Control(VP8E_SET_CPUUSED, -6); encoder->Control(VP8E_SET_RTC_EXTERNAL_RATECTRL, 1); encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000); - } - if (frame_params_.frame_type == INTER_FRAME) { + } else if (frame_params_.frame_type == INTER_FRAME) { // Disable golden frame update. frame_flags_ |= VP8_EFLAG_NO_UPD_GF; frame_flags_ |= VP8_EFLAG_NO_UPD_ARF; From 7663fcb46733253f7ac6625aadc89cea08d942bf Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 24 Aug 2022 18:50:10 -0700 Subject: [PATCH 414/926] libs.doxy_template: remove obsolete CLASS_DIAGRAMS This was reported with doxygen 1.9.4. Also update the comment for CLASS_GRAPH by running "doxygen -u" because the original comment for CLASS_GRAPH mentions the obsolete tag 'CLASS_DIAGRAMS', Change-Id: I3bca547201f794d363bd814b7c7f7c9d7088797a --- libs.doxy_template | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/libs.doxy_template b/libs.doxy_template index 73e1b43c72..1ee442af3e 100644 --- a/libs.doxy_template +++ b/libs.doxy_template @@ -1097,15 +1097,6 @@ EXTERNAL_GROUPS = YES # Configuration options related to the dot tool #--------------------------------------------------------------------------- -# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will -# generate a inheritance diagram (in HTML, RTF and la_te_x) for classes with base -# or super classes. Setting the tag to NO turns the diagrams off. Note that -# this option is superseded by the HAVE_DOT option below. This is only a -# fallback. It is recommended to install and use dot, since it yields more -# powerful graphs. - -CLASS_DIAGRAMS = YES - # If set to YES, the inheritance and collaboration graphs will hide # inheritance and usage relations if the target is undocumented # or is not a class. @@ -1119,10 +1110,14 @@ HIDE_UNDOC_RELATIONS = YES HAVE_DOT = NO -# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen -# will generate a graph for each documented class showing the direct and -# indirect inheritance relations. Setting this tag to YES will force the -# the CLASS_DIAGRAMS tag to NO. +# If the CLASS_GRAPH tag is set to YES (or GRAPH) then doxygen will generate a +# graph for each documented class showing the direct and indirect inheritance +# relations. In case HAVE_DOT is set as well dot will be used to draw the graph, +# otherwise the built-in generator will be used. 
If the CLASS_GRAPH tag is set +# to TEXT the direct and indirect inheritance relations will be shown as texts / +# links. +# Possible values are: NO, YES, TEXT and GRAPH. +# The default value is: YES. CLASS_GRAPH = YES From 722d4daf3581703c10c3ff79022d5b92991be832 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 25 Aug 2022 10:50:16 -0700 Subject: [PATCH 415/926] vpx_encoder.h: note VPX_ERROR_RESILIENT_PARTITIONS is VP8-only Change-Id: If71b2ec766f9f41253ce5a34987ffd208f9c8381 --- vpx/vpx_encoder.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index e776ec8136..efaf5ef366 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -136,7 +136,8 @@ typedef uint32_t vpx_codec_er_flags_t; /*!\brief The frame partitions are independently decodable by the bool decoder, * meaning that partitions can be decoded even though earlier partitions have * been lost. Note that intra prediction is still done over the partition - * boundary. */ + * boundary. + * \note This is only supported by VP8.*/ #define VPX_ERROR_RESILIENT_PARTITIONS 0x2u /*!\brief Encoder output packet variants From 13970b7ecaa539fcb311325520d981ef006db105 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 24 Aug 2022 12:28:43 +0000 Subject: [PATCH 416/926] [NEON] Add highbd *variance* functions Total gain for 12-bit encoding: * ~7.2% for best profile * ~5.8% for rt profile Change-Id: I5b70415fb89d1bbb02a0c139eb317ba6b08adede --- test/variance_test.cc | 218 +++++++++++++ vpx_dsp/arm/highbd_variance_neon.c | 502 +++++++++++++++++++++++++++++ vpx_dsp/variance.c | 6 +- vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 244 +++++++------- 5 files changed, 860 insertions(+), 111 deletions(-) create mode 100644 vpx_dsp/arm/highbd_variance_neon.c diff --git a/test/variance_test.cc b/test/variance_test.cc index 80855052dc..8aed5d2ed9 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1495,6 +1495,224 @@ INSTANTIATE_TEST_SUITE_P( SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_neon, 0), SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_neon, 0), SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_neon, 0))); + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + NEON, VpxHBDVarianceTest, + ::testing::Values( + VarianceParams(6, 6, &vpx_highbd_12_variance64x64_neon, 12), + VarianceParams(6, 5, &vpx_highbd_12_variance64x32_neon, 12), + VarianceParams(5, 6, &vpx_highbd_12_variance32x64_neon, 12), + VarianceParams(5, 5, &vpx_highbd_12_variance32x32_neon, 12), + VarianceParams(5, 4, &vpx_highbd_12_variance32x16_neon, 12), + VarianceParams(4, 5, &vpx_highbd_12_variance16x32_neon, 12), + VarianceParams(4, 4, &vpx_highbd_12_variance16x16_neon, 12), + VarianceParams(4, 3, &vpx_highbd_12_variance16x8_neon, 12), + VarianceParams(3, 4, &vpx_highbd_12_variance8x16_neon, 12), + VarianceParams(3, 3, &vpx_highbd_12_variance8x8_neon, 12), + VarianceParams(3, 2, &vpx_highbd_12_variance8x4_neon, 12), + VarianceParams(2, 3, &vpx_highbd_12_variance4x8_neon, 12), + VarianceParams(2, 2, &vpx_highbd_12_variance4x4_neon, 12), + VarianceParams(6, 6, &vpx_highbd_10_variance64x64_neon, 10), + VarianceParams(6, 5, &vpx_highbd_10_variance64x32_neon, 10), + VarianceParams(5, 6, &vpx_highbd_10_variance32x64_neon, 10), + VarianceParams(5, 5, &vpx_highbd_10_variance32x32_neon, 10), + VarianceParams(5, 4, &vpx_highbd_10_variance32x16_neon, 10), + VarianceParams(4, 5, &vpx_highbd_10_variance16x32_neon, 10), + VarianceParams(4, 4, 
&vpx_highbd_10_variance16x16_neon, 10), + VarianceParams(4, 3, &vpx_highbd_10_variance16x8_neon, 10), + VarianceParams(3, 4, &vpx_highbd_10_variance8x16_neon, 10), + VarianceParams(3, 3, &vpx_highbd_10_variance8x8_neon, 10), + VarianceParams(3, 2, &vpx_highbd_10_variance8x4_neon, 10), + VarianceParams(2, 3, &vpx_highbd_10_variance4x8_neon, 10), + VarianceParams(2, 2, &vpx_highbd_10_variance4x4_neon, 10), + VarianceParams(6, 6, &vpx_highbd_8_variance64x64_neon, 8), + VarianceParams(6, 5, &vpx_highbd_8_variance64x32_neon, 8), + VarianceParams(5, 6, &vpx_highbd_8_variance32x64_neon, 8), + VarianceParams(5, 5, &vpx_highbd_8_variance32x32_neon, 8), + VarianceParams(5, 4, &vpx_highbd_8_variance32x16_neon, 8), + VarianceParams(4, 5, &vpx_highbd_8_variance16x32_neon, 8), + VarianceParams(4, 4, &vpx_highbd_8_variance16x16_neon, 8), + VarianceParams(4, 3, &vpx_highbd_8_variance16x8_neon, 8), + VarianceParams(3, 4, &vpx_highbd_8_variance8x16_neon, 8), + VarianceParams(3, 3, &vpx_highbd_8_variance8x8_neon, 8), + VarianceParams(3, 2, &vpx_highbd_8_variance8x4_neon, 8), + VarianceParams(2, 3, &vpx_highbd_8_variance4x8_neon, 8), + VarianceParams(2, 2, &vpx_highbd_8_variance4x4_neon, 8))); + +INSTANTIATE_TEST_SUITE_P( + NEON, VpxHBDSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_neon, + 12), + SubpelVarianceParams(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_neon, + 12), + SubpelVarianceParams(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_neon, + 12), + SubpelVarianceParams(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_neon, + 12), + SubpelVarianceParams(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_neon, + 12), + SubpelVarianceParams(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_neon, + 12), + SubpelVarianceParams(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_neon, + 12), + SubpelVarianceParams(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_neon, + 12), + SubpelVarianceParams(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_neon, + 12), + SubpelVarianceParams(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_neon, + 12), + SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_neon, + 12), + SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_neon, + 10), + SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_neon, + 10), + SubpelVarianceParams(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_neon, + 10), + SubpelVarianceParams(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_neon, + 10), + SubpelVarianceParams(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_neon, + 10), + SubpelVarianceParams(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_neon, + 10), + SubpelVarianceParams(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_neon, + 10), + SubpelVarianceParams(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_neon, + 10), + SubpelVarianceParams(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_neon, + 10), + SubpelVarianceParams(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_neon, + 10), + SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_neon, + 10), + SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_neon, + 8), + SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_neon, + 8), + SubpelVarianceParams(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_neon, + 8), + SubpelVarianceParams(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_neon, + 8), + SubpelVarianceParams(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_neon, + 8), + SubpelVarianceParams(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_neon, + 8), + SubpelVarianceParams(4, 4, 
&vpx_highbd_8_sub_pixel_variance16x16_neon, + 8), + SubpelVarianceParams(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_neon, + 8), + SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_neon, + 8), + SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_neon, 8), + SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon, + 8))); + +INSTANTIATE_TEST_SUITE_P( + NEON, VpxHBDSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_12_sub_pixel_avg_variance64x64_neon, + 12), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_12_sub_pixel_avg_variance64x32_neon, + 12), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_12_sub_pixel_avg_variance32x64_neon, + 12), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_12_sub_pixel_avg_variance32x32_neon, + 12), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_12_sub_pixel_avg_variance32x16_neon, + 12), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_12_sub_pixel_avg_variance16x32_neon, + 12), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_12_sub_pixel_avg_variance16x16_neon, + 12), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_12_sub_pixel_avg_variance16x8_neon, + 12), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_12_sub_pixel_avg_variance8x16_neon, + 12), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_12_sub_pixel_avg_variance8x8_neon, + 12), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_12_sub_pixel_avg_variance8x4_neon, + 12), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_10_sub_pixel_avg_variance64x64_neon, + 10), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_10_sub_pixel_avg_variance64x32_neon, + 10), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_10_sub_pixel_avg_variance32x64_neon, + 10), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_10_sub_pixel_avg_variance32x32_neon, + 10), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_10_sub_pixel_avg_variance32x16_neon, + 10), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_10_sub_pixel_avg_variance16x32_neon, + 10), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_10_sub_pixel_avg_variance16x16_neon, + 10), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_10_sub_pixel_avg_variance16x8_neon, + 10), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_10_sub_pixel_avg_variance8x16_neon, + 10), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_10_sub_pixel_avg_variance8x8_neon, + 10), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_10_sub_pixel_avg_variance8x4_neon, + 10), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_8_sub_pixel_avg_variance64x64_neon, + 8), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_8_sub_pixel_avg_variance64x32_neon, + 8), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_8_sub_pixel_avg_variance32x64_neon, + 8), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_8_sub_pixel_avg_variance32x32_neon, + 8), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_8_sub_pixel_avg_variance32x16_neon, + 8), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_8_sub_pixel_avg_variance16x32_neon, + 8), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_8_sub_pixel_avg_variance16x16_neon, + 8), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_8_sub_pixel_avg_variance16x8_neon, + 8), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_8_sub_pixel_avg_variance8x16_neon, + 8), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_8_sub_pixel_avg_variance8x8_neon, + 8), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_8_sub_pixel_avg_variance8x4_neon, + 8))); + +#endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_NEON #if HAVE_MSA diff --git a/vpx_dsp/arm/highbd_variance_neon.c b/vpx_dsp/arm/highbd_variance_neon.c new file mode 100644 
index 0000000000..3a60a14ab8
--- /dev/null
+++ b/vpx_dsp/arm/highbd_variance_neon.c
@@ -0,0 +1,502 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_ports/mem.h"
+
+static const uint8_t bilinear_filters[8][2] = {
+  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+  { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
+};
+
+static INLINE void highbd_variance16(const uint16_t *src_ptr, int src_stride,
+                                     const uint16_t *ref_ptr, int ref_stride,
+                                     int w, int h, uint64_t *sse,
+                                     int64_t *sum) {
+  int i, j;
+
+  if (w >= 8) {
+    int32x4_t sum_s32 = vdupq_n_s32(0);
+    uint32x4_t sse_u32 = vdupq_n_u32(0);
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; j += 8) {
+        const int16x8_t src_s16 = vreinterpretq_s16_u16(vld1q_u16(&src_ptr[j]));
+        const int16x8_t ref_s16 = vreinterpretq_s16_u16(vld1q_u16(&ref_ptr[j]));
+        const int32x4_t diff1_s32 =
+            vsubl_s16(vget_low_s16(src_s16), vget_low_s16(ref_s16));
+        const int32x4_t diff2_s32 =
+            vsubl_s16(vget_high_s16(src_s16), vget_high_s16(ref_s16));
+        const uint32x4_t diff1_u32 = vreinterpretq_u32_s32(diff1_s32);
+        const uint32x4_t diff2_u32 = vreinterpretq_u32_s32(diff2_s32);
+        sum_s32 = vaddq_s32(sum_s32, diff1_s32);
+        sum_s32 = vaddq_s32(sum_s32, diff2_s32);
+        sse_u32 = vmlaq_u32(sse_u32, diff1_u32, diff1_u32);
+        sse_u32 = vmlaq_u32(sse_u32, diff2_u32, diff2_u32);
+      }
+      src_ptr += src_stride;
+      ref_ptr += ref_stride;
+    }
+    *sum = horizontal_add_int32x4(sum_s32);
+    *sse = horizontal_add_uint32x4(sse_u32);
+  } else {
+    int32x4_t sum_s32 = vdupq_n_s32(0);
+    uint32x4_t sse_u32 = vdupq_n_u32(0);
+    assert(w >= 4);
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; j += 4) {
+        const int16x4_t src_s16 = vreinterpret_s16_u16(vld1_u16(&src_ptr[j]));
+        const int16x4_t ref_s16 = vreinterpret_s16_u16(vld1_u16(&ref_ptr[j]));
+        const int32x4_t diff_s32 = vsubl_s16(src_s16, ref_s16);
+        const uint32x4_t diff_u32 = vreinterpretq_u32_s32(diff_s32);
+        sum_s32 = vaddq_s32(sum_s32, diff_s32);
+        sse_u32 = vmlaq_u32(sse_u32, diff_u32, diff_u32);
+      }
+      src_ptr += src_stride;
+      ref_ptr += ref_stride;
+    }
+    *sum = horizontal_add_int32x4(sum_s32);
+    *sse = horizontal_add_uint32x4(sse_u32);
+  }
+}
+
+static INLINE void highbd_variance64(const uint8_t *src8_ptr, int src_stride,
+                                     const uint8_t *ref8_ptr, int ref_stride,
+                                     int w, int h, uint64_t *sse,
+                                     int64_t *sum) {
+  uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr);
+  uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref8_ptr);
+
+  if (w < 32 && h < 32) {
+    highbd_variance16(src_ptr, src_stride, ref_ptr, ref_stride, w, h, sse,
+                      sum);
+  } else {
+    uint64_t sse_long = 0;
+    int64_t sum_long = 0;
+    int k, l;
+    for (k = 0; k + 16 <= h; k += 16) {
+      for (l = 0; l + 16 <= w; l += 16) {
+        uint64_t sse_tmp = 0;
+        int64_t sum_tmp = 0;
+        highbd_variance16(src_ptr + l, src_stride, ref_ptr + l, ref_stride, 16,
+                          16, &sse_tmp, &sum_tmp);
+        sum_long += sum_tmp;
+        sse_long += sse_tmp;
+      }
+      src_ptr += 16 * src_stride;
+      ref_ptr += 16 * ref_stride;
+    }
+    *sum = sum_long;
+    *sse = sse_long;
+  }
+}
+
+static INLINE void
highbd_8_variance(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, + int w, int h, uint32_t *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, + &sum_long); + *sse = (uint32_t)sse_long; + *sum = (int)sum_long; +} + +static INLINE void highbd_10_variance(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, + int w, int h, uint32_t *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, + &sum_long); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); +} + +static INLINE void highbd_12_variance(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, + int w, int h, uint32_t *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, + &sum_long); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); +} + +#define HIGHBD_VAR(W, H) \ + uint32_t vpx_highbd_8_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } \ + \ + uint32_t vpx_highbd_10_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + uint32_t vpx_highbd_12_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } + +#define HIGHBD_GET_VAR(S) \ + void vpx_highbd_8_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ + sum); \ + } \ + \ + void vpx_highbd_10_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ + sum); \ + } \ + \ + void vpx_highbd_12_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ + sum); \ + } + +#define HIGHBD_MSE(W, H) \ + uint32_t vpx_highbd_8_mse##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_10_mse##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_12_mse##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse; \ + } + +static INLINE void highbd_var_filter_block2d_bil_first_pass( + const uint8_t *src_ptr8, uint16_t *output_ptr, + unsigned int src_pixels_per_line, int pixel_step, + unsigned int output_height, unsigned int output_width, + const uint8_t *filter) { + uint32_t i, j; + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8); + + uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1); + uint16x4_t filter1_u16 = vdup_n_u16(filter[0]); + uint16x4_t filter2_u16 = vdup_n_u16(filter[1]); + + if (output_width >= 8) { + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 8) { + uint32x4_t sum1_u32; + uint32x4_t sum2_u32; + uint16x4_t out1_u16; + uint16x4_t out2_u16; + const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]); + const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]); + sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16)); + sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16)); + sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16)); + sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16)); + out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS); + out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS); + vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16)); + } + // Next row... 
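/* (Aside, not part of the patch: each filter pass above and below computes a
   two-tap bilinear interpolation with round-to-nearest; in scalar form,

     out[j] = (filter[0] * src[j] + filter[1] * src[j + pixel_step] +
               (1 << (FILTER_BITS - 1))) >> FILTER_BITS;

   where the two taps of bilinear_filters[] sum to 128. The first pass steps
   one pixel horizontally (pixel_step == 1) and the second pass one row
   vertically (pixel_step == W), matching the x_offset/y_offset filter pair
   selected by the sub-pixel variance macros further down.) */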
+ src_ptr += src_pixels_per_line; + output_ptr += output_width; + } + } else { + assert(output_width >= 4); + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 4) { + uint32x4_t sum_u32; + uint16x4_t out_u16; + const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]); + const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]); + sum_u32 = vmull_u16(filter1_u16, src1_u16); + sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16); + out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS); + vst1_u16(&output_ptr[j], out_u16); + } + // Next row... + src_ptr += src_pixels_per_line; + output_ptr += output_width; + } + } +} + +static INLINE void highbd_var_filter_block2d_bil_second_pass( + const uint16_t *src_ptr, uint16_t *output_ptr, + unsigned int src_pixels_per_line, unsigned int pixel_step, + unsigned int output_height, unsigned int output_width, + const uint8_t *filter) { + uint32_t i, j; + + uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1); + uint16x4_t filter1_u16 = vdup_n_u16(filter[0]); + uint16x4_t filter2_u16 = vdup_n_u16(filter[1]); + + if (output_width >= 8) { + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 8) { + uint32x4_t sum1_u32; + uint32x4_t sum2_u32; + uint16x4_t out1_u16; + uint16x4_t out2_u16; + const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]); + const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]); + sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16)); + sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16)); + sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16)); + sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16)); + out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS); + out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS); + vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16)); + } + // Next row... + src_ptr += src_pixels_per_line; + output_ptr += output_width; + } + } else { + assert(output_width >= 4); + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 4) { + uint32x4_t sum_u32; + uint16x4_t out_u16; + const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]); + const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]); + sum_u32 = vmull_u16(filter1_u16, src1_u16); + sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16); + out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS); + vst1_u16(&output_ptr[j], out_u16); + } + // Next row... 
+ src_ptr += src_pixels_per_line; + output_ptr += output_width; + } + } +} + +#define HIGHBD_SUBPIX_VAR(W, H) \ + uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_highbd_8_variance##W##x##H##_neon(CONVERT_TO_BYTEPTR(temp2), W, \ + ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_highbd_10_variance##W##x##H##_neon( \ + CONVERT_TO_BYTEPTR(temp2), W, ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_highbd_12_variance##W##x##H##_neon( \ + CONVERT_TO_BYTEPTR(temp2), W, ref_ptr, ref_stride, sse); \ + } + +#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ + uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \ + H, temp2, W); \ + \ + return vpx_highbd_8_variance##W##x##H##_neon(CONVERT_TO_BYTEPTR(temp3), W, \ + ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \ + H, temp2, W); \ + \ + return vpx_highbd_10_variance##W##x##H##_neon( \ + CONVERT_TO_BYTEPTR(temp3), W, ref_ptr, 
ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \ + H, temp2, W); \ + \ + return vpx_highbd_12_variance##W##x##H##_neon( \ + CONVERT_TO_BYTEPTR(temp3), W, ref_ptr, ref_stride, sse); \ + } + +void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, + int width, int height, const uint16_t *ref, + int ref_stride) { + int i, j; + uint32x4_t one_u32 = vdupq_n_u32(1); + if (width >= 8) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 8) { + const uint16x8_t pred_u16 = vld1q_u16(&pred[j]); + const uint16x8_t ref_u16 = vld1q_u16(&ref[j]); + const uint32x4_t sum1_u32 = + vaddl_u16(vget_low_u16(pred_u16), vget_low_u16(ref_u16)); + const uint32x4_t sum2_u32 = + vaddl_u16(vget_high_u16(pred_u16), vget_high_u16(ref_u16)); + const uint16x4_t sum1_u16 = + vshrn_n_u32(vaddq_u32(sum1_u32, one_u32), 1); + const uint16x4_t sum2_u16 = + vshrn_n_u32(vaddq_u32(sum2_u32, one_u32), 1); + const uint16x8_t vcomp_pred = vcombine_u16(sum1_u16, sum2_u16); + vst1q_u16(&comp_pred[j], vcomp_pred); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } else { + assert(width >= 4); + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 4) { + const uint16x4_t pred_u16 = vld1_u16(&pred[j]); + const uint16x4_t ref_u16 = vld1_u16(&ref[j]); + const uint32x4_t sum_u32 = vaddl_u16(pred_u16, ref_u16); + const uint16x4_t vcomp_pred = + vshrn_n_u32(vaddq_u32(sum_u32, one_u32), 1); + vst1_u16(&comp_pred[j], vcomp_pred); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } +} + +/* All three forms of the variance are available in the same sizes. 
*/ +#define HIGHBD_VARIANCES(W, H) \ + HIGHBD_VAR(W, H) \ + HIGHBD_SUBPIX_VAR(W, H) \ + HIGHBD_SUBPIX_AVG_VAR(W, H) + +HIGHBD_VARIANCES(64, 64) +HIGHBD_VARIANCES(64, 32) +HIGHBD_VARIANCES(32, 64) +HIGHBD_VARIANCES(32, 32) +HIGHBD_VARIANCES(32, 16) +HIGHBD_VARIANCES(16, 32) +HIGHBD_VARIANCES(16, 16) +HIGHBD_VARIANCES(16, 8) +HIGHBD_VARIANCES(8, 16) +HIGHBD_VARIANCES(8, 8) +HIGHBD_VARIANCES(8, 4) +HIGHBD_VARIANCES(4, 8) +HIGHBD_VARIANCES(4, 4) + +HIGHBD_GET_VAR(8) +HIGHBD_GET_VAR(16) + +HIGHBD_MSE(16, 16) +HIGHBD_MSE(16, 8) +HIGHBD_MSE(8, 16) +HIGHBD_MSE(8, 8) diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c index 30b55dcb40..ce1e8382b9 100644 --- a/vpx_dsp/variance.c +++ b/vpx_dsp/variance.c @@ -549,9 +549,9 @@ HIGHBD_MSE(16, 8) HIGHBD_MSE(8, 16) HIGHBD_MSE(8, 8) -void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint16_t *pred, - int width, int height, const uint16_t *ref, - int ref_stride) { +void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint16_t *pred, + int width, int height, const uint16_t *ref, + int ref_stride) { int i, j; for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 1a03aed526..3019bff8f7 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -430,6 +430,7 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 6cd46129f0..44dee56780 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1237,369 +1237,397 @@ () if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance64x64 sse2/; + specialize qw/vpx_highbd_12_variance64x64 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance64x32 sse2/; + specialize qw/vpx_highbd_12_variance64x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance32x64 sse2/; + specialize qw/vpx_highbd_12_variance32x64 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance32x32 sse2/; + specialize qw/vpx_highbd_12_variance32x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance32x16 sse2/; + specialize qw/vpx_highbd_12_variance32x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance16x32 sse2/; + specialize qw/vpx_highbd_12_variance16x32 sse2 neon/; add_proto qw/unsigned int 
vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance16x16 sse2/; + specialize qw/vpx_highbd_12_variance16x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance16x8 sse2/; + specialize qw/vpx_highbd_12_variance16x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance8x16 sse2/; + specialize qw/vpx_highbd_12_variance8x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance8x8 sse2/; + specialize qw/vpx_highbd_12_variance8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance8x4 neon/; add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance4x8 neon/; add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance4x4 neon/; add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance64x64 sse2/; + specialize qw/vpx_highbd_10_variance64x64 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance64x32 sse2/; + specialize qw/vpx_highbd_10_variance64x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance32x64 sse2/; + specialize qw/vpx_highbd_10_variance32x64 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance32x32 sse2/; + specialize qw/vpx_highbd_10_variance32x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance32x16 sse2/; + specialize qw/vpx_highbd_10_variance32x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance16x32 sse2/; + specialize qw/vpx_highbd_10_variance16x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance16x16 sse2/; + specialize qw/vpx_highbd_10_variance16x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t 
*src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance16x8 sse2/; + specialize qw/vpx_highbd_10_variance16x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance8x16 sse2/; + specialize qw/vpx_highbd_10_variance8x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance8x8 sse2/; + specialize qw/vpx_highbd_10_variance8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance8x4 neon/; add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance4x8 neon/; add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance4x4 neon/; add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance64x64 sse2/; + specialize qw/vpx_highbd_8_variance64x64 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance64x32 sse2/; + specialize qw/vpx_highbd_8_variance64x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance32x64 sse2/; + specialize qw/vpx_highbd_8_variance32x64 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance32x32 sse2/; + specialize qw/vpx_highbd_8_variance32x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance32x16 sse2/; + specialize qw/vpx_highbd_8_variance32x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance16x32 sse2/; + specialize qw/vpx_highbd_8_variance16x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance16x16 sse2/; + specialize qw/vpx_highbd_8_variance16x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance16x8 sse2/; + specialize qw/vpx_highbd_8_variance16x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int 
*sse"; - specialize qw/vpx_highbd_8_variance8x16 sse2/; + specialize qw/vpx_highbd_8_variance8x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance8x8 sse2/; + specialize qw/vpx_highbd_8_variance8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance8x4 neon/; add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance4x8 neon/; add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance4x4 neon/; add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_8_get16x16var sse2/; + specialize qw/vpx_highbd_8_get16x16var sse2 neon/; add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_8_get8x8var sse2/; + specialize qw/vpx_highbd_8_get8x8var sse2 neon/; add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_10_get16x16var sse2/; + specialize qw/vpx_highbd_10_get16x16var sse2 neon/; add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_10_get8x8var sse2/; + specialize qw/vpx_highbd_10_get8x8var sse2 neon/; add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_12_get16x16var sse2/; + specialize qw/vpx_highbd_12_get16x16var sse2 neon/; add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_12_get8x8var sse2/; + specialize qw/vpx_highbd_12_get8x8var sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_mse16x16 sse2/; + specialize qw/vpx_highbd_8_mse16x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_mse16x8 neon/; add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_mse8x16 neon/; add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_mse8x8 sse2/; + specialize qw/vpx_highbd_8_mse8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize 
qw/vpx_highbd_10_mse16x16 sse2/; + specialize qw/vpx_highbd_10_mse16x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_mse16x8 neon/; add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_mse8x16 neon/; add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_mse8x8 sse2/; + specialize qw/vpx_highbd_10_mse8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_mse16x16 sse2/; + specialize qw/vpx_highbd_12_mse16x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_mse16x8 neon/; add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_mse8x16 neon/; add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_mse8x8 sse2/; + specialize qw/vpx_highbd_12_mse8x8 sse2 neon/; add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride"; + specialize qw/vpx_highbd_comp_avg_pred neon/; # # Subpixel Variance # add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance64x64 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance64x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance64x32 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance64x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance32x64 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance32x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance32x32 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance32x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance32x16 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance32x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int 
ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance16x32 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance16x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance16x16 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance16x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance16x8 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance16x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance8x16 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance8x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance8x8 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance8x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance8x4 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_12_sub_pixel_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_12_sub_pixel_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance64x64 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance64x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance64x32 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance64x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance32x64 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance32x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance32x32 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance32x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, 
int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance32x16 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance32x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance16x32 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance16x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance16x16 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance16x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance16x8 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance16x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance8x16 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance8x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance8x8 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance8x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance8x4 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_10_sub_pixel_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_10_sub_pixel_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance64x64 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance64x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance64x32 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance64x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance32x64 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance32x64 sse2 neon/; add_proto qw/uint32_t 
vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance32x32 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance32x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance32x16 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance32x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance16x32 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance16x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance16x16 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance16x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance16x8 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance16x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance8x16 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance8x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance8x8 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance8x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance8x4 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_8_sub_pixel_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_8_sub_pixel_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize 
qw/vpx_highbd_12_sub_pixel_avg_variance64x32 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize 
qw/vpx_highbd_12_sub_pixel_avg_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16 sse2 neon/; add_proto qw/uint32_t 
vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t 
*second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x4 neon/; } # CONFIG_VP9_HIGHBITDEPTH From fd45d113807eb00fd5b9e58784e48e662a6797b9 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Fri, 26 Aug 2022 14:29:32 -0700 Subject: [PATCH 417/926] L2E: Add gop size and ARF existence to frame info Pass the encode frame info to external ml model, with the information of gop size and whether alt ref is used. Change-Id: I55be2d3de83d7182c1a1a174e44ead7e19045c9d --- vp9/encoder/vp9_encoder.c | 8 +++++++- vp9/encoder/vp9_ext_ratectrl.c | 4 +++- vp9/encoder/vp9_ext_ratectrl.h | 2 +- vpx/vpx_ext_ratectrl.h | 8 ++++++++ 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 371779e772..91b64e5d13 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4501,11 +4501,17 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES]; const RefCntBuffer *curr_frame_buf = get_ref_cnt_buffer(cm, cm->new_fb_idx); + // index 0 of a gf group is always KEY/OVERLAY/GOLDEN. + // index 1 refers to the first encoding frame in a gf group. + // Therefore if it is ARF_UPDATE, it means this gf group uses alt ref. + // See function define_gf_group_structure(). 
+ const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE; get_ref_frame_bufs(cpi, ref_frame_bufs); codec_status = vp9_extrc_get_encodeframe_decision( &cpi->ext_ratectrl, curr_frame_buf->frame_index, cm->current_frame_coding_index, gf_group->index, update_type, - ref_frame_bufs, ref_frame_flags, &encode_frame_decision); + gf_group->gf_group_size, use_alt_ref, ref_frame_bufs, ref_frame_flags, + &encode_frame_decision); if (codec_status != VPX_CODEC_OK) { vpx_internal_error(&cm->error, codec_status, "vp9_extrc_get_encodeframe_decision() failed"); diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index d5b60b02a6..7e38cc5247 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -137,7 +137,7 @@ static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) { vpx_codec_err_t vp9_extrc_get_encodeframe_decision( EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, - FRAME_UPDATE_TYPE update_type, + FRAME_UPDATE_TYPE update_type, const int gop_size, const int use_alt_ref, RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, vpx_rc_encodeframe_decision_t *encode_frame_decision) { if (ext_ratectrl == NULL) { @@ -150,6 +150,8 @@ vpx_codec_err_t vp9_extrc_get_encodeframe_decision( encode_frame_info.coding_index = coding_index; encode_frame_info.gop_index = gop_index; encode_frame_info.frame_type = extrc_get_frame_type(update_type); + encode_frame_info.gop_size = gop_size; + encode_frame_info.use_alt_ref = use_alt_ref; vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs, encode_frame_info.ref_frame_coding_indexes, diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index b46b776b91..b8f3d0c834 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -35,7 +35,7 @@ vpx_codec_err_t vp9_extrc_send_firstpass_stats( vpx_codec_err_t vp9_extrc_get_encodeframe_decision( EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, - FRAME_UPDATE_TYPE update_type, + FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref, RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, vpx_rc_encodeframe_decision_t *encode_frame_decision); diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index b6c950d87e..95b883413e 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -100,6 +100,14 @@ typedef struct vpx_rc_encodeframe_info { * 1: Valid */ int ref_frame_valid_list[3]; + /*! + * The length of the current GOP. + */ + int gop_size; + /*! + * Whether the current GOP uses an alt ref. 
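+   * 0: Not used
+   * 1: Used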
+ */ + int use_alt_ref; } vpx_rc_encodeframe_info_t; /*!\brief Frame coding result From 27fd546079a5566346b078754b51008ef46f5d2d Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 26 Aug 2022 22:12:44 -0700 Subject: [PATCH 418/926] highbd_variance_neon,cosmetics: reorder a few lines Change-Id: Ia6fa54652d7f94687e64108482bb0f28ca06cf49 --- vpx_dsp/arm/highbd_variance_neon.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/vpx_dsp/arm/highbd_variance_neon.c b/vpx_dsp/arm/highbd_variance_neon.c index 3a60a14ab8..96a35af01c 100644 --- a/vpx_dsp/arm/highbd_variance_neon.c +++ b/vpx_dsp/arm/highbd_variance_neon.c @@ -233,14 +233,12 @@ static INLINE void highbd_var_filter_block2d_bil_first_pass( if (output_width >= 8) { for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; j += 8) { - uint32x4_t sum1_u32; - uint32x4_t sum2_u32; - uint16x4_t out1_u16; - uint16x4_t out2_u16; const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]); const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]); - sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16)); - sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16)); + uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16)); + uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16)); + uint16x4_t out1_u16; + uint16x4_t out2_u16; sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16)); sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16)); out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS); @@ -255,11 +253,10 @@ static INLINE void highbd_var_filter_block2d_bil_first_pass( assert(output_width >= 4); for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; j += 4) { - uint32x4_t sum_u32; - uint16x4_t out_u16; const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]); const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]); - sum_u32 = vmull_u16(filter1_u16, src1_u16); + uint32x4_t sum_u32 = vmull_u16(filter1_u16, src1_u16); + uint16x4_t out_u16; sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16); out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS); vst1_u16(&output_ptr[j], out_u16); @@ -285,14 +282,12 @@ static INLINE void highbd_var_filter_block2d_bil_second_pass( if (output_width >= 8) { for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; j += 8) { - uint32x4_t sum1_u32; - uint32x4_t sum2_u32; - uint16x4_t out1_u16; - uint16x4_t out2_u16; const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]); const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]); - sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16)); - sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16)); + uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16)); + uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16)); + uint16x4_t out1_u16; + uint16x4_t out2_u16; sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16)); sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16)); out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS); @@ -307,11 +302,10 @@ static INLINE void highbd_var_filter_block2d_bil_second_pass( assert(output_width >= 4); for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; j += 4) { - uint32x4_t sum_u32; - uint16x4_t out_u16; const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]); const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]); - sum_u32 = vmull_u16(filter1_u16, src1_u16); + uint32x4_t sum_u32 = 
vmull_u16(filter1_u16, src1_u16);
+      uint16x4_t out_u16;
       sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16);
       out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS);
       vst1_u16(&output_ptr[j], out_u16);

From 9d6d0624d7943a09cc0be9df1a7402522989ac1a Mon Sep 17 00:00:00 2001
From: Yaowu Xu
Date: Tue, 30 Aug 2022 09:04:58 -0700
Subject: [PATCH 419/926] Remove const for pass-by-value parameters

This also fixes MSVC compiler warnings.

Change-Id: I20dc9ac821275ba95598f3016fc6b23e884e13b7
---
 vp9/encoder/vp9_ext_ratectrl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c
index 7e38cc5247..b4ee574ff1 100644
--- a/vp9/encoder/vp9_ext_ratectrl.c
+++ b/vp9/encoder/vp9_ext_ratectrl.c
@@ -137,7 +137,7 @@ static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) {
 vpx_codec_err_t vp9_extrc_get_encodeframe_decision(
     EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
-    FRAME_UPDATE_TYPE update_type, const int gop_size, const int use_alt_ref,
+    FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref,
     RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags,
     vpx_rc_encodeframe_decision_t *encode_frame_decision) {
   if (ext_ratectrl == NULL) {

From 028fc1b50f196cab1ec93816654fbefe64f20cf3 Mon Sep 17 00:00:00 2001
From: James Zern
Date: Wed, 31 Aug 2022 16:35:08 -0700
Subject: [PATCH 420/926] test/*,cosmetics: normalize void parameter lists

replace (void) with (); use of this synonym is more common in C++ code.

Change-Id: I9813e82234dc9caa7115918a0491b0040f6afaf4
---
 test/acm_random.h             | 16 ++++++++--------
 test/error_resilience_test.cc |  2 +-
 test/md5_helper.h             |  2 +-
 test/svc_datarate_test.cc     |  2 +-
 test/vp8_datarate_test.cc     |  2 +-
 test/vp9_datarate_test.cc     |  2 +-
 6 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/test/acm_random.h b/test/acm_random.h
index 3458340a12..c7122b9338 100644
--- a/test/acm_random.h
+++ b/test/acm_random.h
@@ -28,43 +28,43 @@ class ACMRandom {
   explicit ACMRandom(int seed) : random_(seed) {}

   void Reset(int seed) { random_.Reseed(seed); }
-  uint16_t Rand16(void) {
+  uint16_t Rand16() {
     const uint32_t value =
         random_.Generate(testing::internal::Random::kMaxRange);
     return (value >> 15) & 0xffff;
   }

-  int32_t Rand20Signed(void) {
+  int32_t Rand20Signed() {
     // Use 20 bits: values between 524287 and -524288.
     const uint32_t value = random_.Generate(1048576);
     return static_cast<int32_t>(value) - 524288;
   }

-  int16_t Rand16Signed(void) {
+  int16_t Rand16Signed() {
    // Use 16 bits: values between 32767 and -32768.
    return static_cast<int16_t>(random_.Generate(65536));
   }

-  int16_t Rand13Signed(void) {
+  int16_t Rand13Signed() {
     // Use 13 bits: values between 4095 and -4096.
     const uint32_t value = random_.Generate(8192);
     return static_cast<int16_t>(value) - 4096;
   }

-  int16_t Rand9Signed(void) {
+  int16_t Rand9Signed() {
     // Use 9 bits: values between 255 (0x0FF) and -256 (0x100).
     const uint32_t value = random_.Generate(512);
     return static_cast<int16_t>(value) - 256;
   }

-  uint8_t Rand8(void) {
+  uint8_t Rand8() {
     const uint32_t value =
         random_.Generate(testing::internal::Random::kMaxRange);
     // There's a bit more entropy in the upper bits of this implementation.
     return (value >> 23) & 0xff;
   }

-  uint8_t Rand8Extremes(void) {
+  uint8_t Rand8Extremes() {
     // Returns a random value near 0 or near 255, to better exercise
     // saturation behavior.
const uint8_t r = Rand8();
@@ -82,7 +82,7 @@ class ACMRandom {

   int operator()(int n) { return PseudoUniform(n); }

-  static int DeterministicSeed(void) { return 0xbaba; }
+  static int DeterministicSeed() { return 0xbaba; }

  private:
   testing::internal::Random random_;
diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc
index 45a327ec2f..45138f14b9 100644
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc
@@ -496,7 +496,7 @@ class ErrorResilienceTestLargeCodecControls
     ++tot_frame_number_;
   }

-  virtual void EndPassHook(void) {
+  virtual void EndPassHook() {
     duration_ = (last_pts_ + 1) * timebase_;
     if (cfg_.ts_number_layers > 1) {
       for (int layer = 0; layer < static_cast<int>(cfg_.ts_number_layers);
diff --git a/test/md5_helper.h b/test/md5_helper.h
index dc28dc6283..9095d96a8a 100644
--- a/test/md5_helper.h
+++ b/test/md5_helper.h
@@ -47,7 +47,7 @@ class MD5 {
     MD5Update(&md5_, data, static_cast<uint32_t>(size));
   }

-  const char *Get(void) {
+  const char *Get() {
     static const char hex[16] = {
       '0', '1', '2', '3', '4', '5', '6', '7',
       '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc
index 51e90e776c..010c273421 100644
--- a/test/svc_datarate_test.cc
+++ b/test/svc_datarate_test.cc
@@ -571,7 +571,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc {
     }
   }

-  virtual void EndPassHook(void) {
+  virtual void EndPassHook() {
     if (change_bitrate_) last_pts_ = last_pts_ - last_pts_ref_;
     duration_ = (last_pts_ + 1) * timebase_;
     for (int sl = 0; sl < number_spatial_layers_; ++sl) {
diff --git a/test/vp8_datarate_test.cc b/test/vp8_datarate_test.cc
index dcd68a2d4c..64a861d15e 100644
--- a/test/vp8_datarate_test.cc
+++ b/test/vp8_datarate_test.cc
@@ -121,7 +121,7 @@ class DatarateTestLarge
     ++frame_number_;
   }

-  virtual void EndPassHook(void) {
+  virtual void EndPassHook() {
     if (bits_total_) {
       const double file_size_in_kb = bits_total_ / 1000.;  // bits per kilobit
diff --git a/test/vp9_datarate_test.cc b/test/vp9_datarate_test.cc
index 9930c754c7..286fa335a1 100644
--- a/test/vp9_datarate_test.cc
+++ b/test/vp9_datarate_test.cc
@@ -199,7 +199,7 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest {
     ++tot_frame_number_;
   }

-  virtual void EndPassHook(void) {
+  virtual void EndPassHook() {
     for (int layer = 0; layer < static_cast<int>(cfg_.ts_number_layers);
          ++layer) {
       duration_ = (last_pts_ + 1) * timebase_;

From 281dfae8353940fe380c73384607ec11a5c53f43 Mon Sep 17 00:00:00 2001
From: James Zern
Date: Thu, 1 Sep 2022 18:47:50 -0700
Subject: [PATCH 421/926] neon,load_unaligned_*: use dup for lane 0

this produces better assembly with gcc (11.3.0-3); no change in
assembly using clang from the r24 android sdk (Android (8075178, based
on r437112b) clang version 14.0.1
(https://android.googlesource.com/toolchain/llvm-project
8671348b81b95fc603505dfc881b45103bee1731)

Change-Id: Ifec252d4f499f23be1cd94aa8516caf6b3fbbc11
---
 vpx_dsp/arm/mem_neon.h   | 8 ++++----
 vpx_dsp/arm/sad4d_neon.c | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h
index 50aaa94fe0..84aae161b3 100644
--- a/vpx_dsp/arm/mem_neon.h
+++ b/vpx_dsp/arm/mem_neon.h
@@ -116,11 +116,11 @@ static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) {
 static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf,
                                           ptrdiff_t stride) {
   uint32_t a;
-  uint32x2_t a_u32 = vdup_n_u32(0);
+  uint32x2_t a_u32;
   if (stride == 4) return vld1_u8(buf);
   memcpy(&a, buf, 4);
   buf += stride;
-  a_u32 = vset_lane_u32(a, a_u32, 0);
+  a_u32
= vdup_n_u32(a); memcpy(&a, buf, 4); a_u32 = vset_lane_u32(a, a_u32, 1); return vreinterpret_u8_u32(a_u32); @@ -143,11 +143,11 @@ static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride, static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, ptrdiff_t stride) { uint32_t a; - uint32x4_t a_u32 = vdupq_n_u32(0); + uint32x4_t a_u32; if (stride == 4) return vld1q_u8(buf); memcpy(&a, buf, 4); buf += stride; - a_u32 = vsetq_lane_u32(a, a_u32, 0); + a_u32 = vdupq_n_u32(a); memcpy(&a, buf, 4); buf += stride; a_u32 = vsetq_lane_u32(a, a_u32, 1); diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 03f716c3d5..53866296ce 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -20,9 +20,9 @@ static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0, const void *const buf1) { uint32_t a; - uint32x2_t aa = vdup_n_u32(0); + uint32x2_t aa; memcpy(&a, buf0, 4); - aa = vset_lane_u32(a, aa, 0); + aa = vdup_n_u32(a); memcpy(&a, buf1, 4); aa = vset_lane_u32(a, aa, 1); return vreinterpret_u8_u32(aa); From 447e27588032064c02eeb864d086881806038c35 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 2 Sep 2022 12:17:20 -0700 Subject: [PATCH 422/926] vpx_dsp,neon: simplify __ARM_FEATURE_DOTPROD check only check that the macro is defined, the value doesn't have any effect. from https://arm-software.github.io/acle/main/acle.html: 5.5.7.7. Dot Product extension __ARM_FEATURE_DOTPROD is defined if the dot product data manipulation instructions are supported and the vector intrinsics are available. Note that this implies: - __ARM_NEON == 1 Change-Id: I164fe121ccefda99050a9b6a99738a2b518520f3 --- vpx_dsp/arm/sad4d_neon.c | 21 +++++++---------- vpx_dsp/arm/sad_neon.c | 40 +++++++++++++------------------- vpx_dsp/arm/variance_neon.c | 8 +++---- vpx_dsp/arm/vpx_convolve8_neon.c | 7 +++--- vpx_dsp/arm/vpx_convolve8_neon.h | 5 ++-- 5 files changed, 34 insertions(+), 47 deletions(-) diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 53866296ce..5fc621aee1 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -237,8 +237,7 @@ void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, //////////////////////////////////////////////////////////////////////////////// -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, uint32x4_t *const sum) { @@ -270,7 +269,7 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, vst1q_u32(sad_array, vpaddq_u32(r0, r1)); } -#else +#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, uint16x8_t *const sum) { @@ -305,7 +304,7 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, sad_512_pel_final_neon(sum, sad_array); } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, @@ -327,8 +326,7 @@ void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, //////////////////////////////////////////////////////////////////////////////// -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) static INLINE void sad32x_4d(const uint8_t *src_ptr, int 
src_stride, const uint8_t *const ref_array[4], int ref_stride, @@ -386,7 +384,7 @@ void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 64); } -#else +#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, @@ -444,12 +442,11 @@ void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, sad_2048_pel_final_neon(sum, sad_array); } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) //////////////////////////////////////////////////////////////////////////////// -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, @@ -554,7 +551,7 @@ void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, vst1q_u32(sad_array, vpaddq_u32(r0, r1)); } -#else +#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, @@ -649,4 +646,4 @@ void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, sad_4096_pel_final_neon(sum, sad_array); } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index 34870375a3..4753aeaec6 100644 --- a/vpx_dsp/arm/sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -21,8 +21,7 @@ uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8); const uint32x4_t dp = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1)); return horizontal_add_uint32x4(dp); @@ -40,8 +39,7 @@ uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); const uint8x16_t second_pred_u8 = vld1q_u8(second_pred); const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8); -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) const uint8x16_t sad_u8 = vabdq_u8(src_u8, avg); const uint32x4_t prod = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1)); return horizontal_add_uint32x4(prod); @@ -54,8 +52,7 @@ uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) uint32x4_t prod = vdupq_n_u32(0); const uint8x16_t ones = vdupq_n_u8(1); const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride); @@ -88,8 +85,7 @@ uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred) { -#if 
defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) uint32x4_t prod = vdupq_n_u32(0); const uint8x16_t ones = vdupq_n_u8(1); const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride); @@ -126,8 +122,7 @@ uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, #endif } -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) static INLINE uint32x2_t sad8x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -182,7 +177,7 @@ static INLINE uint32x2_t sad8x_avg(const uint8_t *src_ptr, int src_stride, return horizontal_add_uint32x2(prod); \ } -#else +#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -233,14 +228,13 @@ static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride, sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ return horizontal_add_uint16x8(abs); \ } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) SAD8XN(4) SAD8XN(8) SAD8XN(16) -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) static INLINE uint32x4_t sad16x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -294,7 +288,7 @@ static INLINE uint32x4_t sad16x_avg(const uint8_t *src_ptr, int src_stride, sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ return horizontal_add_uint32x4(prod); \ } -#else +#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -348,14 +342,13 @@ static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride, sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ return horizontal_add_uint16x8(abs); \ } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) SAD16XN(8) SAD16XN(16) SAD16XN(32) -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) static INLINE uint32x4_t sad32x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -420,7 +413,7 @@ static INLINE uint32x4_t sad32x_avg(const uint8_t *src_ptr, int src_stride, return horizontal_add_uint32x4(prod); \ } -#else +#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -484,14 +477,13 @@ static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride, sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ return horizontal_add_uint16x8(abs); \ } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) SAD32XN(16) SAD32XN(32) SAD32XN(64) -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, const uint8_t 
*ref_ptr, int ref_stride, const int height) { @@ -559,7 +551,7 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, } return prod; } -#else +#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -637,7 +629,7 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, return vpadalq_u16(sum, abs_1); } } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) #define SAD64XN(n) \ uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride, \ diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index 7b93f142b1..1b5cbcc46e 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -111,7 +111,7 @@ static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, *sse = horizontal_add_uint32x2(vadd_u32(sse_lo_u32, sse_hi_u32)); } -#else +#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) // The variance helper functions use int16_t for sum. 8 values are accumulated // and then added (at which point they expand up to int32_t). To avoid overflow, @@ -254,7 +254,7 @@ static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32))); } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, @@ -421,7 +421,7 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, return vget_lane_u32(sse, 0); } -#else +#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, @@ -518,4 +518,4 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, return horizontal_add_uint32x4(vreinterpretq_u32_s32(sse)); } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index 06b58c438f..ca5222fa07 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -31,8 +31,7 @@ // instructions. This optimization is much faster in speed unit test, but slowed // down the whole decoder by 5%. 
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, @@ -764,7 +763,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, } } -#else +#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p, const uint8x8_t s0, const uint8x8_t s1, @@ -1694,4 +1693,4 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, } } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index 857b6d54e2..b112cb249a 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -72,8 +72,7 @@ static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p, *s7 = vld1q_u8(s); } -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) static INLINE int32x4_t convolve8_4_dot_partial(const int8x16_t samples_lo, const int8x16_t samples_hi, @@ -171,7 +170,7 @@ static INLINE uint8x8_t convolve8_8_dot(uint8x16_t samples, return vqrshrun_n_s16(sum, 7); } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, From 2faa4bfc5c28294e5080f328a5c5f10f6008d552 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 2 Sep 2022 16:17:52 -0700 Subject: [PATCH 423/926] x86,cosmetics: prefer _mm_setzero_si128/_mm256_setzero_si256 over *_set1_*(0) Change-Id: I136e1798a2ce286480ebb9418db67a2f1e92b9a2 --- vp9/encoder/x86/vp9_dct_intrin_sse2.c | 6 +++--- vpx_dsp/x86/fwd_dct32x32_impl_avx2.h | 2 +- vpx_dsp/x86/fwd_dct32x32_impl_sse2.h | 2 +- vpx_dsp/x86/highbd_inv_txfm_sse2.h | 2 +- vpx_dsp/x86/highbd_loopfilter_sse2.c | 8 ++++---- vpx_dsp/x86/inv_txfm_sse2.c | 4 ++-- vpx_dsp/x86/loopfilter_avx2.c | 4 ++-- vpx_dsp/x86/loopfilter_sse2.c | 14 +++++++------- 8 files changed, 21 insertions(+), 21 deletions(-) diff --git a/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/vp9/encoder/x86/vp9_dct_intrin_sse2.c index 2188903b17..e9943447fd 100644 --- a/vp9/encoder/x86/vp9_dct_intrin_sse2.c +++ b/vp9/encoder/x86/vp9_dct_intrin_sse2.c @@ -111,7 +111,7 @@ static void fadst4_sse2(__m128i *in) { const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9); const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); - const __m128i kZero = _mm_set1_epi16(0); + const __m128i kZero = _mm_setzero_si128(); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); __m128i u[8], v[8]; __m128i in7 = _mm_add_epi16(in[0], in[1]); @@ -424,7 +424,7 @@ static void fadst8_sse2(__m128i *in) { const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__const_0 = _mm_set1_epi16(0); + const __m128i k__const_0 = _mm_setzero_si128(); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, 
u13, u14, u15; @@ -1056,7 +1056,7 @@ static void fadst16_8col(__m128i *in) { const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kZero = _mm_set1_epi16(0); + const __m128i kZero = _mm_setzero_si128(); u[0] = _mm_unpacklo_epi16(in[15], in[0]); u[1] = _mm_unpackhi_epi16(in[15], in[0]); diff --git a/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h b/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h index 3f158b5e4e..f3a8020292 100644 --- a/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h +++ b/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h @@ -89,7 +89,7 @@ void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) { const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64); const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64); const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING); - const __m256i kZero = _mm256_set1_epi16(0); + const __m256i kZero = _mm256_setzero_si256(); const __m256i kOne = _mm256_set1_epi16(1); // Do the two transform/transpose passes int pass; diff --git a/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h b/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h index ac1246faa5..bf350b6da0 100644 --- a/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h +++ b/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h @@ -100,7 +100,7 @@ void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) { const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64); const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kZero = _mm_set1_epi16(0); + const __m128i kZero = _mm_setzero_si128(); const __m128i kOne = _mm_set1_epi16(1); // Do the two transform/transpose passes diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/vpx_dsp/x86/highbd_inv_txfm_sse2.h index 78cf9111d9..1d07391b02 100644 --- a/vpx_dsp/x86/highbd_inv_txfm_sse2.h +++ b/vpx_dsp/x86/highbd_inv_txfm_sse2.h @@ -249,7 +249,7 @@ static INLINE void highbd_idct16_4col_stage7(const __m128i *const in, static INLINE __m128i add_clamp(const __m128i in0, const __m128i in1, const int bd) { - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); // Faster than _mm_set1_epi16((1 << bd) - 1). 
const __m128i one = _mm_set1_epi16(1); const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); diff --git a/vpx_dsp/x86/highbd_loopfilter_sse2.c b/vpx_dsp/x86/highbd_loopfilter_sse2.c index d265fc1a92..9f45623dee 100644 --- a/vpx_dsp/x86/highbd_loopfilter_sse2.c +++ b/vpx_dsp/x86/highbd_loopfilter_sse2.c @@ -18,7 +18,7 @@ static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) { __m128i lbounded; __m128i retval; - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); __m128i t80, max, min; @@ -51,7 +51,7 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); __m128i blimit_v, limit_v, thresh_v; __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0; @@ -492,7 +492,7 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch, DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]); DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]); DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]); - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); __m128i blimit_v, limit_v, thresh_v; __m128i mask, hev, flat; __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * pitch)); @@ -720,7 +720,7 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); __m128i blimit_v, limit_v, thresh_v; __m128i mask, hev, flat; __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch)); diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c index 4b02da9666..f42b3df849 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.c +++ b/vpx_dsp/x86/inv_txfm_sse2.c @@ -243,7 +243,7 @@ void iadst8_sse2(__m128i *const in) { const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i kZero = _mm_set1_epi16(0); + const __m128i kZero = _mm_setzero_si128(); __m128i s[8], u[16], v[8], w[16]; // transpose @@ -546,7 +546,7 @@ void vpx_iadst16_8col_sse2(__m128i *const in) { const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i kZero = _mm_set1_epi16(0); + const __m128i kZero = _mm_setzero_si128(); u[0] = _mm_unpacklo_epi16(in[15], in[0]); u[1] = _mm_unpackhi_epi16(in[15], in[0]); diff --git a/vpx_dsp/x86/loopfilter_avx2.c b/vpx_dsp/x86/loopfilter_avx2.c index be391992af..a58fb65539 100644 --- a/vpx_dsp/x86/loopfilter_avx2.c +++ b/vpx_dsp/x86/loopfilter_avx2.c @@ -18,7 +18,7 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int pitch, const unsigned char *limit, const unsigned char *thresh) { __m128i mask, hev, flat, flat2; - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi8(1); __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; __m128i abs_p1p0; @@ -372,7 +372,7 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int pitch, const unsigned char *limit, const unsigned char *thresh) { __m128i mask, hev, flat, flat2; - const 
__m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi8(1); __m128i p7, p6, p5; __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; diff --git a/vpx_dsp/x86/loopfilter_sse2.c b/vpx_dsp/x86/loopfilter_sse2.c index 347c9fdbe9..6ea34cdd16 100644 --- a/vpx_dsp/x86/loopfilter_sse2.c +++ b/vpx_dsp/x86/loopfilter_sse2.c @@ -106,7 +106,7 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) { void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i limit_v = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit), _mm_loadl_epi64((const __m128i *)limit)); @@ -140,7 +140,7 @@ void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i limit_v = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit), _mm_loadl_epi64((const __m128i *)limit)); @@ -232,7 +232,7 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int pitch, const unsigned char *blimit, const unsigned char *limit, const unsigned char *thresh) { - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi8(1); const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit); const __m128i limit_v = _mm_load_si128((const __m128i *)limit); @@ -594,7 +594,7 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int pitch, const unsigned char *blimit, const unsigned char *limit, const unsigned char *thresh) { - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi8(1); const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit); const __m128i limit_v = _mm_load_si128((const __m128i *)limit); @@ -932,7 +932,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int pitch, DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit); const __m128i limit_v = _mm_load_si128((const __m128i *)limit); const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh); @@ -1152,7 +1152,7 @@ void vpx_lpf_horizontal_8_dual_sse2( DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i blimit = _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0), _mm_load_si128((const __m128i *)blimit1)); @@ -1406,7 +1406,7 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int pitch, const __m128i thresh = _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0), _mm_load_si128((const __m128i *)thresh1)); - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); __m128i p3, p2, p1, p0, q0, q1, q2, q3; __m128i mask, hev, flat; From b3317970e702fe164240d71e73ca425efca72c46 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 2 Sep 2022 16:44:14 -0700 Subject: [PATCH 424/926] variance_neon.cc: simplify 
__ARM_FEATURE_DOTPROD check missed in 447e27588 vpx_dsp,neon: simplify __ARM_FEATURE_DOTPROD check + fix #if comments only check that the macro is defined, the value doesn't have any effect. from https://arm-software.github.io/acle/main/acle.html: 5.5.7.7. Dot Product extension __ARM_FEATURE_DOTPROD is defined if the dot product data manipulation instructions are supported and the vector intrinsics are available. Note that this implies: - __ARM_NEON == 1 Change-Id: I098b96421b7de5928bb3b11612ca1f32e7b6cbc4 --- vpx_dsp/arm/variance_neon.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index 1b5cbcc46e..f9969ed5a4 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -19,7 +19,7 @@ #include "vpx_dsp/arm/sum_neon.h" #include "vpx_ports/mem.h" -#if defined(__ARM_FEATURE_DOTPROD) && (__ARM_FEATURE_DOTPROD == 1) +#if defined(__ARM_FEATURE_DOTPROD) // Process a block of width 4 four rows at a time. static void variance_neon_w4x4(const uint8_t *src_ptr, int src_stride, @@ -111,7 +111,7 @@ static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, *sse = horizontal_add_uint32x2(vadd_u32(sse_lo_u32, sse_hi_u32)); } -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) +#else // !defined(__ARM_FEATURE_DOTPROD) // The variance helper functions use int16_t for sum. 8 values are accumulated // and then added (at which point they expand up to int32_t). To avoid overflow, @@ -254,7 +254,7 @@ static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32))); } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#endif // defined(__ARM_FEATURE_DOTPROD) void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, @@ -357,7 +357,7 @@ unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int src_stride, return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12); } -#if defined(__ARM_FEATURE_DOTPROD) && (__ARM_FEATURE_DOTPROD == 1) +#if defined(__ARM_FEATURE_DOTPROD) unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, @@ -421,7 +421,7 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, return vget_lane_u32(sse, 0); } -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) +#else // !defined(__ARM_FEATURE_DOTPROD) unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, @@ -518,4 +518,4 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, return horizontal_add_uint32x4(vreinterpretq_u32_s32(sse)); } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#endif // defined(__ARM_FEATURE_DOTPROD) From a7527a26e84834ee9434dd3700c5faba0c839d6f Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 2 Sep 2022 16:55:43 -0700 Subject: [PATCH 425/926] sad_neon: enable UDOT implementation w/aarch32 Change-Id: Ia28305ec5c61518b732cbacbd102acd2cb7f9d82 --- vpx_dsp/arm/sad_neon.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index 4753aeaec6..ad575d4aae 100644 --- a/vpx_dsp/arm/sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -21,7 +21,7 @@ uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { 
const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#if defined(__ARM_FEATURE_DOTPROD) const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8); const uint32x4_t dp = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1)); return horizontal_add_uint32x4(dp); @@ -39,7 +39,7 @@ uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); const uint8x16_t second_pred_u8 = vld1q_u8(second_pred); const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8); -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#if defined(__ARM_FEATURE_DOTPROD) const uint8x16_t sad_u8 = vabdq_u8(src_u8, avg); const uint32x4_t prod = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1)); return horizontal_add_uint32x4(prod); @@ -52,7 +52,7 @@ uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#if defined(__ARM_FEATURE_DOTPROD) uint32x4_t prod = vdupq_n_u32(0); const uint8x16_t ones = vdupq_n_u8(1); const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride); @@ -85,7 +85,7 @@ uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred) { -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#if defined(__ARM_FEATURE_DOTPROD) uint32x4_t prod = vdupq_n_u32(0); const uint8x16_t ones = vdupq_n_u8(1); const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride); @@ -122,7 +122,7 @@ uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, #endif } -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#if defined(__ARM_FEATURE_DOTPROD) static INLINE uint32x2_t sad8x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -177,7 +177,7 @@ static INLINE uint32x2_t sad8x_avg(const uint8_t *src_ptr, int src_stride, return horizontal_add_uint32x2(prod); \ } -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) +#else // !defined(__ARM_FEATURE_DOTPROD) static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -228,13 +228,13 @@ static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride, sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ return horizontal_add_uint16x8(abs); \ } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#endif // defined(__ARM_FEATURE_DOTPROD) SAD8XN(4) SAD8XN(8) SAD8XN(16) -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#if defined(__ARM_FEATURE_DOTPROD) static INLINE uint32x4_t sad16x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -288,7 +288,7 @@ static INLINE uint32x4_t sad16x_avg(const uint8_t *src_ptr, int src_stride, sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ return horizontal_add_uint32x4(prod); \ } -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) +#else // !defined(__ARM_FEATURE_DOTPROD) static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -342,13 
+342,13 @@ static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride, sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ return horizontal_add_uint16x8(abs); \ } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#endif // defined(__ARM_FEATURE_DOTPROD) SAD16XN(8) SAD16XN(16) SAD16XN(32) -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#if defined(__ARM_FEATURE_DOTPROD) static INLINE uint32x4_t sad32x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -413,7 +413,7 @@ static INLINE uint32x4_t sad32x_avg(const uint8_t *src_ptr, int src_stride, return horizontal_add_uint32x4(prod); \ } -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) +#else // defined(__ARM_FEATURE_DOTPROD) static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -477,13 +477,13 @@ static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride, sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ return horizontal_add_uint16x8(abs); \ } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#endif // defined(__ARM_FEATURE_DOTPROD) SAD32XN(16) SAD32XN(32) SAD32XN(64) -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#if defined(__ARM_FEATURE_DOTPROD) static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -551,7 +551,7 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, } return prod; } -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) +#else // !defined(__ARM_FEATURE_DOTPROD) static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -629,7 +629,7 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, return vpadalq_u16(sum, abs_1); } } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#endif // defined(__ARM_FEATURE_DOTPROD) #define SAD64XN(n) \ uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride, \ From a46ca4b6bd8e5f360da74693bb43b400d29d91e8 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 7 Sep 2022 18:41:13 -0700 Subject: [PATCH 426/926] vp8_decode: declare 2 variables volatile fixes -Wclobbered warnings with gcc 12.1.0: vp8/vp8_dx_iface.c|278 col 16| warning: variable 'w' might be clobbered by 'longjmp' or 'vfork' [-Wclobbered] vp8/vp8_dx_iface.c|278 col 19| warning: variable 'h' might be clobbered by 'longjmp' or 'vfork' [-Wclobbered] Change-Id: Ib2c606a3450188d7869c066cacaf5615d9746181 --- vp8/vp8_dx_iface.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index 6d88e5154f..55a77ba7e5 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -275,7 +275,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, void *user_priv, long deadline) { volatile vpx_codec_err_t res; volatile unsigned int resolution_change = 0; - unsigned int w, h; + volatile unsigned int w, h; if (!ctx->fragments.enabled && (data == NULL && data_sz == 0)) { return 0; From 0d734728f6c2887a219de4bd118de8cd290f50c0 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Thu, 8 Sep 2022 13:05:55 -0700 Subject: [PATCH 427/926] Add vpx_highbd_sad16x{32,16,8}x4d_avx2. 1.98x to 2.3x faster than the sse2 version. 
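For reference, a plain C model of the four-reference SAD that each of
these kernels vectorizes (an illustrative sketch, not part of the
change; the function name is hypothetical). Note that each 16-bit AVX2
lane accumulates one absolute difference per row (at most 4095 for
12-bit input), so a lane can hold at most 16 * 4095 = 65520 after 16
rows and would overflow on row 17; that is why the 16x32 version spills
the 16-bit sums into 32-bit accumulators every 16 rows.

#include <stdint.h>
#include <stdlib.h>

/* Scalar model: SAD between one 16-wide uint16_t source block and each
 * of four reference blocks, as the x4d kernels compute in
 * CONVERT_TO_SHORTPTR space. */
static void highbd_sad16xNx4d_model(const uint16_t *src, int src_stride,
                                    const uint16_t *const refs[4],
                                    int ref_stride, int height,
                                    uint32_t sad_array[4]) {
  int r, i, j;
  for (r = 0; r < 4; ++r) {
    uint32_t sum = 0;
    for (i = 0; i < height; ++i) {
      for (j = 0; j < 16; ++j) {
        sum += abs((int)src[i * src_stride + j] -
                   (int)refs[r][i * ref_stride + j]);
      }
    }
    sad_array[r] = sum;
  }
}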
Bug: b/245917257 Change-Id: Ie4f9bb942ffaf4af7d395fb5a5978b41aabfc93c --- test/sad_test.cc | 11 ++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +- vpx_dsp/x86/highbd_sad4d_avx2.c | 183 ++++++++++++++++++++++++++++++++ 4 files changed, 198 insertions(+), 3 deletions(-) create mode 100644 vpx_dsp/x86/highbd_sad4d_avx2.c diff --git a/test/sad_test.cc b/test/sad_test.cc index 4fb2af6244..92c9e6332a 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1079,6 +1079,17 @@ INSTANTIATE_TEST_SUITE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests)); const SadMxNx4Param x4d_avx2_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_avx2), SadMxNx4Param(32, 32, &vpx_sad32x32x4d_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 8), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 8), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 8), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 10), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 10), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 10), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 12), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 12), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH }; INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests)); diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 3019bff8f7..34e9d736db 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -393,6 +393,7 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad4d_avx2.c endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 44dee56780..df2c8da74e 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1048,13 +1048,13 @@ () specialize qw/vpx_highbd_sad32x16x4d sse2 neon/; add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad16x32x4d sse2 neon/; + specialize qw/vpx_highbd_sad16x32x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad16x16x4d sse2 neon/; + specialize qw/vpx_highbd_sad16x16x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad16x8x4d sse2 neon/; + specialize qw/vpx_highbd_sad16x8x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad8x16x4d sse2 neon/; diff --git a/vpx_dsp/x86/highbd_sad4d_avx2.c b/vpx_dsp/x86/highbd_sad4d_avx2.c new file mode 100644 index 0000000000..46c7e4fbc8 --- /dev/null +++ b/vpx_dsp/x86/highbd_sad4d_avx2.c @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h>  // AVX2
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static VPX_FORCE_INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
+                                          uint32_t sad_array[4]) {
+  const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
+  const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
+  const __m256i t2 = _mm256_hadd_epi32(t0, t1);
+  const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2),
+                                    _mm256_extractf128_si256(t2, 1));
+  _mm_storeu_si128((__m128i *)sad_array, sum);
+}
+
+static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/,
+                                               const uint16_t *src,
+                                               int src_stride,
+                                               uint16_t *refs[4],
+                                               int ref_stride, int height) {
+  int i;
+  for (i = 0; i < height; i++) {
+    __m256i r[4];
+
+    // load src and all ref[]
+    const __m256i s = _mm256_load_si256((const __m256i *)src);
+    r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
+    r[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
+    r[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
+    r[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
+
+    // absolute differences between every ref[] to src
+    r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s));
+    r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s));
+    r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s));
+    r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s));
+
+    // sum every abs diff
+    sums_16[0] = _mm256_add_epi16(sums_16[0], r[0]);
+    sums_16[1] = _mm256_add_epi16(sums_16[1], r[1]);
+    sums_16[2] = _mm256_add_epi16(sums_16[2], r[2]);
+    sums_16[3] = _mm256_add_epi16(sums_16[3], r[3]);
+
+    src += src_stride;
+    refs[0] += ref_stride;
+    refs[1] += ref_stride;
+    refs[2] += ref_stride;
+    refs[3] += ref_stride;
+  }
+}
+
+void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
+                                 const uint8_t *const ref_array[4],
+                                 int ref_stride, uint32_t sad_array[4]) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *refs[4];
+  __m256i sums_16[4];
+  __m256i sums_32[4];
+  int i;
+
+  refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+  refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+  refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+  refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+  sums_32[0] = _mm256_setzero_si256();
+  sums_32[1] = _mm256_setzero_si256();
+  sums_32[2] = _mm256_setzero_si256();
+  sums_32[3] = _mm256_setzero_si256();
+
+  for (i = 0; i < 2; ++i) {
+    sums_16[0] = _mm256_setzero_si256();
+    sums_16[1] = _mm256_setzero_si256();
+    sums_16[2] = _mm256_setzero_si256();
+    sums_16[3] = _mm256_setzero_si256();
+
+    highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16);
+
+    // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32
+    sums_32[0] = _mm256_add_epi32(
+        sums_32[0],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))));
+    sums_32[1] = _mm256_add_epi32(
+        sums_32[1],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))));
+    sums_32[2] = _mm256_add_epi32(
+        sums_32[2],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))));
+    sums_32[3] = _mm256_add_epi32(
+        sums_32[3],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+
_mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)))); + + src += src_stride << 4; + } + calc_final_4(sums_32, sad_array); +} + +void vpx_highbd_sad16x16x4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4]) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16); + + { + __m256i sums_32[4]; + sums_32[0] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))); + sums_32[1] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))); + sums_32[2] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))); + sums_32[3] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))); + calc_final_4(sums_32, sad_array); + } +} + +void vpx_highbd_sad16x8x4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4]) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); + + { + __m256i sums_32[4]; + sums_32[0] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))); + sums_32[1] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))); + sums_32[2] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))); + sums_32[3] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))); + calc_final_4(sums_32, sad_array); + } +} From 33c43c14ee04e6bb74ce037d44b98c5ad4ea9474 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 8 Sep 2022 15:35:13 -0700 Subject: [PATCH 428/926] Update third_party/googletest to v1.12.1 See https://github.com/google/googletest/releases/tag/release-1.12.1. Modeled after https://aomedia-review.googlesource.com/c/aom/+/162601. 
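For reference, the three kernels in the preceding patch all follow one pattern: each 16-bit lane of sums_16 holds the running sum of absolute differences for one pixel column, and with 12-bit input a lane reaches at most 16 * 4095 = 65520 after 16 rows, just under the uint16 limit, which is why the 16x32 variant widens into the 32-bit sums_32 accumulators every 16 rows. A minimal scalar sketch of the quantity being computed (illustrative only; the helper name here is made up, and the checked-in C reference is generated by macros in vpx_dsp/sad.c):

#include <stdlib.h> /* abs */
#include "vpx/vpx_integer.h"

/* One SAD per candidate reference block, over a 16-wide block of `height`
 * rows of 16-bit (high bit depth) samples. */
static void highbd_sad16xNx4d_ref(const uint16_t *src, int src_stride,
                                  const uint16_t *const refs[4],
                                  int ref_stride, int height,
                                  uint32_t sad_array[4]) {
  int k, i, j;
  for (k = 0; k < 4; ++k) {
    uint32_t sad = 0;
    for (i = 0; i < height; ++i) {
      for (j = 0; j < 16; ++j) {
        sad += (uint32_t)abs(src[i * src_stride + j] -
                             refs[k][i * ref_stride + j]);
      }
    }
    sad_array[k] = sad;
  }
}
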
Change-Id: If0ced3097b4c8490985e3381aaac9b3266d52ae7
---
 third_party/googletest/README.libvpx          |   10 +-
 third_party/googletest/src/.clang-format      |    4 +
 third_party/googletest/src/CONTRIBUTORS       |    2 +
 third_party/googletest/src/README.md          |    8 +-
 .../include/gtest/gtest-assertion-result.h    |  237 +++
 .../src/include/gtest/gtest-death-test.h      |   89 +-
 .../src/include/gtest/gtest-matchers.h        |   76 +-
 .../src/include/gtest/gtest-message.h         |   27 +-
 .../src/include/gtest/gtest-param-test.h      |   87 +-
 .../src/include/gtest/gtest-printers.h        |   65 +-
 .../googletest/src/include/gtest/gtest-spi.h  |  132 +-
 .../src/include/gtest/gtest-test-part.h       |   14 +-
 .../src/include/gtest/gtest-typed-test.h      |   38 +-
 .../googletest/src/include/gtest/gtest.h      |  532 ++----
 .../src/include/gtest/gtest_pred_impl.h       |  200 +--
 .../googletest/src/include/gtest/gtest_prod.h |    9 +-
 .../include/gtest/internal/custom/README.md   |   12 -
 .../gtest/internal/custom/gtest-port.h        |   31 +
 .../internal/gtest-death-test-internal.h      |   74 +-
 .../include/gtest/internal/gtest-filepath.h   |   17 +-
 .../include/gtest/internal/gtest-internal.h   |  330 ++--
 .../include/gtest/internal/gtest-param-util.h |  145 +-
 .../include/gtest/internal/gtest-port-arch.h  |  100 +-
 .../src/include/gtest/internal/gtest-port.h   |  982 +++++------
 .../src/include/gtest/internal/gtest-string.h |   18 +-
 .../include/gtest/internal/gtest-type-util.h  |   21 +-
 third_party/googletest/src/src/gtest-all.cc   |    3 +-
 .../src/src/gtest-assertion-result.cc         |   77 +
 .../googletest/src/src/gtest-death-test.cc    |  520 +++---
 .../googletest/src/src/gtest-filepath.cc      |   78 +-
 .../googletest/src/src/gtest-internal-inl.h   |  217 ++-
 .../googletest/src/src/gtest-matchers.cc      |    5 +-
 third_party/googletest/src/src/gtest-port.cc  |  349 ++--
 .../googletest/src/src/gtest-printers.cc      |  124 +-
 .../googletest/src/src/gtest-test-part.cc     |   19 +-
 .../googletest/src/src/gtest-typed-test.cc    |    7 +-
 third_party/googletest/src/src/gtest.cc       | 1509 +++++++++--------
 third_party/googletest/src/src/gtest_main.cc  |    5 +-
 38 files changed, 3174 insertions(+), 2999 deletions(-)
 create mode 100644 third_party/googletest/src/.clang-format
 create mode 100644 third_party/googletest/src/include/gtest/gtest-assertion-result.h
 create mode 100644 third_party/googletest/src/src/gtest-assertion-result.cc

diff --git a/third_party/googletest/README.libvpx b/third_party/googletest/README.libvpx
index b9a74922f0..5f6b01b0ec 100644
--- a/third_party/googletest/README.libvpx
+++ b/third_party/googletest/README.libvpx
@@ -1,5 +1,5 @@
 URL: https://github.com/google/googletest.git
-Version: release-1.11.0
+Version: release-1.12.1
 License: BSD
 License File: LICENSE
@@ -13,9 +13,17 @@ generation.
 Local Modifications:
 - Remove everything but:
+   .clang-format
    CONTRIBUTORS
    googletest/
     include
     README.md
     src
    LICENSE
+- Move .clang-format, CONTRIBUTORS, and LICENSE into googletest/
+- In googletest/include/gtest/internal/custom/gtest-port.h, define
+  GTEST_HAS_NOTIFICATION_ as 1 and use a stub Notification class to fix
+  the mingw32 g++ compilation errors caused by the lack of std::mutex
+  and std::condition_variable in the <mutex> and <condition_variable>
+  headers if mingw32 is configured with the win32 threads option.
See + https://stackoverflow.com/questions/17242516/mingw-w64-threads-posix-vs-win32 diff --git a/third_party/googletest/src/.clang-format b/third_party/googletest/src/.clang-format new file mode 100644 index 0000000000..5b9bfe6d22 --- /dev/null +++ b/third_party/googletest/src/.clang-format @@ -0,0 +1,4 @@ +# Run manually to reformat a file: +# clang-format -i --style=file +Language: Cpp +BasedOnStyle: Google diff --git a/third_party/googletest/src/CONTRIBUTORS b/third_party/googletest/src/CONTRIBUTORS index 76db0b40ff..77397a5b53 100644 --- a/third_party/googletest/src/CONTRIBUTORS +++ b/third_party/googletest/src/CONTRIBUTORS @@ -34,6 +34,7 @@ Manuel Klimek Mario Tanev Mark Paskin Markus Heule +Martijn Vels Matthew Simmons Mika Raento Mike Bland @@ -55,6 +56,7 @@ Russ Rufer Sean Mcafee Sigurður Ásgeirsson Sverre Sundsdal +Szymon Sobik Takeshi Yoshino Tracy Bialik Vadim Berman diff --git a/third_party/googletest/src/README.md b/third_party/googletest/src/README.md index 1f8b349ae7..d26b309ed0 100644 --- a/third_party/googletest/src/README.md +++ b/third_party/googletest/src/README.md @@ -25,7 +25,7 @@ When building GoogleTest as a standalone project, the typical workflow starts with ``` -git clone https://github.com/google/googletest.git -b release-1.10.0 +git clone https://github.com/google/googletest.git -b release-1.11.0 cd googletest # Main directory of the cloned repository. mkdir build # Create a directory to hold the build output. cd build @@ -94,7 +94,7 @@ include(FetchContent) FetchContent_Declare( googletest # Specify the commit you depend on and update it regularly. - URL https://github.com/google/googletest/archive/609281088cfefc76f9d0ce82e1ff6c30cc3591e5.zip + URL https://github.com/google/googletest/archive/e2239ee6043f73722e7aa812a459f54a28552929.zip ) # For Windows: Prevent overriding the parent project's compiler/linker settings set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) @@ -203,7 +203,9 @@ add -DGTEST_DONT_DEFINE_FOO=1 to the compiler flags to tell GoogleTest to change the macro's name from `FOO` -to `GTEST_FOO`. Currently `FOO` can be `FAIL`, `SUCCEED`, or `TEST`. For +to `GTEST_FOO`. Currently `FOO` can be `ASSERT_EQ`, `ASSERT_FALSE`, `ASSERT_GE`, +`ASSERT_GT`, `ASSERT_LE`, `ASSERT_LT`, `ASSERT_NE`, `ASSERT_TRUE`, +`EXPECT_FALSE`, `EXPECT_TRUE`, `FAIL`, `SUCCEED`, `TEST`, or `TEST_F`. For example, with `-DGTEST_DONT_DEFINE_TEST=1`, you'll need to write GTEST_TEST(SomeTest, DoesThis) { ... } diff --git a/third_party/googletest/src/include/gtest/gtest-assertion-result.h b/third_party/googletest/src/include/gtest/gtest-assertion-result.h new file mode 100644 index 0000000000..addbb59c64 --- /dev/null +++ b/third_party/googletest/src/include/gtest/gtest-assertion-result.h @@ -0,0 +1,237 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This file implements the AssertionResult type.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_
+
+#include <memory>
+#include <ostream>
+#include <string>
+#include <type_traits>
+
+#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-port.h"
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+namespace testing {
+
+// A class for indicating whether an assertion was successful. When
+// the assertion wasn't successful, the AssertionResult object
+// remembers a non-empty message that describes how it failed.
+//
+// To create an instance of this class, use one of the factory functions
+// (AssertionSuccess() and AssertionFailure()).
+//
+// This class is useful for two purposes:
+//   1. Defining predicate functions to be used with Boolean test assertions
+//      EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts
+//   2. Defining predicate-format functions to be
+//      used with predicate assertions (ASSERT_PRED_FORMAT*, etc).
+//
+// For example, if you define IsEven predicate:
+//
+//   testing::AssertionResult IsEven(int n) {
+//     if ((n % 2) == 0)
+//       return testing::AssertionSuccess();
+//     else
+//       return testing::AssertionFailure() << n << " is odd";
+//   }
+//
+// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5)))
+// will print the message
+//
+//   Value of: IsEven(Fib(5))
+//     Actual: false (5 is odd)
+//   Expected: true
+//
+// instead of a more opaque
+//
+//   Value of: IsEven(Fib(5))
+//     Actual: false
+//   Expected: true
+//
+// in case IsEven is a simple Boolean predicate.
+//
+// If you expect your predicate to be reused and want to support informative
+// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up
+// about half as often as positive ones in our tests), supply messages for
+// both success and failure cases:
+//
+//   testing::AssertionResult IsEven(int n) {
+//     if ((n % 2) == 0)
+//       return testing::AssertionSuccess() << n << " is even";
+//     else
+//       return testing::AssertionFailure() << n << " is odd";
+//   }
+//
+// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print
+//
+//   Value of: IsEven(Fib(6))
+//     Actual: true (8 is even)
+//   Expected: false
+//
+// NB: Predicates that support negative Boolean assertions have reduced
+// performance in positive ones so be careful not to use them in tests
+// that have lots (tens of thousands) of positive Boolean assertions.
+// +// To use this class with EXPECT_PRED_FORMAT assertions such as: +// +// // Verifies that Foo() returns an even number. +// EXPECT_PRED_FORMAT1(IsEven, Foo()); +// +// you need to define: +// +// testing::AssertionResult IsEven(const char* expr, int n) { +// if ((n % 2) == 0) +// return testing::AssertionSuccess(); +// else +// return testing::AssertionFailure() +// << "Expected: " << expr << " is even\n Actual: it's " << n; +// } +// +// If Foo() returns 5, you will see the following message: +// +// Expected: Foo() is even +// Actual: it's 5 +// +class GTEST_API_ AssertionResult { + public: + // Copy constructor. + // Used in EXPECT_TRUE/FALSE(assertion_result). + AssertionResult(const AssertionResult& other); + +// C4800 is a level 3 warning in Visual Studio 2015 and earlier. +// This warning is not emitted in Visual Studio 2017. +// This warning is off by default starting in Visual Studio 2019 but can be +// enabled with command-line options. +#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920) + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */) +#endif + + // Used in the EXPECT_TRUE/FALSE(bool_expression). + // + // T must be contextually convertible to bool. + // + // The second parameter prevents this overload from being considered if + // the argument is implicitly convertible to AssertionResult. In that case + // we want AssertionResult's copy constructor to be used. + template + explicit AssertionResult( + const T& success, + typename std::enable_if< + !std::is_convertible::value>::type* + /*enabler*/ + = nullptr) + : success_(success) {} + +#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920) + GTEST_DISABLE_MSC_WARNINGS_POP_() +#endif + + // Assignment operator. + AssertionResult& operator=(AssertionResult other) { + swap(other); + return *this; + } + + // Returns true if and only if the assertion succeeded. + operator bool() const { return success_; } // NOLINT + + // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. + AssertionResult operator!() const; + + // Returns the text streamed into this AssertionResult. Test assertions + // use it when they fail (i.e., the predicate's outcome doesn't match the + // assertion's expectation). When nothing has been streamed into the + // object, returns an empty string. + const char* message() const { + return message_.get() != nullptr ? message_->c_str() : ""; + } + // Deprecated; please use message() instead. + const char* failure_message() const { return message(); } + + // Streams a custom failure message into this object. + template + AssertionResult& operator<<(const T& value) { + AppendMessage(Message() << value); + return *this; + } + + // Allows streaming basic output manipulators such as endl or flush into + // this object. + AssertionResult& operator<<( + ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) { + AppendMessage(Message() << basic_manipulator); + return *this; + } + + private: + // Appends the contents of message to message_. + void AppendMessage(const Message& a_message) { + if (message_.get() == nullptr) message_.reset(new ::std::string); + message_->append(a_message.GetString().c_str()); + } + + // Swap the contents of this AssertionResult with other. + void swap(AssertionResult& other); + + // Stores result of the assertion predicate. + bool success_; + // Stores the message describing the condition in case the expectation + // construct is not satisfied with the predicate's outcome. 
+ // Referenced via a pointer to avoid taking too much stack frame space + // with test assertions. + std::unique_ptr< ::std::string> message_; +}; + +// Makes a successful assertion result. +GTEST_API_ AssertionResult AssertionSuccess(); + +// Makes a failed assertion result. +GTEST_API_ AssertionResult AssertionFailure(); + +// Makes a failed assertion result with the given failure message. +// Deprecated; use AssertionFailure() << msg. +GTEST_API_ AssertionResult AssertionFailure(const Message& msg); + +} // namespace testing + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_ diff --git a/third_party/googletest/src/include/gtest/gtest-death-test.h b/third_party/googletest/src/include/gtest/gtest-death-test.h index 9b4d4d1337..84e5a5bbd3 100644 --- a/third_party/googletest/src/include/gtest/gtest-death-test.h +++ b/third_party/googletest/src/include/gtest/gtest-death-test.h @@ -27,21 +27,21 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// // The Google C++ Testing and Mocking Framework (Google Test) // // This header file defines the public API for death tests. It is // #included by gtest.h so a user doesn't need to include this // directly. -// GOOGLETEST_CM0001 DO NOT DELETE + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ #include "gtest/internal/gtest-death-test-internal.h" -namespace testing { - // This flag controls the style of death tests. Valid values are "threadsafe", // meaning that the death test child process will re-execute the test binary // from the start, running only a single death test, or "fast", @@ -49,6 +49,8 @@ namespace testing { // after forking. GTEST_DECLARE_string_(death_test_style); +namespace testing { + #if GTEST_HAS_DEATH_TEST namespace internal { @@ -103,7 +105,6 @@ GTEST_API_ bool InDeathTestChild(); // // On the regular expressions used in death tests: // -// GOOGLETEST_CM0005 DO NOT DELETE // On POSIX-compliant systems (*nix), we use the library, // which uses the POSIX extended regex syntax. // @@ -169,24 +170,24 @@ GTEST_API_ bool InDeathTestChild(); // Asserts that a given `statement` causes the program to exit, with an // integer exit status that satisfies `predicate`, and emitting error output // that matches `matcher`. -# define ASSERT_EXIT(statement, predicate, matcher) \ - GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_FATAL_FAILURE_) +#define ASSERT_EXIT(statement, predicate, matcher) \ + GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_FATAL_FAILURE_) // Like `ASSERT_EXIT`, but continues on to successive tests in the // test suite, if any: -# define EXPECT_EXIT(statement, predicate, matcher) \ - GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_NONFATAL_FAILURE_) +#define EXPECT_EXIT(statement, predicate, matcher) \ + GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_NONFATAL_FAILURE_) // Asserts that a given `statement` causes the program to exit, either by // explicitly exiting with a nonzero exit code or being killed by a // signal, and emitting error output that matches `matcher`. 
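A minimal usage sketch for the exit macros and the two predicate classes in this hunk (example tests only, assuming a death-test-capable platform; the POSIX-only case is guarded the same way the header guards KilledBySignal):

#include <csignal>
#include <cstdlib>
#include "gtest/gtest.h"

TEST(ExitStatusDeathTest, ReportsCode2OnFailure) {
  // The third argument is matched against the child's stderr; "" (an empty
  // substring) matches any output.
  EXPECT_EXIT(std::exit(2), ::testing::ExitedWithCode(2), "");
}

#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
TEST(AbortDeathTest, DiesBySigabrt) {
  EXPECT_EXIT(std::abort(), ::testing::KilledBySignal(SIGABRT), "");
}
#endif
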
-# define ASSERT_DEATH(statement, matcher) \ - ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher) +#define ASSERT_DEATH(statement, matcher) \ + ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher) // Like `ASSERT_DEATH`, but continues on to successive tests in the // test suite, if any: -# define EXPECT_DEATH(statement, matcher) \ - EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher) +#define EXPECT_DEATH(statement, matcher) \ + EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher) // Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*: @@ -197,22 +198,23 @@ class GTEST_API_ ExitedWithCode { ExitedWithCode(const ExitedWithCode&) = default; void operator=(const ExitedWithCode& other) = delete; bool operator()(int exit_status) const; + private: const int exit_code_; }; -# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA // Tests that an exit code describes an exit due to termination by a // given signal. -// GOOGLETEST_CM0006 DO NOT DELETE class GTEST_API_ KilledBySignal { public: explicit KilledBySignal(int signum); bool operator()(int exit_status) const; + private: const int signum_; }; -# endif // !GTEST_OS_WINDOWS +#endif // !GTEST_OS_WINDOWS // EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode. // The death testing framework causes this to have interesting semantics, @@ -257,23 +259,21 @@ class GTEST_API_ KilledBySignal { // EXPECT_EQ(12, DieInDebugOr12(&sideeffect)); // }, "death"); // -# ifdef NDEBUG +#ifdef NDEBUG -# define EXPECT_DEBUG_DEATH(statement, regex) \ +#define EXPECT_DEBUG_DEATH(statement, regex) \ GTEST_EXECUTE_STATEMENT_(statement, regex) -# define ASSERT_DEBUG_DEATH(statement, regex) \ +#define ASSERT_DEBUG_DEATH(statement, regex) \ GTEST_EXECUTE_STATEMENT_(statement, regex) -# else +#else -# define EXPECT_DEBUG_DEATH(statement, regex) \ - EXPECT_DEATH(statement, regex) +#define EXPECT_DEBUG_DEATH(statement, regex) EXPECT_DEATH(statement, regex) -# define ASSERT_DEBUG_DEATH(statement, regex) \ - ASSERT_DEATH(statement, regex) +#define ASSERT_DEBUG_DEATH(statement, regex) ASSERT_DEATH(statement, regex) -# endif // NDEBUG for EXPECT_DEBUG_DEATH +#endif // NDEBUG for EXPECT_DEBUG_DEATH #endif // GTEST_HAS_DEATH_TEST // This macro is used for implementing macros such as @@ -311,18 +311,17 @@ class GTEST_API_ KilledBySignal { // statement unconditionally returns or throws. The Message constructor at // the end allows the syntax of streaming additional messages into the // macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH. 
-# define GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, terminator) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - GTEST_LOG_(WARNING) \ - << "Death tests are not supported on this platform.\n" \ - << "Statement '" #statement "' cannot be verified."; \ - } else if (::testing::internal::AlwaysFalse()) { \ - ::testing::internal::RE::PartialMatch(".*", (regex)); \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - terminator; \ - } else \ - ::testing::Message() +#define GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, terminator) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + GTEST_LOG_(WARNING) << "Death tests are not supported on this platform.\n" \ + << "Statement '" #statement "' cannot be verified."; \ + } else if (::testing::internal::AlwaysFalse()) { \ + ::testing::internal::RE::PartialMatch(".*", (regex)); \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + terminator; \ + } else \ + ::testing::Message() // EXPECT_DEATH_IF_SUPPORTED(statement, regex) and // ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if @@ -330,15 +329,15 @@ class GTEST_API_ KilledBySignal { // useful when you are combining death test assertions with normal test // assertions in one test. #if GTEST_HAS_DEATH_TEST -# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ - EXPECT_DEATH(statement, regex) -# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ - ASSERT_DEATH(statement, regex) +#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ + EXPECT_DEATH(statement, regex) +#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ + ASSERT_DEATH(statement, regex) #else -# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ - GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, ) -# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ - GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, return) +#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ + GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, ) +#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ + GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, return) #endif } // namespace testing diff --git a/third_party/googletest/src/include/gtest/gtest-matchers.h b/third_party/googletest/src/include/gtest/gtest-matchers.h index 9fa34a05ba..bffa00c533 100644 --- a/third_party/googletest/src/include/gtest/gtest-matchers.h +++ b/third_party/googletest/src/include/gtest/gtest-matchers.h @@ -32,6 +32,10 @@ // This file implements just enough of the matcher interface to allow // EXPECT_DEATH and friends to accept a matcher argument. +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_ @@ -98,11 +102,11 @@ class MatchResultListener { private: ::std::ostream* const stream_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(MatchResultListener); + MatchResultListener(const MatchResultListener&) = delete; + MatchResultListener& operator=(const MatchResultListener&) = delete; }; -inline MatchResultListener::~MatchResultListener() { -} +inline MatchResultListener::~MatchResultListener() {} // An instance of a subclass of this knows how to describe itself as a // matcher. 
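Since this header exists so that EXPECT_DEATH and friends can take either a bare substring or a matcher, a short sketch of both forms (ValidateIndex is a hypothetical function that logs to stderr and aborts):

#include <cstdio>
#include <cstdlib>
#include "gtest/gtest.h"

// Hypothetical function under test.
static void ValidateIndex(int i) {
  if (i < 0) {
    std::fprintf(stderr, "index out of range: %d\n", i);
    std::abort();
  }
}

TEST(ValidateIndexDeathTest, RejectsNegativeIndex) {
  // Bare string form: matched as a substring of the stderr output.
  EXPECT_DEATH(ValidateIndex(-3), "out of range");
  // Matcher form, accepted via the Matcher<const std::string&> overload.
  // (Keep the pattern simple: regex syntax differs between POSIX and the
  // built-in simple regex used on Windows, as documented above.)
  EXPECT_DEATH(ValidateIndex(-3), ::testing::ContainsRegex("out of range"));
}
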
@@ -176,27 +180,39 @@ namespace internal { struct AnyEq { template - bool operator()(const A& a, const B& b) const { return a == b; } + bool operator()(const A& a, const B& b) const { + return a == b; + } }; struct AnyNe { template - bool operator()(const A& a, const B& b) const { return a != b; } + bool operator()(const A& a, const B& b) const { + return a != b; + } }; struct AnyLt { template - bool operator()(const A& a, const B& b) const { return a < b; } + bool operator()(const A& a, const B& b) const { + return a < b; + } }; struct AnyGt { template - bool operator()(const A& a, const B& b) const { return a > b; } + bool operator()(const A& a, const B& b) const { + return a > b; + } }; struct AnyLe { template - bool operator()(const A& a, const B& b) const { return a <= b; } + bool operator()(const A& a, const B& b) const { + return a <= b; + } }; struct AnyGe { template - bool operator()(const A& a, const B& b) const { return a >= b; } + bool operator()(const A& a, const B& b) const { + return a >= b; + } }; // A match result listener that ignores the explanation. @@ -205,7 +221,8 @@ class DummyMatchResultListener : public MatchResultListener { DummyMatchResultListener() : MatchResultListener(nullptr) {} private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(DummyMatchResultListener); + DummyMatchResultListener(const DummyMatchResultListener&) = delete; + DummyMatchResultListener& operator=(const DummyMatchResultListener&) = delete; }; // A match result listener that forwards the explanation to a given @@ -217,7 +234,9 @@ class StreamMatchResultListener : public MatchResultListener { : MatchResultListener(os) {} private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamMatchResultListener); + StreamMatchResultListener(const StreamMatchResultListener&) = delete; + StreamMatchResultListener& operator=(const StreamMatchResultListener&) = + delete; }; struct SharedPayloadBase { @@ -284,17 +303,18 @@ class MatcherBase : private MatcherDescriberInterface { } protected: - MatcherBase() : vtable_(nullptr) {} + MatcherBase() : vtable_(nullptr), buffer_() {} // Constructs a matcher from its implementation. template - explicit MatcherBase(const MatcherInterface* impl) { + explicit MatcherBase(const MatcherInterface* impl) + : vtable_(nullptr), buffer_() { Init(impl); } template ::type::is_gtest_matcher> - MatcherBase(M&& m) { // NOLINT + MatcherBase(M&& m) : vtable_(nullptr), buffer_() { // NOLINT Init(std::forward(m)); } @@ -420,8 +440,8 @@ class MatcherBase : private MatcherDescriberInterface { static const M& Get(const MatcherBase& m) { // When inlined along with Init, need to be explicit to avoid violating // strict aliasing rules. 
- const M *ptr = static_cast( - static_cast(&m.buffer_)); + const M* ptr = + static_cast(static_cast(&m.buffer_)); return *ptr; } static void Init(MatcherBase& m, M impl) { @@ -741,7 +761,7 @@ template class EqMatcher : public ComparisonBase, Rhs, AnyEq> { public: explicit EqMatcher(const Rhs& rhs) - : ComparisonBase, Rhs, AnyEq>(rhs) { } + : ComparisonBase, Rhs, AnyEq>(rhs) {} static const char* Desc() { return "is equal to"; } static const char* NegatedDesc() { return "isn't equal to"; } }; @@ -749,7 +769,7 @@ template class NeMatcher : public ComparisonBase, Rhs, AnyNe> { public: explicit NeMatcher(const Rhs& rhs) - : ComparisonBase, Rhs, AnyNe>(rhs) { } + : ComparisonBase, Rhs, AnyNe>(rhs) {} static const char* Desc() { return "isn't equal to"; } static const char* NegatedDesc() { return "is equal to"; } }; @@ -757,7 +777,7 @@ template class LtMatcher : public ComparisonBase, Rhs, AnyLt> { public: explicit LtMatcher(const Rhs& rhs) - : ComparisonBase, Rhs, AnyLt>(rhs) { } + : ComparisonBase, Rhs, AnyLt>(rhs) {} static const char* Desc() { return "is <"; } static const char* NegatedDesc() { return "isn't <"; } }; @@ -765,7 +785,7 @@ template class GtMatcher : public ComparisonBase, Rhs, AnyGt> { public: explicit GtMatcher(const Rhs& rhs) - : ComparisonBase, Rhs, AnyGt>(rhs) { } + : ComparisonBase, Rhs, AnyGt>(rhs) {} static const char* Desc() { return "is >"; } static const char* NegatedDesc() { return "isn't >"; } }; @@ -773,7 +793,7 @@ template class LeMatcher : public ComparisonBase, Rhs, AnyLe> { public: explicit LeMatcher(const Rhs& rhs) - : ComparisonBase, Rhs, AnyLe>(rhs) { } + : ComparisonBase, Rhs, AnyLe>(rhs) {} static const char* Desc() { return "is <="; } static const char* NegatedDesc() { return "isn't <="; } }; @@ -781,7 +801,7 @@ template class GeMatcher : public ComparisonBase, Rhs, AnyGe> { public: explicit GeMatcher(const Rhs& rhs) - : ComparisonBase, Rhs, AnyGe>(rhs) { } + : ComparisonBase, Rhs, AnyGe>(rhs) {} static const char* Desc() { return "is >="; } static const char* NegatedDesc() { return "isn't >="; } }; @@ -872,12 +892,16 @@ PolymorphicMatcher ContainsRegex( // Note: if the parameter of Eq() were declared as const T&, Eq("foo") // wouldn't compile. template -inline internal::EqMatcher Eq(T x) { return internal::EqMatcher(x); } +inline internal::EqMatcher Eq(T x) { + return internal::EqMatcher(x); +} // Constructs a Matcher from a 'value' of type T. The constructed // matcher matches any value that's equal to 'value'. template -Matcher::Matcher(T value) { *this = Eq(value); } +Matcher::Matcher(T value) { + *this = Eq(value); +} // Creates a monomorphic matcher that matches anything with type Lhs // and equal to rhs. A user may need to use this instead of Eq(...) @@ -892,7 +916,9 @@ Matcher::Matcher(T value) { *this = Eq(value); } // can always write Matcher(Lt(5)) to be explicit about the type, // for example. template -inline Matcher TypedEq(const Rhs& rhs) { return Eq(rhs); } +inline Matcher TypedEq(const Rhs& rhs) { + return Eq(rhs); +} // Creates a polymorphic matcher that matches anything >= x. template diff --git a/third_party/googletest/src/include/gtest/gtest-message.h b/third_party/googletest/src/include/gtest/gtest-message.h index becfd49fcb..6c8bf90009 100644 --- a/third_party/googletest/src/include/gtest/gtest-message.h +++ b/third_party/googletest/src/include/gtest/gtest-message.h @@ -27,7 +27,6 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-// // The Google C++ Testing and Mocking Framework (Google Test) // // This header file defines the Message class. @@ -42,7 +41,9 @@ // to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user // program! -// GOOGLETEST_CM0001 DO NOT DELETE +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ @@ -110,8 +111,8 @@ class GTEST_API_ Message { // Streams a non-pointer value to this object. template - inline Message& operator <<(const T& val) { - // Some libraries overload << for STL containers. These + inline Message& operator<<(const T& val) { + // Some libraries overload << for STL containers. These // overloads are defined in the global namespace instead of ::std. // // C++'s symbol lookup rule (i.e. Koenig lookup) says that these @@ -125,7 +126,7 @@ class GTEST_API_ Message { // from the global namespace. With this using declaration, // overloads of << defined in the global namespace and those // visible via Koenig lookup are both exposed in this function. - using ::operator <<; + using ::operator<<; *ss_ << val; return *this; } @@ -144,7 +145,7 @@ class GTEST_API_ Message { // ensure consistent result across compilers, we always treat NULL // as "(null)". template - inline Message& operator <<(T* const& pointer) { // NOLINT + inline Message& operator<<(T* const& pointer) { // NOLINT if (pointer == nullptr) { *ss_ << "(null)"; } else { @@ -159,25 +160,23 @@ class GTEST_API_ Message { // templatized version above. Without this definition, streaming // endl or other basic IO manipulators to Message will confuse the // compiler. - Message& operator <<(BasicNarrowIoManip val) { + Message& operator<<(BasicNarrowIoManip val) { *ss_ << val; return *this; } // Instead of 1/0, we want to see true/false for bool values. - Message& operator <<(bool b) { - return *this << (b ? "true" : "false"); - } + Message& operator<<(bool b) { return *this << (b ? "true" : "false"); } // These two overloads allow streaming a wide C string to a Message // using the UTF-8 encoding. - Message& operator <<(const wchar_t* wide_c_str); - Message& operator <<(wchar_t* wide_c_str); + Message& operator<<(const wchar_t* wide_c_str); + Message& operator<<(wchar_t* wide_c_str); #if GTEST_HAS_STD_WSTRING // Converts the given wide string to a narrow string using the UTF-8 // encoding, and streams the result to this Message object. - Message& operator <<(const ::std::wstring& wstr); + Message& operator<<(const ::std::wstring& wstr); #endif // GTEST_HAS_STD_WSTRING // Gets the text streamed to this object so far as an std::string. @@ -196,7 +195,7 @@ class GTEST_API_ Message { }; // Streams a Message to an ostream. -inline std::ostream& operator <<(std::ostream& os, const Message& sb) { +inline std::ostream& operator<<(std::ostream& os, const Message& sb) { return os << sb.GetString(); } diff --git a/third_party/googletest/src/include/gtest/gtest-param-test.h b/third_party/googletest/src/include/gtest/gtest-param-test.h index 804e702817..b55119ac62 100644 --- a/third_party/googletest/src/include/gtest/gtest-param-test.h +++ b/third_party/googletest/src/include/gtest/gtest-param-test.h @@ -26,11 +26,14 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-// + // Macros and functions for implementing parameterized tests // in Google C++ Testing and Mocking Framework (Google Test) -// -// GOOGLETEST_CM0001 DO NOT DELETE + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ @@ -353,9 +356,7 @@ internal::ValueArray Values(T... v) { // } // INSTANTIATE_TEST_SUITE_P(BoolSequence, FlagDependentTest, Bool()); // -inline internal::ParamGenerator Bool() { - return Values(false, true); -} +inline internal::ParamGenerator Bool() { return Values(false, true); } // Combine() allows the user to combine two or more sequences to produce // values of a Cartesian product of those sequences' elements. @@ -428,8 +429,11 @@ internal::CartesianProductHolder Combine(const Generator&... g) { return 0; \ } \ static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \ - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \ - test_name)); \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + (const GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &) = delete; \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=( \ + const GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name) &) = delete; /* NOLINT */ \ }; \ int GTEST_TEST_CLASS_NAME_(test_suite_name, \ test_name)::gtest_registering_dummy_ = \ @@ -453,43 +457,42 @@ internal::CartesianProductHolder Combine(const Generator&... g) { #define GTEST_GET_FIRST_(first, ...) first #define GTEST_GET_SECOND_(first, second, ...) second -#define INSTANTIATE_TEST_SUITE_P(prefix, test_suite_name, ...) \ - static ::testing::internal::ParamGenerator \ - gtest_##prefix##test_suite_name##_EvalGenerator_() { \ - return GTEST_EXPAND_(GTEST_GET_FIRST_(__VA_ARGS__, DUMMY_PARAM_)); \ - } \ - static ::std::string gtest_##prefix##test_suite_name##_EvalGenerateName_( \ - const ::testing::TestParamInfo& info) { \ - if (::testing::internal::AlwaysFalse()) { \ - ::testing::internal::TestNotEmpty(GTEST_EXPAND_(GTEST_GET_SECOND_( \ - __VA_ARGS__, \ - ::testing::internal::DefaultParamName, \ - DUMMY_PARAM_))); \ - auto t = std::make_tuple(__VA_ARGS__); \ - static_assert(std::tuple_size::value <= 2, \ - "Too Many Args!"); \ - } \ - return ((GTEST_EXPAND_(GTEST_GET_SECOND_( \ - __VA_ARGS__, \ - ::testing::internal::DefaultParamName, \ - DUMMY_PARAM_))))(info); \ - } \ - static int gtest_##prefix##test_suite_name##_dummy_ \ - GTEST_ATTRIBUTE_UNUSED_ = \ - ::testing::UnitTest::GetInstance() \ - ->parameterized_test_registry() \ - .GetTestSuitePatternHolder( \ - GTEST_STRINGIFY_(test_suite_name), \ - ::testing::internal::CodeLocation(__FILE__, __LINE__)) \ - ->AddTestSuiteInstantiation( \ - GTEST_STRINGIFY_(prefix), \ - >est_##prefix##test_suite_name##_EvalGenerator_, \ - >est_##prefix##test_suite_name##_EvalGenerateName_, \ +#define INSTANTIATE_TEST_SUITE_P(prefix, test_suite_name, ...) 
\ + static ::testing::internal::ParamGenerator \ + gtest_##prefix##test_suite_name##_EvalGenerator_() { \ + return GTEST_EXPAND_(GTEST_GET_FIRST_(__VA_ARGS__, DUMMY_PARAM_)); \ + } \ + static ::std::string gtest_##prefix##test_suite_name##_EvalGenerateName_( \ + const ::testing::TestParamInfo& info) { \ + if (::testing::internal::AlwaysFalse()) { \ + ::testing::internal::TestNotEmpty(GTEST_EXPAND_(GTEST_GET_SECOND_( \ + __VA_ARGS__, \ + ::testing::internal::DefaultParamName, \ + DUMMY_PARAM_))); \ + auto t = std::make_tuple(__VA_ARGS__); \ + static_assert(std::tuple_size::value <= 2, \ + "Too Many Args!"); \ + } \ + return ((GTEST_EXPAND_(GTEST_GET_SECOND_( \ + __VA_ARGS__, \ + ::testing::internal::DefaultParamName, \ + DUMMY_PARAM_))))(info); \ + } \ + static int gtest_##prefix##test_suite_name##_dummy_ \ + GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::UnitTest::GetInstance() \ + ->parameterized_test_registry() \ + .GetTestSuitePatternHolder( \ + GTEST_STRINGIFY_(test_suite_name), \ + ::testing::internal::CodeLocation(__FILE__, __LINE__)) \ + ->AddTestSuiteInstantiation( \ + GTEST_STRINGIFY_(prefix), \ + >est_##prefix##test_suite_name##_EvalGenerator_, \ + >est_##prefix##test_suite_name##_EvalGenerateName_, \ __FILE__, __LINE__) - // Allow Marking a Parameterized test class as not needing to be instantiated. -#define GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(T) \ +#define GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(T) \ namespace gtest_do_not_use_outside_namespace_scope {} \ static const ::testing::internal::MarkAsIgnored gtest_allow_ignore_##T( \ GTEST_STRINGIFY_(T)) diff --git a/third_party/googletest/src/include/gtest/gtest-printers.h b/third_party/googletest/src/include/gtest/gtest-printers.h index 076c9de1f4..a91e8b8b10 100644 --- a/third_party/googletest/src/include/gtest/gtest-printers.h +++ b/third_party/googletest/src/include/gtest/gtest-printers.h @@ -27,7 +27,6 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - // Google Test - The Google C++ Testing and Mocking Framework // // This file implements a universal value printer that can print a @@ -95,7 +94,9 @@ // being defined as many user-defined container types don't have // value_type. -// GOOGLETEST_CM0001 DO NOT DELETE +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ @@ -257,12 +258,10 @@ struct ConvertibleToStringViewPrinter { #endif }; - // Prints the given number of bytes in the given object to the given // ostream. GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes, - size_t count, - ::std::ostream* os); + size_t count, ::std::ostream* os); struct RawBytesPrinter { // SFINAE on `sizeof` to make sure we have a complete type. template @@ -360,7 +359,7 @@ GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char); GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char); GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t); GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t); -#ifdef __cpp_char8_t +#ifdef __cpp_lib_char8_t GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char8_t); GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char8_t); #endif @@ -375,12 +374,12 @@ GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char32_t); // to point to a NUL-terminated string, and thus can print it as a string. 
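The customization point this header documents, a PrintTo() overload found by argument-dependent lookup, can be exercised as follows (a sketch; Point is a made-up user type):

#include <ostream>
#include "gtest/gtest.h"

struct Point {
  int x, y;
};

// Teach Google Test how to print Point in assertion messages. The overload
// must live in the same namespace as Point so ADL finds it.
void PrintTo(const Point& p, std::ostream* os) {
  *os << "(" << p.x << ", " << p.y << ")";
}

TEST(PointPrinterTest, PrintToStringUsesPrintTo) {
  Point p = {1, 2};
  EXPECT_EQ("(1, 2)", ::testing::PrintToString(p));
}
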
#define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType) \ - template <> \ - class FormatForComparison { \ - public: \ - static ::std::string Format(CharType* value) { \ - return ::testing::PrintToString(value); \ - } \ + template <> \ + class FormatForComparison { \ + public: \ + static ::std::string Format(CharType* value) { \ + return ::testing::PrintToString(value); \ + } \ } GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string); @@ -410,8 +409,8 @@ GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring); // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. template -std::string FormatForComparisonFailureMessage( - const T1& value, const T2& /* other_operand */) { +std::string FormatForComparisonFailureMessage(const T1& value, + const T2& /* other_operand */) { return FormatForComparison::Format(value); } @@ -479,6 +478,12 @@ inline void PrintTo(char8_t c, ::std::ostream* os) { } #endif +// gcc/clang __{u,}int128_t +#if defined(__SIZEOF_INT128__) +GTEST_API_ void PrintTo(__uint128_t v, ::std::ostream* os); +GTEST_API_ void PrintTo(__int128_t v, ::std::ostream* os); +#endif // __SIZEOF_INT128__ + // Overloads for C strings. GTEST_API_ void PrintTo(const char* s, ::std::ostream* os); inline void PrintTo(char* s, ::std::ostream* os) { @@ -545,7 +550,7 @@ void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) { } // Overloads for ::std::string. -GTEST_API_ void PrintStringTo(const ::std::string&s, ::std::ostream* os); +GTEST_API_ void PrintStringTo(const ::std::string& s, ::std::ostream* os); inline void PrintTo(const ::std::string& s, ::std::ostream* os) { PrintStringTo(s, os); } @@ -572,7 +577,7 @@ inline void PrintTo(const ::std::u32string& s, ::std::ostream* os) { // Overloads for ::std::wstring. #if GTEST_HAS_STD_WSTRING -GTEST_API_ void PrintWideStringTo(const ::std::wstring&s, ::std::ostream* os); +GTEST_API_ void PrintWideStringTo(const ::std::wstring& s, ::std::ostream* os); inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) { PrintWideStringTo(s, os); } @@ -587,6 +592,12 @@ inline void PrintTo(internal::StringView sp, ::std::ostream* os) { inline void PrintTo(std::nullptr_t, ::std::ostream* os) { *os << "(nullptr)"; } +#if GTEST_HAS_RTTI +inline void PrintTo(const std::type_info& info, std::ostream* os) { + *os << internal::GetTypeName(info); +} +#endif // GTEST_HAS_RTTI + template void PrintTo(std::reference_wrapper ref, ::std::ostream* os) { UniversalPrinter::Print(ref.get(), os); @@ -744,6 +755,14 @@ class UniversalPrinter> { } }; +template <> +class UniversalPrinter { + public: + static void Print(decltype(Nullopt()), ::std::ostream* os) { + *os << "(nullopt)"; + } +}; + #endif // GTEST_INTERNAL_HAS_OPTIONAL #if GTEST_INTERNAL_HAS_VARIANT @@ -802,8 +821,8 @@ void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) { } } // This overload prints a (const) char array compactly. -GTEST_API_ void UniversalPrintArray( - const char* begin, size_t len, ::std::ostream* os); +GTEST_API_ void UniversalPrintArray(const char* begin, size_t len, + ::std::ostream* os); #ifdef __cpp_char8_t // This overload prints a (const) char8_t array compactly. @@ -820,8 +839,8 @@ GTEST_API_ void UniversalPrintArray(const char32_t* begin, size_t len, ::std::ostream* os); // This overload prints a (const) wchar_t array compactly. 
-GTEST_API_ void UniversalPrintArray( - const wchar_t* begin, size_t len, ::std::ostream* os); +GTEST_API_ void UniversalPrintArray(const wchar_t* begin, size_t len, + ::std::ostream* os); // Implements printing an array type T[N]. template @@ -980,10 +999,10 @@ void UniversalPrint(const T& value, ::std::ostream* os) { UniversalPrinter::Print(value, os); } -typedef ::std::vector< ::std::string> Strings; +typedef ::std::vector<::std::string> Strings; - // Tersely prints the first N fields of a tuple to a string vector, - // one element for each field. +// Tersely prints the first N fields of a tuple to a string vector, +// one element for each field. template void TersePrintPrefixToStrings(const Tuple&, std::integral_constant, Strings*) {} diff --git a/third_party/googletest/src/include/gtest/gtest-spi.h b/third_party/googletest/src/include/gtest/gtest-spi.h index eacef44669..bec8c4810b 100644 --- a/third_party/googletest/src/include/gtest/gtest-spi.h +++ b/third_party/googletest/src/include/gtest/gtest-spi.h @@ -27,12 +27,9 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// // Utilities for testing Google Test itself and code that uses Google Test // (e.g. frameworks built on top of Google Test). -// GOOGLETEST_CM0004 DO NOT DELETE - #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_ @@ -88,7 +85,10 @@ class GTEST_API_ ScopedFakeTestPartResultReporter TestPartResultReporterInterface* old_reporter_; TestPartResultArray* const result_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedFakeTestPartResultReporter); + ScopedFakeTestPartResultReporter(const ScopedFakeTestPartResultReporter&) = + delete; + ScopedFakeTestPartResultReporter& operator=( + const ScopedFakeTestPartResultReporter&) = delete; }; namespace internal { @@ -104,12 +104,14 @@ class GTEST_API_ SingleFailureChecker { SingleFailureChecker(const TestPartResultArray* results, TestPartResult::Type type, const std::string& substr); ~SingleFailureChecker(); + private: const TestPartResultArray* const results_; const TestPartResult::Type type_; const std::string substr_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(SingleFailureChecker); + SingleFailureChecker(const SingleFailureChecker&) = delete; + SingleFailureChecker& operator=(const SingleFailureChecker&) = delete; }; } // namespace internal @@ -119,7 +121,8 @@ class GTEST_API_ SingleFailureChecker { GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 // A set of macros for testing Google Test assertions or code that's expected -// to generate Google Test fatal failures. It verifies that the given +// to generate Google Test fatal failures (e.g. a failure from an ASSERT_EQ, but +// not a non-fatal failure, as from EXPECT_EQ). It verifies that the given // statement will cause exactly one fatal Google Test failure with 'substr' // being part of the failure message. // @@ -141,44 +144,46 @@ GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 // helper macro, due to some peculiarity in how the preprocessor // works. The AcceptsMacroThatExpandsToUnprotectedComma test in // gtest_unittest.cc will fail to compile if we do that. 
-#define EXPECT_FATAL_FAILURE(statement, substr) \
-  do { \
-    class GTestExpectFatalFailureHelper {\
-     public:\
-      static void Execute() { statement; }\
-    };\
-    ::testing::TestPartResultArray gtest_failures;\
-    ::testing::internal::SingleFailureChecker gtest_checker(\
-        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
-    {\
-      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
-          ::testing::ScopedFakeTestPartResultReporter:: \
-          INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
-      GTestExpectFatalFailureHelper::Execute();\
-    }\
+#define EXPECT_FATAL_FAILURE(statement, substr) \
+  do { \
+    class GTestExpectFatalFailureHelper { \
+     public: \
+      static void Execute() { statement; } \
+    }; \
+    ::testing::TestPartResultArray gtest_failures; \
+    ::testing::internal::SingleFailureChecker gtest_checker( \
+        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \
+    { \
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \
+          ::testing::ScopedFakeTestPartResultReporter:: \
+              INTERCEPT_ONLY_CURRENT_THREAD, \
+          &gtest_failures); \
+      GTestExpectFatalFailureHelper::Execute(); \
+    } \
   } while (::testing::internal::AlwaysFalse())

-#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
-  do { \
-    class GTestExpectFatalFailureHelper {\
-     public:\
-      static void Execute() { statement; }\
-    };\
-    ::testing::TestPartResultArray gtest_failures;\
-    ::testing::internal::SingleFailureChecker gtest_checker(\
-        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
-    {\
-      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
-          ::testing::ScopedFakeTestPartResultReporter:: \
-          INTERCEPT_ALL_THREADS, &gtest_failures);\
-      GTestExpectFatalFailureHelper::Execute();\
-    }\
+#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
+  do { \
+    class GTestExpectFatalFailureHelper { \
+     public: \
+      static void Execute() { statement; } \
+    }; \
+    ::testing::TestPartResultArray gtest_failures; \
+    ::testing::internal::SingleFailureChecker gtest_checker( \
+        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \
+    { \
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \
+          ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
+          &gtest_failures); \
+      GTestExpectFatalFailureHelper::Execute(); \
+    } \
   } while (::testing::internal::AlwaysFalse())

 // A macro for testing Google Test assertions or code that's expected to
-// generate Google Test non-fatal failures. It asserts that the given
-// statement will cause exactly one non-fatal Google Test failure with 'substr'
-// being part of the failure message.
+// generate Google Test non-fatal failures (e.g. a failure from an EXPECT_EQ,
+// but not from an ASSERT_EQ). It asserts that the given statement will cause
+// exactly one non-fatal Google Test failure with 'substr' being part of the
+// failure message.
 //
 // There are two different versions of this macro. EXPECT_NONFATAL_FAILURE only
 // affects and considers failures generated in the current thread and
@@ -207,32 +212,37 @@ GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
 // instead of
 //   GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
 // to avoid an MSVC warning on unreachable code.
-#define EXPECT_NONFATAL_FAILURE(statement, substr) \
-  do {\
-    ::testing::TestPartResultArray gtest_failures;\
-    ::testing::internal::SingleFailureChecker gtest_checker(\
+#define EXPECT_NONFATAL_FAILURE(statement, substr) \
+  do { \
+    ::testing::TestPartResultArray gtest_failures; \
+    ::testing::internal::SingleFailureChecker gtest_checker( \
         &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
-        (substr));\
-    {\
-      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
-          ::testing::ScopedFakeTestPartResultReporter:: \
-          INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
-      if (::testing::internal::AlwaysTrue()) { statement; }\
-    }\
+        (substr)); \
+    { \
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \
+          ::testing::ScopedFakeTestPartResultReporter:: \
+              INTERCEPT_ONLY_CURRENT_THREAD, \
+          &gtest_failures); \
+      if (::testing::internal::AlwaysTrue()) { \
+        statement; \
+      } \
+    } \
   } while (::testing::internal::AlwaysFalse())

-#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
-  do {\
-    ::testing::TestPartResultArray gtest_failures;\
-    ::testing::internal::SingleFailureChecker gtest_checker(\
-        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
-        (substr));\
-    {\
-      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
+  do { \
+    ::testing::TestPartResultArray gtest_failures; \
+    ::testing::internal::SingleFailureChecker gtest_checker( \
+        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
+        (substr)); \
+    { \
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \
          ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
-          &gtest_failures);\
-      if (::testing::internal::AlwaysTrue()) { statement; }\
-    }\
+          &gtest_failures); \
+      if (::testing::internal::AlwaysTrue()) { \
+        statement; \
+      } \
+    } \
   } while (::testing::internal::AlwaysFalse())

 #endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_
diff --git a/third_party/googletest/src/include/gtest/gtest-test-part.h b/third_party/googletest/src/include/gtest/gtest-test-part.h
index 203fdf98c6..09cc8c34f0 100644
--- a/third_party/googletest/src/include/gtest/gtest-test-part.h
+++ b/third_party/googletest/src/include/gtest/gtest-test-part.h
@@ -26,14 +26,17 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// GOOGLETEST_CM0001 DO NOT DELETE
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*

 #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
 #define GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_

 #include <iosfwd>
 #include <vector>
+
 #include "gtest/internal/gtest-internal.h"
 #include "gtest/internal/gtest-string.h"

@@ -142,7 +145,8 @@ class GTEST_API_ TestPartResultArray {
  private:
   std::vector<TestPartResult> array_;

-  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestPartResultArray);
+  TestPartResultArray(const TestPartResultArray&) = delete;
+  TestPartResultArray& operator=(const TestPartResultArray&) = delete;
 };

 // This interface knows how to report a test part result.
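A minimal sketch of how the SPI macros reformatted above are used to self-test assertion helpers (example tests only; the quoted substrings come from the stock EXPECT_EQ/ASSERT_TRUE failure messages):

#include "gtest/gtest-spi.h"
#include "gtest/gtest.h"

TEST(AssertionHelperSelfTest, ReportsExactlyOneNonFatalFailure) {
  // "Expected equality" is a stable substring of EXPECT_EQ's message.
  EXPECT_NONFATAL_FAILURE(EXPECT_EQ(1, 2), "Expected equality");
}

TEST(AssertionHelperSelfTest, ReportsExactlyOneFatalFailure) {
  // The statement runs inside a generated static helper, so it must not
  // reference non-static locals of the enclosing test.
  EXPECT_FATAL_FAILURE(ASSERT_TRUE(false), "false");
}
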
@@ -168,11 +172,13 @@ class GTEST_API_ HasNewFatalFailureHelper ~HasNewFatalFailureHelper() override; void ReportTestPartResult(const TestPartResult& result) override; bool has_new_fatal_failure() const { return has_new_fatal_failure_; } + private: bool has_new_fatal_failure_; TestPartResultReporterInterface* original_reporter_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(HasNewFatalFailureHelper); + HasNewFatalFailureHelper(const HasNewFatalFailureHelper&) = delete; + HasNewFatalFailureHelper& operator=(const HasNewFatalFailureHelper&) = delete; }; } // namespace internal diff --git a/third_party/googletest/src/include/gtest/gtest-typed-test.h b/third_party/googletest/src/include/gtest/gtest-typed-test.h index 9fdc6be10d..bd35a32660 100644 --- a/third_party/googletest/src/include/gtest/gtest-typed-test.h +++ b/third_party/googletest/src/include/gtest/gtest-typed-test.h @@ -27,7 +27,9 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// GOOGLETEST_CM0001 DO NOT DELETE +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ @@ -190,7 +192,7 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes); typedef ::testing::internal::GenerateTypeList::type \ GTEST_TYPE_PARAMS_(CaseName); \ typedef ::testing::internal::NameGeneratorSelector<__VA_ARGS__>::type \ - GTEST_NAME_GENERATOR_(CaseName) + GTEST_NAME_GENERATOR_(CaseName) #define TYPED_TEST(CaseName, TestName) \ static_assert(sizeof(GTEST_STRINGIFY_(TestName)) > 1, \ @@ -256,7 +258,7 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes); // #included in multiple translation units linked together. #define TYPED_TEST_SUITE_P(SuiteName) \ static ::testing::internal::TypedTestSuitePState \ - GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName) + GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName) // Legacy API is deprecated but still available #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ @@ -301,21 +303,21 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes); REGISTER_TYPED_TEST_SUITE_P #endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ -#define INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, SuiteName, Types, ...) \ - static_assert(sizeof(GTEST_STRINGIFY_(Prefix)) > 1, \ - "test-suit-prefix must not be empty"); \ - static bool gtest_##Prefix##_##SuiteName GTEST_ATTRIBUTE_UNUSED_ = \ - ::testing::internal::TypeParameterizedTestSuite< \ - SuiteName, GTEST_SUITE_NAMESPACE_(SuiteName)::gtest_AllTests_, \ - ::testing::internal::GenerateTypeList::type>:: \ - Register(GTEST_STRINGIFY_(Prefix), \ - ::testing::internal::CodeLocation(__FILE__, __LINE__), \ - >EST_TYPED_TEST_SUITE_P_STATE_(SuiteName), \ - GTEST_STRINGIFY_(SuiteName), \ - GTEST_REGISTERED_TEST_NAMES_(SuiteName), \ - ::testing::internal::GenerateNames< \ - ::testing::internal::NameGeneratorSelector< \ - __VA_ARGS__>::type, \ +#define INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, SuiteName, Types, ...) 
+  static_assert(sizeof(GTEST_STRINGIFY_(Prefix)) > 1,                      \
+                "test-suit-prefix must not be empty");                     \
+  static bool gtest_##Prefix##_##SuiteName GTEST_ATTRIBUTE_UNUSED_ =       \
+      ::testing::internal::TypeParameterizedTestSuite<                     \
+          SuiteName, GTEST_SUITE_NAMESPACE_(SuiteName)::gtest_AllTests_,   \
+          ::testing::internal::GenerateTypeList<Types>::type>::            \
+          Register(GTEST_STRINGIFY_(Prefix),                               \
+                   ::testing::internal::CodeLocation(__FILE__, __LINE__),  \
+                   &GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName),            \
+                   GTEST_STRINGIFY_(SuiteName),                            \
+                   GTEST_REGISTERED_TEST_NAMES_(SuiteName),                \
+                   ::testing::internal::GenerateNames<                     \
+                       ::testing::internal::NameGeneratorSelector<         \
+                           __VA_ARGS__>::type,                             \
       ::testing::internal::GenerateTypeList<Types>::type>())

 // Legacy API is deprecated but still available
diff --git a/third_party/googletest/src/include/gtest/gtest.h b/third_party/googletest/src/include/gtest/gtest.h
index 7a5d057c4a..d19a587a18 100644
--- a/third_party/googletest/src/include/gtest/gtest.h
+++ b/third_party/googletest/src/include/gtest/gtest.h
@@ -27,7 +27,6 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-//
 // The Google C++ Testing and Mocking Framework (Google Test)
 //
 // This header file defines the public API for Google Test. It should be
@@ -47,8 +46,6 @@
 // registration from Barthelemy Dagenais' (barthelemy@prologique.com)
 // easyUnit framework.

-// GOOGLETEST_CM0001 DO NOT DELETE
-
 #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_H_
 #define GOOGLETEST_INCLUDE_GTEST_GTEST_H_

@@ -59,31 +56,22 @@
 #include <type_traits>
 #include <vector>

-#include "gtest/internal/gtest-internal.h"
-#include "gtest/internal/gtest-string.h"
+#include "gtest/gtest-assertion-result.h"
 #include "gtest/gtest-death-test.h"
 #include "gtest/gtest-matchers.h"
 #include "gtest/gtest-message.h"
 #include "gtest/gtest-param-test.h"
 #include "gtest/gtest-printers.h"
-#include "gtest/gtest_prod.h"
 #include "gtest/gtest-test-part.h"
 #include "gtest/gtest-typed-test.h"
+#include "gtest/gtest_pred_impl.h"
+#include "gtest/gtest_prod.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-string.h"

 GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
 /* class A needs to have dll-interface to be used by clients of class B */)

-namespace testing {
-
-// Silence C4100 (unreferenced formal parameter) and 4805
-// unsafe mix of type 'const int' and type 'const bool'
-#ifdef _MSC_VER
-# pragma warning(push)
-# pragma warning(disable:4805)
-# pragma warning(disable:4100)
-#endif
-
-
 // Declares the flags.

 // This flag temporary enables the disabled tests.
@@ -138,6 +126,12 @@ GTEST_DECLARE_int32_(random_seed);
 // is 1. If the value is -1 the tests are repeating forever.
 GTEST_DECLARE_int32_(repeat);

+// This flag controls whether Google Test Environments are recreated for each
+// repeat of the tests. The default value is true. If set to false the global
+// test Environment objects are only set up once, for the first iteration, and
+// only torn down once, for the last.
+GTEST_DECLARE_bool_(recreate_environments_when_repeating);
+
 // This flag controls whether Google Test includes Google Test internal
 // stack frames in failure stack traces.
GTEST_DECLARE_bool_(show_internal_stack_frames); @@ -163,6 +157,16 @@ GTEST_DECLARE_string_(stream_result_to); GTEST_DECLARE_string_(flagfile); #endif // GTEST_USE_OWN_FLAGFILE_FLAG_ +namespace testing { + +// Silence C4100 (unreferenced formal parameter) and 4805 +// unsafe mix of type 'const int' and type 'const bool' +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4805) +#pragma warning(disable : 4100) +#endif + // The upper limit for valid stack trace depths. const int kMaxStackTraceDepth = 100; @@ -201,193 +205,6 @@ using TestCase = TestSuite; class TestInfo; class UnitTest; -// A class for indicating whether an assertion was successful. When -// the assertion wasn't successful, the AssertionResult object -// remembers a non-empty message that describes how it failed. -// -// To create an instance of this class, use one of the factory functions -// (AssertionSuccess() and AssertionFailure()). -// -// This class is useful for two purposes: -// 1. Defining predicate functions to be used with Boolean test assertions -// EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts -// 2. Defining predicate-format functions to be -// used with predicate assertions (ASSERT_PRED_FORMAT*, etc). -// -// For example, if you define IsEven predicate: -// -// testing::AssertionResult IsEven(int n) { -// if ((n % 2) == 0) -// return testing::AssertionSuccess(); -// else -// return testing::AssertionFailure() << n << " is odd"; -// } -// -// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5))) -// will print the message -// -// Value of: IsEven(Fib(5)) -// Actual: false (5 is odd) -// Expected: true -// -// instead of a more opaque -// -// Value of: IsEven(Fib(5)) -// Actual: false -// Expected: true -// -// in case IsEven is a simple Boolean predicate. -// -// If you expect your predicate to be reused and want to support informative -// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up -// about half as often as positive ones in our tests), supply messages for -// both success and failure cases: -// -// testing::AssertionResult IsEven(int n) { -// if ((n % 2) == 0) -// return testing::AssertionSuccess() << n << " is even"; -// else -// return testing::AssertionFailure() << n << " is odd"; -// } -// -// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print -// -// Value of: IsEven(Fib(6)) -// Actual: true (8 is even) -// Expected: false -// -// NB: Predicates that support negative Boolean assertions have reduced -// performance in positive ones so be careful not to use them in tests -// that have lots (tens of thousands) of positive Boolean assertions. -// -// To use this class with EXPECT_PRED_FORMAT assertions such as: -// -// // Verifies that Foo() returns an even number. -// EXPECT_PRED_FORMAT1(IsEven, Foo()); -// -// you need to define: -// -// testing::AssertionResult IsEven(const char* expr, int n) { -// if ((n % 2) == 0) -// return testing::AssertionSuccess(); -// else -// return testing::AssertionFailure() -// << "Expected: " << expr << " is even\n Actual: it's " << n; -// } -// -// If Foo() returns 5, you will see the following message: -// -// Expected: Foo() is even -// Actual: it's 5 -// -class GTEST_API_ AssertionResult { - public: - // Copy constructor. - // Used in EXPECT_TRUE/FALSE(assertion_result). - AssertionResult(const AssertionResult& other); - -// C4800 is a level 3 warning in Visual Studio 2015 and earlier. -// This warning is not emitted in Visual Studio 2017. 
-// This warning is off by default starting in Visual Studio 2019 but can be -// enabled with command-line options. -#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920) - GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */) -#endif - - // Used in the EXPECT_TRUE/FALSE(bool_expression). - // - // T must be contextually convertible to bool. - // - // The second parameter prevents this overload from being considered if - // the argument is implicitly convertible to AssertionResult. In that case - // we want AssertionResult's copy constructor to be used. - template - explicit AssertionResult( - const T& success, - typename std::enable_if< - !std::is_convertible::value>::type* - /*enabler*/ - = nullptr) - : success_(success) {} - -#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920) - GTEST_DISABLE_MSC_WARNINGS_POP_() -#endif - - // Assignment operator. - AssertionResult& operator=(AssertionResult other) { - swap(other); - return *this; - } - - // Returns true if and only if the assertion succeeded. - operator bool() const { return success_; } // NOLINT - - // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. - AssertionResult operator!() const; - - // Returns the text streamed into this AssertionResult. Test assertions - // use it when they fail (i.e., the predicate's outcome doesn't match the - // assertion's expectation). When nothing has been streamed into the - // object, returns an empty string. - const char* message() const { - return message_.get() != nullptr ? message_->c_str() : ""; - } - // Deprecated; please use message() instead. - const char* failure_message() const { return message(); } - - // Streams a custom failure message into this object. - template AssertionResult& operator<<(const T& value) { - AppendMessage(Message() << value); - return *this; - } - - // Allows streaming basic output manipulators such as endl or flush into - // this object. - AssertionResult& operator<<( - ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) { - AppendMessage(Message() << basic_manipulator); - return *this; - } - - private: - // Appends the contents of message to message_. - void AppendMessage(const Message& a_message) { - if (message_.get() == nullptr) message_.reset(new ::std::string); - message_->append(a_message.GetString().c_str()); - } - - // Swap the contents of this AssertionResult with other. - void swap(AssertionResult& other); - - // Stores result of the assertion predicate. - bool success_; - // Stores the message describing the condition in case the expectation - // construct is not satisfied with the predicate's outcome. - // Referenced via a pointer to avoid taking too much stack frame space - // with test assertions. - std::unique_ptr< ::std::string> message_; -}; - -// Makes a successful assertion result. -GTEST_API_ AssertionResult AssertionSuccess(); - -// Makes a failed assertion result. -GTEST_API_ AssertionResult AssertionFailure(); - -// Makes a failed assertion result with the given failure message. -// Deprecated; use AssertionFailure() << msg. -GTEST_API_ AssertionResult AssertionFailure(const Message& msg); - -} // namespace testing - -// Includes the auto-generated header that implements a family of generic -// predicate assertion macros. This include comes late because it relies on -// APIs declared above. -#include "gtest/gtest_pred_impl.h" - -namespace testing { - // The abstract class that all tests inherit from. 
// // In Google Test, a unit test program contains one or many TestSuites, and @@ -522,7 +339,8 @@ class GTEST_API_ Test { virtual Setup_should_be_spelled_SetUp* Setup() { return nullptr; } // We disallow copying Tests. - GTEST_DISALLOW_COPY_AND_ASSIGN_(Test); + Test(const Test&) = delete; + Test& operator=(const Test&) = delete; }; typedef internal::TimeInMillis TimeInMillis; @@ -536,24 +354,17 @@ class TestProperty { // C'tor. TestProperty does NOT have a default constructor. // Always use this constructor (with parameters) to create a // TestProperty object. - TestProperty(const std::string& a_key, const std::string& a_value) : - key_(a_key), value_(a_value) { - } + TestProperty(const std::string& a_key, const std::string& a_value) + : key_(a_key), value_(a_value) {} // Gets the user supplied key. - const char* key() const { - return key_.c_str(); - } + const char* key() const { return key_.c_str(); } // Gets the user supplied value. - const char* value() const { - return value_.c_str(); - } + const char* value() const { return value_.c_str(); } // Sets a new value, overriding the one supplied in the constructor. - void SetValue(const std::string& new_value) { - value_ = new_value; - } + void SetValue(const std::string& new_value) { value_ = new_value; } private: // The key supplied by the user. @@ -687,7 +498,8 @@ class GTEST_API_ TestResult { TimeInMillis elapsed_time_; // We disallow copying TestResult. - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestResult); + TestResult(const TestResult&) = delete; + TestResult& operator=(const TestResult&) = delete; }; // class TestResult // A TestInfo object stores the following information about a test: @@ -811,8 +623,8 @@ class GTEST_API_ TestInfo { } // These fields are immutable properties of the test. - const std::string test_suite_name_; // test suite name - const std::string name_; // Test name + const std::string test_suite_name_; // test suite name + const std::string name_; // Test name // Name of the parameter type, or NULL if this is not a typed or a // type-parameterized test. const std::unique_ptr type_param_; @@ -833,7 +645,8 @@ class GTEST_API_ TestInfo { // test for the second time. TestResult result_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestInfo); + TestInfo(const TestInfo&) = delete; + TestInfo& operator=(const TestInfo&) = delete; }; // A test suite, which consists of a vector of TestInfos. @@ -941,7 +754,7 @@ class GTEST_API_ TestSuite { // Adds a TestInfo to this test suite. Will delete the TestInfo upon // destruction of the TestSuite object. - void AddTestInfo(TestInfo * test_info); + void AddTestInfo(TestInfo* test_info); // Clears the results of all tests in this test suite. void ClearResult(); @@ -1042,7 +855,8 @@ class GTEST_API_ TestSuite { TestResult ad_hoc_test_result_; // We disallow copying TestSuites. - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestSuite); + TestSuite(const TestSuite&) = delete; + TestSuite& operator=(const TestSuite&) = delete; }; // An Environment object is capable of setting up and tearing down an @@ -1069,6 +883,7 @@ class Environment { // Override this to define how to tear down the environment. virtual void TearDown() {} + private: // If you see an error about overriding the following function or // about it being private, you have mis-spelled SetUp() as Setup(). @@ -1120,6 +935,9 @@ class TestEventListener { // Fired before the test starts. 
virtual void OnTestStart(const TestInfo& test_info) = 0; + // Fired when a test is disabled + virtual void OnTestDisabled(const TestInfo& /*test_info*/) {} + // Fired after a failed assertion or a SUCCEED() invocation. // If you want to throw an exception from this function to skip to the next // TEST, it must be AssertionException defined above, or inherited from it. @@ -1143,8 +961,7 @@ class TestEventListener { virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) = 0; // Fired after each iteration of tests finishes. - virtual void OnTestIterationEnd(const UnitTest& unit_test, - int iteration) = 0; + virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration) = 0; // Fired after all test activities have ended. virtual void OnTestProgramEnd(const UnitTest& unit_test) = 0; @@ -1169,6 +986,7 @@ class EmptyTestEventListener : public TestEventListener { #endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ void OnTestStart(const TestInfo& /*test_info*/) override {} + void OnTestDisabled(const TestInfo& /*test_info*/) override {} void OnTestPartResult(const TestPartResult& /*test_part_result*/) override {} void OnTestEnd(const TestInfo& /*test_info*/) override {} void OnTestSuiteEnd(const TestSuite& /*test_suite*/) override {} @@ -1258,7 +1076,8 @@ class GTEST_API_ TestEventListeners { TestEventListener* default_xml_generator_; // We disallow copying TestEventListeners. - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventListeners); + TestEventListeners(const TestEventListeners&) = delete; + TestEventListeners& operator=(const TestEventListeners&) = delete; }; // A UnitTest consists of a vector of TestSuites. @@ -1301,8 +1120,7 @@ class GTEST_API_ UnitTest { // Returns the TestInfo object for the test that's currently running, // or NULL if no test is running. - const TestInfo* current_test_info() const - GTEST_LOCK_EXCLUDED_(mutex_); + const TestInfo* current_test_info() const GTEST_LOCK_EXCLUDED_(mutex_); // Returns the random seed used at the start of the current test run. int random_seed() const; @@ -1408,8 +1226,7 @@ class GTEST_API_ UnitTest { // eventually call this to report their results. The user code // should use the assertion macros instead of calling this directly. void AddTestPartResult(TestPartResult::Type result_type, - const char* file_name, - int line_number, + const char* file_name, int line_number, const std::string& message, const std::string& os_stack_trace) GTEST_LOCK_EXCLUDED_(mutex_); @@ -1440,8 +1257,7 @@ class GTEST_API_ UnitTest { friend std::set* internal::GetIgnoredParameterizedTestSuites(); friend internal::UnitTestImpl* internal::GetUnitTestImpl(); friend void internal::ReportFailureInUnknownLocation( - TestPartResult::Type result_type, - const std::string& message); + TestPartResult::Type result_type, const std::string& message); // Creates an empty UnitTest. UnitTest(); @@ -1455,8 +1271,7 @@ class GTEST_API_ UnitTest { GTEST_LOCK_EXCLUDED_(mutex_); // Pops a trace from the per-thread Google Test trace stack. - void PopGTestTrace() - GTEST_LOCK_EXCLUDED_(mutex_); + void PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_); // Protects mutable state in *impl_. This is mutable as some const // methods need to lock it too. @@ -1469,7 +1284,8 @@ class GTEST_API_ UnitTest { internal::UnitTestImpl* impl_; // We disallow copying UnitTest. 
- GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTest); + UnitTest(const UnitTest&) = delete; + UnitTest& operator=(const UnitTest&) = delete; }; // A convenient wrapper for adding an environment for the test @@ -1520,13 +1336,11 @@ namespace internal { // when calling EXPECT_* in a tight loop. template AssertionResult CmpHelperEQFailure(const char* lhs_expression, - const char* rhs_expression, - const T1& lhs, const T2& rhs) { - return EqFailure(lhs_expression, - rhs_expression, + const char* rhs_expression, const T1& lhs, + const T2& rhs) { + return EqFailure(lhs_expression, rhs_expression, FormatForComparisonFailureMessage(lhs, rhs), - FormatForComparisonFailureMessage(rhs, lhs), - false); + FormatForComparisonFailureMessage(rhs, lhs), false); } // This block of code defines operator==/!= @@ -1539,8 +1353,7 @@ inline bool operator!=(faketype, faketype) { return false; } // The helper function for {ASSERT|EXPECT}_EQ. template AssertionResult CmpHelperEQ(const char* lhs_expression, - const char* rhs_expression, - const T1& lhs, + const char* rhs_expression, const T1& lhs, const T2& rhs) { if (lhs == rhs) { return AssertionSuccess(); @@ -1571,8 +1384,7 @@ class EqHelper { // Even though its body looks the same as the above version, we // cannot merge the two, as it will make anonymous enums unhappy. static AssertionResult Compare(const char* lhs_expression, - const char* rhs_expression, - BiggestInt lhs, + const char* rhs_expression, BiggestInt lhs, BiggestInt rhs) { return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs); } @@ -1607,16 +1419,16 @@ AssertionResult CmpHelperOpFailure(const char* expr1, const char* expr2, // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -#define GTEST_IMPL_CMP_HELPER_(op_name, op)\ -template \ -AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \ - const T1& val1, const T2& val2) {\ - if (val1 op val2) {\ - return AssertionSuccess();\ - } else {\ - return CmpHelperOpFailure(expr1, expr2, val1, val2, #op);\ - }\ -} +#define GTEST_IMPL_CMP_HELPER_(op_name, op) \ + template \ + AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \ + const T1& val1, const T2& val2) { \ + if (val1 op val2) { \ + return AssertionSuccess(); \ + } else { \ + return CmpHelperOpFailure(expr1, expr2, val1, val2, #op); \ + } \ + } // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. @@ -1638,49 +1450,42 @@ GTEST_IMPL_CMP_HELPER_(GT, >) // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression, const char* s2_expression, - const char* s1, - const char* s2); + const char* s1, const char* s2); // The helper function for {ASSERT|EXPECT}_STRCASEEQ. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* s1_expression, const char* s2_expression, - const char* s1, - const char* s2); + const char* s1, const char* s2); // The helper function for {ASSERT|EXPECT}_STRNE. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression, const char* s2_expression, - const char* s1, - const char* s2); + const char* s1, const char* s2); // The helper function for {ASSERT|EXPECT}_STRCASENE. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. 
GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char* s1_expression, const char* s2_expression, - const char* s1, - const char* s2); - + const char* s1, const char* s2); // Helper function for *_STREQ on wide strings. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression, const char* s2_expression, - const wchar_t* s1, - const wchar_t* s2); + const wchar_t* s1, const wchar_t* s2); // Helper function for *_STRNE on wide strings. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression, const char* s2_expression, - const wchar_t* s1, - const wchar_t* s2); + const wchar_t* s1, const wchar_t* s2); } // namespace internal @@ -1692,32 +1497,40 @@ GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression, // // The {needle,haystack}_expr arguments are the stringified // expressions that generated the two real arguments. -GTEST_API_ AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const char* needle, const char* haystack); -GTEST_API_ AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const wchar_t* needle, const wchar_t* haystack); -GTEST_API_ AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const char* needle, const char* haystack); -GTEST_API_ AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const wchar_t* needle, const wchar_t* haystack); -GTEST_API_ AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::string& needle, const ::std::string& haystack); -GTEST_API_ AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::string& needle, const ::std::string& haystack); +GTEST_API_ AssertionResult IsSubstring(const char* needle_expr, + const char* haystack_expr, + const char* needle, + const char* haystack); +GTEST_API_ AssertionResult IsSubstring(const char* needle_expr, + const char* haystack_expr, + const wchar_t* needle, + const wchar_t* haystack); +GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const char* needle, + const char* haystack); +GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const wchar_t* needle, + const wchar_t* haystack); +GTEST_API_ AssertionResult IsSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::string& needle, + const ::std::string& haystack); +GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::string& needle, + const ::std::string& haystack); #if GTEST_HAS_STD_WSTRING -GTEST_API_ AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::wstring& needle, const ::std::wstring& haystack); -GTEST_API_ AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::wstring& needle, const ::std::wstring& haystack); +GTEST_API_ AssertionResult IsSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::wstring& needle, + const ::std::wstring& haystack); +GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::wstring& needle, + const ::std::wstring& haystack); #endif // GTEST_HAS_STD_WSTRING namespace internal { @@ -1732,8 +1545,7 @@ namespace 
internal {

 template <typename RawType>
 AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
                                          const char* rhs_expression,
-                                         RawType lhs_value,
-                                         RawType rhs_value) {
+                                         RawType lhs_value, RawType rhs_value) {
   const FloatingPoint<RawType> lhs(lhs_value), rhs(rhs_value);

   if (lhs.AlmostEquals(rhs)) {
@@ -1748,10 +1560,8 @@ AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
   rhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
          << rhs_value;

-  return EqFailure(lhs_expression,
-                   rhs_expression,
-                   StringStreamToString(&lhs_ss),
-                   StringStreamToString(&rhs_ss),
+  return EqFailure(lhs_expression, rhs_expression,
+                   StringStreamToString(&lhs_ss), StringStreamToString(&rhs_ss),
                    false);
 }

@@ -1761,8 +1571,7 @@ AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
 GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1,
                                                 const char* expr2,
                                                 const char* abs_error_expr,
-                                                double val1,
-                                                double val2,
+                                                double val1, double val2,
                                                 double abs_error);

 // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
@@ -1770,9 +1579,7 @@ class GTEST_API_ AssertHelper {
  public:
   // Constructor.
-  AssertHelper(TestPartResult::Type type,
-               const char* file,
-               int line,
+  AssertHelper(TestPartResult::Type type, const char* file, int line,
                const char* message);
   ~AssertHelper();

@@ -1786,11 +1593,9 @@ class GTEST_API_ AssertHelper {
   // re-using stack space even for temporary variables, so every EXPECT_EQ
   // reserves stack space for another AssertHelper.
   struct AssertHelperData {
-    AssertHelperData(TestPartResult::Type t,
-                     const char* srcfile,
-                     int line_num,
+    AssertHelperData(TestPartResult::Type t, const char* srcfile, int line_num,
                      const char* msg)
-        : type(t), file(srcfile), line(line_num), message(msg) { }
+        : type(t), file(srcfile), line(line_num), message(msg) {}

     TestPartResult::Type const type;
     const char* const file;
@@ -1798,12 +1603,14 @@ class GTEST_API_ AssertHelper {
     std::string const message;

    private:
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelperData);
+    AssertHelperData(const AssertHelperData&) = delete;
+    AssertHelperData& operator=(const AssertHelperData&) = delete;
   };

   AssertHelperData* const data_;

-  GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelper);
+  AssertHelper(const AssertHelper&) = delete;
+  AssertHelper& operator=(const AssertHelper&) = delete;
 };

 }  // namespace internal

@@ -1860,15 +1667,14 @@ class WithParamInterface {
  private:
   // Sets parameter value. The caller is responsible for making sure the value
   // remains alive and unchanged throughout the current test.
-  static void SetParam(const ParamType* parameter) {
-    parameter_ = parameter;
-  }
+  static void SetParam(const ParamType* parameter) { parameter_ = parameter; }

   // Static value used for accessing parameter during a test lifetime.
   static const ParamType* parameter_;

   // TestClass must be a subclass of WithParamInterface<T> and Test.
-  template <class TestClass> friend class internal::ParameterizedTestFactory;
+  template <class TestClass>
+  friend class internal::ParameterizedTestFactory;
 };

 template <typename T>
@@ -1878,8 +1684,7 @@ const T* WithParamInterface<T>::parameter_ = nullptr;

 // WithParamInterface, and can just inherit from ::testing::TestWithParam.

 template <typename T>
-class TestWithParam : public Test, public WithParamInterface<T> {
-};
+class TestWithParam : public Test, public WithParamInterface<T> {};

 // Macros for indicating success/failure in test code.
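The TestWithParam/WithParamInterface pair touched above is the base of googletest's value-parameterized tests: a fixture derives from TestWithParam<T>, each TEST_P body reads the current value through GetParam(), and INSTANTIATE_TEST_SUITE_P binds a value generator to the suite. A minimal sketch with illustrative names and values:

    #include "gtest/gtest.h"

    class QpTest : public ::testing::TestWithParam<int> {};

    // The TEST_P body runs once per value supplied by the instantiation.
    TEST_P(QpTest, IsInValidRange) {
      const int qp = GetParam();  // current parameter value
      EXPECT_GE(qp, 0);
      EXPECT_LE(qp, 63);
    }

    // Registers QpTest with each value produced by the generator.
    INSTANTIATE_TEST_SUITE_P(CommonQps, QpTest, ::testing::Values(0, 32, 63));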
@@ -1910,7 +1715,7 @@ class TestWithParam : public Test, public WithParamInterface { // Generates a nonfatal failure at the given source file location with // a generic message. -#define ADD_FAILURE_AT(file, line) \ +#define ADD_FAILURE_AT(file, line) \ GTEST_MESSAGE_AT_(file, line, "Failed", \ ::testing::TestPartResult::kNonFatalFailure) @@ -1925,7 +1730,7 @@ class TestWithParam : public Test, public WithParamInterface { // Define this macro to 1 to omit the definition of FAIL(), which is a // generic name and clashes with some other libraries. #if !GTEST_DONT_DEFINE_FAIL -# define FAIL() GTEST_FAIL() +#define FAIL() GTEST_FAIL() #endif // Generates a success with a generic message. @@ -1934,7 +1739,7 @@ class TestWithParam : public Test, public WithParamInterface { // Define this macro to 1 to omit the definition of SUCCEED(), which // is a generic name and clashes with some other libraries. #if !GTEST_DONT_DEFINE_SUCCEED -# define SUCCEED() GTEST_SUCCEED() +#define SUCCEED() GTEST_SUCCEED() #endif // Macros for testing exceptions. @@ -1962,16 +1767,15 @@ class TestWithParam : public Test, public WithParamInterface { // Boolean assertions. Condition can be either a Boolean expression or an // AssertionResult. For more information on how to use AssertionResult with // these macros see comments on that class. -#define GTEST_EXPECT_TRUE(condition) \ +#define GTEST_EXPECT_TRUE(condition) \ GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \ GTEST_NONFATAL_FAILURE_) -#define GTEST_EXPECT_FALSE(condition) \ +#define GTEST_EXPECT_FALSE(condition) \ GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \ GTEST_NONFATAL_FAILURE_) #define GTEST_ASSERT_TRUE(condition) \ - GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \ - GTEST_FATAL_FAILURE_) -#define GTEST_ASSERT_FALSE(condition) \ + GTEST_TEST_BOOLEAN_(condition, #condition, false, true, GTEST_FATAL_FAILURE_) +#define GTEST_ASSERT_FALSE(condition) \ GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \ GTEST_FATAL_FAILURE_) @@ -2070,27 +1874,27 @@ class TestWithParam : public Test, public WithParamInterface { // ASSERT_XY(), which clashes with some users' own code. #if !GTEST_DONT_DEFINE_ASSERT_EQ -# define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2) +#define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2) #endif #if !GTEST_DONT_DEFINE_ASSERT_NE -# define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2) +#define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2) #endif #if !GTEST_DONT_DEFINE_ASSERT_LE -# define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2) +#define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2) #endif #if !GTEST_DONT_DEFINE_ASSERT_LT -# define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2) +#define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2) #endif #if !GTEST_DONT_DEFINE_ASSERT_GE -# define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2) +#define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2) #endif #if !GTEST_DONT_DEFINE_ASSERT_GT -# define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2) +#define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2) #endif // C-string Comparisons. 
All tests treat NULL and any non-NULL string @@ -2115,7 +1919,7 @@ class TestWithParam : public Test, public WithParamInterface { EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2) #define EXPECT_STRCASEEQ(s1, s2) \ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2) -#define EXPECT_STRCASENE(s1, s2)\ +#define EXPECT_STRCASENE(s1, s2) \ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2) #define ASSERT_STREQ(s1, s2) \ @@ -2124,7 +1928,7 @@ class TestWithParam : public Test, public WithParamInterface { ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2) #define ASSERT_STRCASEEQ(s1, s2) \ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2) -#define ASSERT_STRCASENE(s1, s2)\ +#define ASSERT_STRCASENE(s1, s2) \ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2) // Macros for comparing floating-point numbers. @@ -2141,29 +1945,29 @@ class TestWithParam : public Test, public WithParamInterface { // FloatingPoint template class in gtest-internal.h if you are // interested in the implementation details. -#define EXPECT_FLOAT_EQ(val1, val2)\ +#define EXPECT_FLOAT_EQ(val1, val2) \ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ val1, val2) -#define EXPECT_DOUBLE_EQ(val1, val2)\ +#define EXPECT_DOUBLE_EQ(val1, val2) \ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ val1, val2) -#define ASSERT_FLOAT_EQ(val1, val2)\ +#define ASSERT_FLOAT_EQ(val1, val2) \ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ val1, val2) -#define ASSERT_DOUBLE_EQ(val1, val2)\ +#define ASSERT_DOUBLE_EQ(val1, val2) \ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ val1, val2) -#define EXPECT_NEAR(val1, val2, abs_error)\ - EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \ - val1, val2, abs_error) +#define EXPECT_NEAR(val1, val2, abs_error) \ + EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \ + abs_error) -#define ASSERT_NEAR(val1, val2, abs_error)\ - ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \ - val1, val2, abs_error) +#define ASSERT_NEAR(val1, val2, abs_error) \ + ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \ + abs_error) // These predicate format functions work on floating-point values, and // can be used in {ASSERT|EXPECT}_PRED_FORMAT2*(), e.g. @@ -2177,7 +1981,6 @@ GTEST_API_ AssertionResult FloatLE(const char* expr1, const char* expr2, GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2, double val1, double val2); - #if GTEST_OS_WINDOWS // Macros that test for HRESULT failure and success, these are only useful @@ -2189,17 +1992,17 @@ GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2, // expected result and the actual result with both a human-readable // string representation of the error, if available, as well as the // hex result code. 
-# define EXPECT_HRESULT_SUCCEEDED(expr) \ - EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr)) +#define EXPECT_HRESULT_SUCCEEDED(expr) \ + EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr)) -# define ASSERT_HRESULT_SUCCEEDED(expr) \ - ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr)) +#define ASSERT_HRESULT_SUCCEEDED(expr) \ + ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr)) -# define EXPECT_HRESULT_FAILED(expr) \ - EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr)) +#define EXPECT_HRESULT_FAILED(expr) \ + EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr)) -# define ASSERT_HRESULT_FAILED(expr) \ - ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr)) +#define ASSERT_HRESULT_FAILED(expr) \ + ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr)) #endif // GTEST_OS_WINDOWS @@ -2214,9 +2017,9 @@ GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2, // ASSERT_NO_FATAL_FAILURE(Process()) << "Process() failed"; // #define ASSERT_NO_FATAL_FAILURE(statement) \ - GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_) + GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_) #define EXPECT_NO_FATAL_FAILURE(statement) \ - GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_) + GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_) // Causes a trace (including the given source file path and line number, // and the given message) to be included in every test failure message generated @@ -2258,7 +2061,8 @@ class GTEST_API_ ScopedTrace { private: void PushTrace(const char* file, int line, std::string message); - GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace); + ScopedTrace(const ScopedTrace&) = delete; + ScopedTrace& operator=(const ScopedTrace&) = delete; } GTEST_ATTRIBUTE_UNUSED_; // A ScopedTrace object does its job in its // c'tor and d'tor. Therefore it doesn't // need to be used otherwise. @@ -2278,9 +2082,9 @@ class GTEST_API_ ScopedTrace { // Assuming that each thread maintains its own stack of traces. // Therefore, a SCOPED_TRACE() would (correctly) only affect the // assertions in its own thread. -#define SCOPED_TRACE(message) \ - ::testing::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)(\ - __FILE__, __LINE__, (message)) +#define SCOPED_TRACE(message) \ + ::testing::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)( \ + __FILE__, __LINE__, (message)) // Compile-time assertion for type equality. // StaticAssertTypeEq() compiles if and only if type1 and type2 @@ -2378,20 +2182,19 @@ constexpr bool StaticAssertTypeEq() noexcept { // EXPECT_EQ(a_.size(), 0); // EXPECT_EQ(b_.size(), 1); // } -// -// GOOGLETEST_CM0011 DO NOT DELETE -#if !GTEST_DONT_DEFINE_TEST -#define TEST_F(test_fixture, test_name)\ +#define GTEST_TEST_F(test_fixture, test_name) \ GTEST_TEST_(test_fixture, test_name, test_fixture, \ ::testing::internal::GetTypeId()) -#endif // !GTEST_DONT_DEFINE_TEST +#if !GTEST_DONT_DEFINE_TEST_F +#define TEST_F(test_fixture, test_name) GTEST_TEST_F(test_fixture, test_name) +#endif // Returns a path to temporary directory. // Tries to determine an appropriate directory for the platform. GTEST_API_ std::string TempDir(); #ifdef _MSC_VER -# pragma warning(pop) +#pragma warning(pop) #endif // Dynamically registers a test with the framework. @@ -2445,6 +2248,7 @@ GTEST_API_ std::string TempDir(); // } // ... 
 // int main(int argc, char** argv) {
+//   ::testing::InitGoogleTest(&argc, argv);
 //   std::vector<int> values_to_test = LoadValuesFromConfig();
 //   RegisterMyTests(values_to_test);
 //   ...
 //   return RUN_ALL_TESTS();
 // }
@@ -2486,9 +2290,7 @@ TestInfo* RegisterTest(const char* test_suite_name, const char* test_name,
 // namespace and has an all-caps name.
 int RUN_ALL_TESTS() GTEST_MUST_USE_RESULT_;

-inline int RUN_ALL_TESTS() {
-  return ::testing::UnitTest::GetInstance()->Run();
-}
+inline int RUN_ALL_TESTS() { return ::testing::UnitTest::GetInstance()->Run(); }

 GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
diff --git a/third_party/googletest/src/include/gtest/gtest_pred_impl.h b/third_party/googletest/src/include/gtest/gtest_pred_impl.h
index 5029a9bb02..47a24aa687 100644
--- a/third_party/googletest/src/include/gtest/gtest_pred_impl.h
+++ b/third_party/googletest/src/include/gtest/gtest_pred_impl.h
@@ -26,17 +26,19 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// This file is AUTOMATICALLY GENERATED on 01/02/2019 by command
-// 'gen_gtest_pred_impl.py 5'. DO NOT EDIT BY HAND!
 //
 // Implements a family of generic predicate assertion macros.
-// GOOGLETEST_CM0001 DO NOT DELETE
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*

 #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
 #define GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_

-#include "gtest/gtest.h"
+#include "gtest/gtest-assertion-result.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"

 namespace testing {

@@ -72,22 +74,18 @@ namespace testing {
 // GTEST_ASSERT_ is the basic statement to which all of the assertions
 // in this file reduce. Don't use this in your code.

-#define GTEST_ASSERT_(expression, on_failure) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+#define GTEST_ASSERT_(expression, on_failure)                      \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                    \
   if (const ::testing::AssertionResult gtest_ar = (expression))    \
-    ; \
-  else \
+    ;                                                              \
+  else                                                             \
     on_failure(gtest_ar.failure_message())

-
 // Helper function for implementing {EXPECT|ASSERT}_PRED1. Don't use
 // this in your code.
-template <typename Pred,
-          typename T1>
-AssertionResult AssertPred1Helper(const char* pred_text,
-                                  const char* e1,
-                                  Pred pred,
-                                  const T1& v1) {
+template <typename Pred, typename T1>
+AssertionResult AssertPred1Helper(const char* pred_text, const char* e1,
+                                  Pred pred, const T1& v1) {
   if (pred(v1)) return AssertionSuccess();

   return AssertionFailure()
@@ -98,40 +96,27 @@ AssertionResult AssertPred1Helper(const char* pred_text,

 // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1.
 // Don't use this in your code.
-#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure)\
-  GTEST_ASSERT_(pred_format(#v1, v1), \
-                on_failure)
+#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure) \
+  GTEST_ASSERT_(pred_format(#v1, v1), on_failure)

 // Internal macro for implementing {EXPECT|ASSERT}_PRED1. Don't use
 // this in your code.
-#define GTEST_PRED1_(pred, v1, on_failure)\
-  GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, \
-                                             #v1, \
-                                             pred, \
-                                             v1), on_failure)
+#define GTEST_PRED1_(pred, v1, on_failure) \
+  GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, #v1, pred, v1), on_failure)

 // Unary predicate assertion macros.
#define EXPECT_PRED_FORMAT1(pred_format, v1) \
   GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_)
-#define EXPECT_PRED1(pred, v1) \
-  GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_)
 #define ASSERT_PRED_FORMAT1(pred_format, v1) \
   GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_)
-#define ASSERT_PRED1(pred, v1) \
-  GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_)
-
-
+#define ASSERT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_)

 // Helper function for implementing {EXPECT|ASSERT}_PRED2. Don't use
 // this in your code.
-template <typename Pred,
-          typename T1,
-          typename T2>
-AssertionResult AssertPred2Helper(const char* pred_text,
-                                  const char* e1,
-                                  const char* e2,
-                                  Pred pred,
-                                  const T1& v1,
+template <typename Pred, typename T1, typename T2>
+AssertionResult AssertPred2Helper(const char* pred_text, const char* e1,
+                                  const char* e2, Pred pred, const T1& v1,
                                   const T2& v2) {
   if (pred(v1, v2)) return AssertionSuccess();
@@ -145,19 +130,14 @@ AssertionResult AssertPred2Helper(const char* pred_text,

 // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2.
 // Don't use this in your code.
-#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure)\
-  GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), \
-                on_failure)
+#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure) \
+  GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), on_failure)

 // Internal macro for implementing {EXPECT|ASSERT}_PRED2. Don't use
 // this in your code.
-#define GTEST_PRED2_(pred, v1, v2, on_failure)\
-  GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, \
-                                             #v1, \
-                                             #v2, \
-                                             pred, \
-                                             v1, \
-                                             v2), on_failure)
+#define GTEST_PRED2_(pred, v1, v2, on_failure)                               \
+  GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, #v1, #v2, pred, v1, v2), \
+                on_failure)

 // Binary predicate assertion macros.
 #define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \
@@ -169,22 +149,12 @@ AssertionResult AssertPred2Helper(const char* pred_text,
 #define ASSERT_PRED2(pred, v1, v2) \
   GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_)

-
-
 // Helper function for implementing {EXPECT|ASSERT}_PRED3. Don't use
 // this in your code.
-template <typename Pred,
-          typename T1,
-          typename T2,
-          typename T3>
-AssertionResult AssertPred3Helper(const char* pred_text,
-                                  const char* e1,
-                                  const char* e2,
-                                  const char* e3,
-                                  Pred pred,
-                                  const T1& v1,
-                                  const T2& v2,
-                                  const T3& v3) {
+template <typename Pred, typename T1, typename T2, typename T3>
+AssertionResult AssertPred3Helper(const char* pred_text, const char* e1,
+                                  const char* e2, const char* e3, Pred pred,
+                                  const T1& v1, const T2& v2, const T3& v3) {
   if (pred(v1, v2, v3)) return AssertionSuccess();

   return AssertionFailure()
@@ -198,21 +168,15 @@ AssertionResult AssertPred3Helper(const char* pred_text,

 // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3.
 // Don't use this in your code.
-#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure)\
-  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), \
-                on_failure)
+#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure) \
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), on_failure)

 // Internal macro for implementing {EXPECT|ASSERT}_PRED3. Don't use
 // this in your code.
-#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)\
-  GTEST_ASSERT_(::testing::AssertPred3Helper(#pred, \
-                                             #v1, \
-                                             #v2, \
-                                             #v3, \
-                                             pred, \
-                                             v1, \
-                                             v2, \
-                                             v3), on_failure)
+#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)                          \
+  GTEST_ASSERT_(                                                            \
+      ::testing::AssertPred3Helper(#pred, #v1, #v2, #v3, pred, v1, v2, v3), \
+      on_failure)

 // Ternary predicate assertion macros.
#define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \
@@ -224,25 +188,13 @@ AssertionResult AssertPred3Helper(const char* pred_text,
 #define ASSERT_PRED3(pred, v1, v2, v3) \
   GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_)

-
-
 // Helper function for implementing {EXPECT|ASSERT}_PRED4. Don't use
 // this in your code.
-template <typename Pred,
-          typename T1,
-          typename T2,
-          typename T3,
-          typename T4>
-AssertionResult AssertPred4Helper(const char* pred_text,
-                                  const char* e1,
-                                  const char* e2,
-                                  const char* e3,
-                                  const char* e4,
-                                  Pred pred,
-                                  const T1& v1,
-                                  const T2& v2,
-                                  const T3& v3,
-                                  const T4& v4) {
+template <typename Pred, typename T1, typename T2, typename T3, typename T4>
+AssertionResult AssertPred4Helper(const char* pred_text, const char* e1,
+                                  const char* e2, const char* e3,
+                                  const char* e4, Pred pred, const T1& v1,
+                                  const T2& v2, const T3& v3, const T4& v4) {
   if (pred(v1, v2, v3, v4)) return AssertionSuccess();

   return AssertionFailure()
@@ -257,23 +209,15 @@ AssertionResult AssertPred4Helper(const char* pred_text,

 // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4.
 // Don't use this in your code.
-#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure)\
-  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), \
-                on_failure)
+#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure) \
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), on_failure)

 // Internal macro for implementing {EXPECT|ASSERT}_PRED4. Don't use
 // this in your code.
-#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)\
-  GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, \
-                                             #v1, \
-                                             #v2, \
-                                             #v3, \
-                                             #v4, \
-                                             pred, \
-                                             v1, \
-                                             v2, \
-                                             v3, \
-                                             v4), on_failure)
+#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)                        \
+  GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, #v1, #v2, #v3, #v4, pred, \
+                                             v1, v2, v3, v4),                 \
+                on_failure)

 // 4-ary predicate assertion macros.
 #define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
@@ -285,28 +229,15 @@ AssertionResult AssertPred4Helper(const char* pred_text,
 #define ASSERT_PRED4(pred, v1, v2, v3, v4) \
   GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)

-
-
 // Helper function for implementing {EXPECT|ASSERT}_PRED5. Don't use
 // this in your code.
-template <typename Pred,
-          typename T1,
-          typename T2,
-          typename T3,
-          typename T4,
-          typename T5>
-AssertionResult AssertPred5Helper(const char* pred_text,
-                                  const char* e1,
-                                  const char* e2,
-                                  const char* e3,
-                                  const char* e4,
-                                  const char* e5,
-                                  Pred pred,
-                                  const T1& v1,
-                                  const T2& v2,
-                                  const T3& v3,
-                                  const T4& v4,
-                                  const T5& v5) {
+template <typename Pred, typename T1, typename T2, typename T3, typename T4,
+          typename T5>
+AssertionResult AssertPred5Helper(const char* pred_text, const char* e1,
+                                  const char* e2, const char* e3,
+                                  const char* e4, const char* e5, Pred pred,
+                                  const T1& v1, const T2& v2, const T3& v3,
+                                  const T4& v4, const T5& v5) {
   if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess();

   return AssertionFailure()
@@ -322,25 +253,16 @@ AssertionResult AssertPred5Helper(const char* pred_text,

 // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5.
 // Don't use this in your code.
-#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)\
+#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)  \
   GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \
                 on_failure)

 // Internal macro for implementing {EXPECT|ASSERT}_PRED5. Don't use
 // this in your code.
-#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, \ - #v1, \ - #v2, \ - #v3, \ - #v4, \ - #v5, \ - pred, \ - v1, \ - v2, \ - v3, \ - v4, \ - v5), on_failure) +#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure) \ + GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, #v1, #v2, #v3, #v4, #v5, \ + pred, v1, v2, v3, v4, v5), \ + on_failure) // 5-ary predicate assertion macros. #define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \ @@ -352,8 +274,6 @@ AssertionResult AssertPred5Helper(const char* pred_text, #define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \ GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_) - - } // namespace testing #endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ diff --git a/third_party/googletest/src/include/gtest/gtest_prod.h b/third_party/googletest/src/include/gtest/gtest_prod.h index 38b9d85a51..1f37dc31c3 100644 --- a/third_party/googletest/src/include/gtest/gtest_prod.h +++ b/third_party/googletest/src/include/gtest/gtest_prod.h @@ -27,9 +27,8 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Google C++ Testing and Mocking Framework definitions useful in production code. -// GOOGLETEST_CM0003 DO NOT DELETE +// Google C++ Testing and Mocking Framework definitions useful in production +// code. #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_ @@ -55,7 +54,7 @@ // Note: The test class must be in the same namespace as the class being tested. // For example, putting MyClassTest in an anonymous namespace will not work. -#define FRIEND_TEST(test_case_name, test_name)\ -friend class test_case_name##_##test_name##_Test +#define FRIEND_TEST(test_case_name, test_name) \ + friend class test_case_name##_##test_name##_Test #endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_ diff --git a/third_party/googletest/src/include/gtest/internal/custom/README.md b/third_party/googletest/src/include/gtest/internal/custom/README.md index ff391fb4e2..cb49e2c754 100644 --- a/third_party/googletest/src/include/gtest/internal/custom/README.md +++ b/third_party/googletest/src/include/gtest/internal/custom/README.md @@ -15,18 +15,6 @@ The custom directory is an injection point for custom user configurations. The following macros can be defined: -### Flag related macros: - -* `GTEST_FLAG(flag_name)` -* `GTEST_USE_OWN_FLAGFILE_FLAG_` - Define to 0 when the system provides its - own flagfile flag parsing. -* `GTEST_DECLARE_bool_(name)` -* `GTEST_DECLARE_int32_(name)` -* `GTEST_DECLARE_string_(name)` -* `GTEST_DEFINE_bool_(name, default_val, doc)` -* `GTEST_DEFINE_int32_(name, default_val, doc)` -* `GTEST_DEFINE_string_(name, default_val, doc)` - ### Logging: * `GTEST_LOG_(severity)` diff --git a/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h b/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h index db02881c0c..9b7fb4261a 100644 --- a/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h +++ b/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h @@ -34,4 +34,35 @@ #ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ #define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ +// Use a stub Notification class. +// +// The built-in Notification class in GoogleTest v1.12.1 uses std::mutex and +// std::condition_variable. 
The <mutex> and <condition_variable> headers of
+// mingw32 g++ (GNU 10.0.0) define std::mutex and std::condition_variable only
+// when configured with the posix threads option but don't define them when
+// configured with the win32 threads option. The Notification class is only
+// used in GoogleTest's internal tests. Since we don't build GoogleTest's
+// internal tests, we don't need a working Notification class. Although it's
+// not hard to fix the mingw32 g++ compilation errors by implementing the
+// Notification class using Windows CRITICAL_SECTION and CONDITION_VARIABLE,
+// it's simpler to just use a stub Notification class on all platforms.
+//
+// The default constructor of the stub class is deleted and the declaration of
+// the Notify() method is commented out, so that compilation will fail if any
+// code actually uses the Notification class.
+
+#define GTEST_HAS_NOTIFICATION_ 1
+namespace testing {
+namespace internal {
+class Notification {
+ public:
+  Notification() = delete;
+  Notification(const Notification&) = delete;
+  Notification& operator=(const Notification&) = delete;
+  // void Notify();
+  void WaitForNotification() {}
+};
+}  // namespace internal
+}  // namespace testing
+
 #endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
diff --git a/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h b/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h
index 490296dfad..45580ae805 100644
--- a/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h
+++ b/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h
@@ -26,27 +26,31 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
 // The Google C++ Testing and Mocking Framework (Google Test)
 //
 // This header file defines internal utilities needed for implementing
 // death tests. They are subject to change without notice.
-// GOOGLETEST_CM0001 DO NOT DELETE
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*

 #ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
 #define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_

+#include <stdio.h>
+
+#include <memory>
+
 #include "gtest/gtest-matchers.h"
 #include "gtest/internal/gtest-internal.h"

-#include <stdio.h>
-#include <memory>
+GTEST_DECLARE_string_(internal_run_death_test);

 namespace testing {
 namespace internal {

-GTEST_DECLARE_string_(internal_run_death_test);
-
 // Names of the flags (needed for parsing Google Test flags).
 const char kDeathTestStyleFlag[] = "death_test_style";
 const char kDeathTestUseFork[] = "death_test_use_fork";
@@ -83,16 +87,18 @@ class GTEST_API_ DeathTest {
   static bool Create(const char* statement,
                      Matcher<const std::string&> matcher, const char* file,
                      int line, DeathTest** test);
   DeathTest();
-  virtual ~DeathTest() { }
+  virtual ~DeathTest() {}

   // A helper class that aborts a death test when it's deleted.
class ReturnSentinel { public: - explicit ReturnSentinel(DeathTest* test) : test_(test) { } + explicit ReturnSentinel(DeathTest* test) : test_(test) {} ~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); } + private: DeathTest* const test_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ReturnSentinel); + ReturnSentinel(const ReturnSentinel&) = delete; + ReturnSentinel& operator=(const ReturnSentinel&) = delete; } GTEST_ATTRIBUTE_UNUSED_; // An enumeration of possible roles that may be taken when a death @@ -137,7 +143,8 @@ class GTEST_API_ DeathTest { // A string containing a description of the outcome of the last death test. static std::string last_death_test_message_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(DeathTest); + DeathTest(const DeathTest&) = delete; + DeathTest& operator=(const DeathTest&) = delete; }; GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 @@ -145,7 +152,7 @@ GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 // Factory interface for death tests. May be mocked out for testing. class DeathTestFactory { public: - virtual ~DeathTestFactory() { } + virtual ~DeathTestFactory() {} virtual bool Create(const char* statement, Matcher matcher, const char* file, int line, DeathTest** test) = 0; @@ -186,28 +193,28 @@ inline Matcher MakeDeathTestMatcher( // Traps C++ exceptions escaping statement and reports them as test // failures. Note that trapping SEH exceptions is not implemented here. -# if GTEST_HAS_EXCEPTIONS -# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ - try { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } catch (const ::std::exception& gtest_exception) { \ - fprintf(\ - stderr, \ - "\n%s: Caught std::exception-derived exception escaping the " \ - "death test statement. Exception message: %s\n", \ +#if GTEST_HAS_EXCEPTIONS +#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } catch (const ::std::exception& gtest_exception) { \ + fprintf( \ + stderr, \ + "\n%s: Caught std::exception-derived exception escaping the " \ + "death test statement. Exception message: %s\n", \ ::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \ - gtest_exception.what()); \ - fflush(stderr); \ + gtest_exception.what()); \ + fflush(stderr); \ death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \ - } catch (...) { \ + } catch (...) { \ death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \ } -# else -# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ +#else +#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) -# endif +#endif // This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*, // ASSERT_EXIT*, and EXPECT_EXIT*. @@ -236,8 +243,6 @@ inline Matcher MakeDeathTestMatcher( gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE); \ break; \ } \ - default: \ - break; \ } \ } \ } else \ @@ -265,16 +270,12 @@ inline Matcher MakeDeathTestMatcher( // RUN_ALL_TESTS was called. 
class InternalRunDeathTestFlag { public: - InternalRunDeathTestFlag(const std::string& a_file, - int a_line, - int an_index, + InternalRunDeathTestFlag(const std::string& a_file, int a_line, int an_index, int a_write_fd) - : file_(a_file), line_(a_line), index_(an_index), - write_fd_(a_write_fd) {} + : file_(a_file), line_(a_line), index_(an_index), write_fd_(a_write_fd) {} ~InternalRunDeathTestFlag() { - if (write_fd_ >= 0) - posix::Close(write_fd_); + if (write_fd_ >= 0) posix::Close(write_fd_); } const std::string& file() const { return file_; } @@ -288,7 +289,8 @@ class InternalRunDeathTestFlag { int index_; int write_fd_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(InternalRunDeathTestFlag); + InternalRunDeathTestFlag(const InternalRunDeathTestFlag&) = delete; + InternalRunDeathTestFlag& operator=(const InternalRunDeathTestFlag&) = delete; }; // Returns a newly created InternalRunDeathTestFlag object with fields diff --git a/third_party/googletest/src/include/gtest/internal/gtest-filepath.h b/third_party/googletest/src/include/gtest/internal/gtest-filepath.h index 0c033abc34..a2a60a962b 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-filepath.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-filepath.h @@ -26,7 +26,7 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// + // Google Test filepath utilities // // This header file declares classes and functions used internally by @@ -35,7 +35,9 @@ // This file is #included in gtest/internal/gtest-internal.h. // Do not include this header file separately! -// GOOGLETEST_CM0001 DO NOT DELETE +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ #define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ @@ -61,8 +63,8 @@ namespace internal { class GTEST_API_ FilePath { public: - FilePath() : pathname_("") { } - FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) { } + FilePath() : pathname_("") {} + FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) {} explicit FilePath(const std::string& pathname) : pathname_(pathname) { Normalize(); @@ -73,9 +75,7 @@ class GTEST_API_ FilePath { return *this; } - void Set(const FilePath& rhs) { - pathname_ = rhs.pathname_; - } + void Set(const FilePath& rhs) { pathname_ = rhs.pathname_; } const std::string& string() const { return pathname_; } const char* c_str() const { return pathname_.c_str(); } @@ -88,8 +88,7 @@ class GTEST_API_ FilePath { // than zero (e.g., 12), returns "dir/test_12.xml". // On Windows platform, uses \ as the separator rather than /. 
static FilePath MakeFileName(const FilePath& directory, - const FilePath& base_name, - int number, + const FilePath& base_name, int number, const char* extension); // Given directory = "dir", relative_path = "test.xml", diff --git a/third_party/googletest/src/include/gtest/internal/gtest-internal.h b/third_party/googletest/src/include/gtest/internal/gtest-internal.h index f8cbdbd81d..9b04e4c85f 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-internal.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-internal.h @@ -26,13 +26,15 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// + // The Google C++ Testing and Mocking Framework (Google Test) // // This header file declares functions and macros used internally by // Google Test. They are subject to change without notice. -// GOOGLETEST_CM0001 DO NOT DELETE +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ #define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ @@ -40,19 +42,20 @@ #include "gtest/internal/gtest-port.h" #if GTEST_OS_LINUX -# include -# include -# include -# include +#include +#include +#include +#include #endif // GTEST_OS_LINUX #if GTEST_HAS_EXCEPTIONS -# include +#include #endif #include #include #include + #include #include #include @@ -76,7 +79,7 @@ // the current line number. For more details, see // http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6 #define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar) -#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo ## bar +#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo##bar // Stringifies its argument. // Work around a bug in visual studio which doesn't accept code like this: @@ -98,21 +101,21 @@ namespace testing { // Forward declarations. -class AssertionResult; // Result of an assertion. -class Message; // Represents a failure message. -class Test; // Represents a test. -class TestInfo; // Information about a test. -class TestPartResult; // Result of a test part. -class UnitTest; // A collection of test suites. +class AssertionResult; // Result of an assertion. +class Message; // Represents a failure message. +class Test; // Represents a test. +class TestInfo; // Information about a test. +class TestPartResult; // Result of a test part. +class UnitTest; // A collection of test suites. template ::std::string PrintToString(const T& value); namespace internal { -struct TraceInfo; // Information about a trace point. -class TestInfoImpl; // Opaque implementation of TestInfo -class UnitTestImpl; // Opaque implementation of UnitTest +struct TraceInfo; // Information about a trace point. +class TestInfoImpl; // Opaque implementation of TestInfo +class UnitTestImpl; // Opaque implementation of UnitTest // The text used in failure messages to indicate the start of the // stack trace. @@ -121,6 +124,7 @@ GTEST_API_ extern const char kStackTraceMarker[]; // An IgnoredValue object can be implicitly constructed from ANY value. class IgnoredValue { struct Sink {}; + public: // This constructor template allows any value to be implicitly // converted to IgnoredValue. The object has no data member and @@ -136,13 +140,13 @@ class IgnoredValue { }; // Appends the user-supplied message to the Google-Test-generated message. 
-GTEST_API_ std::string AppendUserMessage( - const std::string& gtest_msg, const Message& user_msg); +GTEST_API_ std::string AppendUserMessage(const std::string& gtest_msg, + const Message& user_msg); #if GTEST_HAS_EXCEPTIONS -GTEST_DISABLE_MSC_WARNINGS_PUSH_(4275 \ -/* an exported class was derived from a class that was not exported */) +GTEST_DISABLE_MSC_WARNINGS_PUSH_( + 4275 /* an exported class was derived from a class that was not exported */) // This exception is thrown by (and only by) a failed Google Test // assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions @@ -181,14 +185,6 @@ GTEST_API_ std::string CreateUnifiedDiff(const std::vector& left, } // namespace edit_distance -// Calculate the diff between 'left' and 'right' and return it in unified diff -// format. -// If not null, stores in 'total_line_count' the total number of lines found -// in left + right. -GTEST_API_ std::string DiffStrings(const std::string& left, - const std::string& right, - size_t* total_line_count); - // Constructs and returns the message for an equality assertion // (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure. // @@ -212,10 +208,8 @@ GTEST_API_ AssertionResult EqFailure(const char* expected_expression, // Constructs a failure message for Boolean assertions such as EXPECT_TRUE. GTEST_API_ std::string GetBoolAssertionFailureMessage( - const AssertionResult& assertion_result, - const char* expression_text, - const char* actual_predicate_value, - const char* expected_predicate_value); + const AssertionResult& assertion_result, const char* expression_text, + const char* actual_predicate_value, const char* expected_predicate_value); // This template class represents an IEEE floating-point number // (either single-precision or double-precision, depending on the @@ -256,11 +250,11 @@ class FloatingPoint { // Constants. // # of bits in a number. - static const size_t kBitCount = 8*sizeof(RawType); + static const size_t kBitCount = 8 * sizeof(RawType); // # of fraction bits in a number. static const size_t kFractionBitCount = - std::numeric_limits::digits - 1; + std::numeric_limits::digits - 1; // # of exponent bits in a number. static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount; @@ -269,8 +263,8 @@ class FloatingPoint { static const Bits kSignBitMask = static_cast(1) << (kBitCount - 1); // The mask for the fraction bits. - static const Bits kFractionBitMask = - ~static_cast(0) >> (kExponentBitCount + 1); + static const Bits kFractionBitMask = ~static_cast(0) >> + (kExponentBitCount + 1); // The mask for the exponent bits. static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask); @@ -309,9 +303,7 @@ class FloatingPoint { } // Returns the floating-point number that represent positive infinity. - static RawType Infinity() { - return ReinterpretBits(kExponentBitMask); - } + static RawType Infinity() { return ReinterpretBits(kExponentBitMask); } // Returns the maximum representable finite floating-point number. static RawType Max(); @@ -319,7 +311,7 @@ class FloatingPoint { // Non-static methods // Returns the bits that represents this number. - const Bits &bits() const { return u_.bits_; } + const Bits& bits() const { return u_.bits_; } // Returns the exponent bits of this number. Bits exponent_bits() const { return kExponentBitMask & u_.bits_; } @@ -348,8 +340,8 @@ class FloatingPoint { // a NAN must return false. 
if (is_nan() || rhs.is_nan()) return false; - return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_) - <= kMaxUlps; + return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_) <= + kMaxUlps; } private: @@ -374,7 +366,7 @@ class FloatingPoint { // // Read http://en.wikipedia.org/wiki/Signed_number_representations // for more details on signed number representations. - static Bits SignAndMagnitudeToBiased(const Bits &sam) { + static Bits SignAndMagnitudeToBiased(const Bits& sam) { if (kSignBitMask & sam) { // sam represents a negative number. return ~sam + 1; @@ -386,8 +378,8 @@ class FloatingPoint { // Given two numbers in the sign-and-magnitude representation, // returns the distance between them as an unsigned number. - static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits &sam1, - const Bits &sam2) { + static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits& sam1, + const Bits& sam2) { const Bits biased1 = SignAndMagnitudeToBiased(sam1); const Bits biased2 = SignAndMagnitudeToBiased(sam2); return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1); @@ -399,9 +391,13 @@ class FloatingPoint { // We cannot use std::numeric_limits::max() as it clashes with the max() // macro defined by . template <> -inline float FloatingPoint::Max() { return FLT_MAX; } +inline float FloatingPoint::Max() { + return FLT_MAX; +} template <> -inline double FloatingPoint::Max() { return DBL_MAX; } +inline double FloatingPoint::Max() { + return DBL_MAX; +} // Typedefs the instances of the FloatingPoint template class that we // care to use. @@ -461,7 +457,8 @@ class TestFactoryBase { TestFactoryBase() {} private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestFactoryBase); + TestFactoryBase(const TestFactoryBase&) = delete; + TestFactoryBase& operator=(const TestFactoryBase&) = delete; }; // This class provides implementation of TestFactoryBase interface. @@ -510,11 +507,11 @@ inline SetUpTearDownSuiteFuncType GetNotDefaultOrNull( template // Note that SuiteApiResolver inherits from T because -// SetUpTestSuite()/TearDownTestSuite() could be protected. Ths way +// SetUpTestSuite()/TearDownTestSuite() could be protected. This way // SuiteApiResolver can access them. struct SuiteApiResolver : T { // testing::Test is only forward declared at this point. So we make it a - // dependend class for the compiler to be OK with it. + // dependent class for the compiler to be OK with it. using Test = typename std::conditional::type; @@ -654,7 +651,8 @@ inline const char* SkipComma(const char* str) { if (comma == nullptr) { return nullptr; } - while (IsSpace(*(++comma))) {} + while (IsSpace(*(++comma))) { + } return comma; } @@ -668,7 +666,7 @@ inline std::string GetPrefixUntilComma(const char* str) { // Splits a given string on a given delimiter, populating a given // vector with the fields. void SplitString(const ::std::string& str, char delimiter, - ::std::vector< ::std::string>* dest); + ::std::vector<::std::string>* dest); // The default argument to the template below for the case when the user does // not provide a name generator.
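The two private helpers above are the core of FloatingPoint::AlmostEquals(): SignAndMagnitudeToBiased() maps IEEE-754 bit patterns onto an offset number line where adjacent representable values sit exactly 1 apart, so the ULP distance reduces to an unsigned subtraction. A minimal standalone sketch of the same idea for 32-bit floats (an illustration only; BitsOf, Biased, and UlpDistance are hypothetical names, not googletest code):

#include <cmath>
#include <cstdint>
#include <cstring>

// Reinterpret a float's bits as an unsigned integer. FloatingPoint does this
// through a union; memcpy is the portable standalone equivalent.
uint32_t BitsOf(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return bits;
}

// Sign-and-magnitude -> biased, mirroring SignAndMagnitudeToBiased():
// negatives map below 1u << 31, non-negatives at or above it.
uint32_t Biased(uint32_t sam) {
  const uint32_t kSignBit = UINT32_C(1) << 31;
  return (kSignBit & sam) ? ~sam + 1 : kSignBit | sam;
}

// In the biased space, neighboring floats differ by exactly 1.
uint32_t UlpDistance(float a, float b) {
  const uint32_t x = Biased(BitsOf(a));
  const uint32_t y = Biased(BitsOf(b));
  return (x >= y) ? (x - y) : (y - x);
}

// Example: UlpDistance(1.0f, std::nextafterf(1.0f, 2.0f)) == 1, and
// AlmostEquals() accepts anything up to kMaxUlps (4). Note +0.0f and -0.0f
// map to the same biased value, so they compare as 0 ULPs apart.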
@@ -781,13 +779,13 @@ class TypeParameterizedTestSuite { const std::vector& type_names = GenerateNames()) { RegisterTypeParameterizedTestSuiteInstantiation(case_name); - std::string test_name = StripTrailingSpaces( - GetPrefixUntilComma(test_names)); + std::string test_name = + StripTrailingSpaces(GetPrefixUntilComma(test_names)); if (!state->TestExists(test_name)) { fprintf(stderr, "Failed to get code location for test %s.%s at %s.", case_name, test_name.c_str(), - FormatFileLocation(code_location.file.c_str(), - code_location.line).c_str()); + FormatFileLocation(code_location.file.c_str(), code_location.line) + .c_str()); fflush(stderr); posix::Abort(); } @@ -831,8 +829,8 @@ class TypeParameterizedTestSuite { // For example, if Foo() calls Bar(), which in turn calls // GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in // the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't. -GTEST_API_ std::string GetCurrentOsStackTraceExceptTop( - UnitTest* unit_test, int skip_count); +GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(UnitTest* unit_test, + int skip_count); // Helpers for suppressing warnings on unreachable code or constant // condition. @@ -881,7 +879,8 @@ class GTEST_API_ Random { private: uint32_t state_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(Random); + Random(const Random&) = delete; + Random& operator=(const Random&) = delete; }; // Turns const U&, U&, const U, and U all into U. @@ -954,7 +953,9 @@ IsContainer IsContainerTest(int /* dummy */) { typedef char IsNotContainer; template -IsNotContainer IsContainerTest(long /* dummy */) { return '\0'; } +IsNotContainer IsContainerTest(long /* dummy */) { + return '\0'; +} // Trait to detect whether a type T is a hash table. // The heuristic used is that the type contains an inner type `hasher` and does @@ -1017,11 +1018,13 @@ bool ArrayEq(const T* lhs, size_t size, const U* rhs); // This generic version is used when k is 0. template -inline bool ArrayEq(const T& lhs, const U& rhs) { return lhs == rhs; } +inline bool ArrayEq(const T& lhs, const U& rhs) { + return lhs == rhs; +} // This overload is used when k >= 1. template -inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) { +inline bool ArrayEq(const T (&lhs)[N], const U (&rhs)[N]) { return internal::ArrayEq(lhs, N, rhs); } @@ -1031,8 +1034,7 @@ inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) { template bool ArrayEq(const T* lhs, size_t size, const U* rhs) { for (size_t i = 0; i != size; i++) { - if (!internal::ArrayEq(lhs[i], rhs[i])) - return false; + if (!internal::ArrayEq(lhs[i], rhs[i])) return false; } return true; } @@ -1042,8 +1044,7 @@ bool ArrayEq(const T* lhs, size_t size, const U* rhs) { template Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) { for (Iter it = begin; it != end; ++it) { - if (internal::ArrayEq(*it, elem)) - return it; + if (internal::ArrayEq(*it, elem)) return it; } return end; } @@ -1057,11 +1058,13 @@ void CopyArray(const T* from, size_t size, U* to); // This generic version is used when k is 0. template -inline void CopyArray(const T& from, U* to) { *to = from; } +inline void CopyArray(const T& from, U* to) { + *to = from; +} // This overload is used when k >= 1. 
template -inline void CopyArray(const T(&from)[N], U(*to)[N]) { +inline void CopyArray(const T (&from)[N], U (*to)[N]) { internal::CopyArray(from, N, *to); } @@ -1114,8 +1117,7 @@ class NativeArray { } ~NativeArray() { - if (clone_ != &NativeArray::InitRef) - delete[] array_; + if (clone_ != &NativeArray::InitRef) delete[] array_; } // STL-style container methods. @@ -1123,8 +1125,7 @@ class NativeArray { const_iterator begin() const { return array_; } const_iterator end() const { return array_ + size_; } bool operator==(const NativeArray& rhs) const { - return size() == rhs.size() && - ArrayEq(begin(), size(), rhs.begin()); + return size() == rhs.size() && ArrayEq(begin(), size(), rhs.begin()); } private: @@ -1335,9 +1336,9 @@ struct tuple_size> #endif } // namespace std -#define GTEST_MESSAGE_AT_(file, line, message, result_type) \ - ::testing::internal::AssertHelper(result_type, file, line, message) \ - = ::testing::Message() +#define GTEST_MESSAGE_AT_(file, line, message, result_type) \ + ::testing::internal::AssertHelper(result_type, file, line, message) = \ + ::testing::Message() #define GTEST_MESSAGE_(message, result_type) \ GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type) @@ -1458,103 +1459,112 @@ class NeverThrown { #endif // GTEST_HAS_EXCEPTIONS -#define GTEST_TEST_NO_THROW_(statement, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::TrueWithString gtest_msg{}) { \ - try { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } \ - GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \ - catch (...) { \ - gtest_msg.value = "it throws."; \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__): \ - fail(("Expected: " #statement " doesn't throw an exception.\n" \ - " Actual: " + gtest_msg.value).c_str()) - -#define GTEST_TEST_ANY_THROW_(statement, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - bool gtest_caught_any = false; \ - try { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } \ - catch (...) { \ - gtest_caught_any = true; \ - } \ - if (!gtest_caught_any) { \ +#define GTEST_TEST_NO_THROW_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::TrueWithString gtest_msg{}) { \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } \ + GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \ + catch (...) { \ + gtest_msg.value = "it throws."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__) \ + : fail(("Expected: " #statement " doesn't throw an exception.\n" \ + " Actual: " + \ + gtest_msg.value) \ + .c_str()) + +#define GTEST_TEST_ANY_THROW_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + bool gtest_caught_any = false; \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } catch (...) 
{ \ + gtest_caught_any = true; \ + } \ + if (!gtest_caught_any) { \ goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__): \ - fail("Expected: " #statement " throws an exception.\n" \ - " Actual: it doesn't.") - + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__) \ + : fail("Expected: " #statement \ + " throws an exception.\n" \ + " Actual: it doesn't.") // Implements Boolean test assertions such as EXPECT_TRUE. expression can be // either a boolean expression or an AssertionResult. text is a textual // representation of expression as it was passed into the EXPECT_TRUE. #define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (const ::testing::AssertionResult gtest_ar_ = \ - ::testing::AssertionResult(expression)) \ - ; \ - else \ - fail(::testing::internal::GetBoolAssertionFailureMessage(\ - gtest_ar_, text, #actual, #expected).c_str()) - -#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (const ::testing::AssertionResult gtest_ar_ = \ + ::testing::AssertionResult(expression)) \ + ; \ + else \ + fail(::testing::internal::GetBoolAssertionFailureMessage( \ + gtest_ar_, text, #actual, #expected) \ + .c_str()) + +#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ ::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__): \ - fail("Expected: " #statement " doesn't generate new fatal " \ - "failures in the current thread.\n" \ - " Actual: it does.") + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__) \ + : fail("Expected: " #statement \ + " doesn't generate new fatal " \ + "failures in the current thread.\n" \ + " Actual: it does.") // Expands to the name of the class that implements the given test. #define GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ test_suite_name##_##test_name##_Test // Helper macro for defining tests. 
-#define GTEST_TEST_(test_suite_name, test_name, parent_class, parent_id) \ - static_assert(sizeof(GTEST_STRINGIFY_(test_suite_name)) > 1, \ - "test_suite_name must not be empty"); \ - static_assert(sizeof(GTEST_STRINGIFY_(test_name)) > 1, \ - "test_name must not be empty"); \ - class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ - : public parent_class { \ - public: \ - GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() = default; \ - ~GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() override = default; \ - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \ - test_name)); \ - GTEST_DISALLOW_MOVE_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \ - test_name)); \ - \ - private: \ - void TestBody() override; \ - static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_; \ - }; \ - \ - ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_suite_name, \ - test_name)::test_info_ = \ - ::testing::internal::MakeAndRegisterTestInfo( \ - #test_suite_name, #test_name, nullptr, nullptr, \ - ::testing::internal::CodeLocation(__FILE__, __LINE__), (parent_id), \ - ::testing::internal::SuiteApiResolver< \ - parent_class>::GetSetUpCaseOrSuite(__FILE__, __LINE__), \ - ::testing::internal::SuiteApiResolver< \ - parent_class>::GetTearDownCaseOrSuite(__FILE__, __LINE__), \ - new ::testing::internal::TestFactoryImpl); \ +#define GTEST_TEST_(test_suite_name, test_name, parent_class, parent_id) \ + static_assert(sizeof(GTEST_STRINGIFY_(test_suite_name)) > 1, \ + "test_suite_name must not be empty"); \ + static_assert(sizeof(GTEST_STRINGIFY_(test_name)) > 1, \ + "test_name must not be empty"); \ + class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + : public parent_class { \ + public: \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() = default; \ + ~GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() override = default; \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + (const GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &) = delete; \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=( \ + const GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name) &) = delete; /* NOLINT */ \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + (GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &&) noexcept = delete; \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=( \ + GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name) &&) noexcept = delete; /* NOLINT */ \ + \ + private: \ + void TestBody() override; \ + static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_; \ + }; \ + \ + ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name)::test_info_ = \ + ::testing::internal::MakeAndRegisterTestInfo( \ + #test_suite_name, #test_name, nullptr, nullptr, \ + ::testing::internal::CodeLocation(__FILE__, __LINE__), (parent_id), \ + ::testing::internal::SuiteApiResolver< \ + parent_class>::GetSetUpCaseOrSuite(__FILE__, __LINE__), \ + ::testing::internal::SuiteApiResolver< \ + parent_class>::GetTearDownCaseOrSuite(__FILE__, __LINE__), \ + new ::testing::internal::TestFactoryImpl); \ void GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::TestBody() #endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-param-util.h b/third_party/googletest/src/include/gtest/internal/gtest-param-util.h index c2ef6e3124..e7af2f904a 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-param-util.h +++ 
b/third_party/googletest/src/include/gtest/internal/gtest-param-util.h @@ -27,10 +27,11 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - // Type and function utilities for implementing parameterized tests. -// GOOGLETEST_CM0001 DO NOT DELETE +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ #define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ @@ -46,19 +47,18 @@ #include #include -#include "gtest/internal/gtest-internal.h" -#include "gtest/internal/gtest-port.h" #include "gtest/gtest-printers.h" #include "gtest/gtest-test-part.h" +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-port.h" namespace testing { // Input to a parameterized test name generator, describing a test parameter. // Consists of the parameter value and the integer parameter index. template struct TestParamInfo { - TestParamInfo(const ParamType& a_param, size_t an_index) : - param(a_param), - index(an_index) {} + TestParamInfo(const ParamType& a_param, size_t an_index) + : param(a_param), index(an_index) {} ParamType param; size_t index; }; @@ -84,8 +84,10 @@ namespace internal { GTEST_API_ void ReportInvalidTestSuiteType(const char* test_suite_name, CodeLocation code_location); -template class ParamGeneratorInterface; -template class ParamGenerator; +template +class ParamGeneratorInterface; +template +class ParamGenerator; // Interface for iterating over elements provided by an implementation // of ParamGeneratorInterface. @@ -129,8 +131,7 @@ class ParamIterator { // ParamIterator assumes ownership of the impl_ pointer. ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {} ParamIterator& operator=(const ParamIterator& other) { - if (this != &other) - impl_.reset(other.impl_->Clone()); + if (this != &other) impl_.reset(other.impl_->Clone()); return *this; } @@ -157,7 +158,7 @@ class ParamIterator { private: friend class ParamGenerator; explicit ParamIterator(ParamIteratorInterface* impl) : impl_(impl) {} - std::unique_ptr > impl_; + std::unique_ptr> impl_; }; // ParamGeneratorInterface is the binary interface to access generators @@ -179,7 +180,7 @@ class ParamGeneratorInterface { // This class implements copy initialization semantics and the contained // ParamGeneratorInterface instance is shared among all copies // of the original object. This is possible because that instance is immutable. -template +template class ParamGenerator { public: typedef ParamIterator iterator; @@ -196,7 +197,7 @@ class ParamGenerator { iterator end() const { return iterator(impl_->End()); } private: - std::shared_ptr > impl_; + std::shared_ptr> impl_; }; // Generates values from a range of two comparable values. 
Can be used to @@ -207,8 +208,10 @@ template class RangeGenerator : public ParamGeneratorInterface { public: RangeGenerator(T begin, T end, IncrementT step) - : begin_(begin), end_(end), - step_(step), end_index_(CalculateEndIndex(begin, end, step)) {} + : begin_(begin), + end_(end), + step_(step), + end_index_(CalculateEndIndex(begin, end, step)) {} ~RangeGenerator() override {} ParamIteratorInterface* Begin() const override { @@ -251,7 +254,9 @@ class RangeGenerator : public ParamGeneratorInterface { private: Iterator(const Iterator& other) : ParamIteratorInterface(), - base_(other.base_), value_(other.value_), index_(other.index_), + base_(other.base_), + value_(other.value_), + index_(other.index_), step_(other.step_) {} // No implementation - assignment is unsupported. @@ -263,12 +268,10 @@ class RangeGenerator : public ParamGeneratorInterface { const IncrementT step_; }; // class RangeGenerator::Iterator - static int CalculateEndIndex(const T& begin, - const T& end, + static int CalculateEndIndex(const T& begin, const T& end, const IncrementT& step) { int end_index = 0; - for (T i = begin; i < end; i = static_cast(i + step)) - end_index++; + for (T i = begin; i < end; i = static_cast(i + step)) end_index++; return end_index; } @@ -283,7 +286,6 @@ class RangeGenerator : public ParamGeneratorInterface { const int end_index_; }; // class RangeGenerator - // Generates values from a pair of STL-style iterators. Used in the // ValuesIn() function. The elements are copied from the source range // since the source can be located on the stack, and the generator @@ -341,13 +343,13 @@ class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface { << "The program attempted to compare iterators " << "from different generators." << std::endl; return iterator_ == - CheckedDowncastToActualType(&other)->iterator_; + CheckedDowncastToActualType(&other)->iterator_; } private: Iterator(const Iterator& other) - // The explicit constructor call suppresses a false warning - // emitted by gcc when supplied with the -Wextra option. + // The explicit constructor call suppresses a false warning + // emitted by gcc when supplied with the -Wextra option. : ParamIteratorInterface(), base_(other.base_), iterator_(other.iterator_) {} @@ -394,8 +396,8 @@ template class ParameterizedTestFactory : public TestFactoryBase { public: typedef typename TestClass::ParamType ParamType; - explicit ParameterizedTestFactory(ParamType parameter) : - parameter_(parameter) {} + explicit ParameterizedTestFactory(ParamType parameter) + : parameter_(parameter) {} Test* CreateTest() override { TestClass::SetParam(¶meter_); return new TestClass(); @@ -404,7 +406,8 @@ class ParameterizedTestFactory : public TestFactoryBase { private: const ParamType parameter_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestFactory); + ParameterizedTestFactory(const ParameterizedTestFactory&) = delete; + ParameterizedTestFactory& operator=(const ParameterizedTestFactory&) = delete; }; // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. @@ -440,7 +443,8 @@ class TestMetaFactory } private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestMetaFactory); + TestMetaFactory(const TestMetaFactory&) = delete; + TestMetaFactory& operator=(const TestMetaFactory&) = delete; }; // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. 
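RangeGenerator above precomputes end_index_ via CalculateEndIndex() so its iterators can be compared by index instead of by value; the sequence it yields is begin, begin + step, and so on, stopping strictly before end. A small sketch of what testing::Range() therefore produces (ExpandRange is a hypothetical helper for illustration, not part of googletest):

#include <vector>

// Enumerates the values a RangeGenerator visits, using the same loop as
// CalculateEndIndex() above; note that `end` itself is never produced.
template <typename T, typename IncrementT>
std::vector<T> ExpandRange(T begin, T end, IncrementT step) {
  std::vector<T> values;
  for (T i = begin; i < end; i = static_cast<T>(i + step)) {
    values.push_back(i);
  }
  return values;
}

// ExpandRange(0, 10, 3) yields {0, 3, 6, 9} -- the same four parameter
// values that INSTANTIATE_TEST_SUITE_P(My, MyTest, testing::Range(0, 10, 3))
// would feed to a parameterized test.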
@@ -471,7 +475,10 @@ class ParameterizedTestSuiteInfoBase { ParameterizedTestSuiteInfoBase() {} private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteInfoBase); + ParameterizedTestSuiteInfoBase(const ParameterizedTestSuiteInfoBase&) = + delete; + ParameterizedTestSuiteInfoBase& operator=( + const ParameterizedTestSuiteInfoBase&) = delete; }; // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. @@ -547,8 +554,8 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { test_it != tests_.end(); ++test_it) { std::shared_ptr test_info = *test_it; for (typename InstantiationContainer::iterator gen_it = - instantiations_.begin(); gen_it != instantiations_.end(); - ++gen_it) { + instantiations_.begin(); + gen_it != instantiations_.end(); ++gen_it) { const std::string& instantiation_name = gen_it->name; ParamGenerator generator((*gen_it->generator)()); ParamNameGeneratorFunc* name_func = gen_it->name_func; @@ -556,7 +563,7 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { int line = gen_it->line; std::string test_suite_name; - if ( !instantiation_name.empty() ) + if (!instantiation_name.empty()) test_suite_name = instantiation_name + "/"; test_suite_name += test_info->test_suite_base_name; @@ -569,17 +576,16 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { Message test_name_stream; - std::string param_name = name_func( - TestParamInfo(*param_it, i)); + std::string param_name = + name_func(TestParamInfo(*param_it, i)); GTEST_CHECK_(IsValidParamName(param_name)) << "Parameterized test name '" << param_name - << "' is invalid, in " << file - << " line " << line << std::endl; + << "' is invalid, in " << file << " line " << line << std::endl; GTEST_CHECK_(test_param_names.count(param_name) == 0) - << "Duplicate parameterized test name '" << param_name - << "', in " << file << " line " << line << std::endl; + << "Duplicate parameterized test name '" << param_name << "', in " + << file << " line " << line << std::endl; test_param_names.insert(param_name); @@ -596,15 +602,15 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { SuiteApiResolver::GetTearDownCaseOrSuite(file, line), test_info->test_meta_factory->CreateTestFactory(*param_it)); } // for param_it - } // for gen_it - } // for test_it + } // for gen_it + } // for test_it if (!generated_instantiations) { // There are no generators, or they all generate nothing ...
InsertSyntheticTestCase(GetTestSuiteName(), code_location_, !tests_.empty()); } - } // RegisterTests + } // RegisterTests private: // LocalTestInfo structure keeps information about a single test registered @@ -620,42 +626,39 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { const std::string test_suite_base_name; const std::string test_base_name; - const std::unique_ptr > test_meta_factory; + const std::unique_ptr> test_meta_factory; const CodeLocation code_location; }; - using TestInfoContainer = ::std::vector >; + using TestInfoContainer = ::std::vector>; // Records data received from INSTANTIATE_TEST_SUITE_P macros: // struct InstantiationInfo { - InstantiationInfo(const std::string &name_in, - GeneratorCreationFunc* generator_in, - ParamNameGeneratorFunc* name_func_in, - const char* file_in, - int line_in) - : name(name_in), - generator(generator_in), - name_func(name_func_in), - file(file_in), - line(line_in) {} - - std::string name; - GeneratorCreationFunc* generator; - ParamNameGeneratorFunc* name_func; - const char* file; - int line; + InstantiationInfo(const std::string& name_in, + GeneratorCreationFunc* generator_in, + ParamNameGeneratorFunc* name_func_in, const char* file_in, + int line_in) + : name(name_in), + generator(generator_in), + name_func(name_func_in), + file(file_in), + line(line_in) {} + + std::string name; + GeneratorCreationFunc* generator; + ParamNameGeneratorFunc* name_func; + const char* file; + int line; }; typedef ::std::vector InstantiationContainer; static bool IsValidParamName(const std::string& name) { // Check for empty string - if (name.empty()) - return false; + if (name.empty()) return false; // Check for invalid characters for (std::string::size_type index = 0; index < name.size(); ++index) { - if (!IsAlNum(name[index]) && name[index] != '_') - return false; + if (!IsAlNum(name[index]) && name[index] != '_') return false; } return true; @@ -666,7 +669,9 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { TestInfoContainer tests_; InstantiationContainer instantiations_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteInfo); + ParameterizedTestSuiteInfo(const ParameterizedTestSuiteInfo&) = delete; + ParameterizedTestSuiteInfo& operator=(const ParameterizedTestSuiteInfo&) = + delete; }; // class ParameterizedTestSuiteInfo // Legacy API is deprecated but still available @@ -709,7 +714,7 @@ class ParameterizedTestSuiteRegistry { // type we are looking for, so we downcast it to that type // without further checks. 
typed_test_info = CheckedDowncastToActualType< - ParameterizedTestSuiteInfo >(test_suite_info); + ParameterizedTestSuiteInfo>(test_suite_info); } break; } @@ -741,7 +746,10 @@ class ParameterizedTestSuiteRegistry { TestSuiteInfoContainer test_suite_infos_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteRegistry); + ParameterizedTestSuiteRegistry(const ParameterizedTestSuiteRegistry&) = + delete; + ParameterizedTestSuiteRegistry& operator=( + const ParameterizedTestSuiteRegistry&) = delete; }; // Keep track of what type-parameterized test suite are defined and @@ -836,7 +844,8 @@ class CartesianProductGenerator : public ParamIteratorInterface { public: IteratorImpl(const ParamGeneratorInterface* base, - const std::tuple...>& generators, bool is_end) + const std::tuple...>& generators, + bool is_end) : base_(base), begin_(std::get(generators).begin()...), end_(std::get(generators).end()...), diff --git a/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h b/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h index dd845915e3..f025db76ad 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h @@ -26,7 +26,7 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// + // The Google C++ Testing and Mocking Framework (Google Test) // // This header file defines the GTEST_OS_* macro. @@ -37,70 +37,72 @@ // Determines the platform on which Google Test is compiled. #ifdef __CYGWIN__ -# define GTEST_OS_CYGWIN 1 -# elif defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__) -# define GTEST_OS_WINDOWS_MINGW 1 -# define GTEST_OS_WINDOWS 1 +#define GTEST_OS_CYGWIN 1 +#elif defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__) +#define GTEST_OS_WINDOWS_MINGW 1 +#define GTEST_OS_WINDOWS 1 #elif defined _WIN32 -# define GTEST_OS_WINDOWS 1 -# ifdef _WIN32_WCE -# define GTEST_OS_WINDOWS_MOBILE 1 -# elif defined(WINAPI_FAMILY) -# include -# if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) -# define GTEST_OS_WINDOWS_DESKTOP 1 -# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP) -# define GTEST_OS_WINDOWS_PHONE 1 -# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) -# define GTEST_OS_WINDOWS_RT 1 -# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_TV_TITLE) -# define GTEST_OS_WINDOWS_PHONE 1 -# define GTEST_OS_WINDOWS_TV_TITLE 1 -# else - // WINAPI_FAMILY defined but no known partition matched. - // Default to desktop. -# define GTEST_OS_WINDOWS_DESKTOP 1 -# endif -# else -# define GTEST_OS_WINDOWS_DESKTOP 1 -# endif // _WIN32_WCE +#define GTEST_OS_WINDOWS 1 +#ifdef _WIN32_WCE +#define GTEST_OS_WINDOWS_MOBILE 1 +#elif defined(WINAPI_FAMILY) +#include +#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +#define GTEST_OS_WINDOWS_DESKTOP 1 +#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP) +#define GTEST_OS_WINDOWS_PHONE 1 +#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) +#define GTEST_OS_WINDOWS_RT 1 +#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_TV_TITLE) +#define GTEST_OS_WINDOWS_PHONE 1 +#define GTEST_OS_WINDOWS_TV_TITLE 1 +#else +// WINAPI_FAMILY defined but no known partition matched. +// Default to desktop. 
+#define GTEST_OS_WINDOWS_DESKTOP 1 +#endif +#else +#define GTEST_OS_WINDOWS_DESKTOP 1 +#endif // _WIN32_WCE #elif defined __OS2__ -# define GTEST_OS_OS2 1 +#define GTEST_OS_OS2 1 #elif defined __APPLE__ -# define GTEST_OS_MAC 1 -# include -# if TARGET_OS_IPHONE -# define GTEST_OS_IOS 1 -# endif +#define GTEST_OS_MAC 1 +#include +#if TARGET_OS_IPHONE +#define GTEST_OS_IOS 1 +#endif #elif defined __DragonFly__ -# define GTEST_OS_DRAGONFLY 1 +#define GTEST_OS_DRAGONFLY 1 #elif defined __FreeBSD__ -# define GTEST_OS_FREEBSD 1 +#define GTEST_OS_FREEBSD 1 #elif defined __Fuchsia__ -# define GTEST_OS_FUCHSIA 1 +#define GTEST_OS_FUCHSIA 1 +#elif defined(__GNU__) +#define GTEST_OS_GNU_HURD 1 #elif defined(__GLIBC__) && defined(__FreeBSD_kernel__) -# define GTEST_OS_GNU_KFREEBSD 1 +#define GTEST_OS_GNU_KFREEBSD 1 #elif defined __linux__ -# define GTEST_OS_LINUX 1 -# if defined __ANDROID__ -# define GTEST_OS_LINUX_ANDROID 1 -# endif +#define GTEST_OS_LINUX 1 +#if defined __ANDROID__ +#define GTEST_OS_LINUX_ANDROID 1 +#endif #elif defined __MVS__ -# define GTEST_OS_ZOS 1 +#define GTEST_OS_ZOS 1 #elif defined(__sun) && defined(__SVR4) -# define GTEST_OS_SOLARIS 1 +#define GTEST_OS_SOLARIS 1 #elif defined(_AIX) -# define GTEST_OS_AIX 1 +#define GTEST_OS_AIX 1 #elif defined(__hpux) -# define GTEST_OS_HPUX 1 +#define GTEST_OS_HPUX 1 #elif defined __native_client__ -# define GTEST_OS_NACL 1 +#define GTEST_OS_NACL 1 #elif defined __NetBSD__ -# define GTEST_OS_NETBSD 1 +#define GTEST_OS_NETBSD 1 #elif defined __OpenBSD__ -# define GTEST_OS_OPENBSD 1 +#define GTEST_OS_OPENBSD 1 #elif defined __QNX__ -# define GTEST_OS_QNX 1 +#define GTEST_OS_QNX 1 #elif defined(__HAIKU__) #define GTEST_OS_HAIKU 1 #elif defined ESP8266 diff --git a/third_party/googletest/src/include/gtest/internal/gtest-port.h b/third_party/googletest/src/include/gtest/internal/gtest-port.h index 0953a781c0..0003d27658 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-port.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-port.h @@ -26,7 +26,7 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// + // Low-level types and utilities for porting Google Test to various // platforms. All macros ending with _ and symbols defined in an // internal namespace are subject to change without notice. Code @@ -38,7 +38,9 @@ // files are expected to #include this. Therefore, it cannot #include // any other Google Test header. -// GOOGLETEST_CM0001 DO NOT DELETE +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ #define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ @@ -116,6 +118,7 @@ // GTEST_OS_DRAGONFLY - DragonFlyBSD // GTEST_OS_FREEBSD - FreeBSD // GTEST_OS_FUCHSIA - Fuchsia +// GTEST_OS_GNU_HURD - GNU/Hurd // GTEST_OS_GNU_KFREEBSD - GNU/kFreeBSD // GTEST_OS_HAIKU - Haiku // GTEST_OS_HPUX - HP-UX @@ -167,7 +170,7 @@ // GTEST_HAS_TYPED_TEST - typed tests // GTEST_HAS_TYPED_TEST_P - type-parameterized tests // GTEST_IS_THREADSAFE - Google Test is thread-safe. -// GOOGLETEST_CM0007 DO NOT DELETE +// GTEST_USES_RE2 - the RE2 regular expression library is used // GTEST_USES_POSIX_RE - enhanced POSIX regex is used. Do not confuse with // GTEST_HAS_POSIX_RE (see above) which users can // define themselves. 
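The cascade in gtest-port-arch.h above defines exactly one primary GTEST_OS_* macro to 1 for the detected platform and leaves the rest undefined; since undefined identifiers evaluate to 0 inside #if, client code can branch with a plain preprocessor chain. A minimal consuming sketch (illustration only; GTestPlatformName is a hypothetical function, not part of googletest):

#include "gtest/internal/gtest-port.h"  // pulls in gtest-port-arch.h

// Only the branch for the detected platform survives preprocessing;
// undefined GTEST_OS_* macros evaluate to 0 in #if.
const char* GTestPlatformName() {
#if GTEST_OS_WINDOWS
  return "Windows";  // also set for MinGW builds, per the cascade above
#elif GTEST_OS_MAC
  return "macOS";
#elif GTEST_OS_LINUX
  return "Linux";
#else
  return "other";
#endif
}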
@@ -190,10 +193,6 @@ // GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning. // GTEST_ATTRIBUTE_UNUSED_ - declares that a class' instances or a // variable don't have to be used. -// GTEST_DISALLOW_ASSIGN_ - disables copy operator=. -// GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=. -// GTEST_DISALLOW_MOVE_ASSIGN_ - disables move operator=. -// GTEST_DISALLOW_MOVE_AND_ASSIGN_ - disables move ctor and operator=. // GTEST_MUST_USE_RESULT_ - declares that a function's result must be used. // GTEST_INTENTIONAL_CONST_COND_PUSH_ - start code section where MSVC C4127 is // suppressed (constant conditional). @@ -217,11 +216,13 @@ // - synchronization primitives. // // Regular expressions: -// RE - a simple regular expression class using the POSIX -// Extended Regular Expression syntax on UNIX-like platforms -// GOOGLETEST_CM0008 DO NOT DELETE -// or a reduced regular exception syntax on other -// platforms, including Windows. +// RE - a simple regular expression class using +// 1) the RE2 syntax on all platforms when built with RE2 +// and Abseil as dependencies +// 2) the POSIX Extended Regular Expression syntax on +// UNIX-like platforms, +// 3) A reduced regular expression syntax on other platforms, +// including Windows. // Logging: // GTEST_LOG_() - logs messages at the specified severity level. // LogToStderr() - directs all log messages to stderr. @@ -241,8 +242,6 @@ // BiggestInt - the biggest signed integer type. // // Command-line utilities: -// GTEST_DECLARE_*() - declares a flag. -// GTEST_DEFINE_*() - defines a flag. // GetInjectableArgvs() - returns the command line as a vector of strings. // // Environment variable utilities: @@ -263,48 +262,55 @@ #include #include +// #include // Guarded by GTEST_IS_THREADSAFE below #include +#include #include +#include +#include +#include +// #include // Guarded by GTEST_IS_THREADSAFE below +#include #include +#include #ifndef _WIN32_WCE -# include -# include +#include +#include #endif // !_WIN32_WCE #if defined __APPLE__ -# include -# include +#include +#include #endif -#include // NOLINT -#include -#include -#include // NOLINT -#include -#include // NOLINT - #include "gtest/internal/custom/gtest-port.h" #include "gtest/internal/gtest-port-arch.h" +#if GTEST_HAS_ABSL +#include "absl/flags/declare.h" +#include "absl/flags/flag.h" +#include "absl/flags/reflection.h" +#endif + #if !defined(GTEST_DEV_EMAIL_) -# define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com" -# define GTEST_FLAG_PREFIX_ "gtest_" -# define GTEST_FLAG_PREFIX_DASH_ "gtest-" -# define GTEST_FLAG_PREFIX_UPPER_ "GTEST_" -# define GTEST_NAME_ "Google Test" -# define GTEST_PROJECT_URL_ "https://github.com/google/googletest/" +#define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com" +#define GTEST_FLAG_PREFIX_ "gtest_" +#define GTEST_FLAG_PREFIX_DASH_ "gtest-" +#define GTEST_FLAG_PREFIX_UPPER_ "GTEST_" +#define GTEST_NAME_ "Google Test" +#define GTEST_PROJECT_URL_ "https://github.com/google/googletest/" #endif // !defined(GTEST_DEV_EMAIL_) #if !defined(GTEST_INIT_GOOGLE_TEST_NAME_) -# define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest" +#define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest" #endif // !defined(GTEST_INIT_GOOGLE_TEST_NAME_) // Determines the version of gcc that is used to compile this. #ifdef __GNUC__ // 40302 means version 4.3.2.
-# define GTEST_GCC_VER_ \ - (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__) +#define GTEST_GCC_VER_ \ + (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) #endif // __GNUC__ // Macros for disabling Microsoft Visual C++ warnings. @@ -313,41 +319,37 @@ // /* code that triggers warnings C4800 and C4385 */ // GTEST_DISABLE_MSC_WARNINGS_POP_() #if defined(_MSC_VER) -# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \ - __pragma(warning(push)) \ - __pragma(warning(disable: warnings)) -# define GTEST_DISABLE_MSC_WARNINGS_POP_() \ - __pragma(warning(pop)) +#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \ + __pragma(warning(push)) __pragma(warning(disable : warnings)) +#define GTEST_DISABLE_MSC_WARNINGS_POP_() __pragma(warning(pop)) #else // Not all compilers are MSVC -# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) -# define GTEST_DISABLE_MSC_WARNINGS_POP_() +#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) +#define GTEST_DISABLE_MSC_WARNINGS_POP_() #endif // Clang on Windows does not understand MSVC's pragma warning. // We need clang-specific way to disable function deprecation warning. #ifdef __clang__ -# define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \ - _Pragma("clang diagnostic push") \ - _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") \ - _Pragma("clang diagnostic ignored \"-Wdeprecated-implementations\"") -#define GTEST_DISABLE_MSC_DEPRECATED_POP_() \ - _Pragma("clang diagnostic pop") +#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") \ + _Pragma("clang diagnostic ignored \"-Wdeprecated-implementations\"") +#define GTEST_DISABLE_MSC_DEPRECATED_POP_() _Pragma("clang diagnostic pop") #else -# define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \ - GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996) -# define GTEST_DISABLE_MSC_DEPRECATED_POP_() \ - GTEST_DISABLE_MSC_WARNINGS_POP_() +#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \ + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996) +#define GTEST_DISABLE_MSC_DEPRECATED_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_() #endif // Brings in definitions for functions used in the testing::internal::posix // namespace (read, write, close, chdir, isatty, stat). We do not currently // use them on Windows Mobile. #if GTEST_OS_WINDOWS -# if !GTEST_OS_WINDOWS_MOBILE -# include -# include -# endif +#if !GTEST_OS_WINDOWS_MOBILE +#include +#include +#endif // In order to avoid having to include , use forward declaration #if GTEST_OS_WINDOWS_MINGW && !defined(__MINGW64_VERSION_MAJOR) // MinGW defined _CRITICAL_SECTION and _RTL_CRITICAL_SECTION as two @@ -367,68 +369,55 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // This assumes that non-Windows OSes provide unistd.h. For OSes where this // is not the case, we need to include headers that provide the functions // mentioned above. -# include -# include +#include +#include #endif // GTEST_OS_WINDOWS #if GTEST_OS_LINUX_ANDROID // Used to define __ANDROID_API__ matching the target NDK API level. -# include // NOLINT +#include // NOLINT #endif // Defines this to true if and only if Google Test can use POSIX regular // expressions. #ifndef GTEST_HAS_POSIX_RE -# if GTEST_OS_LINUX_ANDROID +#if GTEST_OS_LINUX_ANDROID // On Android, is only available starting with Gingerbread. 
-# define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9) -# else +#define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9) +#else #define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS && !GTEST_OS_XTENSA) -# endif +#endif #endif -#if GTEST_USES_PCRE -// The appropriate headers have already been included. - +// Select the regular expression implementation. +#if GTEST_HAS_ABSL +// When using Abseil, RE2 is required. +#include "absl/strings/string_view.h" +#include "re2/re2.h" +#define GTEST_USES_RE2 1 #elif GTEST_HAS_POSIX_RE - -// On some platforms, needs someone to define size_t, and -// won't compile otherwise. We can #include it here as we already -// included , which is guaranteed to define size_t through -// . -# include // NOLINT - -# define GTEST_USES_POSIX_RE 1 - -#elif GTEST_OS_WINDOWS - -// is not available on Windows. Use our own simple regex -// implementation instead. -# define GTEST_USES_SIMPLE_RE 1 - +#include // NOLINT +#define GTEST_USES_POSIX_RE 1 #else - -// may not be available on this platform. Use our own -// simple regex implementation instead. -# define GTEST_USES_SIMPLE_RE 1 - -#endif // GTEST_USES_PCRE +// Use our own simple regex implementation. +#define GTEST_USES_SIMPLE_RE 1 +#endif #ifndef GTEST_HAS_EXCEPTIONS // The user didn't tell us whether exceptions are enabled, so we need // to figure it out. -# if defined(_MSC_VER) && defined(_CPPUNWIND) +#if defined(_MSC_VER) && defined(_CPPUNWIND) // MSVC defines _CPPUNWIND to 1 if and only if exceptions are enabled. -# define GTEST_HAS_EXCEPTIONS 1 -# elif defined(__BORLANDC__) +#define GTEST_HAS_EXCEPTIONS 1 +#elif defined(__BORLANDC__) // C++Builder's implementation of the STL uses the _HAS_EXCEPTIONS // macro to enable exceptions, so we'll do the same. // Assumes that exceptions are enabled by default. -# ifndef _HAS_EXCEPTIONS -# define _HAS_EXCEPTIONS 1 -# endif // _HAS_EXCEPTIONS -# define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS -# elif defined(__clang__) +#ifndef _HAS_EXCEPTIONS +#define _HAS_EXCEPTIONS 1 +#endif // _HAS_EXCEPTIONS +#define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS +#elif defined(__clang__) // clang defines __EXCEPTIONS if and only if exceptions are enabled before clang // 220714, but if and only if cleanups are enabled after that. In Obj-C++ files, // there can be cleanups for ObjC exceptions which also need cleanups, even if @@ -437,27 +426,27 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // cleanups prior to that. To reliably check for C++ exception availability with // clang, check for // __EXCEPTIONS && __has_feature(cxx_exceptions). -# define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions)) -# elif defined(__GNUC__) && __EXCEPTIONS +#define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions)) +#elif defined(__GNUC__) && __EXCEPTIONS // gcc defines __EXCEPTIONS to 1 if and only if exceptions are enabled. -# define GTEST_HAS_EXCEPTIONS 1 -# elif defined(__SUNPRO_CC) +#define GTEST_HAS_EXCEPTIONS 1 +#elif defined(__SUNPRO_CC) // Sun Pro CC supports exceptions. However, there is no compile-time way of // detecting whether they are enabled or not. Therefore, we assume that // they are enabled unless the user tells us otherwise. -# define GTEST_HAS_EXCEPTIONS 1 -# elif defined(__IBMCPP__) && __EXCEPTIONS +#define GTEST_HAS_EXCEPTIONS 1 +#elif defined(__IBMCPP__) && __EXCEPTIONS // xlC defines __EXCEPTIONS to 1 if and only if exceptions are enabled. 
-# define GTEST_HAS_EXCEPTIONS 1 -# elif defined(__HP_aCC) +#define GTEST_HAS_EXCEPTIONS 1 +#elif defined(__HP_aCC) // Exception handling is in effect by default in HP aCC compiler. It has to // be turned off by +noeh compiler option if desired. -# define GTEST_HAS_EXCEPTIONS 1 -# else +#define GTEST_HAS_EXCEPTIONS 1 +#else // For other compilers, we assume exceptions are disabled to be // conservative. -# define GTEST_HAS_EXCEPTIONS 0 -# endif // defined(_MSC_VER) || defined(__BORLANDC__) +#define GTEST_HAS_EXCEPTIONS 0 +#endif // defined(_MSC_VER) || defined(__BORLANDC__) #endif // GTEST_HAS_EXCEPTIONS #ifndef GTEST_HAS_STD_WSTRING @@ -477,63 +466,62 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // The user didn't tell us whether RTTI is enabled, so we need to // figure it out. -# ifdef _MSC_VER +#ifdef _MSC_VER #ifdef _CPPRTTI // MSVC defines this macro if and only if RTTI is enabled. -# define GTEST_HAS_RTTI 1 -# else -# define GTEST_HAS_RTTI 0 -# endif +#define GTEST_HAS_RTTI 1 +#else +#define GTEST_HAS_RTTI 0 +#endif // Starting with version 4.3.2, gcc defines __GXX_RTTI if and only if RTTI is // enabled. -# elif defined(__GNUC__) +#elif defined(__GNUC__) -# ifdef __GXX_RTTI +#ifdef __GXX_RTTI // When building against STLport with the Android NDK and with // -frtti -fno-exceptions, the build fails at link time with undefined // references to __cxa_bad_typeid. Not sure if STL or toolchain bug, // so disable RTTI when detected. -# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && \ - !defined(__EXCEPTIONS) -# define GTEST_HAS_RTTI 0 -# else -# define GTEST_HAS_RTTI 1 -# endif // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS -# else -# define GTEST_HAS_RTTI 0 -# endif // __GXX_RTTI +#if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && !defined(__EXCEPTIONS) +#define GTEST_HAS_RTTI 0 +#else +#define GTEST_HAS_RTTI 1 +#endif // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS +#else +#define GTEST_HAS_RTTI 0 +#endif // __GXX_RTTI // Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends // using has_feature instead. has_feature(cxx_rtti) is supported since 2.7, the // first version with C++ support. -# elif defined(__clang__) +#elif defined(__clang__) -# define GTEST_HAS_RTTI __has_feature(cxx_rtti) +#define GTEST_HAS_RTTI __has_feature(cxx_rtti) // Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if // both the typeid and dynamic_cast features are present. -# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900) +#elif defined(__IBMCPP__) && (__IBMCPP__ >= 900) -# ifdef __RTTI_ALL__ -# define GTEST_HAS_RTTI 1 -# else -# define GTEST_HAS_RTTI 0 -# endif +#ifdef __RTTI_ALL__ +#define GTEST_HAS_RTTI 1 +#else +#define GTEST_HAS_RTTI 0 +#endif -# else +#else // For all other compilers, we assume RTTI is enabled. -# define GTEST_HAS_RTTI 1 +#define GTEST_HAS_RTTI 1 -# endif // _MSC_VER +#endif // _MSC_VER #endif // GTEST_HAS_RTTI // It's this header's responsibility to #include when RTTI // is enabled. #if GTEST_HAS_RTTI -# include +#include #endif // Determines whether Google Test can use the pthreads library.
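Each detection block above resolves a feature macro such as GTEST_HAS_EXCEPTIONS or GTEST_HAS_RTTI to a definite 0 or 1, which is why the rest of the codebase can guard optional language features with a plain #if rather than #ifdef. A sketch of the consuming pattern (DescribeType is a hypothetical example, not googletest code):

#include <string>
#include <typeinfo>

#include "gtest/internal/gtest-port.h"

// GTEST_HAS_RTTI is always defined to 0 or 1 after the detection above,
// never left undefined.
template <typename T>
std::string DescribeType(const T& value) {
#if GTEST_HAS_RTTI
  return typeid(value).name();  // mangled name, adequate for diagnostics
#else
  static_cast<void>(value);
  return "unknown (RTTI disabled)";
#endif
}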
@@ -547,16 +535,16 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX || GTEST_OS_QNX || \ GTEST_OS_FREEBSD || GTEST_OS_NACL || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA || \ GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_OPENBSD || \ - GTEST_OS_HAIKU) + GTEST_OS_HAIKU || GTEST_OS_GNU_HURD) #endif // GTEST_HAS_PTHREAD #if GTEST_HAS_PTHREAD // gtest-port.h guarantees to #include when GTEST_HAS_PTHREAD is // true. -# include // NOLINT +#include // NOLINT // For timespec and nanosleep, used below. -# include // NOLINT +#include // NOLINT #endif // Determines whether clone(2) is supported. @@ -566,24 +554,23 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; #ifndef GTEST_HAS_CLONE // The user didn't tell us, so we need to figure it out. -# if GTEST_OS_LINUX && !defined(__ia64__) -# if GTEST_OS_LINUX_ANDROID +#if GTEST_OS_LINUX && !defined(__ia64__) +#if GTEST_OS_LINUX_ANDROID // On Android, clone() became available at different API levels for each 32-bit // architecture. -# if defined(__LP64__) || \ - (defined(__arm__) && __ANDROID_API__ >= 9) || \ - (defined(__mips__) && __ANDROID_API__ >= 12) || \ - (defined(__i386__) && __ANDROID_API__ >= 17) -# define GTEST_HAS_CLONE 1 -# else -# define GTEST_HAS_CLONE 0 -# endif -# else -# define GTEST_HAS_CLONE 1 -# endif -# else -# define GTEST_HAS_CLONE 0 -# endif // GTEST_OS_LINUX && !defined(__ia64__) +#if defined(__LP64__) || (defined(__arm__) && __ANDROID_API__ >= 9) || \ + (defined(__mips__) && __ANDROID_API__ >= 12) || \ + (defined(__i386__) && __ANDROID_API__ >= 17) +#define GTEST_HAS_CLONE 1 +#else +#define GTEST_HAS_CLONE 0 +#endif +#else +#define GTEST_HAS_CLONE 1 +#endif +#else +#define GTEST_HAS_CLONE 0 +#endif // GTEST_OS_LINUX && !defined(__ia64__) #endif // GTEST_HAS_CLONE @@ -594,10 +581,10 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // platforms except known mobile ones. #if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \ GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_XTENSA -# define GTEST_HAS_STREAM_REDIRECTION 0 -# else -# define GTEST_HAS_STREAM_REDIRECTION 1 -# endif // !GTEST_OS_WINDOWS_MOBILE +#define GTEST_HAS_STREAM_REDIRECTION 0 +#else +#define GTEST_HAS_STREAM_REDIRECTION 1 +#endif // !GTEST_OS_WINDOWS_MOBILE #endif // GTEST_HAS_STREAM_REDIRECTION // Determines whether to support death tests. @@ -607,8 +594,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER) || GTEST_OS_WINDOWS_MINGW || \ GTEST_OS_AIX || GTEST_OS_HPUX || GTEST_OS_OPENBSD || GTEST_OS_QNX || \ GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA || \ - GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_HAIKU) -# define GTEST_HAS_DEATH_TEST 1 + GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_HAIKU || \ + GTEST_OS_GNU_HURD) +#define GTEST_HAS_DEATH_TEST 1 #endif // Determines whether to support type-driven tests. @@ -617,8 +605,8 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // Sun Pro CC, IBM Visual Age, and HP aCC support. #if defined(__GNUC__) || defined(_MSC_VER) || defined(__SUNPRO_CC) || \ defined(__IBMCPP__) || defined(__HP_aCC) -# define GTEST_HAS_TYPED_TEST 1 -# define GTEST_HAS_TYPED_TEST_P 1 +#define GTEST_HAS_TYPED_TEST 1 +#define GTEST_HAS_TYPED_TEST_P 1 #endif // Determines whether the system compiler uses UTF-16 for encoding wide strings. 
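GTEST_HAS_DEATH_TEST, whose platform list gains GTEST_OS_GNU_HURD in this patch, gates the EXPECT_DEATH/ASSERT_EXIT family, so portable tests wrap death assertions in the same guard. A minimal usage sketch (the test and CrashIfNegative are hypothetical, not from the patch):

#include <cstdlib>

#include "gtest/gtest.h"

void CrashIfNegative(int v) {
  if (v < 0) std::abort();
}

#if GTEST_HAS_DEATH_TEST
// With the default "fast" style (GTEST_DEFAULT_DEATH_TEST_STYLE, defined
// later in this header), the statement runs in a forked child; the parent
// checks that the child died and that its stderr matches the regex
// ("" matches anything).
TEST(CrashDeathTest, AbortsOnNegative) {
  EXPECT_DEATH(CrashIfNegative(-1), "");
}
#endif  // GTEST_HAS_DEATH_TEST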
@@ -627,8 +615,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // Determines whether test results can be streamed to a socket. #if GTEST_OS_LINUX || GTEST_OS_GNU_KFREEBSD || GTEST_OS_DRAGONFLY || \ - GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_OPENBSD -# define GTEST_CAN_STREAM_RESULTS_ 1 + GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_OPENBSD || \ + GTEST_OS_GNU_HURD +#define GTEST_CAN_STREAM_RESULTS_ 1 #endif // Defines some utility macros. @@ -642,9 +631,12 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // // The "switch (0) case 0:" idiom is used to suppress this. #ifdef __INTEL_COMPILER -# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ +#define GTEST_AMBIGUOUS_ELSE_BLOCKER_ #else -# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ switch (0) case 0: default: // NOLINT +#define GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + switch (0) \ + case 0: \ + default: // NOLINT #endif // Use this annotation at the end of a struct/class definition to @@ -659,55 +651,32 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // Also use it after a variable or parameter declaration to tell the // compiler the variable/parameter does not have to be used. #if defined(__GNUC__) && !defined(COMPILER_ICC) -# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused)) +#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused)) #elif defined(__clang__) -# if __has_attribute(unused) -# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused)) -# endif +#if __has_attribute(unused) +#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused)) +#endif #endif #ifndef GTEST_ATTRIBUTE_UNUSED_ -# define GTEST_ATTRIBUTE_UNUSED_ +#define GTEST_ATTRIBUTE_UNUSED_ #endif // Use this annotation before a function that takes a printf format string. #if (defined(__GNUC__) || defined(__clang__)) && !defined(COMPILER_ICC) -# if defined(__MINGW_PRINTF_FORMAT) +#if defined(__MINGW_PRINTF_FORMAT) // MinGW has two different printf implementations. Ensure the format macro // matches the selected implementation. See // https://sourceforge.net/p/mingw-w64/wiki2/gnu%20printf/. -# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \ - __attribute__((__format__(__MINGW_PRINTF_FORMAT, string_index, \ - first_to_check))) -# else -# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \ - __attribute__((__format__(__printf__, string_index, first_to_check))) -# endif +#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \ + __attribute__(( \ + __format__(__MINGW_PRINTF_FORMAT, string_index, first_to_check))) #else -# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) +#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \ + __attribute__((__format__(__printf__, string_index, first_to_check))) +#endif +#else +#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) #endif - - -// A macro to disallow copy operator= -// This should be used in the private: declarations for a class. -#define GTEST_DISALLOW_ASSIGN_(type) \ - type& operator=(type const &) = delete - -// A macro to disallow copy constructor and operator= -// This should be used in the private: declarations for a class. -#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type) \ - type(type const&) = delete; \ - type& operator=(type const&) = delete - -// A macro to disallow move operator= -// This should be used in the private: declarations for a class. 
-#define GTEST_DISALLOW_MOVE_ASSIGN_(type) \ - type& operator=(type &&) noexcept = delete - -// A macro to disallow move constructor and operator= -// This should be used in the private: declarations for a class. -#define GTEST_DISALLOW_MOVE_AND_ASSIGN_(type) \ - type(type&&) noexcept = delete; \ - type& operator=(type&&) noexcept = delete // Tell the compiler to warn about unused return values for functions declared // with this macro. The macro should be used on function declarations @@ -715,9 +684,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // // Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_; #if defined(__GNUC__) && !defined(COMPILER_ICC) -# define GTEST_MUST_USE_RESULT_ __attribute__ ((warn_unused_result)) +#define GTEST_MUST_USE_RESULT_ __attribute__((warn_unused_result)) #else -# define GTEST_MUST_USE_RESULT_ +#define GTEST_MUST_USE_RESULT_ #endif // __GNUC__ && !COMPILER_ICC // MS C++ compiler emits warning when a conditional expression is compile time @@ -728,10 +697,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // while (true) { // GTEST_INTENTIONAL_CONST_COND_POP_() // } -# define GTEST_INTENTIONAL_CONST_COND_PUSH_() \ - GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127) -# define GTEST_INTENTIONAL_CONST_COND_POP_() \ - GTEST_DISABLE_MSC_WARNINGS_POP_() +#define GTEST_INTENTIONAL_CONST_COND_PUSH_() \ + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127) +#define GTEST_INTENTIONAL_CONST_COND_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_() // Determine whether the compiler supports Microsoft's Structured Exception // Handling. This is supported by several Windows compilers but generally @@ -739,13 +707,13 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; #ifndef GTEST_HAS_SEH // The user didn't tell us, so we need to figure it out. -# if defined(_MSC_VER) || defined(__BORLANDC__) +#if defined(_MSC_VER) || defined(__BORLANDC__) // These two compilers are known to support SEH. -# define GTEST_HAS_SEH 1 -# else +#define GTEST_HAS_SEH 1 +#else // Assume no SEH. -# define GTEST_HAS_SEH 0 -# endif +#define GTEST_HAS_SEH 0 +#endif #endif // GTEST_HAS_SEH @@ -758,94 +726,112 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; #endif // GTEST_IS_THREADSAFE +#if GTEST_IS_THREADSAFE +// Some platforms don't support including these threading related headers. +#include <condition_variable> // NOLINT +#include <mutex> // NOLINT +#endif // GTEST_IS_THREADSAFE + // GTEST_API_ qualifies all symbols that must be exported. The definitions below // are guarded by #ifndef to give embedders a chance to define GTEST_API_ in // gtest/internal/custom/gtest-port.h #ifndef GTEST_API_ #ifdef _MSC_VER -# if GTEST_LINKED_AS_SHARED_LIBRARY -# define GTEST_API_ __declspec(dllimport) -# elif GTEST_CREATE_SHARED_LIBRARY -# define GTEST_API_ __declspec(dllexport) -# endif +#if GTEST_LINKED_AS_SHARED_LIBRARY +#define GTEST_API_ __declspec(dllimport) +#elif GTEST_CREATE_SHARED_LIBRARY +#define GTEST_API_ __declspec(dllexport) +#endif #elif __GNUC__ >= 4 || defined(__clang__) -# define GTEST_API_ __attribute__((visibility ("default"))) +#define GTEST_API_ __attribute__((visibility("default"))) #endif // _MSC_VER #endif // GTEST_API_ #ifndef GTEST_API_ -# define GTEST_API_ +#define GTEST_API_ #endif // GTEST_API_ #ifndef GTEST_DEFAULT_DEATH_TEST_STYLE -# define GTEST_DEFAULT_DEATH_TEST_STYLE "fast" +#define GTEST_DEFAULT_DEATH_TEST_STYLE "fast" #endif // GTEST_DEFAULT_DEATH_TEST_STYLE #ifdef __GNUC__ // Ask the compiler to never inline a given function.
-# define GTEST_NO_INLINE_ __attribute__((noinline)) +#define GTEST_NO_INLINE_ __attribute__((noinline)) #else -# define GTEST_NO_INLINE_ +#define GTEST_NO_INLINE_ +#endif + +#if defined(__clang__) +// Nested ifs to avoid triggering MSVC warning. +#if __has_attribute(disable_tail_calls) +// Ask the compiler not to perform tail call optimization inside +// the marked function. +#define GTEST_NO_TAIL_CALL_ __attribute__((disable_tail_calls)) +#endif +#elif __GNUC__ +#define GTEST_NO_TAIL_CALL_ \ + __attribute__((optimize("no-optimize-sibling-calls"))) +#else +#define GTEST_NO_TAIL_CALL_ #endif // _LIBCPP_VERSION is defined by the libc++ library from the LLVM project. #if !defined(GTEST_HAS_CXXABI_H_) -# if defined(__GLIBCXX__) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) -# define GTEST_HAS_CXXABI_H_ 1 -# else -# define GTEST_HAS_CXXABI_H_ 0 -# endif +#if defined(__GLIBCXX__) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) +#define GTEST_HAS_CXXABI_H_ 1 +#else +#define GTEST_HAS_CXXABI_H_ 0 +#endif #endif // A function level attribute to disable checking for use of uninitialized // memory when built with MemorySanitizer. #if defined(__clang__) -# if __has_feature(memory_sanitizer) -# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ \ - __attribute__((no_sanitize_memory)) -# else -# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ -# endif // __has_feature(memory_sanitizer) +#if __has_feature(memory_sanitizer) +#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ __attribute__((no_sanitize_memory)) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ +#endif // __has_feature(memory_sanitizer) #else -# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ +#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ #endif // __clang__ // A function level attribute to disable AddressSanitizer instrumentation. #if defined(__clang__) -# if __has_feature(address_sanitizer) -# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \ - __attribute__((no_sanitize_address)) -# else -# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ -# endif // __has_feature(address_sanitizer) +#if __has_feature(address_sanitizer) +#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \ + __attribute__((no_sanitize_address)) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +#endif // __has_feature(address_sanitizer) #else -# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ #endif // __clang__ // A function level attribute to disable HWAddressSanitizer instrumentation. #if defined(__clang__) -# if __has_feature(hwaddress_sanitizer) -# define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ \ - __attribute__((no_sanitize("hwaddress"))) -# else -# define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ -# endif // __has_feature(hwaddress_sanitizer) +#if __has_feature(hwaddress_sanitizer) +#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ \ + __attribute__((no_sanitize("hwaddress"))) #else -# define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ +#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ +#endif // __has_feature(hwaddress_sanitizer) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ #endif // __clang__ // A function level attribute to disable ThreadSanitizer instrumentation. 
#if defined(__clang__) -# if __has_feature(thread_sanitizer) -# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ \ - __attribute__((no_sanitize_thread)) -# else -# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ -# endif // __has_feature(thread_sanitizer) +#if __has_feature(thread_sanitizer) +#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ __attribute__((no_sanitize_thread)) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ +#endif // __has_feature(thread_sanitizer) #else -# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ +#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ #endif // __clang__ namespace testing { @@ -867,25 +853,37 @@ namespace internal { // Secret object, which is what we want. class Secret; -// The GTEST_COMPILE_ASSERT_ is a legacy macro used to verify that a compile -// time expression is true (in new code, use static_assert instead). For -// example, you could use it to verify the size of a static array: -// -// GTEST_COMPILE_ASSERT_(GTEST_ARRAY_SIZE_(names) == NUM_NAMES, -// names_incorrect_size); -// -// The second argument to the macro must be a valid C++ identifier. If the -// expression is false, compiler will issue an error containing this identifier. -#define GTEST_COMPILE_ASSERT_(expr, msg) static_assert(expr, #msg) - // A helper for suppressing warnings on constant condition. It just // returns 'condition'. GTEST_API_ bool IsTrue(bool condition); // Defines RE. -#if GTEST_USES_PCRE -// if used, PCRE is injected by custom/gtest-port.h +#if GTEST_USES_RE2 + +// This is almost `using RE = ::RE2`, except it is copy-constructible, and it +// needs to disambiguate the `std::string`, `absl::string_view`, and `const +// char*` constructors. +class GTEST_API_ RE { + public: + RE(absl::string_view regex) : regex_(regex) {} // NOLINT + RE(const char* regex) : RE(absl::string_view(regex)) {} // NOLINT + RE(const std::string& regex) : RE(absl::string_view(regex)) {} // NOLINT + RE(const RE& other) : RE(other.pattern()) {} + + const std::string& pattern() const { return regex_.pattern(); } + + static bool FullMatch(absl::string_view str, const RE& re) { + return RE2::FullMatch(str, re.regex_); + } + static bool PartialMatch(absl::string_view str, const RE& re) { + return RE2::PartialMatch(str, re.regex_); + } + + private: + RE2 regex_; +}; + #elif GTEST_USES_POSIX_RE || GTEST_USES_SIMPLE_RE // A simple C++ wrapper for <regex.h>. It uses the POSIX Extended @@ -924,19 +922,19 @@ class GTEST_API_ RE { const char* pattern_; bool is_valid_; -# if GTEST_USES_POSIX_RE +#if GTEST_USES_POSIX_RE regex_t full_regex_; // For FullMatch(). regex_t partial_regex_; // For PartialMatch(). -# else // GTEST_USES_SIMPLE_RE +#else // GTEST_USES_SIMPLE_RE const char* full_pattern_; // For FullMatch(); -# endif +#endif }; -#endif // GTEST_USES_PCRE +#endif // ::testing::internal::RE implementation // Formats a source file path and a line number as they would appear // in an error message from the compiler used to compile this code. @@ -954,12 +952,7 @@ GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file, // LogToStderr() - directs all log messages to stderr. // FlushInfoLog() - flushes informational log messages.
-enum GTestLogSeverity { - GTEST_INFO, - GTEST_WARNING, - GTEST_ERROR, - GTEST_FATAL -}; +enum GTestLogSeverity { GTEST_INFO, GTEST_WARNING, GTEST_ERROR, GTEST_FATAL }; // Formats log entry severity, provides a stream object for streaming the // log message, and terminates the message with a newline when going out of @@ -976,14 +969,16 @@ class GTEST_API_ GTestLog { private: const GTestLogSeverity severity_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestLog); + GTestLog(const GTestLog&) = delete; + GTestLog& operator=(const GTestLog&) = delete; }; #if !defined(GTEST_LOG_) -# define GTEST_LOG_(severity) \ - ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \ - __FILE__, __LINE__).GetStream() +#define GTEST_LOG_(severity) \ + ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \ + __FILE__, __LINE__) \ + .GetStream() inline void LogToStderr() {} inline void FlushInfoLog() { fflush(nullptr); } @@ -995,7 +990,7 @@ inline void FlushInfoLog() { fflush(nullptr); } // // GTEST_CHECK_ is an all-mode assert. It aborts the program if the condition // is not satisfied. -// Synopsys: +// Synopsis: // GTEST_CHECK_(boolean_condition); // or // GTEST_CHECK_(boolean_condition) << "Additional message"; @@ -1005,12 +1000,12 @@ inline void FlushInfoLog() { fflush(nullptr); } // condition itself, plus additional message streamed into it, if any, // and then it aborts the program. It aborts the program irrespective of // whether it is built in the debug mode or not. -# define GTEST_CHECK_(condition) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::IsTrue(condition)) \ - ; \ - else \ - GTEST_LOG_(FATAL) << "Condition " #condition " failed. " +#define GTEST_CHECK_(condition) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::IsTrue(condition)) \ + ; \ + else \ + GTEST_LOG_(FATAL) << "Condition " #condition " failed. " #endif // !defined(GTEST_CHECK_) // An all-mode assert to verify that the given POSIX-style function @@ -1019,9 +1014,8 @@ inline void FlushInfoLog() { fflush(nullptr); } // in {} if you need to use it as the only statement in an 'if' // branch. #define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \ - if (const int gtest_error = (posix_call)) \ - GTEST_LOG_(FATAL) << #posix_call << "failed with error " \ - << gtest_error + if (const int gtest_error = (posix_call)) \ + GTEST_LOG_(FATAL) << #posix_call << "failed with error " << gtest_error // Transforms "T" into "const T&" according to standard reference collapsing // rules (this is only needed as a backport for C++98 compilers that do not @@ -1035,9 +1029,13 @@ inline void FlushInfoLog() { fflush(nullptr); } // Note that the non-const reference will not have "const" added. This is // standard, and necessary so that "T" can always bind to "const T&". template <typename T> -struct ConstRef { typedef const T& type; }; +struct ConstRef { + typedef const T& type; +}; template <typename T> -struct ConstRef<T&> { typedef T& type; }; +struct ConstRef<T&> { + typedef T& type; +}; // The argument T must depend on some template parameters. #define GTEST_REFERENCE_TO_CONST_(T) \ @@ -1050,7 +1048,7 @@ struct ConstRef<T&> { typedef T& type; }; // const Foo*). When you use ImplicitCast_, the compiler checks that // the cast is safe. Such explicit ImplicitCast_s are necessary in // surprisingly many situations where C++ demands an exact type match -// instead of an argument type convertable to a target type. +// instead of an argument type convertible to a target type.
// // The syntax for using ImplicitCast_ is the same as for static_cast: // @@ -1063,8 +1061,10 @@ struct ConstRef<T&> { typedef T& type; }; // This relatively ugly name is intentional. It prevents clashes with // similar functions users may have (e.g., implicit_cast). The internal // namespace alone is not enough because the function can be found by ADL. -template <typename To> -inline To ImplicitCast_(To x) { return x; } +template <typename To> +inline To ImplicitCast_(To x) { + return x; +} // When you upcast (that is, cast a pointer from type Foo to type // SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts @@ -1087,17 +1087,17 @@ inline To ImplicitCast_(To x) { return x; } // This relatively ugly name is intentional. It prevents clashes with // similar functions users may have (e.g., down_cast). The internal // namespace alone is not enough because the function can be found by ADL. -template <typename To, typename From> // use like this: DownCast_<T*>(foo); -inline To DownCast_(From* f) { // so we only accept pointers +template <typename To, typename From> // use like this: DownCast_<T*>(foo); +inline To DownCast_(From* f) { // so we only accept pointers // Ensures that To is a sub-type of From *. This test is here only // for compile-time type checking, and has no overhead in an // optimized build at run-time, as it will be optimized away // completely. GTEST_INTENTIONAL_CONST_COND_PUSH_() if (false) { - GTEST_INTENTIONAL_CONST_COND_POP_() - const To to = nullptr; - ::testing::internal::ImplicitCast_<From*>(to); + GTEST_INTENTIONAL_CONST_COND_POP_() + const To to = nullptr; + ::testing::internal::ImplicitCast_<From*>(to); } #if GTEST_HAS_RTTI @@ -1162,71 +1162,8 @@ void ClearInjectableArgvs(); // Defines synchronization primitives. #if GTEST_IS_THREADSAFE -# if GTEST_HAS_PTHREAD -// Sleeps for (roughly) n milliseconds. This function is only for testing -// Google Test's own constructs. Don't use it in user tests, either -// directly or indirectly. -inline void SleepMilliseconds(int n) { - const timespec time = { - 0, // 0 seconds. - n * 1000L * 1000L, // And n ms. - }; - nanosleep(&time, nullptr); -} -# endif // GTEST_HAS_PTHREAD - -# if GTEST_HAS_NOTIFICATION_ -// Notification has already been imported into the namespace. -// Nothing to do here. - -# elif GTEST_HAS_PTHREAD -// Allows a controller thread to pause execution of newly created -// threads until notified. Instances of this class must be created -// and destroyed in the controller thread. -// -// This class is only for testing Google Test's own constructs. Do not -// use it in user tests, either directly or indirectly. -class Notification { - public: - Notification() : notified_(false) { - GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr)); - } - ~Notification() { - pthread_mutex_destroy(&mutex_); - } - - // Notifies all threads created with this notification to start. Must - // be called from the controller thread. - void Notify() { - pthread_mutex_lock(&mutex_); - notified_ = true; - pthread_mutex_unlock(&mutex_); - } - - // Blocks until the controller thread notifies. Must be called from a test - // thread.
- void WaitForNotification() { - for (;;) { - pthread_mutex_lock(&mutex_); - const bool notified = notified_; - pthread_mutex_unlock(&mutex_); - if (notified) - break; - SleepMilliseconds(10); - } - } - - private: - pthread_mutex_t mutex_; - bool notified_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification); -}; - -# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT - -GTEST_API_ void SleepMilliseconds(int n); +#if GTEST_OS_WINDOWS // Provides leak-safe Windows kernel handle ownership. // Used in death tests and in threading support. class GTEST_API_ AutoHandle { @@ -1253,8 +1190,18 @@ class GTEST_API_ AutoHandle { Handle handle_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(AutoHandle); + AutoHandle(const AutoHandle&) = delete; + AutoHandle& operator=(const AutoHandle&) = delete; }; +#endif + +#if GTEST_HAS_NOTIFICATION_ +// Notification has already been imported into the namespace. +// Nothing to do here. + +#else +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) // Allows a controller thread to pause execution of newly created // threads until notified. Instances of this class must be created @@ -1262,23 +1209,40 @@ class GTEST_API_ AutoHandle { // // This class is only for testing Google Test's own constructs. Do not // use it in user tests, either directly or indirectly. +// TODO(b/203539622): Replace unconditionally with absl::Notification. class GTEST_API_ Notification { public: - Notification(); - void Notify(); - void WaitForNotification(); + Notification() : notified_(false) {} + Notification(const Notification&) = delete; + Notification& operator=(const Notification&) = delete; - private: - AutoHandle event_; + // Notifies all threads created with this notification to start. Must + // be called from the controller thread. + void Notify() { + std::lock_guard<std::mutex> lock(mu_); + notified_ = true; + cv_.notify_all(); + } - GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification); + // Blocks until the controller thread notifies. Must be called from a test + // thread. + void WaitForNotification() { + std::unique_lock<std::mutex> lock(mu_); + cv_.wait(lock, [this]() { return notified_; }); + } + + private: + std::mutex mu_; + std::condition_variable cv_; + bool notified_; }; -# endif // GTEST_HAS_NOTIFICATION_ +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 +#endif // GTEST_HAS_NOTIFICATION_ // On MinGW, we can have both GTEST_OS_WINDOWS and GTEST_HAS_PTHREAD // defined, but we don't want to use MinGW's pthreads implementation, which // has conformance problems with some versions of the POSIX standard. -# if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW +#if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW // As a C-function, ThreadFuncWithCLinkage cannot be templated itself. // Consequently, it cannot select a correct instantiation of ThreadWithParam @@ -1354,16 +1318,17 @@ class ThreadWithParam : public ThreadWithParamBase { // finished. pthread_t thread_; // The native thread object. - GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam); + ThreadWithParam(const ThreadWithParam&) = delete; + ThreadWithParam& operator=(const ThreadWithParam&) = delete; }; -# endif // !GTEST_OS_WINDOWS && GTEST_HAS_PTHREAD || - // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ +#endif // !GTEST_OS_WINDOWS && GTEST_HAS_PTHREAD || + // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ -# if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ +#if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ // Mutex and ThreadLocal have already been imported into the namespace. // Nothing to do here.
-# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT +#elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT // Mutex implements mutex on Windows platforms. It is used in conjunction // with class MutexLock: @@ -1417,14 +1382,15 @@ class GTEST_API_ Mutex { long critical_section_init_phase_; // NOLINT GTEST_CRITICAL_SECTION* critical_section_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex); + Mutex(const Mutex&) = delete; + Mutex& operator=(const Mutex&) = delete; }; -# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ - extern ::testing::internal::Mutex mutex +#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::Mutex mutex -# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ - ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex) +#define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ + ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex) // We cannot name this class MutexLock because the ctor declaration would // conflict with a macro named MutexLock, which is defined on some @@ -1433,15 +1399,15 @@ class GTEST_API_ Mutex { // "MutexLock l(&mu)". Hence the typedef trick below. class GTestMutexLock { public: - explicit GTestMutexLock(Mutex* mutex) - : mutex_(mutex) { mutex_->Lock(); } + explicit GTestMutexLock(Mutex* mutex) : mutex_(mutex) { mutex_->Lock(); } ~GTestMutexLock() { mutex_->Unlock(); } private: Mutex* const mutex_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock); + GTestMutexLock(const GTestMutexLock&) = delete; + GTestMutexLock& operator=(const GTestMutexLock&) = delete; }; typedef GTestMutexLock MutexLock; @@ -1468,7 +1434,8 @@ class ThreadLocalBase { virtual ~ThreadLocalBase() {} private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocalBase); + ThreadLocalBase(const ThreadLocalBase&) = delete; + ThreadLocalBase& operator=(const ThreadLocalBase&) = delete; }; // Maps a thread to a set of ThreadLocals that have values instantiated on that @@ -1497,7 +1464,7 @@ class GTEST_API_ ThreadWithParamBase { virtual void Run() = 0; }; - ThreadWithParamBase(Runnable *runnable, Notification* thread_can_start); + ThreadWithParamBase(Runnable* runnable, Notification* thread_can_start); virtual ~ThreadWithParamBase(); private: @@ -1511,30 +1478,26 @@ class ThreadWithParam : public ThreadWithParamBase { typedef void UserThreadFunc(T); ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start) - : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) { - } + : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) {} virtual ~ThreadWithParam() {} private: class RunnableImpl : public Runnable { public: - RunnableImpl(UserThreadFunc* func, T param) - : func_(func), - param_(param) { - } + RunnableImpl(UserThreadFunc* func, T param) : func_(func), param_(param) {} virtual ~RunnableImpl() {} - virtual void Run() { - func_(param_); - } + virtual void Run() { func_(param_); } private: UserThreadFunc* const func_; const T param_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(RunnableImpl); + RunnableImpl(const RunnableImpl&) = delete; + RunnableImpl& operator=(const RunnableImpl&) = delete; }; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam); + ThreadWithParam(const ThreadWithParam&) = delete; + ThreadWithParam& operator=(const ThreadWithParam&) = delete; }; // Implements thread-local storage on Windows systems. 
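The change that repeats through these hunks is mechanical: the GTEST_DISALLOW_COPY_AND_ASSIGN_ macro family is gone, and each class now spells out deleted special members. A minimal sketch of the idiom outside googletest (the Widget class is hypothetical):

  #include <utility>

  class Widget {
   public:
    Widget() = default;

    // Deleted copy operations turn accidental copies into clear
    // compile-time errors, with better diagnostics than the old
    // macro-based private-declaration trick.
    Widget(const Widget&) = delete;
    Widget& operator=(const Widget&) = delete;

    // Moves can stay available independently of copies.
    Widget(Widget&&) noexcept = default;
    Widget& operator=(Widget&&) noexcept = default;
  };

  int main() {
    Widget a;
    Widget b = std::move(a);  // OK: move is defaulted.
    // Widget c = b;          // Would not compile: copy is deleted.
    (void)b;
    return 0;
  }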
@@ -1571,7 +1534,7 @@ class ThreadLocal : public ThreadLocalBase { explicit ThreadLocal(const T& value) : default_factory_(new InstanceValueHolderFactory(value)) {} - ~ThreadLocal() { ThreadLocalRegistry::OnThreadLocalDestroyed(this); } + ~ThreadLocal() override { ThreadLocalRegistry::OnThreadLocalDestroyed(this); } T* pointer() { return GetOrCreateValue(); } const T* pointer() const { return GetOrCreateValue(); } @@ -1590,16 +1553,17 @@ class ThreadLocal : public ThreadLocalBase { private: T value_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder); + ValueHolder(const ValueHolder&) = delete; + ValueHolder& operator=(const ValueHolder&) = delete; }; - T* GetOrCreateValue() const { return static_cast<ValueHolder*>( - ThreadLocalRegistry::GetValueOnCurrentThread(this))->pointer(); + ThreadLocalRegistry::GetValueOnCurrentThread(this)) + ->pointer(); } - virtual ThreadLocalValueHolderBase* NewValueForCurrentThread() const { + ThreadLocalValueHolderBase* NewValueForCurrentThread() const override { return default_factory_->MakeNewHolder(); } @@ -1610,7 +1574,8 @@ class ThreadLocal : public ThreadLocalBase { virtual ValueHolder* MakeNewHolder() const = 0; private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory); + ValueHolderFactory(const ValueHolderFactory&) = delete; + ValueHolderFactory& operator=(const ValueHolderFactory&) = delete; }; class DefaultValueHolderFactory : public ValueHolderFactory { @@ -1619,7 +1584,9 @@ class ThreadLocal : public ThreadLocalBase { ValueHolder* MakeNewHolder() const override { return new ValueHolder(); } private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory); + DefaultValueHolderFactory(const DefaultValueHolderFactory&) = delete; + DefaultValueHolderFactory& operator=(const DefaultValueHolderFactory&) = + delete; }; class InstanceValueHolderFactory : public ValueHolderFactory { @@ -1632,15 +1599,18 @@ class ThreadLocal : public ThreadLocalBase { private: const T value_; // The value for each thread. - GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory); + InstanceValueHolderFactory(const InstanceValueHolderFactory&) = delete; + InstanceValueHolderFactory& operator=(const InstanceValueHolderFactory&) = + delete; }; std::unique_ptr<ValueHolderFactory> default_factory_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal); + ThreadLocal(const ThreadLocal&) = delete; + ThreadLocal& operator=(const ThreadLocal&) = delete; }; -# elif GTEST_HAS_PTHREAD +#elif GTEST_HAS_PTHREAD // MutexBase and Mutex implement mutex on pthreads-based platforms. class MutexBase { @@ -1687,8 +1657,8 @@ class MutexBase { }; // Forward-declares a static mutex. -# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ - extern ::testing::internal::MutexBase mutex +#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::MutexBase mutex // Defines and statically (i.e. at link time) initializes a static mutex. // The initialization list here does not explicitly initialize each field, @@ -1707,12 +1677,11 @@ class Mutex : public MutexBase { GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr)); has_owner_ = false; } - ~Mutex() { - GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_)); - } + ~Mutex() { GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_)); } private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex); + Mutex(const Mutex&) = delete; + Mutex& operator=(const Mutex&) = delete; }; // We cannot name this class MutexLock because the ctor declaration would // conflict with a macro named MutexLock, which is defined on some // platforms. That macro is used as a defensive measure to prevent against // inadvertent misuses of MutexLock: "MutexLock l(&mu)". Hence the typedef trick below.
class GTestMutexLock { public: - explicit GTestMutexLock(MutexBase* mutex) - : mutex_(mutex) { mutex_->Lock(); } + explicit GTestMutexLock(MutexBase* mutex) : mutex_(mutex) { mutex_->Lock(); } ~GTestMutexLock() { mutex_->Unlock(); } private: MutexBase* const mutex_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock); + GTestMutexLock(const GTestMutexLock&) = delete; + GTestMutexLock& operator=(const GTestMutexLock&) = delete; }; typedef GTestMutexLock MutexLock; @@ -1787,7 +1756,8 @@ class GTEST_API_ ThreadLocal { private: T value_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder); + ValueHolder(const ValueHolder&) = delete; + ValueHolder& operator=(const ValueHolder&) = delete; }; static pthread_key_t CreateKey() { @@ -1819,7 +1789,8 @@ class GTEST_API_ ThreadLocal { virtual ValueHolder* MakeNewHolder() const = 0; private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory); + ValueHolderFactory(const ValueHolderFactory&) = delete; + ValueHolderFactory& operator=(const ValueHolderFactory&) = delete; }; class DefaultValueHolderFactory : public ValueHolderFactory { @@ -1828,7 +1799,9 @@ class GTEST_API_ ThreadLocal { ValueHolder* MakeNewHolder() const override { return new ValueHolder(); } private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory); + DefaultValueHolderFactory(const DefaultValueHolderFactory&) = delete; + DefaultValueHolderFactory& operator=(const DefaultValueHolderFactory&) = + delete; }; class InstanceValueHolderFactory : public ValueHolderFactory { @@ -1841,17 +1814,20 @@ class GTEST_API_ ThreadLocal { private: const T value_; // The value for each thread. - GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory); + InstanceValueHolderFactory(const InstanceValueHolderFactory&) = delete; + InstanceValueHolderFactory& operator=(const InstanceValueHolderFactory&) = + delete; }; // A key pthreads uses for looking up per-thread values. const pthread_key_t key_; std::unique_ptr<ValueHolderFactory> default_factory_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal); + ThreadLocal(const ThreadLocal&) = delete; + ThreadLocal& operator=(const ThreadLocal&) = delete; }; -# endif // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ +#endif // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ #else // GTEST_IS_THREADSAFE @@ -1868,10 +1844,10 @@ class Mutex { void AssertHeld() const {} }; -# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ +#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ extern ::testing::internal::Mutex mutex -# define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex +#define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex // We cannot name this class MutexLock because the ctor declaration would // conflict with a macro named MutexLock, which is defined on some @@ -1894,6 +1870,7 @@ class GTEST_API_ ThreadLocal { const T* pointer() const { return &value_; } const T& get() const { return value_; } void set(const T& value) { value_ = value; } + private: T value_; }; @@ -1905,11 +1882,11 @@ class GTEST_API_ ThreadLocal { GTEST_API_ size_t GetThreadCount(); #if GTEST_OS_WINDOWS -# define GTEST_PATH_SEP_ "\\" -# define GTEST_HAS_ALT_PATH_SEP_ 1 +#define GTEST_PATH_SEP_ "\\" +#define GTEST_HAS_ALT_PATH_SEP_ 1 #else -# define GTEST_PATH_SEP_ "/" -# define GTEST_HAS_ALT_PATH_SEP_ 0 +#define GTEST_PATH_SEP_ "/" +#define GTEST_HAS_ALT_PATH_SEP_ 0 #endif // GTEST_OS_WINDOWS // Utilities for char.
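All three ThreadLocal backends in the hunks above (Windows registry-based, pthread_key_t-based, and the single-threaded stub) expose the same small surface: pointer(), get(), and set(). A usage sketch, under the assumption that it runs inside googletest's own sources, since ::testing::internal::ThreadLocal is not public API:

  // Each thread lazily gets its own value-initialized int (i.e. 0).
  static ::testing::internal::ThreadLocal<int> tls_counter;

  void BumpPerThreadCounter() {
    // get()/set() only touch the calling thread's copy of the value.
    tls_counter.set(tls_counter.get() + 1);
  }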
@@ -1967,8 +1944,7 @@ inline char ToUpper(char ch) { inline std::string StripTrailingSpaces(std::string str) { std::string::iterator it = str.end(); - while (it != str.begin() && IsSpace(*--it)) - it = str.erase(it); + while (it != str.begin() && IsSpace(*--it)) it = str.erase(it); return str; } @@ -1986,36 +1962,35 @@ namespace posix { typedef struct _stat StatStruct; -# ifdef __BORLANDC__ +#ifdef __BORLANDC__ inline int DoIsATTY(int fd) { return isatty(fd); } inline int StrCaseCmp(const char* s1, const char* s2) { return stricmp(s1, s2); } inline char* StrDup(const char* src) { return strdup(src); } -# else // !__BORLANDC__ -# if GTEST_OS_WINDOWS_MOBILE +#else // !__BORLANDC__ +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS || GTEST_OS_IOS || \ + GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT || defined(ESP_PLATFORM) inline int DoIsATTY(int /* fd */) { return 0; } -# else +#else inline int DoIsATTY(int fd) { return _isatty(fd); } -# endif // GTEST_OS_WINDOWS_MOBILE +#endif // GTEST_OS_WINDOWS_MOBILE inline int StrCaseCmp(const char* s1, const char* s2) { return _stricmp(s1, s2); } inline char* StrDup(const char* src) { return _strdup(src); } -# endif // __BORLANDC__ +#endif // __BORLANDC__ -# if GTEST_OS_WINDOWS_MOBILE +#if GTEST_OS_WINDOWS_MOBILE inline int FileNo(FILE* file) { return reinterpret_cast<int>(_fileno(file)); } // Stat(), RmDir(), and IsDir() are not needed on Windows CE at this // time and thus not defined there. -# else +#else inline int FileNo(FILE* file) { return _fileno(file); } inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); } inline int RmDir(const char* dir) { return _rmdir(dir); } -inline bool IsDir(const StatStruct& st) { - return (_S_IFDIR & st.st_mode) != 0; -} -# endif // GTEST_OS_WINDOWS_MOBILE +inline bool IsDir(const StatStruct& st) { return (_S_IFDIR & st.st_mode) != 0; } +#endif // GTEST_OS_WINDOWS_MOBILE #elif GTEST_OS_ESP8266 typedef struct stat StatStruct; @@ -2079,12 +2054,12 @@ inline FILE* FOpen(const char* path, const char* mode) { std::wstring wide_path = converter.from_bytes(path); std::wstring wide_mode = converter.from_bytes(mode); return _wfopen(wide_path.c_str(), wide_mode.c_str()); -#else // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW +#else // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW return fopen(path, mode); #endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW } #if !GTEST_OS_WINDOWS_MOBILE -inline FILE *FReopen(const char* path, const char* mode, FILE* stream) { +inline FILE* FReopen(const char* path, const char* mode, FILE* stream) { return freopen(path, mode, stream); } inline FILE* FDOpen(int fd, const char* mode) { return fdopen(fd, mode); } @@ -2136,13 +2111,13 @@ GTEST_DISABLE_MSC_DEPRECATED_POP_() // snprintf is a variadic function. #if _MSC_VER && !GTEST_OS_WINDOWS_MOBILE // MSVC 2005 and above support variadic macros. -# define GTEST_SNPRINTF_(buffer, size, format, ...) \ - _snprintf_s(buffer, size, size, format, __VA_ARGS__) +#define GTEST_SNPRINTF_(buffer, size, format, ...) \ + _snprintf_s(buffer, size, size, format, __VA_ARGS__) #elif defined(_MSC_VER) // Windows CE does not define _snprintf_s -# define GTEST_SNPRINTF_ _snprintf +#define GTEST_SNPRINTF_ _snprintf #else -# define GTEST_SNPRINTF_ snprintf +#define GTEST_SNPRINTF_ snprintf #endif // The biggest signed integer type the compiler supports. @@ -2202,37 +2177,84 @@ using TimeInMillis = int64_t; // Represents time in milliseconds. // Macro for referencing flags.
#if !defined(GTEST_FLAG) -# define GTEST_FLAG(name) FLAGS_gtest_##name +#define GTEST_FLAG_NAME_(name) gtest_##name +#define GTEST_FLAG(name) FLAGS_gtest_##name #endif // !defined(GTEST_FLAG) -#if !defined(GTEST_USE_OWN_FLAGFILE_FLAG_) -# define GTEST_USE_OWN_FLAGFILE_FLAG_ 1 -#endif // !defined(GTEST_USE_OWN_FLAGFILE_FLAG_) +// Pick a command line flags implementation. +#if GTEST_HAS_ABSL -#if !defined(GTEST_DECLARE_bool_) -# define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver +// Macros for defining flags. +#define GTEST_DEFINE_bool_(name, default_val, doc) \ + ABSL_FLAG(bool, GTEST_FLAG_NAME_(name), default_val, doc) +#define GTEST_DEFINE_int32_(name, default_val, doc) \ + ABSL_FLAG(int32_t, GTEST_FLAG_NAME_(name), default_val, doc) +#define GTEST_DEFINE_string_(name, default_val, doc) \ + ABSL_FLAG(std::string, GTEST_FLAG_NAME_(name), default_val, doc) // Macros for declaring flags. -# define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name) -# define GTEST_DECLARE_int32_(name) \ - GTEST_API_ extern std::int32_t GTEST_FLAG(name) -# define GTEST_DECLARE_string_(name) \ - GTEST_API_ extern ::std::string GTEST_FLAG(name) +#define GTEST_DECLARE_bool_(name) \ + ABSL_DECLARE_FLAG(bool, GTEST_FLAG_NAME_(name)) +#define GTEST_DECLARE_int32_(name) \ + ABSL_DECLARE_FLAG(int32_t, GTEST_FLAG_NAME_(name)) +#define GTEST_DECLARE_string_(name) \ + ABSL_DECLARE_FLAG(std::string, GTEST_FLAG_NAME_(name)) + +#define GTEST_FLAG_SAVER_ ::absl::FlagSaver + +#define GTEST_FLAG_GET(name) ::absl::GetFlag(GTEST_FLAG(name)) +#define GTEST_FLAG_SET(name, value) \ + (void)(::absl::SetFlag(&GTEST_FLAG(name), value)) +#define GTEST_USE_OWN_FLAGFILE_FLAG_ 0 + +#else // GTEST_HAS_ABSL // Macros for defining flags. -# define GTEST_DEFINE_bool_(name, default_val, doc) \ - GTEST_API_ bool GTEST_FLAG(name) = (default_val) -# define GTEST_DEFINE_int32_(name, default_val, doc) \ - GTEST_API_ std::int32_t GTEST_FLAG(name) = (default_val) -# define GTEST_DEFINE_string_(name, default_val, doc) \ - GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val) +#define GTEST_DEFINE_bool_(name, default_val, doc) \ + namespace testing { \ + GTEST_API_ bool GTEST_FLAG(name) = (default_val); \ + } \ + static_assert(true, "no-op to require trailing semicolon") +#define GTEST_DEFINE_int32_(name, default_val, doc) \ + namespace testing { \ + GTEST_API_ std::int32_t GTEST_FLAG(name) = (default_val); \ + } \ + static_assert(true, "no-op to require trailing semicolon") +#define GTEST_DEFINE_string_(name, default_val, doc) \ + namespace testing { \ + GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val); \ + } \ + static_assert(true, "no-op to require trailing semicolon") -#endif // !defined(GTEST_DECLARE_bool_) +// Macros for declaring flags.
+#define GTEST_DECLARE_bool_(name) \ + namespace testing { \ + GTEST_API_ extern bool GTEST_FLAG(name); \ + } \ + static_assert(true, "no-op to require trailing semicolon") +#define GTEST_DECLARE_int32_(name) \ + namespace testing { \ + GTEST_API_ extern std::int32_t GTEST_FLAG(name); \ + } \ + static_assert(true, "no-op to require trailing semicolon") +#define GTEST_DECLARE_string_(name) \ + namespace testing { \ + GTEST_API_ extern ::std::string GTEST_FLAG(name); \ + } \ + static_assert(true, "no-op to require trailing semicolon") + +#define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver + +#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name) +#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value) +#define GTEST_USE_OWN_FLAGFILE_FLAG_ 1 + +#endif // GTEST_HAS_ABSL // Thread annotations #if !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_) -# define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks) -# define GTEST_LOCK_EXCLUDED_(locks) +#define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks) +#define GTEST_LOCK_EXCLUDED_(locks) #endif // !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_) // Parses 'str' for a 32-bit signed integer. If successful, writes the result @@ -2308,6 +2330,7 @@ namespace testing { namespace internal { template <typename T> using Optional = ::absl::optional<T>; +inline ::absl::nullopt_t Nullopt() { return ::absl::nullopt; } } // namespace internal } // namespace testing #else @@ -2321,6 +2344,7 @@ namespace testing { namespace internal { template <typename T> using Optional = ::std::optional<T>; +inline ::std::nullopt_t Nullopt() { return ::std::nullopt; } } // namespace internal } // namespace testing // The case where absl is configured NOT to alias std::optional is not @@ -2332,7 +2356,7 @@ using Optional = ::std::optional<T>; #if GTEST_HAS_ABSL // Always use absl::string_view for Matcher<> specializations if googletest // is built with absl support. -# define GTEST_INTERNAL_HAS_STRING_VIEW 1 +#define GTEST_INTERNAL_HAS_STRING_VIEW 1 #include "absl/strings/string_view.h" namespace testing { namespace internal { using StringView = ::absl::string_view; } // namespace internal } // namespace testing #else -# ifdef __has_include -# if __has_include(<string_view>) && __cplusplus >= 201703L +#ifdef __has_include +#if __has_include(<string_view>) && __cplusplus >= 201703L // Otherwise for C++17 and higher use std::string_view for Matcher<> // specializations. -# define GTEST_INTERNAL_HAS_STRING_VIEW 1 +#define GTEST_INTERNAL_HAS_STRING_VIEW 1 #include <string_view> namespace testing { namespace internal { using StringView = ::std::string_view; } // namespace internal } // namespace testing // The case where absl is configured NOT to alias std::string_view is not // supported. -# endif // __has_include(<string_view>) && __cplusplus >= 201703L -# endif // __has_include +#endif // __has_include(<string_view>) && __cplusplus >= 201703L +#endif // __has_include #endif // GTEST_HAS_ABSL #if GTEST_HAS_ABSL diff --git a/third_party/googletest/src/include/gtest/internal/gtest-string.h b/third_party/googletest/src/include/gtest/internal/gtest-string.h index 10f774f966..cca2e1f2ad 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-string.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-string.h @@ -26,7 +26,7 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-// + // The Google C++ Testing and Mocking Framework (Google Test) // // This header file declares the String class and functions used internally by @@ -36,17 +36,20 @@ // This header file is #included by gtest-internal.h. // It should not be #included by other files. -// GOOGLETEST_CM0001 DO NOT DELETE +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ #define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ #ifdef __BORLANDC__ // string.h is not guaranteed to provide strcpy on C++ Builder. -# include <mem.h> +#include <mem.h> #endif #include <string.h> + #include <cstdint> #include <string> @@ -123,8 +126,7 @@ class GTEST_API_ String { // Unlike strcasecmp(), this function can handle NULL argument(s). // A NULL C string is considered different to any non-NULL C string, // including the empty string. - static bool CaseInsensitiveCStringEquals(const char* lhs, - const char* rhs); + static bool CaseInsensitiveCStringEquals(const char* lhs, const char* rhs); // Compares two wide C strings, ignoring case. Returns true if and only if // they have the same content. @@ -143,8 +145,8 @@ class GTEST_API_ String { // Returns true if and only if the given string ends with the given suffix, // ignoring case. Any string is considered to end with an empty suffix. - static bool EndsWithCaseInsensitive( - const std::string& str, const std::string& suffix); + static bool EndsWithCaseInsensitive(const std::string& str, + const std::string& suffix); // Formats an int value as "%02d". static std::string FormatIntWidth2(int value); // "%02d" for width == 2 @@ -163,7 +165,7 @@ class GTEST_API_ String { private: String(); // Not meant to be instantiated. -}; // class String +}; // class String // Gets the content of the stringstream's buffer as an std::string. Each '\0' // character in the buffer is replaced with "\\0". diff --git a/third_party/googletest/src/include/gtest/internal/gtest-type-util.h b/third_party/googletest/src/include/gtest/internal/gtest-type-util.h index b87a2e2cac..6bc02a7de3 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-type-util.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-type-util.h @@ -30,7 +30,9 @@ // Type utilities needed for implementing typed and type-parameterized // tests. -// GOOGLETEST_CM0001 DO NOT DELETE +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ #define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ @@ -39,11 +41,11 @@ // #ifdef __GNUC__ is too general here. It is possible to use gcc without using // libstdc++ (which is where cxxabi.h comes from).
-# if GTEST_HAS_CXXABI_H_ -# include <cxxabi.h> -# elif defined(__HP_aCC) -# include <acxx_demangle.h> -# endif // GTEST_HASH_CXXABI_H_ +#if GTEST_HAS_CXXABI_H_ +#include <cxxabi.h> +#elif defined(__HP_aCC) +#include <acxx_demangle.h> +#endif // GTEST_HASH_CXXABI_H_ namespace testing { namespace internal { @@ -101,7 +103,9 @@ std::string GetTypeName() { // A unique type indicating an empty node struct None {}; -# define GTEST_TEMPLATE_ template <typename T> class +#define GTEST_TEMPLATE_ \ + template <typename T> \ + class // The template "selector" struct TemplateSel<Tmpl> is used to // represent Tmpl, which must be a class template with one type @@ -119,8 +123,7 @@ struct TemplateSel { }; }; -# define GTEST_BIND_(TmplSel, T) \ - TmplSel::template Bind<T>::type +#define GTEST_BIND_(TmplSel, T) TmplSel::template Bind<T>::type template <GTEST_TEMPLATE_ Head_, GTEST_TEMPLATE_... Tail_> struct Templates { diff --git a/third_party/googletest/src/src/gtest-all.cc b/third_party/googletest/src/src/gtest-all.cc index ad292905cf..2a70ed88c7 100644 --- a/third_party/googletest/src/src/gtest-all.cc +++ b/third_party/googletest/src/src/gtest-all.cc @@ -38,7 +38,7 @@ #include "gtest/gtest.h" // The following lines pull in the real gtest *.cc files. -#include "src/gtest.cc" +#include "src/gtest-assertion-result.cc" #include "src/gtest-death-test.cc" #include "src/gtest-filepath.cc" #include "src/gtest-matchers.cc" @@ -46,3 +46,4 @@ #include "src/gtest-printers.cc" #include "src/gtest-test-part.cc" #include "src/gtest-typed-test.cc" +#include "src/gtest.cc" diff --git a/third_party/googletest/src/src/gtest-assertion-result.cc b/third_party/googletest/src/src/gtest-assertion-result.cc new file mode 100644 index 0000000000..f1c0b10dc9 --- /dev/null +++ b/third_party/googletest/src/src/gtest-assertion-result.cc @@ -0,0 +1,77 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This file defines the AssertionResult type. + +#include "gtest/gtest-assertion-result.h" + +#include <string> +#include <utility> + +#include "gtest/gtest-message.h" + +namespace testing { + +// AssertionResult constructors.
+// Used in EXPECT_TRUE/FALSE(assertion_result). +AssertionResult::AssertionResult(const AssertionResult& other) + : success_(other.success_), + message_(other.message_.get() != nullptr + ? new ::std::string(*other.message_) + : static_cast< ::std::string*>(nullptr)) {} + +// Swaps two AssertionResults. +void AssertionResult::swap(AssertionResult& other) { + using std::swap; + swap(success_, other.success_); + swap(message_, other.message_); +} + +// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. +AssertionResult AssertionResult::operator!() const { + AssertionResult negation(!success_); + if (message_.get() != nullptr) negation << *message_; + return negation; +} + +// Makes a successful assertion result. +AssertionResult AssertionSuccess() { return AssertionResult(true); } + +// Makes a failed assertion result. +AssertionResult AssertionFailure() { return AssertionResult(false); } + +// Makes a failed assertion result with the given failure message. +// Deprecated; use AssertionFailure() << message. +AssertionResult AssertionFailure(const Message& message) { + return AssertionFailure() << message; +} + +} // namespace testing diff --git a/third_party/googletest/src/src/gtest-death-test.cc b/third_party/googletest/src/src/gtest-death-test.cc index bf4f6331da..e6abc6278a 100644 --- a/third_party/googletest/src/src/gtest-death-test.cc +++ b/third_party/googletest/src/src/gtest-death-test.cc @@ -35,49 +35,49 @@ #include #include -#include "gtest/internal/gtest-port.h" #include "gtest/internal/custom/gtest.h" +#include "gtest/internal/gtest-port.h" #if GTEST_HAS_DEATH_TEST -# if GTEST_OS_MAC -# include -# endif // GTEST_OS_MAC - -# include -# include -# include - -# if GTEST_OS_LINUX -# include -# endif // GTEST_OS_LINUX - -# include - -# if GTEST_OS_WINDOWS -# include -# else -# include -# include -# endif // GTEST_OS_WINDOWS - -# if GTEST_OS_QNX -# include -# endif // GTEST_OS_QNX - -# if GTEST_OS_FUCHSIA -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# endif // GTEST_OS_FUCHSIA +#if GTEST_OS_MAC +#include +#endif // GTEST_OS_MAC + +#include +#include +#include + +#if GTEST_OS_LINUX +#include +#endif // GTEST_OS_LINUX + +#include + +#if GTEST_OS_WINDOWS +#include +#else +#include +#include +#endif // GTEST_OS_WINDOWS + +#if GTEST_OS_QNX +#include +#endif // GTEST_OS_QNX + +#if GTEST_OS_FUCHSIA +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif // GTEST_OS_FUCHSIA #endif // GTEST_HAS_DEATH_TEST @@ -96,9 +96,12 @@ namespace testing { // used internally at Google, is "threadsafe". static const char kDefaultDeathTestStyle[] = GTEST_DEFAULT_DEATH_TEST_STYLE; +} // namespace testing + GTEST_DEFINE_string_( death_test_style, - internal::StringFromGTestEnv("death_test_style", kDefaultDeathTestStyle), + testing::internal::StringFromGTestEnv("death_test_style", + testing::kDefaultDeathTestStyle), "Indicates how to run a death test in a forked child process: " "\"threadsafe\" (child process re-executes the test binary " "from the beginning, running only the specific death test) or " @@ -107,7 +110,7 @@ GTEST_DEFINE_string_( GTEST_DEFINE_bool_( death_test_use_fork, - internal::BoolFromGTestEnv("death_test_use_fork", false), + testing::internal::BoolFromGTestEnv("death_test_use_fork", false), "Instructs to use fork()/_exit() instead of clone() in death tests. " "Ignored and always uses fork() on POSIX systems where clone() is not " "implemented. 
Useful when running under valgrind or similar tools if " @@ -117,7 +120,6 @@ GTEST_DEFINE_bool_( "work in 99% of the cases. Once valgrind is fixed, this flag will " "most likely be removed."); -namespace internal { GTEST_DEFINE_string_( internal_run_death_test, "", "Indicates the file, line number, temporal index of " @@ -126,7 +128,8 @@ GTEST_DEFINE_string_( "the '|' characters. This flag is specified if and only if the " "current process is a sub-process launched for running a thread-safe " "death test. FOR INTERNAL USE ONLY."); -} // namespace internal + +namespace testing { #if GTEST_HAS_DEATH_TEST @@ -134,9 +137,9 @@ namespace internal { // Valid only for fast death tests. Indicates the code is running in the // child process of a fast style death test. -# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA static bool g_in_fast_death_test_child = false; -# endif +#endif // Returns a Boolean value indicating whether the caller is currently // executing in the context of the death test child process. Tools such as @@ -144,16 +147,16 @@ static bool g_in_fast_death_test_child = false; // tests. IMPORTANT: This is an internal utility. Using it may break the // implementation of death tests. User code MUST NOT use it. bool InDeathTestChild() { -# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA +#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA // On Windows and Fuchsia, death tests are thread-safe regardless of the value // of the death_test_style flag. - return !GTEST_FLAG(internal_run_death_test).empty(); + return !GTEST_FLAG_GET(internal_run_death_test).empty(); -# else +#else - if (GTEST_FLAG(death_test_style) == "threadsafe") - return !GTEST_FLAG(internal_run_death_test).empty(); + if (GTEST_FLAG_GET(death_test_style) == "threadsafe") + return !GTEST_FLAG_GET(internal_run_death_test).empty(); else return g_in_fast_death_test_child; #endif @@ -162,40 +165,38 @@ bool InDeathTestChild() { } // namespace internal // ExitedWithCode constructor. -ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) { -} +ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {} // ExitedWithCode function-call operator. bool ExitedWithCode::operator()(int exit_status) const { -# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA +#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA return exit_status == exit_code_; -# else +#else return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_; -# endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA +#endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA } -# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA // KilledBySignal constructor. -KilledBySignal::KilledBySignal(int signum) : signum_(signum) { -} +KilledBySignal::KilledBySignal(int signum) : signum_(signum) {} // KilledBySignal function-call operator. 
bool KilledBySignal::operator()(int exit_status) const { -# if defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_) +#if defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_) { bool result; if (GTEST_KILLED_BY_SIGNAL_OVERRIDE_(signum_, exit_status, &result)) { return result; } } -# endif // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_) +#endif // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_) return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_; } -# endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +#endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA namespace internal { @@ -206,23 +207,23 @@ namespace internal { static std::string ExitSummary(int exit_code) { Message m; -# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA +#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA m << "Exited with exit status " << exit_code; -# else +#else if (WIFEXITED(exit_code)) { m << "Exited with exit status " << WEXITSTATUS(exit_code); } else if (WIFSIGNALED(exit_code)) { m << "Terminated by signal " << WTERMSIG(exit_code); } -# ifdef WCOREDUMP +#ifdef WCOREDUMP if (WCOREDUMP(exit_code)) { m << " (core dumped)"; } -# endif -# endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA +#endif +#endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA return m.GetString(); } @@ -233,7 +234,7 @@ bool ExitedUnsuccessfully(int exit_status) { return !ExitedWithCode(0)(exit_status); } -# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA // Generates a textual failure message when a death test finds more than // one thread running, or cannot determine the number of threads, prior // to executing the given statement. It is the responsibility of the @@ -254,7 +255,7 @@ static std::string DeathTestThreadWarning(size_t thread_count) { << " this is the last message you see before your test times out."; return msg.GetString(); } -# endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +#endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA // Flag characters for reporting a death test that did not die. static const char kDeathTestLived = 'L'; @@ -304,14 +305,14 @@ static void DeathTestAbort(const std::string& message) { // A replacement for CHECK that calls DeathTestAbort if the assertion // fails. -# define GTEST_DEATH_TEST_CHECK_(expression) \ - do { \ - if (!::testing::internal::IsTrue(expression)) { \ - DeathTestAbort( \ - ::std::string("CHECK failed: File ") + __FILE__ + ", line " \ - + ::testing::internal::StreamableToString(__LINE__) + ": " \ - + #expression); \ - } \ +#define GTEST_DEATH_TEST_CHECK_(expression) \ + do { \ + if (!::testing::internal::IsTrue(expression)) { \ + DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ + \ + ", line " + \ + ::testing::internal::StreamableToString(__LINE__) + \ + ": " + #expression); \ + } \ } while (::testing::internal::AlwaysFalse()) // This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for @@ -321,23 +322,23 @@ static void DeathTestAbort(const std::string& message) { // evaluates the expression as long as it evaluates to -1 and sets // errno to EINTR. If the expression evaluates to -1 but errno is // something other than EINTR, DeathTestAbort is called. 
-# define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \ - do { \ - int gtest_retval; \ - do { \ - gtest_retval = (expression); \ - } while (gtest_retval == -1 && errno == EINTR); \ - if (gtest_retval == -1) { \ - DeathTestAbort( \ - ::std::string("CHECK failed: File ") + __FILE__ + ", line " \ - + ::testing::internal::StreamableToString(__LINE__) + ": " \ - + #expression + " != -1"); \ - } \ +#define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \ + do { \ + int gtest_retval; \ + do { \ + gtest_retval = (expression); \ + } while (gtest_retval == -1 && errno == EINTR); \ + if (gtest_retval == -1) { \ + DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ + \ + ", line " + \ + ::testing::internal::StreamableToString(__LINE__) + \ + ": " + #expression + " != -1"); \ + } \ } while (::testing::internal::AlwaysFalse()) // Returns the message describing the last system error in errno. std::string GetLastErrnoDescription() { - return errno == 0 ? "" : posix::StrError(errno); + return errno == 0 ? "" : posix::StrError(errno); } // This is called from a death test parent process to read a failure @@ -370,8 +371,9 @@ static void FailFromInternalError(int fd) { DeathTest::DeathTest() { TestInfo* const info = GetUnitTestImpl()->current_test_info(); if (info == nullptr) { - DeathTestAbort("Cannot run a death test outside of a TEST or " - "TEST_F construct"); + DeathTestAbort( + "Cannot run a death test outside of a TEST or " + "TEST_F construct"); } } @@ -500,9 +502,7 @@ void DeathTestImpl::ReadAndInterpretStatusByte() { set_read_fd(-1); } -std::string DeathTestImpl::GetErrorLogs() { - return GetCapturedStderr(); -} +std::string DeathTestImpl::GetErrorLogs() { return GetCapturedStderr(); } // Signals that the death test code which should have exited, didn't. // Should be called only in a death test child process. @@ -512,9 +512,9 @@ void DeathTestImpl::Abort(AbortReason reason) { // The parent process considers the death test to be a failure if // it finds any data in our pipe. So, here we write a single flag byte // to the pipe, then exit. - const char status_ch = - reason == TEST_DID_NOT_DIE ? kDeathTestLived : - reason == TEST_THREW_EXCEPTION ? kDeathTestThrew : kDeathTestReturned; + const char status_ch = reason == TEST_DID_NOT_DIE ? kDeathTestLived + : reason == TEST_THREW_EXCEPTION ? kDeathTestThrew + : kDeathTestReturned; GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1)); // We are leaking the descriptor here because on some platforms (i.e., @@ -533,7 +533,7 @@ void DeathTestImpl::Abort(AbortReason reason) { // much easier. static ::std::string FormatDeathTestOutput(const ::std::string& output) { ::std::string ret; - for (size_t at = 0; ; ) { + for (size_t at = 0;;) { const size_t line_end = output.find('\n', at); ret += "[ DEATH ] "; if (line_end == ::std::string::npos) { @@ -568,8 +568,7 @@ static ::std::string FormatDeathTestOutput(const ::std::string& output) { // the first failing condition, in the order given above, is the one that is // reported. Also sets the last death test message string. 
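The Abort path above is the child's half of a one-byte protocol: the parent treats an empty pipe as "the statement died as expected" and any flag byte ('L' lived, 'R' returned, 'T' threw) as a failure. Condensed into a sketch with assumed names:

    #include <cerrno>
    #include <unistd.h>

    // Child side: write a single status byte, retrying on EINTR. The fd is
    // deliberately not closed; the process exits immediately afterwards.
    void ReportAbortSketch(int write_fd, char status_ch) {
      ssize_t n;
      do {
        n = write(write_fd, &status_ch, 1);
      } while (n == -1 && errno == EINTR);
    }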
bool DeathTestImpl::Passed(bool status_ok) { - if (!spawned()) - return false; + if (!spawned()) return false; const std::string error_message = GetErrorLogs(); @@ -580,15 +579,18 @@ bool DeathTestImpl::Passed(bool status_ok) { switch (outcome()) { case LIVED: buffer << " Result: failed to die.\n" - << " Error msg:\n" << FormatDeathTestOutput(error_message); + << " Error msg:\n" + << FormatDeathTestOutput(error_message); break; case THREW: buffer << " Result: threw an exception.\n" - << " Error msg:\n" << FormatDeathTestOutput(error_message); + << " Error msg:\n" + << FormatDeathTestOutput(error_message); break; case RETURNED: buffer << " Result: illegal return in test statement.\n" - << " Error msg:\n" << FormatDeathTestOutput(error_message); + << " Error msg:\n" + << FormatDeathTestOutput(error_message); break; case DIED: if (status_ok) { @@ -605,7 +607,8 @@ bool DeathTestImpl::Passed(bool status_ok) { } else { buffer << " Result: died but not with expected exit code:\n" << " " << ExitSummary(status()) << "\n" - << "Actual msg:\n" << FormatDeathTestOutput(error_message); + << "Actual msg:\n" + << FormatDeathTestOutput(error_message); } break; case IN_PROGRESS: @@ -618,7 +621,7 @@ bool DeathTestImpl::Passed(bool status_ok) { return success; } -# if GTEST_OS_WINDOWS +#if GTEST_OS_WINDOWS // WindowsDeathTest implements death tests on Windows. Due to the // specifics of starting new processes on Windows, death tests there are // always threadsafe, and Google Test considers the @@ -679,14 +682,12 @@ class WindowsDeathTest : public DeathTestImpl { // status, or 0 if no child process exists. As a side effect, sets the // outcome data member. int WindowsDeathTest::Wait() { - if (!spawned()) - return 0; + if (!spawned()) return 0; // Wait until the child either signals that it has acquired the write end // of the pipe or it dies. - const HANDLE wait_handles[2] = { child_handle_.Get(), event_handle_.Get() }; - switch (::WaitForMultipleObjects(2, - wait_handles, + const HANDLE wait_handles[2] = {child_handle_.Get(), event_handle_.Get()}; + switch (::WaitForMultipleObjects(2, wait_handles, FALSE, // Waits for any of the handles. INFINITE)) { case WAIT_OBJECT_0: @@ -707,9 +708,8 @@ int WindowsDeathTest::Wait() { // returns immediately if the child has already exited, regardless of // whether previous calls to WaitForMultipleObjects synchronized on this // handle or not. - GTEST_DEATH_TEST_CHECK_( - WAIT_OBJECT_0 == ::WaitForSingleObject(child_handle_.Get(), - INFINITE)); + GTEST_DEATH_TEST_CHECK_(WAIT_OBJECT_0 == + ::WaitForSingleObject(child_handle_.Get(), INFINITE)); DWORD status_code; GTEST_DEATH_TEST_CHECK_( ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE); @@ -742,12 +742,12 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() { SECURITY_ATTRIBUTES handles_are_inheritable = {sizeof(SECURITY_ATTRIBUTES), nullptr, TRUE}; HANDLE read_handle, write_handle; - GTEST_DEATH_TEST_CHECK_( - ::CreatePipe(&read_handle, &write_handle, &handles_are_inheritable, - 0) // Default buffer size. - != FALSE); - set_read_fd(::_open_osfhandle(reinterpret_cast(read_handle), - O_RDONLY)); + GTEST_DEATH_TEST_CHECK_(::CreatePipe(&read_handle, &write_handle, + &handles_are_inheritable, + 0) // Default buffer size. + != FALSE); + set_read_fd( + ::_open_osfhandle(reinterpret_cast(read_handle), O_RDONLY)); write_handle_.Reset(write_handle); event_handle_.Reset(::CreateEvent( &handles_are_inheritable, @@ -756,27 +756,26 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() { nullptr)); // The even is unnamed. 
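The hunk above creates an inheritable anonymous pipe (plus a manual-reset event) so the child process can hand status back to the parent. The pipe half, reduced to a sketch under the usual Win32/CRT headers (illustrative function name):

    #include <windows.h>
    #include <fcntl.h>  // O_RDONLY
    #include <io.h>     // _open_osfhandle

    // Returns a CRT fd wrapping the read end; the inheritable write HANDLE is
    // passed to the child. Returns -1 on failure.
    int MakeInheritablePipeSketch(HANDLE* write_handle_out) {
      SECURITY_ATTRIBUTES sa = {sizeof(SECURITY_ATTRIBUTES), nullptr,
                                TRUE /* bInheritHandle */};
      HANDLE read_h = nullptr;
      HANDLE write_h = nullptr;
      if (!::CreatePipe(&read_h, &write_h, &sa, 0 /* default buffer */))
        return -1;
      *write_handle_out = write_h;
      return _open_osfhandle(reinterpret_cast<intptr_t>(read_h), O_RDONLY);
    }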
GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != nullptr); const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ + - kFilterFlag + "=" + info->test_suite_name() + - "." + info->name(); + "filter=" + info->test_suite_name() + "." + + info->name(); const std::string internal_flag = - std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + - "=" + file_ + "|" + StreamableToString(line_) + "|" + - StreamableToString(death_test_index) + "|" + + std::string("--") + GTEST_FLAG_PREFIX_ + + "internal_run_death_test=" + file_ + "|" + StreamableToString(line_) + + "|" + StreamableToString(death_test_index) + "|" + StreamableToString(static_cast(::GetCurrentProcessId())) + // size_t has the same width as pointers on both 32-bit and 64-bit // Windows platforms. // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx. - "|" + StreamableToString(reinterpret_cast(write_handle)) + - "|" + StreamableToString(reinterpret_cast(event_handle_.Get())); + "|" + StreamableToString(reinterpret_cast(write_handle)) + "|" + + StreamableToString(reinterpret_cast(event_handle_.Get())); char executable_path[_MAX_PATH + 1]; // NOLINT GTEST_DEATH_TEST_CHECK_(_MAX_PATH + 1 != ::GetModuleFileNameA(nullptr, executable_path, _MAX_PATH)); - std::string command_line = - std::string(::GetCommandLineA()) + " " + filter_flag + " \"" + - internal_flag + "\""; + std::string command_line = std::string(::GetCommandLineA()) + " " + + filter_flag + " \"" + internal_flag + "\""; DeathTest::set_last_death_test_message(""); @@ -796,8 +795,8 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() { GTEST_DEATH_TEST_CHECK_( ::CreateProcessA( executable_path, const_cast(command_line.c_str()), - nullptr, // Retuned process handle is not inheritable. - nullptr, // Retuned thread handle is not inheritable. + nullptr, // Returned process handle is not inheritable. + nullptr, // Returned thread handle is not inheritable. TRUE, // Child inherits all inheritable handles (for write_handle_). 0x0, // Default creation flags. nullptr, // Inherit the parent's environment. @@ -809,7 +808,7 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() { return OVERSEE_TEST; } -# elif GTEST_OS_FUCHSIA +#elif GTEST_OS_FUCHSIA class FuchsiaDeathTest : public DeathTestImpl { public: @@ -855,18 +854,13 @@ class Arguments { template void AddArguments(const ::std::vector& arguments) { for (typename ::std::vector::const_iterator i = arguments.begin(); - i != arguments.end(); - ++i) { + i != arguments.end(); ++i) { args_.insert(args_.end() - 1, posix::StrDup(i->c_str())); } } - char* const* Argv() { - return &args_[0]; - } + char* const* Argv() { return &args_[0]; } - int size() { - return static_cast(args_.size()) - 1; - } + int size() { return static_cast(args_.size()) - 1; } private: std::vector args_; @@ -880,8 +874,7 @@ int FuchsiaDeathTest::Wait() { const int kSocketKey = 1; const int kExceptionKey = 2; - if (!spawned()) - return 0; + if (!spawned()) return 0; // Create a port to wait for socket/task/exception events. zx_status_t status_zx; @@ -890,8 +883,8 @@ int FuchsiaDeathTest::Wait() { GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); // Register to wait for the child process to terminate. - status_zx = child_process_.wait_async( - port, kProcessKey, ZX_PROCESS_TERMINATED, 0); + status_zx = + child_process_.wait_async(port, kProcessKey, ZX_PROCESS_TERMINATED, 0); GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); // Register to wait for the socket to be readable or closed. 
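As assembled above, the child is relaunched with two flags: a filter narrowing the run to the one test, and a pipe-delimited internal payload. On Windows the payload has six fields; the POSIX form later in this diff has four. A compositional sketch (assumes GTEST_FLAG_PREFIX_ expands to "gtest_", its documented value):

    #include <sstream>
    #include <string>

    // Windows layout: file|line|index|parent_pid|write_handle|event_handle
    // POSIX layout:   file|line|index|write_fd
    std::string MakeInternalFlagSketch(const std::string& file, int line,
                                       int index, unsigned int parent_pid,
                                       size_t write_handle,
                                       size_t event_handle) {
      std::ostringstream os;
      os << "--gtest_internal_run_death_test=" << file << '|' << line << '|'
         << index << '|' << parent_pid << '|' << write_handle << '|'
         << event_handle;
      return os.str();
    }

The handles travel as size_t because, as the comment in the hunk notes, size_t and pointers have the same width on both 32-bit and 64-bit Windows.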
@@ -900,8 +893,8 @@ int FuchsiaDeathTest::Wait() { GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); // Register to wait for an exception. - status_zx = exception_channel_.wait_async( - port, kExceptionKey, ZX_CHANNEL_READABLE, 0); + status_zx = exception_channel_.wait_async(port, kExceptionKey, + ZX_CHANNEL_READABLE, 0); GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); bool process_terminated = false; @@ -931,9 +924,9 @@ int FuchsiaDeathTest::Wait() { size_t old_length = captured_stderr_.length(); size_t bytes_read = 0; captured_stderr_.resize(old_length + kBufferSize); - status_zx = stderr_socket_.read( - 0, &captured_stderr_.front() + old_length, kBufferSize, - &bytes_read); + status_zx = + stderr_socket_.read(0, &captured_stderr_.front() + old_length, + kBufferSize, &bytes_read); captured_stderr_.resize(old_length + bytes_read); } while (status_zx == ZX_OK); if (status_zx == ZX_ERR_PEER_CLOSED) { @@ -987,13 +980,12 @@ DeathTest::TestRole FuchsiaDeathTest::AssumeRole() { // Build the child process command line. const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ + - kFilterFlag + "=" + info->test_suite_name() + - "." + info->name(); - const std::string internal_flag = - std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "=" - + file_ + "|" - + StreamableToString(line_) + "|" - + StreamableToString(death_test_index); + "filter=" + info->test_suite_name() + "." + + info->name(); + const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ + + kInternalRunDeathTestFlag + "=" + file_ + + "|" + StreamableToString(line_) + "|" + + StreamableToString(death_test_index); Arguments args; args.AddArguments(GetInjectableArgvs()); args.AddArgument(filter_flag.c_str()); @@ -1016,8 +1008,7 @@ DeathTest::TestRole FuchsiaDeathTest::AssumeRole() { // Create a socket pair will be used to receive the child process' stderr. zx::socket stderr_producer_socket; - status = - zx::socket::create(0, &stderr_producer_socket, &stderr_socket_); + status = zx::socket::create(0, &stderr_producer_socket, &stderr_socket_); GTEST_DEATH_TEST_CHECK_(status >= 0); int stderr_producer_fd = -1; status = @@ -1034,35 +1025,32 @@ DeathTest::TestRole FuchsiaDeathTest::AssumeRole() { // Create a child job. zx_handle_t child_job = ZX_HANDLE_INVALID; - status = zx_job_create(zx_job_default(), 0, & child_job); + status = zx_job_create(zx_job_default(), 0, &child_job); GTEST_DEATH_TEST_CHECK_(status == ZX_OK); zx_policy_basic_t policy; policy.condition = ZX_POL_NEW_ANY; policy.policy = ZX_POL_ACTION_ALLOW; - status = zx_job_set_policy( - child_job, ZX_JOB_POL_RELATIVE, ZX_JOB_POL_BASIC, &policy, 1); + status = zx_job_set_policy(child_job, ZX_JOB_POL_RELATIVE, ZX_JOB_POL_BASIC, + &policy, 1); GTEST_DEATH_TEST_CHECK_(status == ZX_OK); // Create an exception channel attached to the |child_job|, to allow // us to suppress the system default exception handler from firing. - status = - zx_task_create_exception_channel( - child_job, 0, exception_channel_.reset_and_get_address()); + status = zx_task_create_exception_channel( + child_job, 0, exception_channel_.reset_and_get_address()); GTEST_DEATH_TEST_CHECK_(status == ZX_OK); // Spawn the child process. 
- status = fdio_spawn_etc( - child_job, FDIO_SPAWN_CLONE_ALL, args.Argv()[0], args.Argv(), nullptr, - 2, spawn_actions, child_process_.reset_and_get_address(), nullptr); + status = fdio_spawn_etc(child_job, FDIO_SPAWN_CLONE_ALL, args.Argv()[0], + args.Argv(), nullptr, 2, spawn_actions, + child_process_.reset_and_get_address(), nullptr); GTEST_DEATH_TEST_CHECK_(status == ZX_OK); set_spawned(true); return OVERSEE_TEST; } -std::string FuchsiaDeathTest::GetErrorLogs() { - return captured_stderr_; -} +std::string FuchsiaDeathTest::GetErrorLogs() { return captured_stderr_; } #else // We are neither on Windows, nor on Fuchsia. @@ -1093,8 +1081,7 @@ ForkingDeathTest::ForkingDeathTest(const char* a_statement, // status, or 0 if no child process exists. As a side effect, sets the // outcome data member. int ForkingDeathTest::Wait() { - if (!spawned()) - return 0; + if (!spawned()) return 0; ReadAndInterpretStatusByte(); @@ -1173,11 +1160,11 @@ class ExecDeathTest : public ForkingDeathTest { private: static ::std::vector GetArgvsForDeathTestChildProcess() { ::std::vector args = GetInjectableArgvs(); -# if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_) +#if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_) ::std::vector extra_args = GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_(); args.insert(args.end(), extra_args.begin(), extra_args.end()); -# endif // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_) +#endif // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_) return args; } // The name of the file in which the death test is located. @@ -1204,14 +1191,11 @@ class Arguments { template void AddArguments(const ::std::vector& arguments) { for (typename ::std::vector::const_iterator i = arguments.begin(); - i != arguments.end(); - ++i) { + i != arguments.end(); ++i) { args_.insert(args_.end() - 1, posix::StrDup(i->c_str())); } } - char* const* Argv() { - return &args_[0]; - } + char* const* Argv() { return &args_[0]; } private: std::vector args_; @@ -1224,9 +1208,9 @@ struct ExecDeathTestArgs { int close_fd; // File descriptor to close; the read end of a pipe }; -# if GTEST_OS_QNX +#if GTEST_OS_QNX extern "C" char** environ; -# else // GTEST_OS_QNX +#else // GTEST_OS_QNX // The main function for a threadsafe-style death test child process. // This function is called in a clone()-ed process and thus must avoid // any potentially unsafe operations like malloc or libc functions. @@ -1241,8 +1225,8 @@ static int ExecDeathTestChildMain(void* child_arg) { UnitTest::GetInstance()->original_working_dir(); // We can safely call chdir() as it's a direct system call. if (chdir(original_dir) != 0) { - DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " + - GetLastErrnoDescription()); + DeathTestAbort(std::string("chdir(\"") + original_dir + + "\") failed: " + GetLastErrnoDescription()); return EXIT_FAILURE; } @@ -1253,13 +1237,12 @@ static int ExecDeathTestChildMain(void* child_arg) { // one path separator. execv(args->argv[0], args->argv); DeathTestAbort(std::string("execv(") + args->argv[0] + ", ...) in " + - original_dir + " failed: " + - GetLastErrnoDescription()); + original_dir + " failed: " + GetLastErrnoDescription()); return EXIT_FAILURE; } -# endif // GTEST_OS_QNX +#endif // GTEST_OS_QNX -# if GTEST_HAS_CLONE +#if GTEST_HAS_CLONE // Two utility routines that together determine the direction the stack // grows. 
// This could be accomplished more elegantly by a single recursive @@ -1293,7 +1276,7 @@ static bool StackGrowsDown() { StackLowerThanAddress(&dummy, &result); return result; } -# endif // GTEST_HAS_CLONE +#endif // GTEST_HAS_CLONE // Spawns a child process with the same executable as the current process in // a thread-safe manner and instructs it to run the death test. The @@ -1303,10 +1286,10 @@ static bool StackGrowsDown() { // spawn(2) there instead. The function dies with an error message if // anything goes wrong. static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { - ExecDeathTestArgs args = { argv, close_fd }; + ExecDeathTestArgs args = {argv, close_fd}; pid_t child_pid = -1; -# if GTEST_OS_QNX +#if GTEST_OS_QNX // Obtains the current directory and sets it to be closed in the child // process. const int cwd_fd = open(".", O_RDONLY); @@ -1319,16 +1302,16 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { UnitTest::GetInstance()->original_working_dir(); // We can safely call chdir() as it's a direct system call. if (chdir(original_dir) != 0) { - DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " + - GetLastErrnoDescription()); + DeathTestAbort(std::string("chdir(\"") + original_dir + + "\") failed: " + GetLastErrnoDescription()); return EXIT_FAILURE; } int fd_flags; // Set close_fd to be closed after spawn. GTEST_DEATH_TEST_CHECK_SYSCALL_(fd_flags = fcntl(close_fd, F_GETFD)); - GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(close_fd, F_SETFD, - fd_flags | FD_CLOEXEC)); + GTEST_DEATH_TEST_CHECK_SYSCALL_( + fcntl(close_fd, F_SETFD, fd_flags | FD_CLOEXEC)); struct inheritance inherit = {0}; // spawn is a system call. child_pid = spawn(args.argv[0], 0, nullptr, &inherit, args.argv, environ); @@ -1336,8 +1319,8 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1); GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd)); -# else // GTEST_OS_QNX -# if GTEST_OS_LINUX +#else // GTEST_OS_QNX +#if GTEST_OS_LINUX // When a SIGPROF signal is received while fork() or clone() are executing, // the process may hang. To avoid this, we ignore SIGPROF here and re-enable // it after the call to fork()/clone() is complete. @@ -1346,12 +1329,12 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { memset(&ignore_sigprof_action, 0, sizeof(ignore_sigprof_action)); sigemptyset(&ignore_sigprof_action.sa_mask); ignore_sigprof_action.sa_handler = SIG_IGN; - GTEST_DEATH_TEST_CHECK_SYSCALL_(sigaction( - SIGPROF, &ignore_sigprof_action, &saved_sigprof_action)); -# endif // GTEST_OS_LINUX + GTEST_DEATH_TEST_CHECK_SYSCALL_( + sigaction(SIGPROF, &ignore_sigprof_action, &saved_sigprof_action)); +#endif // GTEST_OS_LINUX -# if GTEST_HAS_CLONE - const bool use_fork = GTEST_FLAG(death_test_use_fork); +#if GTEST_HAS_CLONE + const bool use_fork = GTEST_FLAG_GET(death_test_use_fork); if (!use_fork) { static const bool stack_grows_down = StackGrowsDown(); @@ -1370,7 +1353,7 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { const size_t kMaxStackAlignment = 64; void* const stack_top = static_cast(stack) + - (stack_grows_down ? stack_size - kMaxStackAlignment : 0); + (stack_grows_down ? 
stack_size - kMaxStackAlignment : 0); GTEST_DEATH_TEST_CHECK_( static_cast(stack_size) > kMaxStackAlignment && reinterpret_cast(stack_top) % kMaxStackAlignment == 0); @@ -1379,19 +1362,19 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1); } -# else +#else const bool use_fork = true; -# endif // GTEST_HAS_CLONE +#endif // GTEST_HAS_CLONE if (use_fork && (child_pid = fork()) == 0) { - ExecDeathTestChildMain(&args); - _exit(0); + ExecDeathTestChildMain(&args); + _exit(0); } -# endif // GTEST_OS_QNX -# if GTEST_OS_LINUX +#endif // GTEST_OS_QNX +#if GTEST_OS_LINUX GTEST_DEATH_TEST_CHECK_SYSCALL_( sigaction(SIGPROF, &saved_sigprof_action, nullptr)); -# endif // GTEST_OS_LINUX +#endif // GTEST_OS_LINUX GTEST_DEATH_TEST_CHECK_(child_pid != -1); return child_pid; @@ -1420,13 +1403,13 @@ DeathTest::TestRole ExecDeathTest::AssumeRole() { GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1); const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ + - kFilterFlag + "=" + info->test_suite_name() + - "." + info->name(); - const std::string internal_flag = - std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "=" - + file_ + "|" + StreamableToString(line_) + "|" - + StreamableToString(death_test_index) + "|" - + StreamableToString(pipe_fd[1]); + "filter=" + info->test_suite_name() + "." + + info->name(); + const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ + + "internal_run_death_test=" + file_ + "|" + + StreamableToString(line_) + "|" + + StreamableToString(death_test_index) + "|" + + StreamableToString(pipe_fd[1]); Arguments args; args.AddArguments(GetArgvsForDeathTestChildProcess()); args.AddArgument(filter_flag.c_str()); @@ -1447,7 +1430,7 @@ DeathTest::TestRole ExecDeathTest::AssumeRole() { return OVERSEE_TEST; } -# endif // !GTEST_OS_WINDOWS +#endif // !GTEST_OS_WINDOWS // Creates a concrete DeathTest-derived class that depends on the // --gtest_death_test_style flag, and sets the pointer pointed to @@ -1461,15 +1444,15 @@ bool DefaultDeathTestFactory::Create(const char* statement, UnitTestImpl* const impl = GetUnitTestImpl(); const InternalRunDeathTestFlag* const flag = impl->internal_run_death_test_flag(); - const int death_test_index = impl->current_test_info() - ->increment_death_test_count(); + const int death_test_index = + impl->current_test_info()->increment_death_test_count(); if (flag != nullptr) { if (death_test_index > flag->index()) { DeathTest::set_last_death_test_message( - "Death test count (" + StreamableToString(death_test_index) - + ") somehow exceeded expected maximum (" - + StreamableToString(flag->index()) + ")"); + "Death test count (" + StreamableToString(death_test_index) + + ") somehow exceeded expected maximum (" + + StreamableToString(flag->index()) + ")"); return false; } @@ -1480,50 +1463,50 @@ bool DefaultDeathTestFactory::Create(const char* statement, } } -# if GTEST_OS_WINDOWS +#if GTEST_OS_WINDOWS - if (GTEST_FLAG(death_test_style) == "threadsafe" || - GTEST_FLAG(death_test_style) == "fast") { + if (GTEST_FLAG_GET(death_test_style) == "threadsafe" || + GTEST_FLAG_GET(death_test_style) == "fast") { *test = new WindowsDeathTest(statement, std::move(matcher), file, line); } -# elif GTEST_OS_FUCHSIA +#elif GTEST_OS_FUCHSIA - if (GTEST_FLAG(death_test_style) == "threadsafe" || - GTEST_FLAG(death_test_style) == "fast") { + if (GTEST_FLAG_GET(death_test_style) == "threadsafe" || + GTEST_FLAG_GET(death_test_style) == "fast") { *test 
= new FuchsiaDeathTest(statement, std::move(matcher), file, line); } -# else +#else - if (GTEST_FLAG(death_test_style) == "threadsafe") { + if (GTEST_FLAG_GET(death_test_style) == "threadsafe") { *test = new ExecDeathTest(statement, std::move(matcher), file, line); - } else if (GTEST_FLAG(death_test_style) == "fast") { + } else if (GTEST_FLAG_GET(death_test_style) == "fast") { *test = new NoExecDeathTest(statement, std::move(matcher)); } -# endif // GTEST_OS_WINDOWS +#endif // GTEST_OS_WINDOWS else { // NOLINT - this is more readable than unbalanced brackets inside #if. - DeathTest::set_last_death_test_message( - "Unknown death test style \"" + GTEST_FLAG(death_test_style) - + "\" encountered"); + DeathTest::set_last_death_test_message("Unknown death test style \"" + + GTEST_FLAG_GET(death_test_style) + + "\" encountered"); return false; } return true; } -# if GTEST_OS_WINDOWS +#if GTEST_OS_WINDOWS // Recreates the pipe and event handles from the provided parameters, // signals the event, and returns a file descriptor wrapped around the pipe // handle. This function is called in the child process only. static int GetStatusFileDescriptor(unsigned int parent_process_id, - size_t write_handle_as_size_t, - size_t event_handle_as_size_t) { + size_t write_handle_as_size_t, + size_t event_handle_as_size_t) { AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE, - FALSE, // Non-inheritable. - parent_process_id)); + FALSE, // Non-inheritable. + parent_process_id)); if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) { DeathTestAbort("Unable to open parent process " + StreamableToString(parent_process_id)); @@ -1531,8 +1514,7 @@ static int GetStatusFileDescriptor(unsigned int parent_process_id, GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t)); - const HANDLE write_handle = - reinterpret_cast(write_handle_as_size_t); + const HANDLE write_handle = reinterpret_cast(write_handle_as_size_t); HANDLE dup_write_handle; // The newly initialized handle is accessible only in the parent @@ -1554,9 +1536,7 @@ static int GetStatusFileDescriptor(unsigned int parent_process_id, HANDLE dup_event_handle; if (!::DuplicateHandle(parent_process_handle.Get(), event_handle, - ::GetCurrentProcess(), &dup_event_handle, - 0x0, - FALSE, + ::GetCurrentProcess(), &dup_event_handle, 0x0, FALSE, DUPLICATE_SAME_ACCESS)) { DeathTestAbort("Unable to duplicate the event handle " + StreamableToString(event_handle_as_size_t) + @@ -1578,61 +1558,57 @@ static int GetStatusFileDescriptor(unsigned int parent_process_id, return write_fd; } -# endif // GTEST_OS_WINDOWS +#endif // GTEST_OS_WINDOWS // Returns a newly created InternalRunDeathTestFlag object with fields // initialized from the GTEST_FLAG(internal_run_death_test) flag if // the flag is specified; otherwise returns NULL. InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() { - if (GTEST_FLAG(internal_run_death_test) == "") return nullptr; + if (GTEST_FLAG_GET(internal_run_death_test) == "") return nullptr; // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we // can use it here. 
int line = -1; int index = -1; ::std::vector< ::std::string> fields; - SplitString(GTEST_FLAG(internal_run_death_test).c_str(), '|', &fields); + SplitString(GTEST_FLAG_GET(internal_run_death_test), '|', &fields); int write_fd = -1; -# if GTEST_OS_WINDOWS +#if GTEST_OS_WINDOWS unsigned int parent_process_id = 0; size_t write_handle_as_size_t = 0; size_t event_handle_as_size_t = 0; - if (fields.size() != 6 - || !ParseNaturalNumber(fields[1], &line) - || !ParseNaturalNumber(fields[2], &index) - || !ParseNaturalNumber(fields[3], &parent_process_id) - || !ParseNaturalNumber(fields[4], &write_handle_as_size_t) - || !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) { + if (fields.size() != 6 || !ParseNaturalNumber(fields[1], &line) || + !ParseNaturalNumber(fields[2], &index) || + !ParseNaturalNumber(fields[3], &parent_process_id) || + !ParseNaturalNumber(fields[4], &write_handle_as_size_t) || + !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) { DeathTestAbort("Bad --gtest_internal_run_death_test flag: " + - GTEST_FLAG(internal_run_death_test)); + GTEST_FLAG_GET(internal_run_death_test)); } - write_fd = GetStatusFileDescriptor(parent_process_id, - write_handle_as_size_t, + write_fd = GetStatusFileDescriptor(parent_process_id, write_handle_as_size_t, event_handle_as_size_t); -# elif GTEST_OS_FUCHSIA +#elif GTEST_OS_FUCHSIA - if (fields.size() != 3 - || !ParseNaturalNumber(fields[1], &line) - || !ParseNaturalNumber(fields[2], &index)) { - DeathTestAbort("Bad --gtest_internal_run_death_test flag: " - + GTEST_FLAG(internal_run_death_test)); + if (fields.size() != 3 || !ParseNaturalNumber(fields[1], &line) || + !ParseNaturalNumber(fields[2], &index)) { + DeathTestAbort("Bad --gtest_internal_run_death_test flag: " + + GTEST_FLAG_GET(internal_run_death_test)); } -# else +#else - if (fields.size() != 4 - || !ParseNaturalNumber(fields[1], &line) - || !ParseNaturalNumber(fields[2], &index) - || !ParseNaturalNumber(fields[3], &write_fd)) { - DeathTestAbort("Bad --gtest_internal_run_death_test flag: " - + GTEST_FLAG(internal_run_death_test)); + if (fields.size() != 4 || !ParseNaturalNumber(fields[1], &line) || + !ParseNaturalNumber(fields[2], &index) || + !ParseNaturalNumber(fields[3], &write_fd)) { + DeathTestAbort("Bad --gtest_internal_run_death_test flag: " + + GTEST_FLAG_GET(internal_run_death_test)); } -# endif // GTEST_OS_WINDOWS +#endif // GTEST_OS_WINDOWS return new InternalRunDeathTestFlag(fields[0], line, index, write_fd); } diff --git a/third_party/googletest/src/src/gtest-filepath.cc b/third_party/googletest/src/src/gtest-filepath.cc index 0b5629401b..f6ee90cdb7 100644 --- a/third_party/googletest/src/src/gtest-filepath.cc +++ b/third_party/googletest/src/src/gtest-filepath.cc @@ -30,29 +30,31 @@ #include "gtest/internal/gtest-filepath.h" #include -#include "gtest/internal/gtest-port.h" + #include "gtest/gtest-message.h" +#include "gtest/internal/gtest-port.h" #if GTEST_OS_WINDOWS_MOBILE -# include +#include #elif GTEST_OS_WINDOWS -# include -# include +#include +#include #else -# include -# include // Some Linux distributions define PATH_MAX here. -#endif // GTEST_OS_WINDOWS_MOBILE +#include + +#include // Some Linux distributions define PATH_MAX here. 
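Stepping back to ParseInternalRunDeathTestFlag, completed above: the payload is split on '|' and the expected field count is platform-specific (6 on Windows, 3 on Fuchsia, 4 elsewhere). A sketch of the POSIX branch with a hypothetical splitter; the real code validates digits with ParseNaturalNumber rather than atoi:

    #include <cstdlib>
    #include <string>
    #include <vector>

    // Stand-in for SplitString.
    std::vector<std::string> SplitOn(const std::string& s, char sep) {
      std::vector<std::string> out;
      size_t start = 0, pos;
      while ((pos = s.find(sep, start)) != std::string::npos) {
        out.push_back(s.substr(start, pos - start));
        start = pos + 1;
      }
      out.push_back(s.substr(start));
      return out;
    }

    // POSIX layout: file|line|index|write_fd.
    bool ParsePosixInternalFlagSketch(const std::string& value,
                                      std::string* file, int* line,
                                      int* index, int* write_fd) {
      const std::vector<std::string> f = SplitOn(value, '|');
      if (f.size() != 4) return false;
      *file = f[0];
      *line = std::atoi(f[1].c_str());
      *index = std::atoi(f[2].c_str());
      *write_fd = std::atoi(f[3].c_str());
      return true;
    }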
+#endif // GTEST_OS_WINDOWS_MOBILE #include "gtest/internal/gtest-string.h" #if GTEST_OS_WINDOWS -# define GTEST_PATH_MAX_ _MAX_PATH +#define GTEST_PATH_MAX_ _MAX_PATH #elif defined(PATH_MAX) -# define GTEST_PATH_MAX_ PATH_MAX +#define GTEST_PATH_MAX_ PATH_MAX #elif defined(_XOPEN_PATH_MAX) -# define GTEST_PATH_MAX_ _XOPEN_PATH_MAX +#define GTEST_PATH_MAX_ _XOPEN_PATH_MAX #else -# define GTEST_PATH_MAX_ _POSIX_PATH_MAX +#define GTEST_PATH_MAX_ _POSIX_PATH_MAX #endif // GTEST_OS_WINDOWS namespace testing { @@ -66,16 +68,16 @@ namespace internal { const char kPathSeparator = '\\'; const char kAlternatePathSeparator = '/'; const char kAlternatePathSeparatorString[] = "/"; -# if GTEST_OS_WINDOWS_MOBILE +#if GTEST_OS_WINDOWS_MOBILE // Windows CE doesn't have a current directory. You should not use // the current directory in tests on Windows CE, but this at least // provides a reasonable fallback. const char kCurrentDirectoryString[] = "\\"; // Windows CE doesn't define INVALID_FILE_ATTRIBUTES const DWORD kInvalidFileAttributes = 0xffffffff; -# else +#else const char kCurrentDirectoryString[] = ".\\"; -# endif // GTEST_OS_WINDOWS_MOBILE +#endif // GTEST_OS_WINDOWS_MOBILE #else const char kPathSeparator = '/'; const char kCurrentDirectoryString[] = "./"; @@ -99,17 +101,17 @@ FilePath FilePath::GetCurrentDir() { // something reasonable. return FilePath(kCurrentDirectoryString); #elif GTEST_OS_WINDOWS - char cwd[GTEST_PATH_MAX_ + 1] = { '\0' }; + char cwd[GTEST_PATH_MAX_ + 1] = {'\0'}; return FilePath(_getcwd(cwd, sizeof(cwd)) == nullptr ? "" : cwd); #else - char cwd[GTEST_PATH_MAX_ + 1] = { '\0' }; + char cwd[GTEST_PATH_MAX_ + 1] = {'\0'}; char* result = getcwd(cwd, sizeof(cwd)); -# if GTEST_OS_NACL +#if GTEST_OS_NACL // getcwd will likely fail in NaCl due to the sandbox, so return something // reasonable. The user may have provided a shim implementation for getcwd, // however, so fallback only when failure is detected. return FilePath(result == nullptr ? kCurrentDirectoryString : cwd); -# endif // GTEST_OS_NACL +#endif // GTEST_OS_NACL return FilePath(result == nullptr ? "" : cwd); #endif // GTEST_OS_WINDOWS_MOBILE } @@ -121,8 +123,8 @@ FilePath FilePath::GetCurrentDir() { FilePath FilePath::RemoveExtension(const char* extension) const { const std::string dot_extension = std::string(".") + extension; if (String::EndsWithCaseInsensitive(pathname_, dot_extension)) { - return FilePath(pathname_.substr( - 0, pathname_.length() - dot_extension.length())); + return FilePath( + pathname_.substr(0, pathname_.length() - dot_extension.length())); } return *this; } @@ -178,15 +180,14 @@ FilePath FilePath::RemoveFileName() const { // than zero (e.g., 12), returns "dir/test_12.xml". // On Windows platform, uses \ as the separator rather than /. FilePath FilePath::MakeFileName(const FilePath& directory, - const FilePath& base_name, - int number, + const FilePath& base_name, int number, const char* extension) { std::string file; if (number == 0) { file = base_name.string() + "." + extension; } else { - file = base_name.string() + "_" + StreamableToString(number) - + "." + extension; + file = + base_name.string() + "_" + StreamableToString(number) + "." + extension; } return ConcatPaths(directory, FilePath(file)); } @@ -195,8 +196,7 @@ FilePath FilePath::MakeFileName(const FilePath& directory, // On Windows, uses \ as the separator rather than /. 
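MakeFileName's rule, reflowed above, is worth restating: a number of 0 yields "dir/base.ext", anything else "dir/base_N.ext". In one illustrative function:

    #include <string>

    // E.g. ("dir", "test", 0, "xml") -> "dir/test.xml";
    //      ("dir", "test", 12, "xml") -> "dir/test_12.xml".
    std::string MakeNumberedNameSketch(const std::string& dir,
                                       const std::string& base, int number,
                                       const std::string& ext) {
      const std::string file =
          number == 0 ? base + "." + ext
                      : base + "_" + std::to_string(number) + "." + ext;
      return dir + "/" + file;  // the real code uses the platform separator
    }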
FilePath FilePath::ConcatPaths(const FilePath& directory, const FilePath& relative_path) { - if (directory.IsEmpty()) - return relative_path; + if (directory.IsEmpty()) return relative_path; const FilePath dir(directory.RemoveTrailingPathSeparator()); return FilePath(dir.string() + kPathSeparator + relative_path.string()); } @@ -207,7 +207,7 @@ bool FilePath::FileOrDirectoryExists() const { #if GTEST_OS_WINDOWS_MOBILE LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str()); const DWORD attributes = GetFileAttributes(unicode); - delete [] unicode; + delete[] unicode; return attributes != kInvalidFileAttributes; #else posix::StatStruct file_stat{}; @@ -222,8 +222,8 @@ bool FilePath::DirectoryExists() const { #if GTEST_OS_WINDOWS // Don't strip off trailing separator if path is a root directory on // Windows (like "C:\\"). - const FilePath& path(IsRootDirectory() ? *this : - RemoveTrailingPathSeparator()); + const FilePath& path(IsRootDirectory() ? *this + : RemoveTrailingPathSeparator()); #else const FilePath& path(*this); #endif @@ -231,15 +231,15 @@ bool FilePath::DirectoryExists() const { #if GTEST_OS_WINDOWS_MOBILE LPCWSTR unicode = String::AnsiToUtf16(path.c_str()); const DWORD attributes = GetFileAttributes(unicode); - delete [] unicode; + delete[] unicode; if ((attributes != kInvalidFileAttributes) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) { result = true; } #else posix::StatStruct file_stat{}; - result = posix::Stat(path.c_str(), &file_stat) == 0 && - posix::IsDir(file_stat); + result = + posix::Stat(path.c_str(), &file_stat) == 0 && posix::IsDir(file_stat); #endif // GTEST_OS_WINDOWS_MOBILE return result; @@ -260,10 +260,9 @@ bool FilePath::IsAbsolutePath() const { const char* const name = pathname_.c_str(); #if GTEST_OS_WINDOWS return pathname_.length() >= 3 && - ((name[0] >= 'a' && name[0] <= 'z') || - (name[0] >= 'A' && name[0] <= 'Z')) && - name[1] == ':' && - IsPathSeparator(name[2]); + ((name[0] >= 'a' && name[0] <= 'z') || + (name[0] >= 'A' && name[0] <= 'Z')) && + name[1] == ':' && IsPathSeparator(name[2]); #else return IsPathSeparator(name[0]); #endif @@ -321,7 +320,7 @@ bool FilePath::CreateFolder() const { FilePath removed_sep(this->RemoveTrailingPathSeparator()); LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str()); int result = CreateDirectory(unicode, nullptr) ? 0 : -1; - delete [] unicode; + delete[] unicode; #elif GTEST_OS_WINDOWS int result = _mkdir(pathname_.c_str()); #elif GTEST_OS_ESP8266 || GTEST_OS_XTENSA @@ -341,9 +340,8 @@ bool FilePath::CreateFolder() const { // name, otherwise return the name string unmodified. // On Windows platform, uses \ as the separator, other platforms use /. FilePath FilePath::RemoveTrailingPathSeparator() const { - return IsDirectory() - ? FilePath(pathname_.substr(0, pathname_.length() - 1)) - : *this; + return IsDirectory() ? FilePath(pathname_.substr(0, pathname_.length() - 1)) + : *this; } // Removes any redundant separators that might be in the pathname. diff --git a/third_party/googletest/src/src/gtest-internal-inl.h b/third_party/googletest/src/src/gtest-internal-inl.h index 6d8cecbbb3..0b9e929c68 100644 --- a/third_party/googletest/src/src/gtest-internal-inl.h +++ b/third_party/googletest/src/src/gtest-internal-inl.h @@ -35,7 +35,7 @@ #define GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_ #ifndef _WIN32_WCE -# include +#include #endif // !_WIN32_WCE #include #include // For strtoll/_strtoul64/malloc/free. 
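The IsAbsolutePath reflow above keeps the rule intact: on Windows a path is absolute iff it starts with a drive letter, ':', and a path separator; elsewhere iff it starts with '/'. Restated as a sketch:

    #include <cstring>

    bool IsAbsolutePathSketch(const char* name, bool on_windows) {
      if (!on_windows) return name[0] == '/';
      return std::strlen(name) >= 3 &&
             ((name[0] >= 'a' && name[0] <= 'z') ||
              (name[0] >= 'A' && name[0] <= 'Z')) &&
             name[1] == ':' && (name[2] == '/' || name[2] == '\\');
    }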
@@ -50,22 +50,20 @@ #include "gtest/internal/gtest-port.h" #if GTEST_CAN_STREAM_RESULTS_ -# include // NOLINT -# include // NOLINT +#include // NOLINT +#include // NOLINT #endif #if GTEST_OS_WINDOWS -# include // NOLINT -#endif // GTEST_OS_WINDOWS +#include // NOLINT +#endif // GTEST_OS_WINDOWS -#include "gtest/gtest.h" #include "gtest/gtest-spi.h" +#include "gtest/gtest.h" GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ /* class A needs to have dll-interface to be used by clients of class B */) -namespace testing { - // Declares the flags. // // We don't want the users to modify this flag in the code, but want @@ -73,32 +71,13 @@ namespace testing { // declare it here as opposed to in gtest.h. GTEST_DECLARE_bool_(death_test_use_fork); +namespace testing { namespace internal { // The value of GetTestTypeId() as seen from within the Google Test // library. This is solely for testing GetTestTypeId(). GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest; -// Names of the flags (needed for parsing Google Test flags). -const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests"; -const char kBreakOnFailureFlag[] = "break_on_failure"; -const char kCatchExceptionsFlag[] = "catch_exceptions"; -const char kColorFlag[] = "color"; -const char kFailFast[] = "fail_fast"; -const char kFilterFlag[] = "filter"; -const char kListTestsFlag[] = "list_tests"; -const char kOutputFlag[] = "output"; -const char kBriefFlag[] = "brief"; -const char kPrintTimeFlag[] = "print_time"; -const char kPrintUTF8Flag[] = "print_utf8"; -const char kRandomSeedFlag[] = "random_seed"; -const char kRepeatFlag[] = "repeat"; -const char kShuffleFlag[] = "shuffle"; -const char kStackTraceDepthFlag[] = "stack_trace_depth"; -const char kStreamResultToFlag[] = "stream_result_to"; -const char kThrowOnFailureFlag[] = "throw_on_failure"; -const char kFlagfileFlag[] = "flagfile"; - // A valid random seed must be in [1, kMaxRandomSeed]. const int kMaxRandomSeed = 99999; @@ -125,21 +104,21 @@ GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms); // // On success, stores the value of the flag in *value, and returns // true. On failure, returns false without changing *value. -GTEST_API_ bool ParseInt32Flag( - const char* str, const char* flag, int32_t* value); +GTEST_API_ bool ParseFlag(const char* str, const char* flag, int32_t* value); // Returns a random seed in range [1, kMaxRandomSeed] based on the // given --gtest_random_seed flag value. inline int GetRandomSeedFromFlag(int32_t random_seed_flag) { - const unsigned int raw_seed = (random_seed_flag == 0) ? - static_cast(GetTimeInMillis()) : - static_cast(random_seed_flag); + const unsigned int raw_seed = + (random_seed_flag == 0) ? static_cast(GetTimeInMillis()) + : static_cast(random_seed_flag); // Normalizes the actual seed to range [1, kMaxRandomSeed] such that // it's easy to type. const int normalized_seed = static_cast((raw_seed - 1U) % - static_cast(kMaxRandomSeed)) + 1; + static_cast(kMaxRandomSeed)) + + 1; return normalized_seed; } @@ -160,50 +139,54 @@ class GTestFlagSaver { public: // The c'tor. 
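GetRandomSeedFromFlag, reflowed above, folds an arbitrary 32-bit value into [1, kMaxRandomSeed], with a flag value of 0 meaning "derive the seed from the clock". The arithmetic in isolation:

    #include <cstdint>

    // Maps any 32-bit value into [1, 99999]:
    // 1 -> 1, 99999 -> 99999, 100000 -> 1 (wraps around).
    int NormalizeSeedSketch(uint32_t raw_seed) {
      const uint32_t kMaxRandomSeed = 99999;
      return static_cast<int>((raw_seed - 1U) % kMaxRandomSeed) + 1;
    }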
GTestFlagSaver() { - also_run_disabled_tests_ = GTEST_FLAG(also_run_disabled_tests); - break_on_failure_ = GTEST_FLAG(break_on_failure); - catch_exceptions_ = GTEST_FLAG(catch_exceptions); - color_ = GTEST_FLAG(color); - death_test_style_ = GTEST_FLAG(death_test_style); - death_test_use_fork_ = GTEST_FLAG(death_test_use_fork); - fail_fast_ = GTEST_FLAG(fail_fast); - filter_ = GTEST_FLAG(filter); - internal_run_death_test_ = GTEST_FLAG(internal_run_death_test); - list_tests_ = GTEST_FLAG(list_tests); - output_ = GTEST_FLAG(output); - brief_ = GTEST_FLAG(brief); - print_time_ = GTEST_FLAG(print_time); - print_utf8_ = GTEST_FLAG(print_utf8); - random_seed_ = GTEST_FLAG(random_seed); - repeat_ = GTEST_FLAG(repeat); - shuffle_ = GTEST_FLAG(shuffle); - stack_trace_depth_ = GTEST_FLAG(stack_trace_depth); - stream_result_to_ = GTEST_FLAG(stream_result_to); - throw_on_failure_ = GTEST_FLAG(throw_on_failure); + also_run_disabled_tests_ = GTEST_FLAG_GET(also_run_disabled_tests); + break_on_failure_ = GTEST_FLAG_GET(break_on_failure); + catch_exceptions_ = GTEST_FLAG_GET(catch_exceptions); + color_ = GTEST_FLAG_GET(color); + death_test_style_ = GTEST_FLAG_GET(death_test_style); + death_test_use_fork_ = GTEST_FLAG_GET(death_test_use_fork); + fail_fast_ = GTEST_FLAG_GET(fail_fast); + filter_ = GTEST_FLAG_GET(filter); + internal_run_death_test_ = GTEST_FLAG_GET(internal_run_death_test); + list_tests_ = GTEST_FLAG_GET(list_tests); + output_ = GTEST_FLAG_GET(output); + brief_ = GTEST_FLAG_GET(brief); + print_time_ = GTEST_FLAG_GET(print_time); + print_utf8_ = GTEST_FLAG_GET(print_utf8); + random_seed_ = GTEST_FLAG_GET(random_seed); + repeat_ = GTEST_FLAG_GET(repeat); + recreate_environments_when_repeating_ = + GTEST_FLAG_GET(recreate_environments_when_repeating); + shuffle_ = GTEST_FLAG_GET(shuffle); + stack_trace_depth_ = GTEST_FLAG_GET(stack_trace_depth); + stream_result_to_ = GTEST_FLAG_GET(stream_result_to); + throw_on_failure_ = GTEST_FLAG_GET(throw_on_failure); } // The d'tor is not virtual. DO NOT INHERIT FROM THIS CLASS. 
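The constructor above snapshots every flag through the new GTEST_FLAG_GET accessors, and the destructor that follows restores them via GTEST_FLAG_SET; note that the newly added recreate_environments_when_repeating flag is threaded through both halves and the member list. The underlying save/restore shape, reduced to a single flag with assumed get/set callables:

    #include <functional>
    #include <utility>

    // RAII: override a flag for the lifetime of the object, then restore it.
    class ScopedFlagSketch {
     public:
      ScopedFlagSketch(std::function<int()> get, std::function<void(int)> set,
                       int new_value)
          : set_(std::move(set)), saved_(get()) {
        set_(new_value);
      }
      ~ScopedFlagSketch() { set_(saved_); }  // restore on scope exit
     private:
      std::function<void(int)> set_;
      int saved_;
    };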
~GTestFlagSaver() { - GTEST_FLAG(also_run_disabled_tests) = also_run_disabled_tests_; - GTEST_FLAG(break_on_failure) = break_on_failure_; - GTEST_FLAG(catch_exceptions) = catch_exceptions_; - GTEST_FLAG(color) = color_; - GTEST_FLAG(death_test_style) = death_test_style_; - GTEST_FLAG(death_test_use_fork) = death_test_use_fork_; - GTEST_FLAG(filter) = filter_; - GTEST_FLAG(fail_fast) = fail_fast_; - GTEST_FLAG(internal_run_death_test) = internal_run_death_test_; - GTEST_FLAG(list_tests) = list_tests_; - GTEST_FLAG(output) = output_; - GTEST_FLAG(brief) = brief_; - GTEST_FLAG(print_time) = print_time_; - GTEST_FLAG(print_utf8) = print_utf8_; - GTEST_FLAG(random_seed) = random_seed_; - GTEST_FLAG(repeat) = repeat_; - GTEST_FLAG(shuffle) = shuffle_; - GTEST_FLAG(stack_trace_depth) = stack_trace_depth_; - GTEST_FLAG(stream_result_to) = stream_result_to_; - GTEST_FLAG(throw_on_failure) = throw_on_failure_; + GTEST_FLAG_SET(also_run_disabled_tests, also_run_disabled_tests_); + GTEST_FLAG_SET(break_on_failure, break_on_failure_); + GTEST_FLAG_SET(catch_exceptions, catch_exceptions_); + GTEST_FLAG_SET(color, color_); + GTEST_FLAG_SET(death_test_style, death_test_style_); + GTEST_FLAG_SET(death_test_use_fork, death_test_use_fork_); + GTEST_FLAG_SET(filter, filter_); + GTEST_FLAG_SET(fail_fast, fail_fast_); + GTEST_FLAG_SET(internal_run_death_test, internal_run_death_test_); + GTEST_FLAG_SET(list_tests, list_tests_); + GTEST_FLAG_SET(output, output_); + GTEST_FLAG_SET(brief, brief_); + GTEST_FLAG_SET(print_time, print_time_); + GTEST_FLAG_SET(print_utf8, print_utf8_); + GTEST_FLAG_SET(random_seed, random_seed_); + GTEST_FLAG_SET(repeat, repeat_); + GTEST_FLAG_SET(recreate_environments_when_repeating, + recreate_environments_when_repeating_); + GTEST_FLAG_SET(shuffle, shuffle_); + GTEST_FLAG_SET(stack_trace_depth, stack_trace_depth_); + GTEST_FLAG_SET(stream_result_to, stream_result_to_); + GTEST_FLAG_SET(throw_on_failure, throw_on_failure_); } private: @@ -224,6 +207,7 @@ class GTestFlagSaver { bool print_utf8_; int32_t random_seed_; int32_t repeat_; + bool recreate_environments_when_repeating_; bool shuffle_; int32_t stack_trace_depth_; std::string stream_result_to_; @@ -278,8 +262,8 @@ GTEST_API_ int32_t Int32FromEnvOrDie(const char* env_var, int32_t default_val); // returns true if and only if the test should be run on this shard. The test id // is some arbitrary but unique non-negative integer assigned to each test // method. Assumes that 0 <= shard_index < total_shards. -GTEST_API_ bool ShouldRunTestOnShard( - int total_shards, int shard_index, int test_id); +GTEST_API_ bool ShouldRunTestOnShard(int total_shards, int shard_index, + int test_id); // STL container utilities. @@ -290,9 +274,8 @@ inline int CountIf(const Container& c, Predicate predicate) { // Implemented as an explicit loop since std::count_if() in libCstd on // Solaris has a non-standard signature. 
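ShouldRunTestOnShard, declared earlier in this hunk, is the sharding contract: every test gets an arbitrary unique non-negative id, and exactly one shard in [0, total_shards) claims it. The declaration does not fix the formula, but the conventional implementation is a round-robin modulus (a sketch; treat it as illustrative, not as the library's guaranteed behavior):

    // Deals test ids across shards round-robin; each id runs on one shard.
    bool ShouldRunOnShardSketch(int total_shards, int shard_index,
                                int test_id) {
      return test_id % total_shards == shard_index;
    }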
int count = 0; - for (typename Container::const_iterator it = c.begin(); it != c.end(); ++it) { - if (predicate(*it)) - ++count; + for (auto it = c.begin(); it != c.end(); ++it) { + if (predicate(*it)) ++count; } return count; } @@ -441,7 +424,9 @@ class OsStackTraceGetterInterface { static const char* const kElidedFramesMarker; private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface); + OsStackTraceGetterInterface(const OsStackTraceGetterInterface&) = delete; + OsStackTraceGetterInterface& operator=(const OsStackTraceGetterInterface&) = + delete; }; // A working implementation of the OsStackTraceGetterInterface interface. @@ -463,7 +448,8 @@ class OsStackTraceGetter : public OsStackTraceGetterInterface { void* caller_frame_ = nullptr; #endif // GTEST_HAS_ABSL - GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter); + OsStackTraceGetter(const OsStackTraceGetter&) = delete; + OsStackTraceGetter& operator=(const OsStackTraceGetter&) = delete; }; // Information about a Google Test trace point. @@ -476,7 +462,7 @@ struct TraceInfo { // This is the default global test part result reporter used in UnitTestImpl. // This class should only be used by UnitTestImpl. class DefaultGlobalTestPartResultReporter - : public TestPartResultReporterInterface { + : public TestPartResultReporterInterface { public: explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test); // Implements the TestPartResultReporterInterface. Reports the test part @@ -486,7 +472,10 @@ class DefaultGlobalTestPartResultReporter private: UnitTestImpl* const unit_test_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter); + DefaultGlobalTestPartResultReporter( + const DefaultGlobalTestPartResultReporter&) = delete; + DefaultGlobalTestPartResultReporter& operator=( + const DefaultGlobalTestPartResultReporter&) = delete; }; // This is the default per thread test part result reporter used in @@ -502,7 +491,10 @@ class DefaultPerThreadTestPartResultReporter private: UnitTestImpl* const unit_test_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter); + DefaultPerThreadTestPartResultReporter( + const DefaultPerThreadTestPartResultReporter&) = delete; + DefaultPerThreadTestPartResultReporter& operator=( + const DefaultPerThreadTestPartResultReporter&) = delete; }; // The private implementation of the UnitTest class. We don't protect @@ -640,7 +632,8 @@ class GTEST_API_ UnitTestImpl { // For example, if Foo() calls Bar(), which in turn calls // CurrentOsStackTraceExceptTop(1), Foo() will be included in the // trace but Bar() and CurrentOsStackTraceExceptTop() won't. - std::string CurrentOsStackTraceExceptTop(int skip_count) GTEST_NO_INLINE_; + std::string CurrentOsStackTraceExceptTop(int skip_count) + GTEST_NO_INLINE_ GTEST_NO_TAIL_CALL_; // Finds and returns a TestSuite with the given name. If one doesn't // exist, creates one and returns it. @@ -744,9 +737,7 @@ class GTEST_API_ UnitTestImpl { } // Clears the results of ad-hoc test assertions. - void ClearAdHocTestResult() { - ad_hoc_test_result_.Clear(); - } + void ClearAdHocTestResult() { ad_hoc_test_result_.Clear(); } // Adds a TestProperty to the current TestResult object when invoked in a // context of a test or a test suite, or to the global property set. If the @@ -754,10 +745,7 @@ class GTEST_API_ UnitTestImpl { // updated. 
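A change repeated across these hunks: every GTEST_DISALLOW_COPY_AND_ASSIGN_(Type) macro invocation becomes a pair of explicitly deleted special members, the idiomatic post-C++11 spelling of a non-copyable class:

    class NonCopyableSketch {
     public:
      NonCopyableSketch() = default;
      NonCopyableSketch(const NonCopyableSketch&) = delete;
      NonCopyableSketch& operator=(const NonCopyableSketch&) = delete;
    };

Deleted members state the intent at the declaration site without relying on a helper macro.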
void RecordProperty(const TestProperty& test_property); - enum ReactionToSharding { - HONOR_SHARDING_PROTOCOL, - IGNORE_SHARDING_PROTOCOL - }; + enum ReactionToSharding { HONOR_SHARDING_PROTOCOL, IGNORE_SHARDING_PROTOCOL }; // Matches the full name of each test against the user-specified // filter to decide whether the test should run, then records the @@ -963,7 +951,8 @@ class GTEST_API_ UnitTestImpl { // starts. bool catch_exceptions_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTestImpl); + UnitTestImpl(const UnitTestImpl&) = delete; + UnitTestImpl& operator=(const UnitTestImpl&) = delete; }; // class UnitTestImpl // Convenience function for accessing the global UnitTest @@ -986,8 +975,9 @@ GTEST_API_ bool IsValidEscape(char ch); GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch); GTEST_API_ bool ValidateRegex(const char* regex); GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str); -GTEST_API_ bool MatchRepetitionAndRegexAtHead( - bool escaped, char ch, char repeat, const char* regex, const char* str); +GTEST_API_ bool MatchRepetitionAndRegexAtHead(bool escaped, char ch, + char repeat, const char* regex, + const char* str); GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str); #endif // GTEST_USES_SIMPLE_RE @@ -1089,8 +1079,7 @@ class StreamingListener : public EmptyTestEventListener { } ~SocketWriter() override { - if (sockfd_ != -1) - CloseConnection(); + if (sockfd_ != -1) CloseConnection(); } // Sends a string to the socket. @@ -1100,9 +1089,8 @@ class StreamingListener : public EmptyTestEventListener { const auto len = static_cast(message.length()); if (write(sockfd_, message.c_str(), len) != static_cast(len)) { - GTEST_LOG_(WARNING) - << "stream_result_to: failed to stream to " - << host_name_ << ":" << port_num_; + GTEST_LOG_(WARNING) << "stream_result_to: failed to stream to " + << host_name_ << ":" << port_num_; } } @@ -1123,7 +1111,8 @@ class StreamingListener : public EmptyTestEventListener { const std::string host_name_; const std::string port_num_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(SocketWriter); + SocketWriter(const SocketWriter&) = delete; + SocketWriter& operator=(const SocketWriter&) = delete; }; // class SocketWriter // Escapes '=', '&', '%', and '\n' characters in str as "%xx". 
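The trailing comment above specifies a tiny percent-encoding: '=', '&', '%', and '\n' become "%xx" so they cannot collide with the key=value&... wire format the streaming listener emits. A self-contained sketch matching that description (the function name is illustrative):

    #include <cstdio>
    #include <string>

    std::string EscapeForWireFormatSketch(const std::string& str) {
      std::string result;
      for (const char ch : str) {
        if (ch == '%' || ch == '=' || ch == '&' || ch == '\n') {
          char buf[4];  // "%xx" plus NUL
          std::snprintf(buf, sizeof(buf), "%%%02x",
                        static_cast<unsigned>(static_cast<unsigned char>(ch)));
          result += buf;  // e.g. '\n' -> "%0a"
        } else {
          result += ch;
        }
      }
      return result;
    }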
@@ -1135,7 +1124,9 @@ class StreamingListener : public EmptyTestEventListener { } explicit StreamingListener(AbstractSocketWriter* socket_writer) - : socket_writer_(socket_writer) { Start(); } + : socket_writer_(socket_writer) { + Start(); + } void OnTestProgramStart(const UnitTest& /* unit_test */) override { SendLn("event=TestProgramStart"); @@ -1158,22 +1149,22 @@ class StreamingListener : public EmptyTestEventListener { void OnTestIterationEnd(const UnitTest& unit_test, int /* iteration */) override { - SendLn("event=TestIterationEnd&passed=" + - FormatBool(unit_test.Passed()) + "&elapsed_time=" + - StreamableToString(unit_test.elapsed_time()) + "ms"); + SendLn("event=TestIterationEnd&passed=" + FormatBool(unit_test.Passed()) + + "&elapsed_time=" + StreamableToString(unit_test.elapsed_time()) + + "ms"); } // Note that "event=TestCaseStart" is a wire format and has to remain // "case" for compatibility - void OnTestCaseStart(const TestCase& test_case) override { - SendLn(std::string("event=TestCaseStart&name=") + test_case.name()); + void OnTestSuiteStart(const TestSuite& test_suite) override { + SendLn(std::string("event=TestCaseStart&name=") + test_suite.name()); } // Note that "event=TestCaseEnd" is a wire format and has to remain // "case" for compatibility - void OnTestCaseEnd(const TestCase& test_case) override { - SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed()) + - "&elapsed_time=" + StreamableToString(test_case.elapsed_time()) + + void OnTestSuiteEnd(const TestSuite& test_suite) override { + SendLn("event=TestCaseEnd&passed=" + FormatBool(test_suite.Passed()) + + "&elapsed_time=" + StreamableToString(test_suite.elapsed_time()) + "ms"); } @@ -1183,8 +1174,7 @@ class StreamingListener : public EmptyTestEventListener { void OnTestEnd(const TestInfo& test_info) override { SendLn("event=TestEnd&passed=" + - FormatBool((test_info.result())->Passed()) + - "&elapsed_time=" + + FormatBool((test_info.result())->Passed()) + "&elapsed_time=" + StreamableToString((test_info.result())->elapsed_time()) + "ms"); } @@ -1208,7 +1198,8 @@ class StreamingListener : public EmptyTestEventListener { const std::unique_ptr socket_writer_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener); + StreamingListener(const StreamingListener&) = delete; + StreamingListener& operator=(const StreamingListener&) = delete; }; // class StreamingListener #endif // GTEST_CAN_STREAM_RESULTS_ diff --git a/third_party/googletest/src/src/gtest-matchers.cc b/third_party/googletest/src/src/gtest-matchers.cc index 65104ebab1..7e3bcc0cff 100644 --- a/third_party/googletest/src/src/gtest-matchers.cc +++ b/third_party/googletest/src/src/gtest-matchers.cc @@ -32,12 +32,13 @@ // This file implements just enough of the matcher interface to allow // EXPECT_DEATH and friends to accept a matcher argument. -#include "gtest/internal/gtest-internal.h" -#include "gtest/internal/gtest-port.h" #include "gtest/gtest-matchers.h" #include +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-port.h" + namespace testing { // Constructs a matcher that matches a const std::string& whose value is diff --git a/third_party/googletest/src/src/gtest-port.cc b/third_party/googletest/src/src/gtest-port.cc index 53a4d37f97..d797fe4d58 100644 --- a/third_party/googletest/src/src/gtest-port.cc +++ b/third_party/googletest/src/src/gtest-port.cc @@ -27,61 +27,62 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- #include "gtest/internal/gtest-port.h" #include #include #include #include + #include #include #include #if GTEST_OS_WINDOWS -# include -# include -# include -# include // Used in ThreadLocal. -# ifdef _MSC_VER -# include -# endif // _MSC_VER +#include +#include +#include + +#include // Used in ThreadLocal. +#ifdef _MSC_VER +#include +#endif // _MSC_VER #else -# include +#include #endif // GTEST_OS_WINDOWS #if GTEST_OS_MAC -# include -# include -# include +#include +#include +#include #endif // GTEST_OS_MAC #if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \ GTEST_OS_NETBSD || GTEST_OS_OPENBSD -# include -# if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD -# include -# endif +#include +#if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD +#include +#endif #endif #if GTEST_OS_QNX -# include -# include -# include +#include +#include +#include #endif // GTEST_OS_QNX #if GTEST_OS_AIX -# include -# include +#include +#include #endif // GTEST_OS_AIX #if GTEST_OS_FUCHSIA -# include -# include +#include +#include #endif // GTEST_OS_FUCHSIA -#include "gtest/gtest-spi.h" #include "gtest/gtest-message.h" +#include "gtest/gtest-spi.h" #include "gtest/internal/gtest-internal.h" #include "gtest/internal/gtest-string.h" #include "src/gtest-internal-inl.h" @@ -89,16 +90,7 @@ namespace testing { namespace internal { -#if defined(_MSC_VER) || defined(__BORLANDC__) -// MSVC and C++Builder do not provide a definition of STDERR_FILENO. -const int kStdOutFileno = 1; -const int kStdErrFileno = 2; -#else -const int kStdOutFileno = STDOUT_FILENO; -const int kStdErrFileno = STDERR_FILENO; -#endif // _MSC_VER - -#if GTEST_OS_LINUX +#if GTEST_OS_LINUX || GTEST_OS_GNU_HURD namespace { template @@ -131,8 +123,7 @@ size_t GetThreadCount() { if (status == KERN_SUCCESS) { // task_threads allocates resources in thread_list and we need to free them // to avoid leaks. - vm_deallocate(task, - reinterpret_cast(thread_list), + vm_deallocate(task, reinterpret_cast(thread_list), sizeof(thread_t) * thread_count); return static_cast(thread_count); } else { @@ -141,7 +132,7 @@ size_t GetThreadCount() { } #elif GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \ - GTEST_OS_NETBSD + GTEST_OS_NETBSD #if GTEST_OS_NETBSD #undef KERN_PROC @@ -184,12 +175,12 @@ size_t GetThreadCount() { // we cannot detect it. 
size_t GetThreadCount() { int mib[] = { - CTL_KERN, - KERN_PROC, - KERN_PROC_PID | KERN_PROC_SHOW_THREADS, - getpid(), - sizeof(struct kinfo_proc), - 0, + CTL_KERN, + KERN_PROC, + KERN_PROC_PID | KERN_PROC_SHOW_THREADS, + getpid(), + sizeof(struct kinfo_proc), + 0, }; u_int miblen = sizeof(mib) / sizeof(mib[0]); @@ -210,8 +201,7 @@ size_t GetThreadCount() { // exclude empty members size_t nthreads = 0; for (size_t i = 0; i < size / static_cast(mib[4]); i++) { - if (info[i].p_tid != -1) - nthreads++; + if (info[i].p_tid != -1) nthreads++; } return nthreads; } @@ -254,13 +244,9 @@ size_t GetThreadCount() { size_t GetThreadCount() { int dummy_buffer; size_t avail; - zx_status_t status = zx_object_get_info( - zx_process_self(), - ZX_INFO_PROCESS_THREADS, - &dummy_buffer, - 0, - nullptr, - &avail); + zx_status_t status = + zx_object_get_info(zx_process_self(), ZX_INFO_PROCESS_THREADS, + &dummy_buffer, 0, nullptr, &avail); if (status == ZX_OK) { return avail; } else { @@ -280,27 +266,15 @@ size_t GetThreadCount() { #if GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS -void SleepMilliseconds(int n) { - ::Sleep(static_cast(n)); -} +AutoHandle::AutoHandle() : handle_(INVALID_HANDLE_VALUE) {} -AutoHandle::AutoHandle() - : handle_(INVALID_HANDLE_VALUE) {} +AutoHandle::AutoHandle(Handle handle) : handle_(handle) {} -AutoHandle::AutoHandle(Handle handle) - : handle_(handle) {} +AutoHandle::~AutoHandle() { Reset(); } -AutoHandle::~AutoHandle() { - Reset(); -} - -AutoHandle::Handle AutoHandle::Get() const { - return handle_; -} +AutoHandle::Handle AutoHandle::Get() const { return handle_; } -void AutoHandle::Reset() { - Reset(INVALID_HANDLE_VALUE); -} +void AutoHandle::Reset() { Reset(INVALID_HANDLE_VALUE); } void AutoHandle::Reset(HANDLE handle) { // Resetting with the same handle we already own is invalid. @@ -312,7 +286,7 @@ void AutoHandle::Reset(HANDLE handle) { } else { GTEST_CHECK_(!IsCloseable()) << "Resetting a valid handle to itself is likely a programmer error " - "and thus not allowed."; + "and thus not allowed."; } } @@ -322,23 +296,6 @@ bool AutoHandle::IsCloseable() const { return handle_ != nullptr && handle_ != INVALID_HANDLE_VALUE; } -Notification::Notification() - : event_(::CreateEvent(nullptr, // Default security attributes. - TRUE, // Do not reset automatically. - FALSE, // Initially unset. - nullptr)) { // Anonymous event. - GTEST_CHECK_(event_.Get() != nullptr); -} - -void Notification::Notify() { - GTEST_CHECK_(::SetEvent(event_.Get()) != FALSE); -} - -void Notification::WaitForNotification() { - GTEST_CHECK_( - ::WaitForSingleObject(event_.Get(), INFINITE) == WAIT_OBJECT_0); -} - Mutex::Mutex() : owner_thread_id_(0), type_(kDynamic), @@ -391,25 +348,25 @@ namespace { // MemoryIsNotDeallocated memory_is_not_deallocated; // critical_section_ = new CRITICAL_SECTION; // -class MemoryIsNotDeallocated -{ +class MemoryIsNotDeallocated { public: MemoryIsNotDeallocated() : old_crtdbg_flag_(0) { old_crtdbg_flag_ = _CrtSetDbgFlag(_CRTDBG_REPORT_FLAG); // Set heap allocation block type to _IGNORE_BLOCK so that MS debug CRT // doesn't report mem leak if there's no matching deallocation. 
- _CrtSetDbgFlag(old_crtdbg_flag_ & ~_CRTDBG_ALLOC_MEM_DF); + (void)_CrtSetDbgFlag(old_crtdbg_flag_ & ~_CRTDBG_ALLOC_MEM_DF); } ~MemoryIsNotDeallocated() { // Restore the original _CRTDBG_ALLOC_MEM_DF flag - _CrtSetDbgFlag(old_crtdbg_flag_); + (void)_CrtSetDbgFlag(old_crtdbg_flag_); } private: int old_crtdbg_flag_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(MemoryIsNotDeallocated); + MemoryIsNotDeallocated(const MemoryIsNotDeallocated&) = delete; + MemoryIsNotDeallocated& operator=(const MemoryIsNotDeallocated&) = delete; }; #endif // _MSC_VER @@ -435,15 +392,13 @@ void Mutex::ThreadSafeLazyInit() { ::InitializeCriticalSection(critical_section_); // Updates the critical_section_init_phase_ to 2 to signal // initialization complete. - GTEST_CHECK_(::InterlockedCompareExchange( - &critical_section_init_phase_, 2L, 1L) == - 1L); + GTEST_CHECK_(::InterlockedCompareExchange(&critical_section_init_phase_, + 2L, 1L) == 1L); break; case 1: // Somebody else is already initializing the mutex; spin until they // are done. - while (::InterlockedCompareExchange(&critical_section_init_phase_, - 2L, + while (::InterlockedCompareExchange(&critical_section_init_phase_, 2L, 2L) != 2L) { // Possibly yields the rest of the thread's time slice to other // threads. @@ -488,9 +443,7 @@ class ThreadWithParamSupport : public ThreadWithParamBase { private: struct ThreadMainParam { ThreadMainParam(Runnable* runnable, Notification* thread_can_start) - : runnable_(runnable), - thread_can_start_(thread_can_start) { - } + : runnable_(runnable), thread_can_start_(thread_can_start) {} std::unique_ptr runnable_; // Does not own. Notification* thread_can_start_; @@ -508,20 +461,18 @@ class ThreadWithParamSupport : public ThreadWithParamBase { // Prohibit instantiation. ThreadWithParamSupport(); - GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParamSupport); + ThreadWithParamSupport(const ThreadWithParamSupport&) = delete; + ThreadWithParamSupport& operator=(const ThreadWithParamSupport&) = delete; }; } // namespace -ThreadWithParamBase::ThreadWithParamBase(Runnable *runnable, +ThreadWithParamBase::ThreadWithParamBase(Runnable* runnable, Notification* thread_can_start) - : thread_(ThreadWithParamSupport::CreateThread(runnable, - thread_can_start)) { -} + : thread_( + ThreadWithParamSupport::CreateThread(runnable, thread_can_start)) {} -ThreadWithParamBase::~ThreadWithParamBase() { - Join(); -} +ThreadWithParamBase::~ThreadWithParamBase() { Join(); } void ThreadWithParamBase::Join() { GTEST_CHECK_(::WaitForSingleObject(thread_.Get(), INFINITE) == WAIT_OBJECT_0) @@ -548,8 +499,10 @@ class ThreadLocalRegistryImpl { ThreadIdToThreadLocals::iterator thread_local_pos = thread_to_thread_locals->find(current_thread); if (thread_local_pos == thread_to_thread_locals->end()) { - thread_local_pos = thread_to_thread_locals->insert( - std::make_pair(current_thread, ThreadLocalValues())).first; + thread_local_pos = + thread_to_thread_locals + ->insert(std::make_pair(current_thread, ThreadLocalValues())) + .first; StartWatcherThreadFor(current_thread); } ThreadLocalValues& thread_local_values = thread_local_pos->second; @@ -577,9 +530,8 @@ class ThreadLocalRegistryImpl { ThreadIdToThreadLocals* const thread_to_thread_locals = GetThreadLocalsMapLocked(); for (ThreadIdToThreadLocals::iterator it = - thread_to_thread_locals->begin(); - it != thread_to_thread_locals->end(); - ++it) { + thread_to_thread_locals->begin(); + it != thread_to_thread_locals->end(); ++it) { ThreadLocalValues& thread_local_values = it->second; ThreadLocalValues::iterator value_pos = 
thread_local_values.find(thread_local_instance); @@ -609,9 +561,8 @@ class ThreadLocalRegistryImpl { if (thread_local_pos != thread_to_thread_locals->end()) { ThreadLocalValues& thread_local_values = thread_local_pos->second; for (ThreadLocalValues::iterator value_pos = - thread_local_values.begin(); - value_pos != thread_local_values.end(); - ++value_pos) { + thread_local_values.begin(); + value_pos != thread_local_values.end(); ++value_pos) { value_holders.push_back(value_pos->second); } thread_to_thread_locals->erase(thread_local_pos); @@ -637,9 +588,8 @@ class ThreadLocalRegistryImpl { static void StartWatcherThreadFor(DWORD thread_id) { // The returned handle will be kept in thread_map and closed by // watcher_thread in WatcherThreadFunc. - HANDLE thread = ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION, - FALSE, - thread_id); + HANDLE thread = + ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION, FALSE, thread_id); GTEST_CHECK_(thread != nullptr); // We need to pass a valid thread ID pointer into CreateThread for it // to work correctly under Win98. @@ -650,7 +600,8 @@ class ThreadLocalRegistryImpl { &ThreadLocalRegistryImpl::WatcherThreadFunc, reinterpret_cast(new ThreadIdAndHandle(thread_id, thread)), CREATE_SUSPENDED, &watcher_thread_id); - GTEST_CHECK_(watcher_thread != nullptr); + GTEST_CHECK_(watcher_thread != nullptr) + << "CreateThread failed with error " << ::GetLastError() << "."; // Give the watcher thread the same priority as ours to avoid being // blocked by it. ::SetThreadPriority(watcher_thread, @@ -664,8 +615,7 @@ class ThreadLocalRegistryImpl { static DWORD WINAPI WatcherThreadFunc(LPVOID param) { const ThreadIdAndHandle* tah = reinterpret_cast(param); - GTEST_CHECK_( - ::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0); + GTEST_CHECK_(::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0); OnThreadExit(tah->first); ::CloseHandle(tah->second); delete tah; @@ -689,16 +639,17 @@ class ThreadLocalRegistryImpl { }; Mutex ThreadLocalRegistryImpl::mutex_(Mutex::kStaticMutex); // NOLINT -Mutex ThreadLocalRegistryImpl::thread_map_mutex_(Mutex::kStaticMutex); // NOLINT +Mutex ThreadLocalRegistryImpl::thread_map_mutex_( + Mutex::kStaticMutex); // NOLINT ThreadLocalValueHolderBase* ThreadLocalRegistry::GetValueOnCurrentThread( - const ThreadLocalBase* thread_local_instance) { + const ThreadLocalBase* thread_local_instance) { return ThreadLocalRegistryImpl::GetValueOnCurrentThread( thread_local_instance); } void ThreadLocalRegistry::OnThreadLocalDestroyed( - const ThreadLocalBase* thread_local_instance) { + const ThreadLocalBase* thread_local_instance) { ThreadLocalRegistryImpl::OnThreadLocalDestroyed(thread_local_instance); } @@ -786,7 +737,7 @@ bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); } bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); } bool IsAsciiWordChar(char ch) { return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') || - ('0' <= ch && ch <= '9') || ch == '_'; + ('0' <= ch && ch <= '9') || ch == '_'; } // Returns true if and only if "\\c" is a supported escape sequence. @@ -799,17 +750,28 @@ bool IsValidEscape(char c) { bool AtomMatchesChar(bool escaped, char pattern_char, char ch) { if (escaped) { // "\\p" where p is pattern_char. 
switch (pattern_char) { - case 'd': return IsAsciiDigit(ch); - case 'D': return !IsAsciiDigit(ch); - case 'f': return ch == '\f'; - case 'n': return ch == '\n'; - case 'r': return ch == '\r'; - case 's': return IsAsciiWhiteSpace(ch); - case 'S': return !IsAsciiWhiteSpace(ch); - case 't': return ch == '\t'; - case 'v': return ch == '\v'; - case 'w': return IsAsciiWordChar(ch); - case 'W': return !IsAsciiWordChar(ch); + case 'd': + return IsAsciiDigit(ch); + case 'D': + return !IsAsciiDigit(ch); + case 'f': + return ch == '\f'; + case 'n': + return ch == '\n'; + case 'r': + return ch == '\r'; + case 's': + return IsAsciiWhiteSpace(ch); + case 'S': + return !IsAsciiWhiteSpace(ch); + case 't': + return ch == '\t'; + case 'v': + return ch == '\v'; + case 'w': + return IsAsciiWordChar(ch); + case 'W': + return !IsAsciiWordChar(ch); } return IsAsciiPunct(pattern_char) && pattern_char == ch; } @@ -820,7 +782,8 @@ bool AtomMatchesChar(bool escaped, char pattern_char, char ch) { // Helper function used by ValidateRegex() to format error messages. static std::string FormatRegexSyntaxError(const char* regex, int index) { return (Message() << "Syntax error at index " << index - << " in simple regular expression \"" << regex << "\": ").GetString(); + << " in simple regular expression \"" << regex << "\": ") + .GetString(); } // Generates non-fatal failures and returns false if regex is invalid; @@ -862,12 +825,12 @@ bool ValidateRegex(const char* regex) { << "'$' can only appear at the end."; is_valid = false; } else if (IsInSet(ch, "()[]{}|")) { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i) - << "'" << ch << "' is unsupported."; + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) << "'" << ch + << "' is unsupported."; is_valid = false; } else if (IsRepeat(ch) && !prev_repeatable) { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i) - << "'" << ch << "' can only follow a repeatable token."; + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) << "'" << ch + << "' can only follow a repeatable token."; is_valid = false; } @@ -885,12 +848,10 @@ bool ValidateRegex(const char* regex) { // characters to be indexable by size_t, in which case the test will // probably time out anyway. We are fine with this limitation as // std::string has it too. -bool MatchRepetitionAndRegexAtHead( - bool escaped, char c, char repeat, const char* regex, - const char* str) { +bool MatchRepetitionAndRegexAtHead(bool escaped, char c, char repeat, + const char* regex, const char* str) { const size_t min_count = (repeat == '+') ? 1 : 0; - const size_t max_count = (repeat == '?') ? 1 : - static_cast(-1) - 1; + const size_t max_count = (repeat == '?') ? 1 : static_cast(-1) - 1; // We cannot call numeric_limits::max() as it conflicts with the // max() macro on Windows. @@ -903,8 +864,7 @@ bool MatchRepetitionAndRegexAtHead( // greedy match. return true; } - if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i])) - return false; + if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i])) return false; } return false; } @@ -918,25 +878,23 @@ bool MatchRegexAtHead(const char* regex, const char* str) { // "$" only matches the end of a string. Note that regex being // valid guarantees that there's nothing after "$" in it. - if (*regex == '$') - return *str == '\0'; + if (*regex == '$') return *str == '\0'; // Is the first thing in regex an escape sequence? 
const bool escaped = *regex == '\\'; - if (escaped) - ++regex; + if (escaped) ++regex; if (IsRepeat(regex[1])) { // MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so // here's an indirect recursion. It terminates as the regex gets // shorter in each recursion. - return MatchRepetitionAndRegexAtHead( - escaped, regex[0], regex[1], regex + 2, str); + return MatchRepetitionAndRegexAtHead(escaped, regex[0], regex[1], regex + 2, + str); } else { // regex isn't empty, isn't "$", and doesn't start with a // repetition. We match the first atom of regex with the first // character of str and recurse. return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) && - MatchRegexAtHead(regex + 1, str + 1); + MatchRegexAtHead(regex + 1, str + 1); } } @@ -951,13 +909,11 @@ bool MatchRegexAtHead(const char* regex, const char* str) { bool MatchRegexAnywhere(const char* regex, const char* str) { if (regex == nullptr || str == nullptr) return false; - if (*regex == '^') - return MatchRegexAtHead(regex + 1, str); + if (*regex == '^') return MatchRegexAtHead(regex + 1, str); // A successful match can be anywhere in str. do { - if (MatchRegexAtHead(regex, str)) - return true; + if (MatchRegexAtHead(regex, str)) return true; } while (*str++ != '\0'); return false; } @@ -1038,8 +994,8 @@ GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) { // FormatFileLocation in order to contrast the two functions. // Note that FormatCompilerIndependentFileLocation() does NOT append colon // to the file location it produces, unlike FormatFileLocation(). -GTEST_API_ ::std::string FormatCompilerIndependentFileLocation( - const char* file, int line) { +GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file, + int line) { const std::string file_name(file == nullptr ? kUnknownFile : file); if (line < 0) @@ -1050,12 +1006,13 @@ GTEST_API_ ::std::string FormatCompilerIndependentFileLocation( GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line) : severity_(severity) { - const char* const marker = - severity == GTEST_INFO ? "[ INFO ]" : - severity == GTEST_WARNING ? "[WARNING]" : - severity == GTEST_ERROR ? "[ ERROR ]" : "[ FATAL ]"; - GetStream() << ::std::endl << marker << " " - << FormatFileLocation(file, line).c_str() << ": "; + const char* const marker = severity == GTEST_INFO ? "[ INFO ]" + : severity == GTEST_WARNING ? "[WARNING]" + : severity == GTEST_ERROR ? "[ ERROR ]" + : "[ FATAL ]"; + GetStream() << ::std::endl + << marker << " " << FormatFileLocation(file, line).c_str() + << ": "; } // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program. @@ -1078,27 +1035,26 @@ class CapturedStream { public: // The ctor redirects the stream to a temporary file. explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) { -# if GTEST_OS_WINDOWS - char temp_dir_path[MAX_PATH + 1] = { '\0' }; // NOLINT - char temp_file_path[MAX_PATH + 1] = { '\0' }; // NOLINT +#if GTEST_OS_WINDOWS + char temp_dir_path[MAX_PATH + 1] = {'\0'}; // NOLINT + char temp_file_path[MAX_PATH + 1] = {'\0'}; // NOLINT ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path); - const UINT success = ::GetTempFileNameA(temp_dir_path, - "gtest_redir", + const UINT success = ::GetTempFileNameA(temp_dir_path, "gtest_redir", 0, // Generate unique file name. 
temp_file_path); GTEST_CHECK_(success != 0) << "Unable to create a temporary file in " << temp_dir_path; const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE); - GTEST_CHECK_(captured_fd != -1) << "Unable to open temporary file " - << temp_file_path; + GTEST_CHECK_(captured_fd != -1) + << "Unable to open temporary file " << temp_file_path; filename_ = temp_file_path; -# else +#else // There's no guarantee that a test has write access to the current // directory, so we create the temporary file in a temporary directory. std::string name_template; -# if GTEST_OS_LINUX_ANDROID +#if GTEST_OS_LINUX_ANDROID // Note: Android applications are expected to call the framework's // Context.getExternalStorageDirectory() method through JNI to get // the location of the world-writable SD Card directory. However, @@ -1111,7 +1067,7 @@ class CapturedStream { // '/sdcard' and other variants cannot be relied on, as they are not // guaranteed to be mounted, or may have a delay in mounting. name_template = "/data/local/tmp/"; -# elif GTEST_OS_IOS +#elif GTEST_OS_IOS char user_temp_dir[PATH_MAX + 1]; // Documented alternative to NSTemporaryDirectory() (for obtaining creating @@ -1132,9 +1088,9 @@ class CapturedStream { name_template = user_temp_dir; if (name_template.back() != GTEST_PATH_SEP_[0]) name_template.push_back(GTEST_PATH_SEP_[0]); -# else +#else name_template = "/tmp/"; -# endif +#endif name_template.append("gtest_captured_stream.XXXXXX"); // mkstemp() modifies the string bytes in place, and does not go beyond the @@ -1150,15 +1106,13 @@ class CapturedStream { << " for test; does the test have access to the /tmp directory?"; } filename_ = std::move(name_template); -# endif // GTEST_OS_WINDOWS +#endif // GTEST_OS_WINDOWS fflush(nullptr); dup2(captured_fd, fd_); close(captured_fd); } - ~CapturedStream() { - remove(filename_.c_str()); - } + ~CapturedStream() { remove(filename_.c_str()); } std::string GetCapturedString() { if (uncaptured_fd_ != -1) { @@ -1185,7 +1139,8 @@ class CapturedStream { // Name of the temporary file holding the stderr output. ::std::string filename_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream); + CapturedStream(const CapturedStream&) = delete; + CapturedStream& operator=(const CapturedStream&) = delete; }; GTEST_DISABLE_MSC_DEPRECATED_POP_() @@ -1213,6 +1168,15 @@ static std::string GetCapturedStream(CapturedStream** captured_stream) { return content; } +#if defined(_MSC_VER) || defined(__BORLANDC__) +// MSVC and C++Builder do not provide a definition of STDERR_FILENO. +const int kStdOutFileno = 1; +const int kStdErrFileno = 2; +#else +const int kStdOutFileno = STDOUT_FILENO; +const int kStdErrFileno = STDERR_FILENO; +#endif // defined(_MSC_VER) || defined(__BORLANDC__) + // Starts capturing stdout. void CaptureStdout() { CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout); @@ -1235,10 +1199,6 @@ std::string GetCapturedStderr() { #endif // GTEST_HAS_STREAM_REDIRECTION - - - - size_t GetFileSize(FILE* file) { fseek(file, 0, SEEK_END); return static_cast(ftell(file)); @@ -1256,7 +1216,8 @@ std::string ReadEntireFile(FILE* file) { // Keeps reading the file until we cannot read further or the // pre-determined file size is reached. 
do { - bytes_last_read = fread(buffer+bytes_read, 1, file_size-bytes_read, file); + bytes_last_read = + fread(buffer + bytes_read, 1, file_size - bytes_read, file); bytes_read += bytes_last_read; } while (bytes_last_read > 0 && bytes_read < file_size); @@ -1344,7 +1305,7 @@ bool ParseInt32(const Message& src_text, const char* str, int32_t* value) { // LONG_MAX or LONG_MIN when the input overflows.) result != long_value // The parsed value overflows as an int32_t. - ) { + ) { Message msg; msg << "WARNING: " << src_text << " is expected to be a 32-bit integer, but actually" @@ -1388,8 +1349,8 @@ int32_t Int32FromGTestEnv(const char* flag, int32_t default_value) { } int32_t result = default_value; - if (!ParseInt32(Message() << "Environment variable " << env_var, - string_value, &result)) { + if (!ParseInt32(Message() << "Environment variable " << env_var, string_value, + &result)) { printf("The default value %s is used.\n", (Message() << default_value).GetString().c_str()); fflush(stdout); @@ -1408,7 +1369,7 @@ int32_t Int32FromGTestEnv(const char* flag, int32_t default_value) { // not check that the flag is 'output' // In essence this checks an env variable called XML_OUTPUT_FILE // and if it is set we prepend "xml:" to its value, if it not set we return "" -std::string OutputFlagAlsoCheckEnvVar(){ +std::string OutputFlagAlsoCheckEnvVar() { std::string default_value_for_output_flag = ""; const char* xml_output_file_env = posix::GetEnv("XML_OUTPUT_FILE"); if (nullptr != xml_output_file_env) { diff --git a/third_party/googletest/src/src/gtest-printers.cc b/third_party/googletest/src/src/gtest-printers.cc index 1b68fcb500..f3976d230d 100644 --- a/third_party/googletest/src/src/gtest-printers.cc +++ b/third_party/googletest/src/src/gtest-printers.cc @@ -27,7 +27,6 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - // Google Test - The Google C++ Testing and Mocking Framework // // This file implements a universal value printer that can print a @@ -101,7 +100,7 @@ void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count, PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os); *os << " ... "; // Rounds up to 2-byte boundary. - const size_t resume_pos = (count - kChunkSize + 1)/2*2; + const size_t resume_pos = (count - kChunkSize + 1) / 2 * 2; PrintByteSegmentInObjectTo(obj_bytes, resume_pos, count - resume_pos, os); } *os << ">"; @@ -136,11 +135,7 @@ void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count, // - as is if it's a printable ASCII (e.g. 'a', '2', ' '), // - as a hexadecimal escape sequence (e.g. '\x7F'), or // - as a special escape sequence (e.g. '\r', '\n'). -enum CharFormat { - kAsIs, - kHexEscape, - kSpecialEscape -}; +enum CharFormat { kAsIs, kHexEscape, kSpecialEscape }; // Returns true if c is a printable ASCII character. 
We test the // value of c directly instead of calling isprint(), which is buggy on @@ -213,35 +208,21 @@ static CharFormat PrintAsStringLiteralTo(char32_t c, ostream* os) { } } -static const char* GetCharWidthPrefix(char) { - return ""; -} +static const char* GetCharWidthPrefix(char) { return ""; } -static const char* GetCharWidthPrefix(signed char) { - return ""; -} +static const char* GetCharWidthPrefix(signed char) { return ""; } -static const char* GetCharWidthPrefix(unsigned char) { - return ""; -} +static const char* GetCharWidthPrefix(unsigned char) { return ""; } #ifdef __cpp_char8_t -static const char* GetCharWidthPrefix(char8_t) { - return "u8"; -} +static const char* GetCharWidthPrefix(char8_t) { return "u8"; } #endif -static const char* GetCharWidthPrefix(char16_t) { - return "u"; -} +static const char* GetCharWidthPrefix(char16_t) { return "u"; } -static const char* GetCharWidthPrefix(char32_t) { - return "U"; -} +static const char* GetCharWidthPrefix(char32_t) { return "U"; } -static const char* GetCharWidthPrefix(wchar_t) { - return "L"; -} +static const char* GetCharWidthPrefix(wchar_t) { return "L"; } // Prints a char c as if it's part of a string literal, escaping it when // necessary; returns how c was formatted. @@ -276,8 +257,7 @@ void PrintCharAndCodeTo(Char c, ostream* os) { // To aid user debugging, we also print c's code in decimal, unless // it's 0 (in which case c was printed as '\\0', making the code // obvious). - if (c == 0) - return; + if (c == 0) return; *os << " (" << static_cast(c); // For more convenience, we print c's code again in hexadecimal, @@ -304,17 +284,60 @@ void PrintTo(char32_t c, ::std::ostream* os) { << static_cast(c); } +// gcc/clang __{u,}int128_t +#if defined(__SIZEOF_INT128__) +void PrintTo(__uint128_t v, ::std::ostream* os) { + if (v == 0) { + *os << "0"; + return; + } + + // Buffer large enough for ceil(log10(2^128))==39 and the null terminator + char buf[40]; + char* p = buf + sizeof(buf); + + // Some configurations have a __uint128_t, but no support for built in + // division. Do manual long division instead. + + uint64_t high = static_cast(v >> 64); + uint64_t low = static_cast(v); + + *--p = 0; + while (high != 0 || low != 0) { + uint64_t high_mod = high % 10; + high = high / 10; + // This is the long division algorithm specialized for a divisor of 10 and + // only two elements. + // Notable values: + // 2^64 / 10 == 1844674407370955161 + // 2^64 % 10 == 6 + const uint64_t carry = 6 * high_mod + low % 10; + low = low / 10 + high_mod * 1844674407370955161 + carry / 10; + + char digit = static_cast(carry % 10); + *--p = '0' + digit; + } + *os << p; +} +void PrintTo(__int128_t v, ::std::ostream* os) { + __uint128_t uv = static_cast<__uint128_t>(v); + if (v < 0) { + *os << "-"; + uv = -uv; + } + PrintTo(uv, os); +} +#endif // __SIZEOF_INT128__ + // Prints the given array of characters to the ostream. CharType must be either // char, char8_t, char16_t, char32_t, or wchar_t. // The array starts at begin, the length is len, it may include '\0' characters // and may not be NUL-terminated. 
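The new PrintTo(__uint128_t) above performs decimal conversion with only 64-bit operations: writing v = high * 2^64 + low and using 2^64 = 10 * 1844674407370955161 + 6, each step folds high % 10 into the low half. A quick standalone self-check of those constants (not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t q = 1844674407370955161ULL;  // 2^64 / 10
  const uint64_t r = 6ULL;                    // 2^64 % 10
  // 10*q + r equals 2^64 exactly, so it wraps to 0 in uint64_t arithmetic
  // (unsigned overflow is well defined).
  assert(10 * q + r == 0);
  return 0;
}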
template -GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ -GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ -GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ -GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ -static CharFormat PrintCharsAsStringTo( - const CharType* begin, size_t len, ostream* os) { +GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ + GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ + GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static CharFormat + PrintCharsAsStringTo(const CharType* begin, size_t len, ostream* os) { const char* const quote_prefix = GetCharWidthPrefix(*begin); *os << quote_prefix << "\""; bool is_previous_hex = false; @@ -340,12 +363,11 @@ static CharFormat PrintCharsAsStringTo( // Prints a (const) char/wchar_t array of 'len' elements, starting at address // 'begin'. CharType must be either char or wchar_t. template -GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ -GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ -GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ -GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ -static void UniversalPrintCharArray( - const CharType* begin, size_t len, ostream* os) { +GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ + GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ + GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static void + UniversalPrintCharArray(const CharType* begin, size_t len, + ostream* os) { // The code // const char kFoo[] = "foo"; // generates an array of 4, not 3, elements, with the last one being '\0'. @@ -436,28 +458,28 @@ void PrintTo(const wchar_t* s, ostream* os) { PrintCStringTo(s, os); } namespace { bool ContainsUnprintableControlCodes(const char* str, size_t length) { - const unsigned char *s = reinterpret_cast(str); + const unsigned char* s = reinterpret_cast(str); for (size_t i = 0; i < length; i++) { unsigned char ch = *s++; if (std::iscntrl(ch)) { - switch (ch) { + switch (ch) { case '\t': case '\n': case '\r': break; default: return true; - } } + } } return false; } -bool IsUTF8TrailByte(unsigned char t) { return 0x80 <= t && t<= 0xbf; } +bool IsUTF8TrailByte(unsigned char t) { return 0x80 <= t && t <= 0xbf; } bool IsValidUTF8(const char* str, size_t length) { - const unsigned char *s = reinterpret_cast(str); + const unsigned char* s = reinterpret_cast(str); for (size_t i = 0; i < length;) { unsigned char lead = s[i++]; @@ -470,15 +492,13 @@ bool IsValidUTF8(const char* str, size_t length) { } else if (lead <= 0xdf && (i + 1) <= length && IsUTF8TrailByte(s[i])) { ++i; // 2-byte character } else if (0xe0 <= lead && lead <= 0xef && (i + 2) <= length && - IsUTF8TrailByte(s[i]) && - IsUTF8TrailByte(s[i + 1]) && + IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) && // check for non-shortest form and surrogate (lead != 0xe0 || s[i] >= 0xa0) && (lead != 0xed || s[i] < 0xa0)) { i += 2; // 3-byte character } else if (0xf0 <= lead && lead <= 0xf4 && (i + 3) <= length && - IsUTF8TrailByte(s[i]) && - IsUTF8TrailByte(s[i + 1]) && + IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) && IsUTF8TrailByte(s[i + 2]) && // check for non-shortest form (lead != 0xf0 || s[i] >= 0x90) && @@ -502,7 +522,7 @@ void ConditionalPrintAsText(const char* str, size_t length, ostream* os) { void PrintStringTo(const ::std::string& s, ostream* os) { if (PrintCharsAsStringTo(s.data(), s.size(), os) == kHexEscape) { - if (GTEST_FLAG(print_utf8)) { + if (GTEST_FLAG_GET(print_utf8)) { ConditionalPrintAsText(s.data(), s.size(), os); } } diff --git a/third_party/googletest/src/src/gtest-test-part.cc b/third_party/googletest/src/src/gtest-test-part.cc index a938683ced..eb7c8d1cf9 100644 --- 
a/third_party/googletest/src/src/gtest-test-part.cc +++ b/third_party/googletest/src/src/gtest-test-part.cc @@ -51,13 +51,11 @@ std::ostream& operator<<(std::ostream& os, const TestPartResult& result) { return os << internal::FormatFileLocation(result.file_name(), result.line_number()) << " " - << (result.type() == TestPartResult::kSuccess - ? "Success" - : result.type() == TestPartResult::kSkip - ? "Skipped" - : result.type() == TestPartResult::kFatalFailure - ? "Fatal failure" - : "Non-fatal failure") + << (result.type() == TestPartResult::kSuccess ? "Success" + : result.type() == TestPartResult::kSkip ? "Skipped" + : result.type() == TestPartResult::kFatalFailure + ? "Fatal failure" + : "Non-fatal failure") << ":\n" << result.message() << std::endl; } @@ -86,8 +84,8 @@ namespace internal { HasNewFatalFailureHelper::HasNewFatalFailureHelper() : has_new_fatal_failure_(false), - original_reporter_(GetUnitTestImpl()-> - GetTestPartResultReporterForCurrentThread()) { + original_reporter_( + GetUnitTestImpl()->GetTestPartResultReporterForCurrentThread()) { GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this); } @@ -98,8 +96,7 @@ HasNewFatalFailureHelper::~HasNewFatalFailureHelper() { void HasNewFatalFailureHelper::ReportTestPartResult( const TestPartResult& result) { - if (result.fatally_failed()) - has_new_fatal_failure_ = true; + if (result.fatally_failed()) has_new_fatal_failure_ = true; original_reporter_->ReportTestPartResult(result); } diff --git a/third_party/googletest/src/src/gtest-typed-test.cc b/third_party/googletest/src/src/gtest-typed-test.cc index c02c3df659..a2828b83c6 100644 --- a/third_party/googletest/src/src/gtest-typed-test.cc +++ b/third_party/googletest/src/src/gtest-typed-test.cc @@ -27,7 +27,6 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - #include "gtest/gtest-typed-test.h" #include "gtest/gtest.h" @@ -38,8 +37,7 @@ namespace internal { // Skips to the first non-space char in str. Returns an empty string if str // contains only whitespace characters. static const char* SkipSpaces(const char* str) { - while (IsSpace(*str)) - str++; + while (IsSpace(*str)) str++; return str; } @@ -85,8 +83,7 @@ const char* TypedTestSuitePState::VerifyRegisteredTestNames( } for (RegisteredTestIter it = registered_tests_.begin(); - it != registered_tests_.end(); - ++it) { + it != registered_tests_.end(); ++it) { if (tests.count(it->first) == 0) { errors << "You forgot to list test " << it->first << ".\n"; } diff --git a/third_party/googletest/src/src/gtest.cc b/third_party/googletest/src/src/gtest.cc index 21c611aff1..6f31dd2260 100644 --- a/third_party/googletest/src/src/gtest.cc +++ b/third_party/googletest/src/src/gtest.cc @@ -31,8 +31,6 @@ // The Google C++ Testing and Mocking Framework (Google Test) #include "gtest/gtest.h" -#include "gtest/internal/custom/gtest.h" -#include "gtest/gtest-spi.h" #include #include @@ -46,79 +44,87 @@ #include // NOLINT #include #include +#include #include +#include #include #include #include #include // NOLINT #include +#include #include +#include "gtest/gtest-assertion-result.h" +#include "gtest/gtest-spi.h" +#include "gtest/internal/custom/gtest.h" + #if GTEST_OS_LINUX -# include // NOLINT -# include // NOLINT -# include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT // Declares vsnprintf(). This header is not available on Windows. 
-# include <strings.h>  // NOLINT
-# include <sys/mman.h>  // NOLINT
-# include <sys/time.h>  // NOLINT
-# include <unistd.h>  // NOLINT
-# include <string>
+#include <strings.h>   // NOLINT
+#include <sys/mman.h>  // NOLINT
+#include <sys/time.h>  // NOLINT
+#include <unistd.h>    // NOLINT
+
+#include <string>

 #elif GTEST_OS_ZOS
-# include <sys/time.h>  // NOLINT
+#include <sys/time.h>  // NOLINT

 // On z/OS we additionally need strings.h for strcasecmp.
-# include <strings.h>  // NOLINT
+#include <strings.h>  // NOLINT

 #elif GTEST_OS_WINDOWS_MOBILE  // We are on Windows CE.

-# include <windows.h>  // NOLINT
-# undef min
+#include <windows.h>  // NOLINT
+#undef min

 #elif GTEST_OS_WINDOWS  // We are on Windows proper.

-# include <windows.h>  // NOLINT
-# undef min
+#include <windows.h>  // NOLINT
+#undef min

 #ifdef _MSC_VER
-# include <crtdbg.h>  // NOLINT
+#include <crtdbg.h>  // NOLINT
 #endif

-# include <io.h>  // NOLINT
-# include <sys/timeb.h>  // NOLINT
-# include <sys/types.h>  // NOLINT
-# include <sys/stat.h>  // NOLINT
+#include <io.h>         // NOLINT
+#include <sys/timeb.h>  // NOLINT
+#include <sys/types.h>  // NOLINT
+#include <sys/stat.h>   // NOLINT

-# if GTEST_OS_WINDOWS_MINGW
-#  include <sys/time.h>  // NOLINT
-# endif  // GTEST_OS_WINDOWS_MINGW
+#if GTEST_OS_WINDOWS_MINGW
+#include <sys/time.h>  // NOLINT
+#endif  // GTEST_OS_WINDOWS_MINGW

 #else

 // cpplint thinks that the header is already included, so we want to
 // silence it.
-# include <sys/time.h>  // NOLINT
-# include <unistd.h>  // NOLINT
+#include <sys/time.h>  // NOLINT
+#include <unistd.h>    // NOLINT

 #endif  // GTEST_OS_LINUX

 #if GTEST_HAS_EXCEPTIONS
-# include <stdexcept>
+#include <stdexcept>
 #endif

 #if GTEST_CAN_STREAM_RESULTS_
-# include <arpa/inet.h>  // NOLINT
-# include <netdb.h>  // NOLINT
-# include <sys/socket.h>  // NOLINT
-# include <sys/types.h>  // NOLINT
+#include <arpa/inet.h>   // NOLINT
+#include <netdb.h>       // NOLINT
+#include <sys/socket.h>  // NOLINT
+#include <sys/types.h>   // NOLINT
 #endif

 #include "src/gtest-internal-inl.h"

 #if GTEST_OS_WINDOWS
-# define vsnprintf _vsnprintf
+#define vsnprintf _vsnprintf
 #endif  // GTEST_OS_WINDOWS

 #if GTEST_OS_MAC
@@ -131,7 +137,10 @@
 #include "absl/debugging/failure_signal_handler.h"
 #include "absl/debugging/stacktrace.h"
 #include "absl/debugging/symbolize.h"
+#include "absl/flags/parse.h"
+#include "absl/flags/usage.h"
 #include "absl/strings/str_cat.h"
+#include "absl/strings/str_replace.h"
 #endif  // GTEST_HAS_ABSL

 namespace testing {

@@ -177,7 +186,7 @@
 const char kStackTraceMarker[] = "\nStack trace:\n";

 // is specified on the command line.
bool g_help_flag = false; -// Utilty function to Open File for Writing +// Utility function to Open File for Writing static FILE* OpenFileForWriting(const std::string& output_file) { FILE* fileout = nullptr; FilePath output_file_path(output_file); @@ -216,28 +225,33 @@ static bool GetDefaultFailFast() { return false; } +} // namespace testing + GTEST_DEFINE_bool_( - fail_fast, internal::BoolFromGTestEnv("fail_fast", GetDefaultFailFast()), + fail_fast, + testing::internal::BoolFromGTestEnv("fail_fast", + testing::GetDefaultFailFast()), "True if and only if a test failure should stop further test execution."); GTEST_DEFINE_bool_( also_run_disabled_tests, - internal::BoolFromGTestEnv("also_run_disabled_tests", false), + testing::internal::BoolFromGTestEnv("also_run_disabled_tests", false), "Run disabled tests too, in addition to the tests normally being run."); GTEST_DEFINE_bool_( - break_on_failure, internal::BoolFromGTestEnv("break_on_failure", false), + break_on_failure, + testing::internal::BoolFromGTestEnv("break_on_failure", false), "True if and only if a failed assertion should be a debugger " "break-point."); GTEST_DEFINE_bool_(catch_exceptions, - internal::BoolFromGTestEnv("catch_exceptions", true), + testing::internal::BoolFromGTestEnv("catch_exceptions", + true), "True if and only if " GTEST_NAME_ " should catch exceptions and treat them as test failures."); GTEST_DEFINE_string_( - color, - internal::StringFromGTestEnv("color", "auto"), + color, testing::internal::StringFromGTestEnv("color", "auto"), "Whether to use colors in the output. Valid values: yes, no, " "and auto. 'auto' means to use colors if the output is " "being sent to a terminal and the TERM environment variable " @@ -245,7 +259,8 @@ GTEST_DEFINE_string_( GTEST_DEFINE_string_( filter, - internal::StringFromGTestEnv("filter", GetDefaultFilter()), + testing::internal::StringFromGTestEnv("filter", + testing::GetDefaultFilter()), "A colon-separated list of glob (not regex) patterns " "for filtering the tests to run, optionally followed by a " "'-' and a : separated list of negative patterns (tests to " @@ -254,13 +269,14 @@ GTEST_DEFINE_string_( GTEST_DEFINE_bool_( install_failure_signal_handler, - internal::BoolFromGTestEnv("install_failure_signal_handler", false), - "If true and supported on the current platform, " GTEST_NAME_ " should " + testing::internal::BoolFromGTestEnv("install_failure_signal_handler", + false), + "If true and supported on the current platform, " GTEST_NAME_ + " should " "install a signal handler that dumps debugging information when fatal " "signals are raised."); -GTEST_DEFINE_bool_(list_tests, false, - "List all tests without running them."); +GTEST_DEFINE_bool_(list_tests, false, "List all tests without running them."); // The net priority order after flag processing is thus: // --gtest_output command line flag @@ -269,8 +285,8 @@ GTEST_DEFINE_bool_(list_tests, false, // '' GTEST_DEFINE_string_( output, - internal::StringFromGTestEnv("output", - internal::OutputFlagAlsoCheckEnvVar().c_str()), + testing::internal::StringFromGTestEnv( + "output", testing::internal::OutputFlagAlsoCheckEnvVar().c_str()), "A format (defaults to \"xml\" but can be specified to be \"json\"), " "optionally followed by a colon and an output file name or directory. " "A directory is indicated by a trailing pathname separator. 
" @@ -281,65 +297,79 @@ GTEST_DEFINE_string_( "digits."); GTEST_DEFINE_bool_( - brief, internal::BoolFromGTestEnv("brief", false), + brief, testing::internal::BoolFromGTestEnv("brief", false), "True if only test failures should be displayed in text output."); -GTEST_DEFINE_bool_(print_time, internal::BoolFromGTestEnv("print_time", true), +GTEST_DEFINE_bool_(print_time, + testing::internal::BoolFromGTestEnv("print_time", true), "True if and only if " GTEST_NAME_ " should display elapsed time in text output."); -GTEST_DEFINE_bool_(print_utf8, internal::BoolFromGTestEnv("print_utf8", true), +GTEST_DEFINE_bool_(print_utf8, + testing::internal::BoolFromGTestEnv("print_utf8", true), "True if and only if " GTEST_NAME_ " prints UTF8 characters as text."); GTEST_DEFINE_int32_( - random_seed, - internal::Int32FromGTestEnv("random_seed", 0), + random_seed, testing::internal::Int32FromGTestEnv("random_seed", 0), "Random number seed to use when shuffling test orders. Must be in range " "[1, 99999], or 0 to use a seed based on the current time."); GTEST_DEFINE_int32_( - repeat, - internal::Int32FromGTestEnv("repeat", 1), + repeat, testing::internal::Int32FromGTestEnv("repeat", 1), "How many times to repeat each test. Specify a negative number " "for repeating forever. Useful for shaking out flaky tests."); +GTEST_DEFINE_bool_( + recreate_environments_when_repeating, + testing::internal::BoolFromGTestEnv("recreate_environments_when_repeating", + false), + "Controls whether global test environments are recreated for each repeat " + "of the tests. If set to false the global test environments are only set " + "up once, for the first iteration, and only torn down once, for the last. " + "Useful for shaking out flaky tests with stable, expensive test " + "environments. If --gtest_repeat is set to a negative number, meaning " + "there is no last run, the environments will always be recreated to avoid " + "leaks."); + GTEST_DEFINE_bool_(show_internal_stack_frames, false, "True if and only if " GTEST_NAME_ " should include internal stack frames when " "printing test failure stack traces."); -GTEST_DEFINE_bool_(shuffle, internal::BoolFromGTestEnv("shuffle", false), +GTEST_DEFINE_bool_(shuffle, + testing::internal::BoolFromGTestEnv("shuffle", false), "True if and only if " GTEST_NAME_ " should randomize tests' order on every run."); GTEST_DEFINE_int32_( stack_trace_depth, - internal::Int32FromGTestEnv("stack_trace_depth", kMaxStackTraceDepth), + testing::internal::Int32FromGTestEnv("stack_trace_depth", + testing::kMaxStackTraceDepth), "The maximum number of stack frames to print when an " "assertion fails. The valid range is 0 through 100, inclusive."); GTEST_DEFINE_string_( stream_result_to, - internal::StringFromGTestEnv("stream_result_to", ""), + testing::internal::StringFromGTestEnv("stream_result_to", ""), "This flag specifies the host name and the port number on which to stream " "test results. Example: \"localhost:555\". The flag is effective only on " "Linux."); GTEST_DEFINE_bool_( throw_on_failure, - internal::BoolFromGTestEnv("throw_on_failure", false), + testing::internal::BoolFromGTestEnv("throw_on_failure", false), "When this flag is specified, a failed assertion will throw an exception " "if exceptions are enabled or exit the program with a non-zero code " "otherwise. 
For use with an external test framework."); #if GTEST_USE_OWN_FLAGFILE_FLAG_ GTEST_DEFINE_string_( - flagfile, - internal::StringFromGTestEnv("flagfile", ""), + flagfile, testing::internal::StringFromGTestEnv("flagfile", ""), "This flag specifies the flagfile to read command-line flags from."); #endif // GTEST_USE_OWN_FLAGFILE_FLAG_ +namespace testing { namespace internal { // Generates a random number from [0, range), using a Linear @@ -348,10 +378,9 @@ namespace internal { uint32_t Random::Generate(uint32_t range) { // These constants are the same as are used in glibc's rand(3). // Use wider types than necessary to prevent unsigned overflow diagnostics. - state_ = static_cast(1103515245ULL*state_ + 12345U) % kMaxRange; + state_ = static_cast(1103515245ULL * state_ + 12345U) % kMaxRange; - GTEST_CHECK_(range > 0) - << "Cannot generate a number in the range [0, 0)."; + GTEST_CHECK_(range > 0) << "Cannot generate a number in the range [0, 0)."; GTEST_CHECK_(range <= kMaxRange) << "Generation of a number in [0, " << range << ") was requested, " << "but this can only generate numbers in [0, " << kMaxRange << ")."; @@ -396,32 +425,26 @@ static bool ShouldRunTestSuite(const TestSuite* test_suite) { } // AssertHelper constructor. -AssertHelper::AssertHelper(TestPartResult::Type type, - const char* file, - int line, - const char* message) - : data_(new AssertHelperData(type, file, line, message)) { -} +AssertHelper::AssertHelper(TestPartResult::Type type, const char* file, + int line, const char* message) + : data_(new AssertHelperData(type, file, line, message)) {} -AssertHelper::~AssertHelper() { - delete data_; -} +AssertHelper::~AssertHelper() { delete data_; } // Message assignment, for assertion streaming support. void AssertHelper::operator=(const Message& message) const { - UnitTest::GetInstance()-> - AddTestPartResult(data_->type, data_->file, data_->line, - AppendUserMessage(data_->message, message), - UnitTest::GetInstance()->impl() - ->CurrentOsStackTraceExceptTop(1) - // Skips the stack frame for this function itself. - ); // NOLINT + UnitTest::GetInstance()->AddTestPartResult( + data_->type, data_->file, data_->line, + AppendUserMessage(data_->message, message), + UnitTest::GetInstance()->impl()->CurrentOsStackTraceExceptTop(1) + // Skips the stack frame for this function itself. + ); // NOLINT } namespace { // When TEST_P is found without a matching INSTANTIATE_TEST_SUITE_P -// to creates test cases for it, a syntetic test case is +// to creates test cases for it, a synthetic test case is // inserted to report ether an error or a log message. // // This configuration bit will likely be removed at some point. 
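Random::Generate(), reflowed earlier in this hunk, is glibc rand(3)'s linear congruential generator reduced modulo the requested range. A standalone model of the same update rule (kMaxRange = 2^31, as in gtest-port.h; the final modulo slightly biases small ranges, which is acceptable for test shuffling):

#include <cstdint>

class LcgRandom {
 public:
  explicit LcgRandom(uint32_t seed) : state_(seed % kMaxRange) {}

  // Returns a value in [0, range). Caller must ensure 0 < range <= kMaxRange.
  uint32_t Generate(uint32_t range) {
    // Widen to 64 bits so the multiply cannot overflow, as the patch does.
    state_ =
        static_cast<uint32_t>(1103515245ULL * state_ + 12345U) % kMaxRange;
    return state_ % range;
  }

 private:
  static constexpr uint32_t kMaxRange = 1u << 31;
  uint32_t state_;
};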
@@ -452,7 +475,6 @@ class FailureTest : public Test { const bool as_error_; }; - } // namespace std::set* GetIgnoredParameterizedTestSuites() { @@ -496,7 +518,8 @@ void InsertSyntheticTestCase(const std::string& name, CodeLocation location, "To suppress this error for this test suite, insert the following line " "(in a non-header) in the namespace it is defined in:" "\n\n" - "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" + name + ");"; + "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" + + name + ");"; std::string full_name = "UninstantiatedParameterizedTestSuite<" + name + ">"; RegisterTest( // @@ -516,19 +539,18 @@ void RegisterTypeParameterizedTestSuite(const char* test_suite_name, } void RegisterTypeParameterizedTestSuiteInstantiation(const char* case_name) { - GetUnitTestImpl() - ->type_parameterized_test_registry() - .RegisterInstantiation(case_name); + GetUnitTestImpl()->type_parameterized_test_registry().RegisterInstantiation( + case_name); } void TypeParameterizedTestSuiteRegistry::RegisterTestSuite( const char* test_suite_name, CodeLocation code_location) { suites_.emplace(std::string(test_suite_name), - TypeParameterizedTestSuiteInfo(code_location)); + TypeParameterizedTestSuiteInfo(code_location)); } void TypeParameterizedTestSuiteRegistry::RegisterInstantiation( - const char* test_suite_name) { + const char* test_suite_name) { auto it = suites_.find(std::string(test_suite_name)); if (it != suites_.end()) { it->second.instantiated = true; @@ -606,7 +628,8 @@ FilePath GetCurrentExecutableName() { // Returns the output format, or "" for normal printed output. std::string UnitTestOptions::GetOutputFormat() { - const char* const gtest_output_flag = GTEST_FLAG(output).c_str(); + std::string s = GTEST_FLAG_GET(output); + const char* const gtest_output_flag = s.c_str(); const char* const colon = strchr(gtest_output_flag, ':'); return (colon == nullptr) ? std::string(gtest_output_flag) @@ -617,19 +640,19 @@ std::string UnitTestOptions::GetOutputFormat() { // Returns the name of the requested output file, or the default if none // was explicitly specified. 
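Note the shape of the GetOutputFormat() change above: GTEST_FLAG_GET(output) returns the flag by value, so the new code binds the result to a local std::string before taking c_str(); calling c_str() directly on the returned temporary would leave a dangling pointer. The parsing itself is just "format[:path]"; a hypothetical equivalent without raw pointers:

#include <string>

std::string FormatOf(const std::string& output_flag) {
  const std::string::size_type colon = output_flag.find(':');
  return colon == std::string::npos ? output_flag
                                    : output_flag.substr(0, colon);
}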
std::string UnitTestOptions::GetAbsolutePathToOutputFile() { - const char* const gtest_output_flag = GTEST_FLAG(output).c_str(); + std::string s = GTEST_FLAG_GET(output); + const char* const gtest_output_flag = s.c_str(); std::string format = GetOutputFormat(); - if (format.empty()) - format = std::string(kDefaultOutputFormat); + if (format.empty()) format = std::string(kDefaultOutputFormat); const char* const colon = strchr(gtest_output_flag, ':'); if (colon == nullptr) return internal::FilePath::MakeFileName( - internal::FilePath( - UnitTest::GetInstance()->original_working_dir()), - internal::FilePath(kDefaultOutputFile), 0, - format.c_str()).string(); + internal::FilePath( + UnitTest::GetInstance()->original_working_dir()), + internal::FilePath(kDefaultOutputFile), 0, format.c_str()) + .string(); internal::FilePath output_name(colon + 1); if (!output_name.IsAbsolutePath()) @@ -637,8 +660,7 @@ std::string UnitTestOptions::GetAbsolutePathToOutputFile() { internal::FilePath(UnitTest::GetInstance()->original_working_dir()), internal::FilePath(colon + 1)); - if (!output_name.IsDirectory()) - return output_name.string(); + if (!output_name.IsDirectory()) return output_name.string(); internal::FilePath result(internal::FilePath::GenerateUniqueFileName( output_name, internal::GetCurrentExecutableName(), @@ -699,59 +721,119 @@ static bool PatternMatchesString(const std::string& name_str, return true; } -bool UnitTestOptions::MatchesFilter(const std::string& name_str, - const char* filter) { - // The filter is a list of patterns separated by colons (:). - const char* pattern = filter; - while (true) { - // Find the bounds of this pattern. - const char* const next_sep = strchr(pattern, ':'); - const char* const pattern_end = - next_sep != nullptr ? next_sep : pattern + strlen(pattern); - - // Check if this pattern matches name_str. - if (PatternMatchesString(name_str, pattern, pattern_end)) { - return true; - } +namespace { + +bool IsGlobPattern(const std::string& pattern) { + return std::any_of(pattern.begin(), pattern.end(), + [](const char c) { return c == '?' || c == '*'; }); +} + +class UnitTestFilter { + public: + UnitTestFilter() = default; + + // Constructs a filter from a string of patterns separated by `:`. + explicit UnitTestFilter(const std::string& filter) { + // By design "" filter matches "" string. + std::vector all_patterns; + SplitString(filter, ':', &all_patterns); + const auto exact_match_patterns_begin = std::partition( + all_patterns.begin(), all_patterns.end(), &IsGlobPattern); + + glob_patterns_.reserve(static_cast( + std::distance(all_patterns.begin(), exact_match_patterns_begin))); + std::move(all_patterns.begin(), exact_match_patterns_begin, + std::inserter(glob_patterns_, glob_patterns_.begin())); + std::move( + exact_match_patterns_begin, all_patterns.end(), + std::inserter(exact_match_patterns_, exact_match_patterns_.begin())); + } + + // Returns true if and only if name matches at least one of the patterns in + // the filter. + bool MatchesName(const std::string& name) const { + return exact_match_patterns_.count(name) > 0 || + std::any_of(glob_patterns_.begin(), glob_patterns_.end(), + [&name](const std::string& pattern) { + return PatternMatchesString( + name, pattern.c_str(), + pattern.c_str() + pattern.size()); + }); + } + + private: + std::vector glob_patterns_; + std::unordered_set exact_match_patterns_; +}; - // Give up on this pattern. However, if we found a pattern separator (:), - // advance to the next pattern (skipping over the separator) and restart. 
- if (next_sep == nullptr) { - return false; +class PositiveAndNegativeUnitTestFilter { + public: + // Constructs a positive and a negative filter from a string. The string + // contains a positive filter optionally followed by a '-' character and a + // negative filter. In case only a negative filter is provided the positive + // filter will be assumed "*". + // A filter is a list of patterns separated by ':'. + explicit PositiveAndNegativeUnitTestFilter(const std::string& filter) { + std::vector positive_and_negative_filters; + + // NOTE: `SplitString` always returns a non-empty container. + SplitString(filter, '-', &positive_and_negative_filters); + const auto& positive_filter = positive_and_negative_filters.front(); + + if (positive_and_negative_filters.size() > 1) { + positive_filter_ = UnitTestFilter( + positive_filter.empty() ? kUniversalFilter : positive_filter); + + // TODO(b/214626361): Fail on multiple '-' characters + // For the moment to preserve old behavior we concatenate the rest of the + // string parts with `-` as separator to generate the negative filter. + auto negative_filter_string = positive_and_negative_filters[1]; + for (std::size_t i = 2; i < positive_and_negative_filters.size(); i++) + negative_filter_string = + negative_filter_string + '-' + positive_and_negative_filters[i]; + negative_filter_ = UnitTestFilter(negative_filter_string); + } else { + // In case we don't have a negative filter and positive filter is "" + // we do not use kUniversalFilter by design as opposed to when we have a + // negative filter. + positive_filter_ = UnitTestFilter(positive_filter); } - pattern = next_sep + 1; } - return true; + + // Returns true if and only if test name (this is generated by appending test + // suit name and test name via a '.' character) matches the positive filter + // and does not match the negative filter. + bool MatchesTest(const std::string& test_suite_name, + const std::string& test_name) const { + return MatchesName(test_suite_name + "." + test_name); + } + + // Returns true if and only if name matches the positive filter and does not + // match the negative filter. + bool MatchesName(const std::string& name) const { + return positive_filter_.MatchesName(name) && + !negative_filter_.MatchesName(name); + } + + private: + UnitTestFilter positive_filter_; + UnitTestFilter negative_filter_; +}; +} // namespace + +bool UnitTestOptions::MatchesFilter(const std::string& name_str, + const char* filter) { + return UnitTestFilter(filter).MatchesName(name_str); } // Returns true if and only if the user-specified filter matches the test // suite name and the test name. bool UnitTestOptions::FilterMatchesTest(const std::string& test_suite_name, const std::string& test_name) { - const std::string& full_name = test_suite_name + "." + test_name.c_str(); - // Split --gtest_filter at '-', if there is one, to separate into // positive filter and negative filter portions - const char* const p = GTEST_FLAG(filter).c_str(); - const char* const dash = strchr(p, '-'); - std::string positive; - std::string negative; - if (dash == nullptr) { - positive = GTEST_FLAG(filter).c_str(); // Whole string is a positive filter - negative = ""; - } else { - positive = std::string(p, dash); // Everything up to the dash - negative = std::string(dash + 1); // Everything after the dash - if (positive.empty()) { - // Treat '-test1' as the same as '*-test1' - positive = kUniversalFilter; - } - } - - // A filter is a colon-separated list of patterns. 
It matches a - // test if any pattern in it matches the test. - return (MatchesFilter(full_name, positive.c_str()) && - !MatchesFilter(full_name, negative.c_str())); + return PositiveAndNegativeUnitTestFilter(GTEST_FLAG_GET(filter)) + .MatchesTest(test_suite_name, test_name); } #if GTEST_HAS_SEH @@ -771,7 +853,7 @@ int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) { bool should_handle = true; - if (!GTEST_FLAG(catch_exceptions)) + if (!GTEST_FLAG_GET(catch_exceptions)) should_handle = false; else if (exception_code == EXCEPTION_BREAKPOINT) should_handle = false; @@ -789,8 +871,7 @@ int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) { // results. Intercepts only failures from the current thread. ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( TestPartResultArray* result) - : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD), - result_(result) { + : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD), result_(result) { Init(); } @@ -799,8 +880,7 @@ ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( // results. ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( InterceptMode intercept_mode, TestPartResultArray* result) - : intercept_mode_(intercept_mode), - result_(result) { + : intercept_mode_(intercept_mode), result_(result) { Init(); } @@ -844,9 +924,7 @@ namespace internal { // from user test code. GetTestTypeId() is guaranteed to always // return the same value, as it always calls GetTypeId<>() from the // gtest.cc, which is within the Google Test framework. -TypeId GetTestTypeId() { - return GetTypeId(); -} +TypeId GetTestTypeId() { return GetTypeId(); } // The value of GetTestTypeId() as seen from within the Google Test // library. This is solely for testing GetTestTypeId(). @@ -861,9 +939,9 @@ static AssertionResult HasOneFailure(const char* /* results_expr */, const TestPartResultArray& results, TestPartResult::Type type, const std::string& substr) { - const std::string expected(type == TestPartResult::kFatalFailure ? - "1 fatal failure" : - "1 non-fatal failure"); + const std::string expected(type == TestPartResult::kFatalFailure + ? 
"1 fatal failure" + : "1 non-fatal failure"); Message msg; if (results.size() != 1) { msg << "Expected: " << expected << "\n" @@ -882,10 +960,10 @@ static AssertionResult HasOneFailure(const char* /* results_expr */, } if (strstr(r.message(), substr.c_str()) == nullptr) { - return AssertionFailure() << "Expected: " << expected << " containing \"" - << substr << "\"\n" - << " Actual:\n" - << r; + return AssertionFailure() + << "Expected: " << expected << " containing \"" << substr << "\"\n" + << " Actual:\n" + << r; } return AssertionSuccess(); @@ -908,7 +986,8 @@ SingleFailureChecker::~SingleFailureChecker() { } DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter( - UnitTestImpl* unit_test) : unit_test_(unit_test) {} + UnitTestImpl* unit_test) + : unit_test_(unit_test) {} void DefaultGlobalTestPartResultReporter::ReportTestPartResult( const TestPartResult& result) { @@ -917,7 +996,8 @@ void DefaultGlobalTestPartResultReporter::ReportTestPartResult( } DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter( - UnitTestImpl* unit_test) : unit_test_(unit_test) {} + UnitTestImpl* unit_test) + : unit_test_(unit_test) {} void DefaultPerThreadTestPartResultReporter::ReportTestPartResult( const TestPartResult& result) { @@ -1024,11 +1104,10 @@ int UnitTestImpl::test_to_run_count() const { // trace but Bar() and CurrentOsStackTraceExceptTop() won't. std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) { return os_stack_trace_getter()->CurrentStackTrace( - static_cast(GTEST_FLAG(stack_trace_depth)), - skip_count + 1 + static_cast(GTEST_FLAG_GET(stack_trace_depth)), skip_count + 1 // Skips the user-specified number of frames plus this function // itself. - ); // NOLINT + ); // NOLINT } // A helper class for measuring elapsed times. @@ -1072,8 +1151,7 @@ LPCWSTR String::AnsiToUtf16(const char* ansi) { const int unicode_length = MultiByteToWideChar(CP_ACP, 0, ansi, length, nullptr, 0); WCHAR* unicode = new WCHAR[unicode_length + 1]; - MultiByteToWideChar(CP_ACP, 0, ansi, length, - unicode, unicode_length); + MultiByteToWideChar(CP_ACP, 0, ansi, length, unicode, unicode_length); unicode[unicode_length] = 0; return unicode; } @@ -1082,7 +1160,7 @@ LPCWSTR String::AnsiToUtf16(const char* ansi) { // memory using new. The caller is responsible for deleting the return // value using delete[]. Returns the ANSI string, or NULL if the // input is NULL. -const char* String::Utf16ToAnsi(LPCWSTR utf16_str) { +const char* String::Utf16ToAnsi(LPCWSTR utf16_str) { if (!utf16_str) return nullptr; const int ansi_length = WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, nullptr, 0, nullptr, nullptr); @@ -1101,7 +1179,7 @@ const char* String::Utf16ToAnsi(LPCWSTR utf16_str) { // Unlike strcmp(), this function can handle NULL argument(s). A NULL // C string is considered different to any non-NULL C string, // including the empty string. -bool String::CStringEquals(const char * lhs, const char * rhs) { +bool String::CStringEquals(const char* lhs, const char* rhs) { if (lhs == nullptr) return rhs == nullptr; if (rhs == nullptr) return false; @@ -1115,11 +1193,10 @@ bool String::CStringEquals(const char * lhs, const char * rhs) { // encoding, and streams the result to the given Message object. 
static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length, Message* msg) { - for (size_t i = 0; i != length; ) { // NOLINT + for (size_t i = 0; i != length;) { // NOLINT if (wstr[i] != L'\0') { *msg << WideStringToUtf8(wstr + i, static_cast(length - i)); - while (i != length && wstr[i] != L'\0') - i++; + while (i != length && wstr[i] != L'\0') i++; } else { *msg << '\0'; i++; @@ -1161,17 +1238,17 @@ Message::Message() : ss_(new ::std::stringstream) { // These two overloads allow streaming a wide C string to a Message // using the UTF-8 encoding. -Message& Message::operator <<(const wchar_t* wide_c_str) { +Message& Message::operator<<(const wchar_t* wide_c_str) { return *this << internal::String::ShowWideCString(wide_c_str); } -Message& Message::operator <<(wchar_t* wide_c_str) { +Message& Message::operator<<(wchar_t* wide_c_str) { return *this << internal::String::ShowWideCString(wide_c_str); } #if GTEST_HAS_STD_WSTRING // Converts the given wide string to a narrow string using the UTF-8 // encoding, and streams the result to this Message object. -Message& Message::operator <<(const ::std::wstring& wstr) { +Message& Message::operator<<(const ::std::wstring& wstr) { internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this); return *this; } @@ -1183,44 +1260,6 @@ std::string Message::GetString() const { return internal::StringStreamToString(ss_.get()); } -// AssertionResult constructors. -// Used in EXPECT_TRUE/FALSE(assertion_result). -AssertionResult::AssertionResult(const AssertionResult& other) - : success_(other.success_), - message_(other.message_.get() != nullptr - ? new ::std::string(*other.message_) - : static_cast< ::std::string*>(nullptr)) {} - -// Swaps two AssertionResults. -void AssertionResult::swap(AssertionResult& other) { - using std::swap; - swap(success_, other.success_); - swap(message_, other.message_); -} - -// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. -AssertionResult AssertionResult::operator!() const { - AssertionResult negation(!success_); - if (message_.get() != nullptr) negation << *message_; - return negation; -} - -// Makes a successful assertion result. -AssertionResult AssertionSuccess() { - return AssertionResult(true); -} - -// Makes a failed assertion result. -AssertionResult AssertionFailure() { - return AssertionResult(false); -} - -// Makes a failed assertion result with the given failure message. -// Deprecated; use AssertionFailure() << message. 
-AssertionResult AssertionFailure(const Message& message) {
-  return AssertionFailure() << message;
-}
-
namespace internal {

namespace edit_distance {
@@ -1512,8 +1551,7 @@ std::vector<std::string> SplitEscapedString(const std::string& str) {
AssertionResult EqFailure(const char* lhs_expression,
                          const char* rhs_expression,
                          const std::string& lhs_value,
-                          const std::string& rhs_value,
-                          bool ignoring_case) {
+                          const std::string& rhs_value, bool ignoring_case) {
  Message msg;
  msg << "Expected equality of these values:";
  msg << "\n  " << lhs_expression;
@@ -1530,10 +1568,8 @@ AssertionResult EqFailure(const char* lhs_expression,
  }

  if (!lhs_value.empty() && !rhs_value.empty()) {
-    const std::vector<std::string> lhs_lines =
-        SplitEscapedString(lhs_value);
-    const std::vector<std::string> rhs_lines =
-        SplitEscapedString(rhs_value);
+    const std::vector<std::string> lhs_lines = SplitEscapedString(lhs_value);
+    const std::vector<std::string> rhs_lines = SplitEscapedString(rhs_value);
    if (lhs_lines.size() > 1 || rhs_lines.size() > 1) {
      msg << "\nWith diff:\n"
          << edit_distance::CreateUnifiedDiff(lhs_lines, rhs_lines);
@@ -1545,27 +1581,21 @@ AssertionResult EqFailure(const char* lhs_expression,

// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
std::string GetBoolAssertionFailureMessage(
-    const AssertionResult& assertion_result,
-    const char* expression_text,
-    const char* actual_predicate_value,
-    const char* expected_predicate_value) {
+    const AssertionResult& assertion_result, const char* expression_text,
+    const char* actual_predicate_value, const char* expected_predicate_value) {
  const char* actual_message = assertion_result.message();
  Message msg;
  msg << "Value of: " << expression_text
      << "\n  Actual: " << actual_predicate_value;
-  if (actual_message[0] != '\0')
-    msg << " (" << actual_message << ")";
+  if (actual_message[0] != '\0') msg << " (" << actual_message << ")";
  msg << "\nExpected: " << expected_predicate_value;
  return msg.GetString();
}

// Helper function for implementing ASSERT_NEAR.
-AssertionResult DoubleNearPredFormat(const char* expr1,
-                                     const char* expr2,
-                                     const char* abs_error_expr,
-                                     double val1,
-                                     double val2,
-                                     double abs_error) {
+AssertionResult DoubleNearPredFormat(const char* expr1, const char* expr2,
+                                     const char* abs_error_expr, double val1,
+                                     double val2, double abs_error) {
  const double diff = fabs(val1 - val2);
  if (diff <= abs_error) return AssertionSuccess();
@@ -1595,20 +1625,17 @@ AssertionResult DoubleNearPredFormat(const char* expr1,
        "EXPECT_EQUAL. Consider using EXPECT_DOUBLE_EQ instead.";
  }
  return AssertionFailure()
-      << "The difference between " << expr1 << " and " << expr2
-      << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n"
-      << expr1 << " evaluates to " << val1 << ",\n"
-      << expr2 << " evaluates to " << val2 << ", and\n"
-      << abs_error_expr << " evaluates to " << abs_error << ".";
+         << "The difference between " << expr1 << " and " << expr2 << " is "
+         << diff << ", which exceeds " << abs_error_expr << ", where\n"
+         << expr1 << " evaluates to " << val1 << ",\n"
+         << expr2 << " evaluates to " << val2 << ", and\n"
+         << abs_error_expr << " evaluates to " << abs_error << ".";
}

-
// Helper template for implementing FloatLE() and DoubleLE().
template <typename RawType>
-AssertionResult FloatingPointLE(const char* expr1,
-                                const char* expr2,
-                                RawType val1,
-                                RawType val2) {
+AssertionResult FloatingPointLE(const char* expr1, const char* expr2,
+                                RawType val1, RawType val2) {
  // Returns success if val1 is less than val2,
  if (val1 < val2) {
    return AssertionSuccess();
@@ -1633,24 +1660,24 @@ AssertionResult FloatingPointLE(const char* expr1,
      << val2;

  return AssertionFailure()
-      << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
-      << "  Actual: " << StringStreamToString(&val1_ss) << " vs "
-      << StringStreamToString(&val2_ss);
+         << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
+         << "  Actual: " << StringStreamToString(&val1_ss) << " vs "
+         << StringStreamToString(&val2_ss);
}

}  // namespace internal

// Asserts that val1 is less than, or almost equal to, val2.  Fails
// otherwise.  In particular, it fails if either val1 or val2 is NaN.
-AssertionResult FloatLE(const char* expr1, const char* expr2,
-                        float val1, float val2) {
+AssertionResult FloatLE(const char* expr1, const char* expr2, float val1,
+                        float val2) {
  return internal::FloatingPointLE<float>(expr1, expr2, val1, val2);
}

// Asserts that val1 is less than, or almost equal to, val2.  Fails
// otherwise.  In particular, it fails if either val1 or val2 is NaN.
-AssertionResult DoubleLE(const char* expr1, const char* expr2,
-                         double val1, double val2) {
+AssertionResult DoubleLE(const char* expr1, const char* expr2, double val1,
+                         double val2) {
  return internal::FloatingPointLE<double>(expr1, expr2, val1, val2);
}

@@ -1658,62 +1685,51 @@ namespace internal {

// The helper function for {ASSERT|EXPECT}_STREQ.
AssertionResult CmpHelperSTREQ(const char* lhs_expression,
-                               const char* rhs_expression,
-                               const char* lhs,
+                               const char* rhs_expression, const char* lhs,
                               const char* rhs) {
  if (String::CStringEquals(lhs, rhs)) {
    return AssertionSuccess();
  }

-  return EqFailure(lhs_expression,
-                   rhs_expression,
-                   PrintToString(lhs),
-                   PrintToString(rhs),
-                   false);
+  return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+                   PrintToString(rhs), false);
}

// The helper function for {ASSERT|EXPECT}_STRCASEEQ.
AssertionResult CmpHelperSTRCASEEQ(const char* lhs_expression,
-                                   const char* rhs_expression,
-                                   const char* lhs,
+                                   const char* rhs_expression, const char* lhs,
                                   const char* rhs) {
  if (String::CaseInsensitiveCStringEquals(lhs, rhs)) {
    return AssertionSuccess();
  }

-  return EqFailure(lhs_expression,
-                   rhs_expression,
-                   PrintToString(lhs),
-                   PrintToString(rhs),
-                   true);
+  return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+                   PrintToString(rhs), true);
}

// The helper function for {ASSERT|EXPECT}_STRNE.
AssertionResult CmpHelperSTRNE(const char* s1_expression,
-                               const char* s2_expression,
-                               const char* s1,
+                               const char* s2_expression, const char* s1,
                               const char* s2) {
  if (!String::CStringEquals(s1, s2)) {
    return AssertionSuccess();
  } else {
-    return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
-                              << s2_expression << "), actual: \""
-                              << s1 << "\" vs \"" << s2 << "\"";
+    return AssertionFailure()
+           << "Expected: (" << s1_expression << ") != (" << s2_expression
+           << "), actual: \"" << s1 << "\" vs \"" << s2 << "\"";
  }
}

// The helper function for {ASSERT|EXPECT}_STRCASENE.
AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
-                                   const char* s2_expression,
-                                   const char* s1,
+                                   const char* s2_expression, const char* s1,
                                   const char* s2) {
  if (!String::CaseInsensitiveCStringEquals(s1, s2)) {
    return AssertionSuccess();
  } else {
    return AssertionFailure()
-        << "Expected: (" << s1_expression << ") != ("
-        << s2_expression << ") (ignoring case), actual: \""
-        << s1 << "\" vs \"" << s2 << "\"";
+           << "Expected: (" << s1_expression << ") != (" << s2_expression
+           << ") (ignoring case), actual: \"" << s1 << "\" vs \"" << s2 << "\"";
  }
}

@@ -1741,8 +1757,7 @@ bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) {

// StringType here can be either ::std::string or ::std::wstring.
template <typename StringType>
-bool IsSubstringPred(const StringType& needle,
-                     const StringType& haystack) {
+bool IsSubstringPred(const StringType& needle, const StringType& haystack) {
  return haystack.find(needle) != StringType::npos;
}

@@ -1751,21 +1766,22 @@ bool IsSubstringPred(const StringType& needle,
// StringType here can be const char*, const wchar_t*, ::std::string,
// or ::std::wstring.
template <typename StringType>
-AssertionResult IsSubstringImpl(
-    bool expected_to_be_substring,
-    const char* needle_expr, const char* haystack_expr,
-    const StringType& needle, const StringType& haystack) {
+AssertionResult IsSubstringImpl(bool expected_to_be_substring,
+                                const char* needle_expr,
+                                const char* haystack_expr,
+                                const StringType& needle,
+                                const StringType& haystack) {
  if (IsSubstringPred(needle, haystack) == expected_to_be_substring)
    return AssertionSuccess();

  const bool is_wide_string = sizeof(needle[0]) > 1;
  const char* const begin_string_quote = is_wide_string ? "L\"" : "\"";
  return AssertionFailure()
-      << "Value of: " << needle_expr << "\n"
-      << "  Actual: " << begin_string_quote << needle << "\"\n"
-      << "Expected: " << (expected_to_be_substring ? "" : "not ")
-      << "a substring of " << haystack_expr << "\n"
-      << "Which is: " << begin_string_quote << haystack << "\"";
+         << "Value of: " << needle_expr << "\n"
+         << "  Actual: " << begin_string_quote << needle << "\"\n"
+         << "Expected: " << (expected_to_be_substring ? "" : "not ")
+         << "a substring of " << haystack_expr << "\n"
+         << "Which is: " << begin_string_quote << haystack << "\"";
}

}  // namespace

@@ -1774,52 +1790,52 @@ AssertionResult IsSubstringImpl(
// substring of haystack (NULL is considered a substring of itself
// only), and return an appropriate error message when they fail.
-AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const char* needle, const char* haystack) { +AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr, + const char* needle, const char* haystack) { return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); } -AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const wchar_t* needle, const wchar_t* haystack) { +AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr, + const wchar_t* needle, const wchar_t* haystack) { return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); } -AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const char* needle, const char* haystack) { +AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, const char* needle, + const char* haystack) { return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); } -AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const wchar_t* needle, const wchar_t* haystack) { +AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, const wchar_t* needle, + const wchar_t* haystack) { return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); } -AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::string& needle, const ::std::string& haystack) { +AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr, + const ::std::string& needle, + const ::std::string& haystack) { return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); } -AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::string& needle, const ::std::string& haystack) { +AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::string& needle, + const ::std::string& haystack) { return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); } #if GTEST_HAS_STD_WSTRING -AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::wstring& needle, const ::std::wstring& haystack) { +AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr, + const ::std::wstring& needle, + const ::std::wstring& haystack) { return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); } -AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::wstring& needle, const ::std::wstring& haystack) { +AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::wstring& needle, + const ::std::wstring& haystack) { return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); } #endif // GTEST_HAS_STD_WSTRING @@ -1831,43 +1847,42 @@ namespace internal { namespace { // Helper function for IsHRESULT{SuccessFailure} predicates -AssertionResult HRESULTFailureHelper(const char* expr, - const char* expected, +AssertionResult HRESULTFailureHelper(const char* expr, const char* expected, long hr) { // NOLINT -# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_TV_TITLE +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_TV_TITLE // Windows CE doesn't support FormatMessage. 
const char error_text[] = "";
-# else
+#else
  // Looks up the human-readable system message for the HRESULT code
  // and since we're not passing any params to FormatMessage, we don't
  // want inserts expanded.
-  const DWORD kFlags = FORMAT_MESSAGE_FROM_SYSTEM |
-                       FORMAT_MESSAGE_IGNORE_INSERTS;
+  const DWORD kFlags =
+      FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS;
  const DWORD kBufSize = 4096;
  // Gets the system's human readable message string for this HRESULT.
-  char error_text[kBufSize] = { '\0' };
+  char error_text[kBufSize] = {'\0'};
  DWORD message_length = ::FormatMessageA(kFlags,
-                                          0,  // no source, we're asking system
+                                          0,  // no source, we're asking system
                                          static_cast<DWORD>(hr),  // the error
-                                          0,  // no line width restrictions
+                                          0,  // no line width restrictions
                                          error_text,  // output buffer
                                          kBufSize,    // buf size
                                          nullptr);  // no arguments for inserts
  // Trims tailing white space (FormatMessage leaves a trailing CR-LF)
  for (; message_length && IsSpace(error_text[message_length - 1]);
-          --message_length) {
+       --message_length) {
    error_text[message_length - 1] = '\0';
  }
-# endif  // GTEST_OS_WINDOWS_MOBILE
+#endif  // GTEST_OS_WINDOWS_MOBILE

  const std::string error_hex("0x" + String::FormatHexInt(hr));
  return ::testing::AssertionFailure()
-      << "Expected: " << expr << " " << expected << ".\n"
-      << "  Actual: " << error_hex << " " << error_text << "\n";
+         << "Expected: " << expr << " " << expected << ".\n"
+         << "  Actual: " << error_hex << " " << error_text << "\n";
}

}  // namespace
@@ -1901,16 +1916,18 @@ AssertionResult IsHRESULTFailure(const char* expr, long hr) {  // NOLINT
// 17 - 21 bits       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

// The maximum code-point a one-byte UTF-8 sequence can represent.
-constexpr uint32_t kMaxCodePoint1 = (static_cast<uint32_t>(1) << 7) - 1;
+constexpr uint32_t kMaxCodePoint1 = (static_cast<uint32_t>(1) << 7) - 1;

// The maximum code-point a two-byte UTF-8 sequence can represent.
constexpr uint32_t kMaxCodePoint2 = (static_cast<uint32_t>(1) << (5 + 6)) - 1;

// The maximum code-point a three-byte UTF-8 sequence can represent.
-constexpr uint32_t kMaxCodePoint3 = (static_cast<uint32_t>(1) << (4 + 2*6)) - 1;
+constexpr uint32_t kMaxCodePoint3 =
+    (static_cast<uint32_t>(1) << (4 + 2 * 6)) - 1;

// The maximum code-point a four-byte UTF-8 sequence can represent.
-constexpr uint32_t kMaxCodePoint4 = (static_cast<uint32_t>(1) << (3 + 3*6)) - 1;
+constexpr uint32_t kMaxCodePoint4 =
+    (static_cast<uint32_t>(1) << (3 + 3 * 6)) - 1;

// Chops off the n lowest bits from a bit pattern.  Returns the n
// lowest bits.  As a side effect, the original bit pattern will be
@@ -1935,7 +1952,7 @@ std::string CodePointToUtf8(uint32_t code_point) {
  char str[5];  // Big enough for the largest valid code point.
  if (code_point <= kMaxCodePoint1) {
    str[1] = '\0';
-    str[0] = static_cast<char>(code_point);                          // 0xxxxxxx
+    str[0] = static_cast<char>(code_point);  // 0xxxxxxx
  } else if (code_point <= kMaxCodePoint2) {
    str[2] = '\0';
    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
@@ -1963,8 +1980,8 @@ std::string CodePointToUtf8(uint32_t code_point) {
// and thus should be combined into a single Unicode code point
// using CreateCodePointFromUtf16SurrogatePair.
inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) {
-  return sizeof(wchar_t) == 2 &&
-      (first & 0xFC00) == 0xD800 && (second & 0xFC00) == 0xDC00;
+  return sizeof(wchar_t) == 2 && (first & 0xFC00) == 0xD800 &&
+         (second & 0xFC00) == 0xDC00;
}

// Creates a Unicode code point from UTF16 surrogate pair.
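// A worked example of the mapping implemented by CodePointToUtf8() above.
// This is an illustrative sketch added alongside the patch, not part of the
// upstream file: the Euro sign U+20AC is <= kMaxCodePoint3, so it is encoded
// as three bytes (1110xxxx 10xxxxxx 10xxxxxx):
//
//   code_point = 0x20AC;                          // 0010 0000 1010 1100
//   str[2] = 0x80 | ChopLowBits(&code_point, 6);  // 0x80 | 0x2C == 0xAC
//   str[1] = 0x80 | ChopLowBits(&code_point, 6);  // 0x80 | 0x02 == 0x82
//   str[0] = 0xE0 | code_point;                   // 0xE0 | 0x02 == 0xE2
//
// which yields the expected UTF-8 sequence for the Euro sign:
//
//   EXPECT_EQ("\xE2\x82\xAC", CodePointToUtf8(0x20AC));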
@@ -1995,8 +2012,7 @@ inline uint32_t CreateCodePointFromUtf16SurrogatePair(wchar_t first,
// and contains invalid UTF-16 surrogate pairs, values in those pairs
// will be encoded as individual Unicode characters from Basic Normal Plane.
std::string WideStringToUtf8(const wchar_t* str, int num_chars) {
-  if (num_chars == -1)
-    num_chars = static_cast<int>(wcslen(str));
+  if (num_chars == -1) num_chars = static_cast<int>(wcslen(str));

  ::std::stringstream stream;
  for (int i = 0; i < num_chars; ++i) {
@@ -2005,8 +2021,8 @@ std::string WideStringToUtf8(const wchar_t* str, int num_chars) {
    if (str[i] == L'\0') {
      break;
    } else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) {
-      unicode_code_point = CreateCodePointFromUtf16SurrogatePair(str[i],
-                                                                 str[i + 1]);
+      unicode_code_point =
+          CreateCodePointFromUtf16SurrogatePair(str[i], str[i + 1]);
      i++;
    } else {
      unicode_code_point = static_cast<uint32_t>(str[i]);
@@ -2019,7 +2035,7 @@ std::string WideStringToUtf8(const wchar_t* str, int num_chars) {

// Converts a wide C string to an std::string using the UTF-8 encoding.
// NULL will be converted to "(null)".
-std::string String::ShowWideCString(const wchar_t * wide_c_str) {
+std::string String::ShowWideCString(const wchar_t* wide_c_str) {
  if (wide_c_str == nullptr) return "(null)";

  return internal::WideStringToUtf8(wide_c_str, -1);
@@ -2031,7 +2047,7 @@ std::string String::ShowWideCString(const wchar_t* wide_c_str) {
// Unlike wcscmp(), this function can handle NULL argument(s). A NULL
// C string is considered different to any non-NULL C string,
// including the empty string.
-bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) {
+bool String::WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs) {
  if (lhs == nullptr) return rhs == nullptr;

  if (rhs == nullptr) return false;
@@ -2041,33 +2057,27 @@ bool String::WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs) {

// Helper function for *_STREQ on wide strings.
AssertionResult CmpHelperSTREQ(const char* lhs_expression,
-                               const char* rhs_expression,
-                               const wchar_t* lhs,
+                               const char* rhs_expression, const wchar_t* lhs,
                               const wchar_t* rhs) {
  if (String::WideCStringEquals(lhs, rhs)) {
    return AssertionSuccess();
  }

-  return EqFailure(lhs_expression,
-                   rhs_expression,
-                   PrintToString(lhs),
-                   PrintToString(rhs),
-                   false);
+  return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+                   PrintToString(rhs), false);
}

// Helper function for *_STRNE on wide strings.
AssertionResult CmpHelperSTRNE(const char* s1_expression,
-                               const char* s2_expression,
-                               const wchar_t* s1,
+                               const char* s2_expression, const wchar_t* s1,
                               const wchar_t* s2) {
  if (!String::WideCStringEquals(s1, s2)) {
    return AssertionSuccess();
  }

-  return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
-                            << s2_expression << "), actual: "
-                            << PrintToString(s1)
-                            << " vs " << PrintToString(s2);
+  return AssertionFailure()
+         << "Expected: (" << s1_expression << ") != (" << s2_expression
+         << "), actual: " << PrintToString(s1) << " vs " << PrintToString(s2);
}

// Compares two C strings, ignoring case.  Returns true if and only if they have
@@ -2076,7 +2086,7 @@ AssertionResult CmpHelperSTRNE(const char* s1_expression,
// Unlike strcasecmp(), this function can handle NULL argument(s). A
// NULL C string is considered different to any non-NULL C string,
// including the empty string.
-bool String::CaseInsensitiveCStringEquals(const char * lhs, const char * rhs) {
+bool String::CaseInsensitiveCStringEquals(const char* lhs, const char* rhs) {
  if (lhs == nullptr) return rhs == nullptr;
  if (rhs == nullptr) return false;
  return posix::StrCaseCmp(lhs, rhs) == 0;
@@ -2118,8 +2128,8 @@ bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs,

// Returns true if and only if str ends with the given suffix, ignoring case.
// Any string is considered to end with an empty suffix.
-bool String::EndsWithCaseInsensitive(
-    const std::string& str, const std::string& suffix) {
+bool String::EndsWithCaseInsensitive(const std::string& str,
+                                     const std::string& suffix) {
  const size_t str_len = str.length();
  const size_t suffix_len = suffix.length();
  return (str_len >= suffix_len) &&
@@ -2202,15 +2212,13 @@ TestResult::TestResult()
    : death_test_count_(0), start_timestamp_(0), elapsed_time_(0) {}

// D'tor.
-TestResult::~TestResult() {
-}
+TestResult::~TestResult() {}

// Returns the i-th test part result among all the results. i can
// range from 0 to total_part_count() - 1. If i is not in that range,
// aborts the program.
const TestPartResult& TestResult::GetTestPartResult(int i) const {
-  if (i < 0 || i >= total_part_count())
-    internal::posix::Abort();
+  if (i < 0 || i >= total_part_count()) internal::posix::Abort();
  return test_part_results_.at(static_cast<size_t>(i));
}

@@ -2218,15 +2226,12 @@ const TestPartResult& TestResult::GetTestPartResult(int i) const {
// test_property_count() - 1. If i is not in that range, aborts the
// program.
const TestProperty& TestResult::GetTestProperty(int i) const {
-  if (i < 0 || i >= test_property_count())
-    internal::posix::Abort();
+  if (i < 0 || i >= test_property_count()) internal::posix::Abort();
  return test_properties_.at(static_cast<size_t>(i));
}

// Clears the test part results.
-void TestResult::ClearTestPartResults() {
-  test_part_results_.clear();
-}
+void TestResult::ClearTestPartResults() { test_part_results_.clear(); }

// Adds a test part result to the list.
void TestResult::AddTestPartResult(const TestPartResult& test_part_result) {
@@ -2255,15 +2260,8 @@ void TestResult::RecordProperty(const std::string& xml_element,

// The list of reserved attributes used in the <testsuites> element of XML
// output.
static const char* const kReservedTestSuitesAttributes[] = {
-    "disabled",
-    "errors",
-    "failures",
-    "name",
-    "random_seed",
-    "tests",
-    "time",
-    "timestamp"
-};
+    "disabled",    "errors", "failures", "name",
+    "random_seed", "tests",  "time",     "timestamp"};

// The list of reserved attributes used in the <testsuite> element of XML
// output.
@@ -2273,8 +2271,8 @@ static const char* const kReservedTestSuiteAttributes[] = {

// The list of reserved attributes used in the <testcase> element of XML output.
static const char* const kReservedTestCaseAttributes[] = {
-    "classname",   "name", "status", "time",  "type_param",
-    "value_param", "file", "line"};
+    "classname",  "name",        "status", "time",
+    "type_param", "value_param", "file",   "line"};

// Use a slightly different set for allowed output to ensure existing tests can
// still RecordProperty("result") or "RecordProperty(timestamp")
@@ -2336,7 +2334,7 @@ static bool ValidateTestPropertyName(
    const std::string& property_name,
    const std::vector<std::string>& reserved_names) {
  if (std::find(reserved_names.begin(), reserved_names.end(), property_name) !=
-          reserved_names.end()) {
+      reserved_names.end()) {
    ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name
                  << " (" << FormatWordList(reserved_names)
                  << " are reserved by " << GTEST_NAME_ << ")";
@@ -2374,8 +2372,7 @@ bool TestResult::Skipped() const {
// Returns true if and only if the test failed.
bool TestResult::Failed() const {
  for (int i = 0; i < total_part_count(); ++i) {
-    if (GetTestPartResult(i).failed())
-      return true;
+    if (GetTestPartResult(i).failed()) return true;
  }
  return false;
}
@@ -2416,27 +2413,22 @@ int TestResult::test_property_count() const {

// Creates a Test object.
// The c'tor saves the states of all flags.
-Test::Test()
-    : gtest_flag_saver_(new GTEST_FLAG_SAVER_) {
-}
+Test::Test() : gtest_flag_saver_(new GTEST_FLAG_SAVER_) {}

// The d'tor restores the states of all flags.  The actual work is
// done by the d'tor of the gtest_flag_saver_ field, and thus not
// visible here.
-Test::~Test() {
-}
+Test::~Test() {}

// Sets up the test fixture.
//
// A sub-class may override this.
-void Test::SetUp() {
-}
+void Test::SetUp() {}

// Tears down the test fixture.
//
// A sub-class may override this.
-void Test::TearDown() {
-}
+void Test::TearDown() {}

// Allows user supplied key value pairs to be recorded for later output.
void Test::RecordProperty(const std::string& key, const std::string& value) {
@@ -2541,8 +2533,8 @@ bool Test::HasSameFixtureClass() {
static std::string* FormatSehExceptionMessage(DWORD exception_code,
                                              const char* location) {
  Message message;
-  message << "SEH exception with code 0x" << std::setbase(16) <<
-    exception_code << std::setbase(10) << " thrown in " << location << ".";
+  message << "SEH exception with code 0x" << std::setbase(16) << exception_code
+          << std::setbase(10) << " thrown in " << location << ".";

  return new std::string(message.GetString());
}

@@ -2585,8 +2577,8 @@ GoogleTestFailureException::GoogleTestFailureException(
// exceptions in the same function.  Therefore, we provide a separate
// wrapper function for handling SEH exceptions.)
template <class T, typename Result>
-Result HandleSehExceptionsInMethodIfSupported(
-    T* object, Result (T::*method)(), const char* location) {
+Result HandleSehExceptionsInMethodIfSupported(T* object, Result (T::*method)(),
+                                              const char* location) {
#if GTEST_HAS_SEH
  __try {
    return (object->*method)();
@@ -2595,8 +2587,8 @@ Result HandleSehExceptionsInMethodIfSupported(
    // We create the exception message on the heap because VC++ prohibits
    // creation of objects with destructors on stack in functions using __try
    // (see error C2712).
-    std::string* exception_message = FormatSehExceptionMessage(
-        GetExceptionCode(), location);
+    std::string* exception_message =
+        FormatSehExceptionMessage(GetExceptionCode(), location);
    internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure,
                                             *exception_message);
    delete exception_message;
@@ -2612,8 +2604,8 @@ Result HandleSehExceptionsInMethodIfSupported(
// exceptions, if they are supported; returns the 0-value for type
// Result in case of an SEH exception.
template <class T, typename Result>
-Result HandleExceptionsInMethodIfSupported(
-    T* object, Result (T::*method)(), const char* location) {
+Result HandleExceptionsInMethodIfSupported(T* object, Result (T::*method)(),
+                                           const char* location) {
  // NOTE: The user code can affect the way in which Google Test handles
  // exceptions by setting GTEST_FLAG(catch_exceptions), but only before
  // RUN_ALL_TESTS() starts. It is technically possible to check the flag
@@ -2623,7 +2615,7 @@ Result HandleExceptionsInMethodIfSupported(
  //   try {
  //     // Perform the test method.
  //   } catch (...) {
-  //     if (GTEST_FLAG(catch_exceptions))
+  //     if (GTEST_FLAG_GET(catch_exceptions))
  //       // Report the exception as failure.
  //     else
  //       throw;  // Re-throws the original exception.
@@ -2679,16 +2671,16 @@ void Test::Run() {
  // GTEST_SKIP().
  if (!HasFatalFailure() && !IsSkipped()) {
    impl->os_stack_trace_getter()->UponLeavingGTest();
-    internal::HandleExceptionsInMethodIfSupported(
-        this, &Test::TestBody, "the test body");
+    internal::HandleExceptionsInMethodIfSupported(this, &Test::TestBody,
+                                                  "the test body");
  }

  // However, we want to clean up as much as possible.  Hence we will
  // always call TearDown(), even if SetUp() or the test body has
  // failed.
  impl->os_stack_trace_getter()->UponLeavingGTest();
-  internal::HandleExceptionsInMethodIfSupported(
-      this, &Test::TearDown, "TearDown()");
+  internal::HandleExceptionsInMethodIfSupported(this, &Test::TearDown,
+                                                "TearDown()");
}

// Returns true if and only if the current test has a fatal failure.
@@ -2698,8 +2690,9 @@ bool Test::HasFatalFailure() {

// Returns true if and only if the current test has a non-fatal failure.
bool Test::HasNonfatalFailure() {
-  return internal::GetUnitTestImpl()->current_test_result()->
-      HasNonfatalFailure();
+  return internal::GetUnitTestImpl()
      ->current_test_result()
      ->HasNonfatalFailure();
}

// Returns true if and only if the current test was skipped.
@@ -2799,11 +2792,10 @@ class TestNameIs {
  // Constructor.
  //
  // TestNameIs has NO default constructor.
-  explicit TestNameIs(const char* name)
-      : name_(name) {}
+  explicit TestNameIs(const char* name) : name_(name) {}

  // Returns true if and only if the test name of test_info matches name_.
-  bool operator()(const TestInfo * test_info) const {
+  bool operator()(const TestInfo* test_info) const {
    return test_info && test_info->name() == name_;
  }

@@ -2831,20 +2823,20 @@ void UnitTestImpl::RegisterParameterizedTests() {
// Creates the test object, runs it, records its result, and then
// deletes it.
void TestInfo::Run() {
-  if (!should_run_) return;
+  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+  if (!should_run_) {
+    if (is_disabled_ && matches_filter_) repeater->OnTestDisabled(*this);
+    return;
+  }

  // Tells UnitTest where to store test result.
  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
  impl->set_current_test_info(this);

-  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
-
  // Notifies the unit test event listeners that a test is about to start.
repeater->OnTestStart(*this); - result_.set_start_timestamp(internal::GetTimeInMillis()); internal::Timer timer; - impl->os_stack_trace_getter()->UponLeavingGTest(); // Creates the test object. @@ -3009,11 +3001,18 @@ void TestSuite::Run() { internal::HandleExceptionsInMethodIfSupported( this, &TestSuite::RunSetUpTestSuite, "SetUpTestSuite()"); + const bool skip_all = ad_hoc_test_result().Failed(); + start_timestamp_ = internal::GetTimeInMillis(); internal::Timer timer; for (int i = 0; i < total_test_count(); i++) { - GetMutableTestInfo(i)->Run(); - if (GTEST_FLAG(fail_fast) && GetMutableTestInfo(i)->result()->Failed()) { + if (skip_all) { + GetMutableTestInfo(i)->Skip(); + } else { + GetMutableTestInfo(i)->Run(); + } + if (GTEST_FLAG_GET(fail_fast) && + GetMutableTestInfo(i)->result()->Failed()) { for (int j = i + 1; j < total_test_count(); j++) { GetMutableTestInfo(j)->Skip(); } @@ -3089,11 +3088,10 @@ void TestSuite::UnshuffleTests() { // // FormatCountableNoun(1, "formula", "formuli") returns "1 formula". // FormatCountableNoun(5, "book", "books") returns "5 books". -static std::string FormatCountableNoun(int count, - const char * singular_form, - const char * plural_form) { +static std::string FormatCountableNoun(int count, const char* singular_form, + const char* plural_form) { return internal::StreamableToString(count) + " " + - (count == 1 ? singular_form : plural_form); + (count == 1 ? singular_form : plural_form); } // Formats the count of tests. @@ -3110,7 +3108,7 @@ static std::string FormatTestSuiteCount(int test_suite_count) { // representation. Both kNonFatalFailure and kFatalFailure are translated // to "Failure", as the user usually doesn't care about the difference // between the two when viewing the test result. -static const char * TestPartResultTypeToString(TestPartResult::Type type) { +static const char* TestPartResultTypeToString(TestPartResult::Type type) { switch (type) { case TestPartResult::kSkip: return "Skipped\n"; @@ -3137,17 +3135,18 @@ enum class GTestColor { kDefault, kRed, kGreen, kYellow }; // Prints a TestPartResult to an std::string. static std::string PrintTestPartResultToString( const TestPartResult& test_part_result) { - return (Message() - << internal::FormatFileLocation(test_part_result.file_name(), - test_part_result.line_number()) - << " " << TestPartResultTypeToString(test_part_result.type()) - << test_part_result.message()).GetString(); + return (Message() << internal::FormatFileLocation( + test_part_result.file_name(), + test_part_result.line_number()) + << " " + << TestPartResultTypeToString(test_part_result.type()) + << test_part_result.message()) + .GetString(); } // Prints a TestPartResult. static void PrintTestPartResult(const TestPartResult& test_part_result) { - const std::string& result = - PrintTestPartResultToString(test_part_result); + const std::string& result = PrintTestPartResultToString(test_part_result); printf("%s\n", result.c_str()); fflush(stdout); // If the test program runs in Visual Studio or a debugger, the @@ -3164,8 +3163,8 @@ static void PrintTestPartResult(const TestPartResult& test_part_result) { } // class PrettyUnitTestResultPrinter -#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \ - !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \ + !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW // Returns the character attribute for the given color. 
static WORD GetColorAttribute(GTestColor color) { @@ -3176,7 +3175,8 @@ static WORD GetColorAttribute(GTestColor color) { return FOREGROUND_GREEN; case GTestColor::kYellow: return FOREGROUND_RED | FOREGROUND_GREEN; - default: return 0; + default: + return 0; } } @@ -3232,7 +3232,8 @@ static const char* GetAnsiColorCode(GTestColor color) { // Returns true if and only if Google Test should use colors in the output. bool ShouldUseColor(bool stdout_is_tty) { - const char* const gtest_color = GTEST_FLAG(color).c_str(); + std::string c = GTEST_FLAG_GET(color); + const char* const gtest_color = c.c_str(); if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) { #if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW @@ -3259,9 +3260,9 @@ bool ShouldUseColor(bool stdout_is_tty) { } return String::CaseInsensitiveCStringEquals(gtest_color, "yes") || - String::CaseInsensitiveCStringEquals(gtest_color, "true") || - String::CaseInsensitiveCStringEquals(gtest_color, "t") || - String::CStringEquals(gtest_color, "1"); + String::CaseInsensitiveCStringEquals(gtest_color, "true") || + String::CaseInsensitiveCStringEquals(gtest_color, "t") || + String::CStringEquals(gtest_color, "1"); // We take "yes", "true", "t", and "1" as meaning "yes". If the // value is neither one of these nor "auto", we treat it as "no" to // be conservative. @@ -3273,18 +3274,13 @@ bool ShouldUseColor(bool stdout_is_tty) { // that would be colored when printed, as can be done on Linux. GTEST_ATTRIBUTE_PRINTF_(2, 3) -static void ColoredPrintf(GTestColor color, const char *fmt, ...) { +static void ColoredPrintf(GTestColor color, const char* fmt, ...) { va_list args; va_start(args, fmt); -#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS || GTEST_OS_IOS || \ - GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT || defined(ESP_PLATFORM) - const bool use_color = AlwaysFalse(); -#else static const bool in_color_mode = ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0); const bool use_color = in_color_mode && (color != GTestColor::kDefault); -#endif // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS if (!use_color) { vprintf(fmt, args); @@ -3292,8 +3288,8 @@ static void ColoredPrintf(GTestColor color, const char *fmt, ...) { return; } -#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \ - !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \ + !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE); // Gets the current text color. @@ -3364,6 +3360,7 @@ class PrettyUnitTestResultPrinter : public TestEventListener { #endif // OnTestCaseStart void OnTestStart(const TestInfo& test_info) override; + void OnTestDisabled(const TestInfo& test_info) override; void OnTestPartResult(const TestPartResult& result) override; void OnTestEnd(const TestInfo& test_info) override; @@ -3384,13 +3381,14 @@ class PrettyUnitTestResultPrinter : public TestEventListener { static void PrintSkippedTests(const UnitTest& unit_test); }; - // Fired before each iteration of tests starts. +// Fired before each iteration of tests starts. void PrettyUnitTestResultPrinter::OnTestIterationStart( const UnitTest& unit_test, int iteration) { - if (GTEST_FLAG(repeat) != 1) + if (GTEST_FLAG_GET(repeat) != 1) printf("\nRepeating all tests (iteration %d) . . 
.\n\n", iteration + 1); - const char* const filter = GTEST_FLAG(filter).c_str(); + std::string f = GTEST_FLAG_GET(filter); + const char* const filter = f.c_str(); // Prints the filter if it's not *. This reminds the user that some // tests may be skipped. @@ -3406,7 +3404,7 @@ void PrettyUnitTestResultPrinter::OnTestIterationStart( internal::posix::GetEnv(kTestTotalShards)); } - if (GTEST_FLAG(shuffle)) { + if (GTEST_FLAG_GET(shuffle)) { ColoredPrintf(GTestColor::kYellow, "Note: Randomizing tests' orders with a seed of %d .\n", unit_test.random_seed()); @@ -3462,6 +3460,13 @@ void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) { fflush(stdout); } +void PrettyUnitTestResultPrinter::OnTestDisabled(const TestInfo& test_info) { + ColoredPrintf(GTestColor::kYellow, "[ DISABLED ] "); + PrintTestName(test_info.test_suite_name(), test_info.name()); + printf("\n"); + fflush(stdout); +} + // Called after an assertion failure. void PrettyUnitTestResultPrinter::OnTestPartResult( const TestPartResult& result) { @@ -3486,12 +3491,12 @@ void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) { ColoredPrintf(GTestColor::kRed, "[ FAILED ] "); } PrintTestName(test_info.test_suite_name(), test_info.name()); - if (test_info.result()->Failed()) - PrintFullTestCommentIfPresent(test_info); + if (test_info.result()->Failed()) PrintFullTestCommentIfPresent(test_info); - if (GTEST_FLAG(print_time)) { - printf(" (%s ms)\n", internal::StreamableToString( - test_info.result()->elapsed_time()).c_str()); + if (GTEST_FLAG_GET(print_time)) { + printf(" (%s ms)\n", + internal::StreamableToString(test_info.result()->elapsed_time()) + .c_str()); } else { printf("\n"); } @@ -3500,7 +3505,7 @@ void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) { #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) { - if (!GTEST_FLAG(print_time)) return; + if (!GTEST_FLAG_GET(print_time)) return; const std::string counts = FormatCountableNoun(test_case.test_to_run_count(), "test", "tests"); @@ -3511,7 +3516,7 @@ void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) { } #else void PrettyUnitTestResultPrinter::OnTestSuiteEnd(const TestSuite& test_suite) { - if (!GTEST_FLAG(print_time)) return; + if (!GTEST_FLAG_GET(print_time)) return; const std::string counts = FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests"); @@ -3607,7 +3612,7 @@ void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, printf("%s from %s ran.", FormatTestCount(unit_test.test_to_run_count()).c_str(), FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str()); - if (GTEST_FLAG(print_time)) { + if (GTEST_FLAG_GET(print_time)) { printf(" (%s ms total)", internal::StreamableToString(unit_test.elapsed_time()).c_str()); } @@ -3628,7 +3633,7 @@ void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, } int num_disabled = unit_test.reportable_disabled_test_count(); - if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) { + if (num_disabled && !GTEST_FLAG_GET(also_run_disabled_tests)) { if (unit_test.Passed()) { printf("\n"); // Add a spacer if no FAILURE banner is displayed. 
}

@@ -3664,6 +3669,7 @@ class BriefUnitTestResultPrinter : public TestEventListener {
#endif  // OnTestCaseStart

  void OnTestStart(const TestInfo& /*test_info*/) override {}
+  void OnTestDisabled(const TestInfo& /*test_info*/) override {}

  void OnTestPartResult(const TestPartResult& result) override;
  void OnTestEnd(const TestInfo& test_info) override;
@@ -3700,7 +3706,7 @@ void BriefUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
    PrintTestName(test_info.test_suite_name(), test_info.name());
    PrintFullTestCommentIfPresent(test_info);

-  if (GTEST_FLAG(print_time)) {
+  if (GTEST_FLAG_GET(print_time)) {
    printf(" (%s ms)\n",
           internal::StreamableToString(test_info.result()->elapsed_time())
               .c_str());
@@ -3717,7 +3723,7 @@ void BriefUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
  printf("%s from %s ran.",
         FormatTestCount(unit_test.test_to_run_count()).c_str(),
         FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str());
-  if (GTEST_FLAG(print_time)) {
+  if (GTEST_FLAG_GET(print_time)) {
    printf(" (%s ms total)",
           internal::StreamableToString(unit_test.elapsed_time()).c_str());
  }
@@ -3732,7 +3738,7 @@ void BriefUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
  }

  int num_disabled = unit_test.reportable_disabled_test_count();
-  if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) {
+  if (num_disabled && !GTEST_FLAG_GET(also_run_disabled_tests)) {
    if (unit_test.Passed()) {
      printf("\n");  // Add a spacer if no FAILURE banner is displayed.
    }
@@ -3752,7 +3758,7 @@ class TestEventRepeater : public TestEventListener {
 public:
  TestEventRepeater() : forwarding_enabled_(true) {}
  ~TestEventRepeater() override;
-  void Append(TestEventListener *listener);
+  void Append(TestEventListener* listener);
  TestEventListener* Release(TestEventListener* listener);

  // Controls whether events will be forwarded to listeners_. Set to false
@@ -3770,6 +3776,7 @@ class TestEventRepeater : public TestEventListener {
#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
  void OnTestSuiteStart(const TestSuite& parameter) override;
  void OnTestStart(const TestInfo& test_info) override;
+  void OnTestDisabled(const TestInfo& test_info) override;
  void OnTestPartResult(const TestPartResult& result) override;
  void OnTestEnd(const TestInfo& test_info) override;
// Legacy API is deprecated but still available
@@ -3789,18 +3796,19 @@ class TestEventRepeater : public TestEventListener {
  // The list of listeners that receive events.
  std::vector<TestEventListener*> listeners_;

-  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventRepeater);
+  TestEventRepeater(const TestEventRepeater&) = delete;
+  TestEventRepeater& operator=(const TestEventRepeater&) = delete;
};

TestEventRepeater::~TestEventRepeater() {
  ForEach(listeners_, Delete<TestEventListener>);
}

-void TestEventRepeater::Append(TestEventListener *listener) {
+void TestEventRepeater::Append(TestEventListener* listener) {
  listeners_.push_back(listener);
}

-TestEventListener* TestEventRepeater::Release(TestEventListener *listener) {
+TestEventListener* TestEventRepeater::Release(TestEventListener* listener) {
  for (size_t i = 0; i < listeners_.size(); ++i) {
    if (listeners_[i] == listener) {
      listeners_.erase(listeners_.begin() + static_cast<int>(i));
@@ -3813,14 +3821,14 @@ TestEventListener* TestEventRepeater::Release(TestEventListener* listener) {

// Since most methods are very similar, use macros to reduce boilerplate.
// This defines a member that forwards the call to all listeners.
-#define GTEST_REPEATER_METHOD_(Name, Type) \
-void TestEventRepeater::Name(const Type& parameter) { \
-  if (forwarding_enabled_) { \
-    for (size_t i = 0; i < listeners_.size(); i++) { \
-      listeners_[i]->Name(parameter); \
-    } \
-  } \
-}
+#define GTEST_REPEATER_METHOD_(Name, Type)              \
+  void TestEventRepeater::Name(const Type& parameter) { \
+    if (forwarding_enabled_) {                          \
+      for (size_t i = 0; i < listeners_.size(); i++) {  \
+        listeners_[i]->Name(parameter);                 \
+      }                                                 \
+    }                                                   \
+  }
// This defines a member that forwards the call to all listeners in reverse
// order.
#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type) \
@@ -3840,6 +3848,7 @@ GTEST_REPEATER_METHOD_(OnTestCaseStart, TestSuite)
#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
GTEST_REPEATER_METHOD_(OnTestSuiteStart, TestSuite)
GTEST_REPEATER_METHOD_(OnTestStart, TestInfo)
+GTEST_REPEATER_METHOD_(OnTestDisabled, TestInfo)
GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult)
GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest)
GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest)
@@ -3890,12 +3899,13 @@ class XmlUnitTestResultPrinter : public EmptyTestEventListener {
 private:
  // Is c a whitespace character that is normalized to a space character
  // when it appears in an XML attribute value?
-  static bool IsNormalizableWhitespace(char c) {
-    return c == 0x9 || c == 0xA || c == 0xD;
+  static bool IsNormalizableWhitespace(unsigned char c) {
+    return c == '\t' || c == '\n' || c == '\r';
  }

  // May c appear in a well-formed XML document?
-  static bool IsValidXmlCharacter(char c) {
+  // https://www.w3.org/TR/REC-xml/#charsets
+  static bool IsValidXmlCharacter(unsigned char c) {
    return IsNormalizableWhitespace(c) || c >= 0x20;
  }

@@ -3965,7 +3975,8 @@ class XmlUnitTestResultPrinter : public EmptyTestEventListener {
  // The output file.
  const std::string output_file_;

-  GTEST_DISALLOW_COPY_AND_ASSIGN_(XmlUnitTestResultPrinter);
+  XmlUnitTestResultPrinter(const XmlUnitTestResultPrinter&) = delete;
+  XmlUnitTestResultPrinter& operator=(const XmlUnitTestResultPrinter&) = delete;
};

// Creates a new XmlUnitTestResultPrinter.
@@ -4005,8 +4016,8 @@ void XmlUnitTestResultPrinter::ListTestsMatchingFilter(
// module will consist of ordinary English text.
// If this module is ever modified to produce version 1.1 XML output,
// most invalid characters can be retained using character references.
-std::string XmlUnitTestResultPrinter::EscapeXml(
-    const std::string& str, bool is_attribute) {
+std::string XmlUnitTestResultPrinter::EscapeXml(const std::string& str,
+                                                bool is_attribute) {
  Message m;

  for (size_t i = 0; i < str.size(); ++i) {
@@ -4034,8 +4045,9 @@ std::string XmlUnitTestResultPrinter::EscapeXml(
          m << '"';
        break;
      default:
-        if (IsValidXmlCharacter(ch)) {
-          if (is_attribute && IsNormalizableWhitespace(ch))
+        if (IsValidXmlCharacter(static_cast<unsigned char>(ch))) {
+          if (is_attribute &&
+              IsNormalizableWhitespace(static_cast<unsigned char>(ch)))
            m << "&#x" << String::FormatByte(static_cast<unsigned char>(ch))
              << ";";
          else
@@ -4056,7 +4068,7 @@ std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(
  std::string output;
  output.reserve(str.size());
  for (std::string::const_iterator it = str.begin(); it != str.end(); ++it)
-    if (IsValidXmlCharacter(*it))
+    if (IsValidXmlCharacter(static_cast<unsigned char>(*it)))
      output.push_back(*it);

  return output;
@@ -4064,7 +4076,6 @@ std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(

// The following routines generate an XML representation of a UnitTest
// object.
-// GOOGLETEST_CM0009 DO NOT DELETE
//
// This is how Google Test concepts map to the DTD:
//
// <testsuites name="AllTests">        <-- corresponds to a UnitTest object
//   <testsuite name="testcase-name">  <-- corresponds to a TestSuite object
//     <testcase name="test-name">     <-- corresponds to a TestInfo object
//       <failure message="...">...</failure>
//       <failure message="...">...</failure>
//       <failure message="...">...</failure>
//                                     <-- individual assertion failures
//     </testcase>
//   </testsuite>
// </testsuites>
@@ -4113,12 +4124,12 @@ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) {
    return "";
  // YYYY-MM-DDThh:mm:ss.sss
  return StreamableToString(time_struct.tm_year + 1900) + "-" +
-      String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
-      String::FormatIntWidth2(time_struct.tm_mday) + "T" +
-      String::FormatIntWidth2(time_struct.tm_hour) + ":" +
-      String::FormatIntWidth2(time_struct.tm_min) + ":" +
-      String::FormatIntWidth2(time_struct.tm_sec) + "." +
-      String::FormatIntWidthN(static_cast<int>(ms % 1000), 3);
+         String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
+         String::FormatIntWidth2(time_struct.tm_mday) + "T" +
+         String::FormatIntWidth2(time_struct.tm_hour) + ":" +
+         String::FormatIntWidth2(time_struct.tm_min) + ":" +
+         String::FormatIntWidth2(time_struct.tm_sec) + "." +
+         String::FormatIntWidthN(static_cast<int>(ms % 1000), 3);
}

// Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
@@ -4129,8 +4140,8 @@ void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream,
  for (;;) {
    const char* const next_segment = strstr(segment, "]]>");
    if (next_segment != nullptr) {
-      stream->write(
-          segment, static_cast<std::streamsize>(next_segment - segment));
+      stream->write(segment,
+                    static_cast<std::streamsize>(next_segment - segment));
      *stream << "]]>]]&gt;<![CDATA[";
      segment = next_segment + strlen("]]>");
    } else {
@@ -4142,15 +4153,13 @@ }

void XmlUnitTestResultPrinter::OutputXmlAttribute(
-    std::ostream* stream,
-    const std::string& element_name,
-    const std::string& name,
-    const std::string& value) {
+    std::ostream* stream, const std::string& element_name,
+    const std::string& name, const std::string& value) {
  const std::vector<std::string>& allowed_names =
      GetReservedOutputAttributesForElement(element_name);

  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
-                   allowed_names.end())
+               allowed_names.end())
      << "Attribute " << name << " is not allowed for element <" << element_name
      << ">.";
@@ -4216,10 +4225,11 @@ void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream,
    OutputXmlAttribute(stream, kTestsuite, "type_param",
                       test_info.type_param());
  }
-  if (GTEST_FLAG(list_tests)) {
-    OutputXmlAttribute(stream, kTestsuite, "file", test_info.file());
-    OutputXmlAttribute(stream, kTestsuite, "line",
-                       StreamableToString(test_info.line()));
+
+  OutputXmlAttribute(stream, kTestsuite, "file", test_info.file());
+  OutputXmlAttribute(stream, kTestsuite, "line",
+                     StreamableToString(test_info.line()));
+  if (GTEST_FLAG_GET(list_tests)) {
    *stream << " />\n";
    return;
  }
@@ -4254,8 +4264,7 @@ void XmlUnitTestResultPrinter::OutputXmlTestResult(::std::ostream* stream,
        internal::FormatCompilerIndependentFileLocation(part.file_name(),
                                                        part.line_number());
    const std::string summary = location + "\n" + part.summary();
-    *stream << "      <failure message=\""
-            << EscapeXmlAttribute(summary)
-            << "\" type=\"\">";
+    *stream << "      <failure message=\"" << EscapeXmlAttribute(summary)
+            << "\" type=\"\">";
    const std::string detail = location + "\n" + part.message();
    OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str());
@@ -4295,7 +4304,7 @@ void XmlUnitTestResultPrinter::PrintXmlTestSuite(std::ostream* stream,
  OutputXmlAttribute(stream, kTestsuite, "name", test_suite.name());
  OutputXmlAttribute(stream, kTestsuite, "tests",
                     StreamableToString(test_suite.reportable_test_count()));
-  if (!GTEST_FLAG(list_tests)) {
+  if (!GTEST_FLAG_GET(list_tests)) {
    OutputXmlAttribute(stream, kTestsuite, "failures",
                       StreamableToString(test_suite.failed_test_count()));
    OutputXmlAttribute(
@@ -4343,7 +4352,7 @@ void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream,
      stream, kTestsuites, "timestamp",
      FormatEpochTimeInMillisAsIso8601(unit_test.start_timestamp()));

-  if (GTEST_FLAG(shuffle)) {
+  if (GTEST_FLAG_GET(shuffle)) {
    OutputXmlAttribute(stream, kTestsuites, "random_seed",
                       StreamableToString(unit_test.random_seed()));
  }
@@ -4396,7 +4405,7 @@ std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes(
  for (int i = 0; i < result.test_property_count(); ++i) {
    const TestProperty& property = result.GetTestProperty(i);
    attributes << " " << property.key() << "="
-        << "\"" << EscapeXmlAttribute(property.value()) << "\"";
+               << "\"" << EscapeXmlAttribute(property.value()) << "\"";
  }
  return attributes.GetString();
}
@@ -4410,15 +4419,15 @@ void XmlUnitTestResultPrinter::OutputXmlTestProperties(
    return;
  }

-  *stream << "<" << kProperties << ">\n";
+  *stream << " <" << kProperties << ">\n";
  for (int i = 0; i < result.test_property_count(); ++i) {
    const TestProperty& property = result.GetTestProperty(i);
-    *stream << "<" << kProperty;
+    *stream << " <" << kProperty;
    *stream << " name=\"" << EscapeXmlAttribute(property.key()) << "\"";
    *stream << " value=\"" << EscapeXmlAttribute(property.value()) << "\"";
    *stream << "/>\n";
  }
-  *stream << "</" << kProperties << ">\n";
+  *stream << " </" << kProperties << ">\n";
}

// End XmlUnitTestResultPrinter
@@ -4442,16 +4451,12 @@ class JsonUnitTestResultPrinter : public EmptyTestEventListener {
  //// streams the attribute as JSON.
  static void OutputJsonKey(std::ostream* stream,
                            const std::string& element_name,
-                            const std::string& name,
-                            const std::string& value,
-                            const std::string& indent,
-                            bool comma = true);
+                            const std::string& name, const std::string& value,
+                            const std::string& indent, bool comma = true);
  static void OutputJsonKey(std::ostream* stream,
                            const std::string& element_name,
-                            const std::string& name,
-                            int value,
-                            const std::string& indent,
-                            bool comma = true);
+                            const std::string& name, int value,
+                            const std::string& indent, bool comma = true);

  // Streams a test suite JSON stanza containing the given test result.
  //
@@ -4484,7 +4489,9 @@ class JsonUnitTestResultPrinter : public EmptyTestEventListener {
  // The output file.
  const std::string output_file_;

-  GTEST_DISALLOW_COPY_AND_ASSIGN_(JsonUnitTestResultPrinter);
+  JsonUnitTestResultPrinter(const JsonUnitTestResultPrinter&) = delete;
+  JsonUnitTestResultPrinter& operator=(const JsonUnitTestResultPrinter&) =
+      delete;
};

// Creates a new JsonUnitTestResultPrinter.
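// A sketch of what OutputXmlTestProperties() above emits; the key/value pair
// is illustrative and this note is not part of the upstream file. A test that
// calls
//
//   RecordProperty("bitrate", "512");
//
// gets the following stanza nested inside its <testcase> element, with the
// key and value run through EscapeXmlAttribute():
//
//   <properties>
//     <property name="bitrate" value="512"/>
//   </properties>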
@@ -4496,7 +4503,7 @@ JsonUnitTestResultPrinter::JsonUnitTestResultPrinter(const char* output_file)
}

void JsonUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
-                                                  int /*iteration*/) {
+                                                   int /*iteration*/) {
  FILE* jsonout = OpenFileForWriting(output_file_);
  std::stringstream stream;
  PrintJsonUnitTest(&stream, unit_test);
@@ -4562,55 +4569,48 @@ static std::string FormatEpochTimeInMillisAsRFC3339(TimeInMillis ms) {
    return "";
  // YYYY-MM-DDThh:mm:ss
  return StreamableToString(time_struct.tm_year + 1900) + "-" +
-      String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
-      String::FormatIntWidth2(time_struct.tm_mday) + "T" +
-      String::FormatIntWidth2(time_struct.tm_hour) + ":" +
-      String::FormatIntWidth2(time_struct.tm_min) + ":" +
-      String::FormatIntWidth2(time_struct.tm_sec) + "Z";
+         String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
+         String::FormatIntWidth2(time_struct.tm_mday) + "T" +
+         String::FormatIntWidth2(time_struct.tm_hour) + ":" +
+         String::FormatIntWidth2(time_struct.tm_min) + ":" +
+         String::FormatIntWidth2(time_struct.tm_sec) + "Z";
}

static inline std::string Indent(size_t width) {
  return std::string(width, ' ');
}

-void JsonUnitTestResultPrinter::OutputJsonKey(
-    std::ostream* stream,
-    const std::string& element_name,
-    const std::string& name,
-    const std::string& value,
-    const std::string& indent,
-    bool comma) {
+void JsonUnitTestResultPrinter::OutputJsonKey(std::ostream* stream,
+                                              const std::string& element_name,
+                                              const std::string& name,
+                                              const std::string& value,
+                                              const std::string& indent,
+                                              bool comma) {
  const std::vector<std::string>& allowed_names =
      GetReservedOutputAttributesForElement(element_name);

  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
-                   allowed_names.end())
+               allowed_names.end())
      << "Key \"" << name << "\" is not allowed for value \"" << element_name
      << "\".";

  *stream << indent << "\"" << name << "\": \"" << EscapeJson(value) << "\"";
-  if (comma)
-    *stream << ",\n";
+  if (comma) *stream << ",\n";
}

void JsonUnitTestResultPrinter::OutputJsonKey(
-    std::ostream* stream,
-    const std::string& element_name,
-    const std::string& name,
-    int value,
-    const std::string& indent,
-    bool comma) {
+    std::ostream* stream, const std::string& element_name,
+    const std::string& name, int value, const std::string& indent, bool comma) {
  const std::vector<std::string>& allowed_names =
      GetReservedOutputAttributesForElement(element_name);

  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
-                   allowed_names.end())
+               allowed_names.end())
      << "Key \"" << name << "\" is not allowed for value \"" << element_name
      << "\".";

  *stream << indent << "\"" << name << "\": " << StreamableToString(value);
-  if (comma)
-    *stream << ",\n";
+  if (comma) *stream << ",\n";
}

// Streams a test suite JSON stanza containing the given test result.
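// The shape of the output produced by the two OutputJsonKey() overloads above,
// with illustrative values (this sketch is not part of the upstream file):
//
//   OutputJsonKey(&stream, "testsuite", "name", "FooTest", Indent(6));
//     appends:       "name": "FooTest",\n
//   OutputJsonKey(&stream, "testsuite", "tests", 3, Indent(6), false);
//     appends:       "tests": 3
//
// i.e. the indent, the quoted key, then either a JSON-escaped string value or
// a bare integer, followed by a trailing ",\n" unless comma == false.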
@@ -4620,7 +4620,7 @@ void JsonUnitTestResultPrinter::OutputJsonTestSuiteForTestResult( *stream << Indent(4) << "{\n"; OutputJsonKey(stream, "testsuite", "name", "NonTestSuiteFailure", Indent(6)); OutputJsonKey(stream, "testsuite", "tests", 1, Indent(6)); - if (!GTEST_FLAG(list_tests)) { + if (!GTEST_FLAG_GET(list_tests)) { OutputJsonKey(stream, "testsuite", "failures", 1, Indent(6)); OutputJsonKey(stream, "testsuite", "disabled", 0, Indent(6)); OutputJsonKey(stream, "testsuite", "skipped", 0, Indent(6)); @@ -4674,11 +4674,14 @@ void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream* stream, OutputJsonKey(stream, kTestsuite, "type_param", test_info.type_param(), kIndent); } - if (GTEST_FLAG(list_tests)) { - OutputJsonKey(stream, kTestsuite, "file", test_info.file(), kIndent); - OutputJsonKey(stream, kTestsuite, "line", test_info.line(), kIndent, false); + + OutputJsonKey(stream, kTestsuite, "file", test_info.file(), kIndent); + OutputJsonKey(stream, kTestsuite, "line", test_info.line(), kIndent, false); + if (GTEST_FLAG_GET(list_tests)) { *stream << "\n" << Indent(8) << "}"; return; + } else { + *stream << ",\n"; } OutputJsonKey(stream, kTestsuite, "status", @@ -4710,7 +4713,9 @@ void JsonUnitTestResultPrinter::OutputJsonTestResult(::std::ostream* stream, if (part.failed()) { *stream << ",\n"; if (++failures == 1) { - *stream << kIndent << "\"" << "failures" << "\": [\n"; + *stream << kIndent << "\"" + << "failures" + << "\": [\n"; } const std::string location = internal::FormatCompilerIndependentFileLocation(part.file_name(), @@ -4723,8 +4728,7 @@ void JsonUnitTestResultPrinter::OutputJsonTestResult(::std::ostream* stream, } } - if (failures > 0) - *stream << "\n" << kIndent << "]"; + if (failures > 0) *stream << "\n" << kIndent << "]"; *stream << "\n" << Indent(8) << "}"; } @@ -4738,7 +4742,7 @@ void JsonUnitTestResultPrinter::PrintJsonTestSuite( OutputJsonKey(stream, kTestsuite, "name", test_suite.name(), kIndent); OutputJsonKey(stream, kTestsuite, "tests", test_suite.reportable_test_count(), kIndent); - if (!GTEST_FLAG(list_tests)) { + if (!GTEST_FLAG_GET(list_tests)) { OutputJsonKey(stream, kTestsuite, "failures", test_suite.failed_test_count(), kIndent); OutputJsonKey(stream, kTestsuite, "disabled", @@ -4785,7 +4789,7 @@ void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream* stream, OutputJsonKey(stream, kTestsuites, "disabled", unit_test.reportable_disabled_test_count(), kIndent); OutputJsonKey(stream, kTestsuites, "errors", 0, kIndent); - if (GTEST_FLAG(shuffle)) { + if (GTEST_FLAG_GET(shuffle)) { OutputJsonKey(stream, kTestsuites, "random_seed", unit_test.random_seed(), kIndent); } @@ -4820,7 +4824,9 @@ void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream* stream, OutputJsonTestSuiteForTestResult(stream, unit_test.ad_hoc_test_result()); } - *stream << "\n" << kIndent << "]\n" << "}\n"; + *stream << "\n" + << kIndent << "]\n" + << "}\n"; } void JsonUnitTestResultPrinter::PrintJsonTestList( @@ -4855,7 +4861,8 @@ std::string JsonUnitTestResultPrinter::TestPropertiesAsJson( Message attributes; for (int i = 0; i < result.test_property_count(); ++i) { const TestProperty& property = result.GetTestProperty(i); - attributes << ",\n" << indent << "\"" << property.key() << "\": " + attributes << ",\n" + << indent << "\"" << property.key() << "\": " << "\"" << EscapeJson(property.value()) << "\""; } return attributes.GetString(); @@ -4895,14 +4902,14 @@ void StreamingListener::SocketWriter::MakeConnection() { addrinfo hints; memset(&hints, 0, sizeof(hints)); - 
hints.ai_family = AF_UNSPEC; // To allow both IPv4 and IPv6 addresses. + hints.ai_family = AF_UNSPEC; // To allow both IPv4 and IPv6 addresses. hints.ai_socktype = SOCK_STREAM; addrinfo* servinfo = nullptr; // Use the getaddrinfo() to get a linked list of IP addresses for // the given host name. - const int error_num = getaddrinfo( - host_name_.c_str(), port_num_.c_str(), &hints, &servinfo); + const int error_num = + getaddrinfo(host_name_.c_str(), port_num_.c_str(), &hints, &servinfo); if (error_num != 0) { GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: " << gai_strerror(error_num); @@ -4911,8 +4918,8 @@ void StreamingListener::SocketWriter::MakeConnection() { // Loop through all the results and connect to the first we can. for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != nullptr; cur_addr = cur_addr->ai_next) { - sockfd_ = socket( - cur_addr->ai_family, cur_addr->ai_socktype, cur_addr->ai_protocol); + sockfd_ = socket(cur_addr->ai_family, cur_addr->ai_socktype, + cur_addr->ai_protocol); if (sockfd_ != -1) { // Connect the client socket to the server socket. if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) { @@ -4962,7 +4969,7 @@ std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count) for (int i = 0; i < raw_stack_size; ++i) { if (raw_stack[i] == caller_frame && - !GTEST_FLAG(show_internal_stack_frames)) { + !GTEST_FLAG_GET(show_internal_stack_frames)) { // Add a marker to the trace and stop adding frames. absl::StrAppend(&result, kElidedFramesMarker, "\n"); break; @@ -4981,7 +4988,7 @@ std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count) return result; -#else // !GTEST_HAS_ABSL +#else // !GTEST_HAS_ABSL static_cast(max_depth); static_cast(skip_count); return ""; @@ -5005,14 +5012,14 @@ void OsStackTraceGetter::UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_) { class ScopedPrematureExitFile { public: explicit ScopedPrematureExitFile(const char* premature_exit_filepath) - : premature_exit_filepath_(premature_exit_filepath ? - premature_exit_filepath : "") { + : premature_exit_filepath_( + premature_exit_filepath ? premature_exit_filepath : "") { // If a path to the premature-exit file is specified... if (!premature_exit_filepath_.empty()) { // create the file with a single "0" character in it. I/O // errors are ignored as there's nothing better we can do and we // don't want to fail the test because of this. - FILE* pfile = posix::FOpen(premature_exit_filepath, "w"); + FILE* pfile = posix::FOpen(premature_exit_filepath_.c_str(), "w"); fwrite("0", 1, 1, pfile); fclose(pfile); } @@ -5034,7 +5041,8 @@ class ScopedPrematureExitFile { private: const std::string premature_exit_filepath_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedPrematureExitFile); + ScopedPrematureExitFile(const ScopedPrematureExitFile&) = delete; + ScopedPrematureExitFile& operator=(const ScopedPrematureExitFile&) = delete; }; } // namespace internal @@ -5208,7 +5216,7 @@ int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); } // Gets the time of the test program start, in ms from the start of the // UNIX epoch. internal::TimeInMillis UnitTest::start_timestamp() const { - return impl()->start_timestamp(); + return impl()->start_timestamp(); } // Gets the elapsed time, in milliseconds. @@ -5251,9 +5259,7 @@ TestSuite* UnitTest::GetMutableTestSuite(int i) { // Returns the list of event listeners that can be used to track events // inside Google Test. 
-TestEventListeners& UnitTest::listeners() { - return *impl()->listeners(); -} +TestEventListeners& UnitTest::listeners() { return *impl()->listeners(); } // Registers and returns a global test environment. When a test // program is run, all global test environments will be set-up in the @@ -5278,12 +5284,11 @@ Environment* UnitTest::AddEnvironment(Environment* env) { // assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call // this to report their results. The user code should use the // assertion macros instead of calling this directly. -void UnitTest::AddTestPartResult( - TestPartResult::Type result_type, - const char* file_name, - int line_number, - const std::string& message, - const std::string& os_stack_trace) GTEST_LOCK_EXCLUDED_(mutex_) { +void UnitTest::AddTestPartResult(TestPartResult::Type result_type, + const char* file_name, int line_number, + const std::string& message, + const std::string& os_stack_trace) + GTEST_LOCK_EXCLUDED_(mutex_) { Message msg; msg << message; @@ -5293,8 +5298,9 @@ void UnitTest::AddTestPartResult( for (size_t i = impl_->gtest_trace_stack().size(); i > 0; --i) { const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1]; - msg << "\n" << internal::FormatFileLocation(trace.file, trace.line) - << " " << trace.message; + msg << "\n" + << internal::FormatFileLocation(trace.file, trace.line) << " " + << trace.message; } } @@ -5304,8 +5310,8 @@ void UnitTest::AddTestPartResult( const TestPartResult result = TestPartResult( result_type, file_name, line_number, msg.GetString().c_str()); - impl_->GetTestPartResultReporterForCurrentThread()-> - ReportTestPartResult(result); + impl_->GetTestPartResultReporterForCurrentThread()->ReportTestPartResult( + result); if (result_type != TestPartResult::kSuccess && result_type != TestPartResult::kSkip) { @@ -5314,7 +5320,7 @@ void UnitTest::AddTestPartResult( // in the code (perhaps in order to use Google Test assertions // with another testing framework) and specify the former on the // command line for debugging. - if (GTEST_FLAG(break_on_failure)) { + if (GTEST_FLAG_GET(break_on_failure)) { #if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT // Using DebugBreak on Windows allows gtest to still break into a debugger // when a failure happens and both the --gtest_break_on_failure and @@ -5331,7 +5337,7 @@ void UnitTest::AddTestPartResult( // portability: some debuggers don't correctly trap abort(). *static_cast(nullptr) = 1; #endif // GTEST_OS_WINDOWS - } else if (GTEST_FLAG(throw_on_failure)) { + } else if (GTEST_FLAG_GET(throw_on_failure)) { #if GTEST_HAS_EXCEPTIONS throw internal::GoogleTestFailureException(result); #else @@ -5360,7 +5366,7 @@ void UnitTest::RecordProperty(const std::string& key, // from the main thread. int UnitTest::Run() { const bool in_death_test_child_process = - internal::GTEST_FLAG(internal_run_death_test).length() > 0; + GTEST_FLAG_GET(internal_run_death_test).length() > 0; // Google Test implements this protocol for catching that a test // program exits before returning control to Google Test: @@ -5390,7 +5396,7 @@ int UnitTest::Run() { // Captures the value of GTEST_FLAG(catch_exceptions). This value will be // used for the duration of the program. - impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions)); + impl()->set_catch_exceptions(GTEST_FLAG_GET(catch_exceptions)); #if GTEST_OS_WINDOWS // Either the user wants Google Test to catch exceptions thrown by the @@ -5398,26 +5404,26 @@ int UnitTest::Run() { // process. 
In either case the user does not want to see pop-up dialogs // about crashes - they are expected. if (impl()->catch_exceptions() || in_death_test_child_process) { -# if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT +#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT // SetErrorMode doesn't exist on CE. SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT | SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX); -# endif // !GTEST_OS_WINDOWS_MOBILE +#endif // !GTEST_OS_WINDOWS_MOBILE -# if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE +#if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE // Death test children can be terminated with _abort(). On Windows, // _abort() can show a dialog with a warning message. This forces the // abort message to go to stderr instead. _set_error_mode(_OUT_TO_STDERR); -# endif +#endif -# if defined(_MSC_VER) && !GTEST_OS_WINDOWS_MOBILE +#if defined(_MSC_VER) && !GTEST_OS_WINDOWS_MOBILE // In the debug version, Visual Studio pops up a separate dialog // offering a choice to debug the aborted program. We need to suppress // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement // executed. Google Test will notify the user of any unexpected // failure via stderr. - if (!GTEST_FLAG(break_on_failure)) + if (!GTEST_FLAG_GET(break_on_failure)) _set_abort_behavior( 0x0, // Clear the following flags: _WRITE_ABORT_MSG | _CALL_REPORTFAULT); // pop-up window, core dump. @@ -5431,14 +5437,15 @@ int UnitTest::Run() { _CRTDBG_MODE_FILE | _CRTDBG_MODE_DEBUG); (void)_CrtSetReportFile(_CRT_ASSERT, _CRTDBG_FILE_STDERR); } -# endif +#endif } #endif // GTEST_OS_WINDOWS return internal::HandleExceptionsInMethodIfSupported( - impl(), - &internal::UnitTestImpl::RunAllTests, - "auxiliary test code (environments or event listeners)") ? 0 : 1; + impl(), &internal::UnitTestImpl::RunAllTests, + "auxiliary test code (environments or event listeners)") + ? 0 + : 1; } // Returns the working directory when the first TEST() or TEST_F() was @@ -5483,14 +5490,10 @@ UnitTest::parameterized_test_registry() GTEST_LOCK_EXCLUDED_(mutex_) { } // Creates an empty UnitTest. -UnitTest::UnitTest() { - impl_ = new internal::UnitTestImpl(this); -} +UnitTest::UnitTest() { impl_ = new internal::UnitTestImpl(this); } // Destructor of UnitTest. -UnitTest::~UnitTest() { - delete impl_; -} +UnitTest::~UnitTest() { delete impl_; } // Pushes a trace defined by SCOPED_TRACE() on to the per-thread // Google Test trace stack. @@ -5501,8 +5504,7 @@ void UnitTest::PushGTestTrace(const internal::TraceInfo& trace) } // Pops a trace from the per-thread Google Test trace stack. -void UnitTest::PopGTestTrace() - GTEST_LOCK_EXCLUDED_(mutex_) { +void UnitTest::PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_) { internal::MutexLock lock(&mutex_); impl_->gtest_trace_stack().pop_back(); } @@ -5599,12 +5601,12 @@ void UnitTestImpl::ConfigureXmlOutput() { // Initializes event listeners for streaming test results in string form. // Must not be called before InitGoogleTest. 
void UnitTestImpl::ConfigureStreamingOutput() { - const std::string& target = GTEST_FLAG(stream_result_to); + const std::string& target = GTEST_FLAG_GET(stream_result_to); if (!target.empty()) { const size_t pos = target.find(':'); if (pos != std::string::npos) { - listeners()->Append(new StreamingListener(target.substr(0, pos), - target.substr(pos+1))); + listeners()->Append( + new StreamingListener(target.substr(0, pos), target.substr(pos + 1))); } else { GTEST_LOG_(WARNING) << "unrecognized streaming target \"" << target << "\" ignored."; @@ -5642,7 +5644,7 @@ void UnitTestImpl::PostFlagParsingInit() { // to shut down the default XML output before invoking RUN_ALL_TESTS. ConfigureXmlOutput(); - if (GTEST_FLAG(brief)) { + if (GTEST_FLAG_GET(brief)) { listeners()->SetDefaultResultPrinter(new BriefUnitTestResultPrinter); } @@ -5652,7 +5654,7 @@ void UnitTestImpl::PostFlagParsingInit() { #endif // GTEST_CAN_STREAM_RESULTS_ #if GTEST_HAS_ABSL - if (GTEST_FLAG(install_failure_signal_handler)) { + if (GTEST_FLAG_GET(install_failure_signal_handler)) { absl::FailureSignalHandlerOptions options; absl::InstallFailureSignalHandler(options); } @@ -5710,9 +5712,9 @@ TestSuite* UnitTestImpl::GetTestSuite( auto* const new_test_suite = new TestSuite(test_suite_name, type_param, set_up_tc, tear_down_tc); + const UnitTestFilter death_test_suite_filter(kDeathTestSuiteFilter); // Is this a death test suite? - if (internal::UnitTestOptions::MatchesFilter(test_suite_name, - kDeathTestSuiteFilter)) { + if (death_test_suite_filter.MatchesName(test_suite_name)) { // Yes. Inserts the test suite after the last death test suite // defined so far. This only works when the test suites haven't // been shuffled. Otherwise we may end up running a death test @@ -5749,8 +5751,7 @@ bool UnitTestImpl::RunAllTests() { const bool gtest_is_initialized_before_run_all_tests = GTestIsInitialized(); // Do not run any test if the --help flag was specified. - if (g_help_flag) - return true; + if (g_help_flag) return true; // Repeats the call to the post-flag parsing initialization in case the // user didn't call InitGoogleTest. @@ -5768,11 +5769,11 @@ bool UnitTestImpl::RunAllTests() { #if GTEST_HAS_DEATH_TEST in_subprocess_for_death_test = (internal_run_death_test_flag_.get() != nullptr); -# if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_) +#if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_) if (in_subprocess_for_death_test) { GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_(); } -# endif // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_) +#endif // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_) #endif // GTEST_HAS_DEATH_TEST const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex, @@ -5780,19 +5781,18 @@ bool UnitTestImpl::RunAllTests() { // Compares the full test names with the filter to decide which // tests to run. - const bool has_tests_to_run = FilterTests(should_shard - ? HONOR_SHARDING_PROTOCOL - : IGNORE_SHARDING_PROTOCOL) > 0; + const bool has_tests_to_run = + FilterTests(should_shard ? HONOR_SHARDING_PROTOCOL + : IGNORE_SHARDING_PROTOCOL) > 0; // Lists the tests and exits if the --gtest_list_tests flag was specified. - if (GTEST_FLAG(list_tests)) { + if (GTEST_FLAG_GET(list_tests)) { // This must be called *after* FilterTests() has been called. ListTestsMatchingFilter(); return true; } - random_seed_ = GTEST_FLAG(shuffle) ? - GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0; + random_seed_ = GetRandomSeedFromFlag(GTEST_FLAG_GET(random_seed)); // True if and only if at least one test has failed. 
bool failed = false; @@ -5804,9 +5804,21 @@ bool UnitTestImpl::RunAllTests() { // How many times to repeat the tests? We don't want to repeat them // when we are inside the subprocess of a death test. - const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG(repeat); + const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG_GET(repeat); + // Repeats forever if the repeat count is negative. const bool gtest_repeat_forever = repeat < 0; + + // Should test environments be set up and torn down for each repeat, or only + // set up on the first and torn down on the last iteration? If there is no + // "last" iteration because the tests will repeat forever, always recreate the + // environments to avoid leaks in case one of the environments is using + // resources that are external to this process. Without this check there would + // be no way to clean up those external resources automatically. + const bool recreate_environments_when_repeating = + GTEST_FLAG_GET(recreate_environments_when_repeating) || + gtest_repeat_forever; + for (int i = 0; gtest_repeat_forever || i != repeat; i++) { // We want to preserve failures generated by ad-hoc test // assertions executed before RUN_ALL_TESTS(). @@ -5815,7 +5827,7 @@ bool UnitTestImpl::RunAllTests() { Timer timer; // Shuffles test suites and tests if requested. - if (has_tests_to_run && GTEST_FLAG(shuffle)) { + if (has_tests_to_run && GTEST_FLAG_GET(shuffle)) { random()->Reseed(static_cast(random_seed_)); // This should be done before calling OnTestIterationStart(), // such that a test event listener can see the actual test order @@ -5828,10 +5840,13 @@ bool UnitTestImpl::RunAllTests() { // Runs each test suite if there is at least one test to run. if (has_tests_to_run) { - // Sets up all environments beforehand. - repeater->OnEnvironmentsSetUpStart(*parent_); - ForEach(environments_, SetUpEnvironment); - repeater->OnEnvironmentsSetUpEnd(*parent_); + // Sets up all environments beforehand. If test environments aren't + // recreated for each iteration, only do so on the first iteration. + if (i == 0 || recreate_environments_when_repeating) { + repeater->OnEnvironmentsSetUpStart(*parent_); + ForEach(environments_, SetUpEnvironment); + repeater->OnEnvironmentsSetUpEnd(*parent_); + } // Runs the tests only if there was no fatal failure or skip triggered // during global set-up. @@ -5853,7 +5868,7 @@ bool UnitTestImpl::RunAllTests() { for (int test_index = 0; test_index < total_test_suite_count(); test_index++) { GetMutableSuiteCase(test_index)->Run(); - if (GTEST_FLAG(fail_fast) && + if (GTEST_FLAG_GET(fail_fast) && GetMutableSuiteCase(test_index)->Failed()) { for (int j = test_index + 1; j < total_test_suite_count(); j++) { GetMutableSuiteCase(j)->Skip(); @@ -5871,11 +5886,15 @@ bool UnitTestImpl::RunAllTests() { } } - // Tears down all environments in reverse order afterwards. - repeater->OnEnvironmentsTearDownStart(*parent_); - std::for_each(environments_.rbegin(), environments_.rend(), - TearDownEnvironment); - repeater->OnEnvironmentsTearDownEnd(*parent_); + // Tears down all environments in reverse order afterwards. If test + // environments aren't recreated for each iteration, only do so on the + // last iteration. 
+ if (i == repeat - 1 || recreate_environments_when_repeating) { + repeater->OnEnvironmentsTearDownStart(*parent_); + std::for_each(environments_.rbegin(), environments_.rend(), + TearDownEnvironment); + repeater->OnEnvironmentsTearDownEnd(*parent_); + } } elapsed_time_ = timer.Elapsed(); @@ -5896,7 +5915,7 @@ bool UnitTestImpl::RunAllTests() { // (it's always safe to unshuffle the tests). UnshuffleTests(); - if (GTEST_FLAG(shuffle)) { + if (GTEST_FLAG_GET(shuffle)) { // Picks a new random seed for each iteration. random_seed_ = GetNextRandomSeed(random_seed_); } @@ -5947,8 +5966,7 @@ void WriteToShardStatusFileIfNeeded() { // an error and exits. If in_subprocess_for_death_test, sharding is // disabled because it must only be applied to the original test // process. Otherwise, we could filter out death tests we intended to execute. -bool ShouldShard(const char* total_shards_env, - const char* shard_index_env, +bool ShouldShard(const char* total_shards_env, const char* shard_index_env, bool in_subprocess_for_death_test) { if (in_subprocess_for_death_test) { return false; @@ -5960,27 +5978,27 @@ bool ShouldShard(const char* total_shards_env, if (total_shards == -1 && shard_index == -1) { return false; } else if (total_shards == -1 && shard_index != -1) { - const Message msg = Message() - << "Invalid environment variables: you have " - << kTestShardIndex << " = " << shard_index - << ", but have left " << kTestTotalShards << " unset.\n"; + const Message msg = Message() << "Invalid environment variables: you have " + << kTestShardIndex << " = " << shard_index + << ", but have left " << kTestTotalShards + << " unset.\n"; ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str()); fflush(stdout); exit(EXIT_FAILURE); } else if (total_shards != -1 && shard_index == -1) { const Message msg = Message() - << "Invalid environment variables: you have " - << kTestTotalShards << " = " << total_shards - << ", but have left " << kTestShardIndex << " unset.\n"; + << "Invalid environment variables: you have " + << kTestTotalShards << " = " << total_shards + << ", but have left " << kTestShardIndex << " unset.\n"; ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str()); fflush(stdout); exit(EXIT_FAILURE); } else if (shard_index < 0 || shard_index >= total_shards) { - const Message msg = Message() - << "Invalid environment variables: we require 0 <= " - << kTestShardIndex << " < " << kTestTotalShards - << ", but you have " << kTestShardIndex << "=" << shard_index - << ", " << kTestTotalShards << "=" << total_shards << ".\n"; + const Message msg = + Message() << "Invalid environment variables: we require 0 <= " + << kTestShardIndex << " < " << kTestTotalShards + << ", but you have " << kTestShardIndex << "=" << shard_index + << ", " << kTestTotalShards << "=" << total_shards << ".\n"; ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str()); fflush(stdout); exit(EXIT_FAILURE); @@ -6022,11 +6040,16 @@ bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) { // https://github.com/google/googletest/blob/master/googletest/docs/advanced.md // . Returns the number of tests that should run. int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) { - const int32_t total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ? - Int32FromEnvOrDie(kTestTotalShards, -1) : -1; - const int32_t shard_index = shard_tests == HONOR_SHARDING_PROTOCOL ? - Int32FromEnvOrDie(kTestShardIndex, -1) : -1; - + const int32_t total_shards = shard_tests == HONOR_SHARDING_PROTOCOL + ? 
Int32FromEnvOrDie(kTestTotalShards, -1) + : -1; + const int32_t shard_index = shard_tests == HONOR_SHARDING_PROTOCOL + ? Int32FromEnvOrDie(kTestShardIndex, -1) + : -1; + + const PositiveAndNegativeUnitTestFilter gtest_flag_filter( + GTEST_FLAG_GET(filter)); + const UnitTestFilter disable_test_filter(kDisableTestFilter); // num_runnable_tests are the number of tests that will // run across all shards (i.e., match filter and are not disabled). // num_selected_tests are the number of tests to be run on @@ -6042,18 +6065,17 @@ int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) { const std::string test_name(test_info->name()); // A test is disabled if test suite name or test name matches // kDisableTestFilter. - const bool is_disabled = internal::UnitTestOptions::MatchesFilter( - test_suite_name, kDisableTestFilter) || - internal::UnitTestOptions::MatchesFilter( - test_name, kDisableTestFilter); + const bool is_disabled = + disable_test_filter.MatchesName(test_suite_name) || + disable_test_filter.MatchesName(test_name); test_info->is_disabled_ = is_disabled; - const bool matches_filter = internal::UnitTestOptions::FilterMatchesTest( - test_suite_name, test_name); + const bool matches_filter = + gtest_flag_filter.MatchesTest(test_suite_name, test_name); test_info->matches_filter_ = matches_filter; const bool is_runnable = - (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) && + (GTEST_FLAG_GET(also_run_disabled_tests) || !is_disabled) && matches_filter; const bool is_in_another_shard = @@ -6222,8 +6244,8 @@ void UnitTestImpl::UnshuffleTests() { // For example, if Foo() calls Bar(), which in turn calls // GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in // the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't. -std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/, - int skip_count) { +GTEST_NO_INLINE_ GTEST_NO_TAIL_CALL_ std::string +GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/, int skip_count) { // We pass skip_count + 1 to skip this wrapper function in addition // to what the user really wants to skip. return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(skip_count + 1); @@ -6233,7 +6255,7 @@ std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/, // suppress unreachable code warnings. namespace { class ClassUniqueToAlwaysTrue {}; -} +} // namespace bool IsTrue(bool condition) { return condition; } @@ -6241,8 +6263,7 @@ bool AlwaysTrue() { #if GTEST_HAS_EXCEPTIONS // This condition is always false so AlwaysTrue() never actually throws, // but it makes the compiler think that it may throw. - if (IsTrue(false)) - throw ClassUniqueToAlwaysTrue(); + if (IsTrue(false)) throw ClassUniqueToAlwaysTrue(); #endif // GTEST_HAS_EXCEPTIONS return true; } @@ -6264,13 +6285,14 @@ bool SkipPrefix(const char* prefix, const char** pstr) { // part can be omitted. // // Returns the value of the flag, or NULL if the parsing failed. -static const char* ParseFlagValue(const char* str, const char* flag, +static const char* ParseFlagValue(const char* str, const char* flag_name, bool def_optional) { // str and flag must not be NULL. - if (str == nullptr || flag == nullptr) return nullptr; + if (str == nullptr || flag_name == nullptr) return nullptr; // The flag must start with "--" followed by GTEST_FLAG_PREFIX_. 
- const std::string flag_str = std::string("--") + GTEST_FLAG_PREFIX_ + flag; + const std::string flag_str = + std::string("--") + GTEST_FLAG_PREFIX_ + flag_name; const size_t flag_len = flag_str.length(); if (strncmp(str, flag_str.c_str(), flag_len) != 0) return nullptr; @@ -6301,9 +6323,9 @@ static const char* ParseFlagValue(const char* str, const char* flag, // // On success, stores the value of the flag in *value, and returns // true. On failure, returns false without changing *value. -static bool ParseBoolFlag(const char* str, const char* flag, bool* value) { +static bool ParseFlag(const char* str, const char* flag_name, bool* value) { // Gets the value of the flag as a string. - const char* const value_str = ParseFlagValue(str, flag, true); + const char* const value_str = ParseFlagValue(str, flag_name, true); // Aborts if the parsing failed. if (value_str == nullptr) return false; @@ -6317,16 +6339,16 @@ static bool ParseBoolFlag(const char* str, const char* flag, bool* value) { // // On success, stores the value of the flag in *value, and returns // true. On failure, returns false without changing *value. -bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) { +bool ParseFlag(const char* str, const char* flag_name, int32_t* value) { // Gets the value of the flag as a string. - const char* const value_str = ParseFlagValue(str, flag, false); + const char* const value_str = ParseFlagValue(str, flag_name, false); // Aborts if the parsing failed. if (value_str == nullptr) return false; // Sets *value to the value of the flag. - return ParseInt32(Message() << "The value of flag --" << flag, - value_str, value); + return ParseInt32(Message() << "The value of flag --" << flag_name, value_str, + value); } // Parses a string for a string flag, in the form of "--flag=value". @@ -6334,9 +6356,9 @@ bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) { // On success, stores the value of the flag in *value, and returns // true. On failure, returns false without changing *value. template -static bool ParseStringFlag(const char* str, const char* flag, String* value) { +static bool ParseFlag(const char* str, const char* flag_name, String* value) { // Gets the value of the flag as a string. - const char* const value_str = ParseFlagValue(str, flag, false); + const char* const value_str = ParseFlagValue(str, flag_name, false); // Aborts if the parsing failed. if (value_str == nullptr) return false; @@ -6353,8 +6375,7 @@ static bool ParseStringFlag(const char* str, const char* flag, String* value) { // GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test // internal flags and do not trigger the help message. 
static bool HasGoogleTestFlagPrefix(const char* str) { - return (SkipPrefix("--", &str) || - SkipPrefix("-", &str) || + return (SkipPrefix("--", &str) || SkipPrefix("-", &str) || SkipPrefix("/", &str)) && !SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) && (SkipPrefix(GTEST_FLAG_PREFIX_, &str) || @@ -6437,6 +6458,10 @@ static const char kColorEncodedHelpMessage[] = "random_seed=@Y[NUMBER]@D\n" " Random number seed to use for shuffling test orders (between 1 and\n" " 99999, or 0 to use a seed based on the current time).\n" + " @G--" GTEST_FLAG_PREFIX_ + "recreate_environments_when_repeating@D\n" + " Sets up and tears down the global test environment on each repeat\n" + " of the test.\n" "\n" "Test Output:\n" " @G--" GTEST_FLAG_PREFIX_ @@ -6454,18 +6479,18 @@ static const char kColorEncodedHelpMessage[] = " Generate a JSON or XML report in the given directory or with the " "given\n" " file name. @YFILE_PATH@D defaults to @Gtest_detail.xml@D.\n" -# if GTEST_CAN_STREAM_RESULTS_ +#if GTEST_CAN_STREAM_RESULTS_ " @G--" GTEST_FLAG_PREFIX_ "stream_result_to=@YHOST@G:@YPORT@D\n" " Stream test results to the given server.\n" -# endif // GTEST_CAN_STREAM_RESULTS_ +#endif // GTEST_CAN_STREAM_RESULTS_ "\n" "Assertion Behavior:\n" -# if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS +#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS " @G--" GTEST_FLAG_PREFIX_ "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n" " Set the default death test style.\n" -# endif // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS +#endif // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS " @G--" GTEST_FLAG_PREFIX_ "break_on_failure@D\n" " Turn assertion failures into debugger break-points.\n" @@ -6497,41 +6522,44 @@ static const char kColorEncodedHelpMessage[] = "@G<" GTEST_DEV_EMAIL_ ">@D.\n"; static bool ParseGoogleTestFlag(const char* const arg) { - return ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag, - >EST_FLAG(also_run_disabled_tests)) || - ParseBoolFlag(arg, kBreakOnFailureFlag, - >EST_FLAG(break_on_failure)) || - ParseBoolFlag(arg, kCatchExceptionsFlag, - >EST_FLAG(catch_exceptions)) || - ParseStringFlag(arg, kColorFlag, >EST_FLAG(color)) || - ParseStringFlag(arg, kDeathTestStyleFlag, - >EST_FLAG(death_test_style)) || - ParseBoolFlag(arg, kDeathTestUseFork, - >EST_FLAG(death_test_use_fork)) || - ParseBoolFlag(arg, kFailFast, >EST_FLAG(fail_fast)) || - ParseStringFlag(arg, kFilterFlag, >EST_FLAG(filter)) || - ParseStringFlag(arg, kInternalRunDeathTestFlag, - >EST_FLAG(internal_run_death_test)) || - ParseBoolFlag(arg, kListTestsFlag, >EST_FLAG(list_tests)) || - ParseStringFlag(arg, kOutputFlag, >EST_FLAG(output)) || - ParseBoolFlag(arg, kBriefFlag, >EST_FLAG(brief)) || - ParseBoolFlag(arg, kPrintTimeFlag, >EST_FLAG(print_time)) || - ParseBoolFlag(arg, kPrintUTF8Flag, >EST_FLAG(print_utf8)) || - ParseInt32Flag(arg, kRandomSeedFlag, >EST_FLAG(random_seed)) || - ParseInt32Flag(arg, kRepeatFlag, >EST_FLAG(repeat)) || - ParseBoolFlag(arg, kShuffleFlag, >EST_FLAG(shuffle)) || - ParseInt32Flag(arg, kStackTraceDepthFlag, - >EST_FLAG(stack_trace_depth)) || - ParseStringFlag(arg, kStreamResultToFlag, - >EST_FLAG(stream_result_to)) || - ParseBoolFlag(arg, kThrowOnFailureFlag, >EST_FLAG(throw_on_failure)); +#define GTEST_INTERNAL_PARSE_FLAG(flag_name) \ + do { \ + auto value = GTEST_FLAG_GET(flag_name); \ + if (ParseFlag(arg, #flag_name, &value)) { \ + GTEST_FLAG_SET(flag_name, value); \ + return true; \ + } \ + } while (false) + + GTEST_INTERNAL_PARSE_FLAG(also_run_disabled_tests); + GTEST_INTERNAL_PARSE_FLAG(break_on_failure); + 
GTEST_INTERNAL_PARSE_FLAG(catch_exceptions); + GTEST_INTERNAL_PARSE_FLAG(color); + GTEST_INTERNAL_PARSE_FLAG(death_test_style); + GTEST_INTERNAL_PARSE_FLAG(death_test_use_fork); + GTEST_INTERNAL_PARSE_FLAG(fail_fast); + GTEST_INTERNAL_PARSE_FLAG(filter); + GTEST_INTERNAL_PARSE_FLAG(internal_run_death_test); + GTEST_INTERNAL_PARSE_FLAG(list_tests); + GTEST_INTERNAL_PARSE_FLAG(output); + GTEST_INTERNAL_PARSE_FLAG(brief); + GTEST_INTERNAL_PARSE_FLAG(print_time); + GTEST_INTERNAL_PARSE_FLAG(print_utf8); + GTEST_INTERNAL_PARSE_FLAG(random_seed); + GTEST_INTERNAL_PARSE_FLAG(repeat); + GTEST_INTERNAL_PARSE_FLAG(recreate_environments_when_repeating); + GTEST_INTERNAL_PARSE_FLAG(shuffle); + GTEST_INTERNAL_PARSE_FLAG(stack_trace_depth); + GTEST_INTERNAL_PARSE_FLAG(stream_result_to); + GTEST_INTERNAL_PARSE_FLAG(throw_on_failure); + return false; } #if GTEST_USE_OWN_FLAGFILE_FLAG_ static void LoadFlagsFromFile(const std::string& path) { FILE* flagfile = posix::FOpen(path.c_str(), "r"); if (!flagfile) { - GTEST_LOG_(FATAL) << "Unable to open file \"" << GTEST_FLAG(flagfile) + GTEST_LOG_(FATAL) << "Unable to open file \"" << GTEST_FLAG_GET(flagfile) << "\""; } std::string contents(ReadEntireFile(flagfile)); @@ -6539,10 +6567,8 @@ static void LoadFlagsFromFile(const std::string& path) { std::vector lines; SplitString(contents, '\n', &lines); for (size_t i = 0; i < lines.size(); ++i) { - if (lines[i].empty()) - continue; - if (!ParseGoogleTestFlag(lines[i].c_str())) - g_help_flag = true; + if (lines[i].empty()) continue; + if (!ParseGoogleTestFlag(lines[i].c_str())) g_help_flag = true; } } #endif // GTEST_USE_OWN_FLAGFILE_FLAG_ @@ -6552,25 +6578,23 @@ static void LoadFlagsFromFile(const std::string& path) { // instantiated to either char or wchar_t. template void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) { + std::string flagfile_value; for (int i = 1; i < *argc; i++) { const std::string arg_string = StreamableToString(argv[i]); const char* const arg = arg_string.c_str(); - using internal::ParseBoolFlag; - using internal::ParseInt32Flag; - using internal::ParseStringFlag; + using internal::ParseFlag; bool remove_flag = false; if (ParseGoogleTestFlag(arg)) { remove_flag = true; #if GTEST_USE_OWN_FLAGFILE_FLAG_ - } else if (ParseStringFlag(arg, kFlagfileFlag, >EST_FLAG(flagfile))) { - LoadFlagsFromFile(GTEST_FLAG(flagfile)); + } else if (ParseFlag(arg, "flagfile", &flagfile_value)) { + GTEST_FLAG_SET(flagfile, flagfile_value); + LoadFlagsFromFile(flagfile_value); remove_flag = true; #endif // GTEST_USE_OWN_FLAGFILE_FLAG_ - } else if (arg_string == "--help" || arg_string == "-h" || - arg_string == "-?" || arg_string == "/?" || - HasGoogleTestFlagPrefix(arg)) { + } else if (arg_string == "--help" || HasGoogleTestFlagPrefix(arg)) { // Both help flag and unrecognized Google Test flags (excluding // internal ones) trigger help display. g_help_flag = true; @@ -6605,7 +6629,27 @@ void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) { // Parses the command line for Google Test flags, without initializing // other parts of Google Test. void ParseGoogleTestFlagsOnly(int* argc, char** argv) { +#if GTEST_HAS_ABSL + if (*argc > 0) { + // absl::ParseCommandLine() requires *argc > 0. 
+ auto positional_args = absl::flags_internal::ParseCommandLineImpl( + *argc, argv, absl::flags_internal::ArgvListAction::kRemoveParsedArgs, + absl::flags_internal::UsageFlagsAction::kHandleUsage, + absl::flags_internal::OnUndefinedFlag::kReportUndefined); + // Any command-line positional arguments not part of any command-line flag + // (or arguments to a flag) are copied back out to argv, with the program + // invocation name at position 0, and argc is resized. This includes + // positional arguments after the flag-terminating delimiter '--'. + // See https://abseil.io/docs/cpp/guides/flags. + std::copy(positional_args.begin(), positional_args.end(), argv); + if (static_cast(positional_args.size()) < *argc) { + argv[positional_args.size()] = nullptr; + *argc = static_cast(positional_args.size()); + } + } +#else ParseGoogleTestFlagsOnlyImpl(argc, argv); +#endif // Fix the value of *_NSGetArgc() on macOS, but if and only if // *_NSGetArgv() == argv @@ -6640,6 +6684,12 @@ void InitGoogleTestImpl(int* argc, CharType** argv) { #if GTEST_HAS_ABSL absl::InitializeSymbolizer(g_argvs[0].c_str()); + + // When using the Abseil Flags library, set the program usage message to the + // help message, but remove the color-encoding from the message first. + absl::SetProgramUsageMessage(absl::StrReplaceAll( + kColorEncodedHelpMessage, + {{"@D", ""}, {"@R", ""}, {"@G", ""}, {"@Y", ""}, {"@@", "@"}})); #endif // GTEST_HAS_ABSL ParseGoogleTestFlagsOnly(argc, argv); @@ -6660,7 +6710,7 @@ void InitGoogleTestImpl(int* argc, CharType** argv) { void InitGoogleTest(int* argc, char** argv) { #if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv); -#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) +#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) internal::InitGoogleTestImpl(argc, argv); #endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) } @@ -6670,7 +6720,7 @@ void InitGoogleTest(int* argc, char** argv) { void InitGoogleTest(int* argc, wchar_t** argv) { #if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv); -#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) +#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) internal::InitGoogleTestImpl(argc, argv); #endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) } @@ -6686,42 +6736,42 @@ void InitGoogleTest() { #if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(&argc, argv); -#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) +#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) internal::InitGoogleTestImpl(&argc, argv); #endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) } +#if !defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_) +// Return value of first environment variable that is set and contains +// a non-empty string. If there are none, return the "fallback" string. +// Since we like the temporary directory to have a directory separator suffix, +// add it if not provided in the environment variable value. 
+static std::string GetTempDirFromEnv( + std::initializer_list environment_variables, + const char* fallback, char separator) { + for (const char* variable_name : environment_variables) { + const char* value = internal::posix::GetEnv(variable_name); + if (value != nullptr && value[0] != '\0') { + if (value[strlen(value) - 1] != separator) { + return std::string(value).append(1, separator); + } + return value; + } + } + return fallback; +} +#endif + std::string TempDir() { #if defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_) return GTEST_CUSTOM_TEMPDIR_FUNCTION_(); -#elif GTEST_OS_WINDOWS_MOBILE - return "\\temp\\"; -#elif GTEST_OS_WINDOWS - const char* temp_dir = internal::posix::GetEnv("TEMP"); - if (temp_dir == nullptr || temp_dir[0] == '\0') { - return "\\temp\\"; - } else if (temp_dir[strlen(temp_dir) - 1] == '\\') { - return temp_dir; - } else { - return std::string(temp_dir) + "\\"; - } +#elif GTEST_OS_WINDOWS || GTEST_OS_WINDOWS_MOBILE + return GetTempDirFromEnv({"TEST_TMPDIR", "TEMP"}, "\\temp\\", '\\'); #elif GTEST_OS_LINUX_ANDROID - const char* temp_dir = internal::posix::GetEnv("TEST_TMPDIR"); - if (temp_dir == nullptr || temp_dir[0] == '\0') { - return "/data/local/tmp/"; - } else { - return temp_dir; - } -#elif GTEST_OS_LINUX - const char* temp_dir = internal::posix::GetEnv("TEST_TMPDIR"); - if (temp_dir == nullptr || temp_dir[0] == '\0') { - return "/tmp/"; - } else { - return temp_dir; - } + return GetTempDirFromEnv({"TEST_TMPDIR", "TMPDIR"}, "/data/local/tmp/", '/'); #else - return "/tmp/"; -#endif // GTEST_OS_WINDOWS_MOBILE + return GetTempDirFromEnv({"TEST_TMPDIR", "TMPDIR"}, "/tmp/", '/'); +#endif } // Class ScopedTrace @@ -6738,8 +6788,7 @@ void ScopedTrace::PushTrace(const char* file, int line, std::string message) { } // Pops the info pushed by the c'tor. -ScopedTrace::~ScopedTrace() - GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) { +ScopedTrace::~ScopedTrace() GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) { UnitTest::GetInstance()->PopGTestTrace(); } diff --git a/third_party/googletest/src/src/gtest_main.cc b/third_party/googletest/src/src/gtest_main.cc index 46b27c3d7d..44976375c9 100644 --- a/third_party/googletest/src/src/gtest_main.cc +++ b/third_party/googletest/src/src/gtest_main.cc @@ -28,15 +28,14 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include + #include "gtest/gtest.h" #if GTEST_OS_ESP8266 || GTEST_OS_ESP32 #if GTEST_OS_ESP8266 extern "C" { #endif -void setup() { - testing::InitGoogleTest(); -} +void setup() { testing::InitGoogleTest(); } void loop() { RUN_ALL_TESTS(); } From fd615b4348822f4ba77e3d5731018cf229110b93 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 27 May 2022 21:53:49 -0700 Subject: [PATCH 429/926] CHECK_MEM_ERROR: add an assert for a valid jmp target callers of CHECK_MEM_ERROR() expect failures to not return tested with: configure --enable-debug --enable-vp9-postproc --enable-postproc \ --enable-multi-res-encoding --enable-vp9-temporal-denoising \ --enable-error-concealment --enable-internal-stats has unrelated assertion failures currently Change-Id: Ic12073b1ae80a6f434f14d24f652e64d30f63eea --- vp8/decoder/onyxd_int.h | 4 ++++ vp8/encoder/onyx_int.h | 4 ++++ vp9/common/vp9_common.h | 2 ++ 3 files changed, 10 insertions(+) diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h index cf2c066d9b..a6bedc4faf 100644 --- a/vp8/decoder/onyxd_int.h +++ b/vp8/decoder/onyxd_int.h @@ -11,6 +11,8 @@ #ifndef VPX_VP8_DECODER_ONYXD_INT_H_ #define VPX_VP8_DECODER_ONYXD_INT_H_ +#include + #include "vpx_config.h" #include "vp8/common/onyxd.h" #include "treereader.h" @@ -136,6 +138,7 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb); #if CONFIG_DEBUG #define CHECK_MEM_ERROR(lval, expr) \ do { \ + assert(pbi->common.error.setjmp); \ (lval) = (expr); \ if (!(lval)) \ vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, \ @@ -145,6 +148,7 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb); #else #define CHECK_MEM_ERROR(lval, expr) \ do { \ + assert(pbi->common.error.setjmp); \ (lval) = (expr); \ if (!(lval)) \ vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, \ diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index 726dcc9466..7951f0a77e 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -11,7 +11,9 @@ #ifndef VPX_VP8_ENCODER_ONYX_INT_H_ #define VPX_VP8_ENCODER_ONYX_INT_H_ +#include #include + #include "vpx_config.h" #include "vp8/common/onyx.h" #include "treewriter.h" @@ -730,6 +732,7 @@ void vp8_set_speed_features(VP8_COMP *cpi); #if CONFIG_DEBUG #define CHECK_MEM_ERROR(lval, expr) \ do { \ + assert(cpi->common.error.setjmp); \ (lval) = (expr); \ if (!(lval)) \ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, \ @@ -739,6 +742,7 @@ void vp8_set_speed_features(VP8_COMP *cpi); #else #define CHECK_MEM_ERROR(lval, expr) \ do { \ + assert(cpi->common.error.setjmp); \ (lval) = (expr); \ if (!(lval)) \ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, \ diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h index 3cec53bfd8..8d2bed38e5 100644 --- a/vp9/common/vp9_common.h +++ b/vp9/common/vp9_common.h @@ -49,6 +49,7 @@ static INLINE int get_unsigned_bits(unsigned int num_values) { #if CONFIG_DEBUG #define CHECK_MEM_ERROR(cm, lval, expr) \ do { \ + assert(&(cm)->error.setjmp); \ (lval) = (expr); \ if (!(lval)) \ vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \ @@ -58,6 +59,7 @@ static INLINE int get_unsigned_bits(unsigned int num_values) { #else #define CHECK_MEM_ERROR(cm, lval, expr) \ do { \ + assert(&(cm)->error.setjmp); \ (lval) = (expr); \ if (!(lval)) \ vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \ From b39722f851f5505b2fa61f47694f307c9a09794b Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Mon, 12 Sep 2022 07:40:39 -0700 Subject: [PATCH 430/926] Add vpx_highbd_sad32x{64,32,16}x4d_avx2. 
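[Editor's note, not part of the upstream commit message: the x4d SAD
functions compute four sums of absolute differences at once, one per
candidate reference block, against a single source block. A scalar sketch
of what each vpx_highbd_sadWxHx4d variant computes (hypothetical helper;
the real high-bitdepth entry points take uint8_t pointers and convert via
CONVERT_TO_SHORTPTR):

  /* requires <stdint.h> and <stdlib.h> */
  static void highbd_sad_wxhx4d_c(const uint16_t *src, int src_stride,
                                  const uint16_t *const ref[4],
                                  int ref_stride, int w, int h,
                                  uint32_t sad_array[4]) {
    int r, x, y;
    for (r = 0; r < 4; ++r) {
      uint32_t sad = 0;
      for (y = 0; y < h; ++y) {
        for (x = 0; x < w; ++x) {
          /* uint16_t operands promote to int, so the subtraction and
           * abs() are safe for 8-, 10- and 12-bit samples. */
          sad += (uint32_t)abs(src[y * src_stride + x] -
                               ref[r][y * ref_stride + x]);
        }
      }
      sad_array[r] = sad;
    }
  }
]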
~2.4x faster than the sse2 version. Bug: b/245917257 Change-Id: I6df2bd62b46e5e175c8ad80daa6de3a1c313db0f --- test/sad_test.cc | 9 +++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +- vpx_dsp/x86/highbd_sad4d_avx2.c | 113 ++++++++++++++++++++++++++++++++ 3 files changed, 125 insertions(+), 3 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 92c9e6332a..b3ad96ab8c 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1080,12 +1080,21 @@ const SadMxNx4Param x4d_avx2_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_avx2), SadMxNx4Param(32, 32, &vpx_sad32x32x4d_avx2), #if CONFIG_VP9_HIGHBITDEPTH + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 8), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 8), + SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 8), SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 8), SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 8), SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 8), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 10), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 10), + SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 10), SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 10), SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 10), SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 10), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 12), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 12), + SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 12), SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 12), SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 12), SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 12), diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index df2c8da74e..f5f5f9dd65 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1039,13 +1039,13 @@ () specialize qw/vpx_highbd_sad64x32x4d sse2 neon/; add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad32x64x4d sse2 neon/; + specialize qw/vpx_highbd_sad32x64x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad32x32x4d sse2 neon/; + specialize qw/vpx_highbd_sad32x32x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad32x16x4d sse2 neon/; + specialize qw/vpx_highbd_sad32x16x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad16x32x4d sse2 neon avx2/; diff --git a/vpx_dsp/x86/highbd_sad4d_avx2.c b/vpx_dsp/x86/highbd_sad4d_avx2.c index 46c7e4fbc8..3384694f39 100644 --- a/vpx_dsp/x86/highbd_sad4d_avx2.c +++ b/vpx_dsp/x86/highbd_sad4d_avx2.c @@ -21,6 +21,119 @@ static VPX_FORCE_INLINE void calc_final_4(const __m256i *const sums /*[4]*/, _mm_storeu_si128((__m128i *)sad_array, sum); } +static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/, + const uint16_t *src, + int src_stride, + uint16_t *refs[4], + int ref_stride, int height) { + int i; + for (i = 0; i < height; i++) { + __m256i r[8]; + + // load src and all ref[] + const __m256i s = _mm256_load_si256((const __m256i 
*)src); + const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 16)); + r[0] = _mm256_loadu_si256((const __m256i *)refs[0]); + r[1] = _mm256_loadu_si256((const __m256i *)(refs[0] + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)refs[1]); + r[3] = _mm256_loadu_si256((const __m256i *)(refs[1] + 16)); + r[4] = _mm256_loadu_si256((const __m256i *)refs[2]); + r[5] = _mm256_loadu_si256((const __m256i *)(refs[2] + 16)); + r[6] = _mm256_loadu_si256((const __m256i *)refs[3]); + r[7] = _mm256_loadu_si256((const __m256i *)(refs[3] + 16)); + + // absolute differences between every ref[] to src + r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s)); + r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s2)); + r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s)); + r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s2)); + r[4] = _mm256_abs_epi16(_mm256_sub_epi16(r[4], s)); + r[5] = _mm256_abs_epi16(_mm256_sub_epi16(r[5], s2)); + r[6] = _mm256_abs_epi16(_mm256_sub_epi16(r[6], s)); + r[7] = _mm256_abs_epi16(_mm256_sub_epi16(r[7], s2)); + + // sum every abs diff + sums_16[0] = _mm256_add_epi16(sums_16[0], _mm256_add_epi16(r[0], r[1])); + sums_16[1] = _mm256_add_epi16(sums_16[1], _mm256_add_epi16(r[2], r[3])); + sums_16[2] = _mm256_add_epi16(sums_16[2], _mm256_add_epi16(r[4], r[5])); + sums_16[3] = _mm256_add_epi16(sums_16[3], _mm256_add_epi16(r[6], r[7])); + + src += src_stride; + refs[0] += ref_stride; + refs[1] += ref_stride; + refs[2] += ref_stride; + refs[3] += ref_stride; + } +} + +#define HIGHBD_SAD32XNX4D(n) \ + void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *refs[4]; \ + __m256i sums_16[4]; \ + __m256i sums_32[4]; \ + int i; \ + \ + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \ + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \ + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \ + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \ + sums_32[0] = _mm256_setzero_si256(); \ + sums_32[1] = _mm256_setzero_si256(); \ + sums_32[2] = _mm256_setzero_si256(); \ + sums_32[3] = _mm256_setzero_si256(); \ + \ + for (i = 0; i < (n / 8); ++i) { \ + sums_16[0] = _mm256_setzero_si256(); \ + sums_16[1] = _mm256_setzero_si256(); \ + sums_16[2] = _mm256_setzero_si256(); \ + sums_16[3] = _mm256_setzero_si256(); \ + \ + highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); \ + \ + /* sums_16 will outrange after 8 rows, so add current sums_16 to \ + * sums_32*/ \ + sums_32[0] = _mm256_add_epi32( \ + sums_32[0], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[0], 1)))); \ + sums_32[1] = _mm256_add_epi32( \ + sums_32[1], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[1], 1)))); \ + sums_32[2] = _mm256_add_epi32( \ + sums_32[2], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[2], 1)))); \ + sums_32[3] = _mm256_add_epi32( \ + sums_32[3], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[3], 1)))); \ + \ + src += src_stride << 3; \ + } \ + calc_final_4(sums_32, sad_array); \ + } + +// 32x64 +HIGHBD_SAD32XNX4D(64) + +// 32x32 +HIGHBD_SAD32XNX4D(32) + 
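+// (Editor's note, not part of the upstream change: the 8-row flush in the
+// macro above is the key invariant of this kernel. Each 16-bit lane of
+// sums_16 accumulates two absolute differences per row, and with 12-bit
+// input each difference is at most 4095, so 8 rows contribute at most
+// 8 * 2 * 4095 = 65520, which still fits in a uint16_t; a 9th row could
+// overflow, hence the periodic widening into the 32-bit sums_32.)
+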
+// 32x16 +HIGHBD_SAD32XNX4D(16) + static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/, const uint16_t *src, int src_stride, From 34284e930a24f48d64220d34bc355e0883a3c569 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Wed, 14 Sep 2022 03:36:46 -0700 Subject: [PATCH 431/926] Add vpx_highbd_sad64x{64,32}x4d_avx2. ~2x faster than the sse2 version. Bug: b/245917257 Change-Id: I4742950ab7b90d7f09e8d4687e1e967138acee39 --- test/sad_test.cc | 6 ++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- vpx_dsp/x86/highbd_sad4d_avx2.c | 105 ++++++++++++++++++++++++++++++++ 3 files changed, 113 insertions(+), 2 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index b3ad96ab8c..7e84ea0dbf 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1080,18 +1080,24 @@ const SadMxNx4Param x4d_avx2_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_avx2), SadMxNx4Param(32, 32, &vpx_sad32x32x4d_avx2), #if CONFIG_VP9_HIGHBITDEPTH + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 8), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 8), SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 8), SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 8), SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 8), SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 8), SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 8), SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 8), + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 10), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 10), SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 10), SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 10), SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 10), SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 10), SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 10), SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 10), + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 12), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 12), SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 12), SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 12), SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 12), diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index f5f5f9dd65..527d0e6e74 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1033,10 +1033,10 @@ () # Multi-block SAD, comparing a reference to N independent blocks # add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad64x64x4d sse2 neon/; + specialize qw/vpx_highbd_sad64x64x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad64x32x4d sse2 neon/; + specialize qw/vpx_highbd_sad64x32x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad32x64x4d sse2 neon avx2/; diff --git a/vpx_dsp/x86/highbd_sad4d_avx2.c b/vpx_dsp/x86/highbd_sad4d_avx2.c index 3384694f39..947b5e9772 100644 --- a/vpx_dsp/x86/highbd_sad4d_avx2.c +++ b/vpx_dsp/x86/highbd_sad4d_avx2.c @@ -21,6 +21,111 @@ static VPX_FORCE_INLINE void calc_final_4(const __m256i *const sums /*[4]*/, _mm_storeu_si128((__m128i *)sad_array, sum); } +static VPX_FORCE_INLINE void highbd_sad64xHx4d(__m256i 
*sums_16 /*[4]*/, + const uint16_t *src, + int src_stride, + uint16_t *refs[4], + int ref_stride, int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32)); + const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48)); + int x; + + for (x = 0; x < 4; ++x) { + __m256i r[4]; + r[0] = _mm256_loadu_si256((const __m256i *)refs[x]); + r[1] = _mm256_loadu_si256((const __m256i *)(refs[x] + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)(refs[x] + 32)); + r[3] = _mm256_loadu_si256((const __m256i *)(refs[x] + 48)); + + // absolute differences between every ref[] to src + r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s0)); + r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s1)); + r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s2)); + r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s3)); + + // sum every abs diff + sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[0], r[1])); + sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[2], r[3])); + } + + src += src_stride; + refs[0] += ref_stride; + refs[1] += ref_stride; + refs[2] += ref_stride; + refs[3] += ref_stride; + } +} + +#define HIGHBD_SAD64XNX4D(n) \ + void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *refs[4]; \ + __m256i sums_16[4]; \ + __m256i sums_32[4]; \ + int i; \ + \ + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \ + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \ + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \ + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \ + sums_32[0] = _mm256_setzero_si256(); \ + sums_32[1] = _mm256_setzero_si256(); \ + sums_32[2] = _mm256_setzero_si256(); \ + sums_32[3] = _mm256_setzero_si256(); \ + \ + for (i = 0; i < (n / 2); ++i) { \ + sums_16[0] = _mm256_setzero_si256(); \ + sums_16[1] = _mm256_setzero_si256(); \ + sums_16[2] = _mm256_setzero_si256(); \ + sums_16[3] = _mm256_setzero_si256(); \ + \ + highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2); \ + \ + /* sums_16 will outrange after 2 rows, so add current sums_16 to \ + * sums_32*/ \ + sums_32[0] = _mm256_add_epi32( \ + sums_32[0], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[0], 1)))); \ + sums_32[1] = _mm256_add_epi32( \ + sums_32[1], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[1], 1)))); \ + sums_32[2] = _mm256_add_epi32( \ + sums_32[2], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[2], 1)))); \ + sums_32[3] = _mm256_add_epi32( \ + sums_32[3], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[3], 1)))); \ + \ + src += src_stride << 1; \ + } \ + calc_final_4(sums_32, sad_array); \ + } + +// 64x64 +HIGHBD_SAD64XNX4D(64) + +// 64x32 +HIGHBD_SAD64XNX4D(32) + static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/, const uint16_t *src, int src_stride, From 7ed6b47c607ad73b0a99901b04200b07fa81f24c Mon 
Sep 17 00:00:00 2001 From: Cheng Chen Date: Wed, 14 Sep 2022 11:40:50 -0700 Subject: [PATCH 432/926] L2E: Rework recode decisions for external max frame size and q Allow to handle external q and external max frame size separately. Rely on libvpx's decision to catch overshoot/undershoot and recode frames. Previously, when external max frame size is set, we didn't handle undershoot cases, and now we fall back to libvpx's decision to recode a frame if overshoot/undershoot is seen. Change-Id: Ic3eee042cfe104b528c5f2c6c82b98dd5d8fa8ca --- vp9/encoder/vp9_encoder.c | 49 ++++++++++++++------------------------- 1 file changed, 17 insertions(+), 32 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 91b64e5d13..ca3439d7c0 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4367,7 +4367,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest int frame_over_shoot_limit; int frame_under_shoot_limit; int q = 0, q_low = 0, q_high = 0; - int last_q_attempt = 0; int enable_acl; #ifdef AGGRESSIVE_VBR int qrange_adj = 1; @@ -4381,8 +4380,18 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest // Maximal frame size allowed by the external rate control. // case: 0, we ignore the max frame size limit, and encode with the qindex // passed in by the external rate control model. - // case: -1, we take VP9's decision for the max frame size. + // If the external qindex is VPX_DEFAULT_Q, libvpx will pick a qindex + // and may recode if undershoot/overshoot is seen. + // If the external qindex is not VPX_DEFAULT_Q, we force no recode. + // case: -1, we take libvpx's decision for the max frame size, as well as + // the recode decision. + // Otherwise: if a specific size is given, libvpx's recode decision + // will respect the given size. int ext_rc_max_frame_size = 0; + // Use VP9's decision of qindex. This flag is in use only in external rate + // control model to help determine whether to recode when + // |ext_rc_max_frame_size| is 0. + int ext_rc_use_default_q = 1; const int orig_rc_max_frame_bandwidth = rc->max_frame_bandwidth; #if CONFIG_RATE_CTRL @@ -4520,8 +4529,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest // libvpx's default q. if (encode_frame_decision.q_index != VPX_DEFAULT_Q) { q = encode_frame_decision.q_index; - ext_rc_max_frame_size = encode_frame_decision.max_frame_size; + ext_rc_use_default_q = 0; } + ext_rc_max_frame_size = encode_frame_decision.max_frame_size; } vp9_set_quantizer(cpi, q); @@ -4564,7 +4574,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest if (cpi->ext_ratectrl.ready && (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0) { - last_q_attempt = q; // In general, for the external rate control, we take the qindex provided // as input and encode the frame with this qindex faithfully. However, // in some extreme scenarios, the provided qindex leads to a massive @@ -4572,20 +4581,13 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest // to pick a new qindex and recode the frame. We return the new qindex // through the API to the external model. if (ext_rc_max_frame_size == 0) { - break; + if (!ext_rc_use_default_q) break; } else if (ext_rc_max_frame_size == -1) { - if (rc->projected_frame_size < rc->max_frame_bandwidth) { - break; - } + // Do nothing, fall back to libvpx's recode decision. 
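+        // (Editor's note, not part of the upstream change; the resulting
+        // dispatch, as the reworked code above and below reads:
+        //   ext_rc_max_frame_size == 0, external qindex -> break, no recode;
+        //   ext_rc_max_frame_size == 0, VPX_DEFAULT_Q   -> libvpx recode;
+        //   ext_rc_max_frame_size == -1                 -> libvpx recode;
+        //   ext_rc_max_frame_size > 0  -> rc->max_frame_bandwidth is set to
+        //   it below, so libvpx's overshoot check enforces the limit.)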
} else { - if (rc->projected_frame_size < ext_rc_max_frame_size) { - break; - } + // Change the max frame size, used in libvpx's recode decision. + rc->max_frame_bandwidth = ext_rc_max_frame_size; } - rc->max_frame_bandwidth = ext_rc_max_frame_size; - // If the current frame size exceeds the ext_rc_max_frame_size, - // we adjust the worst qindex to meet the frame size constraint. - q_high = 255; ext_rc_recode = 1; } #if CONFIG_RATE_CTRL @@ -4788,23 +4790,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest rc->projected_frame_size < rc->max_frame_bandwidth) loop = 0; - // Special handling of external max frame size constraint - if (ext_rc_recode) { - // If the largest q is not able to meet the max frame size limit, - // do nothing. - if (rc->projected_frame_size > ext_rc_max_frame_size && - last_q_attempt == 255) { - break; - } - // If VP9's q selection leads to a smaller q, we force it to use - // a larger q to better approximate the external max frame size - // constraint. - if (rc->projected_frame_size > ext_rc_max_frame_size && - q <= last_q_attempt) { - q = VPXMIN(255, last_q_attempt + 1); - } - } - if (loop) { ++loop_count; ++loop_at_this_size; From 3cd417b6d2380b993f33a0d8f342e4d16717d16e Mon Sep 17 00:00:00 2001 From: Johann Date: Sat, 17 Sep 2022 07:54:40 +0900 Subject: [PATCH 433/926] fwd_txfm: remove avx2 file from non-hbd Resolves warning on OS X: file: libvpx_g.a(fwd_txfm_avx2.c.o) has no symbols Change-Id: Ie8b290bb3ed329656beb883d552c98353f1ed5e5 --- vpx_dsp/vpx_dsp.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 34e9d736db..f9a5c97dd2 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -226,7 +226,6 @@ DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32x32_impl_sse2.h ifeq ($(VPX_ARCH_X86_64),yes) DSP_SRCS-$(HAVE_SSSE3) += x86/fwd_txfm_ssse3_x86_64.asm endif -DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h DSP_SRCS-$(HAVE_NEON) += arm/fdct_neon.c DSP_SRCS-$(HAVE_NEON) += arm/fdct16x16_neon.c @@ -239,6 +238,7 @@ DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_txfm_lsx.h DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_txfm_lsx.c ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c DSP_SRCS-$(HAVE_MSA) += mips/fwd_dct32x32_msa.c DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_dct32x32_lsx.c endif # !CONFIG_VP9_HIGHBITDEPTH From 884837a5805b77aec2e16d3dee910d59081ba9b0 Mon Sep 17 00:00:00 2001 From: Johann Date: Sun, 18 Sep 2022 10:26:00 +0900 Subject: [PATCH 434/926] quantize: test lowbd in highbd builds Change-Id: I7af273e979415a8b8cafb7494728d2736862f4a5 --- test/vp9_quantize_test.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 4ecdd91b06..a81775fd94 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -586,12 +586,16 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_12, 32, true), + make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, VPX_BITS_8, 16, + false), make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, VPX_BITS_8, 16, false), make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, VPX_BITS_10, 16, false), make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, VPX_BITS_12, 16, false), + make_tuple(&vpx_quantize_b_32x32_avx2, &vpx_quantize_b_32x32_c, + VPX_BITS_8, 32, false), make_tuple(&vpx_highbd_quantize_b_32x32_avx2, &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 
32, false), make_tuple(&vpx_highbd_quantize_b_32x32_avx2, @@ -620,12 +624,16 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, ::testing::Values( + make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16, + false), make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, VPX_BITS_8, 16, false), make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, VPX_BITS_10, 16, false), make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, VPX_BITS_12, 16, false), + make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c, + VPX_BITS_8, 32, false), make_tuple(&vpx_highbd_quantize_b_32x32_neon, &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), make_tuple(&vpx_highbd_quantize_b_32x32_neon, From f6939699b626fe0e8588f1bc639b955d89fb537c Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Wed, 21 Sep 2022 11:37:04 -0700 Subject: [PATCH 435/926] post_proc_sse2.c: quiet -Wuninitialized MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In file included from ../libvpx/vpx_dsp/x86/post_proc_sse2.c:12: In function ‘_mm_add_epi16’, inlined from ‘vpx_mbpost_proc_down_sse2’ at ../libvpx/vpx_dsp/x86/post_proc_sse2.c:88:13: /usr/lib/gcc/x86_64-linux-gnu/12/include/emmintrin.h:1060:35: warning: ‘below_context’ may be used uninitialized [-Wmaybe-uninitialized] 1060 | return (__m128i) ((__v8hu)__A + (__v8hu)__B); | ^~~~~~~~~~~ ../libvpx/vpx_dsp/x86/post_proc_sse2.c: In function ‘vpx_mbpost_proc_down_sse2’: ../libvpx/vpx_dsp/x86/post_proc_sse2.c:39:13: note: ‘below_context’ was declared here 39 | __m128i below_context; Change-Id: I2fc592f121c4e85d0aff1640014c3444f5eb09fd --- vpx_dsp/x86/post_proc_sse2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vpx_dsp/x86/post_proc_sse2.c b/vpx_dsp/x86/post_proc_sse2.c index d1029afc4f..119fa7cd1a 100644 --- a/vpx_dsp/x86/post_proc_sse2.c +++ b/vpx_dsp/x86/post_proc_sse2.c @@ -36,7 +36,7 @@ void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, __m128i s = _mm_loadl_epi64((__m128i *)dst); __m128i sum, sumsq_0, sumsq_1; __m128i tmp_0, tmp_1; - __m128i below_context; + __m128i below_context = _mm_setzero_si128(); s = _mm_unpacklo_epi8(s, zero); From 8b0c92ebdfd0478812b2fbf84c0dfc762a2609bd Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Wed, 21 Sep 2022 12:15:16 -0700 Subject: [PATCH 436/926] resize_test.cc: quiet -Wmaybe-uninitialized MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit warning: ‘expected_w’ may be used uninitialized Change-Id: I915efd82d3263250cea90391345f7683c1330fc8 --- test/resize_test.cc | 25 +++---------------------- 1 file changed, 3 insertions(+), 22 deletions(-) diff --git a/test/resize_test.cc b/test/resize_test.cc index 212ff46975..e122a74742 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -95,10 +95,11 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, unsigned int initial_h, unsigned int *w, unsigned int *h, bool flag_codec, bool smaller_width_larger_size_) { + *w = initial_w; + *h = initial_h; + if (smaller_width_larger_size_) { if (frame < 30) { - *w = initial_w; - *h = initial_h; return; } if (frame < 100) { @@ -109,8 +110,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 10) { - *w = initial_w; - *h = initial_h; return; } if (frame < 20) { @@ -124,8 +123,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 40) { - *w = initial_w; - 
*h = initial_h; return; } if (frame < 50) { @@ -139,8 +136,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 70) { - *w = initial_w; - *h = initial_h; return; } if (frame < 80) { @@ -159,8 +154,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 110) { - *w = initial_w; - *h = initial_h; return; } if (frame < 120) { @@ -179,8 +172,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 150) { - *w = initial_w; - *h = initial_h; return; } if (frame < 160) { @@ -199,8 +190,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 190) { - *w = initial_w; - *h = initial_h; return; } if (frame < 200) { @@ -219,8 +208,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 230) { - *w = initial_w; - *h = initial_h; return; } if (frame < 240) { @@ -234,8 +221,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 260) { - *w = initial_w; - *h = initial_h; return; } // Go down very low. @@ -248,13 +233,9 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, // Cases that only works for VP9. // For VP9: Swap width and height of original. if (frame < 320) { - *w = initial_h; - *h = initial_w; return; } } - *w = initial_w; - *h = initial_h; } class ResizingVideoSource : public ::libvpx_test::DummyVideoSource { From c8874f74a7a29729a77ccba5790bf2c71f583f15 Mon Sep 17 00:00:00 2001 From: Johann Date: Sat, 17 Sep 2022 08:47:28 +0900 Subject: [PATCH 437/926] quantize: increase iscan by 1 All of the assembly adds 1 to iscan to convert from a 0 based array to the EOB value. Add 1 to all iscan values and remove the extra instructions from the assembly. Change-Id: I219dd7f2bd10533ab24b206289565703176dc5e9 --- vp9/common/vp9_scan.c | 293 +++++++++--------- vp9/encoder/arm/neon/vp9_quantize_neon.c | 4 +- vp9/encoder/x86/vp9_quantize_avx2.c | 14 +- vp9/encoder/x86/vp9_quantize_sse2.c | 6 - vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm | 4 - vpx_dsp/arm/highbd_quantize_neon.c | 18 +- vpx_dsp/arm/quantize_neon.c | 18 +- vpx_dsp/loongarch/quantize_lsx.c | 13 +- vpx_dsp/ppc/quantize_vsx.c | 24 +- vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 5 +- vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 8 +- vpx_dsp/x86/quantize_avx.c | 12 +- vpx_dsp/x86/quantize_avx2.c | 11 +- vpx_dsp/x86/quantize_sse2.c | 5 +- vpx_dsp/x86/quantize_sse2.h | 8 +- vpx_dsp/x86/quantize_ssse3.c | 11 +- 16 files changed, 197 insertions(+), 257 deletions(-) diff --git a/vp9/common/vp9_scan.c b/vp9/common/vp9_scan.c index 0fef263510..8bea61dea6 100644 --- a/vp9/common/vp9_scan.c +++ b/vp9/common/vp9_scan.c @@ -511,180 +511,181 @@ DECLARE_ALIGNED(16, static const int16_t, 959, 990, 991, 1022, 0, 0, }; +// Add 1 to iscan values. This represents the EOB position instead of the index. 
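// For example (an illustrative note, not a line from the patch): a block
// whose only nonzero coefficient is the DC term used to have iscan == 0,
// and every SIMD kernel computed eob = iscan + 1 = 1; with the rebased
// tables below, iscan == 1 is the eob directly, so the per-block add-one
// (vaddq_s16 on NEON, psubw against the mask on x86) can be deleted.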
DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_4x4[16]) = { - 0, 2, 5, 8, 1, 3, 9, 12, 4, 7, 11, 14, 6, 10, 13, 15, + 1, 3, 6, 9, 2, 4, 10, 13, 5, 8, 12, 15, 7, 11, 14, 16, }; DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_4x4[16]) = { - 0, 3, 7, 11, 1, 5, 9, 12, 2, 6, 10, 14, 4, 8, 13, 15, + 1, 4, 8, 12, 2, 6, 10, 13, 3, 7, 11, 15, 5, 9, 14, 16, }; DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_4x4[16]) = { - 0, 1, 3, 5, 2, 4, 6, 9, 7, 8, 11, 13, 10, 12, 14, 15, + 1, 2, 4, 6, 3, 5, 7, 10, 8, 9, 12, 14, 11, 13, 15, 16, }; DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_8x8[64]) = { - 0, 3, 8, 15, 22, 32, 40, 47, 1, 5, 11, 18, 26, 34, 44, 51, - 2, 7, 13, 20, 28, 38, 46, 54, 4, 10, 16, 24, 31, 41, 50, 56, - 6, 12, 21, 27, 35, 43, 52, 58, 9, 17, 25, 33, 39, 48, 55, 60, - 14, 23, 30, 37, 45, 53, 59, 62, 19, 29, 36, 42, 49, 57, 61, 63, + 1, 4, 9, 16, 23, 33, 41, 48, 2, 6, 12, 19, 27, 35, 45, 52, + 3, 8, 14, 21, 29, 39, 47, 55, 5, 11, 17, 25, 32, 42, 51, 57, + 7, 13, 22, 28, 36, 44, 53, 59, 10, 18, 26, 34, 40, 49, 56, 61, + 15, 24, 31, 38, 46, 54, 60, 63, 20, 30, 37, 43, 50, 58, 62, 64, }; DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_8x8[64]) = { - 0, 1, 2, 5, 8, 12, 19, 24, 3, 4, 7, 10, 15, 20, 30, 39, - 6, 9, 13, 16, 21, 27, 37, 46, 11, 14, 17, 23, 28, 34, 44, 52, - 18, 22, 25, 31, 35, 41, 50, 57, 26, 29, 33, 38, 43, 49, 55, 59, - 32, 36, 42, 47, 51, 54, 60, 61, 40, 45, 48, 53, 56, 58, 62, 63, + 1, 2, 3, 6, 9, 13, 20, 25, 4, 5, 8, 11, 16, 21, 31, 40, + 7, 10, 14, 17, 22, 28, 38, 47, 12, 15, 18, 24, 29, 35, 45, 53, + 19, 23, 26, 32, 36, 42, 51, 58, 27, 30, 34, 39, 44, 50, 56, 60, + 33, 37, 43, 48, 52, 55, 61, 62, 41, 46, 49, 54, 57, 59, 63, 64, }; DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_8x8[64]) = { - 0, 2, 5, 9, 14, 22, 31, 37, 1, 4, 8, 13, 19, 26, 38, 44, - 3, 6, 10, 17, 24, 30, 42, 49, 7, 11, 15, 21, 29, 36, 47, 53, - 12, 16, 20, 27, 34, 43, 52, 57, 18, 23, 28, 35, 41, 48, 56, 60, - 25, 32, 39, 45, 50, 55, 59, 62, 33, 40, 46, 51, 54, 58, 61, 63, + 1, 3, 6, 10, 15, 23, 32, 38, 2, 5, 9, 14, 20, 27, 39, 45, + 4, 7, 11, 18, 25, 31, 43, 50, 8, 12, 16, 22, 30, 37, 48, 54, + 13, 17, 21, 28, 35, 44, 53, 58, 19, 24, 29, 36, 42, 49, 57, 61, + 26, 33, 40, 46, 51, 56, 60, 63, 34, 41, 47, 52, 55, 59, 62, 64, }; DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_16x16[256]) = { - 0, 4, 11, 20, 31, 43, 59, 75, 85, 109, 130, 150, 165, 181, 195, 198, - 1, 6, 14, 23, 34, 47, 64, 81, 95, 114, 135, 153, 171, 188, 201, 212, - 2, 8, 16, 25, 38, 52, 67, 83, 101, 116, 136, 157, 172, 190, 205, 216, - 3, 10, 18, 29, 41, 55, 71, 89, 103, 119, 141, 159, 176, 194, 208, 218, - 5, 12, 21, 32, 45, 58, 74, 93, 104, 123, 144, 164, 179, 196, 210, 223, - 7, 15, 26, 37, 49, 63, 78, 96, 112, 129, 146, 166, 182, 200, 215, 228, - 9, 19, 28, 39, 54, 69, 86, 102, 117, 132, 151, 170, 187, 206, 220, 230, - 13, 24, 35, 46, 60, 73, 91, 108, 122, 137, 154, 174, 189, 207, 224, 235, - 17, 30, 40, 53, 66, 82, 98, 115, 126, 142, 161, 180, 197, 213, 227, 237, - 22, 36, 48, 62, 76, 92, 105, 120, 133, 147, 167, 186, 203, 219, 232, 240, - 27, 44, 56, 70, 84, 99, 113, 127, 140, 156, 175, 193, 209, 226, 236, 244, - 33, 51, 68, 79, 94, 110, 125, 138, 149, 162, 184, 202, 217, 229, 241, 247, - 42, 61, 77, 90, 106, 121, 134, 148, 160, 173, 191, 211, 225, 238, 245, 251, - 50, 72, 87, 100, 118, 128, 145, 158, 168, 183, 204, 222, 233, 242, 249, 253, - 57, 80, 97, 111, 131, 143, 155, 169, 178, 192, 214, 231, 239, 246, 250, 254, - 65, 88, 107, 124, 139, 152, 163, 177, 185, 199, 221, 
234, 243, 248, 252, 255, + 1, 5, 12, 21, 32, 44, 60, 76, 86, 110, 131, 151, 166, 182, 196, 199, + 2, 7, 15, 24, 35, 48, 65, 82, 96, 115, 136, 154, 172, 189, 202, 213, + 3, 9, 17, 26, 39, 53, 68, 84, 102, 117, 137, 158, 173, 191, 206, 217, + 4, 11, 19, 30, 42, 56, 72, 90, 104, 120, 142, 160, 177, 195, 209, 219, + 6, 13, 22, 33, 46, 59, 75, 94, 105, 124, 145, 165, 180, 197, 211, 224, + 8, 16, 27, 38, 50, 64, 79, 97, 113, 130, 147, 167, 183, 201, 216, 229, + 10, 20, 29, 40, 55, 70, 87, 103, 118, 133, 152, 171, 188, 207, 221, 231, + 14, 25, 36, 47, 61, 74, 92, 109, 123, 138, 155, 175, 190, 208, 225, 236, + 18, 31, 41, 54, 67, 83, 99, 116, 127, 143, 162, 181, 198, 214, 228, 238, + 23, 37, 49, 63, 77, 93, 106, 121, 134, 148, 168, 187, 204, 220, 233, 241, + 28, 45, 57, 71, 85, 100, 114, 128, 141, 157, 176, 194, 210, 227, 237, 245, + 34, 52, 69, 80, 95, 111, 126, 139, 150, 163, 185, 203, 218, 230, 242, 248, + 43, 62, 78, 91, 107, 122, 135, 149, 161, 174, 192, 212, 226, 239, 246, 252, + 51, 73, 88, 101, 119, 129, 146, 159, 169, 184, 205, 223, 234, 243, 250, 254, + 58, 81, 98, 112, 132, 144, 156, 170, 179, 193, 215, 232, 240, 247, 251, 255, + 66, 89, 108, 125, 140, 153, 164, 178, 186, 200, 222, 235, 244, 249, 253, 256, }; DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_16x16[256]) = { - 0, 1, 2, 4, 6, 9, 12, 17, 22, 29, 36, 43, 54, 64, 76, - 86, 3, 5, 7, 11, 15, 19, 25, 32, 38, 48, 59, 68, 84, 99, - 115, 130, 8, 10, 13, 18, 23, 27, 33, 42, 51, 60, 72, 88, 103, - 119, 142, 167, 14, 16, 20, 26, 31, 37, 44, 53, 61, 73, 85, 100, - 116, 135, 161, 185, 21, 24, 30, 35, 40, 47, 55, 65, 74, 81, 94, - 112, 133, 154, 179, 205, 28, 34, 39, 45, 50, 58, 67, 77, 87, 96, - 106, 121, 146, 169, 196, 212, 41, 46, 49, 56, 63, 70, 79, 90, 98, - 107, 122, 138, 159, 182, 207, 222, 52, 57, 62, 69, 75, 83, 93, 102, - 110, 120, 134, 150, 176, 195, 215, 226, 66, 71, 78, 82, 91, 97, 108, - 113, 127, 136, 148, 168, 188, 202, 221, 232, 80, 89, 92, 101, 105, 114, - 125, 131, 139, 151, 162, 177, 192, 208, 223, 234, 95, 104, 109, 117, 123, - 128, 143, 144, 155, 165, 175, 190, 206, 219, 233, 239, 111, 118, 124, 129, - 140, 147, 157, 164, 170, 181, 191, 203, 224, 230, 240, 243, 126, 132, 137, - 145, 153, 160, 174, 178, 184, 197, 204, 216, 231, 237, 244, 246, 141, 149, - 156, 166, 172, 180, 189, 199, 200, 210, 220, 228, 238, 242, 249, 251, 152, - 163, 171, 183, 186, 193, 201, 211, 214, 218, 227, 236, 245, 247, 252, 253, - 158, 173, 187, 194, 198, 209, 213, 217, 225, 229, 235, 241, 248, 250, 254, - 255, + 1, 2, 3, 5, 7, 10, 13, 18, 23, 30, 37, 44, 55, 65, 77, + 87, 4, 6, 8, 12, 16, 20, 26, 33, 39, 49, 60, 69, 85, 100, + 116, 131, 9, 11, 14, 19, 24, 28, 34, 43, 52, 61, 73, 89, 104, + 120, 143, 168, 15, 17, 21, 27, 32, 38, 45, 54, 62, 74, 86, 101, + 117, 136, 162, 186, 22, 25, 31, 36, 41, 48, 56, 66, 75, 82, 95, + 113, 134, 155, 180, 206, 29, 35, 40, 46, 51, 59, 68, 78, 88, 97, + 107, 122, 147, 170, 197, 213, 42, 47, 50, 57, 64, 71, 80, 91, 99, + 108, 123, 139, 160, 183, 208, 223, 53, 58, 63, 70, 76, 84, 94, 103, + 111, 121, 135, 151, 177, 196, 216, 227, 67, 72, 79, 83, 92, 98, 109, + 114, 128, 137, 149, 169, 189, 203, 222, 233, 81, 90, 93, 102, 106, 115, + 126, 132, 140, 152, 163, 178, 193, 209, 224, 235, 96, 105, 110, 118, 124, + 129, 144, 145, 156, 166, 176, 191, 207, 220, 234, 240, 112, 119, 125, 130, + 141, 148, 158, 165, 171, 182, 192, 204, 225, 231, 241, 244, 127, 133, 138, + 146, 154, 161, 175, 179, 185, 198, 205, 217, 232, 238, 245, 247, 142, 150, + 157, 167, 173, 181, 190, 200, 201, 211, 221, 229, 239, 243, 250, 252, 
153, + 164, 172, 184, 187, 194, 202, 212, 215, 219, 228, 237, 246, 248, 253, 254, + 159, 174, 188, 195, 199, 210, 214, 218, 226, 230, 236, 242, 249, 251, 255, + 256, }; DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_16x16[256]) = { - 0, 2, 5, 9, 17, 24, 36, 44, 55, 72, 88, 104, 128, 143, 166, - 179, 1, 4, 8, 13, 20, 30, 40, 54, 66, 79, 96, 113, 141, 154, - 178, 196, 3, 7, 11, 18, 25, 33, 46, 57, 71, 86, 101, 119, 148, - 164, 186, 201, 6, 12, 16, 23, 31, 39, 53, 64, 78, 92, 110, 127, - 153, 169, 193, 208, 10, 14, 19, 28, 37, 47, 58, 67, 84, 98, 114, - 133, 161, 176, 198, 214, 15, 21, 26, 34, 43, 52, 65, 77, 91, 106, - 120, 140, 165, 185, 205, 221, 22, 27, 32, 41, 48, 60, 73, 85, 99, - 116, 130, 151, 175, 190, 211, 225, 29, 35, 42, 49, 59, 69, 81, 95, - 108, 125, 139, 155, 182, 197, 217, 229, 38, 45, 51, 61, 68, 80, 93, - 105, 118, 134, 150, 168, 191, 207, 223, 234, 50, 56, 63, 74, 83, 94, - 109, 117, 129, 147, 163, 177, 199, 213, 228, 238, 62, 70, 76, 87, 97, - 107, 122, 131, 145, 159, 172, 188, 210, 222, 235, 242, 75, 82, 90, 102, - 112, 124, 138, 146, 157, 173, 187, 202, 219, 230, 240, 245, 89, 100, 111, - 123, 132, 142, 156, 167, 180, 189, 203, 216, 231, 237, 246, 250, 103, 115, - 126, 136, 149, 162, 171, 183, 194, 204, 215, 224, 236, 241, 248, 252, 121, - 135, 144, 158, 170, 181, 192, 200, 209, 218, 227, 233, 243, 244, 251, 254, - 137, 152, 160, 174, 184, 195, 206, 212, 220, 226, 232, 239, 247, 249, 253, - 255, + 1, 3, 6, 10, 18, 25, 37, 45, 56, 73, 89, 105, 129, 144, 167, + 180, 2, 5, 9, 14, 21, 31, 41, 55, 67, 80, 97, 114, 142, 155, + 179, 197, 4, 8, 12, 19, 26, 34, 47, 58, 72, 87, 102, 120, 149, + 165, 187, 202, 7, 13, 17, 24, 32, 40, 54, 65, 79, 93, 111, 128, + 154, 170, 194, 209, 11, 15, 20, 29, 38, 48, 59, 68, 85, 99, 115, + 134, 162, 177, 199, 215, 16, 22, 27, 35, 44, 53, 66, 78, 92, 107, + 121, 141, 166, 186, 206, 222, 23, 28, 33, 42, 49, 61, 74, 86, 100, + 117, 131, 152, 176, 191, 212, 226, 30, 36, 43, 50, 60, 70, 82, 96, + 109, 126, 140, 156, 183, 198, 218, 230, 39, 46, 52, 62, 69, 81, 94, + 106, 119, 135, 151, 169, 192, 208, 224, 235, 51, 57, 64, 75, 84, 95, + 110, 118, 130, 148, 164, 178, 200, 214, 229, 239, 63, 71, 77, 88, 98, + 108, 123, 132, 146, 160, 173, 189, 211, 223, 236, 243, 76, 83, 91, 103, + 113, 125, 139, 147, 158, 174, 188, 203, 220, 231, 241, 246, 90, 101, 112, + 124, 133, 143, 157, 168, 181, 190, 204, 217, 232, 238, 247, 251, 104, 116, + 127, 137, 150, 163, 172, 184, 195, 205, 216, 225, 237, 242, 249, 253, 122, + 136, 145, 159, 171, 182, 193, 201, 210, 219, 228, 234, 244, 245, 252, 255, + 138, 153, 161, 175, 185, 196, 207, 213, 221, 227, 233, 240, 248, 250, 254, + 256, }; DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_32x32[1024]) = { - 0, 2, 5, 10, 17, 25, 38, 47, 62, 83, 101, 121, 145, - 170, 193, 204, 210, 219, 229, 233, 245, 257, 275, 299, 342, 356, - 377, 405, 455, 471, 495, 527, 1, 4, 8, 15, 22, 30, 45, - 58, 74, 92, 112, 133, 158, 184, 203, 215, 222, 228, 234, 237, - 256, 274, 298, 317, 355, 376, 404, 426, 470, 494, 526, 551, 3, - 7, 12, 18, 28, 36, 52, 64, 82, 102, 118, 142, 164, 189, - 208, 217, 224, 231, 235, 238, 273, 297, 316, 329, 375, 403, 425, - 440, 493, 525, 550, 567, 6, 11, 16, 23, 31, 43, 60, 73, - 90, 109, 126, 150, 173, 196, 211, 220, 226, 232, 236, 239, 296, - 315, 328, 335, 402, 424, 439, 447, 524, 549, 566, 575, 9, 14, - 19, 29, 37, 50, 65, 78, 95, 116, 134, 157, 179, 201, 214, - 223, 244, 255, 272, 295, 341, 354, 374, 401, 454, 469, 492, 523, - 582, 596, 617, 645, 13, 20, 26, 35, 44, 54, 72, 85, 105, - 123, 
140, 163, 182, 205, 216, 225, 254, 271, 294, 314, 353, 373, - 400, 423, 468, 491, 522, 548, 595, 616, 644, 666, 21, 27, 33, - 42, 53, 63, 80, 94, 113, 132, 151, 172, 190, 209, 218, 227, - 270, 293, 313, 327, 372, 399, 422, 438, 490, 521, 547, 565, 615, - 643, 665, 680, 24, 32, 39, 48, 57, 71, 88, 104, 120, 139, - 159, 178, 197, 212, 221, 230, 292, 312, 326, 334, 398, 421, 437, - 446, 520, 546, 564, 574, 642, 664, 679, 687, 34, 40, 46, 56, - 68, 81, 96, 111, 130, 147, 167, 186, 243, 253, 269, 291, 340, - 352, 371, 397, 453, 467, 489, 519, 581, 594, 614, 641, 693, 705, - 723, 747, 41, 49, 55, 67, 77, 91, 107, 124, 138, 161, 177, - 194, 252, 268, 290, 311, 351, 370, 396, 420, 466, 488, 518, 545, - 593, 613, 640, 663, 704, 722, 746, 765, 51, 59, 66, 76, 89, - 99, 119, 131, 149, 168, 181, 200, 267, 289, 310, 325, 369, 395, - 419, 436, 487, 517, 544, 563, 612, 639, 662, 678, 721, 745, 764, - 777, 61, 69, 75, 87, 100, 114, 129, 144, 162, 180, 191, 207, - 288, 309, 324, 333, 394, 418, 435, 445, 516, 543, 562, 573, 638, - 661, 677, 686, 744, 763, 776, 783, 70, 79, 86, 97, 108, 122, - 137, 155, 242, 251, 266, 287, 339, 350, 368, 393, 452, 465, 486, - 515, 580, 592, 611, 637, 692, 703, 720, 743, 788, 798, 813, 833, - 84, 93, 103, 110, 125, 141, 154, 171, 250, 265, 286, 308, 349, - 367, 392, 417, 464, 485, 514, 542, 591, 610, 636, 660, 702, 719, - 742, 762, 797, 812, 832, 848, 98, 106, 115, 127, 143, 156, 169, - 185, 264, 285, 307, 323, 366, 391, 416, 434, 484, 513, 541, 561, - 609, 635, 659, 676, 718, 741, 761, 775, 811, 831, 847, 858, 117, - 128, 136, 148, 160, 175, 188, 198, 284, 306, 322, 332, 390, 415, - 433, 444, 512, 540, 560, 572, 634, 658, 675, 685, 740, 760, 774, - 782, 830, 846, 857, 863, 135, 146, 152, 165, 241, 249, 263, 283, - 338, 348, 365, 389, 451, 463, 483, 511, 579, 590, 608, 633, 691, - 701, 717, 739, 787, 796, 810, 829, 867, 875, 887, 903, 153, 166, - 174, 183, 248, 262, 282, 305, 347, 364, 388, 414, 462, 482, 510, - 539, 589, 607, 632, 657, 700, 716, 738, 759, 795, 809, 828, 845, - 874, 886, 902, 915, 176, 187, 195, 202, 261, 281, 304, 321, 363, - 387, 413, 432, 481, 509, 538, 559, 606, 631, 656, 674, 715, 737, - 758, 773, 808, 827, 844, 856, 885, 901, 914, 923, 192, 199, 206, - 213, 280, 303, 320, 331, 386, 412, 431, 443, 508, 537, 558, 571, - 630, 655, 673, 684, 736, 757, 772, 781, 826, 843, 855, 862, 900, - 913, 922, 927, 240, 247, 260, 279, 337, 346, 362, 385, 450, 461, - 480, 507, 578, 588, 605, 629, 690, 699, 714, 735, 786, 794, 807, - 825, 866, 873, 884, 899, 930, 936, 945, 957, 246, 259, 278, 302, - 345, 361, 384, 411, 460, 479, 506, 536, 587, 604, 628, 654, 698, - 713, 734, 756, 793, 806, 824, 842, 872, 883, 898, 912, 935, 944, - 956, 966, 258, 277, 301, 319, 360, 383, 410, 430, 478, 505, 535, - 557, 603, 627, 653, 672, 712, 733, 755, 771, 805, 823, 841, 854, - 882, 897, 911, 921, 943, 955, 965, 972, 276, 300, 318, 330, 382, - 409, 429, 442, 504, 534, 556, 570, 626, 652, 671, 683, 732, 754, - 770, 780, 822, 840, 853, 861, 896, 910, 920, 926, 954, 964, 971, - 975, 336, 344, 359, 381, 449, 459, 477, 503, 577, 586, 602, 625, - 689, 697, 711, 731, 785, 792, 804, 821, 865, 871, 881, 895, 929, - 934, 942, 953, 977, 981, 987, 995, 343, 358, 380, 408, 458, 476, - 502, 533, 585, 601, 624, 651, 696, 710, 730, 753, 791, 803, 820, - 839, 870, 880, 894, 909, 933, 941, 952, 963, 980, 986, 994, 1001, - 357, 379, 407, 428, 475, 501, 532, 555, 600, 623, 650, 670, 709, - 729, 752, 769, 802, 819, 838, 852, 879, 893, 908, 919, 940, 951, - 962, 970, 985, 993, 1000, 1005, 378, 406, 427, 441, 
500, 531, 554, - 569, 622, 649, 669, 682, 728, 751, 768, 779, 818, 837, 851, 860, - 892, 907, 918, 925, 950, 961, 969, 974, 992, 999, 1004, 1007, 448, - 457, 474, 499, 576, 584, 599, 621, 688, 695, 708, 727, 784, 790, - 801, 817, 864, 869, 878, 891, 928, 932, 939, 949, 976, 979, 984, - 991, 1008, 1010, 1013, 1017, 456, 473, 498, 530, 583, 598, 620, 648, - 694, 707, 726, 750, 789, 800, 816, 836, 868, 877, 890, 906, 931, - 938, 948, 960, 978, 983, 990, 998, 1009, 1012, 1016, 1020, 472, 497, - 529, 553, 597, 619, 647, 668, 706, 725, 749, 767, 799, 815, 835, - 850, 876, 889, 905, 917, 937, 947, 959, 968, 982, 989, 997, 1003, - 1011, 1015, 1019, 1022, 496, 528, 552, 568, 618, 646, 667, 681, 724, - 748, 766, 778, 814, 834, 849, 859, 888, 904, 916, 924, 946, 958, - 967, 973, 988, 996, 1002, 1006, 1014, 1018, 1021, 1023, + 1, 3, 6, 11, 18, 26, 39, 48, 63, 84, 102, 122, 146, + 171, 194, 205, 211, 220, 230, 234, 246, 258, 276, 300, 343, 357, + 378, 406, 456, 472, 496, 528, 2, 5, 9, 16, 23, 31, 46, + 59, 75, 93, 113, 134, 159, 185, 204, 216, 223, 229, 235, 238, + 257, 275, 299, 318, 356, 377, 405, 427, 471, 495, 527, 552, 4, + 8, 13, 19, 29, 37, 53, 65, 83, 103, 119, 143, 165, 190, + 209, 218, 225, 232, 236, 239, 274, 298, 317, 330, 376, 404, 426, + 441, 494, 526, 551, 568, 7, 12, 17, 24, 32, 44, 61, 74, + 91, 110, 127, 151, 174, 197, 212, 221, 227, 233, 237, 240, 297, + 316, 329, 336, 403, 425, 440, 448, 525, 550, 567, 576, 10, 15, + 20, 30, 38, 51, 66, 79, 96, 117, 135, 158, 180, 202, 215, + 224, 245, 256, 273, 296, 342, 355, 375, 402, 455, 470, 493, 524, + 583, 597, 618, 646, 14, 21, 27, 36, 45, 55, 73, 86, 106, + 124, 141, 164, 183, 206, 217, 226, 255, 272, 295, 315, 354, 374, + 401, 424, 469, 492, 523, 549, 596, 617, 645, 667, 22, 28, 34, + 43, 54, 64, 81, 95, 114, 133, 152, 173, 191, 210, 219, 228, + 271, 294, 314, 328, 373, 400, 423, 439, 491, 522, 548, 566, 616, + 644, 666, 681, 25, 33, 40, 49, 58, 72, 89, 105, 121, 140, + 160, 179, 198, 213, 222, 231, 293, 313, 327, 335, 399, 422, 438, + 447, 521, 547, 565, 575, 643, 665, 680, 688, 35, 41, 47, 57, + 69, 82, 97, 112, 131, 148, 168, 187, 244, 254, 270, 292, 341, + 353, 372, 398, 454, 468, 490, 520, 582, 595, 615, 642, 694, 706, + 724, 748, 42, 50, 56, 68, 78, 92, 108, 125, 139, 162, 178, + 195, 253, 269, 291, 312, 352, 371, 397, 421, 467, 489, 519, 546, + 594, 614, 641, 664, 705, 723, 747, 766, 52, 60, 67, 77, 90, + 100, 120, 132, 150, 169, 182, 201, 268, 290, 311, 326, 370, 396, + 420, 437, 488, 518, 545, 564, 613, 640, 663, 679, 722, 746, 765, + 778, 62, 70, 76, 88, 101, 115, 130, 145, 163, 181, 192, 208, + 289, 310, 325, 334, 395, 419, 436, 446, 517, 544, 563, 574, 639, + 662, 678, 687, 745, 764, 777, 784, 71, 80, 87, 98, 109, 123, + 138, 156, 243, 252, 267, 288, 340, 351, 369, 394, 453, 466, 487, + 516, 581, 593, 612, 638, 693, 704, 721, 744, 789, 799, 814, 834, + 85, 94, 104, 111, 126, 142, 155, 172, 251, 266, 287, 309, 350, + 368, 393, 418, 465, 486, 515, 543, 592, 611, 637, 661, 703, 720, + 743, 763, 798, 813, 833, 849, 99, 107, 116, 128, 144, 157, 170, + 186, 265, 286, 308, 324, 367, 392, 417, 435, 485, 514, 542, 562, + 610, 636, 660, 677, 719, 742, 762, 776, 812, 832, 848, 859, 118, + 129, 137, 149, 161, 176, 189, 199, 285, 307, 323, 333, 391, 416, + 434, 445, 513, 541, 561, 573, 635, 659, 676, 686, 741, 761, 775, + 783, 831, 847, 858, 864, 136, 147, 153, 166, 242, 250, 264, 284, + 339, 349, 366, 390, 452, 464, 484, 512, 580, 591, 609, 634, 692, + 702, 718, 740, 788, 797, 811, 830, 868, 876, 888, 904, 154, 167, + 175, 184, 249, 263, 
283, 306, 348, 365, 389, 415, 463, 483, 511, + 540, 590, 608, 633, 658, 701, 717, 739, 760, 796, 810, 829, 846, + 875, 887, 903, 916, 177, 188, 196, 203, 262, 282, 305, 322, 364, + 388, 414, 433, 482, 510, 539, 560, 607, 632, 657, 675, 716, 738, + 759, 774, 809, 828, 845, 857, 886, 902, 915, 924, 193, 200, 207, + 214, 281, 304, 321, 332, 387, 413, 432, 444, 509, 538, 559, 572, + 631, 656, 674, 685, 737, 758, 773, 782, 827, 844, 856, 863, 901, + 914, 923, 928, 241, 248, 261, 280, 338, 347, 363, 386, 451, 462, + 481, 508, 579, 589, 606, 630, 691, 700, 715, 736, 787, 795, 808, + 826, 867, 874, 885, 900, 931, 937, 946, 958, 247, 260, 279, 303, + 346, 362, 385, 412, 461, 480, 507, 537, 588, 605, 629, 655, 699, + 714, 735, 757, 794, 807, 825, 843, 873, 884, 899, 913, 936, 945, + 957, 967, 259, 278, 302, 320, 361, 384, 411, 431, 479, 506, 536, + 558, 604, 628, 654, 673, 713, 734, 756, 772, 806, 824, 842, 855, + 883, 898, 912, 922, 944, 956, 966, 973, 277, 301, 319, 331, 383, + 410, 430, 443, 505, 535, 557, 571, 627, 653, 672, 684, 733, 755, + 771, 781, 823, 841, 854, 862, 897, 911, 921, 927, 955, 965, 972, + 976, 337, 345, 360, 382, 450, 460, 478, 504, 578, 587, 603, 626, + 690, 698, 712, 732, 786, 793, 805, 822, 866, 872, 882, 896, 930, + 935, 943, 954, 978, 982, 988, 996, 344, 359, 381, 409, 459, 477, + 503, 534, 586, 602, 625, 652, 697, 711, 731, 754, 792, 804, 821, + 840, 871, 881, 895, 910, 934, 942, 953, 964, 981, 987, 995, 1002, + 358, 380, 408, 429, 476, 502, 533, 556, 601, 624, 651, 671, 710, + 730, 753, 770, 803, 820, 839, 853, 880, 894, 909, 920, 941, 952, + 963, 971, 986, 994, 1001, 1006, 379, 407, 428, 442, 501, 532, 555, + 570, 623, 650, 670, 683, 729, 752, 769, 780, 819, 838, 852, 861, + 893, 908, 919, 926, 951, 962, 970, 975, 993, 1000, 1005, 1008, 449, + 458, 475, 500, 577, 585, 600, 622, 689, 696, 709, 728, 785, 791, + 802, 818, 865, 870, 879, 892, 929, 933, 940, 950, 977, 980, 985, + 992, 1009, 1011, 1014, 1018, 457, 474, 499, 531, 584, 599, 621, 649, + 695, 708, 727, 751, 790, 801, 817, 837, 869, 878, 891, 907, 932, + 939, 949, 961, 979, 984, 991, 999, 1010, 1013, 1017, 1021, 473, 498, + 530, 554, 598, 620, 648, 669, 707, 726, 750, 768, 800, 816, 836, + 851, 877, 890, 906, 918, 938, 948, 960, 969, 983, 990, 998, 1004, + 1012, 1016, 1020, 1023, 497, 529, 553, 569, 619, 647, 668, 682, 725, + 749, 767, 779, 815, 835, 850, 860, 889, 905, 917, 925, 947, 959, + 968, 974, 989, 997, 1003, 1007, 1015, 1019, 1022, 1024, }; const scan_order vp9_default_scan_orders[TX_SIZES] = { diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c index 945fd522e8..b9bd1eba31 100644 --- a/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -45,9 +45,7 @@ static VPX_FORCE_INLINE int16x8_t get_max_lane_eob(const int16_t *iscan_ptr, int16x8_t v_eobmax, uint16x8_t v_nz_mask) { const int16x8_t v_iscan = vld1q_s16(&iscan_ptr[0]); - const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1)); - const int16x8_t v_nz_iscan = - vbslq_s16(v_nz_mask, vdupq_n_s16(0), v_iscan_plus1); + const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, vdupq_n_s16(0), v_iscan); return vmaxq_s16(v_eobmax, v_nz_iscan); } diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c index 15ce71c5c6..da285be8e7 100644 --- a/vp9/encoder/x86/vp9_quantize_avx2.c +++ b/vp9/encoder/x86/vp9_quantize_avx2.c @@ -43,16 +43,13 @@ static VPX_FORCE_INLINE void load_fp_values_avx2( static VPX_FORCE_INLINE __m256i get_max_lane_eob(const 
int16_t *iscan, __m256i v_eobmax, __m256i v_mask) { - const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); #if CONFIG_VP9_HIGHBITDEPTH - // typedef int32_t tran_low_t; - const __m256i v_iscan_perm = _mm256_permute4x64_epi64(v_iscan, 0xD8); - const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan_perm, v_mask); + const __m256i v_iscan = _mm256_permute4x64_epi64( + _mm256_loadu_si256((const __m256i *)iscan), 0xD8); #else - // typedef int16_t tran_low_t; - const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan, v_mask); + const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); #endif - const __m256i v_nz_iscan = _mm256_and_si256(v_iscan_plus1, v_mask); + const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask); return _mm256_max_epi16(v_eobmax, v_nz_iscan); } @@ -303,8 +300,7 @@ static VPX_FORCE_INLINE __m256i highbd_get_max_lane_eob( _mm256_permute4x64_epi64(packed_nz_mask, 0xD8); const __m256i iscan = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr)); - const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, packed_nz_mask_perm); - const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, packed_nz_mask_perm); + const __m256i nz_iscan = _mm256_and_si256(iscan, packed_nz_mask_perm); return _mm256_max_epi16(eobmax, nz_iscan); } diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c index 4bcadaa6a1..0fd0dccc4f 100644 --- a/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/vp9/encoder/x86/vp9_quantize_sse2.c @@ -99,9 +99,6 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs)); iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); eob = _mm_and_si128(iscan0, nzero_coeff0); eob1 = _mm_and_si128(iscan1, nzero_coeff1); eob = _mm_max_epi16(eob, eob1); @@ -174,9 +171,6 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs)); iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); eob0 = _mm_and_si128(iscan0, nzero_coeff0); eob1 = _mm_and_si128(iscan1, nzero_coeff1); eob0 = _mm_max_epi16(eob0, eob1); diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm index 680acfec69..ae43a90f8b 100644 --- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm +++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm @@ -91,8 +91,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, round, quant, \ pcmpeqw m13, m5 ; m13 = c[i] == 0 mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m7 ; m11 = scan[i] + 1 pandn m8, m6 ; m8 = max(eob) pandn m13, m11 ; m13 = max(eob) pmaxsw m8, m13 @@ -141,8 +139,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, round, quant, \ pcmpeqw m13, m5 ; m13 = c[i] == 0 mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m7 ; m11 = scan[i] + 1 pandn m14, m6 ; m14 = max(eob) pandn m13, m11 ; m13 = max(eob) pmaxsw m8, m14 diff --git 
a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c index 502a9c972d..b9f72a94c5 100644 --- a/vpx_dsp/arm/highbd_quantize_neon.c +++ b/vpx_dsp/arm/highbd_quantize_neon.c @@ -101,7 +101,6 @@ void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int16x8_t one = vdupq_n_s16(1); const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; @@ -119,9 +118,7 @@ void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, // Process first 8 values which include a dc component. { - // Add one because the eob does not index from 0. - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); const int16x8_t qcoeff = highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, @@ -148,9 +145,7 @@ void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = vdupq_lane_s32(vget_low_s32(dequant), 1); do { - // Add one because the eob is not its index. - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); const int16x8_t qcoeff = highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, @@ -234,7 +229,6 @@ void vpx_highbd_quantize_b_32x32_neon( const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int16x8_t one = vdupq_n_s16(1); const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; @@ -253,9 +247,7 @@ void vpx_highbd_quantize_b_32x32_neon( // Process first 8 values which include a dc component. { - // Add one because the eob does not index from 0. - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); const int16x8_t qcoeff = highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, @@ -279,9 +271,7 @@ void vpx_highbd_quantize_b_32x32_neon( dequant = vdupq_lane_s32(vget_low_s32(dequant), 1); for (i = 1; i < 32 * 32 / 8; ++i) { - // Add one because the eob is not its index. - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); const int16x8_t qcoeff = highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index dcdf588cbc..9c227d560f 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -75,7 +75,6 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int16x8_t one = vdupq_n_s16(1); const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; @@ -88,9 +87,7 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, // Process first 8 values which include a dc component. { - // Add one because the eob does not index from 0. 
- const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); const int16x8_t qcoeff = quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, quant, @@ -116,9 +113,7 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = vdupq_lane_s16(vget_low_s16(dequant), 1); do { - // Add one because the eob is not its index. - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); const int16x8_t qcoeff = quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, @@ -226,7 +221,6 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int16x8_t one = vdupq_n_s16(1); const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; @@ -240,9 +234,7 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, // Process first 8 values which include a dc component. { - // Add one because the eob does not index from 0. - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); const int16x8_t qcoeff = quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, @@ -266,9 +258,7 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = vdupq_lane_s16(vget_low_s16(dequant), 1); for (i = 1; i < 32 * 32 / 8; ++i) { - // Add one because the eob is not its index. - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); const int16x8_t qcoeff = quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, diff --git a/vpx_dsp/loongarch/quantize_lsx.c b/vpx_dsp/loongarch/quantize_lsx.c index 2fc33b06b7..77be0bb4fe 100644 --- a/vpx_dsp/loongarch/quantize_lsx.c +++ b/vpx_dsp/loongarch/quantize_lsx.c @@ -59,7 +59,6 @@ static INLINE void calculate_dqcoeff_and_store_32x32(__m128i qcoeff, } static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1, - __m128i zbin_mask0, __m128i zbin_mask1, const int16_t *scan, int index, __m128i zero) { const __m128i zero_coeff0 = __lsx_vseq_h(coeff0, zero); @@ -68,8 +67,6 @@ static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1, __m128i scan1 = __lsx_vld(scan + index + 8, 0); __m128i eob0, eob1; - scan0 = __lsx_vsub_h(scan0, zbin_mask0); - scan1 = __lsx_vsub_h(scan1, zbin_mask1); eob0 = __lsx_vandn_v(zero_coeff0, scan0); eob1 = __lsx_vandn_v(zero_coeff1, scan1); return __lsx_vmax_h(eob0, eob1); @@ -138,7 +135,7 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, dequant = __lsx_vilvh_d(dequant, dequant); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero); // AC only loop. 
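  // (Illustrative note, not part of the patch: with iscan pre-incremented,
  // the eob reduction is simply the lane-wise max of
  // (qcoeff[i] != 0 ? iscan[i] : 0), which is why the zbin-mask arguments
  // to scan_for_eob can be dropped on every platform.)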
while (index < n_coeffs) { coeff0 = __lsx_vld(coeff_ptr + index, 0); @@ -161,8 +158,7 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero); eob = __lsx_vmax_h(eob, eob0); index += 16; @@ -221,7 +217,7 @@ void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr); dequant = __lsx_vilvh_d(dequant, dequant); calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero); // AC only loop. for (index = 16; index < 32 * 32; index += 16) { coeff0 = __lsx_vld(coeff_ptr + index, 0); @@ -243,8 +239,7 @@ void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr + index); calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, dqcoeff_ptr + 8 + index); - eob0 = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero); eob = __lsx_vmax_h(eob, eob0); } diff --git a/vpx_dsp/ppc/quantize_vsx.c b/vpx_dsp/ppc/quantize_vsx.c index 7cdcbeb405..ab71f6e235 100644 --- a/vpx_dsp/ppc/quantize_vsx.c +++ b/vpx_dsp/ppc/quantize_vsx.c @@ -78,11 +78,10 @@ static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff, return (int16x8_t)vec_perm(dqcoeffe, dqcoeffo, vec_perm_odd_even_pack); } -static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, bool16x8_t mask, +static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, const int16_t *iscan_ptr, int index) { int16x8_t scan = vec_vsx_ld(index, iscan_ptr); bool16x8_t zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16); - scan = vec_sub(scan, mask); return vec_andc(scan, zero_coeff); } @@ -139,8 +138,8 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16); vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr); - eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0), - nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16)); + eob = vec_max(nonzero_scanindex(qcoeff0, iscan_ptr, 0), + nonzero_scanindex(qcoeff1, iscan_ptr, 16)); if (n_coeffs > 16) { int index = 16; @@ -177,10 +176,9 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr); vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr); - eob = - vec_max(eob, nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, off0)); - eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1), - nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2)); + eob = vec_max(eob, nonzero_scanindex(qcoeff0, iscan_ptr, off0)); + eob2 = vec_max(nonzero_scanindex(qcoeff1, iscan_ptr, off1), + nonzero_scanindex(qcoeff2, iscan_ptr, off2)); eob = vec_max(eob, eob2); index += 24; @@ -252,8 +250,8 @@ void vpx_quantize_b_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = vec_splat(dequant, 1); // remove DC from dequant vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), 16, dqcoeff_ptr); - eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0), - nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16)); + eob = 
vec_max(nonzero_scanindex(qcoeff0, iscan_ptr, 0), + nonzero_scanindex(qcoeff1, iscan_ptr, 16)); do { int16x8_t coeff2, coeff2_abs, qcoeff2, eob2; @@ -286,9 +284,9 @@ void vpx_quantize_b_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), off1, dqcoeff_ptr); vec_vsx_st(dequantize_coeff_32(qcoeff2, dequant), off2, dqcoeff_ptr); - eob = vec_max(eob, nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, off0)); - eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1), - nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2)); + eob = vec_max(eob, nonzero_scanindex(qcoeff0, iscan_ptr, off0)); + eob2 = vec_max(nonzero_scanindex(qcoeff1, iscan_ptr, off1), + nonzero_scanindex(qcoeff2, iscan_ptr, off2)); eob = vec_max(eob, eob2); // 24 int16_t is 48 bytes diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c index cbc715c046..8edddd637f 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -80,8 +80,7 @@ static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan_ptr, _mm256_permute4x64_epi64(packed_nz_mask, 0xD8); const __m256i iscan = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr)); - const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, packed_nz_mask_perm); - const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, packed_nz_mask_perm); + const __m256i nz_iscan = _mm256_and_si256(iscan, packed_nz_mask_perm); return _mm256_max_epi16(eobmax, nz_iscan); } @@ -256,4 +255,4 @@ void vpx_highbd_quantize_b_32x32_avx2( } *eob_ptr = get_max_eob(eob); -} \ No newline at end of file +} diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index 1264fbed22..ae1981a834 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -25,7 +25,7 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - int i, j, non_zero_regs = (int)count / 4, eob_i = -1; + int i, j, non_zero_regs = (int)count / 4, eob_i = 0; __m128i zbins[2]; __m128i nzbins[2]; @@ -89,7 +89,7 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, } } } - *eob_ptr = eob_i + 1; + *eob_ptr = eob_i; } void vpx_highbd_quantize_b_32x32_sse2( @@ -102,7 +102,7 @@ void vpx_highbd_quantize_b_32x32_sse2( __m128i nzbins[2]; int idx = 0; int idx_arr[1024]; - int i, eob = -1; + int i, eob = 0; const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); (void)scan; @@ -148,6 +148,6 @@ void vpx_highbd_quantize_b_32x32_sse2( dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; } - *eob_ptr = eob + 1; + *eob_ptr = eob; } #endif diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index 706e4e6413..7d83527216 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -93,8 +93,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = _mm_unpackhi_epi64(dequant, dequant); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = - scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); } // AC only loop. 
@@ -134,8 +133,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); } @@ -229,8 +227,7 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = _mm_unpackhi_epi64(dequant, dequant); calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8); - eob = - scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); } // AC only loop. @@ -272,8 +269,7 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); } diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c index 6fd5174876..28f7c9c7da 100644 --- a/vpx_dsp/x86/quantize_avx2.c +++ b/vpx_dsp/x86/quantize_avx2.c @@ -127,16 +127,13 @@ quantize_b_16(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan, __m256i v_eobmax, __m256i v_mask) { - const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); #if CONFIG_VP9_HIGHBITDEPTH - // typedef int32_t tran_low_t; - const __m256i v_iscan_perm = _mm256_permute4x64_epi64(v_iscan, 0xD8); - const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan_perm, v_mask); + const __m256i v_iscan = _mm256_permute4x64_epi64( + _mm256_loadu_si256((const __m256i *)iscan), 0xD8); #else - // typedef int16_t tran_low_t; - const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan, v_mask); + const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); #endif - const __m256i v_nz_iscan = _mm256_and_si256(v_iscan_plus1, v_mask); + const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask); return _mm256_max_epi16(v_eobmax, v_nz_iscan); } diff --git a/vpx_dsp/x86/quantize_sse2.c b/vpx_dsp/x86/quantize_sse2.c index 459d95f28b..9533e7916d 100644 --- a/vpx_dsp/x86/quantize_sse2.c +++ b/vpx_dsp/x86/quantize_sse2.c @@ -76,7 +76,7 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = _mm_unpackhi_epi64(dequant, dequant); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); // AC only loop. 
while (index < n_coeffs) { @@ -106,8 +106,7 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); index += 16; diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h index afe2f924b3..580dd883f0 100644 --- a/vpx_dsp/x86/quantize_sse2.h +++ b/vpx_dsp/x86/quantize_sse2.h @@ -62,11 +62,8 @@ static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant, #endif // CONFIG_VP9_HIGHBITDEPTH } -// Scan 16 values for eob reference in scan. Use masks (-1) from comparing to -// zbin to add 1 to the index in 'scan'. +// Scan 16 values for eob reference in scan. static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, - const __m128i zbin_mask0, - const __m128i zbin_mask1, const int16_t *scan, const int index, const __m128i zero) { const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero); @@ -74,9 +71,6 @@ static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, __m128i scan0 = _mm_load_si128((const __m128i *)(scan + index)); __m128i scan1 = _mm_load_si128((const __m128i *)(scan + index + 8)); __m128i eob0, eob1; - // Add one to convert from indices to counts - scan0 = _mm_sub_epi16(scan0, zbin_mask0); - scan1 = _mm_sub_epi16(scan1, zbin_mask1); eob0 = _mm_andnot_si128(zero_coeff0, scan0); eob1 = _mm_andnot_si128(zero_coeff1, scan1); return _mm_max_epi16(eob0, eob1); diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 9d2a88b7bc..476230286d 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -70,7 +70,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = _mm_unpackhi_epi64(dequant, dequant); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); // AC only loop. while (index < n_coeffs) { @@ -98,8 +98,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); index += 16; @@ -202,8 +201,7 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = _mm_unpackhi_epi64(dequant, dequant); calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8); - eob = - scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); } // AC only loop. 
@@ -249,8 +247,7 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8 + index); - eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); } From 00608eb1dea3206b2ea60236b5aa990d6d2c317f Mon Sep 17 00:00:00 2001 From: Johann Date: Sat, 24 Sep 2022 10:55:52 +0900 Subject: [PATCH 438/926] quantize: add untested function vp9_quantize_fp_sse2 was only tested in non-hbd configuration. Missed when fixing this for vpx_quantize_b_sse2. Change-Id: Ide346e5727d74281c774f605c90d280050e0bf62 --- test/vp9_quantize_test.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index a81775fd94..7bb0bee512 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -512,6 +512,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16, false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 16, true), make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, VPX_BITS_8, 16, false), make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, From a1ba7188a8da368a4a3d80a0096e993945a534b1 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Fri, 23 Sep 2022 09:17:18 -0700 Subject: [PATCH 439/926] vp9_rd.c quiet -Wstringop-overflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ../libvpx/vp9/encoder/vp9_rd.c:594:20: warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=] 594 | t_above[i] = !!*(const uint32_t *)&above[i]; | ~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../libvpx/vp9/encoder/vp9_rd.c:572:47: note: at offset [64, 254] into destination object ‘t_above’ of size [0, 16] 572 | ENTROPY_CONTEXT t_above[16], | ~~~~~~~~~~~~~~~~^~~~~~~~~~~ Change-Id: Ie9ef24e685af417cdd35f6aa7284805e422b6ae2 --- vp9/encoder/vp9_rd.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 9fa3ff1865..28f992f4b6 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -567,6 +567,12 @@ void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE], } } +// Disable gcc 12.2 false positive warning. +// warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=] +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstringop-overflow" +#endif void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, const struct macroblockd_plane *pd, ENTROPY_CONTEXT t_above[16], @@ -604,6 +610,9 @@ void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, break; } } +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic pop +#endif void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) { From f74ce37a3abcdca68e5bc549fbd1e0a0af3f79c8 Mon Sep 17 00:00:00 2001 From: Johann Date: Sat, 24 Sep 2022 10:53:05 +0900 Subject: [PATCH 440/926] quantize: standardize vp9_quantize_fp_sse2 Match style for vpx_quantize_b_sse2 and prepare to rewrite ssse3 version in intrinsics. Need to evaluate the value of threshold breakout before going further. 
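For context, the threshold breakout in question is the early-out in the
AC loop: thr is dequant / 2, and when no absolute coefficient in a
16-lane chunk exceeds it, the multiplies are skipped and zeros are
stored. A scalar sketch of the idea (function and parameter names here
are illustrative, not from the source):

#include <stdint.h>

/* A coefficient can only quantize to a nonzero value once its magnitude
 * reaches roughly half the dequant step (round is about dequant / 2), so
 * a chunk with no such coefficient can be zeroed without multiplying. */
static int chunk_may_quantize_nonzero(const int16_t *abs_coeff, int n,
                                      int16_t dequant) {
  const int16_t thr = dequant >> 1; /* mirrors _mm_srai_epi16(dequant, 1) */
  int i;
  for (i = 0; i < n; ++i) {
    if (abs_coeff[i] > thr) return 1;
  }
  return 0;
}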
Change-Id: I9cfceb1bb0dc237cd6b73fc8d41d78bba444a15b --- vp9/encoder/x86/vp9_quantize_sse2.c | 205 +++++++++++----------------- vpx_dsp/x86/quantize_sse2.h | 9 ++ 2 files changed, 85 insertions(+), 129 deletions(-) diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c index 0fd0dccc4f..da4cd9ee8f 100644 --- a/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/vp9/encoder/x86/vp9_quantize_sse2.c @@ -16,13 +16,14 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/quantize_sse2.h" void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - __m128i zero; + const __m128i zero = _mm_setzero_si128(); __m128i thr; int nzflag; __m128i eob; @@ -35,159 +36,105 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, qcoeff_ptr += n_coeffs; dqcoeff_ptr += n_coeffs; n_coeffs = -n_coeffs; - zero = _mm_setzero_si128(); + + // Setup global values. + load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant); { __m128i coeff0, coeff1; + __m128i coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr + n_coeffs); + coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); - // Setup global values - { - round = _mm_load_si128((const __m128i *)round_ptr); - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - } + // Poor man's abs(). + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs. + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); + store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + store_tran_low(qcoeff0, dqcoeff_ptr + n_coeffs); + store_tran_low(qcoeff1, dqcoeff_ptr + n_coeffs + 8); + + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan + n_coeffs, 0, zero); + + n_coeffs += 8 * 2; + } + + thr = _mm_srai_epi16(dequant, 1); + + // AC only loop. + while (n_coeffs < 0) { + __m128i coeff0, coeff1; + __m128i coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + + coeff0 = load_tran_low(coeff_ptr + n_coeffs); + coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); + + // Poor man's abs(). 
+ coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - // Do DC and first 15 AC - coeff0 = load_tran_low(coeff_ptr + n_coeffs); - coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | + _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); + if (nzflag) { qcoeff0 = _mm_adds_epi16(qcoeff0, round); - round = _mm_unpackhi_epi64(round, round); qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - quant = _mm_unpackhi_epi64(quant, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + // Reinsert signs. + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); - } - - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1); - eob = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob = _mm_max_epi16(eob, eob1); - } - n_coeffs += 8 * 2; - } - - thr = _mm_srai_epi16(dequant, 1); + } else { + store_zero_tran_low(qcoeff_ptr + n_coeffs); + store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); - // AC only loop - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - - coeff0 = load_tran_low(coeff_ptr + n_coeffs); - coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | - _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); - - if (nzflag) { - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - // Reinsert 
signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); - store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); - store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); - } else { - store_zero_tran_low(qcoeff_ptr + n_coeffs); - store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); - - store_zero_tran_low(dqcoeff_ptr + n_coeffs); - store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); - } + store_zero_tran_low(dqcoeff_ptr + n_coeffs); + store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); } if (nzflag) { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob0, eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1); - eob0 = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob0 = _mm_max_epi16(eob0, eob1); + const __m128i eob0 = + scan_for_eob(&coeff0, &coeff1, iscan + n_coeffs, 0, zero); eob = _mm_max_epi16(eob, eob0); } n_coeffs += 8 * 2; } - // Accumulate EOB - { - __m128i eob_shuffled; - eob_shuffled = _mm_shuffle_epi32(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); - eob = _mm_max_epi16(eob, eob_shuffled); - *eob_ptr = _mm_extract_epi16(eob, 1); - } + *eob_ptr = accumulate_eob(eob); } diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h index 580dd883f0..27bfb4e41b 100644 --- a/vpx_dsp/x86/quantize_sse2.h +++ b/vpx_dsp/x86/quantize_sse2.h @@ -29,6 +29,15 @@ static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, *shift = _mm_load_si128((const __m128i *)shift_ptr); } +static INLINE void load_fp_values(const int16_t *round_ptr, __m128i *round, + const int16_t *quant_ptr, __m128i *quant, + const int16_t *dequant_ptr, + __m128i *dequant) { + *round = _mm_load_si128((const __m128i *)round_ptr); + *quant = _mm_load_si128((const __m128i *)quant_ptr); + *dequant = _mm_load_si128((const __m128i *)dequant_ptr); +} + // With ssse3 and later abs() and sign() are preferred. static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) { a = _mm_xor_si128(a, sign); From 87c7da21c2910875f2dc478abfbe96a2246fe93b Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Mon, 19 Sep 2022 05:09:23 -0700 Subject: [PATCH 441/926] vpx_subpixel_8t_intrin_avx2.c: quiet -Wuninitialized MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit warning: ‘s2[3]’ may be used uninitialized and warning: ‘s1[3]’ may be used uninitialized The warnings exposed unused code. 
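The shape of the fix is worth noting: the odd-height tail after the
two-rows-per-iteration loop can never run for the heights this kernel is
called with, so the change asserts that invariant and deletes the dead
tail. A stripped-down sketch of the pattern (names illustrative, not code
from the tree):

    #include <assert.h>

    static void filter_rows(int output_height) {
      int i;
      /* Callers only pass even heights; make the invariant explicit so
       * both the compiler and the reader know the odd tail is dead. */
      assert(!(output_height & 1));
      for (i = output_height; i > 1; i -= 2) {
        /* ...produce two output rows per iteration... */
      }
      /* no "if (i > 0)" tail: unreachable while the assert holds */
    }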
Change-Id: I75cf1f9db75e811cb42e2f143be1ad76f3e4dee9 --- vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 32 +++-------------------- 1 file changed, 3 insertions(+), 29 deletions(-) diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index db3c39de0f..c7d880860e 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -227,6 +227,9 @@ static INLINE void vpx_filter_block1d16_v8_x_avx2( s2[2] = _mm256_unpackhi_epi8(s32b[4], s32b[5]); } + // The output_height is always a multiple of two. + assert(!(output_height & 1)); + for (i = output_height; i > 1; i -= 2) { __m256i srcRegHead2, srcRegHead3; @@ -282,35 +285,6 @@ static INLINE void vpx_filter_block1d16_v8_x_avx2( s2[2] = s2[3]; srcRegHead1 = srcRegHead3; } - - // if the number of strides is odd. - // process only 16 bytes - if (i > 0) { - // load the last 16 bytes - const __m128i srcRegHead2 = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); - - // merge the last 2 results together - s1[0] = _mm256_castsi128_si256( - _mm_unpacklo_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2)); - s2[0] = _mm256_castsi128_si256( - _mm_unpackhi_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2)); - - outReg1 = convolve8_8_avx2(s1, f); - outReg2 = convolve8_8_avx2(s2, f); - - // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane - // contain the first and second convolve result respectively - outReg1 = _mm_packus_epi16(outReg1, outReg2); - - // average if necessary - if (avg) { - outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); - } - - // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, outReg1); - } } static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, From eeea3daacbf0c3f8e1bbfd2f9b67e4eda1badafc Mon Sep 17 00:00:00 2001 From: Johann Date: Sat, 1 Oct 2022 11:18:09 +0900 Subject: [PATCH 442/926] vp9 quantize: change index In assembly it made sense to iterate using n_coeffs. In intrinsics it's just as fast to use index and easier to read. Change-Id: I403c959709309dad68123d0a3d0efe183874543d --- vp9/encoder/x86/vp9_quantize_sse2.c | 99 ++++++++++++----------------- 1 file changed, 42 insertions(+), 57 deletions(-) diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c index da4cd9ee8f..272e5fb079 100644 --- a/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/vp9/encoder/x86/vp9_quantize_sse2.c @@ -26,72 +26,58 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const __m128i zero = _mm_setzero_si128(); __m128i thr; int nzflag; - __m128i eob; + int index = 16; __m128i round, quant, dequant; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i eob; (void)scan; - coeff_ptr += n_coeffs; - iscan += n_coeffs; - qcoeff_ptr += n_coeffs; - dqcoeff_ptr += n_coeffs; - n_coeffs = -n_coeffs; - // Setup global values. load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant); - { - __m128i coeff0, coeff1; - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - // Do DC and first 15 AC. - coeff0 = load_tran_low(coeff_ptr + n_coeffs); - coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); - // Poor man's abs(). 
- coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); - qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + // Poor man's abs(). + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); - round = _mm_unpackhi_epi64(round, round); - quant = _mm_unpackhi_epi64(quant, quant); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); - // Reinsert signs. - qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); - qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + // Reinsert signs. + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); - store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); - store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); - qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); - qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); - store_tran_low(qcoeff0, dqcoeff_ptr + n_coeffs); - store_tran_low(qcoeff1, dqcoeff_ptr + n_coeffs + 8); + store_tran_low(qcoeff0, dqcoeff_ptr); + store_tran_low(qcoeff1, dqcoeff_ptr + 8); - eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan + n_coeffs, 0, zero); - - n_coeffs += 8 * 2; - } + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); thr = _mm_srai_epi16(dequant, 1); // AC only loop. - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - - coeff0 = load_tran_low(coeff_ptr + n_coeffs); - coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); + while (index < n_coeffs) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); // Poor man's abs(). 
coeff0_sign = _mm_srai_epi16(coeff0, 15); @@ -112,28 +98,27 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); - store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); - store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); - store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); + store_tran_low(coeff0, dqcoeff_ptr + index); + store_tran_low(coeff1, dqcoeff_ptr + index + 8); } else { - store_zero_tran_low(qcoeff_ptr + n_coeffs); - store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); + store_zero_tran_low(qcoeff_ptr + index); + store_zero_tran_low(qcoeff_ptr + index + 8); - store_zero_tran_low(dqcoeff_ptr + n_coeffs); - store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); + store_zero_tran_low(dqcoeff_ptr + index); + store_zero_tran_low(dqcoeff_ptr + index + 8); } if (nzflag) { - const __m128i eob0 = - scan_for_eob(&coeff0, &coeff1, iscan + n_coeffs, 0, zero); + const __m128i eob0 = scan_for_eob(&coeff0, &coeff1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); } - n_coeffs += 8 * 2; + index += 16; } *eob_ptr = accumulate_eob(eob); From c03c882785dc96ed91799280e68f8998bec50b90 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Wed, 5 Oct 2022 07:04:27 -0700 Subject: [PATCH 443/926] Add vpx_highbd_sad16x{32,16,8}_avx2. 1.9x to 2.4x faster than the sse2 version. Bug: b/245917257 Change-Id: I686452772f9b72233930de2207af36a0cd72e0bb --- test/sad_test.cc | 11 ++++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +- vpx_dsp/x86/highbd_sad_avx2.c | 100 ++++++++++++++++++++++++++++++++++ 4 files changed, 115 insertions(+), 3 deletions(-) create mode 100644 vpx_dsp/x86/highbd_sad_avx2.c diff --git a/test/sad_test.cc b/test/sad_test.cc index 7e84ea0dbf..cd1dd0dd02 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1064,6 +1064,17 @@ const SadMxNParam avx2_tests[] = { SadMxNParam(32, 64, &vpx_sad32x64_avx2), SadMxNParam(32, 32, &vpx_sad32x32_avx2), SadMxNParam(32, 16, &vpx_sad32x16_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 8), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 8), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 8), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 10), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 10), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 10), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 12), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 12), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH }; INSTANTIATE_TEST_SUITE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests)); diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index f9a5c97dd2..32d21e03f7 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -394,6 +394,7 @@ DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad4d_avx2.c +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad_avx2.c endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 527d0e6e74..004afb38f8 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -956,13 +956,13 @@ () 
specialize qw/vpx_highbd_sad32x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad16x32 sse2 neon/; + specialize qw/vpx_highbd_sad16x32 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad16x16 sse2 neon/; + specialize qw/vpx_highbd_sad16x16 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad16x8 sse2 neon/; + specialize qw/vpx_highbd_sad16x8 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_highbd_sad8x16 sse2 neon/; diff --git a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c new file mode 100644 index 0000000000..36e9fa6c0f --- /dev/null +++ b/vpx_dsp/x86/highbd_sad_avx2.c @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include // AVX2 +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +static VPX_FORCE_INLINE unsigned int calc_final(const __m256i sums_32) { + const __m256i t0 = _mm256_add_epi32(sums_32, _mm256_srli_si256(sums_32, 8)); + const __m256i t1 = _mm256_add_epi32(t0, _mm256_srli_si256(t0, 4)); + const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t1), + _mm256_extractf128_si256(t1, 1)); + return (unsigned int)_mm_cvtsi128_si32(sum); +} + +static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, + const uint16_t *src, int src_stride, + uint16_t *ref, int ref_stride, + int height) { + int i; + for (i = 0; i < height; i += 2) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride)); + // absolute differences between every ref[] to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1)); + // sum every abs diff + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); + + src += src_stride << 1; + ref += ref_stride << 1; + } +} + +unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8_ptr); + __m256i sums_32 = _mm256_setzero_si256(); + int i; + + for (i = 0; i < 2; ++i) { + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16); + + // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src 
+= src_stride << 4; + ref += ref_stride << 4; + } + return calc_final(sums_32); +} + +unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8_ptr); + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16); + + { + const __m256i sums_32 = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))); + return calc_final(sums_32); + } +} + +unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8_ptr); + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 8); + + { + const __m256i sums_32 = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))); + return calc_final(sums_32); + } +} From 4955b945d851cd86c287401d3bca846dae354d16 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Wed, 5 Oct 2022 14:03:55 -0700 Subject: [PATCH 444/926] Add vpx_highbd_sad32x{64,32,16}_avx2. 2.7x to 3.1x faster than the sse2 version. Bug: b/245917257 Change-Id: Idff3284932f7ee89d036f38893205bf622a159a3 --- test/sad_test.cc | 9 +++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 ++-- vpx_dsp/x86/highbd_sad_avx2.c | 62 ++++++++++++++++++++++++++++++++++- 3 files changed, 73 insertions(+), 4 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index cd1dd0dd02..4712c51f6d 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1065,6 +1065,15 @@ const SadMxNParam avx2_tests[] = { SadMxNParam(32, 32, &vpx_sad32x32_avx2), SadMxNParam(32, 16, &vpx_sad32x16_avx2), #if CONFIG_VP9_HIGHBITDEPTH + SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 8), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 8), + SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 8), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 10), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 10), + SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 10), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 12), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 12), + SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 12), SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 8), SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 8), SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 8), diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 004afb38f8..d669b9999c 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -947,13 +947,13 @@ () specialize qw/vpx_highbd_sad64x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad32x64 sse2 neon/; + specialize qw/vpx_highbd_sad32x64 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad32x32 sse2 neon/; + specialize qw/vpx_highbd_sad32x32 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad32x16 sse2 neon/; + specialize qw/vpx_highbd_sad32x16 sse2 neon 
avx2/; add_proto qw/unsigned int vpx_highbd_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_highbd_sad16x32 sse2 neon avx2/; diff --git a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c index 36e9fa6c0f..eb0e3eec5f 100644 --- a/vpx_dsp/x86/highbd_sad_avx2.c +++ b/vpx_dsp/x86/highbd_sad_avx2.c @@ -7,7 +7,7 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#include // AVX2 +#include #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" @@ -19,6 +19,66 @@ static VPX_FORCE_INLINE unsigned int calc_final(const __m256i sums_32) { return (unsigned int)_mm_cvtsi128_si32(sum); } +static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16, + const uint16_t *src, int src_stride, + uint16_t *ref, int ref_stride, + int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); + // absolute differences between every ref[] to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1)); + // sum every abs diff + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); + + src += src_stride; + ref += ref_stride; + } +} + +#define HIGHBD_SAD32XN(n) \ + unsigned int vpx_highbd_sad32x##n##_avx2( \ + const uint8_t *src8_ptr, int src_stride, const uint8_t *ref8_ptr, \ + int ref_stride) { \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8_ptr); \ + __m256i sums_32 = _mm256_setzero_si256(); \ + int i; \ + \ + for (i = 0; i < (n / 8); ++i) { \ + __m256i sums_16 = _mm256_setzero_si256(); \ + \ + highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); \ + \ + /* sums_16 will outrange after 8 rows, so add current sums_16 to \ + * sums_32*/ \ + sums_32 = _mm256_add_epi32( \ + sums_32, \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ + \ + src += src_stride << 3; \ + ref += ref_stride << 3; \ + } \ + return calc_final(sums_32); \ + } + +// 32x64 +HIGHBD_SAD32XN(64) + +// 32x32 +HIGHBD_SAD32XN(32) + +// 32x16 +HIGHBD_SAD32XN(16) + static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, const uint16_t *src, int src_stride, uint16_t *ref, int ref_stride, From 06b09ebd351deb35b5bdcf387904dcbecc3da02f Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Fri, 7 Oct 2022 05:53:50 -0700 Subject: [PATCH 445/926] Add vpx_highbd_sad64x{64,32}_avx2. ~2.8x faster than the sse2 version. 
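Much of the speedup in these highbd SAD kernels comes from accumulating
absolute differences in 16-bit lanes and only periodically widening into
32-bit sums, just before the narrow lanes could overflow. A scalar model
of one lane of that scheme (illustrative only; the flush interval assumes
at-most-12-bit samples, so each |diff| <= 4095 and 16 of them fit in a
uint16_t):

    #include <stdint.h>
    #include <stdlib.h>

    static uint32_t sad_one_column(const uint16_t *src, const uint16_t *ref,
                                   int stride, int height) {
      uint32_t sum32 = 0;
      int r = 0;
      while (r < height) {
        /* 16 * 4095 = 65520 still fits in 16 bits, so flush every 16 rows
         * at the latest (the 32- and 64-wide kernels, which add several
         * diffs per lane per row, flush sooner). */
        const int batch = (height - r < 16) ? (height - r) : 16;
        uint16_t sum16 = 0;
        int i;
        for (i = 0; i < batch; ++i, ++r)
          sum16 += (uint16_t)abs(src[r * stride] - ref[r * stride]);
        sum32 += sum16; /* widen, cf. _mm256_cvtepu16_epi32 + add */
      }
      return sum32;
    }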
Bug: b/245917257 Change-Id: Ibc8e5d030ec145c9a9b742fff98fbd9131c9ede4 --- test/sad_test.cc | 20 +++++++---- vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +-- vpx_dsp/x86/highbd_sad_avx2.c | 65 +++++++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 8 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 4712c51f6d..a8f04e6eb8 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1065,21 +1065,29 @@ const SadMxNParam avx2_tests[] = { SadMxNParam(32, 32, &vpx_sad32x32_avx2), SadMxNParam(32, 16, &vpx_sad32x16_avx2), #if CONFIG_VP9_HIGHBITDEPTH + SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 8), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 8), SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 8), SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 8), SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 8), - SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 10), - SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 10), - SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 10), - SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 12), - SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 12), - SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 12), SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 8), SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 8), SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 8), + + SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 10), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 10), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 10), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 10), + SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 10), SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 10), SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 10), SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 10), + + SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 12), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 12), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 12), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 12), + SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 12), SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 12), SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 12), SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 12), diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index d669b9999c..34ee98197a 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -941,10 +941,10 @@ () # Single block SAD # add_proto qw/unsigned int vpx_highbd_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad64x64 sse2 neon/; + specialize qw/vpx_highbd_sad64x64 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad64x32 sse2 neon/; + specialize qw/vpx_highbd_sad64x32 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_highbd_sad32x64 sse2 neon avx2/; diff --git a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c index eb0e3eec5f..12ef2eb3e9 100644 --- a/vpx_dsp/x86/highbd_sad_avx2.c +++ b/vpx_dsp/x86/highbd_sad_avx2.c @@ -19,6 +19,71 @@ static VPX_FORCE_INLINE unsigned int calc_final(const __m256i sums_32) { return (unsigned int)_mm_cvtsi128_si32(sum); } +static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16, + const uint16_t *src, int src_stride, + uint16_t *ref, int ref_stride, + int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and 
all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32)); + const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); + const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32)); + const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48)); + // absolute differences between every ref[] to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1)); + const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(r2, s2)); + const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(r3, s3)); + // sum every abs diff + *sums_16 = + _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1)); + *sums_16 = + _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3)); + + src += src_stride; + ref += ref_stride; + } +} + +#define HIGHBD_SAD64XN(n) \ + unsigned int vpx_highbd_sad64x##n##_avx2( \ + const uint8_t *src8_ptr, int src_stride, const uint8_t *ref8_ptr, \ + int ref_stride) { \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8_ptr); \ + __m256i sums_32 = _mm256_setzero_si256(); \ + int i; \ + \ + for (i = 0; i < (n / 2); ++i) { \ + __m256i sums_16 = _mm256_setzero_si256(); \ + \ + highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2); \ + \ + /* sums_16 will outrange after 2 rows, so add current sums_16 to \ + * sums_32*/ \ + sums_32 = _mm256_add_epi32( \ + sums_32, \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ + \ + src += src_stride << 1; \ + ref += ref_stride << 1; \ + } \ + return calc_final(sums_32); \ + } + +// 64x64 +HIGHBD_SAD64XN(64) + +// 64x32 +HIGHBD_SAD64XN(32) + static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16, const uint16_t *src, int src_stride, uint16_t *ref, int ref_stride, From 2d87b886a3f3d776ae13c3e36101a6e19f00bed6 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 6 Oct 2022 10:26:05 +0000 Subject: [PATCH 446/926] [NEON] highbd partial DCT functions Change-Id: I7dd4e698469562f5b1f948cc36f8403b490dcb6a --- test/dct_partial_test.cc | 10 ++++- vpx_dsp/arm/fdct_partial_neon.c | 65 +++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 ++ 3 files changed, 77 insertions(+), 2 deletions(-) diff --git a/test/dct_partial_test.cc b/test/dct_partial_test.cc index 8d0e3a912e..e57fa0f48b 100644 --- a/test/dct_partial_test.cc +++ b/test/dct_partial_test.cc @@ -145,11 +145,17 @@ INSTANTIATE_TEST_SUITE_P( #if CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_SUITE_P( NEON, PartialFdctTest, - ::testing::Values(make_tuple(&vpx_fdct32x32_1_neon, 32, VPX_BITS_8), - make_tuple(&vpx_fdct16x16_1_neon, 16, VPX_BITS_8), + ::testing::Values(make_tuple(&vpx_highbd_fdct32x32_1_neon, 32, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct32x32_1_neon, 32, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct32x32_1_neon, 32, VPX_BITS_8), + make_tuple(&vpx_highbd_fdct16x16_1_neon, 16, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct16x16_1_neon, 16, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct16x16_1_neon, 16, VPX_BITS_8), make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_12), make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_10), 
make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_12), + make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_10), make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_8))); #else INSTANTIATE_TEST_SUITE_P( diff --git a/vpx_dsp/arm/fdct_partial_neon.c b/vpx_dsp/arm/fdct_partial_neon.c index 0a1cdca41d..718dba0d91 100644 --- a/vpx_dsp/arm/fdct_partial_neon.c +++ b/vpx_dsp/arm/fdct_partial_neon.c @@ -101,3 +101,68 @@ void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, output[0] = (tran_low_t)(sum >> 3); output[1] = 0; } + +#if CONFIG_VP9_HIGHBITDEPTH + +void vpx_highbd_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, + int stride) { + int32x4_t partial_sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), + vdupq_n_s32(0) }; + int32_t sum; + + int r = 0; + do { + const int16x8_t a = vld1q_s16(input); + const int16x8_t b = vld1q_s16(input + 8); + input += stride; + partial_sum[0] = vaddw_s16(partial_sum[0], vget_low_s16(a)); + partial_sum[1] = vaddw_s16(partial_sum[1], vget_high_s16(a)); + partial_sum[2] = vaddw_s16(partial_sum[2], vget_low_s16(b)); + partial_sum[3] = vaddw_s16(partial_sum[3], vget_high_s16(b)); + r++; + } while (r < 16); + + partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[1]); + partial_sum[2] = vaddq_s32(partial_sum[2], partial_sum[3]); + partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[2]); + sum = horizontal_add_int32x4(partial_sum[0]); + + output[0] = (tran_low_t)(sum >> 1); + output[1] = 0; +} + +void vpx_highbd_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, + int stride) { + int32x4_t partial_sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), + vdupq_n_s32(0) }; + + int32_t sum; + + int r = 0; + do { + const int16x8_t a0 = vld1q_s16(input); + const int16x8_t a1 = vld1q_s16(input + 8); + const int16x8_t a2 = vld1q_s16(input + 16); + const int16x8_t a3 = vld1q_s16(input + 24); + input += stride; + partial_sum[0] = vaddw_s16(partial_sum[0], vget_low_s16(a0)); + partial_sum[0] = vaddw_s16(partial_sum[0], vget_high_s16(a0)); + partial_sum[1] = vaddw_s16(partial_sum[1], vget_low_s16(a1)); + partial_sum[1] = vaddw_s16(partial_sum[1], vget_high_s16(a1)); + partial_sum[2] = vaddw_s16(partial_sum[2], vget_low_s16(a2)); + partial_sum[2] = vaddw_s16(partial_sum[2], vget_high_s16(a2)); + partial_sum[3] = vaddw_s16(partial_sum[3], vget_low_s16(a3)); + partial_sum[3] = vaddw_s16(partial_sum[3], vget_high_s16(a3)); + r++; + } while (r < 32); + + partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[1]); + partial_sum[2] = vaddq_s32(partial_sum[2], partial_sum[3]); + partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[2]); + sum = horizontal_add_int32x4(partial_sum[0]); + + output[0] = (tran_low_t)(sum >> 3); + output[1] = 0; +} + +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 004afb38f8..5dad78c950 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -527,6 +527,8 @@ () add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct4x4_1 sse2 neon/; + specialize qw/vpx_highbd_fdct4x4_1 neon/; + $vpx_highbd_fdct4x4_1_neon=vpx_fdct4x4_1_neon; add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct8x8 neon sse2/; @@ -563,6 +565,7 @@ () specialize qw/vpx_highbd_fdct16x16 sse2/; add_proto qw/void vpx_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize 
qw/vpx_highbd_fdct16x16_1 neon/; add_proto qw/void vpx_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_highbd_fdct32x32 sse2/; @@ -571,6 +574,7 @@ () specialize qw/vpx_highbd_fdct32x32_rd sse2/; add_proto qw/void vpx_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vpx_highbd_fdct32x32_1 neon/; } else { add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct4x4 neon sse2 msa lsx/; From 6f8537c4c8bc8cd0bdd4ed329959ab8c66fd2eba Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 6 Oct 2022 10:58:27 +0000 Subject: [PATCH 447/926] [NEON] move transpose_8x8 to reuse Change-Id: I3915b6c9971aedaac9c23f21fdb88bc271216208 --- vpx_dsp/arm/fdct16x16_neon.c | 6 +++--- vpx_dsp/arm/fdct16x16_neon.h | 36 ------------------------------------ vpx_dsp/arm/transpose_neon.h | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 39 deletions(-) diff --git a/vpx_dsp/arm/fdct16x16_neon.c b/vpx_dsp/arm/fdct16x16_neon.c index 67f43246aa..5cccb6a64a 100644 --- a/vpx_dsp/arm/fdct16x16_neon.c +++ b/vpx_dsp/arm/fdct16x16_neon.c @@ -46,8 +46,8 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { // Transpose top left and top right quarters into one contiguous location to // process to the top half. - transpose_8x8(&temp0[0], &temp2[0]); - transpose_8x8(&temp1[0], &temp2[8]); + transpose_s16_8x8_new(&temp0[0], &temp2[0]); + transpose_s16_8x8_new(&temp1[0], &temp2[8]); partial_round_shift(temp2); cross_input(temp2, temp3, 1); vpx_fdct16x16_body(temp3, temp2); @@ -61,7 +61,7 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { // Transpose bottom left and bottom right quarters into one contiguous // location to process to the bottom half. - transpose_8x8(&temp0[8], &temp1[0]); + transpose_s16_8x8_new(&temp0[8], &temp1[0]); transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], &temp1[13], &temp1[14], &temp1[15]); partial_round_shift(temp1); diff --git a/vpx_dsp/arm/fdct16x16_neon.h b/vpx_dsp/arm/fdct16x16_neon.h index 0dd21153fc..5ce74cdf41 100644 --- a/vpx_dsp/arm/fdct16x16_neon.h +++ b/vpx_dsp/arm/fdct16x16_neon.h @@ -174,42 +174,6 @@ static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b, *sub = vcombine_s16(rounded2, rounded3); } -// Transpose 8x8 to a new location. Don't use transpose_neon.h because those -// are all in-place. -static INLINE void transpose_8x8(const int16x8_t *a /*[8]*/, - int16x8_t *b /*[8]*/) { - // Swap 16 bit elements. - const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); - const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); - const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); - const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); - - // Swap 32 bit elements. 
- const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), - vreinterpretq_s32_s16(c1.val[0])); - const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), - vreinterpretq_s32_s16(c1.val[1])); - const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), - vreinterpretq_s32_s16(c3.val[0])); - const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), - vreinterpretq_s32_s16(c3.val[1])); - - // Swap 64 bit elements - const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); - const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); - const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); - const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); - - b[0] = e0.val[0]; - b[1] = e1.val[0]; - b[2] = e2.val[0]; - b[3] = e3.val[0]; - b[4] = e0.val[1]; - b[5] = e1.val[1]; - b[6] = e2.val[1]; - b[7] = e3.val[1]; -} - // Main body of fdct16x16. static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, int16x8_t *out /*[16]*/) { diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h index c098ad31b6..bf06d6abe2 100644 --- a/vpx_dsp/arm/transpose_neon.h +++ b/vpx_dsp/arm/transpose_neon.h @@ -568,6 +568,40 @@ static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1])); } +// Transpose 8x8 to a new location. +static INLINE void transpose_s16_8x8_new(const int16x8_t *a, int16x8_t *b) { + // Swap 16 bit elements. + const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); + const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); + const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); + const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); + + // Swap 32 bit elements. + const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), + vreinterpretq_s32_s16(c1.val[0])); + const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), + vreinterpretq_s32_s16(c1.val[1])); + const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), + vreinterpretq_s32_s16(c3.val[0])); + const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), + vreinterpretq_s32_s16(c3.val[1])); + + // Swap 64 bit elements + const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); + const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); + const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); + const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); + + b[0] = e0.val[0]; + b[1] = e1.val[0]; + b[2] = e2.val[0]; + b[3] = e3.val[0]; + b[4] = e0.val[1]; + b[5] = e1.val[1]; + b[6] = e2.val[1]; + b[7] = e3.val[1]; +} + static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, int16x8_t *a3, int16x8_t *a4, int16x8_t *a5, From af274914f2187de21ea7cf29d67756ade06e9760 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Mon, 10 Oct 2022 12:20:37 -0700 Subject: [PATCH 448/926] SADavgTest: Add speed test. 
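The test reuses the existing RunNTimes()/PrintMedian() plumbing: run the
kernel a large fixed number of times per trial and report the median
across trials, which is less sensitive to one-off scheduler noise than a
mean. A bare-bones sketch of that measurement pattern in plain C, with
all names illustrative:

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    static int cmp_double(const void *a, const void *b) {
      const double x = *(const double *)a, y = *(const double *)b;
      return (x > y) - (x < y);
    }

    /* Time `trials` batches of n calls each and print the median batch. */
    static void print_median_ms(void (*fn)(void), int n, int trials,
                                const char *title) {
      double ms[32];
      int t, i;
      if (trials > 32) trials = 32;
      for (t = 0; t < trials; ++t) {
        const clock_t start = clock();
        for (i = 0; i < n; ++i) fn();
        ms[t] = 1000.0 * (double)(clock() - start) / CLOCKS_PER_SEC;
      }
      qsort(ms, (size_t)trials, sizeof(ms[0]), cmp_double);
      printf("[%s] median: %.1f ms for %d calls\n", title, ms[trials / 2], n);
    }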
Change-Id: Ie14c0f6d15f410adf749f7ab74cf9f2bf35f3d5f --- test/sad_test.cc | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 4712c51f6d..b7bf2fc4ca 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -317,7 +317,7 @@ class SADTest : public AbstractBench, public SADTestBase { } }; -class SADavgTest : public SADTestBase { +class SADavgTest : public AbstractBench, public SADTestBase { public: SADavgTest() : SADTestBase(GetParam()) {} @@ -338,6 +338,11 @@ class SADavgTest : public SADTestBase { ASSERT_EQ(reference_sad, exp_sad); } + + void Run() { + params_.func(source_data_, source_stride_, reference_data_, + reference_stride_, second_pred_); + } }; TEST_P(SADTest, MaxRef) { @@ -437,6 +442,19 @@ TEST_P(SADavgTest, ShortSrc) { source_stride_ = tmp_stride; } +TEST_P(SADavgTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 50000000 / (params_.width * params_.height); + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + FillRandom(second_pred_, params_.width); + + RunNTimes(kCountSpeedTestBlock); + + char title[16]; + snprintf(title, sizeof(title), "%dx%d", params_.width, params_.height); + PrintMedian(title); +} + TEST_P(SADx4Test, MaxRef) { FillConstant(source_data_, source_stride_, 0); FillConstant(GetReference(0), reference_stride_, mask_); From f538a022441bdb760c3b8ad835e209a71e31d8b9 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 6 Oct 2022 13:05:01 +0000 Subject: [PATCH 449/926] [NEON] Move helper functions for reuse Move all butterfly functions to fdct_neon.h Slightly optimize load/scale/cross functions in fdct 16x16. These will be reused in highbd variants. Change-Id: I28b6e0cc240304bab6b94d9c3f33cca77b8cb073 --- vpx_dsp/arm/fdct16x16_neon.c | 12 +- vpx_dsp/arm/fdct16x16_neon.h | 135 +++++----- vpx_dsp/arm/fdct32x32_neon.c | 231 +++--------------- vpx_dsp/arm/{fdct_neon.c => fdct4x4_neon.c} | 0 .../arm/{fwd_txfm_neon.c => fdct8x8_neon.c} | 0 vpx_dsp/arm/fdct_neon.h | 130 ++++++++++ vpx_dsp/vpx_dsp.mk | 4 +- 7 files changed, 231 insertions(+), 281 deletions(-) rename vpx_dsp/arm/{fdct_neon.c => fdct4x4_neon.c} (100%) rename vpx_dsp/arm/{fwd_txfm_neon.c => fdct8x8_neon.c} (100%) diff --git a/vpx_dsp/arm/fdct16x16_neon.c b/vpx_dsp/arm/fdct16x16_neon.c index 5cccb6a64a..0b0ce223db 100644 --- a/vpx_dsp/arm/fdct16x16_neon.c +++ b/vpx_dsp/arm/fdct16x16_neon.c @@ -35,13 +35,13 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { int16x8_t temp3[16]; // Left half. - load(input, stride, temp0); - cross_input(temp0, temp1, 0); + load_cross(input, stride, temp0); + scale_input(temp0, temp1); vpx_fdct16x16_body(temp1, temp0); // Right half. 
- load(input + 8, stride, temp1); - cross_input(temp1, temp2, 0); + load_cross(input + 8, stride, temp1); + scale_input(temp1, temp2); vpx_fdct16x16_body(temp2, temp1); // Transpose top left and top right quarters into one contiguous location to @@ -49,7 +49,7 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { transpose_s16_8x8_new(&temp0[0], &temp2[0]); transpose_s16_8x8_new(&temp1[0], &temp2[8]); partial_round_shift(temp2); - cross_input(temp2, temp3, 1); + cross_input(temp2, temp3); vpx_fdct16x16_body(temp3, temp2); transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4], &temp2[5], &temp2[6], &temp2[7]); @@ -65,7 +65,7 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], &temp1[13], &temp1[14], &temp1[15]); partial_round_shift(temp1); - cross_input(temp1, temp0, 1); + cross_input(temp1, temp0); vpx_fdct16x16_body(temp0, temp1); transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4], &temp1[5], &temp1[6], &temp1[7]); diff --git a/vpx_dsp/arm/fdct16x16_neon.h b/vpx_dsp/arm/fdct16x16_neon.h index 5ce74cdf41..7fc2c6e7e8 100644 --- a/vpx_dsp/arm/fdct16x16_neon.h +++ b/vpx_dsp/arm/fdct16x16_neon.h @@ -13,6 +13,8 @@ #include +#include "fdct_neon.h" + static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) { b[0] = vld1q_s16(a); a += stride; @@ -72,45 +74,67 @@ static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) { // To maybe reduce register usage this could be combined with the load() step to // get the first 4 and last 4 values, cross those, then load the middle 8 values // and cross them. +static INLINE void scale_input(const int16x8_t *a /*[16]*/, + int16x8_t *b /*[16]*/) { + b[0] = vshlq_n_s16(a[0], 2); + b[1] = vshlq_n_s16(a[1], 2); + b[2] = vshlq_n_s16(a[2], 2); + b[3] = vshlq_n_s16(a[3], 2); + b[4] = vshlq_n_s16(a[4], 2); + b[5] = vshlq_n_s16(a[5], 2); + b[6] = vshlq_n_s16(a[6], 2); + b[7] = vshlq_n_s16(a[7], 2); + + b[8] = vshlq_n_s16(a[8], 2); + b[9] = vshlq_n_s16(a[9], 2); + b[10] = vshlq_n_s16(a[10], 2); + b[11] = vshlq_n_s16(a[11], 2); + b[12] = vshlq_n_s16(a[12], 2); + b[13] = vshlq_n_s16(a[13], 2); + b[14] = vshlq_n_s16(a[14], 2); + b[15] = vshlq_n_s16(a[15], 2); +} + static INLINE void cross_input(const int16x8_t *a /*[16]*/, - int16x8_t *b /*[16]*/, const int pass) { - if (pass == 0) { - b[0] = vshlq_n_s16(vaddq_s16(a[0], a[15]), 2); - b[1] = vshlq_n_s16(vaddq_s16(a[1], a[14]), 2); - b[2] = vshlq_n_s16(vaddq_s16(a[2], a[13]), 2); - b[3] = vshlq_n_s16(vaddq_s16(a[3], a[12]), 2); - b[4] = vshlq_n_s16(vaddq_s16(a[4], a[11]), 2); - b[5] = vshlq_n_s16(vaddq_s16(a[5], a[10]), 2); - b[6] = vshlq_n_s16(vaddq_s16(a[6], a[9]), 2); - b[7] = vshlq_n_s16(vaddq_s16(a[7], a[8]), 2); + int16x8_t *b /*[16]*/) { + b[0] = vaddq_s16(a[0], a[15]); + b[1] = vaddq_s16(a[1], a[14]); + b[2] = vaddq_s16(a[2], a[13]); + b[3] = vaddq_s16(a[3], a[12]); + b[4] = vaddq_s16(a[4], a[11]); + b[5] = vaddq_s16(a[5], a[10]); + b[6] = vaddq_s16(a[6], a[9]); + b[7] = vaddq_s16(a[7], a[8]); + + b[8] = vsubq_s16(a[7], a[8]); + b[9] = vsubq_s16(a[6], a[9]); + b[10] = vsubq_s16(a[5], a[10]); + b[11] = vsubq_s16(a[4], a[11]); + b[12] = vsubq_s16(a[3], a[12]); + b[13] = vsubq_s16(a[2], a[13]); + b[14] = vsubq_s16(a[1], a[14]); + b[15] = vsubq_s16(a[0], a[15]); +} - b[8] = vshlq_n_s16(vsubq_s16(a[7], a[8]), 2); - b[9] = vshlq_n_s16(vsubq_s16(a[6], a[9]), 2); - b[10] = vshlq_n_s16(vsubq_s16(a[5], a[10]), 2); - b[11] = 
vshlq_n_s16(vsubq_s16(a[4], a[11]), 2); - b[12] = vshlq_n_s16(vsubq_s16(a[3], a[12]), 2); - b[13] = vshlq_n_s16(vsubq_s16(a[2], a[13]), 2); - b[14] = vshlq_n_s16(vsubq_s16(a[1], a[14]), 2); - b[15] = vshlq_n_s16(vsubq_s16(a[0], a[15]), 2); - } else { - b[0] = vaddq_s16(a[0], a[15]); - b[1] = vaddq_s16(a[1], a[14]); - b[2] = vaddq_s16(a[2], a[13]); - b[3] = vaddq_s16(a[3], a[12]); - b[4] = vaddq_s16(a[4], a[11]); - b[5] = vaddq_s16(a[5], a[10]); - b[6] = vaddq_s16(a[6], a[9]); - b[7] = vaddq_s16(a[7], a[8]); +static INLINE void load_cross(const int16_t *a, int stride, + int16x8_t *b /*[16]*/) { + b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 15 * stride)); + b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 14 * stride)); + b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 13 * stride)); + b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 12 * stride)); + b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 11 * stride)); + b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 10 * stride)); + b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 9 * stride)); + b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 8 * stride)); - b[8] = vsubq_s16(a[7], a[8]); - b[9] = vsubq_s16(a[6], a[9]); - b[10] = vsubq_s16(a[5], a[10]); - b[11] = vsubq_s16(a[4], a[11]); - b[12] = vsubq_s16(a[3], a[12]); - b[13] = vsubq_s16(a[2], a[13]); - b[14] = vsubq_s16(a[1], a[14]); - b[15] = vsubq_s16(a[0], a[15]); - } + b[8] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 8 * stride)); + b[9] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 9 * stride)); + b[10] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 10 * stride)); + b[11] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 11 * stride)); + b[12] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 12 * stride)); + b[13] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 13 * stride)); + b[14] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 14 * stride)); + b[15] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 15 * stride)); } // Quarter round at the beginning of the second pass. 
Can't use vrshr (rounding) @@ -135,45 +159,6 @@ static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) { a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2); } -// fdct_round_shift((a +/- b) * c) -static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b, - const tran_high_t c, int16x8_t *add, - int16x8_t *sub) { - const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c); - const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c); - const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), c); - const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), c); - const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c); - const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c); - const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14); - const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14); - const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14); - const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14); - *add = vcombine_s16(rounded0, rounded1); - *sub = vcombine_s16(rounded2, rounded3); -} - -// fdct_round_shift(a * c0 +/- b * c1) -static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b, - const tran_coef_t c0, - const tran_coef_t c1, int16x8_t *add, - int16x8_t *sub) { - const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c0); - const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c0); - const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), c1); - const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), c1); - const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), c0); - const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), c0); - const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c1); - const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c1); - const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14); - const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14); - const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14); - const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14); - *add = vcombine_s16(rounded0, rounded1); - *sub = vcombine_s16(rounded2, rounded3); -} - // Main body of fdct16x16. static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, int16x8_t *out /*[16]*/) { diff --git a/vpx_dsp/arm/fdct32x32_neon.c b/vpx_dsp/arm/fdct32x32_neon.c index de74e6630b..51d81bd085 100644 --- a/vpx_dsp/arm/fdct32x32_neon.c +++ b/vpx_dsp/arm/fdct32x32_neon.c @@ -15,6 +15,7 @@ #include "vpx_dsp/txfm_common.h" #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/fdct_neon.h" // Most gcc 4.9 distributions outside of Android do not generate correct code // for this function. 
@@ -194,54 +195,6 @@ static INLINE void store(tran_low_t *a, const int16x8_t *b) { #undef STORE_S16 -// fdct_round_shift((a +/- b) * c) -static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b, - const tran_high_t constant, - int16x8_t *add, int16x8_t *sub) { - const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant); - const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant); - const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant); - const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant); - const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant); - const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant); - const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS); - const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS); - const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS); - const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS); - *add = vcombine_s16(rounded0, rounded1); - *sub = vcombine_s16(rounded2, rounded3); -} - -// fdct_round_shift(a * c0 +/- b * c1) -static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b, - const tran_coef_t constant0, - const tran_coef_t constant1, - int16x8_t *add, int16x8_t *sub) { - const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant0); - const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant0); - const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), constant1); - const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), constant1); - const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), constant0); - const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), constant0); - const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant1); - const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant1); - const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS); - const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS); - const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS); - const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS); - *add = vcombine_s16(rounded0, rounded1); - *sub = vcombine_s16(rounded2, rounded3); -} - -// Add 2 if positive, 1 if negative, and shift by 2. -// In practice, subtract the sign bit, then shift with rounding. -static INLINE int16x8_t sub_round_shift(const int16x8_t a) { - const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); - const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); - const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); - return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2); -} - static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) { int16x8_t a[32]; int16x8_t b[32]; @@ -562,23 +515,6 @@ static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) { b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \ } while (0) -// Like butterfly_one_coeff, but don't narrow results. 
-static INLINE void butterfly_one_coeff_s16_s32( - const int16x8_t a, const int16x8_t b, const tran_high_t constant, - int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, - int32x4_t *sub_hi) { - const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant); - const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant); - const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant); - const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant); - const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant); - const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant); - *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); - *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); - *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); - *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); -} - #define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \ add_index, sub_index) \ do { \ @@ -587,23 +523,6 @@ static INLINE void butterfly_one_coeff_s16_s32( &b##_lo[sub_index], &b##_hi[sub_index]); \ } while (0) -// Like butterfly_one_coeff, but with s32. -static INLINE void butterfly_one_coeff_s32( - const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, - const int32x4_t b_hi, const int32_t constant, int32x4_t *add_lo, - int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) { - const int32x4_t a_lo_0 = vmulq_n_s32(a_lo, constant); - const int32x4_t a_hi_0 = vmulq_n_s32(a_hi, constant); - const int32x4_t sum0 = vmlaq_n_s32(a_lo_0, b_lo, constant); - const int32x4_t sum1 = vmlaq_n_s32(a_hi_0, b_hi, constant); - const int32x4_t diff0 = vmlsq_n_s32(a_lo_0, b_lo, constant); - const int32x4_t diff1 = vmlsq_n_s32(a_hi_0, b_hi, constant); - *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); - *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); - *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); - *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); -} - #define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \ sub_index) \ do { \ @@ -613,26 +532,6 @@ static INLINE void butterfly_one_coeff_s32( &b##_lo[sub_index], &b##_hi[sub_index]); \ } while (0) -// Like butterfly_two_coeff, but with s32. -static INLINE void butterfly_two_coeff_s32( - const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, - const int32x4_t b_hi, const int32_t constant0, const int32_t constant1, - int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, - int32x4_t *sub_hi) { - const int32x4_t a0 = vmulq_n_s32(a_lo, constant0); - const int32x4_t a1 = vmulq_n_s32(a_hi, constant0); - const int32x4_t a2 = vmulq_n_s32(a_lo, constant1); - const int32x4_t a3 = vmulq_n_s32(a_hi, constant1); - const int32x4_t sum0 = vmlaq_n_s32(a2, b_lo, constant0); - const int32x4_t sum1 = vmlaq_n_s32(a3, b_hi, constant0); - const int32x4_t diff0 = vmlsq_n_s32(a0, b_lo, constant1); - const int32x4_t diff1 = vmlsq_n_s32(a1, b_hi, constant1); - *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); - *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); - *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); - *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); -} - #define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \ right_constant, b, add_index, sub_index) \ do { \ @@ -643,24 +542,6 @@ static INLINE void butterfly_two_coeff_s32( &b##_hi[sub_index]); \ } while (0) -// Add 1 if positive, 2 if negative, and shift by 2. -// In practice, add 1, then add the sign bit, then shift without rounding. 
-static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo, - const int32x4_t a_hi) { - const int32x4_t one = vdupq_n_s32(1); - const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo); - const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31); - const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32); - const int16x4_t b_lo = - vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2); - const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi); - const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31); - const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32); - const int16x4_t b_hi = - vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2); - return vcombine_s16(b_lo, b_hi); -} - static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) { int16x8_t a[32]; int16x8_t b[32]; @@ -967,16 +848,6 @@ static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) { out[3] = add_round_shift_s32(d_lo[3], d_hi[3]); } -// Add 1 if positive, 2 if negative, and shift by 2. -// In practice, add 1, then add the sign bit, then shift without rounding. -static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) { - const int16x8_t one = vdupq_n_s16(1); - const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); - const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); - const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); - return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2); -} - static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) { int16x8_t a[32]; int16x8_t b[32]; @@ -1279,42 +1150,6 @@ static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) { #undef BUTTERFLY_ONE_S32 #undef BUTTERFLY_TWO_S32 -// Transpose 8x8 to a new location. Don't use transpose_neon.h because those -// are all in-place. -// TODO(johannkoenig): share with other fdcts. -static INLINE void transpose_8x8(const int16x8_t *a, int16x8_t *b) { - // Swap 16 bit elements. - const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); - const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); - const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); - const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); - - // Swap 32 bit elements. - const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), - vreinterpretq_s32_s16(c1.val[0])); - const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), - vreinterpretq_s32_s16(c1.val[1])); - const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), - vreinterpretq_s32_s16(c3.val[0])); - const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), - vreinterpretq_s32_s16(c3.val[1])); - - // Swap 64 bit elements - const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); - const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); - const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); - const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); - - b[0] = e0.val[0]; - b[1] = e1.val[0]; - b[2] = e2.val[0]; - b[3] = e3.val[0]; - b[4] = e0.val[1]; - b[5] = e1.val[1]; - b[6] = e2.val[1]; - b[7] = e3.val[1]; -} - void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { int16x8_t temp0[32]; int16x8_t temp1[32]; @@ -1337,10 +1172,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { dct_body_first_pass(temp0, temp4); // Generate the top row by munging the first set of 8 from each one together. 
- transpose_8x8(&temp1[0], &temp0[0]); - transpose_8x8(&temp2[0], &temp0[8]); - transpose_8x8(&temp3[0], &temp0[16]); - transpose_8x8(&temp4[0], &temp0[24]); + transpose_s16_8x8_new(&temp1[0], &temp0[0]); + transpose_s16_8x8_new(&temp2[0], &temp0[8]); + transpose_s16_8x8_new(&temp3[0], &temp0[16]); + transpose_s16_8x8_new(&temp4[0], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -1355,10 +1190,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output, temp5); // Second row of 8x32. - transpose_8x8(&temp1[8], &temp0[0]); - transpose_8x8(&temp2[8], &temp0[8]); - transpose_8x8(&temp3[8], &temp0[16]); - transpose_8x8(&temp4[8], &temp0[24]); + transpose_s16_8x8_new(&temp1[8], &temp0[0]); + transpose_s16_8x8_new(&temp2[8], &temp0[8]); + transpose_s16_8x8_new(&temp3[8], &temp0[16]); + transpose_s16_8x8_new(&temp4[8], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -1373,10 +1208,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output + 8 * 32, temp5); // Third row of 8x32 - transpose_8x8(&temp1[16], &temp0[0]); - transpose_8x8(&temp2[16], &temp0[8]); - transpose_8x8(&temp3[16], &temp0[16]); - transpose_8x8(&temp4[16], &temp0[24]); + transpose_s16_8x8_new(&temp1[16], &temp0[0]); + transpose_s16_8x8_new(&temp2[16], &temp0[8]); + transpose_s16_8x8_new(&temp3[16], &temp0[16]); + transpose_s16_8x8_new(&temp4[16], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -1391,10 +1226,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output + 16 * 32, temp5); // Final row of 8x32. - transpose_8x8(&temp1[24], &temp0[0]); - transpose_8x8(&temp2[24], &temp0[8]); - transpose_8x8(&temp3[24], &temp0[16]); - transpose_8x8(&temp4[24], &temp0[24]); + transpose_s16_8x8_new(&temp1[24], &temp0[0]); + transpose_s16_8x8_new(&temp2[24], &temp0[8]); + transpose_s16_8x8_new(&temp3[24], &temp0[16]); + transpose_s16_8x8_new(&temp4[24], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -1432,10 +1267,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, dct_body_first_pass(temp0, temp4); // Generate the top row by munging the first set of 8 from each one together. - transpose_8x8(&temp1[0], &temp0[0]); - transpose_8x8(&temp2[0], &temp0[8]); - transpose_8x8(&temp3[0], &temp0[16]); - transpose_8x8(&temp4[0], &temp0[24]); + transpose_s16_8x8_new(&temp1[0], &temp0[0]); + transpose_s16_8x8_new(&temp2[0], &temp0[8]); + transpose_s16_8x8_new(&temp3[0], &temp0[16]); + transpose_s16_8x8_new(&temp4[0], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -1450,10 +1285,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output, temp5); // Second row of 8x32. 
- transpose_8x8(&temp1[8], &temp0[0]); - transpose_8x8(&temp2[8], &temp0[8]); - transpose_8x8(&temp3[8], &temp0[16]); - transpose_8x8(&temp4[8], &temp0[24]); + transpose_s16_8x8_new(&temp1[8], &temp0[0]); + transpose_s16_8x8_new(&temp2[8], &temp0[8]); + transpose_s16_8x8_new(&temp3[8], &temp0[16]); + transpose_s16_8x8_new(&temp4[8], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -1468,10 +1303,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output + 8 * 32, temp5); // Third row of 8x32 - transpose_8x8(&temp1[16], &temp0[0]); - transpose_8x8(&temp2[16], &temp0[8]); - transpose_8x8(&temp3[16], &temp0[16]); - transpose_8x8(&temp4[16], &temp0[24]); + transpose_s16_8x8_new(&temp1[16], &temp0[0]); + transpose_s16_8x8_new(&temp2[16], &temp0[8]); + transpose_s16_8x8_new(&temp3[16], &temp0[16]); + transpose_s16_8x8_new(&temp4[16], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -1486,10 +1321,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output + 16 * 32, temp5); // Final row of 8x32. - transpose_8x8(&temp1[24], &temp0[0]); - transpose_8x8(&temp2[24], &temp0[8]); - transpose_8x8(&temp3[24], &temp0[16]); - transpose_8x8(&temp4[24], &temp0[24]); + transpose_s16_8x8_new(&temp1[24], &temp0[0]); + transpose_s16_8x8_new(&temp2[24], &temp0[8]); + transpose_s16_8x8_new(&temp3[24], &temp0[16]); + transpose_s16_8x8_new(&temp4[24], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); diff --git a/vpx_dsp/arm/fdct_neon.c b/vpx_dsp/arm/fdct4x4_neon.c similarity index 100% rename from vpx_dsp/arm/fdct_neon.c rename to vpx_dsp/arm/fdct4x4_neon.c diff --git a/vpx_dsp/arm/fwd_txfm_neon.c b/vpx_dsp/arm/fdct8x8_neon.c similarity index 100% rename from vpx_dsp/arm/fwd_txfm_neon.c rename to vpx_dsp/arm/fdct8x8_neon.c diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h index 28d7d86bf8..056cae4083 100644 --- a/vpx_dsp/arm/fdct_neon.h +++ b/vpx_dsp/arm/fdct_neon.h @@ -13,6 +13,136 @@ #include +// fdct_round_shift((a +/- b) * c) +static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b, + const tran_high_t constant, + int16x8_t *add, int16x8_t *sub) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant); + const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant); + const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant); + const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS); + const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS); + const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS); + const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS); + *add = vcombine_s16(rounded0, rounded1); + *sub = vcombine_s16(rounded2, rounded3); +} + +// fdct_round_shift(a * c0 +/- b * c1) +static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b, + const tran_coef_t constant0, + const tran_coef_t constant1, + int16x8_t *add, int16x8_t *sub) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant0); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant0); + const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), constant1); + const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), constant1); + const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), constant0); + const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), constant0); + const 
int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant1); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant1); + const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS); + const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS); + const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS); + const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS); + *add = vcombine_s16(rounded0, rounded1); + *sub = vcombine_s16(rounded2, rounded3); +} + +// Add 2 if positive, 1 if negative, and shift by 2. +// In practice, subtract the sign bit, then shift with rounding. +static INLINE int16x8_t sub_round_shift(const int16x8_t a) { + const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); + const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); + const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); + return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2); +} + +// Like butterfly_one_coeff, but don't narrow results. +static INLINE void butterfly_one_coeff_s16_s32( + const int16x8_t a, const int16x8_t b, const tran_high_t constant, + int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, + int32x4_t *sub_hi) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant); + const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant); + const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant); + *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); + *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); + *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); + *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); +} + +// Like butterfly_one_coeff, but with s32. +static INLINE void butterfly_one_coeff_s32( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const int32_t constant, int32x4_t *add_lo, + int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) { + const int32x4_t a_lo_0 = vmulq_n_s32(a_lo, constant); + const int32x4_t a_hi_0 = vmulq_n_s32(a_hi, constant); + const int32x4_t sum0 = vmlaq_n_s32(a_lo_0, b_lo, constant); + const int32x4_t sum1 = vmlaq_n_s32(a_hi_0, b_hi, constant); + const int32x4_t diff0 = vmlsq_n_s32(a_lo_0, b_lo, constant); + const int32x4_t diff1 = vmlsq_n_s32(a_hi_0, b_hi, constant); + *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); + *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); + *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); + *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); +} + +// Like butterfly_two_coeff, but with s32. 
+static INLINE void butterfly_two_coeff_s32( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const int32_t constant0, const int32_t constant1, + int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, + int32x4_t *sub_hi) { + const int32x4_t a0 = vmulq_n_s32(a_lo, constant0); + const int32x4_t a1 = vmulq_n_s32(a_hi, constant0); + const int32x4_t a2 = vmulq_n_s32(a_lo, constant1); + const int32x4_t a3 = vmulq_n_s32(a_hi, constant1); + const int32x4_t sum0 = vmlaq_n_s32(a2, b_lo, constant0); + const int32x4_t sum1 = vmlaq_n_s32(a3, b_hi, constant0); + const int32x4_t diff0 = vmlsq_n_s32(a0, b_lo, constant1); + const int32x4_t diff1 = vmlsq_n_s32(a1, b_hi, constant1); + *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); + *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); + *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); + *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); +} + +// Add 1 if positive, 2 if negative, and shift by 2. +// In practice, add 1, then add the sign bit, then shift without rounding. +static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) { + const int16x8_t one = vdupq_n_s16(1); + const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); + const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); + const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); + return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2); +} + +// Add 1 if positive, 2 if negative, and shift by 2. +// In practice, add 1, then add the sign bit, then shift without rounding. +static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo, + const int32x4_t a_hi) { + const int32x4_t one = vdupq_n_s32(1); + const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo); + const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31); + const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32); + const int16x4_t b_lo = + vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2); + const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi); + const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31); + const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32); + const int16x4_t b_hi = + vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2); + return vcombine_s16(b_lo, b_hi); +} + static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) { const int16x8_t input_01 = vcombine_s16(in[0], in[1]); const int16x8_t input_32 = vcombine_s16(in[3], in[2]); diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 32d21e03f7..1fd9495cf9 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -227,11 +227,11 @@ ifeq ($(VPX_ARCH_X86_64),yes) DSP_SRCS-$(HAVE_SSSE3) += x86/fwd_txfm_ssse3_x86_64.asm endif DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h -DSP_SRCS-$(HAVE_NEON) += arm/fdct_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/fdct4x4_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/fdct8x8_neon.c DSP_SRCS-$(HAVE_NEON) += arm/fdct16x16_neon.c DSP_SRCS-$(HAVE_NEON) += arm/fdct32x32_neon.c DSP_SRCS-$(HAVE_NEON) += arm/fdct_partial_neon.c -DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_txfm_lsx.h From 85484d5960b2adfde5e639ea7e3f3dcba9698e2e Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Mon, 10 Oct 2022 08:38:44 -0700 Subject: [PATCH 450/926] Add vpx_highbd_sad16x{32,16,8}_avg_avx2. 1.6x to 2.1x faster than the sse2 version. 
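
[Editor's note] For readers new to the _avg variants: these kernels average
the reference block with a second predictor (compound prediction) before
taking the SAD against the source. A minimal scalar sketch of that contract
follows; it is illustrative only, not part of the patch, the function name is
the editor's, and it assumes the pixels are already unpacked to uint16_t (the
real code reaches them via CONVERT_TO_SHORTPTR).

#include <stdint.h>
#include <stdlib.h>

/* Scalar model of a highbd WxH avg-SAD: rounding-average ref with the
 * second predictor, then sum absolute differences against src. */
static uint32_t highbd_sad_avg_model(const uint16_t *src, int src_stride,
                                     const uint16_t *ref, int ref_stride,
                                     const uint16_t *sec, int width,
                                     int height) {
  uint32_t sad = 0;
  int r, c;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c) {
      /* (ref + sec + 1) >> 1 is exactly what _mm256_avg_epu16 computes
       * per 16-bit lane. */
      const uint16_t avg = (uint16_t)((ref[c] + sec[c] + 1) >> 1);
      sad += (uint32_t)abs((int)avg - (int)src[c]);
    }
    src += src_stride;
    ref += ref_stride;
    sec += width; /* the second predictor is a contiguous width*height block */
  }
  return sad;
}

The AVX2 code below computes this 16 lanes at a time, accumulating in 16-bit
lanes and widening to 32 bits before they can wrap.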
Bug: b/245917257 Change-Id: I56c467a850297ae3abcca4b4843302bb8d5d0ac1 --- test/sad_test.cc | 11 ++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +-- vpx_dsp/x86/highbd_sad_avx2.c | 99 +++++++++++++++++++++++++++++++++++ 3 files changed, 113 insertions(+), 3 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index a07b7c8a59..47df4661ce 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1119,6 +1119,17 @@ const SadMxNAvgParam avg_avx2_tests[] = { SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_avx2), SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_avx2), SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 8), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 8), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 8), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 10), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 10), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 10), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 12), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 12), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH }; INSTANTIATE_TEST_SUITE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests)); diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index cbf0e6ea8d..4d1d05d548 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1010,13 +1010,13 @@ () specialize qw/vpx_highbd_sad32x16_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad16x32_avg sse2 neon/; + specialize qw/vpx_highbd_sad16x32_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad16x16_avg sse2 neon/; + specialize qw/vpx_highbd_sad16x16_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad16x8_avg sse2 neon/; + specialize qw/vpx_highbd_sad16x8_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vpx_highbd_sad8x16_avg sse2 neon/; diff --git a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c index 12ef2eb3e9..f1bab35287 100644 --- a/vpx_dsp/x86/highbd_sad_avx2.c +++ b/vpx_dsp/x86/highbd_sad_avx2.c @@ -223,3 +223,102 @@ unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src8_ptr, int src_stride, return calc_final(sums_32); } } + +// AVG ------------------------------------------------------------------------- + +static VPX_FORCE_INLINE void highbd_sad16xH_avg(__m256i *sums_16, + const uint16_t *src, + int src_stride, uint16_t *ref, + int ref_stride, uint16_t *sec, + int height) { + int i; + for (i = 0; i < height; i += 2) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride)); + const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec); + 
const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16)); + const __m256i avg0 = _mm256_avg_epu16(r0, x0); + const __m256i avg1 = _mm256_avg_epu16(r1, x1); + // absolute differences between every ref[] to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1)); + // sum every abs diff + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); + + src += src_stride << 1; + ref += ref_stride << 1; + sec += 32; + } +} + +unsigned int vpx_highbd_sad16x32_avg_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + const uint8_t *second_pred) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); + __m256i sums_32 = _mm256_setzero_si256(); + int i; + + for (i = 0; i < 2; ++i) { + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16); + + // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 4; + ref += ref_stride << 4; + sec += 16 << 4; + } + return calc_final(sums_32); +} + +unsigned int vpx_highbd_sad16x16_avg_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + const uint8_t *second_pred) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16); + + { + const __m256i sums_32 = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))); + return calc_final(sums_32); + } +} + +unsigned int vpx_highbd_sad16x8_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8); + + { + const __m256i sums_32 = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))); + return calc_final(sums_32); + } +} From 50d5093a4f1f2f0fafad09e1a106b7c3c1b9d60d Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Wed, 12 Oct 2022 06:05:46 -0700 Subject: [PATCH 451/926] Add vpx_highbd_sad32x{64,32,16}_avg_avx2. 2.1x to 2.8x faster than the sse2 version. 
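
[Editor's note] The 32-wide kernel below keeps partial sums in 16-bit lanes
and widens to 32 bits only every 8 rows, so the margin is worth checking. A
back-of-envelope verification, assuming the 12-bit worst case (the constants
come from that assumption, not from the patch):

#include <assert.h>
#include <stdint.h>

static void check_sums16_bound(void) {
  /* Largest absolute difference of 12-bit samples (the rounding average
   * of two 12-bit values is still at most 4095). */
  const uint32_t max_abs_diff = (1u << 12) - 1;
  /* Each 16-bit lane of the 32-wide kernel receives two abs-diff terms
   * per row (abs_diff0 and abs_diff1). */
  const uint32_t terms_per_row = 2;
  const uint32_t rows = 8; /* rows accumulated between widenings */
  /* 2 * 4095 * 8 = 65520 <= 65535, so the lanes cannot wrap. */
  assert(terms_per_row * max_abs_diff * rows <= UINT16_MAX);
}

The same arithmetic explains the 16-row cadence of the 16-wide kernels in the
previous patch (one term per row per lane), which is what the "sums_16 will
outrange" comments refer to.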
Bug: b/245917257 Change-Id: I1aaffa4a1debbe5559784e854b8fc6fba07e5000 --- test/sad_test.cc | 9 +++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 ++-- vpx_dsp/x86/highbd_sad_avx2.c | 67 +++++++++++++++++++++++++++++++++++ 3 files changed, 79 insertions(+), 3 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 47df4661ce..a3c2952d63 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1120,12 +1120,21 @@ const SadMxNAvgParam avg_avx2_tests[] = { SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_avx2), SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_avx2), #if CONFIG_VP9_HIGHBITDEPTH + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 8), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 8), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 8), SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 8), SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 8), SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 8), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 10), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 10), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 10), SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 10), SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 10), SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 10), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 12), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 12), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 12), SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 12), SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 12), SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 12), diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 4d1d05d548..4db6de37b6 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1001,13 +1001,13 @@ () specialize qw/vpx_highbd_sad64x32_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad32x64_avg sse2 neon/; + specialize qw/vpx_highbd_sad32x64_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad32x32_avg sse2 neon/; + specialize qw/vpx_highbd_sad32x32_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad32x16_avg sse2 neon/; + specialize qw/vpx_highbd_sad32x16_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vpx_highbd_sad16x32_avg sse2 neon avx2/; diff --git a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c index f1bab35287..24ebe4e94a 100644 --- a/vpx_dsp/x86/highbd_sad_avx2.c +++ b/vpx_dsp/x86/highbd_sad_avx2.c @@ -225,6 +225,73 @@ unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src8_ptr, int src_stride, } // AVG ------------------------------------------------------------------------- +static VPX_FORCE_INLINE void highbd_sad32xH_avg(__m256i *sums_16, + const uint16_t *src, + int src_stride, uint16_t *ref, + int ref_stride, uint16_t *sec, + int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const 
__m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); + const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16)); + const __m256i avg0 = _mm256_avg_epu16(r0, x0); + const __m256i avg1 = _mm256_avg_epu16(r1, x1); + // absolute differences between every ref/pred avg to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1)); + // sum every abs diff + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); + + src += src_stride; + ref += ref_stride; + sec += 32; + } +} + +#define HIGHBD_SAD32XN_AVG(n) \ + unsigned int vpx_highbd_sad32x##n##_avg_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); \ + __m256i sums_32 = _mm256_setzero_si256(); \ + int i; \ + \ + for (i = 0; i < (n / 8); ++i) { \ + __m256i sums_16 = _mm256_setzero_si256(); \ + \ + highbd_sad32xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8); \ + \ + /* sums_16 will outrange after 8 rows, so add current sums_16 to \ + * sums_32*/ \ + sums_32 = _mm256_add_epi32( \ + sums_32, \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ + \ + src += src_stride << 3; \ + ref += ref_stride << 3; \ + sec += 32 << 3; \ + } \ + return calc_final(sums_32); \ + } + +// 32x64 +HIGHBD_SAD32XN_AVG(64) + +// 32x32 +HIGHBD_SAD32XN_AVG(32) + +// 32x16 +HIGHBD_SAD32XN_AVG(16) static VPX_FORCE_INLINE void highbd_sad16xH_avg(__m256i *sums_16, const uint16_t *src, From 165935a1b6c3dfe2af686545188c3abebc4941d8 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 6 Oct 2022 14:53:56 +0000 Subject: [PATCH 452/926] [NEON] Add highbd FDCT 4x4 function ~80% faster than C version for both best/rt profiles. 
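
[Editor's note] This and the following highbd FDCT patches rest on 32-bit
butterfly helpers with 64-bit intermediates. A scalar model of the two
butterflies may help when reading the NEON; it is a sketch only, the names
are the editor's, and it uses the usual libvpx rounding (DCT_CONST_BITS = 14):

#include <stdint.h>

#define MODEL_DCT_CONST_BITS 14
#define MODEL_ROUND_POW2(v, n) (((v) + ((int64_t)1 << ((n)-1))) >> (n))

/* fdct_round_shift((a +/- b) * c): one shared coefficient. */
static void butterfly_one_coeff_model(int32_t a, int32_t b, int32_t c,
                                      int32_t *add, int32_t *sub) {
  const int64_t sum = ((int64_t)a + b) * c;
  const int64_t diff = ((int64_t)a - b) * c;
  *add = (int32_t)MODEL_ROUND_POW2(sum, MODEL_DCT_CONST_BITS);
  *sub = (int32_t)MODEL_ROUND_POW2(diff, MODEL_DCT_CONST_BITS);
}

/* fdct_round_shift(a * c0 + b * c1) and fdct_round_shift(a * c1 - b * c0),
 * following the argument convention of the new highbd helper. */
static void butterfly_two_coeff_model(int32_t a, int32_t b, int32_t c0,
                                      int32_t c1, int32_t *add,
                                      int32_t *sub) {
  const int64_t sum = (int64_t)a * c0 + (int64_t)b * c1;
  const int64_t diff = (int64_t)a * c1 - (int64_t)b * c0;
  *add = (int32_t)MODEL_ROUND_POW2(sum, MODEL_DCT_CONST_BITS);
  *sub = (int32_t)MODEL_ROUND_POW2(diff, MODEL_DCT_CONST_BITS);
}

In the diff, highbd_butterfly_one_coeff_s32 and highbd_butterfly_two_coeff_s32
implement these identities with vmull_n_s32/vmlal_n_s32 into int64x2_t lanes
and a rounding narrow (vrshrn_n_s64) back to 32 bits.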
Change-Id: Ibb3c8e1862131d2a020922420d53c66b31d5c2c3 --- test/dct_test.cc | 21 ++++++++-- vpx_dsp/arm/fdct4x4_neon.c | 38 ++++++++++++++++++ vpx_dsp/arm/fdct_neon.h | 74 ++++++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- 4 files changed, 131 insertions(+), 4 deletions(-) diff --git a/test/dct_test.cc b/test/dct_test.cc index 2182f87e5e..e34122ac92 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -539,6 +539,18 @@ INSTANTIATE_TEST_SUITE_P(AVX2, TransDCT, #endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH #if HAVE_NEON +#if CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo dct_neon_func_info[] = { + { &fdct_wrapper, + &highbd_idct_wrapper, 4, 2 }, + /* { &fdct_wrapper, + &highbd_idct_wrapper, 8, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 16, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 32, 2 },*/ +}; +#else static const FuncInfo dct_neon_func_info[4] = { { &fdct_wrapper, &idct_wrapper, 4, 1 }, @@ -549,12 +561,15 @@ static const FuncInfo dct_neon_func_info[4] = { { &fdct_wrapper, &idct_wrapper, 32, 1 } }; +#endif // CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_SUITE_P( NEON, TransDCT, - ::testing::Combine(::testing::Range(0, 4), - ::testing::Values(dct_neon_func_info), - ::testing::Values(0), ::testing::Values(VPX_BITS_8))); + ::testing::Combine( + ::testing::Range(0, static_cast(sizeof(dct_neon_func_info) / + sizeof(dct_neon_func_info[0]))), + ::testing::Values(dct_neon_func_info), ::testing::Values(0), + ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12))); #endif // HAVE_NEON #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/arm/fdct4x4_neon.c b/vpx_dsp/arm/fdct4x4_neon.c index 2827791f1e..11df7292d4 100644 --- a/vpx_dsp/arm/fdct4x4_neon.c +++ b/vpx_dsp/arm/fdct4x4_neon.c @@ -48,3 +48,41 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, store_s16q_to_tran_low(final_output + 1 * 8, out_23); } } + +#if CONFIG_VP9_HIGHBITDEPTH + +void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, + int stride) { + int i; + static const int32x4_t const_1000 = { 1, 0, 0, 0 }; + const int32x4_t const_one = vdupq_n_s32(1); + + // input[M * stride] * 16 + int32x4_t in[4]; + in[0] = vshll_n_s16(vld1_s16(input + 0 * stride), 4); + in[1] = vshll_n_s16(vld1_s16(input + 1 * stride), 4); + in[2] = vshll_n_s16(vld1_s16(input + 2 * stride), 4); + in[3] = vshll_n_s16(vld1_s16(input + 3 * stride), 4); + + // If the very first value != 0, then add 1. + if (input[0] != 0) { + in[0] = vaddq_s32(in[0], const_1000); + } + + for (i = 0; i < 2; ++i) { + vpx_highbd_fdct4x4_pass1_neon(in); + } + { + // Not quite a rounding shift. Only add 1 despite shifting by 2. 
+ in[0] = vshrq_n_s32(vaddq_s32(in[0], const_one), 2); + in[1] = vshrq_n_s32(vaddq_s32(in[1], const_one), 2); + in[2] = vshrq_n_s32(vaddq_s32(in[2], const_one), 2); + in[3] = vshrq_n_s32(vaddq_s32(in[3], const_one), 2); + + vst1q_s32(final_output, in[0]); + vst1q_s32(final_output + 4, in[1]); + vst1q_s32(final_output + 8, in[2]); + vst1q_s32(final_output + 12, in[3]); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h index 056cae4083..68aeab3aa3 100644 --- a/vpx_dsp/arm/fdct_neon.h +++ b/vpx_dsp/arm/fdct_neon.h @@ -340,4 +340,78 @@ static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) { // 07 17 27 37 47 57 67 77 } } + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void highbd_butterfly_one_coeff_s32(const int32x4_t a, + const int32x4_t b, + const tran_high_t c, + int32x4_t *add, + int32x4_t *sub) { + const int32x2_t a_lo = vget_low_s32(a); + const int32x2_t a_hi = vget_high_s32(a); + const int32x2_t b_lo = vget_low_s32(b); + const int32x2_t b_hi = vget_high_s32(b); + + const int64x2_t a64_lo = vmull_n_s32(a_lo, c); + const int64x2_t a64_hi = vmull_n_s32(a_hi, c); + + const int64x2_t sum_lo = vmlal_n_s32(a64_lo, b_lo, c); + const int64x2_t sum_hi = vmlal_n_s32(a64_hi, b_hi, c); + const int64x2_t diff_lo = vmlsl_n_s32(a64_lo, b_lo, c); + const int64x2_t diff_hi = vmlsl_n_s32(a64_hi, b_hi, c); + + *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS), + vrshrn_n_s64(sum_hi, DCT_CONST_BITS)); + *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS), + vrshrn_n_s64(diff_hi, DCT_CONST_BITS)); +} + +static INLINE void highbd_butterfly_two_coeff_s32( + const int32x4_t a, const int32x4_t b, const tran_coef_t c0, + const tran_coef_t c1, int32x4_t *add, int32x4_t *sub) { + const int32x2_t a_lo = vget_low_s32(a); + const int32x2_t a_hi = vget_high_s32(a); + const int32x2_t b_lo = vget_low_s32(b); + const int32x2_t b_hi = vget_high_s32(b); + + const int64x2_t axc0_64_lo = vmull_n_s32(a_lo, c0); + const int64x2_t axc0_64_hi = vmull_n_s32(a_hi, c0); + const int64x2_t axc1_64_lo = vmull_n_s32(a_lo, c1); + const int64x2_t axc1_64_hi = vmull_n_s32(a_hi, c1); + + const int64x2_t sum_lo = vmlal_n_s32(axc0_64_lo, b_lo, c1); + const int64x2_t sum_hi = vmlal_n_s32(axc0_64_hi, b_hi, c1); + const int64x2_t diff_lo = vmlsl_n_s32(axc1_64_lo, b_lo, c0); + const int64x2_t diff_hi = vmlsl_n_s32(axc1_64_hi, b_hi, c0); + + *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS), + vrshrn_n_s64(sum_hi, DCT_CONST_BITS)); + *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS), + vrshrn_n_s64(diff_hi, DCT_CONST_BITS)); +} + +static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) { + int32x4_t out[4]; + // in_0 +/- in_3, in_1 +/- in_2 + const int32x4_t s_0 = vaddq_s32(in[0], in[3]); + const int32x4_t s_1 = vaddq_s32(in[1], in[2]); + const int32x4_t s_2 = vsubq_s32(in[1], in[2]); + const int32x4_t s_3 = vsubq_s32(in[0], in[3]); + + highbd_butterfly_one_coeff_s32(s_0, s_1, cospi_16_64, &out[0], &out[2]); + + // out[1] = s_3 * cospi_8_64 + s_2 * cospi_24_64 + // out[3] = s_3 * cospi_24_64 - s_2 * cospi_8_64 + highbd_butterfly_two_coeff_s32(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], + &out[3]); + + transpose_s32_4x4(&out[0], &out[1], &out[2], &out[3]); + + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; +} + +#endif // CONFIG_VP9_HIGHBITDEPTH #endif // VPX_VPX_DSP_ARM_FDCT_NEON_H_ diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index cbf0e6ea8d..c5514b14d3 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ 
b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -552,7 +552,7 @@ () specialize qw/vpx_fdct32x32_1 sse2 neon/; add_proto qw/void vpx_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_highbd_fdct4x4 sse2/; + specialize qw/vpx_highbd_fdct4x4 sse2 neon/; add_proto qw/void vpx_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_highbd_fdct8x8 sse2/; From 7142689f00e73d461b8d00347ee84da2ee420994 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Wed, 12 Oct 2022 10:26:43 -0700 Subject: [PATCH 453/926] Add vpx_highbd_sad64x{64,32}_avg_avx2. ~2.8x faster than the sse2 version. Bug: b/245917257 Change-Id: Ib727ba8a8c8fa4df450bafdde30ed99fd283f06d --- test/sad_test.cc | 6 +++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- vpx_dsp/x86/highbd_sad_avx2.c | 77 +++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+), 2 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index a3c2952d63..29e3f57f5e 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1120,18 +1120,24 @@ const SadMxNAvgParam avg_avx2_tests[] = { SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_avx2), SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_avx2), #if CONFIG_VP9_HIGHBITDEPTH + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_avx2, 8), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_avx2, 8), SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 8), SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 8), SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 8), SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 8), SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 8), SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 8), + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_avx2, 10), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_avx2, 10), SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 10), SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 10), SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 10), SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 10), SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 10), SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 10), + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_avx2, 12), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_avx2, 12), SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 12), SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 12), SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 12), diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 4db6de37b6..5fe9c12879 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -995,10 +995,10 @@ () add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max"; add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad64x64_avg sse2 neon/; + specialize qw/vpx_highbd_sad64x64_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad64x32_avg sse2 neon/; + specialize qw/vpx_highbd_sad64x32_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vpx_highbd_sad32x64_avg sse2 neon avx2/; diff --git 
a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c index 24ebe4e94a..7533ccfddb 100644 --- a/vpx_dsp/x86/highbd_sad_avx2.c +++ b/vpx_dsp/x86/highbd_sad_avx2.c @@ -225,6 +225,83 @@ unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src8_ptr, int src_stride, } // AVG ------------------------------------------------------------------------- +static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16, + const uint16_t *src, + int src_stride, uint16_t *ref, + int ref_stride, uint16_t *sec, + int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32)); + const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); + const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32)); + const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48)); + const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16)); + const __m256i x2 = _mm256_loadu_si256((const __m256i *)(sec + 32)); + const __m256i x3 = _mm256_loadu_si256((const __m256i *)(sec + 48)); + const __m256i avg0 = _mm256_avg_epu16(r0, x0); + const __m256i avg1 = _mm256_avg_epu16(r1, x1); + const __m256i avg2 = _mm256_avg_epu16(r2, x2); + const __m256i avg3 = _mm256_avg_epu16(r3, x3); + // absolute differences between every ref/pred avg to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1)); + const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(avg2, s2)); + const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(avg3, s3)); + // sum every abs diff + *sums_16 = + _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1)); + *sums_16 = + _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3)); + + src += src_stride; + ref += ref_stride; + sec += 64; + } +} + +#define HIGHBD_SAD64XN_AVG(n) \ + unsigned int vpx_highbd_sad64x##n##_avg_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); \ + __m256i sums_32 = _mm256_setzero_si256(); \ + int i; \ + \ + for (i = 0; i < (n / 2); ++i) { \ + __m256i sums_16 = _mm256_setzero_si256(); \ + \ + highbd_sad64xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 2); \ + \ + /* sums_16 will outrange after 2 rows, so add current sums_16 to \ + * sums_32*/ \ + sums_32 = _mm256_add_epi32( \ + sums_32, \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ + \ + src += src_stride << 1; \ + ref += ref_stride << 1; \ + sec += 64 << 1; \ + } \ + return calc_final(sums_32); \ + } + +// 64x64 +HIGHBD_SAD64XN_AVG(64) + +// 64x32 +HIGHBD_SAD64XN_AVG(32) + static VPX_FORCE_INLINE void highbd_sad32xH_avg(__m256i *sums_16, const uint16_t *src, int src_stride, uint16_t *ref, From a49f896352671870f38c1374f3d5329e3b60193f Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 6 Oct 2022 16:00:43 +0000 Subject: [PATCH 
454/926] [NEON] Add highbd FDCT 8x8 function 50% faster than C version in best/rt profiles Change-Id: I0f9504ed52b5d5f7722407e91108ed4056d66bc2 --- test/dct_test.cc | 12 +-- vpx_dsp/arm/fdct8x8_neon.c | 78 +++++++++++++++++++ vpx_dsp/arm/fdct_neon.h | 144 +++++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- 4 files changed, 229 insertions(+), 7 deletions(-) diff --git a/test/dct_test.cc b/test/dct_test.cc index e34122ac92..ff97fc7c35 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -543,12 +543,12 @@ INSTANTIATE_TEST_SUITE_P(AVX2, TransDCT, static const FuncInfo dct_neon_func_info[] = { { &fdct_wrapper, &highbd_idct_wrapper, 4, 2 }, - /* { &fdct_wrapper, - &highbd_idct_wrapper, 8, 2 }, - { &fdct_wrapper, - &highbd_idct_wrapper, 16, 2 }, - { &fdct_wrapper, - &highbd_idct_wrapper, 32, 2 },*/ + { &fdct_wrapper, + &highbd_idct_wrapper, 8, 2 }, + /* { &fdct_wrapper, + &highbd_idct_wrapper, 16, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 32, 2 },*/ }; #else static const FuncInfo dct_neon_func_info[4] = { diff --git a/vpx_dsp/arm/fdct8x8_neon.c b/vpx_dsp/arm/fdct8x8_neon.c index d9161c6d38..3fb15cc175 100644 --- a/vpx_dsp/arm/fdct8x8_neon.c +++ b/vpx_dsp/arm/fdct8x8_neon.c @@ -66,3 +66,81 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, store_s16q_to_tran_low(final_output + 7 * 8, in[7]); } } + +#if CONFIG_VP9_HIGHBITDEPTH + +void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, + int stride) { + int i; + + // input[M * stride] * 16 + int32x4_t left[8], right[8]; + int16x8_t in[8]; + in[0] = vld1q_s16(input + 0 * stride); + in[1] = vld1q_s16(input + 1 * stride); + in[2] = vld1q_s16(input + 2 * stride); + in[3] = vld1q_s16(input + 3 * stride); + in[4] = vld1q_s16(input + 4 * stride); + in[5] = vld1q_s16(input + 5 * stride); + in[6] = vld1q_s16(input + 6 * stride); + in[7] = vld1q_s16(input + 7 * stride); + + left[0] = vshll_n_s16(vget_low_s16(in[0]), 2); + left[1] = vshll_n_s16(vget_low_s16(in[1]), 2); + left[2] = vshll_n_s16(vget_low_s16(in[2]), 2); + left[3] = vshll_n_s16(vget_low_s16(in[3]), 2); + left[4] = vshll_n_s16(vget_low_s16(in[4]), 2); + left[5] = vshll_n_s16(vget_low_s16(in[5]), 2); + left[6] = vshll_n_s16(vget_low_s16(in[6]), 2); + left[7] = vshll_n_s16(vget_low_s16(in[7]), 2); + right[0] = vshll_n_s16(vget_high_s16(in[0]), 2); + right[1] = vshll_n_s16(vget_high_s16(in[1]), 2); + right[2] = vshll_n_s16(vget_high_s16(in[2]), 2); + right[3] = vshll_n_s16(vget_high_s16(in[3]), 2); + right[4] = vshll_n_s16(vget_high_s16(in[4]), 2); + right[5] = vshll_n_s16(vget_high_s16(in[5]), 2); + right[6] = vshll_n_s16(vget_high_s16(in[6]), 2); + right[7] = vshll_n_s16(vget_high_s16(in[7]), 2); + + for (i = 0; i < 2; ++i) { + vpx_highbd_fdct8x8_pass1_neon(left, right); + } + { + left[0] = highbd_add_round_shift_s32(left[0]); + left[1] = highbd_add_round_shift_s32(left[1]); + left[2] = highbd_add_round_shift_s32(left[2]); + left[3] = highbd_add_round_shift_s32(left[3]); + left[4] = highbd_add_round_shift_s32(left[4]); + left[5] = highbd_add_round_shift_s32(left[5]); + left[6] = highbd_add_round_shift_s32(left[6]); + left[7] = highbd_add_round_shift_s32(left[7]); + right[0] = highbd_add_round_shift_s32(right[0]); + right[1] = highbd_add_round_shift_s32(right[1]); + right[2] = highbd_add_round_shift_s32(right[2]); + right[3] = highbd_add_round_shift_s32(right[3]); + right[4] = highbd_add_round_shift_s32(right[4]); + right[5] = highbd_add_round_shift_s32(right[5]); + right[6] = highbd_add_round_shift_s32(right[6]); + right[7] = 
highbd_add_round_shift_s32(right[7]); + + // store results + vst1q_s32(final_output, left[0]); + vst1q_s32(final_output + 4, right[0]); + vst1q_s32(final_output + 8, left[1]); + vst1q_s32(final_output + 12, right[1]); + vst1q_s32(final_output + 16, left[2]); + vst1q_s32(final_output + 20, right[2]); + vst1q_s32(final_output + 24, left[3]); + vst1q_s32(final_output + 28, right[3]); + vst1q_s32(final_output + 32, left[4]); + vst1q_s32(final_output + 36, right[4]); + vst1q_s32(final_output + 40, left[5]); + vst1q_s32(final_output + 44, right[5]); + vst1q_s32(final_output + 48, left[6]); + vst1q_s32(final_output + 52, right[6]); + vst1q_s32(final_output + 56, left[7]); + vst1q_s32(final_output + 60, right[7]); + } +} + +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h index 68aeab3aa3..c100e709d5 100644 --- a/vpx_dsp/arm/fdct_neon.h +++ b/vpx_dsp/arm/fdct_neon.h @@ -342,6 +342,20 @@ static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) { } #if CONFIG_VP9_HIGHBITDEPTH +static INLINE int32x4_t highbd_add_round_shift_s32(int32x4_t x) { + const int32x2_t x_lo = vget_low_s32(x); + const int32x2_t x_hi = vget_high_s32(x); + const int64x2_t x64_lo = vmovl_s32(x_lo); + const int64x2_t x64_hi = vmovl_s32(x_hi); + + const int64x2_t sign_lo = (int64x2_t)vshrq_n_u64((uint64x2_t)x64_lo, 63); + const int64x2_t sign_hi = (int64x2_t)vshrq_n_u64((uint64x2_t)x64_hi, 63); + + const int64x2_t sum_lo = vaddq_s64(x64_lo, sign_lo); + const int64x2_t sum_hi = vaddq_s64(x64_hi, sign_hi); + return vcombine_s32(vshrn_n_s64(sum_lo, 1), vshrn_n_s64(sum_hi, 1)); +} + static INLINE void highbd_butterfly_one_coeff_s32(const int32x4_t a, const int32x4_t b, const tran_high_t c, @@ -413,5 +427,135 @@ static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) { in[3] = out[3]; } +static INLINE void vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t *left, + int32x4_t *right) { + int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4]; + + sl[0] = vaddq_s32(left[0], left[7]); + sl[1] = vaddq_s32(left[1], left[6]); + sl[2] = vaddq_s32(left[2], left[5]); + sl[3] = vaddq_s32(left[3], left[4]); + sl[4] = vsubq_s32(left[3], left[4]); + sl[5] = vsubq_s32(left[2], left[5]); + sl[6] = vsubq_s32(left[1], left[6]); + sl[7] = vsubq_s32(left[0], left[7]); + sr[0] = vaddq_s32(right[0], right[7]); + sr[1] = vaddq_s32(right[1], right[6]); + sr[2] = vaddq_s32(right[2], right[5]); + sr[3] = vaddq_s32(right[3], right[4]); + sr[4] = vsubq_s32(right[3], right[4]); + sr[5] = vsubq_s32(right[2], right[5]); + sr[6] = vsubq_s32(right[1], right[6]); + sr[7] = vsubq_s32(right[0], right[7]); + + // fdct4(step, step); + // x0 = s0 + s3; + xl[0] = vaddq_s32(sl[0], sl[3]); + xr[0] = vaddq_s32(sr[0], sr[3]); + // x1 = s1 + s2; + xl[1] = vaddq_s32(sl[1], sl[2]); + xr[1] = vaddq_s32(sr[1], sr[2]); + // x2 = s1 - s2; + xl[2] = vsubq_s32(sl[1], sl[2]); + xr[2] = vsubq_s32(sr[1], sr[2]); + // x3 = s0 - s3; + xl[3] = vsubq_s32(sl[0], sl[3]); + xr[3] = vsubq_s32(sr[0], sr[3]); + + // fdct4(step, step); + // t0 = (x0 + x1) * cospi_16_64; + // t1 = (x0 - x1) * cospi_16_64; + // out[0] = (tran_low_t)fdct_round_shift(t0); + // out[4] = (tran_low_t)fdct_round_shift(t1); + highbd_butterfly_one_coeff_s32(xl[0], xl[1], cospi_16_64, &left[0], &left[4]); + highbd_butterfly_one_coeff_s32(xr[0], xr[1], cospi_16_64, &right[0], + &right[4]); + // t2 = x2 * cospi_24_64 + x3 * cospi_8_64; + // t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; + // out[2] = (tran_low_t)fdct_round_shift(t2); + // out[6] = (tran_low_t)fdct_round_shift(t3); + 
highbd_butterfly_two_coeff_s32(xl[3], xl[2], cospi_8_64, cospi_24_64, + &left[2], &left[6]); + highbd_butterfly_two_coeff_s32(xr[3], xr[2], cospi_8_64, cospi_24_64, + &right[2], &right[6]); + + // Stage 2 + // t0 = (s6 - s5) * cospi_16_64; + highbd_butterfly_one_coeff_s32(sl[6], sl[5], cospi_16_64, &tl[1], &tl[0]); + highbd_butterfly_one_coeff_s32(sr[6], sr[5], cospi_16_64, &tr[1], &tr[0]); + + // Stage 3 + xl[0] = vaddq_s32(sl[4], tl[0]); + xr[0] = vaddq_s32(sr[4], tr[0]); + xl[1] = vsubq_s32(sl[4], tl[0]); + xr[1] = vsubq_s32(sr[4], tr[0]); + xl[2] = vsubq_s32(sl[7], tl[1]); + xr[2] = vsubq_s32(sr[7], tr[1]); + xl[3] = vaddq_s32(sl[7], tl[1]); + xr[3] = vaddq_s32(sr[7], tr[1]); + + // Stage 4 + // t0 = x0 * cospi_28_64 + x3 * cospi_4_64; + // out[1] = (tran_low_t)fdct_round_shift(t0); + // t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; + // out[7] = (tran_low_t)fdct_round_shift(t3); + highbd_butterfly_two_coeff_s32(xl[3], xl[0], cospi_4_64, cospi_28_64, + &left[1], &left[7]); + highbd_butterfly_two_coeff_s32(xr[3], xr[0], cospi_4_64, cospi_28_64, + &right[1], &right[7]); + + // t1 = x1 * cospi_12_64 + x2 * cospi_20_64; + // out[5] = (tran_low_t)fdct_round_shift(t1); + // t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; + // out[3] = (tran_low_t)fdct_round_shift(t2); + highbd_butterfly_two_coeff_s32(xl[2], xl[1], cospi_20_64, cospi_12_64, + &left[5], &left[3]); + highbd_butterfly_two_coeff_s32(xr[2], xr[1], cospi_20_64, cospi_12_64, + &right[5], &right[3]); +} + +static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left, + int32x4_t *right) { + int32x4x2_t out[8]; + vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right); + + out[0].val[0] = left[0]; + out[0].val[1] = right[0]; + out[1].val[0] = left[1]; + out[1].val[1] = right[1]; + out[2].val[0] = left[2]; + out[2].val[1] = right[2]; + out[3].val[0] = left[3]; + out[3].val[1] = right[3]; + out[4].val[0] = left[4]; + out[4].val[1] = right[4]; + out[5].val[0] = left[5]; + out[5].val[1] = right[5]; + out[6].val[0] = left[6]; + out[6].val[1] = right[6]; + out[7].val[0] = left[7]; + out[7].val[1] = right[7]; + + transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); + + left[0] = out[0].val[0]; + right[0] = out[0].val[1]; + left[1] = out[1].val[0]; + right[1] = out[1].val[1]; + left[2] = out[2].val[0]; + right[2] = out[2].val[1]; + left[3] = out[3].val[0]; + right[3] = out[3].val[1]; + left[4] = out[4].val[0]; + right[4] = out[4].val[1]; + left[5] = out[5].val[0]; + right[5] = out[5].val[1]; + left[6] = out[6].val[0]; + right[6] = out[6].val[1]; + left[7] = out[7].val[0]; + right[7] = out[7].val[1]; +} + #endif // CONFIG_VP9_HIGHBITDEPTH #endif // VPX_VPX_DSP_ARM_FDCT_NEON_H_ diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index c5514b14d3..e886c0ae45 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -555,7 +555,7 @@ () specialize qw/vpx_highbd_fdct4x4 sse2 neon/; add_proto qw/void vpx_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_highbd_fdct8x8 sse2/; + specialize qw/vpx_highbd_fdct8x8 sse2 neon/; add_proto qw/void vpx_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_highbd_fdct8x8_1 neon/; From 45b280eb0fa80404321100cee2de1bcea413913e Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 7 Oct 2022 15:13:29 +0000 Subject: [PATCH 455/926] [NEON] Add highbd FDCT 16x16 function 90-95% faster than C version in best/rt profiles Change-Id: 
I41d5e9acdc348b57153637ec736498a25ed84c25 --- test/dct16x16_test.cc | 12 +- test/dct_test.cc | 8 +- vpx_dsp/arm/fdct16x16_neon.c | 53 +++++ vpx_dsp/arm/fdct16x16_neon.h | 395 +++++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- 5 files changed, 464 insertions(+), 6 deletions(-) diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc index 06837d809d..d4ef7ae13d 100644 --- a/test/dct16x16_test.cc +++ b/test/dct16x16_test.cc @@ -789,13 +789,23 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8))); #endif // CONFIG_VP9_HIGHBITDEPTH -#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE +#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_SUITE_P( NEON, Trans16x16DCT, ::testing::Values(make_tuple(&vpx_fdct16x16_neon, &vpx_idct16x16_256_add_neon, 0, VPX_BITS_8))); #endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE +#if HAVE_NEON && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P( + NEON, Trans16x16DCT, + ::testing::Values( + make_tuple(&vpx_highbd_fdct16x16_neon, &idct16x16_10, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct16x16_neon, &idct16x16_12, 0, VPX_BITS_12), + make_tuple(&vpx_fdct16x16_neon, &vpx_idct16x16_256_add_c, 0, + VPX_BITS_8))); +#endif // HAVE_NEON && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_SUITE_P( SSE2, Trans16x16DCT, diff --git a/test/dct_test.cc b/test/dct_test.cc index ff97fc7c35..910d288bd5 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -545,10 +545,10 @@ static const FuncInfo dct_neon_func_info[] = { &highbd_idct_wrapper, 4, 2 }, { &fdct_wrapper, &highbd_idct_wrapper, 8, 2 }, - /* { &fdct_wrapper, - &highbd_idct_wrapper, 16, 2 }, - { &fdct_wrapper, - &highbd_idct_wrapper, 32, 2 },*/ + { &fdct_wrapper, + &highbd_idct_wrapper, 16, 2 }, + /* { &fdct_wrapper, + &highbd_idct_wrapper, 32, 2 },*/ }; #else static const FuncInfo dct_neon_func_info[4] = { diff --git a/vpx_dsp/arm/fdct16x16_neon.c b/vpx_dsp/arm/fdct16x16_neon.c index 0b0ce223db..d0c07d429a 100644 --- a/vpx_dsp/arm/fdct16x16_neon.c +++ b/vpx_dsp/arm/fdct16x16_neon.c @@ -74,5 +74,58 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { store(output, temp1); store(output + 8, temp1 + 8); } + +#if CONFIG_VP9_HIGHBITDEPTH + +void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output, + int stride) { + int16x8_t temp0[16]; + int32x4_t left1[16], left2[16], left3[16], left4[16], right1[16], right2[16], + right3[16], right4[16]; + + // Left half. + load_cross(input, stride, temp0); + highbd_scale_input(temp0, left1, right1); + vpx_highbd_fdct16x16_body(left1, right1); + + // right half. + load_cross(input + 8, stride, temp0); + highbd_scale_input(temp0, left2, right2); + vpx_highbd_fdct16x16_body(left2, right2); + + // Transpose top left and top right quarters into one contiguous location to + // process to the top half. + + transpose_s32_8x8_2(left1, right1, left3, right3); + transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8); + transpose_s32_8x8_2(left1 + 8, right1 + 8, left4, right4); + transpose_s32_8x8_2(left2 + 8, right2 + 8, left4 + 8, right4 + 8); + + highbd_partial_round_shift(left3, right3); + highbd_cross_input(left3, right3, left1, right1); + vpx_highbd_fdct16x16_body(left1, right1); + + // Transpose bottom left and bottom right quarters into one contiguous + // location to process to the bottom half. 
+ + highbd_partial_round_shift(left4, right4); + highbd_cross_input(left4, right4, left2, right2); + vpx_highbd_fdct16x16_body(left2, right2); + + transpose_s32_8x8_2(left1, right1, left3, right3); + transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8); + transpose_s32_8x8_2(left1 + 8, right1 + 8, left4, right4); + transpose_s32_8x8_2(left2 + 8, right2 + 8, left4 + 8, right4 + 8); + store16_s32(output, left3); + output += 4; + store16_s32(output, right3); + output += 4; + + store16_s32(output, left4); + output += 4; + store16_s32(output, right4); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + #endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && // __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ < 4 diff --git a/vpx_dsp/arm/fdct16x16_neon.h b/vpx_dsp/arm/fdct16x16_neon.h index 7fc2c6e7e8..d99870903b 100644 --- a/vpx_dsp/arm/fdct16x16_neon.h +++ b/vpx_dsp/arm/fdct16x16_neon.h @@ -273,4 +273,399 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, &out[11]); } +#if CONFIG_VP9_HIGHBITDEPTH + +static INLINE void highbd_scale_input(const int16x8_t *a /*[16]*/, + int32x4_t *left /*[16]*/, + int32x4_t *right /* [16] */) { + left[0] = vshll_n_s16(vget_low_s16(a[0]), 2); + right[0] = vshll_n_s16(vget_high_s16(a[0]), 2); + left[1] = vshll_n_s16(vget_low_s16(a[1]), 2); + right[1] = vshll_n_s16(vget_high_s16(a[1]), 2); + left[2] = vshll_n_s16(vget_low_s16(a[2]), 2); + right[2] = vshll_n_s16(vget_high_s16(a[2]), 2); + left[3] = vshll_n_s16(vget_low_s16(a[3]), 2); + right[3] = vshll_n_s16(vget_high_s16(a[3]), 2); + left[4] = vshll_n_s16(vget_low_s16(a[4]), 2); + right[4] = vshll_n_s16(vget_high_s16(a[4]), 2); + left[5] = vshll_n_s16(vget_low_s16(a[5]), 2); + right[5] = vshll_n_s16(vget_high_s16(a[5]), 2); + left[6] = vshll_n_s16(vget_low_s16(a[6]), 2); + right[6] = vshll_n_s16(vget_high_s16(a[6]), 2); + left[7] = vshll_n_s16(vget_low_s16(a[7]), 2); + right[7] = vshll_n_s16(vget_high_s16(a[7]), 2); + left[8] = vshll_n_s16(vget_low_s16(a[8]), 2); + right[8] = vshll_n_s16(vget_high_s16(a[8]), 2); + left[9] = vshll_n_s16(vget_low_s16(a[9]), 2); + right[9] = vshll_n_s16(vget_high_s16(a[9]), 2); + left[10] = vshll_n_s16(vget_low_s16(a[10]), 2); + right[10] = vshll_n_s16(vget_high_s16(a[10]), 2); + left[11] = vshll_n_s16(vget_low_s16(a[11]), 2); + right[11] = vshll_n_s16(vget_high_s16(a[11]), 2); + left[12] = vshll_n_s16(vget_low_s16(a[12]), 2); + right[12] = vshll_n_s16(vget_high_s16(a[12]), 2); + left[13] = vshll_n_s16(vget_low_s16(a[13]), 2); + right[13] = vshll_n_s16(vget_high_s16(a[13]), 2); + left[14] = vshll_n_s16(vget_low_s16(a[14]), 2); + right[14] = vshll_n_s16(vget_high_s16(a[14]), 2); + left[15] = vshll_n_s16(vget_low_s16(a[15]), 2); + right[15] = vshll_n_s16(vget_high_s16(a[15]), 2); +} + +static INLINE void highbd_cross_input(const int32x4_t *a_left /*[16]*/, + int32x4_t *a_right /*[16]*/, + int32x4_t *b_left /*[16]*/, + int32x4_t *b_right /*[16]*/) { + b_left[0] = vaddq_s32(a_left[0], a_left[15]); + b_left[1] = vaddq_s32(a_left[1], a_left[14]); + b_left[2] = vaddq_s32(a_left[2], a_left[13]); + b_left[3] = vaddq_s32(a_left[3], a_left[12]); + b_left[4] = vaddq_s32(a_left[4], a_left[11]); + b_left[5] = vaddq_s32(a_left[5], a_left[10]); + b_left[6] = vaddq_s32(a_left[6], a_left[9]); + b_left[7] = vaddq_s32(a_left[7], a_left[8]); + + b_right[0] = vaddq_s32(a_right[0], a_right[15]); + b_right[1] = vaddq_s32(a_right[1], a_right[14]); + b_right[2] = vaddq_s32(a_right[2], a_right[13]); + b_right[3] = vaddq_s32(a_right[3], a_right[12]); + b_right[4] = 
vaddq_s32(a_right[4], a_right[11]); + b_right[5] = vaddq_s32(a_right[5], a_right[10]); + b_right[6] = vaddq_s32(a_right[6], a_right[9]); + b_right[7] = vaddq_s32(a_right[7], a_right[8]); + + b_left[8] = vsubq_s32(a_left[7], a_left[8]); + b_left[9] = vsubq_s32(a_left[6], a_left[9]); + b_left[10] = vsubq_s32(a_left[5], a_left[10]); + b_left[11] = vsubq_s32(a_left[4], a_left[11]); + b_left[12] = vsubq_s32(a_left[3], a_left[12]); + b_left[13] = vsubq_s32(a_left[2], a_left[13]); + b_left[14] = vsubq_s32(a_left[1], a_left[14]); + b_left[15] = vsubq_s32(a_left[0], a_left[15]); + + b_right[8] = vsubq_s32(a_right[7], a_right[8]); + b_right[9] = vsubq_s32(a_right[6], a_right[9]); + b_right[10] = vsubq_s32(a_right[5], a_right[10]); + b_right[11] = vsubq_s32(a_right[4], a_right[11]); + b_right[12] = vsubq_s32(a_right[3], a_right[12]); + b_right[13] = vsubq_s32(a_right[2], a_right[13]); + b_right[14] = vsubq_s32(a_right[1], a_right[14]); + b_right[15] = vsubq_s32(a_right[0], a_right[15]); +} + +static INLINE void highbd_partial_round_shift(int32x4_t *left /*[16]*/, + int32x4_t *right /* [16] */) { + const int32x4_t one = vdupq_n_s32(1); + left[0] = vshrq_n_s32(vaddq_s32(left[0], one), 2); + right[0] = vshrq_n_s32(vaddq_s32(right[0], one), 2); + left[1] = vshrq_n_s32(vaddq_s32(left[1], one), 2); + right[1] = vshrq_n_s32(vaddq_s32(right[1], one), 2); + left[2] = vshrq_n_s32(vaddq_s32(left[2], one), 2); + right[2] = vshrq_n_s32(vaddq_s32(right[2], one), 2); + left[3] = vshrq_n_s32(vaddq_s32(left[3], one), 2); + right[3] = vshrq_n_s32(vaddq_s32(right[3], one), 2); + left[4] = vshrq_n_s32(vaddq_s32(left[4], one), 2); + right[4] = vshrq_n_s32(vaddq_s32(right[4], one), 2); + left[5] = vshrq_n_s32(vaddq_s32(left[5], one), 2); + right[5] = vshrq_n_s32(vaddq_s32(right[5], one), 2); + left[6] = vshrq_n_s32(vaddq_s32(left[6], one), 2); + right[6] = vshrq_n_s32(vaddq_s32(right[6], one), 2); + left[7] = vshrq_n_s32(vaddq_s32(left[7], one), 2); + right[7] = vshrq_n_s32(vaddq_s32(right[7], one), 2); + left[8] = vshrq_n_s32(vaddq_s32(left[8], one), 2); + right[8] = vshrq_n_s32(vaddq_s32(right[8], one), 2); + left[9] = vshrq_n_s32(vaddq_s32(left[9], one), 2); + right[9] = vshrq_n_s32(vaddq_s32(right[9], one), 2); + left[10] = vshrq_n_s32(vaddq_s32(left[10], one), 2); + right[10] = vshrq_n_s32(vaddq_s32(right[10], one), 2); + left[11] = vshrq_n_s32(vaddq_s32(left[11], one), 2); + right[11] = vshrq_n_s32(vaddq_s32(right[11], one), 2); + left[12] = vshrq_n_s32(vaddq_s32(left[12], one), 2); + right[12] = vshrq_n_s32(vaddq_s32(right[12], one), 2); + left[13] = vshrq_n_s32(vaddq_s32(left[13], one), 2); + right[13] = vshrq_n_s32(vaddq_s32(right[13], one), 2); + left[14] = vshrq_n_s32(vaddq_s32(left[14], one), 2); + right[14] = vshrq_n_s32(vaddq_s32(right[14], one), 2); + left[15] = vshrq_n_s32(vaddq_s32(left[15], one), 2); + right[15] = vshrq_n_s32(vaddq_s32(right[15], one), 2); +} + +static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/, + int32x4_t *right /*[8]*/, + int32x4_t *out_left /*[8]*/, + int32x4_t *out_right /*[8]*/) { + int32x4x2_t out[8]; + + out[0].val[0] = left[0]; + out[0].val[1] = right[0]; + out[1].val[0] = left[1]; + out[1].val[1] = right[1]; + out[2].val[0] = left[2]; + out[2].val[1] = right[2]; + out[3].val[0] = left[3]; + out[3].val[1] = right[3]; + out[4].val[0] = left[4]; + out[4].val[1] = right[4]; + out[5].val[0] = left[5]; + out[5].val[1] = right[5]; + out[6].val[0] = left[6]; + out[6].val[1] = right[6]; + out[7].val[0] = left[7]; + out[7].val[1] = right[7]; + + transpose_s32_8x8(&out[0], 
&out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); + + out_left[0] = out[0].val[0]; + out_left[1] = out[1].val[0]; + out_left[2] = out[2].val[0]; + out_left[3] = out[3].val[0]; + out_left[4] = out[4].val[0]; + out_left[5] = out[5].val[0]; + out_left[6] = out[6].val[0]; + out_left[7] = out[7].val[0]; + out_right[0] = out[0].val[1]; + out_right[1] = out[1].val[1]; + out_right[2] = out[2].val[1]; + out_right[3] = out[3].val[1]; + out_right[4] = out[4].val[1]; + out_right[5] = out[5].val[1]; + out_right[6] = out[6].val[1]; + out_right[7] = out[7].val[1]; +} + +// Store 16 32x4 vectors, assuming stride == 16. +static INLINE void store16_s32(tran_low_t *a, const int32x4_t *b /*[32]*/) { + vst1q_s32(a, b[0]); + a += 16; + vst1q_s32(a, b[1]); + a += 16; + vst1q_s32(a, b[2]); + a += 16; + vst1q_s32(a, b[3]); + a += 16; + vst1q_s32(a, b[4]); + a += 16; + vst1q_s32(a, b[5]); + a += 16; + vst1q_s32(a, b[6]); + a += 16; + vst1q_s32(a, b[7]); + a += 16; + vst1q_s32(a, b[8]); + a += 16; + vst1q_s32(a, b[9]); + a += 16; + vst1q_s32(a, b[10]); + a += 16; + vst1q_s32(a, b[11]); + a += 16; + vst1q_s32(a, b[12]); + a += 16; + vst1q_s32(a, b[13]); + a += 16; + vst1q_s32(a, b[14]); + a += 16; + vst1q_s32(a, b[15]); +} + +// Main body of fdct16x16. +static void vpx_highbd_fdct16x16_body(int32x4_t *left /*[16]*/, + int32x4_t *right /* [16] */) { + int32x4_t sl[8]; + int32x4_t sr[8]; + int32x4_t xl[4]; + int32x4_t xr[4]; + int32x4_t inl[8]; + int32x4_t inr[8]; + int32x4_t stepl[8]; + int32x4_t stepr[8]; + + // stage 1 + // From fwd_txfm.c: Work on the first eight values; fdct8(input, + // even_results);" + sl[0] = vaddq_s32(left[0], left[7]); + sr[0] = vaddq_s32(right[0], right[7]); + sl[1] = vaddq_s32(left[1], left[6]); + sr[1] = vaddq_s32(right[1], right[6]); + sl[2] = vaddq_s32(left[2], left[5]); + sr[2] = vaddq_s32(right[2], right[5]); + sl[3] = vaddq_s32(left[3], left[4]); + sr[3] = vaddq_s32(right[3], right[4]); + sl[4] = vsubq_s32(left[3], left[4]); + sr[4] = vsubq_s32(right[3], right[4]); + sl[5] = vsubq_s32(left[2], left[5]); + sr[5] = vsubq_s32(right[2], right[5]); + sl[6] = vsubq_s32(left[1], left[6]); + sr[6] = vsubq_s32(right[1], right[6]); + sl[7] = vsubq_s32(left[0], left[7]); + sr[7] = vsubq_s32(right[0], right[7]); + + // Copy values 8-15 as we're storing in-place + inl[0] = left[8]; + inr[0] = right[8]; + inl[1] = left[9]; + inr[1] = right[9]; + inl[2] = left[10]; + inr[2] = right[10]; + inl[3] = left[11]; + inr[3] = right[11]; + inl[4] = left[12]; + inr[4] = right[12]; + inl[5] = left[13]; + inr[5] = right[13]; + inl[6] = left[14]; + inr[6] = right[14]; + inl[7] = left[15]; + inr[7] = right[15]; + + // fdct4(step, step); + xl[0] = vaddq_s32(sl[0], sl[3]); + xr[0] = vaddq_s32(sr[0], sr[3]); + xl[1] = vaddq_s32(sl[1], sl[2]); + xr[1] = vaddq_s32(sr[1], sr[2]); + xl[2] = vsubq_s32(sl[1], sl[2]); + xr[2] = vsubq_s32(sr[1], sr[2]); + xl[3] = vsubq_s32(sl[0], sl[3]); + xr[3] = vsubq_s32(sr[0], sr[3]); + + // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) + // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) + highbd_butterfly_one_coeff_s32(xl[0], xl[1], cospi_16_64, &left[0], &left[8]); + highbd_butterfly_one_coeff_s32(xr[0], xr[1], cospi_16_64, &right[0], + &right[8]); + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); + highbd_butterfly_two_coeff_s32(xl[3], xl[2], cospi_8_64, cospi_24_64, + &left[4], &left[12]); + highbd_butterfly_two_coeff_s32(xr[3], xr[2], cospi_8_64, cospi_24_64, + 
&right[4], &right[12]); + + // Stage 2 + // Re-using source s5/s6 + // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) + // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) + highbd_butterfly_one_coeff_s32(sl[6], sl[5], cospi_16_64, &sl[6], &sl[5]); + highbd_butterfly_one_coeff_s32(sr[6], sr[5], cospi_16_64, &sr[6], &sr[5]); + + // Stage 3 + xl[0] = vaddq_s32(sl[4], sl[5]); + xr[0] = vaddq_s32(sr[4], sr[5]); + xl[1] = vsubq_s32(sl[4], sl[5]); + xr[1] = vsubq_s32(sr[4], sr[5]); + xl[2] = vsubq_s32(sl[7], sl[6]); + xr[2] = vsubq_s32(sr[7], sr[6]); + xl[3] = vaddq_s32(sl[7], sl[6]); + xr[3] = vaddq_s32(sr[7], sr[6]); + + // Stage 4 + // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + highbd_butterfly_two_coeff_s32(xl[3], xl[0], cospi_4_64, cospi_28_64, + &left[2], &left[14]); + highbd_butterfly_two_coeff_s32(xr[3], xr[0], cospi_4_64, cospi_28_64, + &right[2], &right[14]); + // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + highbd_butterfly_two_coeff_s32(xl[2], xl[1], cospi_20_64, cospi_12_64, + &left[10], &left[6]); + highbd_butterfly_two_coeff_s32(xr[2], xr[1], cospi_20_64, cospi_12_64, + &right[10], &right[6]); + + // step 2 + // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" + // That file distinguished between "in_high" and "step1" but the only + // difference is that "in_high" is the first 8 values and "step 1" is the + // second. Here, since they are all in one array, "step1" values are += 8. + + // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) + // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) + // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) + // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) + highbd_butterfly_one_coeff_s32(inl[5], inl[2], cospi_16_64, &sl[5], &sl[2]); + highbd_butterfly_one_coeff_s32(inr[5], inr[2], cospi_16_64, &sr[5], &sr[2]); + highbd_butterfly_one_coeff_s32(inl[4], inl[3], cospi_16_64, &sl[4], &sl[3]); + highbd_butterfly_one_coeff_s32(inr[4], inr[3], cospi_16_64, &sr[4], &sr[3]); + + // step 3 + sl[0] = vaddq_s32(inl[0], sl[3]); + sr[0] = vaddq_s32(inr[0], sr[3]); + sl[1] = vaddq_s32(inl[1], sl[2]); + sr[1] = vaddq_s32(inr[1], sr[2]); + xl[0] = vsubq_s32(inl[1], sl[2]); + xr[0] = vsubq_s32(inr[1], sr[2]); + xl[1] = vsubq_s32(inl[0], sl[3]); + xr[1] = vsubq_s32(inr[0], sr[3]); + xl[2] = vsubq_s32(inl[7], sl[4]); + xr[2] = vsubq_s32(inr[7], sr[4]); + xl[3] = vsubq_s32(inl[6], sl[5]); + xr[3] = vsubq_s32(inr[6], sr[5]); + sl[6] = vaddq_s32(inl[6], sl[5]); + sr[6] = vaddq_s32(inr[6], sr[5]); + sl[7] = vaddq_s32(inl[7], sl[4]); + sr[7] = vaddq_s32(inr[7], sr[4]); + + // step 4 + // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64) + // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64) + highbd_butterfly_two_coeff_s32(sl[6], sl[1], cospi_8_64, cospi_24_64, &sl[6], + &sl[1]); + highbd_butterfly_two_coeff_s32(sr[6], sr[1], cospi_8_64, cospi_24_64, &sr[6], + &sr[1]); + + // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64) + highbd_butterfly_two_coeff_s32(xl[0], xl[3], cospi_24_64, cospi_8_64, &sl[2], + &sl[5]); + highbd_butterfly_two_coeff_s32(xr[0], xr[3], cospi_24_64, cospi_8_64, &sr[2], + &sr[5]); + + // step 5 + stepl[0] = vaddq_s32(sl[0], 
sl[1]);
+  stepr[0] = vaddq_s32(sr[0], sr[1]);
+  stepl[1] = vsubq_s32(sl[0], sl[1]);
+  stepr[1] = vsubq_s32(sr[0], sr[1]);
+  stepl[2] = vaddq_s32(xl[1], sl[2]);
+  stepr[2] = vaddq_s32(xr[1], sr[2]);
+  stepl[3] = vsubq_s32(xl[1], sl[2]);
+  stepr[3] = vsubq_s32(xr[1], sr[2]);
+  stepl[4] = vsubq_s32(xl[2], sl[5]);
+  stepr[4] = vsubq_s32(xr[2], sr[5]);
+  stepl[5] = vaddq_s32(xl[2], sl[5]);
+  stepr[5] = vaddq_s32(xr[2], sr[5]);
+  stepl[6] = vsubq_s32(sl[7], sl[6]);
+  stepr[6] = vsubq_s32(sr[7], sr[6]);
+  stepl[7] = vaddq_s32(sl[7], sl[6]);
+  stepr[7] = vaddq_s32(sr[7], sr[6]);
+
+  // step 6
+  // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64)
+  // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64)
+  // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64)
+  // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64)
+  // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64)
+  // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] * cospi_22_64)
+  // out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * cospi_26_64)
+  // out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * cospi_6_64)
+  highbd_butterfly_two_coeff_s32(stepl[7], stepl[0], cospi_2_64, cospi_30_64,
+                                 &left[1], &left[15]);
+  highbd_butterfly_two_coeff_s32(stepr[7], stepr[0], cospi_2_64, cospi_30_64,
+                                 &right[1], &right[15]);
+  highbd_butterfly_two_coeff_s32(stepl[6], stepl[1], cospi_18_64, cospi_14_64,
+                                 &left[9], &left[7]);
+  highbd_butterfly_two_coeff_s32(stepr[6], stepr[1], cospi_18_64, cospi_14_64,
+                                 &right[9], &right[7]);
+  highbd_butterfly_two_coeff_s32(stepl[5], stepl[2], cospi_10_64, cospi_22_64,
+                                 &left[5], &left[11]);
+  highbd_butterfly_two_coeff_s32(stepr[5], stepr[2], cospi_10_64, cospi_22_64,
+                                 &right[5], &right[11]);
+  highbd_butterfly_two_coeff_s32(stepl[4], stepl[3], cospi_26_64, cospi_6_64,
+                                 &left[13], &left[3]);
+  highbd_butterfly_two_coeff_s32(stepr[4], stepr[3], cospi_26_64, cospi_6_64,
+                                 &right[13], &right[3]);
+}
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 #endif  // VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index ca904aa504..c82be0a6cc 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -562,7 +562,7 @@ ()
   $vpx_highbd_fdct8x8_1_neon=vpx_fdct8x8_1_neon;

   add_proto qw/void vpx_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_highbd_fdct16x16 sse2/;
+  specialize qw/vpx_highbd_fdct16x16 sse2 neon/;

   add_proto qw/void vpx_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vpx_highbd_fdct16x16_1 neon/;

From 124e57be95dbb05ee6b5b1554e345b97a4ddf34e Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Thu, 13 Oct 2022 15:19:46 +0000
Subject: [PATCH 456/926] [NEON] fix clang compile warnings

Change-Id: Ib7ce7a774ec89ba51169ea64d24c878109ef07d1
---
 vpx_dsp/arm/fdct_neon.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h
index c100e709d5..ce669061d2 100644
--- a/vpx_dsp/arm/fdct_neon.h
+++ b/vpx_dsp/arm/fdct_neon.h
@@ -358,7 +358,7 @@ static INLINE int32x4_t highbd_add_round_shift_s32(int32x4_t x) {

 static INLINE void highbd_butterfly_one_coeff_s32(const int32x4_t a,
                                                   const int32x4_t b,
-                                                  const tran_high_t c,
+                                                  const tran_coef_t c,
                                                   int32x4_t *add,
                                                   int32x4_t *sub) {
   const int32x2_t a_lo = vget_low_s32(a);

From
4007a057fcb60f57f7b60ecc888c4d4b9043b2db Mon Sep 17 00:00:00 2001
From: Marco Paniconi
Date: Wed, 12 Oct 2022 00:10:47 -0700
Subject: [PATCH 457/926] Fix to VP8 external RC for dynamic update of layers

On change/update of rc_cfg: when the number of temporal layers
changes, call vp8_reset_temporal_layer_change(), which in turn will
call vp8_init_temporal_layer_context() only for the new layers.

Bug:b/249644737
Change-Id: Ib20d746c7eacd10b78806ca6a5362c750d9ca0b3
---
 vp8/encoder/onyx_if.c   |  6 +++---
 vp8/encoder/onyx_int.h  |  2 ++
 vp8/vp8_ratectrl_rtc.cc | 29 ++++++++++++++++++++++-------
 3 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 94fb6e256e..4bbeadef01 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -328,8 +328,8 @@ void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf,
 // for any "new" layers. For "existing" layers, let them inherit the parameters
 // from the previous layer state (at the same layer #). In future we may want
 // to better map the previous layer state(s) to the "new" ones.
-static void reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf,
-                                        const int prev_num_layers) {
+void vp8_reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf,
+                                     const int prev_num_layers) {
   int i;
   double prev_layer_framerate = 0;
   const int curr_num_layers = cpi->oxcf.number_of_layers;
@@ -1643,7 +1643,7 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) {
       cpi->temporal_layer_id = 0;
     }
     cpi->temporal_pattern_counter = 0;
-    reset_temporal_layer_change(cpi, oxcf, prev_number_of_layers);
+    vp8_reset_temporal_layer_change(cpi, oxcf, prev_number_of_layers);
   }
 
   if (!cpi->initial_width) {
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 7951f0a77e..46a17913ad 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -713,6 +713,8 @@ void vp8_initialize_enc(void);
 
 void vp8_alloc_compressor_data(VP8_COMP *cpi);
 int vp8_reverse_trans(int x);
+void vp8_reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf,
+                                     const int prev_num_layers);
 void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf,
                                      const int layer,
                                      double prev_layer_framerate);
diff --git a/vp8/vp8_ratectrl_rtc.cc b/vp8/vp8_ratectrl_rtc.cc
index 2f23c5b1d9..46621546dd 100644
--- a/vp8/vp8_ratectrl_rtc.cc
+++ b/vp8/vp8_ratectrl_rtc.cc
@@ -92,6 +92,7 @@ void VP8RateControlRTC::UpdateRateControl(
     const VP8RateControlRtcConfig &rc_cfg) {
   VP8_COMMON *cm = &cpi_->common;
   VP8_CONFIG *oxcf = &cpi_->oxcf;
+  const unsigned int prev_number_of_layers = oxcf->number_of_layers;
   vpx_clear_system_state();
   cm->Width = rc_cfg.width;
   cm->Height = rc_cfg.height;
@@ -124,17 +125,33 @@ void VP8RateControlRTC::UpdateRateControl(
         static_cast<double>(cpi_->output_framerate);
   }
 
-  if (oxcf->number_of_layers > 1) {
+  if (oxcf->number_of_layers > 1 || prev_number_of_layers > 1) {
     memcpy(oxcf->target_bitrate, rc_cfg.layer_target_bitrate,
            sizeof(rc_cfg.layer_target_bitrate));
     memcpy(oxcf->rate_decimator, rc_cfg.ts_rate_decimator,
            sizeof(rc_cfg.ts_rate_decimator));
-    oxcf->periodicity = 2;
+    if (cm->current_video_frame == 0) {
+      double prev_layer_framerate = 0;
+      for (unsigned int i = 0; i < oxcf->number_of_layers; ++i) {
+        vp8_init_temporal_layer_context(cpi_, oxcf, i, prev_layer_framerate);
+        prev_layer_framerate = cpi_->output_framerate / oxcf->rate_decimator[i];
+      }
+    } else if (oxcf->number_of_layers != prev_number_of_layers) {
+      // The number of temporal layers has changed, so reset/initialize the
+      // temporal layer context for
the new layer configuration: this means
+      // calling vp8_reset_temporal_layer_change() below.
+
+      // Start at the base of the pattern cycle, so set the layer id to 0 and
+      // reset the temporal pattern counter.
+      // TODO(marpan/jianj): don't think lines 148-151 are needed (user controls
+      // the layer_id) so remove.
+      if (cpi_->temporal_layer_id > 0) {
+        cpi_->temporal_layer_id = 0;
+      }
+      cpi_->temporal_pattern_counter = 0;
 
-    double prev_layer_framerate = 0;
-    for (unsigned int i = 0; i < oxcf->number_of_layers; ++i) {
-      vp8_init_temporal_layer_context(cpi_, oxcf, i, prev_layer_framerate);
-      prev_layer_framerate = cpi_->output_framerate / oxcf->rate_decimator[i];
+      vp8_reset_temporal_layer_change(cpi_, oxcf,
+                                      static_cast<int>(prev_number_of_layers));
     }
   }

From 828d05d4a497e7f5d635a8e5bf35fca790759b13 Mon Sep 17 00:00:00 2001
From: Johann
Date: Sat, 1 Oct 2022 11:47:05 +0900
Subject: [PATCH 458/926] vp9 quantize: rewrite ssse3 in intrinsics

Change-Id: I3177251a5935453a23a23c39ea5f6fd41254775e
---
 test/vp9_quantize_test.cc                     |  11 -
 vp9/common/vp9_rtcd_defs.pl                   |   4 +-
 vp9/encoder/x86/vp9_quantize_ssse3.c          | 253 ++++++++++++++++++
 vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm | 174 ------------
 vp9/vp9cx.mk                                  |   5 +-
 5 files changed, 256 insertions(+), 191 deletions(-)
 create mode 100644 vp9/encoder/x86/vp9_quantize_ssse3.c
 delete mode 100644 vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm

diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc
index 7bb0bee512..587cec6923 100644
--- a/test/vp9_quantize_test.cc
+++ b/test/vp9_quantize_test.cc
@@ -539,7 +539,6 @@ INSTANTIATE_TEST_SUITE_P(
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
-#if VPX_ARCH_X86_64
 INSTANTIATE_TEST_SUITE_P(
     SSSE3, VP9QuantizeTest,
     ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c,
@@ -553,16 +552,6 @@ INSTANTIATE_TEST_SUITE_P(
                       make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_ssse3>,
                                  &QuantFPWrapper<quantize_fp_32x32_nz_c>,
                                  VPX_BITS_8, 32, true)));
-#else
-INSTANTIATE_TEST_SUITE_P(
-    SSSE3, VP9QuantizeTest,
-    ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c,
-                                 VPX_BITS_8, 16, false),
-                      make_tuple(&vpx_quantize_b_32x32_ssse3,
-                                 &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
-                                 false)));
-
-#endif  // VPX_ARCH_X86_64
 #endif  // HAVE_SSSE3
 
 #if HAVE_AVX
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 4290c2380e..871e4d0a35 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -129,10 +129,10 @@ ()
add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";

add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_fp neon sse2 avx2 vsx/, "$ssse3_x86_64";
+specialize qw/vp9_quantize_fp neon sse2 ssse3 avx2 vsx/;

add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_fp_32x32 neon avx2 vsx/, "$ssse3_x86_64";
+specialize qw/vp9_quantize_fp_32x32 neon ssse3 avx2 vsx/;

if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  specialize qw/vp9_block_error avx2 sse2/;
diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.c b/vp9/encoder/x86/vp9_quantize_ssse3.c
new file mode 100644
index 0000000000..d35004e370
--- /dev/null
+++ b/vp9/encoder/x86/vp9_quantize_ssse3.c
@@ -0,0 +1,253 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <tmmintrin.h>
+
+#include "./vp9_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_dsp/x86/quantize_sse2.h"
+#include "vpx_dsp/x86/quantize_ssse3.h"
+
+void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                           const int16_t *round_ptr, const int16_t *quant_ptr,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                           const int16_t *scan, const int16_t *iscan) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i thr;
+  int nzflag;
+  int index = 16;
+  __m128i round, quant, dequant;
+  __m128i coeff0, coeff1;
+  __m128i qcoeff0, qcoeff1;
+  __m128i eob;
+
+  (void)scan;
+
+  // Setup global values.
+  load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant);
+
+  // Do DC and first 15 AC.
+  coeff0 = load_tran_low(coeff_ptr);
+  coeff1 = load_tran_low(coeff_ptr + 8);
+
+  qcoeff0 = _mm_abs_epi16(coeff0);
+  qcoeff1 = _mm_abs_epi16(coeff1);
+
+  qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+  qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+
+  round = _mm_unpackhi_epi64(round, round);
+  quant = _mm_unpackhi_epi64(quant, quant);
+
+  qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+  qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+  // Reinsert signs.
+  qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+  qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+  store_tran_low(qcoeff0, qcoeff_ptr);
+  store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+  qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+  dequant = _mm_unpackhi_epi64(dequant, dequant);
+  qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+  store_tran_low(qcoeff0, dqcoeff_ptr);
+  store_tran_low(qcoeff1, dqcoeff_ptr + 8);
+
+  eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+
+  thr = _mm_srai_epi16(dequant, 1);
+
+  // AC only loop.
+  while (index < n_coeffs) {
+    coeff0 = load_tran_low(coeff_ptr + index);
+    coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+    qcoeff0 = _mm_abs_epi16(coeff0);
+    qcoeff1 = _mm_abs_epi16(coeff1);
+
+    nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+             _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
+
+    if (nzflag) {
+      __m128i eob0;
+      qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+      qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+      qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+      qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+      // Reinsert signs.
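+      // Note: _mm_sign_epi16(x, c) copies the sign of c onto the magnitude in
+      // x and zeroes any lane where c is zero.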
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + store_tran_low(qcoeff0, dqcoeff_ptr + index); + store_tran_low(qcoeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); + eob = _mm_max_epi16(eob, eob0); + } else { + store_zero_tran_low(qcoeff_ptr + index); + store_zero_tran_low(qcoeff_ptr + index + 8); + + store_zero_tran_low(dqcoeff_ptr + index); + store_zero_tran_low(dqcoeff_ptr + index + 8); + } + + index += 16; + } + + *eob_ptr = accumulate_eob(eob); +} + +void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, + const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one_s16 = _mm_set1_epi16(1); + __m128i thr; + int nzflag; + int index = 16; + __m128i round, quant, dequant; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i eob; + + (void)scan; + + // Setup global values. + load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant); + // The 32x32 halves round. + round = _mm_add_epi16(round, one_s16); + round = _mm_srli_epi16(round, 1); + + // The 16x16 shifts by 16, the 32x32 shifts by 15. We want to use pmulhw so + // upshift quant to account for this. + quant = _mm_slli_epi16(quant, 1); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + // Get the abs value of qcoeff again so we can use shifts for division. + qcoeff0 = _mm_abs_epi16(qcoeff0); + qcoeff1 = _mm_abs_epi16(qcoeff1); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + // Divide by 2. + qcoeff0 = _mm_srli_epi16(qcoeff0, 1); + qcoeff1 = _mm_srli_epi16(qcoeff1, 1); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, dqcoeff_ptr); + store_tran_low(qcoeff1, dqcoeff_ptr + 8); + + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); + + thr = _mm_srai_epi16(dequant, 2); + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | + _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); + + if (nzflag) { + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs. 
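+      // The signed values are stored to qcoeff below; the magnitudes are then
+      // recovered with _mm_abs_epi16() so that the divide-by-two used for
+      // dqcoeff can be a logical shift.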
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + // Get the abs value of qcoeff again so we can use shifts for division. + qcoeff0 = _mm_abs_epi16(qcoeff0); + qcoeff1 = _mm_abs_epi16(qcoeff1); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + // Divide by 2. + qcoeff0 = _mm_srli_epi16(qcoeff0, 1); + qcoeff1 = _mm_srli_epi16(qcoeff1, 1); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, dqcoeff_ptr + index); + store_tran_low(qcoeff1, dqcoeff_ptr + index + 8); + } else { + store_zero_tran_low(qcoeff_ptr + index); + store_zero_tran_low(qcoeff_ptr + index + 8); + + store_zero_tran_low(dqcoeff_ptr + index); + store_zero_tran_low(dqcoeff_ptr + index + 8); + } + + if (nzflag) { + const __m128i eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); + eob = _mm_max_epi16(eob, eob0); + } + index += 16; + } + + *eob_ptr = accumulate_eob(eob); +} diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm deleted file mode 100644 index ae43a90f8b..0000000000 --- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm +++ /dev/null @@ -1,174 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%define private_prefix vp9 - -%include "third_party/x86inc/x86inc.asm" -%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm" - -SECTION_RODATA -pw_1: times 8 dw 1 - -SECTION .text - -%macro QUANTIZE_FP 2 -cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, round, quant, \ - qcoeff, dqcoeff, dequant, \ - eob, scan, iscan - - ; actual quantize loop - setup pointers, rounders, etc. 
- movifnidn coeffq, coeffmp - movifnidn ncoeffq, ncoeffmp - movifnidn roundq, roundmp - movifnidn quantq, quantmp - mova m1, [roundq] ; m1 = round - mova m2, [quantq] ; m2 = quant - mov r2, dequantmp -%ifidn %1, fp_32x32 - pcmpeqw m5, m5 - psrlw m5, 15 - paddw m1, m5 - psrlw m1, 1 ; m1 = (m1 + 1) / 2 -%endif - mova m3, [r2q] ; m3 = dequant - mov r3, qcoeffmp - mov r4, dqcoeffmp - mov r5, iscanmp -%ifidn %1, fp_32x32 - psllw m2, 1 -%endif - pxor m5, m5 ; m5 = dedicated zero - - INCREMENT_ELEMENTS_TRAN_LOW coeffq, ncoeffq - lea r5q, [r5q+ncoeffq*2] - INCREMENT_ELEMENTS_TRAN_LOW r3q, ncoeffq - INCREMENT_ELEMENTS_TRAN_LOW r4q, ncoeffq - neg ncoeffq - - ; get DC and first 15 AC coeffs - LOAD_TRAN_LOW 9, coeffq, ncoeffq ; m9 = c[i] - LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8 ; m10 = c[i] - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpeqw m7, m7 - - paddsw m6, m1 ; m6 += round - punpckhqdq m1, m1 - paddsw m11, m1 ; m11 += round - pmulhw m8, m6, m2 ; m8 = m6*q>>16 - punpckhqdq m2, m2 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - psignw m8, m9 ; m8 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - STORE_TRAN_LOW 8, r3q, ncoeffq, 6, 11, 12 - STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12 -%ifidn %1, fp_32x32 - pabsw m8, m8 - pabsw m13, m13 -%endif - pmullw m8, m3 ; r4[i] = r3[i] * q - punpckhqdq m3, m3 - pmullw m13, m3 ; r4[i] = r3[i] * q -%ifidn %1, fp_32x32 - psrlw m8, 1 - psrlw m13, 1 - psignw m8, m9 - psignw m13, m10 - psrlw m0, m3, 2 -%else - psrlw m0, m3, 1 -%endif - STORE_TRAN_LOW 8, r4q, ncoeffq, 6, 11, 12 - STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12 - pcmpeqw m8, m5 ; m8 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] - pandn m8, m6 ; m8 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m13 - add ncoeffq, mmsize - jz .accumulate_eob - -.ac_only_loop: - LOAD_TRAN_LOW 9, coeffq, ncoeffq ; m9 = c[i] - LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8 ; m10 = c[i] - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - - pcmpgtw m7, m6, m0 - pcmpgtw m12, m11, m0 - pmovmskb r6d, m7 - pmovmskb r2d, m12 - - or r6, r2 - jz .skip_iter - - pcmpeqw m7, m7 - - paddsw m6, m1 ; m6 += round - paddsw m11, m1 ; m11 += round - pmulhw m14, m6, m2 ; m14 = m6*q>>16 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - psignw m14, m9 ; m14 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - STORE_TRAN_LOW 14, r3q, ncoeffq, 6, 11, 12 - STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12 -%ifidn %1, fp_32x32 - pabsw m14, m14 - pabsw m13, m13 -%endif - pmullw m14, m3 ; r4[i] = r3[i] * q - pmullw m13, m3 ; r4[i] = r3[i] * q -%ifidn %1, fp_32x32 - psrlw m14, 1 - psrlw m13, 1 - psignw m14, m9 - psignw m13, m10 -%endif - STORE_TRAN_LOW 14, r4q, ncoeffq, 6, 11, 12 - STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12 - pcmpeqw m14, m5 ; m14 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] - pandn m14, m6 ; m14 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m14 - pmaxsw m8, m13 - add ncoeffq, mmsize - jl .ac_only_loop - - jmp .accumulate_eob -.skip_iter: - STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq - STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq + 8 - STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq - STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq + 8 - add ncoeffq, mmsize - jl .ac_only_loop - -.accumulate_eob: - ; horizontally accumulate/max eobs and write into [eob] memory pointer - mov r2, eobmp - pshufd m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, 
m8, 0xe
-  pmaxsw                        m8, m7
-  pshuflw                       m7, m8, 0x1
-  pmaxsw                        m8, m7
-  pextrw                        r6, m8, 0
-  mov                           [r2], r6w
-  RET
-%endmacro
-
-INIT_XMM ssse3
-QUANTIZE_FP fp, 7
-QUANTIZE_FP fp_32x32, 7
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index c9afd9a347..9072628f23 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -111,6 +111,7 @@ VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c
 VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_constants.h
 
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.c
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_quantize_avx2.c
 VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_diamond_search_sad_neon.c
@@ -122,10 +123,6 @@ endif
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
 
-ifeq ($(VPX_ARCH_X86_64),yes)
-VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm
-endif
-
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_intrin_sse2.c
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_frame_scale_ssse3.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c

From e8fc52ada46d2375c2dce6ac9e9d0bf27f622866 Mon Sep 17 00:00:00 2001
From: Johann
Date: Mon, 17 Oct 2022 16:22:23 +0900
Subject: [PATCH 459/926] quantize: consolidate sse2 conditionals

Change-Id: I43de579e30f2967b97064063e29676e0af1a498f
---
 vp9/encoder/x86/vp9_quantize_sse2.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c
index 272e5fb079..c877234436 100644
--- a/vp9/encoder/x86/vp9_quantize_sse2.c
+++ b/vp9/encoder/x86/vp9_quantize_sse2.c
@@ -89,6 +89,7 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
              _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
 
     if (nzflag) {
+      __m128i eob0;
       qcoeff0 = _mm_adds_epi16(qcoeff0, round);
       qcoeff1 = _mm_adds_epi16(qcoeff1, round);
       qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
@@ -101,11 +102,14 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
       store_tran_low(qcoeff0, qcoeff_ptr + index);
       store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
 
-      coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-      coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+      qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+      qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
 
-      store_tran_low(coeff0, dqcoeff_ptr + index);
-      store_tran_low(coeff1, dqcoeff_ptr + index + 8);
+      store_tran_low(qcoeff0, dqcoeff_ptr + index);
+      store_tran_low(qcoeff1, dqcoeff_ptr + index + 8);
+
+      eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+      eob = _mm_max_epi16(eob, eob0);
     } else {
       store_zero_tran_low(qcoeff_ptr + index);
       store_zero_tran_low(qcoeff_ptr + index + 8);
@@ -114,10 +118,6 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
       store_zero_tran_low(dqcoeff_ptr + index + 8);
     }
 
-    if (nzflag) {
-      const __m128i eob0 = scan_for_eob(&coeff0, &coeff1, iscan, index, zero);
-      eob = _mm_max_epi16(eob, eob0);
-    }
     index += 16;
   }
 

From 5245f6e9cb7e6bb68ab45fe4d8b00bc9b16857e1 Mon Sep 17 00:00:00 2001
From: Marco Paniconi
Date: Mon, 17 Oct 2022 22:36:25 -0700
Subject: [PATCH 460/926] Fix to VP8 external RC for buffer levels

On a dynamic change of temporal layers: the starting/maximum/optimal
buffer levels were being set twice, causing incorrect large values.
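
As an illustration (a minimal sketch, not the exact encoder code;
rescale() is assumed to be the int64-widening helper defined in
vp8/encoder/onyx_if.c and vp8/vp8_ratectrl_rtc.cc):

    /* Converts a buffer level expressed in milliseconds to bits. */
    static int rescale(int val, int num, int denom) {
      const int64_t llnum = num;
      const int64_t llden = denom;
      const int64_t llval = val;
      return (int)(llval * llnum / llden);
    }

    /* Layer (re)initialization already rescales the per-layer levels;
     * if UpdateRateControl() then applies this again, the level is
     * multiplied by target_bandwidth / 1000 twice, producing the
     * incorrect large values described above. */
    oxcf->starting_buffer_level = rescale(
        (int)oxcf->starting_buffer_level, oxcf->target_bandwidth, 1000);
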
Bug: b/253927937 Change-Id: I204e885cff92530336a9ed9a4363c486c5bf80ae --- vp8/vp8_ratectrl_rtc.cc | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/vp8/vp8_ratectrl_rtc.cc b/vp8/vp8_ratectrl_rtc.cc index 46621546dd..f3f42529db 100644 --- a/vp8/vp8_ratectrl_rtc.cc +++ b/vp8/vp8_ratectrl_rtc.cc @@ -163,20 +163,24 @@ void VP8RateControlRTC::UpdateRateControl( cm->MBs = cm->mb_rows * cm->mb_cols; cm->mode_info_stride = cm->mb_cols + 1; - oxcf->starting_buffer_level = - rescale((int)oxcf->starting_buffer_level, oxcf->target_bandwidth, 1000); - /* Set or reset optimal and maximum buffer levels. */ - if (oxcf->optimal_buffer_level == 0) { - oxcf->optimal_buffer_level = oxcf->target_bandwidth / 8; - } else { - oxcf->optimal_buffer_level = - rescale((int)oxcf->optimal_buffer_level, oxcf->target_bandwidth, 1000); - } - if (oxcf->maximum_buffer_size == 0) { - oxcf->maximum_buffer_size = oxcf->target_bandwidth / 8; - } else { - oxcf->maximum_buffer_size = - rescale((int)oxcf->maximum_buffer_size, oxcf->target_bandwidth, 1000); + // For temporal layers: starting/maximum/optimal_buffer_level is already set + // via vp8_init_temporal_layer_context() or vp8_reset_temporal_layer_change(). + if (oxcf->number_of_layers <= 1 && prev_number_of_layers <= 1) { + oxcf->starting_buffer_level = + rescale((int)oxcf->starting_buffer_level, oxcf->target_bandwidth, 1000); + /* Set or reset optimal and maximum buffer levels. */ + if (oxcf->optimal_buffer_level == 0) { + oxcf->optimal_buffer_level = oxcf->target_bandwidth / 8; + } else { + oxcf->optimal_buffer_level = rescale((int)oxcf->optimal_buffer_level, + oxcf->target_bandwidth, 1000); + } + if (oxcf->maximum_buffer_size == 0) { + oxcf->maximum_buffer_size = oxcf->target_bandwidth / 8; + } else { + oxcf->maximum_buffer_size = + rescale((int)oxcf->maximum_buffer_size, oxcf->target_bandwidth, 1000); + } } if (cpi_->bits_off_target > oxcf->maximum_buffer_size) { From 228d8a4fed449cbfea678be4f0baf67113b82549 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 24 Oct 2022 15:24:51 -0700 Subject: [PATCH 461/926] highbd_sad_avx2: normalize function param names (src|ref)8_ptr -> (src|ref)_ptr. 
aligns the names with the rtcd header; clears some clang-tidy warnings Change-Id: Id1aa29da8c0fa5860b46ac902f5b2620c0d3ff54 --- vpx_dsp/x86/highbd_sad_avx2.c | 36 +++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c index 7533ccfddb..231b67f809 100644 --- a/vpx_dsp/x86/highbd_sad_avx2.c +++ b/vpx_dsp/x86/highbd_sad_avx2.c @@ -52,10 +52,10 @@ static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16, #define HIGHBD_SAD64XN(n) \ unsigned int vpx_highbd_sad64x##n##_avx2( \ - const uint8_t *src8_ptr, int src_stride, const uint8_t *ref8_ptr, \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8_ptr); \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ __m256i sums_32 = _mm256_setzero_si256(); \ int i; \ \ @@ -109,10 +109,10 @@ static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16, #define HIGHBD_SAD32XN(n) \ unsigned int vpx_highbd_sad32x##n##_avx2( \ - const uint8_t *src8_ptr, int src_stride, const uint8_t *ref8_ptr, \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8_ptr); \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ __m256i sums_32 = _mm256_setzero_si256(); \ int i; \ \ @@ -167,10 +167,10 @@ static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, } } -unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src8_ptr, int src_stride, - const uint8_t *ref8_ptr, int ref_stride) { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8_ptr); +unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); __m256i sums_32 = _mm256_setzero_si256(); int i; @@ -192,10 +192,10 @@ unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src8_ptr, int src_stride, return calc_final(sums_32); } -unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src8_ptr, int src_stride, - const uint8_t *ref8_ptr, int ref_stride) { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8_ptr); +unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); __m256i sums_16 = _mm256_setzero_si256(); highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16); @@ -208,10 +208,10 @@ unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src8_ptr, int src_stride, } } -unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src8_ptr, int src_stride, - const uint8_t *ref8_ptr, int ref_stride) { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8_ptr); +unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); __m256i sums_16 = _mm256_setzero_si256(); highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 8); From 
d667193e6a36629746faa1e29e8fb17573eea893 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 24 Oct 2022 15:28:47 -0700 Subject: [PATCH 462/926] vp9_highbd_quantize_fp*_neon: normalize fn param name count -> n_coeffs. aligns the name with the rtcd header; clears a clang-tidy warning Change-Id: I36545ff479df92b117c95e494f16002e6990f433 --- vp9/encoder/arm/neon/vp9_quantize_neon.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c index b9bd1eba31..c2b55fcbaa 100644 --- a/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -257,7 +257,7 @@ highbd_quantize_fp_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, return vmovn_u32(vceqq_s32(v_abs_qcoeff, vdupq_n_s32(0))); } -void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, +void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, @@ -294,7 +294,7 @@ void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, v_eobmax = get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); - count -= 8; + n_coeffs -= 8; do { coeff_ptr += 8; qcoeff_ptr += 8; @@ -308,8 +308,8 @@ void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, // Find the max lane eob for 8 coeffs. v_eobmax = get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); - count -= 8; - } while (count); + n_coeffs -= 8; + } while (n_coeffs); *eob_ptr = get_max_eob(v_eobmax); } @@ -349,7 +349,7 @@ highbd_quantize_fp_32x32_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, } void vp9_highbd_quantize_fp_32x32_neon( - const tran_low_t *coeff_ptr, intptr_t count, const int16_t *round_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { @@ -385,7 +385,7 @@ void vp9_highbd_quantize_fp_32x32_neon( v_eobmax = get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); - count -= 8; + n_coeffs -= 8; do { coeff_ptr += 8; qcoeff_ptr += 8; @@ -400,8 +400,8 @@ void vp9_highbd_quantize_fp_32x32_neon( // Find the max lane eob for 8 coeffs. 
v_eobmax = get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); - count -= 8; - } while (count); + n_coeffs -= 8; + } while (n_coeffs); *eob_ptr = get_max_eob(v_eobmax); } From ee12bc390dca10e53e7dbb16589fd183fc3d7792 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 24 Oct 2022 15:37:26 -0700 Subject: [PATCH 463/926] SAD*Test: mark virtual Run() as overridden this comes from AbstractBench Change-Id: Ie0b5a26a68bfbffd80f132125d15a1bdfc990c22 --- test/sad_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 29e3f57f5e..0896c77f12 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -311,7 +311,7 @@ class SADTest : public AbstractBench, public SADTestBase { ASSERT_EQ(reference_sad, exp_sad); } - void Run() { + void Run() override { params_.func(source_data_, source_stride_, reference_data_, reference_stride_); } @@ -339,7 +339,7 @@ class SADavgTest : public AbstractBench, public SADTestBase { ASSERT_EQ(reference_sad, exp_sad); } - void Run() { + void Run() override { params_.func(source_data_, source_stride_, reference_data_, reference_stride_, second_pred_); } From 4b659f3c345b81db67bddcfd831e33abc40c021e Mon Sep 17 00:00:00 2001 From: Johann Date: Wed, 26 Oct 2022 17:14:21 +0900 Subject: [PATCH 464/926] mailmap: add johann@duck.com Change-Id: I3b48951e69ba1f4a9fafdbb81fac48f79587a342 --- .mailmap | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index 376ca83ae3..3593de4b9c 100644 --- a/.mailmap +++ b/.mailmap @@ -21,8 +21,8 @@ Jacky Chen Jim Bankoski Johann Koenig Johann Koenig -Johann Koenig Johann Koenig +Johann John Koleszar Joshua Litt Marco Paniconi From 9e1bdd12c79008a1fa8f1b9ba63194bdcd020fd7 Mon Sep 17 00:00:00 2001 From: Johann Date: Thu, 27 Oct 2022 08:49:37 +0900 Subject: [PATCH 465/926] rtcd: allow disabling neon on armv8 Change-Id: Idef943775456eb95b46be5c92c114c1d215f38d7 --- build/make/rtcd.pl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/build/make/rtcd.pl b/build/make/rtcd.pl index 9c97268426..f4edeaad51 100755 --- a/build/make/rtcd.pl +++ b/build/make/rtcd.pl @@ -488,7 +488,8 @@ () arm; } elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) { @ALL_ARCHS = filter(qw/neon/); - &require("neon"); + @REQUIRES = filter(qw/neon/); + &require(@REQUIRES); arm; } elsif ($opts{arch} =~ /^ppc/ ) { @ALL_ARCHS = filter(qw/vsx/); From ebf22e2e8db9e863a2cbc24081381e24fe306882 Mon Sep 17 00:00:00 2001 From: Johann Date: Thu, 27 Oct 2022 11:40:19 +0900 Subject: [PATCH 466/926] MacOS 13 is darwin22 Bug: webm:1783 Change-Id: I97d94ab8c8aebe13aedb58e280dc37474814ad5d --- build/make/configure.sh | 4 ++-- configure | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/build/make/configure.sh b/build/make/configure.sh index 581042e38e..e9b7fa9c1c 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -791,7 +791,7 @@ process_common_toolchain() { tgt_isa=x86_64 tgt_os=`echo $gcctarget | sed 's/.*\(darwin1[0-9]\).*/\1/'` ;; - *darwin2[0-1]*) + *darwin2[0-2]*) tgt_isa=`uname -m` tgt_os=`echo $gcctarget | sed 's/.*\(darwin2[0-9]\).*/\1/'` ;; @@ -940,7 +940,7 @@ process_common_toolchain() { add_cflags "-mmacosx-version-min=10.15" add_ldflags "-mmacosx-version-min=10.15" ;; - *-darwin2[0-1]-*) + *-darwin2[0-2]-*) add_cflags "-arch ${toolchain%%-*}" add_ldflags "-arch ${toolchain%%-*}" ;; diff --git a/configure b/configure index 1b850b5e04..bf92e1ad1f 100755 --- a/configure +++ b/configure @@ -101,6 +101,7 @@ 
all_platforms="${all_platforms} arm64-android-gcc"
 all_platforms="${all_platforms} arm64-darwin-gcc"
 all_platforms="${all_platforms} arm64-darwin20-gcc"
 all_platforms="${all_platforms} arm64-darwin21-gcc"
+all_platforms="${all_platforms} arm64-darwin22-gcc"
 all_platforms="${all_platforms} arm64-linux-gcc"
 all_platforms="${all_platforms} arm64-win64-gcc"
 all_platforms="${all_platforms} arm64-win64-vs15"
@@ -157,6 +158,7 @@ all_platforms="${all_platforms} x86_64-darwin18-gcc"
 all_platforms="${all_platforms} x86_64-darwin19-gcc"
 all_platforms="${all_platforms} x86_64-darwin20-gcc"
 all_platforms="${all_platforms} x86_64-darwin21-gcc"
+all_platforms="${all_platforms} x86_64-darwin22-gcc"
 all_platforms="${all_platforms} x86_64-iphonesimulator-gcc"
 all_platforms="${all_platforms} x86_64-linux-gcc"
 all_platforms="${all_platforms} x86_64-linux-icc"

From 3121783fec60d0ce4551d472d1acbd1f1a8253be Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Wed, 26 Oct 2022 21:37:31 +0000
Subject: [PATCH 467/926] [NEON] Optimize and homogenize Butterfly DCT
 functions

Provide a set of commonly used Butterfly DCT functions for use in the
DCT 4x4, 8x8, 16x16, 32x32 functions. These are provided in various
forms, using vqrdmulh_s16/vqrdmulh_s32 for the _fast variants, which
unfortunately are only usable in pass1 of most DCTs, as they do not
provide the necessary precision in pass2.

This gave a performance gain ranging from 5% to 15% in the 16x16
case. Also, for 32x32, the loads were rearranged along with the
butterfly optimizations; this gave a 10% gain in the 32x32_rd
function.

This refactoring was necessary to allow easier porting of the highbd
32x32 functions, which follow in this patchset.

Change-Id: I6282e640b95a95938faff76c3b2bace3dc298bc3
---
 vp9/encoder/arm/neon/vp9_dct_neon.c |   17 +-
 vpx_dsp/arm/fdct16x16_neon.c        |   18 +-
 vpx_dsp/arm/fdct16x16_neon.h        |  297 +++----
 vpx_dsp/arm/fdct32x32_neon.c        | 1158 +--------------------------
 vpx_dsp/arm/fdct32x32_neon.h        | 1105 +++++++++++++++++++++++++
 vpx_dsp/arm/fdct4x4_neon.c          |   13 +-
 vpx_dsp/arm/fdct4x4_neon.h          |  105 +++
 vpx_dsp/arm/fdct8x8_neon.c          |   47 +-
 vpx_dsp/arm/fdct8x8_neon.h          |  381 +++++++++
 vpx_dsp/arm/fdct_neon.h             |  757 +++++++----------
 vpx_dsp/arm/transpose_neon.h        |   45 ++
 11 files changed, 2112 insertions(+), 1831 deletions(-)
 create mode 100644 vpx_dsp/arm/fdct32x32_neon.h
 create mode 100644 vpx_dsp/arm/fdct4x4_neon.h
 create mode 100644 vpx_dsp/arm/fdct8x8_neon.h

diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c
index a07a1608d7..b8286a8dd5 100644
--- a/vp9/encoder/arm/neon/vp9_dct_neon.c
+++ b/vp9/encoder/arm/neon/vp9_dct_neon.c
@@ -18,6 +18,8 @@
 #include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
 #include "vpx_dsp/arm/fdct_neon.h"
+#include "vpx_dsp/arm/fdct4x4_neon.h"
+#include "vpx_dsp/arm/fdct8x8_neon.h"
 
 static INLINE void load_buffer_4x4(const int16_t *input, int16x8_t *in,
                                    int stride) {
@@ -130,12 +132,14 @@ void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride,
     case ADST_DCT:
       load_buffer_4x4(input, in, stride);
       fadst4x4_neon(in);
-      vpx_fdct4x4_pass1_neon((int16x4_t *)in);
+      // pass1 variant is not accurate enough
+      vpx_fdct4x4_pass2_neon((int16x4_t *)in);
      write_buffer_4x4(output, in);
       break;
     case DCT_ADST:
       load_buffer_4x4(input, in, stride);
-      vpx_fdct4x4_pass1_neon((int16x4_t *)in);
+      // pass1 variant is not accurate enough
+      vpx_fdct4x4_pass2_neon((int16x4_t *)in);
       fadst4x4_neon(in);
       write_buffer_4x4(output, in);
       break;
@@ -488,13 +492,15 @@ void vp9_fht8x8_neon(const int16_t *input,
tran_low_t *output, int stride, case ADST_DCT: load_buffer_8x8(input, in, stride); fadst8x8_neon(in); - vpx_fdct8x8_pass1_neon(in); + // pass1 variant is not accurate enough + vpx_fdct8x8_pass2_neon(in); right_shift_8x8(in, 1); write_buffer_8x8(output, in, 8); break; case DCT_ADST: load_buffer_8x8(input, in, stride); - vpx_fdct8x8_pass1_neon(in); + // pass1 variant is not accurate enough + vpx_fdct8x8_pass2_neon(in); fadst8x8_neon(in); right_shift_8x8(in, 1); write_buffer_8x8(output, in, 8); @@ -559,7 +565,8 @@ static void fdct16_8col(int16x8_t *in) { i[6] = vaddq_s16(in[6], in[9]); i[7] = vaddq_s16(in[7], in[8]); - vpx_fdct8x8_pass1_neon(i); + // pass1 variant is not accurate enough + vpx_fdct8x8_pass2_neon(i); transpose_s16_8x8(&i[0], &i[1], &i[2], &i[3], &i[4], &i[5], &i[6], &i[7]); // step 2 diff --git a/vpx_dsp/arm/fdct16x16_neon.c b/vpx_dsp/arm/fdct16x16_neon.c index d0c07d429a..a458ecaa41 100644 --- a/vpx_dsp/arm/fdct16x16_neon.c +++ b/vpx_dsp/arm/fdct16x16_neon.c @@ -37,20 +37,21 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { // Left half. load_cross(input, stride, temp0); scale_input(temp0, temp1); - vpx_fdct16x16_body(temp1, temp0); + vpx_fdct8x16_body(temp1, temp0); // Right half. load_cross(input + 8, stride, temp1); scale_input(temp1, temp2); - vpx_fdct16x16_body(temp2, temp1); + vpx_fdct8x16_body(temp2, temp1); // Transpose top left and top right quarters into one contiguous location to // process to the top half. + transpose_s16_8x8_new(&temp0[0], &temp2[0]); transpose_s16_8x8_new(&temp1[0], &temp2[8]); partial_round_shift(temp2); cross_input(temp2, temp3); - vpx_fdct16x16_body(temp3, temp2); + vpx_fdct8x16_body(temp3, temp2); transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4], &temp2[5], &temp2[6], &temp2[7]); transpose_s16_8x8(&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12], @@ -62,11 +63,12 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { // Transpose bottom left and bottom right quarters into one contiguous // location to process to the bottom half. transpose_s16_8x8_new(&temp0[8], &temp1[0]); + transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], &temp1[13], &temp1[14], &temp1[15]); partial_round_shift(temp1); cross_input(temp1, temp0); - vpx_fdct16x16_body(temp0, temp1); + vpx_fdct8x16_body(temp0, temp1); transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4], &temp1[5], &temp1[6], &temp1[7]); transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], @@ -86,12 +88,12 @@ void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output, // Left half. load_cross(input, stride, temp0); highbd_scale_input(temp0, left1, right1); - vpx_highbd_fdct16x16_body(left1, right1); + vpx_highbd_fdct8x16_body(left1, right1); // right half. load_cross(input + 8, stride, temp0); highbd_scale_input(temp0, left2, right2); - vpx_highbd_fdct16x16_body(left2, right2); + vpx_highbd_fdct8x16_body(left2, right2); // Transpose top left and top right quarters into one contiguous location to // process to the top half. @@ -103,14 +105,14 @@ void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output, highbd_partial_round_shift(left3, right3); highbd_cross_input(left3, right3, left1, right1); - vpx_highbd_fdct16x16_body(left1, right1); + vpx_highbd_fdct8x16_body(left1, right1); // Transpose bottom left and bottom right quarters into one contiguous // location to process to the bottom half. 
highbd_partial_round_shift(left4, right4); highbd_cross_input(left4, right4, left2, right2); - vpx_highbd_fdct16x16_body(left2, right2); + vpx_highbd_fdct8x16_body(left2, right2); transpose_s32_8x8_2(left1, right1, left3, right3); transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8); diff --git a/vpx_dsp/arm/fdct16x16_neon.h b/vpx_dsp/arm/fdct16x16_neon.h index d99870903b..43d820b6bd 100644 --- a/vpx_dsp/arm/fdct16x16_neon.h +++ b/vpx_dsp/arm/fdct16x16_neon.h @@ -160,8 +160,8 @@ static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) { } // Main body of fdct16x16. -static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, - int16x8_t *out /*[16]*/) { +static void vpx_fdct8x16_body(const int16x8_t *in /*[16]*/, + int16x8_t *out /*[16]*/) { int16x8_t s[8]; int16x8_t x[4]; int16x8_t step[8]; @@ -186,16 +186,17 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) - butterfly_one_coeff(x[0], x[1], cospi_16_64, &out[0], &out[8]); - // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0], + &out[8]); + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); - butterfly_two_coeff(x[3], x[2], cospi_24_64, cospi_8_64, &out[4], &out[12]); + butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[4], &out[12]); // Stage 2 // Re-using source s5/s6 // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) - butterfly_one_coeff(s[6], s[5], cospi_16_64, &s[6], &s[5]); + butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &s[6], &s[5]); // Stage 3 x[0] = vaddq_s16(s[4], s[5]); @@ -204,12 +205,12 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, x[3] = vaddq_s16(s[7], s[6]); // Stage 4 - // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) - // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) - butterfly_two_coeff(x[3], x[0], cospi_28_64, cospi_4_64, &out[2], &out[14]); - // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) - // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) - butterfly_two_coeff(x[2], x[1], cospi_12_64, cospi_20_64, &out[10], &out[6]); + // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) + butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[2], &out[14]); + // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) + butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[10], &out[6]); // step 2 // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" @@ -221,8 +222,8 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) - butterfly_one_coeff(in[13], in[10], cospi_16_64, &s[5], &s[2]); - butterfly_one_coeff(in[12], in[11], cospi_16_64, &s[4], &s[3]); + butterfly_one_coeff_s16_fast(in[13], in[10], cospi_16_64, &s[5], &s[2]); + butterfly_one_coeff_s16_fast(in[12], in[11], cospi_16_64, &s[4], &s[3]); // step 3 s[0] = vaddq_s16(in[8], s[3]); @@ -235,13 +236,15 @@ static void 
vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, s[7] = vaddq_s16(in[15], s[4]); // step 4 - // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64) - // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64) - butterfly_two_coeff(s[6], s[1], cospi_24_64, cospi_8_64, &s[6], &s[1]); + // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * + // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] + // * cospi_8_64) + butterfly_two_coeff(s[6], s[1], cospi_8_64, cospi_24_64, &s[6], &s[1]); // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) - // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64) - butterfly_two_coeff(x[0], x[3], cospi_8_64, cospi_24_64, &s[2], &s[5]); + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * + // cospi_24_64) + butterfly_two_coeff(x[0], x[3], cospi_24_64, cospi_8_64, &s[2], &s[5]); // step 5 step[0] = vaddq_s16(s[0], s[1]); @@ -254,22 +257,23 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, step[7] = vaddq_s16(s[7], s[6]); // step 6 - // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64) - // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64) - // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64) - // out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * cospi_26_64) - // out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * cospi_6_64) - // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] * - // cospi_22_64) - // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64) - // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64) - butterfly_two_coeff(step[6], step[1], cospi_14_64, cospi_18_64, &out[9], + // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64) + // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) + butterfly_two_coeff(step[6], step[1], cospi_18_64, cospi_14_64, &out[9], &out[7]); - butterfly_two_coeff(step[7], step[0], cospi_30_64, cospi_2_64, &out[1], + // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) + // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) + butterfly_two_coeff(step[7], step[0], cospi_2_64, cospi_30_64, &out[1], &out[15]); - butterfly_two_coeff(step[4], step[3], cospi_6_64, cospi_26_64, &out[13], + + // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) + // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64) + butterfly_two_coeff(step[4], step[3], cospi_26_64, cospi_6_64, &out[13], &out[3]); - butterfly_two_coeff(step[5], step[2], cospi_22_64, cospi_10_64, &out[5], + + // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) + // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) + butterfly_two_coeff(step[5], step[2], cospi_10_64, cospi_22_64, &out[5], &out[11]); } @@ -279,36 +283,37 @@ static INLINE void highbd_scale_input(const int16x8_t *a /*[16]*/, int32x4_t *left /*[16]*/, int32x4_t *right /* [16] */) { left[0] = vshll_n_s16(vget_low_s16(a[0]), 2); - right[0] = vshll_n_s16(vget_high_s16(a[0]), 2); left[1] = vshll_n_s16(vget_low_s16(a[1]), 2); - right[1] = vshll_n_s16(vget_high_s16(a[1]), 2); left[2] = vshll_n_s16(vget_low_s16(a[2]), 2); - right[2] = vshll_n_s16(vget_high_s16(a[2]), 2); left[3] = vshll_n_s16(vget_low_s16(a[3]), 2); - right[3] 
= vshll_n_s16(vget_high_s16(a[3]), 2); left[4] = vshll_n_s16(vget_low_s16(a[4]), 2); - right[4] = vshll_n_s16(vget_high_s16(a[4]), 2); left[5] = vshll_n_s16(vget_low_s16(a[5]), 2); - right[5] = vshll_n_s16(vget_high_s16(a[5]), 2); left[6] = vshll_n_s16(vget_low_s16(a[6]), 2); - right[6] = vshll_n_s16(vget_high_s16(a[6]), 2); left[7] = vshll_n_s16(vget_low_s16(a[7]), 2); - right[7] = vshll_n_s16(vget_high_s16(a[7]), 2); left[8] = vshll_n_s16(vget_low_s16(a[8]), 2); - right[8] = vshll_n_s16(vget_high_s16(a[8]), 2); left[9] = vshll_n_s16(vget_low_s16(a[9]), 2); - right[9] = vshll_n_s16(vget_high_s16(a[9]), 2); left[10] = vshll_n_s16(vget_low_s16(a[10]), 2); - right[10] = vshll_n_s16(vget_high_s16(a[10]), 2); left[11] = vshll_n_s16(vget_low_s16(a[11]), 2); - right[11] = vshll_n_s16(vget_high_s16(a[11]), 2); left[12] = vshll_n_s16(vget_low_s16(a[12]), 2); - right[12] = vshll_n_s16(vget_high_s16(a[12]), 2); left[13] = vshll_n_s16(vget_low_s16(a[13]), 2); - right[13] = vshll_n_s16(vget_high_s16(a[13]), 2); left[14] = vshll_n_s16(vget_low_s16(a[14]), 2); - right[14] = vshll_n_s16(vget_high_s16(a[14]), 2); left[15] = vshll_n_s16(vget_low_s16(a[15]), 2); + + right[0] = vshll_n_s16(vget_high_s16(a[0]), 2); + right[1] = vshll_n_s16(vget_high_s16(a[1]), 2); + right[2] = vshll_n_s16(vget_high_s16(a[2]), 2); + right[3] = vshll_n_s16(vget_high_s16(a[3]), 2); + right[4] = vshll_n_s16(vget_high_s16(a[4]), 2); + right[5] = vshll_n_s16(vget_high_s16(a[5]), 2); + right[6] = vshll_n_s16(vget_high_s16(a[6]), 2); + right[7] = vshll_n_s16(vget_high_s16(a[7]), 2); + right[8] = vshll_n_s16(vget_high_s16(a[8]), 2); + right[9] = vshll_n_s16(vget_high_s16(a[9]), 2); + right[10] = vshll_n_s16(vget_high_s16(a[10]), 2); + right[11] = vshll_n_s16(vget_high_s16(a[11]), 2); + right[12] = vshll_n_s16(vget_high_s16(a[12]), 2); + right[13] = vshll_n_s16(vget_high_s16(a[13]), 2); + right[14] = vshll_n_s16(vget_high_s16(a[14]), 2); right[15] = vshll_n_s16(vget_high_s16(a[15]), 2); } @@ -357,81 +362,38 @@ static INLINE void highbd_partial_round_shift(int32x4_t *left /*[16]*/, int32x4_t *right /* [16] */) { const int32x4_t one = vdupq_n_s32(1); left[0] = vshrq_n_s32(vaddq_s32(left[0], one), 2); - right[0] = vshrq_n_s32(vaddq_s32(right[0], one), 2); left[1] = vshrq_n_s32(vaddq_s32(left[1], one), 2); - right[1] = vshrq_n_s32(vaddq_s32(right[1], one), 2); left[2] = vshrq_n_s32(vaddq_s32(left[2], one), 2); - right[2] = vshrq_n_s32(vaddq_s32(right[2], one), 2); left[3] = vshrq_n_s32(vaddq_s32(left[3], one), 2); - right[3] = vshrq_n_s32(vaddq_s32(right[3], one), 2); left[4] = vshrq_n_s32(vaddq_s32(left[4], one), 2); - right[4] = vshrq_n_s32(vaddq_s32(right[4], one), 2); left[5] = vshrq_n_s32(vaddq_s32(left[5], one), 2); - right[5] = vshrq_n_s32(vaddq_s32(right[5], one), 2); left[6] = vshrq_n_s32(vaddq_s32(left[6], one), 2); - right[6] = vshrq_n_s32(vaddq_s32(right[6], one), 2); left[7] = vshrq_n_s32(vaddq_s32(left[7], one), 2); - right[7] = vshrq_n_s32(vaddq_s32(right[7], one), 2); left[8] = vshrq_n_s32(vaddq_s32(left[8], one), 2); - right[8] = vshrq_n_s32(vaddq_s32(right[8], one), 2); left[9] = vshrq_n_s32(vaddq_s32(left[9], one), 2); - right[9] = vshrq_n_s32(vaddq_s32(right[9], one), 2); left[10] = vshrq_n_s32(vaddq_s32(left[10], one), 2); - right[10] = vshrq_n_s32(vaddq_s32(right[10], one), 2); left[11] = vshrq_n_s32(vaddq_s32(left[11], one), 2); - right[11] = vshrq_n_s32(vaddq_s32(right[11], one), 2); left[12] = vshrq_n_s32(vaddq_s32(left[12], one), 2); - right[12] = vshrq_n_s32(vaddq_s32(right[12], one), 2); left[13] = 
vshrq_n_s32(vaddq_s32(left[13], one), 2); - right[13] = vshrq_n_s32(vaddq_s32(right[13], one), 2); left[14] = vshrq_n_s32(vaddq_s32(left[14], one), 2); - right[14] = vshrq_n_s32(vaddq_s32(right[14], one), 2); left[15] = vshrq_n_s32(vaddq_s32(left[15], one), 2); - right[15] = vshrq_n_s32(vaddq_s32(right[15], one), 2); -} -static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/, - int32x4_t *right /*[8]*/, - int32x4_t *out_left /*[8]*/, - int32x4_t *out_right /*[8]*/) { - int32x4x2_t out[8]; - - out[0].val[0] = left[0]; - out[0].val[1] = right[0]; - out[1].val[0] = left[1]; - out[1].val[1] = right[1]; - out[2].val[0] = left[2]; - out[2].val[1] = right[2]; - out[3].val[0] = left[3]; - out[3].val[1] = right[3]; - out[4].val[0] = left[4]; - out[4].val[1] = right[4]; - out[5].val[0] = left[5]; - out[5].val[1] = right[5]; - out[6].val[0] = left[6]; - out[6].val[1] = right[6]; - out[7].val[0] = left[7]; - out[7].val[1] = right[7]; - - transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], - &out[6], &out[7]); - - out_left[0] = out[0].val[0]; - out_left[1] = out[1].val[0]; - out_left[2] = out[2].val[0]; - out_left[3] = out[3].val[0]; - out_left[4] = out[4].val[0]; - out_left[5] = out[5].val[0]; - out_left[6] = out[6].val[0]; - out_left[7] = out[7].val[0]; - out_right[0] = out[0].val[1]; - out_right[1] = out[1].val[1]; - out_right[2] = out[2].val[1]; - out_right[3] = out[3].val[1]; - out_right[4] = out[4].val[1]; - out_right[5] = out[5].val[1]; - out_right[6] = out[6].val[1]; - out_right[7] = out[7].val[1]; + right[0] = vshrq_n_s32(vaddq_s32(right[0], one), 2); + right[1] = vshrq_n_s32(vaddq_s32(right[1], one), 2); + right[2] = vshrq_n_s32(vaddq_s32(right[2], one), 2); + right[3] = vshrq_n_s32(vaddq_s32(right[3], one), 2); + right[4] = vshrq_n_s32(vaddq_s32(right[4], one), 2); + right[5] = vshrq_n_s32(vaddq_s32(right[5], one), 2); + right[6] = vshrq_n_s32(vaddq_s32(right[6], one), 2); + right[7] = vshrq_n_s32(vaddq_s32(right[7], one), 2); + right[8] = vshrq_n_s32(vaddq_s32(right[8], one), 2); + right[9] = vshrq_n_s32(vaddq_s32(right[9], one), 2); + right[10] = vshrq_n_s32(vaddq_s32(right[10], one), 2); + right[11] = vshrq_n_s32(vaddq_s32(right[11], one), 2); + right[12] = vshrq_n_s32(vaddq_s32(right[12], one), 2); + right[13] = vshrq_n_s32(vaddq_s32(right[13], one), 2); + right[14] = vshrq_n_s32(vaddq_s32(right[14], one), 2); + right[15] = vshrq_n_s32(vaddq_s32(right[15], one), 2); } // Store 16 32x4 vectors, assuming stride == 16. @@ -469,9 +431,9 @@ static INLINE void store16_s32(tran_low_t *a, const int32x4_t *b /*[32]*/) { vst1q_s32(a, b[15]); } -// Main body of fdct16x16. 
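/* [Editor's note: illustrative sketch, not part of this patch; the *_ref
 * and *_sketch names below are hypothetical.] Every butterfly in the
 * transform bodies that follow rounds its products by DCT_CONST_BITS (14),
 * matching the C reference's fdct_round_shift(). A scalar reference of the
 * one-coefficient form, e.g. butterfly_one_coeff_s16_fast(a, b,
 * cospi_16_64, &add, &sub) with cospi_16_64 == 11585:
 */
#include <stdint.h>

static void butterfly_one_coeff_ref(int32_t a, int32_t b, int32_t constant,
                                    int32_t *add, int32_t *sub) {
  /* fdct_round_shift(x) == (x + (1 << 13)) >> 14 */
  *add = (int32_t)(((int64_t)(a + b) * constant + (1 << 13)) >> 14);
  *sub = (int32_t)(((int64_t)(a - b) * constant + (1 << 13)) >> 14);
}

/* The _fast variants obtain the same rounding from a single saturating
 * instruction: vqrdmulh computes (2 * a * b + (1 << 15)) >> 16, so feeding
 * it a pre-doubled constant yields (a * c + (1 << 13)) >> 14. The
 * pre-doubling shown here is an assumption for illustration; see
 * fdct_neon.h for the actual helpers. As the commit message notes, this
 * narrow form is only precise enough for pass1.
 */
#include <arm_neon.h>

static int16x8_t mul_round_shift_fast_sketch(int16x8_t a, int16_t cospi) {
  return vqrdmulhq_n_s16(a, (int16_t)(cospi * 2));
}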
-static void vpx_highbd_fdct16x16_body(int32x4_t *left /*[16]*/, - int32x4_t *right /* [16] */) { +// Main body of fdct8x16 column +static void vpx_highbd_fdct8x16_body(int32x4_t *left /*[16]*/, + int32x4_t *right /* [16] */) { int32x4_t sl[8]; int32x4_t sr[8]; int32x4_t xl[4]; @@ -531,22 +493,21 @@ static void vpx_highbd_fdct16x16_body(int32x4_t *left /*[16]*/, // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) - highbd_butterfly_one_coeff_s32(xl[0], xl[1], cospi_16_64, &left[0], &left[8]); - highbd_butterfly_one_coeff_s32(xr[0], xr[1], cospi_16_64, &right[0], - &right[8]); - // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64, + &left[0], &right[0], &left[8], &right[8]); + + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); - highbd_butterfly_two_coeff_s32(xl[3], xl[2], cospi_8_64, cospi_24_64, - &left[4], &left[12]); - highbd_butterfly_two_coeff_s32(xr[3], xr[2], cospi_8_64, cospi_24_64, - &right[4], &right[12]); + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64, + cospi_24_64, &left[4], &right[4], + &left[12], &right[12]); // Stage 2 // Re-using source s5/s6 // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) - highbd_butterfly_one_coeff_s32(sl[6], sl[5], cospi_16_64, &sl[6], &sl[5]); - highbd_butterfly_one_coeff_s32(sr[6], sr[5], cospi_16_64, &sr[6], &sr[5]); + butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &sl[6], + &sr[6], &sl[5], &sr[5]); // Stage 3 xl[0] = vaddq_s32(sl[4], sl[5]); @@ -559,18 +520,16 @@ static void vpx_highbd_fdct16x16_body(int32x4_t *left /*[16]*/, xr[3] = vaddq_s32(sr[7], sr[6]); // Stage 4 - // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) - // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) - highbd_butterfly_two_coeff_s32(xl[3], xl[0], cospi_4_64, cospi_28_64, - &left[2], &left[14]); - highbd_butterfly_two_coeff_s32(xr[3], xr[0], cospi_4_64, cospi_28_64, - &right[2], &right[14]); - // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) - // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) - highbd_butterfly_two_coeff_s32(xl[2], xl[1], cospi_20_64, cospi_12_64, - &left[10], &left[6]); - highbd_butterfly_two_coeff_s32(xr[2], xr[1], cospi_20_64, cospi_12_64, - &right[10], &right[6]); + // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64, + cospi_28_64, &left[2], &right[2], + &left[14], &right[14]); + // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) + butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64, + cospi_12_64, &left[10], &right[10], + &left[6], &right[6]); // step 2 // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" @@ -582,10 +541,10 @@ static void vpx_highbd_fdct16x16_body(int32x4_t *left /*[16]*/, // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) - highbd_butterfly_one_coeff_s32(inl[5], inl[2], cospi_16_64, &sl[5], &sl[2]); - 
highbd_butterfly_one_coeff_s32(inr[5], inr[2], cospi_16_64, &sr[5], &sr[2]); - highbd_butterfly_one_coeff_s32(inl[4], inl[3], cospi_16_64, &sl[4], &sl[3]); - highbd_butterfly_one_coeff_s32(inr[4], inr[3], cospi_16_64, &sr[4], &sr[3]); + butterfly_one_coeff_s32_fast(inl[5], inr[5], inl[2], inr[2], cospi_16_64, + &sl[5], &sr[5], &sl[2], &sr[2]); + butterfly_one_coeff_s32_fast(inl[4], inr[4], inl[3], inr[3], cospi_16_64, + &sl[4], &sr[4], &sl[3], &sr[3]); // step 3 sl[0] = vaddq_s32(inl[0], sl[3]); @@ -606,19 +565,18 @@ static void vpx_highbd_fdct16x16_body(int32x4_t *left /*[16]*/, sr[7] = vaddq_s32(inr[7], sr[4]); // step 4 - // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64) - // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64) - highbd_butterfly_two_coeff_s32(sl[6], sl[1], cospi_8_64, cospi_24_64, &sl[6], - &sl[1]); - highbd_butterfly_two_coeff_s32(sr[6], sr[1], cospi_8_64, cospi_24_64, &sr[6], - &sr[1]); - + // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * + // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] + // * cospi_8_64) + butterfly_two_coeff_s32_s64_narrow(sl[6], sr[6], sl[1], sr[1], cospi_8_64, + cospi_24_64, &sl[6], &sr[6], &sl[1], + &sr[1]); // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) - // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64) - highbd_butterfly_two_coeff_s32(xl[0], xl[3], cospi_24_64, cospi_8_64, &sl[2], - &sl[5]); - highbd_butterfly_two_coeff_s32(xr[0], xr[3], cospi_24_64, cospi_8_64, &sr[2], - &sr[5]); + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * + // cospi_24_64) + butterfly_two_coeff_s32_s64_narrow(xl[0], xr[0], xl[3], xr[3], cospi_24_64, + cospi_8_64, &sl[2], &sr[2], &sl[5], + &sr[5]); // step 5 stepl[0] = vaddq_s32(sl[0], sl[1]); @@ -639,31 +597,26 @@ static void vpx_highbd_fdct16x16_body(int32x4_t *left /*[16]*/, stepr[7] = vaddq_s32(sr[7], sr[6]); // step 6 - // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64) - // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64) - // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64) - // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64) - // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64) - // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] * - // cospi_22_64) out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * - // cospi_26_64) out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * - // cospi_6_64) - highbd_butterfly_two_coeff_s32(stepl[7], stepl[0], cospi_2_64, cospi_30_64, - &left[1], &left[15]); - highbd_butterfly_two_coeff_s32(stepr[7], stepr[0], cospi_2_64, cospi_30_64, - &right[1], &right[15]); - highbd_butterfly_two_coeff_s32(stepl[6], stepl[1], cospi_18_64, cospi_14_64, - &left[9], &left[7]); - highbd_butterfly_two_coeff_s32(stepr[6], stepr[1], cospi_18_64, cospi_14_64, - &right[9], &right[7]); - highbd_butterfly_two_coeff_s32(stepl[5], stepl[2], cospi_10_64, cospi_22_64, - &left[5], &left[11]); - highbd_butterfly_two_coeff_s32(stepr[5], stepr[2], cospi_10_64, cospi_22_64, - &right[5], &right[11]); - highbd_butterfly_two_coeff_s32(stepl[4], stepl[3], cospi_26_64, cospi_6_64, - &left[13], &left[3]); - highbd_butterfly_two_coeff_s32(stepr[4], stepr[3], cospi_26_64, cospi_6_64, - &right[13], &right[3]); + // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * 
cospi_14_64) + // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) + butterfly_two_coeff_s32_s64_narrow(stepl[6], stepr[6], stepl[1], stepr[1], + cospi_18_64, cospi_14_64, &left[9], + &right[9], &left[7], &right[7]); + // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) + // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) + butterfly_two_coeff_s32_s64_narrow(stepl[7], stepr[7], stepl[0], stepr[0], + cospi_2_64, cospi_30_64, &left[1], + &right[1], &left[15], &right[15]); + // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) + // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64) + butterfly_two_coeff_s32_s64_narrow(stepl[4], stepr[4], stepl[3], stepr[3], + cospi_26_64, cospi_6_64, &left[13], + &right[13], &left[3], &right[3]); + // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) + // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) + butterfly_two_coeff_s32_s64_narrow(stepl[5], stepr[5], stepl[2], stepr[2], + cospi_10_64, cospi_22_64, &left[5], + &right[5], &left[11], &right[11]); } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/arm/fdct32x32_neon.c b/vpx_dsp/arm/fdct32x32_neon.c index 51d81bd085..e2bf167604 100644 --- a/vpx_dsp/arm/fdct32x32_neon.c +++ b/vpx_dsp/arm/fdct32x32_neon.c @@ -16,6 +16,7 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/arm/fdct_neon.h" +#include "vpx_dsp/arm/fdct32x32_neon.h" // Most gcc 4.9 distributions outside of Android do not generate correct code // for this function. @@ -33,1123 +34,6 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, #else -#define LOAD_INCREMENT(src, stride, dest, index) \ - do { \ - dest[index] = vld1q_s16(src); \ - src += stride; \ - } while (0) - -#define ADD_S16(src, index0, index1, dest, index3) \ - do { \ - dest[index3] = vaddq_s16(src[index0], src[index1]); \ - } while (0) - -#define ADD_SHIFT_S16(src, index0, index1) \ - do { \ - src[index1] = vshlq_n_s16(vsubq_s16(src[index0], src[index1]), 2); \ - } while (0) - -// Load, cross, and multiply by 4. Load the first 8 and last 8, then the -// middle -// 16. Doing sets of 16 at a time. Maybe sets of 8 would be better? 
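/* [Editor's note: illustrative sketch, not part of this patch;
 * butterfly_two_coeff_ref is a hypothetical name.] The two-coefficient
 * butterflies reordered in the fdct16x16 bodies above all produce the pair
 *   fdct_round_shift(a * c1 + b * c2)  and  fdct_round_shift(a * c2 - b * c1),
 * which is the convention this patch homogenizes across call sites. In
 * scalar arithmetic (parameter order here is illustrative, not necessarily
 * the helper's exact signature):
 */
#include <stdint.h>

static void butterfly_two_coeff_ref(int32_t a, int32_t b, int32_t c1,
                                    int32_t c2, int32_t *out0, int32_t *out1) {
  *out0 = (int32_t)(((int64_t)a * c1 + (int64_t)b * c2 + (1 << 13)) >> 14);
  *out1 = (int32_t)(((int64_t)a * c2 - (int64_t)b * c1 + (1 << 13)) >> 14);
}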
-static INLINE void load(const int16_t *a, int stride, int16x8_t *b) { - const int16_t *a_end = a + 24 * stride; - int16x8_t c[8]; - - LOAD_INCREMENT(a, stride, b, 0); - LOAD_INCREMENT(a, stride, b, 1); - LOAD_INCREMENT(a, stride, b, 2); - LOAD_INCREMENT(a, stride, b, 3); - LOAD_INCREMENT(a, stride, b, 4); - LOAD_INCREMENT(a, stride, b, 5); - LOAD_INCREMENT(a, stride, b, 6); - LOAD_INCREMENT(a, stride, b, 7); - - LOAD_INCREMENT(a_end, stride, b, 24); - LOAD_INCREMENT(a_end, stride, b, 25); - LOAD_INCREMENT(a_end, stride, b, 26); - LOAD_INCREMENT(a_end, stride, b, 27); - LOAD_INCREMENT(a_end, stride, b, 28); - LOAD_INCREMENT(a_end, stride, b, 29); - LOAD_INCREMENT(a_end, stride, b, 30); - LOAD_INCREMENT(a_end, stride, b, 31); - - ADD_S16(b, 0, 31, c, 0); - ADD_S16(b, 1, 30, c, 1); - ADD_S16(b, 2, 29, c, 2); - ADD_S16(b, 3, 28, c, 3); - ADD_S16(b, 4, 27, c, 4); - ADD_S16(b, 5, 26, c, 5); - ADD_S16(b, 6, 25, c, 6); - ADD_S16(b, 7, 24, c, 7); - - ADD_SHIFT_S16(b, 7, 24); - ADD_SHIFT_S16(b, 6, 25); - ADD_SHIFT_S16(b, 5, 26); - ADD_SHIFT_S16(b, 4, 27); - ADD_SHIFT_S16(b, 3, 28); - ADD_SHIFT_S16(b, 2, 29); - ADD_SHIFT_S16(b, 1, 30); - ADD_SHIFT_S16(b, 0, 31); - - b[0] = vshlq_n_s16(c[0], 2); - b[1] = vshlq_n_s16(c[1], 2); - b[2] = vshlq_n_s16(c[2], 2); - b[3] = vshlq_n_s16(c[3], 2); - b[4] = vshlq_n_s16(c[4], 2); - b[5] = vshlq_n_s16(c[5], 2); - b[6] = vshlq_n_s16(c[6], 2); - b[7] = vshlq_n_s16(c[7], 2); - - LOAD_INCREMENT(a, stride, b, 8); - LOAD_INCREMENT(a, stride, b, 9); - LOAD_INCREMENT(a, stride, b, 10); - LOAD_INCREMENT(a, stride, b, 11); - LOAD_INCREMENT(a, stride, b, 12); - LOAD_INCREMENT(a, stride, b, 13); - LOAD_INCREMENT(a, stride, b, 14); - LOAD_INCREMENT(a, stride, b, 15); - LOAD_INCREMENT(a, stride, b, 16); - LOAD_INCREMENT(a, stride, b, 17); - LOAD_INCREMENT(a, stride, b, 18); - LOAD_INCREMENT(a, stride, b, 19); - LOAD_INCREMENT(a, stride, b, 20); - LOAD_INCREMENT(a, stride, b, 21); - LOAD_INCREMENT(a, stride, b, 22); - LOAD_INCREMENT(a, stride, b, 23); - - ADD_S16(b, 8, 23, c, 0); - ADD_S16(b, 9, 22, c, 1); - ADD_S16(b, 10, 21, c, 2); - ADD_S16(b, 11, 20, c, 3); - ADD_S16(b, 12, 19, c, 4); - ADD_S16(b, 13, 18, c, 5); - ADD_S16(b, 14, 17, c, 6); - ADD_S16(b, 15, 16, c, 7); - - ADD_SHIFT_S16(b, 15, 16); - ADD_SHIFT_S16(b, 14, 17); - ADD_SHIFT_S16(b, 13, 18); - ADD_SHIFT_S16(b, 12, 19); - ADD_SHIFT_S16(b, 11, 20); - ADD_SHIFT_S16(b, 10, 21); - ADD_SHIFT_S16(b, 9, 22); - ADD_SHIFT_S16(b, 8, 23); - - b[8] = vshlq_n_s16(c[0], 2); - b[9] = vshlq_n_s16(c[1], 2); - b[10] = vshlq_n_s16(c[2], 2); - b[11] = vshlq_n_s16(c[3], 2); - b[12] = vshlq_n_s16(c[4], 2); - b[13] = vshlq_n_s16(c[5], 2); - b[14] = vshlq_n_s16(c[6], 2); - b[15] = vshlq_n_s16(c[7], 2); -} - -#undef LOAD_INCREMENT -#undef ADD_S16 -#undef ADD_SHIFT_S16 - -#define STORE_S16(src, index, dest) \ - do { \ - store_s16q_to_tran_low(dest, src[index]); \ - dest += 8; \ - } while (0) - -// Store 32 16x8 values, assuming stride == 32. -// Slight twist: store horizontally in blocks of 8. 
-static INLINE void store(tran_low_t *a, const int16x8_t *b) { - STORE_S16(b, 0, a); - STORE_S16(b, 8, a); - STORE_S16(b, 16, a); - STORE_S16(b, 24, a); - STORE_S16(b, 1, a); - STORE_S16(b, 9, a); - STORE_S16(b, 17, a); - STORE_S16(b, 25, a); - STORE_S16(b, 2, a); - STORE_S16(b, 10, a); - STORE_S16(b, 18, a); - STORE_S16(b, 26, a); - STORE_S16(b, 3, a); - STORE_S16(b, 11, a); - STORE_S16(b, 19, a); - STORE_S16(b, 27, a); - STORE_S16(b, 4, a); - STORE_S16(b, 12, a); - STORE_S16(b, 20, a); - STORE_S16(b, 28, a); - STORE_S16(b, 5, a); - STORE_S16(b, 13, a); - STORE_S16(b, 21, a); - STORE_S16(b, 29, a); - STORE_S16(b, 6, a); - STORE_S16(b, 14, a); - STORE_S16(b, 22, a); - STORE_S16(b, 30, a); - STORE_S16(b, 7, a); - STORE_S16(b, 15, a); - STORE_S16(b, 23, a); - STORE_S16(b, 31, a); -} - -#undef STORE_S16 - -static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) { - int16x8_t a[32]; - int16x8_t b[32]; - - // Stage 1: Done as part of the load. - - // Stage 2. - // Mini cross. X the first 16 values and the middle 8 of the second half. - a[0] = vaddq_s16(in[0], in[15]); - a[1] = vaddq_s16(in[1], in[14]); - a[2] = vaddq_s16(in[2], in[13]); - a[3] = vaddq_s16(in[3], in[12]); - a[4] = vaddq_s16(in[4], in[11]); - a[5] = vaddq_s16(in[5], in[10]); - a[6] = vaddq_s16(in[6], in[9]); - a[7] = vaddq_s16(in[7], in[8]); - - a[8] = vsubq_s16(in[7], in[8]); - a[9] = vsubq_s16(in[6], in[9]); - a[10] = vsubq_s16(in[5], in[10]); - a[11] = vsubq_s16(in[4], in[11]); - a[12] = vsubq_s16(in[3], in[12]); - a[13] = vsubq_s16(in[2], in[13]); - a[14] = vsubq_s16(in[1], in[14]); - a[15] = vsubq_s16(in[0], in[15]); - - a[16] = in[16]; - a[17] = in[17]; - a[18] = in[18]; - a[19] = in[19]; - - butterfly_one_coeff(in[27], in[20], cospi_16_64, &a[27], &a[20]); - butterfly_one_coeff(in[26], in[21], cospi_16_64, &a[26], &a[21]); - butterfly_one_coeff(in[25], in[22], cospi_16_64, &a[25], &a[22]); - butterfly_one_coeff(in[24], in[23], cospi_16_64, &a[24], &a[23]); - - a[28] = in[28]; - a[29] = in[29]; - a[30] = in[30]; - a[31] = in[31]; - - // Stage 3. - b[0] = vaddq_s16(a[0], a[7]); - b[1] = vaddq_s16(a[1], a[6]); - b[2] = vaddq_s16(a[2], a[5]); - b[3] = vaddq_s16(a[3], a[4]); - - b[4] = vsubq_s16(a[3], a[4]); - b[5] = vsubq_s16(a[2], a[5]); - b[6] = vsubq_s16(a[1], a[6]); - b[7] = vsubq_s16(a[0], a[7]); - - b[8] = a[8]; - b[9] = a[9]; - - butterfly_one_coeff(a[13], a[10], cospi_16_64, &b[13], &b[10]); - butterfly_one_coeff(a[12], a[11], cospi_16_64, &b[12], &b[11]); - - b[14] = a[14]; - b[15] = a[15]; - - b[16] = vaddq_s16(in[16], a[23]); - b[17] = vaddq_s16(in[17], a[22]); - b[18] = vaddq_s16(in[18], a[21]); - b[19] = vaddq_s16(in[19], a[20]); - - b[20] = vsubq_s16(in[19], a[20]); - b[21] = vsubq_s16(in[18], a[21]); - b[22] = vsubq_s16(in[17], a[22]); - b[23] = vsubq_s16(in[16], a[23]); - - b[24] = vsubq_s16(in[31], a[24]); - b[25] = vsubq_s16(in[30], a[25]); - b[26] = vsubq_s16(in[29], a[26]); - b[27] = vsubq_s16(in[28], a[27]); - - b[28] = vaddq_s16(in[28], a[27]); - b[29] = vaddq_s16(in[29], a[26]); - b[30] = vaddq_s16(in[30], a[25]); - b[31] = vaddq_s16(in[31], a[24]); - - // Stage 4. 
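/* [Editor's note: illustrative check, not part of this patch.] The
 * cospi_k_64 constants threaded through these stages come from
 * vpx_dsp/txfm_common.h and are cos(k * pi / 64) scaled by 2^14, e.g.
 * cospi_16_64 == 11585. A quick standalone spot-check:
 */
#include <math.h>
#include <stdio.h>

int main(void) {
  const double pi = acos(-1.0);
  int k;
  for (k = 8; k <= 24; k += 8) {
    /* expect 15137 (k=8), 11585 (k=16), 6270 (k=24) */
    printf("round(2^14 * cos(%d * pi / 64)) = %.0f\n", k,
           floor(16384.0 * cos(k * pi / 64) + 0.5));
  }
  return 0;
}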
- a[0] = vaddq_s16(b[0], b[3]); - a[1] = vaddq_s16(b[1], b[2]); - a[2] = vsubq_s16(b[1], b[2]); - a[3] = vsubq_s16(b[0], b[3]); - - a[4] = b[4]; - - butterfly_one_coeff(b[6], b[5], cospi_16_64, &a[6], &a[5]); - - a[7] = b[7]; - - a[8] = vaddq_s16(b[8], b[11]); - a[9] = vaddq_s16(b[9], b[10]); - a[10] = vsubq_s16(b[9], b[10]); - a[11] = vsubq_s16(b[8], b[11]); - a[12] = vsubq_s16(b[15], b[12]); - a[13] = vsubq_s16(b[14], b[13]); - a[14] = vaddq_s16(b[14], b[13]); - a[15] = vaddq_s16(b[15], b[12]); - - a[16] = b[16]; - a[17] = b[17]; - - butterfly_two_coeff(b[29], b[18], cospi_24_64, cospi_8_64, &a[29], &a[18]); - butterfly_two_coeff(b[28], b[19], cospi_24_64, cospi_8_64, &a[28], &a[19]); - butterfly_two_coeff(b[27], b[20], -cospi_8_64, cospi_24_64, &a[27], &a[20]); - butterfly_two_coeff(b[26], b[21], -cospi_8_64, cospi_24_64, &a[26], &a[21]); - - a[22] = b[22]; - a[23] = b[23]; - a[24] = b[24]; - a[25] = b[25]; - - a[30] = b[30]; - a[31] = b[31]; - - // Stage 5. - butterfly_one_coeff(a[0], a[1], cospi_16_64, &b[0], &b[1]); - butterfly_two_coeff(a[3], a[2], cospi_24_64, cospi_8_64, &b[2], &b[3]); - - b[4] = vaddq_s16(a[4], a[5]); - b[5] = vsubq_s16(a[4], a[5]); - b[6] = vsubq_s16(a[7], a[6]); - b[7] = vaddq_s16(a[7], a[6]); - - b[8] = a[8]; - - butterfly_two_coeff(a[14], a[9], cospi_24_64, cospi_8_64, &b[14], &b[9]); - butterfly_two_coeff(a[13], a[10], -cospi_8_64, cospi_24_64, &b[13], &b[10]); - - b[11] = a[11]; - b[12] = a[12]; - - b[15] = a[15]; - - b[16] = vaddq_s16(a[19], a[16]); - b[17] = vaddq_s16(a[18], a[17]); - b[18] = vsubq_s16(a[17], a[18]); - b[19] = vsubq_s16(a[16], a[19]); - b[20] = vsubq_s16(a[23], a[20]); - b[21] = vsubq_s16(a[22], a[21]); - b[22] = vaddq_s16(a[21], a[22]); - b[23] = vaddq_s16(a[20], a[23]); - b[24] = vaddq_s16(a[27], a[24]); - b[25] = vaddq_s16(a[26], a[25]); - b[26] = vsubq_s16(a[25], a[26]); - b[27] = vsubq_s16(a[24], a[27]); - b[28] = vsubq_s16(a[31], a[28]); - b[29] = vsubq_s16(a[30], a[29]); - b[30] = vaddq_s16(a[29], a[30]); - b[31] = vaddq_s16(a[28], a[31]); - - // Stage 6. - a[0] = b[0]; - a[1] = b[1]; - a[2] = b[2]; - a[3] = b[3]; - - butterfly_two_coeff(b[7], b[4], cospi_28_64, cospi_4_64, &a[4], &a[7]); - butterfly_two_coeff(b[6], b[5], cospi_12_64, cospi_20_64, &a[5], &a[6]); - - a[8] = vaddq_s16(b[8], b[9]); - a[9] = vsubq_s16(b[8], b[9]); - a[10] = vsubq_s16(b[11], b[10]); - a[11] = vaddq_s16(b[11], b[10]); - a[12] = vaddq_s16(b[12], b[13]); - a[13] = vsubq_s16(b[12], b[13]); - a[14] = vsubq_s16(b[15], b[14]); - a[15] = vaddq_s16(b[15], b[14]); - - a[16] = b[16]; - a[19] = b[19]; - a[20] = b[20]; - a[23] = b[23]; - a[24] = b[24]; - a[27] = b[27]; - a[28] = b[28]; - a[31] = b[31]; - - butterfly_two_coeff(b[30], b[17], cospi_28_64, cospi_4_64, &a[30], &a[17]); - butterfly_two_coeff(b[29], b[18], -cospi_4_64, cospi_28_64, &a[29], &a[18]); - - butterfly_two_coeff(b[26], b[21], cospi_12_64, cospi_20_64, &a[26], &a[21]); - butterfly_two_coeff(b[25], b[22], -cospi_20_64, cospi_12_64, &a[25], &a[22]); - - // Stage 7. 
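/* [Editor's note: illustrative sketch, not part of this patch;
 * sub_round_shift_ref is a hypothetical name.] The final stage below folds
 * the first pass's partial rounding into the transform via
 * sub_round_shift(). Per the comment in that code, the scalar form is:
 */
#include <stdint.h>

static int16_t sub_round_shift_ref(int16_t a) {
  /* output = (temp_out + 1 + (temp_out > 0)) >> 2 */
  return (int16_t)((a + 1 + (a > 0)) >> 2);
}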
- b[0] = a[0]; - b[1] = a[1]; - b[2] = a[2]; - b[3] = a[3]; - b[4] = a[4]; - b[5] = a[5]; - b[6] = a[6]; - b[7] = a[7]; - - butterfly_two_coeff(a[15], a[8], cospi_30_64, cospi_2_64, &b[8], &b[15]); - butterfly_two_coeff(a[14], a[9], cospi_14_64, cospi_18_64, &b[9], &b[14]); - butterfly_two_coeff(a[13], a[10], cospi_22_64, cospi_10_64, &b[10], &b[13]); - butterfly_two_coeff(a[12], a[11], cospi_6_64, cospi_26_64, &b[11], &b[12]); - - b[16] = vaddq_s16(a[16], a[17]); - b[17] = vsubq_s16(a[16], a[17]); - b[18] = vsubq_s16(a[19], a[18]); - b[19] = vaddq_s16(a[19], a[18]); - b[20] = vaddq_s16(a[20], a[21]); - b[21] = vsubq_s16(a[20], a[21]); - b[22] = vsubq_s16(a[23], a[22]); - b[23] = vaddq_s16(a[23], a[22]); - b[24] = vaddq_s16(a[24], a[25]); - b[25] = vsubq_s16(a[24], a[25]); - b[26] = vsubq_s16(a[27], a[26]); - b[27] = vaddq_s16(a[27], a[26]); - b[28] = vaddq_s16(a[28], a[29]); - b[29] = vsubq_s16(a[28], a[29]); - b[30] = vsubq_s16(a[31], a[30]); - b[31] = vaddq_s16(a[31], a[30]); - - // Final stage. - // Also compute partial rounding shift: - // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; - out[0] = sub_round_shift(b[0]); - out[16] = sub_round_shift(b[1]); - out[8] = sub_round_shift(b[2]); - out[24] = sub_round_shift(b[3]); - out[4] = sub_round_shift(b[4]); - out[20] = sub_round_shift(b[5]); - out[12] = sub_round_shift(b[6]); - out[28] = sub_round_shift(b[7]); - out[2] = sub_round_shift(b[8]); - out[18] = sub_round_shift(b[9]); - out[10] = sub_round_shift(b[10]); - out[26] = sub_round_shift(b[11]); - out[6] = sub_round_shift(b[12]); - out[22] = sub_round_shift(b[13]); - out[14] = sub_round_shift(b[14]); - out[30] = sub_round_shift(b[15]); - - butterfly_two_coeff(b[31], b[16], cospi_31_64, cospi_1_64, &a[1], &a[31]); - out[1] = sub_round_shift(a[1]); - out[31] = sub_round_shift(a[31]); - - butterfly_two_coeff(b[30], b[17], cospi_15_64, cospi_17_64, &a[17], &a[15]); - out[17] = sub_round_shift(a[17]); - out[15] = sub_round_shift(a[15]); - - butterfly_two_coeff(b[29], b[18], cospi_23_64, cospi_9_64, &a[9], &a[23]); - out[9] = sub_round_shift(a[9]); - out[23] = sub_round_shift(a[23]); - - butterfly_two_coeff(b[28], b[19], cospi_7_64, cospi_25_64, &a[25], &a[7]); - out[25] = sub_round_shift(a[25]); - out[7] = sub_round_shift(a[7]); - - butterfly_two_coeff(b[27], b[20], cospi_27_64, cospi_5_64, &a[5], &a[27]); - out[5] = sub_round_shift(a[5]); - out[27] = sub_round_shift(a[27]); - - butterfly_two_coeff(b[26], b[21], cospi_11_64, cospi_21_64, &a[21], &a[11]); - out[21] = sub_round_shift(a[21]); - out[11] = sub_round_shift(a[11]); - - butterfly_two_coeff(b[25], b[22], cospi_19_64, cospi_13_64, &a[13], &a[19]); - out[13] = sub_round_shift(a[13]); - out[19] = sub_round_shift(a[19]); - - butterfly_two_coeff(b[24], b[23], cospi_3_64, cospi_29_64, &a[29], &a[3]); - out[29] = sub_round_shift(a[29]); - out[3] = sub_round_shift(a[3]); -} - -#define PASS_THROUGH(src, dst, element) \ - do { \ - dst##_lo[element] = src##_lo[element]; \ - dst##_hi[element] = src##_hi[element]; \ - } while (0) - -#define ADD_S16_S32(a, left_index, right_index, b, b_index) \ - do { \ - b##_lo[b_index] = \ - vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ - b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \ - vget_high_s16(a[right_index])); \ - } while (0) - -#define SUB_S16_S32(a, left_index, right_index, b, b_index) \ - do { \ - b##_lo[b_index] = \ - vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ - b##_hi[b_index] = 
vsubl_s16(vget_high_s16(a[left_index]), \ - vget_high_s16(a[right_index])); \ - } while (0) - -#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \ - do { \ - c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \ - c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \ - } while (0) - -#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \ - do { \ - temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \ - temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \ - c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \ - c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \ - } while (0) - -#define ADD_S32(a, left_index, right_index, b, b_index) \ - do { \ - b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \ - b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \ - } while (0) - -#define SUB_S32(a, left_index, right_index, b, b_index) \ - do { \ - b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \ - b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \ - } while (0) - -#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \ - add_index, sub_index) \ - do { \ - butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \ - &b##_lo[add_index], &b##_hi[add_index], \ - &b##_lo[sub_index], &b##_hi[sub_index]); \ - } while (0) - -#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \ - sub_index) \ - do { \ - butterfly_one_coeff_s32(a##_lo[left_index], a##_hi[left_index], \ - a##_lo[right_index], a##_hi[right_index], \ - constant, &b##_lo[add_index], &b##_hi[add_index], \ - &b##_lo[sub_index], &b##_hi[sub_index]); \ - } while (0) - -#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \ - right_constant, b, add_index, sub_index) \ - do { \ - butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \ - a##_lo[right_index], a##_hi[right_index], \ - left_constant, right_constant, &b##_lo[add_index], \ - &b##_hi[add_index], &b##_lo[sub_index], \ - &b##_hi[sub_index]); \ - } while (0) - -static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) { - int16x8_t a[32]; - int16x8_t b[32]; - int32x4_t c_lo[32]; - int32x4_t c_hi[32]; - int32x4_t d_lo[32]; - int32x4_t d_hi[32]; - - // Stage 1. Done as part of the load for the first pass. 
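/* [Editor's note: illustrative sketch, not part of this patch;
 * add_widen_ref is a hypothetical name.] The *_S16_S32 macros above carry
 * each 8-lane int16 vector as a lo/hi pair of int32x4 vectors once int16
 * headroom runs out (see the stage 3 overflow note below). The core
 * widening step they wrap:
 */
#include <arm_neon.h>

static void add_widen_ref(int16x8_t a, int16x8_t b, int32x4_t *lo,
                          int32x4_t *hi) {
  *lo = vaddl_s16(vget_low_s16(a), vget_low_s16(b));   /* lanes 0..3 -> s32 */
  *hi = vaddl_s16(vget_high_s16(a), vget_high_s16(b)); /* lanes 4..7 -> s32 */
}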
- a[0] = vaddq_s16(in[0], in[31]); - a[1] = vaddq_s16(in[1], in[30]); - a[2] = vaddq_s16(in[2], in[29]); - a[3] = vaddq_s16(in[3], in[28]); - a[4] = vaddq_s16(in[4], in[27]); - a[5] = vaddq_s16(in[5], in[26]); - a[6] = vaddq_s16(in[6], in[25]); - a[7] = vaddq_s16(in[7], in[24]); - a[8] = vaddq_s16(in[8], in[23]); - a[9] = vaddq_s16(in[9], in[22]); - a[10] = vaddq_s16(in[10], in[21]); - a[11] = vaddq_s16(in[11], in[20]); - a[12] = vaddq_s16(in[12], in[19]); - a[13] = vaddq_s16(in[13], in[18]); - a[14] = vaddq_s16(in[14], in[17]); - a[15] = vaddq_s16(in[15], in[16]); - a[16] = vsubq_s16(in[15], in[16]); - a[17] = vsubq_s16(in[14], in[17]); - a[18] = vsubq_s16(in[13], in[18]); - a[19] = vsubq_s16(in[12], in[19]); - a[20] = vsubq_s16(in[11], in[20]); - a[21] = vsubq_s16(in[10], in[21]); - a[22] = vsubq_s16(in[9], in[22]); - a[23] = vsubq_s16(in[8], in[23]); - a[24] = vsubq_s16(in[7], in[24]); - a[25] = vsubq_s16(in[6], in[25]); - a[26] = vsubq_s16(in[5], in[26]); - a[27] = vsubq_s16(in[4], in[27]); - a[28] = vsubq_s16(in[3], in[28]); - a[29] = vsubq_s16(in[2], in[29]); - a[30] = vsubq_s16(in[1], in[30]); - a[31] = vsubq_s16(in[0], in[31]); - - // Stage 2. - b[0] = vaddq_s16(a[0], a[15]); - b[1] = vaddq_s16(a[1], a[14]); - b[2] = vaddq_s16(a[2], a[13]); - b[3] = vaddq_s16(a[3], a[12]); - b[4] = vaddq_s16(a[4], a[11]); - b[5] = vaddq_s16(a[5], a[10]); - b[6] = vaddq_s16(a[6], a[9]); - b[7] = vaddq_s16(a[7], a[8]); - - b[8] = vsubq_s16(a[7], a[8]); - b[9] = vsubq_s16(a[6], a[9]); - b[10] = vsubq_s16(a[5], a[10]); - b[11] = vsubq_s16(a[4], a[11]); - b[12] = vsubq_s16(a[3], a[12]); - b[13] = vsubq_s16(a[2], a[13]); - b[14] = vsubq_s16(a[1], a[14]); - b[15] = vsubq_s16(a[0], a[15]); - - b[16] = a[16]; - b[17] = a[17]; - b[18] = a[18]; - b[19] = a[19]; - - butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]); - butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]); - butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]); - butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]); - - b[28] = a[28]; - b[29] = a[29]; - b[30] = a[30]; - b[31] = a[31]; - - // Stage 3. With extreme values for input this calculation rolls over int16_t. - // The sources for b[0] get added multiple times and, through testing, have - // been shown to overflow starting here. - ADD_S16_S32(b, 0, 7, c, 0); - ADD_S16_S32(b, 1, 6, c, 1); - ADD_S16_S32(b, 2, 5, c, 2); - ADD_S16_S32(b, 3, 4, c, 3); - SUB_S16_S32(b, 3, 4, c, 4); - SUB_S16_S32(b, 2, 5, c, 5); - SUB_S16_S32(b, 1, 6, c, 6); - SUB_S16_S32(b, 0, 7, c, 7); - - a[8] = b[8]; - a[9] = b[9]; - - BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10); - BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11); - - a[14] = b[14]; - a[15] = b[15]; - - ADD_S16_S32(b, 16, 23, c, 16); - ADD_S16_S32(b, 17, 22, c, 17); - ADD_S16_S32(b, 18, 21, c, 18); - ADD_S16_S32(b, 19, 20, c, 19); - SUB_S16_S32(b, 19, 20, c, 20); - SUB_S16_S32(b, 18, 21, c, 21); - SUB_S16_S32(b, 17, 22, c, 22); - SUB_S16_S32(b, 16, 23, c, 23); - SUB_S16_S32(b, 31, 24, c, 24); - SUB_S16_S32(b, 30, 25, c, 25); - SUB_S16_S32(b, 29, 26, c, 26); - SUB_S16_S32(b, 28, 27, c, 27); - ADD_S16_S32(b, 28, 27, c, 28); - ADD_S16_S32(b, 29, 26, c, 29); - ADD_S16_S32(b, 30, 25, c, 30); - ADD_S16_S32(b, 31, 24, c, 31); - - // Stage 4. 
- ADD_S32(c, 0, 3, d, 0); - ADD_S32(c, 1, 2, d, 1); - SUB_S32(c, 1, 2, d, 2); - SUB_S32(c, 0, 3, d, 3); - - PASS_THROUGH(c, d, 4); - - BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5); - - PASS_THROUGH(c, d, 7); - - ADDW_S16_S32(c, 11, a, 8, d, 8); - ADDW_S16_S32(c, 10, a, 9, d, 9); - SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10); - SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11); - SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12); - SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13); - ADDW_S16_S32(c, 13, b, 14, d, 14); - ADDW_S16_S32(c, 12, b, 15, d, 15); - - PASS_THROUGH(c, d, 16); - PASS_THROUGH(c, d, 17); - - BUTTERFLY_TWO_S32(c, 29, 18, cospi_24_64, cospi_8_64, d, 29, 18); - BUTTERFLY_TWO_S32(c, 28, 19, cospi_24_64, cospi_8_64, d, 28, 19); - BUTTERFLY_TWO_S32(c, 27, 20, -cospi_8_64, cospi_24_64, d, 27, 20); - BUTTERFLY_TWO_S32(c, 26, 21, -cospi_8_64, cospi_24_64, d, 26, 21); - - PASS_THROUGH(c, d, 22); - PASS_THROUGH(c, d, 23); - PASS_THROUGH(c, d, 24); - PASS_THROUGH(c, d, 25); - - PASS_THROUGH(c, d, 30); - PASS_THROUGH(c, d, 31); - - // Stage 5. - BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1); - BUTTERFLY_TWO_S32(d, 3, 2, cospi_24_64, cospi_8_64, c, 2, 3); - - ADD_S32(d, 4, 5, c, 4); - SUB_S32(d, 4, 5, c, 5); - SUB_S32(d, 7, 6, c, 6); - ADD_S32(d, 7, 6, c, 7); - - PASS_THROUGH(d, c, 8); - - BUTTERFLY_TWO_S32(d, 14, 9, cospi_24_64, cospi_8_64, c, 14, 9); - BUTTERFLY_TWO_S32(d, 13, 10, -cospi_8_64, cospi_24_64, c, 13, 10); - - PASS_THROUGH(d, c, 11); - PASS_THROUGH(d, c, 12); - PASS_THROUGH(d, c, 15); - - ADD_S32(d, 16, 19, c, 16); - ADD_S32(d, 17, 18, c, 17); - SUB_S32(d, 17, 18, c, 18); - SUB_S32(d, 16, 19, c, 19); - SUB_S32(d, 23, 20, c, 20); - SUB_S32(d, 22, 21, c, 21); - ADD_S32(d, 22, 21, c, 22); - ADD_S32(d, 23, 20, c, 23); - ADD_S32(d, 24, 27, c, 24); - ADD_S32(d, 25, 26, c, 25); - SUB_S32(d, 25, 26, c, 26); - SUB_S32(d, 24, 27, c, 27); - SUB_S32(d, 31, 28, c, 28); - SUB_S32(d, 30, 29, c, 29); - ADD_S32(d, 30, 29, c, 30); - ADD_S32(d, 31, 28, c, 31); - - // Stage 6. - PASS_THROUGH(c, d, 0); - PASS_THROUGH(c, d, 1); - PASS_THROUGH(c, d, 2); - PASS_THROUGH(c, d, 3); - - BUTTERFLY_TWO_S32(c, 7, 4, cospi_28_64, cospi_4_64, d, 4, 7); - BUTTERFLY_TWO_S32(c, 6, 5, cospi_12_64, cospi_20_64, d, 5, 6); - - ADD_S32(c, 8, 9, d, 8); - SUB_S32(c, 8, 9, d, 9); - SUB_S32(c, 11, 10, d, 10); - ADD_S32(c, 11, 10, d, 11); - ADD_S32(c, 12, 13, d, 12); - SUB_S32(c, 12, 13, d, 13); - SUB_S32(c, 15, 14, d, 14); - ADD_S32(c, 15, 14, d, 15); - - PASS_THROUGH(c, d, 16); - PASS_THROUGH(c, d, 19); - PASS_THROUGH(c, d, 20); - PASS_THROUGH(c, d, 23); - PASS_THROUGH(c, d, 24); - PASS_THROUGH(c, d, 27); - PASS_THROUGH(c, d, 28); - PASS_THROUGH(c, d, 31); - - BUTTERFLY_TWO_S32(c, 30, 17, cospi_28_64, cospi_4_64, d, 30, 17); - BUTTERFLY_TWO_S32(c, 29, 18, -cospi_4_64, cospi_28_64, d, 29, 18); - BUTTERFLY_TWO_S32(c, 26, 21, cospi_12_64, cospi_20_64, d, 26, 21); - BUTTERFLY_TWO_S32(c, 25, 22, -cospi_20_64, cospi_12_64, d, 25, 22); - - // Stage 7. 
- PASS_THROUGH(d, c, 0); - PASS_THROUGH(d, c, 1); - PASS_THROUGH(d, c, 2); - PASS_THROUGH(d, c, 3); - PASS_THROUGH(d, c, 4); - PASS_THROUGH(d, c, 5); - PASS_THROUGH(d, c, 6); - PASS_THROUGH(d, c, 7); - - BUTTERFLY_TWO_S32(d, 15, 8, cospi_30_64, cospi_2_64, c, 8, 15); - BUTTERFLY_TWO_S32(d, 14, 9, cospi_14_64, cospi_18_64, c, 9, 14); - BUTTERFLY_TWO_S32(d, 13, 10, cospi_22_64, cospi_10_64, c, 10, 13); - BUTTERFLY_TWO_S32(d, 12, 11, cospi_6_64, cospi_26_64, c, 11, 12); - - ADD_S32(d, 16, 17, c, 16); - SUB_S32(d, 16, 17, c, 17); - SUB_S32(d, 19, 18, c, 18); - ADD_S32(d, 19, 18, c, 19); - ADD_S32(d, 20, 21, c, 20); - SUB_S32(d, 20, 21, c, 21); - SUB_S32(d, 23, 22, c, 22); - ADD_S32(d, 23, 22, c, 23); - ADD_S32(d, 24, 25, c, 24); - SUB_S32(d, 24, 25, c, 25); - SUB_S32(d, 27, 26, c, 26); - ADD_S32(d, 27, 26, c, 27); - ADD_S32(d, 28, 29, c, 28); - SUB_S32(d, 28, 29, c, 29); - SUB_S32(d, 31, 30, c, 30); - ADD_S32(d, 31, 30, c, 31); - - // Final stage. - // Roll rounding into this function so we can pass back int16x8. - - out[0] = add_round_shift_s32(c_lo[0], c_hi[0]); - out[16] = add_round_shift_s32(c_lo[1], c_hi[1]); - - out[8] = add_round_shift_s32(c_lo[2], c_hi[2]); - out[24] = add_round_shift_s32(c_lo[3], c_hi[3]); - out[4] = add_round_shift_s32(c_lo[4], c_hi[4]); - out[20] = add_round_shift_s32(c_lo[5], c_hi[5]); - out[12] = add_round_shift_s32(c_lo[6], c_hi[6]); - - out[28] = add_round_shift_s32(c_lo[7], c_hi[7]); - out[2] = add_round_shift_s32(c_lo[8], c_hi[8]); - out[18] = add_round_shift_s32(c_lo[9], c_hi[9]); - out[10] = add_round_shift_s32(c_lo[10], c_hi[10]); - - out[26] = add_round_shift_s32(c_lo[11], c_hi[11]); - out[6] = add_round_shift_s32(c_lo[12], c_hi[12]); - out[22] = add_round_shift_s32(c_lo[13], c_hi[13]); - out[14] = add_round_shift_s32(c_lo[14], c_hi[14]); - out[30] = add_round_shift_s32(c_lo[15], c_hi[15]); - - BUTTERFLY_TWO_S32(c, 31, 16, cospi_31_64, cospi_1_64, d, 1, 31); - out[1] = add_round_shift_s32(d_lo[1], d_hi[1]); - out[31] = add_round_shift_s32(d_lo[31], d_hi[31]); - - BUTTERFLY_TWO_S32(c, 30, 17, cospi_15_64, cospi_17_64, d, 17, 15); - out[17] = add_round_shift_s32(d_lo[17], d_hi[17]); - out[15] = add_round_shift_s32(d_lo[15], d_hi[15]); - - BUTTERFLY_TWO_S32(c, 29, 18, cospi_23_64, cospi_9_64, d, 9, 23); - out[9] = add_round_shift_s32(d_lo[9], d_hi[9]); - out[23] = add_round_shift_s32(d_lo[23], d_hi[23]); - - BUTTERFLY_TWO_S32(c, 28, 19, cospi_7_64, cospi_25_64, d, 25, 7); - out[25] = add_round_shift_s32(d_lo[25], d_hi[25]); - out[7] = add_round_shift_s32(d_lo[7], d_hi[7]); - - BUTTERFLY_TWO_S32(c, 27, 20, cospi_27_64, cospi_5_64, d, 5, 27); - out[5] = add_round_shift_s32(d_lo[5], d_hi[5]); - out[27] = add_round_shift_s32(d_lo[27], d_hi[27]); - - BUTTERFLY_TWO_S32(c, 26, 21, cospi_11_64, cospi_21_64, d, 21, 11); - out[21] = add_round_shift_s32(d_lo[21], d_hi[21]); - out[11] = add_round_shift_s32(d_lo[11], d_hi[11]); - - BUTTERFLY_TWO_S32(c, 25, 22, cospi_19_64, cospi_13_64, d, 13, 19); - out[13] = add_round_shift_s32(d_lo[13], d_hi[13]); - out[19] = add_round_shift_s32(d_lo[19], d_hi[19]); - - BUTTERFLY_TWO_S32(c, 24, 23, cospi_3_64, cospi_29_64, d, 29, 3); - out[29] = add_round_shift_s32(d_lo[29], d_hi[29]); - out[3] = add_round_shift_s32(d_lo[3], d_hi[3]); -} - -static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) { - int16x8_t a[32]; - int16x8_t b[32]; - - // Stage 1. Done as part of the load for the first pass. 
- a[0] = vaddq_s16(in[0], in[31]); - a[1] = vaddq_s16(in[1], in[30]); - a[2] = vaddq_s16(in[2], in[29]); - a[3] = vaddq_s16(in[3], in[28]); - a[4] = vaddq_s16(in[4], in[27]); - a[5] = vaddq_s16(in[5], in[26]); - a[6] = vaddq_s16(in[6], in[25]); - a[7] = vaddq_s16(in[7], in[24]); - a[8] = vaddq_s16(in[8], in[23]); - a[9] = vaddq_s16(in[9], in[22]); - a[10] = vaddq_s16(in[10], in[21]); - a[11] = vaddq_s16(in[11], in[20]); - a[12] = vaddq_s16(in[12], in[19]); - a[13] = vaddq_s16(in[13], in[18]); - a[14] = vaddq_s16(in[14], in[17]); - a[15] = vaddq_s16(in[15], in[16]); - a[16] = vsubq_s16(in[15], in[16]); - a[17] = vsubq_s16(in[14], in[17]); - a[18] = vsubq_s16(in[13], in[18]); - a[19] = vsubq_s16(in[12], in[19]); - a[20] = vsubq_s16(in[11], in[20]); - a[21] = vsubq_s16(in[10], in[21]); - a[22] = vsubq_s16(in[9], in[22]); - a[23] = vsubq_s16(in[8], in[23]); - a[24] = vsubq_s16(in[7], in[24]); - a[25] = vsubq_s16(in[6], in[25]); - a[26] = vsubq_s16(in[5], in[26]); - a[27] = vsubq_s16(in[4], in[27]); - a[28] = vsubq_s16(in[3], in[28]); - a[29] = vsubq_s16(in[2], in[29]); - a[30] = vsubq_s16(in[1], in[30]); - a[31] = vsubq_s16(in[0], in[31]); - - // Stage 2. - // For the "rd" version, all the values are rounded down after stage 2 to keep - // the values in 16 bits. - b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15])); - b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14])); - b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13])); - b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12])); - b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11])); - b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10])); - b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9])); - b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8])); - - b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8])); - b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9])); - b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10])); - b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11])); - b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12])); - b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13])); - b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14])); - b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15])); - - b[16] = add_round_shift_s16(a[16]); - b[17] = add_round_shift_s16(a[17]); - b[18] = add_round_shift_s16(a[18]); - b[19] = add_round_shift_s16(a[19]); - - butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]); - butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]); - butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]); - butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]); - b[20] = add_round_shift_s16(b[20]); - b[21] = add_round_shift_s16(b[21]); - b[22] = add_round_shift_s16(b[22]); - b[23] = add_round_shift_s16(b[23]); - b[24] = add_round_shift_s16(b[24]); - b[25] = add_round_shift_s16(b[25]); - b[26] = add_round_shift_s16(b[26]); - b[27] = add_round_shift_s16(b[27]); - - b[28] = add_round_shift_s16(a[28]); - b[29] = add_round_shift_s16(a[29]); - b[30] = add_round_shift_s16(a[30]); - b[31] = add_round_shift_s16(a[31]); - - // Stage 3. 
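/* [Editor's note: illustrative sketch, not part of this patch;
 * add_round_shift_s16_ref is a hypothetical name.] The _rd variant trades
 * precision for speed: rounding immediately after stage 2 keeps every
 * later stage within int16, so none of the lo/hi int32 splitting used by
 * dct_body_second_pass() is needed. Assuming add_round_shift_s16() mirrors
 * the C reference's second-pass rounding, its scalar form would be:
 */
#include <stdint.h>

static int16_t add_round_shift_s16_ref(int16_t a) {
  /* assumed: (a + 1 + (a < 0)) >> 2 */
  return (int16_t)((a + 1 + (a < 0)) >> 2);
}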
- a[0] = vaddq_s16(b[0], b[7]); - a[1] = vaddq_s16(b[1], b[6]); - a[2] = vaddq_s16(b[2], b[5]); - a[3] = vaddq_s16(b[3], b[4]); - - a[4] = vsubq_s16(b[3], b[4]); - a[5] = vsubq_s16(b[2], b[5]); - a[6] = vsubq_s16(b[1], b[6]); - a[7] = vsubq_s16(b[0], b[7]); - - a[8] = b[8]; - a[9] = b[9]; - - butterfly_one_coeff(b[13], b[10], cospi_16_64, &a[13], &a[10]); - butterfly_one_coeff(b[12], b[11], cospi_16_64, &a[12], &a[11]); - - a[14] = b[14]; - a[15] = b[15]; - - a[16] = vaddq_s16(b[16], b[23]); - a[17] = vaddq_s16(b[17], b[22]); - a[18] = vaddq_s16(b[18], b[21]); - a[19] = vaddq_s16(b[19], b[20]); - - a[20] = vsubq_s16(b[19], b[20]); - a[21] = vsubq_s16(b[18], b[21]); - a[22] = vsubq_s16(b[17], b[22]); - a[23] = vsubq_s16(b[16], b[23]); - - a[24] = vsubq_s16(b[31], b[24]); - a[25] = vsubq_s16(b[30], b[25]); - a[26] = vsubq_s16(b[29], b[26]); - a[27] = vsubq_s16(b[28], b[27]); - - a[28] = vaddq_s16(b[28], b[27]); - a[29] = vaddq_s16(b[29], b[26]); - a[30] = vaddq_s16(b[30], b[25]); - a[31] = vaddq_s16(b[31], b[24]); - - // Stage 4. - b[0] = vaddq_s16(a[0], a[3]); - b[1] = vaddq_s16(a[1], a[2]); - b[2] = vsubq_s16(a[1], a[2]); - b[3] = vsubq_s16(a[0], a[3]); - - b[4] = a[4]; - - butterfly_one_coeff(a[6], a[5], cospi_16_64, &b[6], &b[5]); - - b[7] = a[7]; - - b[8] = vaddq_s16(a[8], a[11]); - b[9] = vaddq_s16(a[9], a[10]); - b[10] = vsubq_s16(a[9], a[10]); - b[11] = vsubq_s16(a[8], a[11]); - b[12] = vsubq_s16(a[15], a[12]); - b[13] = vsubq_s16(a[14], a[13]); - b[14] = vaddq_s16(a[14], a[13]); - b[15] = vaddq_s16(a[15], a[12]); - - b[16] = a[16]; - b[17] = a[17]; - - butterfly_two_coeff(a[29], a[18], cospi_24_64, cospi_8_64, &b[29], &b[18]); - butterfly_two_coeff(a[28], a[19], cospi_24_64, cospi_8_64, &b[28], &b[19]); - butterfly_two_coeff(a[27], a[20], -cospi_8_64, cospi_24_64, &b[27], &b[20]); - butterfly_two_coeff(a[26], a[21], -cospi_8_64, cospi_24_64, &b[26], &b[21]); - - b[22] = a[22]; - b[23] = a[23]; - b[24] = a[24]; - b[25] = a[25]; - - b[30] = a[30]; - b[31] = a[31]; - - // Stage 5. - butterfly_one_coeff(b[0], b[1], cospi_16_64, &a[0], &a[1]); - butterfly_two_coeff(b[3], b[2], cospi_24_64, cospi_8_64, &a[2], &a[3]); - - a[4] = vaddq_s16(b[4], b[5]); - a[5] = vsubq_s16(b[4], b[5]); - a[6] = vsubq_s16(b[7], b[6]); - a[7] = vaddq_s16(b[7], b[6]); - - a[8] = b[8]; - - butterfly_two_coeff(b[14], b[9], cospi_24_64, cospi_8_64, &a[14], &a[9]); - butterfly_two_coeff(b[13], b[10], -cospi_8_64, cospi_24_64, &a[13], &a[10]); - - a[11] = b[11]; - a[12] = b[12]; - - a[15] = b[15]; - - a[16] = vaddq_s16(b[19], b[16]); - a[17] = vaddq_s16(b[18], b[17]); - a[18] = vsubq_s16(b[17], b[18]); - a[19] = vsubq_s16(b[16], b[19]); - a[20] = vsubq_s16(b[23], b[20]); - a[21] = vsubq_s16(b[22], b[21]); - a[22] = vaddq_s16(b[21], b[22]); - a[23] = vaddq_s16(b[20], b[23]); - a[24] = vaddq_s16(b[27], b[24]); - a[25] = vaddq_s16(b[26], b[25]); - a[26] = vsubq_s16(b[25], b[26]); - a[27] = vsubq_s16(b[24], b[27]); - a[28] = vsubq_s16(b[31], b[28]); - a[29] = vsubq_s16(b[30], b[29]); - a[30] = vaddq_s16(b[29], b[30]); - a[31] = vaddq_s16(b[28], b[31]); - - // Stage 6. 
- b[0] = a[0]; - b[1] = a[1]; - b[2] = a[2]; - b[3] = a[3]; - - butterfly_two_coeff(a[7], a[4], cospi_28_64, cospi_4_64, &b[4], &b[7]); - butterfly_two_coeff(a[6], a[5], cospi_12_64, cospi_20_64, &b[5], &b[6]); - - b[8] = vaddq_s16(a[8], a[9]); - b[9] = vsubq_s16(a[8], a[9]); - b[10] = vsubq_s16(a[11], a[10]); - b[11] = vaddq_s16(a[11], a[10]); - b[12] = vaddq_s16(a[12], a[13]); - b[13] = vsubq_s16(a[12], a[13]); - b[14] = vsubq_s16(a[15], a[14]); - b[15] = vaddq_s16(a[15], a[14]); - - b[16] = a[16]; - b[19] = a[19]; - b[20] = a[20]; - b[23] = a[23]; - b[24] = a[24]; - b[27] = a[27]; - b[28] = a[28]; - b[31] = a[31]; - - butterfly_two_coeff(a[30], a[17], cospi_28_64, cospi_4_64, &b[30], &b[17]); - butterfly_two_coeff(a[29], a[18], -cospi_4_64, cospi_28_64, &b[29], &b[18]); - - butterfly_two_coeff(a[26], a[21], cospi_12_64, cospi_20_64, &b[26], &b[21]); - butterfly_two_coeff(a[25], a[22], -cospi_20_64, cospi_12_64, &b[25], &b[22]); - - // Stage 7. - a[0] = b[0]; - a[1] = b[1]; - a[2] = b[2]; - a[3] = b[3]; - a[4] = b[4]; - a[5] = b[5]; - a[6] = b[6]; - a[7] = b[7]; - - butterfly_two_coeff(b[15], b[8], cospi_30_64, cospi_2_64, &a[8], &a[15]); - butterfly_two_coeff(b[14], b[9], cospi_14_64, cospi_18_64, &a[9], &a[14]); - butterfly_two_coeff(b[13], b[10], cospi_22_64, cospi_10_64, &a[10], &a[13]); - butterfly_two_coeff(b[12], b[11], cospi_6_64, cospi_26_64, &a[11], &a[12]); - - a[16] = vaddq_s16(b[16], b[17]); - a[17] = vsubq_s16(b[16], b[17]); - a[18] = vsubq_s16(b[19], b[18]); - a[19] = vaddq_s16(b[19], b[18]); - a[20] = vaddq_s16(b[20], b[21]); - a[21] = vsubq_s16(b[20], b[21]); - a[22] = vsubq_s16(b[23], b[22]); - a[23] = vaddq_s16(b[23], b[22]); - a[24] = vaddq_s16(b[24], b[25]); - a[25] = vsubq_s16(b[24], b[25]); - a[26] = vsubq_s16(b[27], b[26]); - a[27] = vaddq_s16(b[27], b[26]); - a[28] = vaddq_s16(b[28], b[29]); - a[29] = vsubq_s16(b[28], b[29]); - a[30] = vsubq_s16(b[31], b[30]); - a[31] = vaddq_s16(b[31], b[30]); - - // Final stage. - out[0] = a[0]; - out[16] = a[1]; - out[8] = a[2]; - out[24] = a[3]; - out[4] = a[4]; - out[20] = a[5]; - out[12] = a[6]; - out[28] = a[7]; - out[2] = a[8]; - out[18] = a[9]; - out[10] = a[10]; - out[26] = a[11]; - out[6] = a[12]; - out[22] = a[13]; - out[14] = a[14]; - out[30] = a[15]; - - butterfly_two_coeff(a[31], a[16], cospi_31_64, cospi_1_64, &out[1], &out[31]); - butterfly_two_coeff(a[30], a[17], cospi_15_64, cospi_17_64, &out[17], - &out[15]); - butterfly_two_coeff(a[29], a[18], cospi_23_64, cospi_9_64, &out[9], &out[23]); - butterfly_two_coeff(a[28], a[19], cospi_7_64, cospi_25_64, &out[25], &out[7]); - butterfly_two_coeff(a[27], a[20], cospi_27_64, cospi_5_64, &out[5], &out[27]); - butterfly_two_coeff(a[26], a[21], cospi_11_64, cospi_21_64, &out[21], - &out[11]); - butterfly_two_coeff(a[25], a[22], cospi_19_64, cospi_13_64, &out[13], - &out[19]); - butterfly_two_coeff(a[24], a[23], cospi_3_64, cospi_29_64, &out[29], &out[3]); -} - -#undef PASS_THROUGH -#undef ADD_S16_S32 -#undef SUB_S16_S32 -#undef ADDW_S16_S32 -#undef SUBW_S16_S32 -#undef ADD_S32 -#undef SUB_S32 -#undef BUTTERFLY_ONE_S16_S32 -#undef BUTTERFLY_ONE_S32 -#undef BUTTERFLY_TWO_S32 - void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { int16x8_t temp0[32]; int16x8_t temp1[32]; @@ -1159,17 +43,21 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { int16x8_t temp5[32]; // Process in 8x32 columns. 
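/* [Editor's note: illustrative sketch, not part of this patch;
 * load_cross_scale_ref is a hypothetical name.] The rewritten drivers
 * below replace the old fused load() with load_cross() (the stage 1
 * cross: sums into b[0..15], mirrored differences into b[16..31]),
 * followed by scale_input(), assumed to apply the same *4 (<< 2) scaling
 * the old load() folded in. Per lane (the NEON code does 8 lanes at once):
 */
#include <stdint.h>

static void load_cross_scale_ref(const int16_t *a, int stride,
                                 int16_t *b /*[32]*/, int lane) {
  int i;
  for (i = 0; i < 16; ++i) {
    const int lo = a[i * stride + lane];
    const int hi = a[(31 - i) * stride + lane];
    b[i] = (int16_t)((lo + hi) * 4);      /* cross, then scale by 4 */
    b[31 - i] = (int16_t)((lo - hi) * 4);
  }
}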
- load(input, stride, temp0); - dct_body_first_pass(temp0, temp1); + load_cross(input, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp1); - load(input + 8, stride, temp0); - dct_body_first_pass(temp0, temp2); + load_cross(input + 8, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp2); - load(input + 16, stride, temp0); - dct_body_first_pass(temp0, temp3); + load_cross(input + 16, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp3); - load(input + 24, stride, temp0); - dct_body_first_pass(temp0, temp4); + load_cross(input + 24, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp4); // Generate the top row by munging the first set of 8 from each one together. transpose_s16_8x8_new(&temp1[0], &temp0[0]); @@ -1254,17 +142,21 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, int16x8_t temp5[32]; // Process in 8x32 columns. - load(input, stride, temp0); - dct_body_first_pass(temp0, temp1); + load_cross(input, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp1); - load(input + 8, stride, temp0); - dct_body_first_pass(temp0, temp2); + load_cross(input + 8, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp2); - load(input + 16, stride, temp0); - dct_body_first_pass(temp0, temp3); + load_cross(input + 16, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp3); - load(input + 24, stride, temp0); - dct_body_first_pass(temp0, temp4); + load_cross(input + 24, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp4); // Generate the top row by munging the first set of 8 from each one together. transpose_s16_8x8_new(&temp1[0], &temp0[0]); diff --git a/vpx_dsp/arm/fdct32x32_neon.h b/vpx_dsp/arm/fdct32x32_neon.h new file mode 100644 index 0000000000..dd647918b2 --- /dev/null +++ b/vpx_dsp/arm/fdct32x32_neon.h @@ -0,0 +1,1105 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+
+// Load & cross the first 8 and last 8, then the middle 16.
+static INLINE void load_cross(const int16_t *a, int stride, int16x8_t *b) {
+  b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride));
+  b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride));
+  b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride));
+  b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride));
+  b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride));
+  b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride));
+  b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride));
+  b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride));
+
+  b[24] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride));
+  b[25] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride));
+  b[26] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride));
+  b[27] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride));
+  b[28] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride));
+  b[29] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride));
+  b[30] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride));
+  b[31] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride));
+
+  b[8] = vaddq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride));
+  b[9] = vaddq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride));
+  b[10] = vaddq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride));
+  b[11] = vaddq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride));
+  b[12] = vaddq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride));
+  b[13] = vaddq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride));
+  b[14] = vaddq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride));
+  b[15] = vaddq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride));
+
+  b[16] = vsubq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride));
+  b[17] = vsubq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride));
+  b[18] = vsubq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride));
+  b[19] = vsubq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride));
+  b[20] = vsubq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride));
+  b[21] = vsubq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride));
+  b[22] = vsubq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride));
+  b[23] = vsubq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride));
+}
+
+#define STORE_S16(src, index, dest)           \
+  do {                                        \
+    store_s16q_to_tran_low(dest, src[index]); \
+    dest += 8;                                \
+  } while (0)
+
+// Store 32 16x8 values, assuming stride == 32.
+// Slight twist: store horizontally in blocks of 8.
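+// Four consecutive STORE_S16 calls (indices k, k + 8, k + 16, k + 24) emit
+// 32 values, i.e. one complete output row, before moving on to the next row.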
+static INLINE void store(tran_low_t *a, const int16x8_t *b) { + STORE_S16(b, 0, a); + STORE_S16(b, 8, a); + STORE_S16(b, 16, a); + STORE_S16(b, 24, a); + STORE_S16(b, 1, a); + STORE_S16(b, 9, a); + STORE_S16(b, 17, a); + STORE_S16(b, 25, a); + STORE_S16(b, 2, a); + STORE_S16(b, 10, a); + STORE_S16(b, 18, a); + STORE_S16(b, 26, a); + STORE_S16(b, 3, a); + STORE_S16(b, 11, a); + STORE_S16(b, 19, a); + STORE_S16(b, 27, a); + STORE_S16(b, 4, a); + STORE_S16(b, 12, a); + STORE_S16(b, 20, a); + STORE_S16(b, 28, a); + STORE_S16(b, 5, a); + STORE_S16(b, 13, a); + STORE_S16(b, 21, a); + STORE_S16(b, 29, a); + STORE_S16(b, 6, a); + STORE_S16(b, 14, a); + STORE_S16(b, 22, a); + STORE_S16(b, 30, a); + STORE_S16(b, 7, a); + STORE_S16(b, 15, a); + STORE_S16(b, 23, a); + STORE_S16(b, 31, a); +} + +#undef STORE_S16 + +static INLINE void scale_input(const int16x8_t *in /*32*/, + int16x8_t *out /*32*/) { + out[0] = vshlq_n_s16(in[0], 2); + out[1] = vshlq_n_s16(in[1], 2); + out[2] = vshlq_n_s16(in[2], 2); + out[3] = vshlq_n_s16(in[3], 2); + out[4] = vshlq_n_s16(in[4], 2); + out[5] = vshlq_n_s16(in[5], 2); + out[6] = vshlq_n_s16(in[6], 2); + out[7] = vshlq_n_s16(in[7], 2); + + out[8] = vshlq_n_s16(in[8], 2); + out[9] = vshlq_n_s16(in[9], 2); + out[10] = vshlq_n_s16(in[10], 2); + out[11] = vshlq_n_s16(in[11], 2); + out[12] = vshlq_n_s16(in[12], 2); + out[13] = vshlq_n_s16(in[13], 2); + out[14] = vshlq_n_s16(in[14], 2); + out[15] = vshlq_n_s16(in[15], 2); + + out[16] = vshlq_n_s16(in[16], 2); + out[17] = vshlq_n_s16(in[17], 2); + out[18] = vshlq_n_s16(in[18], 2); + out[19] = vshlq_n_s16(in[19], 2); + out[20] = vshlq_n_s16(in[20], 2); + out[21] = vshlq_n_s16(in[21], 2); + out[22] = vshlq_n_s16(in[22], 2); + out[23] = vshlq_n_s16(in[23], 2); + + out[24] = vshlq_n_s16(in[24], 2); + out[25] = vshlq_n_s16(in[25], 2); + out[26] = vshlq_n_s16(in[26], 2); + out[27] = vshlq_n_s16(in[27], 2); + out[28] = vshlq_n_s16(in[28], 2); + out[29] = vshlq_n_s16(in[29], 2); + out[30] = vshlq_n_s16(in[30], 2); + out[31] = vshlq_n_s16(in[31], 2); +} + +static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + + // Stage 1: Done as part of the load. + + // Stage 2. + // Mini cross. X the first 16 values and the middle 8 of the second half. + a[0] = vaddq_s16(in[0], in[15]); + a[1] = vaddq_s16(in[1], in[14]); + a[2] = vaddq_s16(in[2], in[13]); + a[3] = vaddq_s16(in[3], in[12]); + a[4] = vaddq_s16(in[4], in[11]); + a[5] = vaddq_s16(in[5], in[10]); + a[6] = vaddq_s16(in[6], in[9]); + a[7] = vaddq_s16(in[7], in[8]); + + a[8] = vsubq_s16(in[7], in[8]); + a[9] = vsubq_s16(in[6], in[9]); + a[10] = vsubq_s16(in[5], in[10]); + a[11] = vsubq_s16(in[4], in[11]); + a[12] = vsubq_s16(in[3], in[12]); + a[13] = vsubq_s16(in[2], in[13]); + a[14] = vsubq_s16(in[1], in[14]); + a[15] = vsubq_s16(in[0], in[15]); + + a[16] = in[16]; + a[17] = in[17]; + a[18] = in[18]; + a[19] = in[19]; + + butterfly_one_coeff_s16_s32_narrow(in[27], in[20], cospi_16_64, &a[27], + &a[20]); + butterfly_one_coeff_s16_s32_narrow(in[26], in[21], cospi_16_64, &a[26], + &a[21]); + butterfly_one_coeff_s16_s32_narrow(in[25], in[22], cospi_16_64, &a[25], + &a[22]); + butterfly_one_coeff_s16_s32_narrow(in[24], in[23], cospi_16_64, &a[24], + &a[23]); + + a[28] = in[28]; + a[29] = in[29]; + a[30] = in[30]; + a[31] = in[31]; + + // Stage 3. 
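+  // Note that b[16]..b[31] mix the raw stage 1 terms (in[16]..in[19],
+  // in[28]..in[31]) with the stage 2 butterfly outputs a[20]..a[27].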
+ b[0] = vaddq_s16(a[0], a[7]); + b[1] = vaddq_s16(a[1], a[6]); + b[2] = vaddq_s16(a[2], a[5]); + b[3] = vaddq_s16(a[3], a[4]); + + b[4] = vsubq_s16(a[3], a[4]); + b[5] = vsubq_s16(a[2], a[5]); + b[6] = vsubq_s16(a[1], a[6]); + b[7] = vsubq_s16(a[0], a[7]); + + b[8] = a[8]; + b[9] = a[9]; + + butterfly_one_coeff_s16_s32_narrow(a[13], a[10], cospi_16_64, &b[13], &b[10]); + butterfly_one_coeff_s16_s32_narrow(a[12], a[11], cospi_16_64, &b[12], &b[11]); + + b[14] = a[14]; + b[15] = a[15]; + + b[16] = vaddq_s16(in[16], a[23]); + b[17] = vaddq_s16(in[17], a[22]); + b[18] = vaddq_s16(in[18], a[21]); + b[19] = vaddq_s16(in[19], a[20]); + + b[20] = vsubq_s16(in[19], a[20]); + b[21] = vsubq_s16(in[18], a[21]); + b[22] = vsubq_s16(in[17], a[22]); + b[23] = vsubq_s16(in[16], a[23]); + + b[24] = vsubq_s16(in[31], a[24]); + b[25] = vsubq_s16(in[30], a[25]); + b[26] = vsubq_s16(in[29], a[26]); + b[27] = vsubq_s16(in[28], a[27]); + + b[28] = vaddq_s16(in[28], a[27]); + b[29] = vaddq_s16(in[29], a[26]); + b[30] = vaddq_s16(in[30], a[25]); + b[31] = vaddq_s16(in[31], a[24]); + + // Stage 4. + a[0] = vaddq_s16(b[0], b[3]); + a[1] = vaddq_s16(b[1], b[2]); + a[2] = vsubq_s16(b[1], b[2]); + a[3] = vsubq_s16(b[0], b[3]); + + a[4] = b[4]; + + butterfly_one_coeff_s16_s32_narrow(b[6], b[5], cospi_16_64, &a[6], &a[5]); + + a[7] = b[7]; + + a[8] = vaddq_s16(b[8], b[11]); + a[9] = vaddq_s16(b[9], b[10]); + a[10] = vsubq_s16(b[9], b[10]); + a[11] = vsubq_s16(b[8], b[11]); + a[12] = vsubq_s16(b[15], b[12]); + a[13] = vsubq_s16(b[14], b[13]); + a[14] = vaddq_s16(b[14], b[13]); + a[15] = vaddq_s16(b[15], b[12]); + + a[16] = b[16]; + a[17] = b[17]; + + butterfly_two_coeff(b[29], b[18], cospi_8_64, cospi_24_64, &a[29], &a[18]); + butterfly_two_coeff(b[28], b[19], cospi_8_64, cospi_24_64, &a[28], &a[19]); + butterfly_two_coeff(b[27], b[20], cospi_24_64, -cospi_8_64, &a[27], &a[20]); + butterfly_two_coeff(b[26], b[21], cospi_24_64, -cospi_8_64, &a[26], &a[21]); + + a[22] = b[22]; + a[23] = b[23]; + a[24] = b[24]; + a[25] = b[25]; + + a[30] = b[30]; + a[31] = b[31]; + + // Stage 5. + butterfly_one_coeff_s16_fast(a[0], a[1], cospi_16_64, &b[0], &b[1]); + butterfly_two_coeff(a[3], a[2], cospi_8_64, cospi_24_64, &b[2], &b[3]); + + b[4] = vaddq_s16(a[4], a[5]); + b[5] = vsubq_s16(a[4], a[5]); + b[6] = vsubq_s16(a[7], a[6]); + b[7] = vaddq_s16(a[7], a[6]); + + b[8] = a[8]; + + butterfly_two_coeff(a[14], a[9], cospi_8_64, cospi_24_64, &b[14], &b[9]); + butterfly_two_coeff(a[13], a[10], cospi_24_64, -cospi_8_64, &b[13], &b[10]); + + b[11] = a[11]; + b[12] = a[12]; + + b[15] = a[15]; + + b[16] = vaddq_s16(a[19], a[16]); + b[17] = vaddq_s16(a[18], a[17]); + b[18] = vsubq_s16(a[17], a[18]); + b[19] = vsubq_s16(a[16], a[19]); + b[20] = vsubq_s16(a[23], a[20]); + b[21] = vsubq_s16(a[22], a[21]); + b[22] = vaddq_s16(a[21], a[22]); + b[23] = vaddq_s16(a[20], a[23]); + b[24] = vaddq_s16(a[27], a[24]); + b[25] = vaddq_s16(a[26], a[25]); + b[26] = vsubq_s16(a[25], a[26]); + b[27] = vsubq_s16(a[24], a[27]); + b[28] = vsubq_s16(a[31], a[28]); + b[29] = vsubq_s16(a[30], a[29]); + b[30] = vaddq_s16(a[29], a[30]); + b[31] = vaddq_s16(a[28], a[31]); + + // Stage 6. 
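+  // In stages 6 and 7 several terms pass through unchanged; the a[x] = b[x]
+  // copies below are those pass-throughs.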
+ a[0] = b[0]; + a[1] = b[1]; + a[2] = b[2]; + a[3] = b[3]; + + butterfly_two_coeff(b[7], b[4], cospi_4_64, cospi_28_64, &a[4], &a[7]); + butterfly_two_coeff(b[6], b[5], cospi_20_64, cospi_12_64, &a[5], &a[6]); + + a[8] = vaddq_s16(b[8], b[9]); + a[9] = vsubq_s16(b[8], b[9]); + a[10] = vsubq_s16(b[11], b[10]); + a[11] = vaddq_s16(b[11], b[10]); + a[12] = vaddq_s16(b[12], b[13]); + a[13] = vsubq_s16(b[12], b[13]); + a[14] = vsubq_s16(b[15], b[14]); + a[15] = vaddq_s16(b[15], b[14]); + + a[16] = b[16]; + a[19] = b[19]; + a[20] = b[20]; + a[23] = b[23]; + a[24] = b[24]; + a[27] = b[27]; + a[28] = b[28]; + a[31] = b[31]; + + butterfly_two_coeff(b[30], b[17], cospi_4_64, cospi_28_64, &a[30], &a[17]); + butterfly_two_coeff(b[29], b[18], cospi_28_64, -cospi_4_64, &a[29], &a[18]); + + butterfly_two_coeff(b[26], b[21], cospi_20_64, cospi_12_64, &a[26], &a[21]); + butterfly_two_coeff(b[25], b[22], cospi_12_64, -cospi_20_64, &a[25], &a[22]); + + // Stage 7. + b[0] = a[0]; + b[1] = a[1]; + b[2] = a[2]; + b[3] = a[3]; + b[4] = a[4]; + b[5] = a[5]; + b[6] = a[6]; + b[7] = a[7]; + + butterfly_two_coeff(a[15], a[8], cospi_2_64, cospi_30_64, &b[8], &b[15]); + butterfly_two_coeff(a[14], a[9], cospi_18_64, cospi_14_64, &b[9], &b[14]); + butterfly_two_coeff(a[13], a[10], cospi_10_64, cospi_22_64, &b[10], &b[13]); + butterfly_two_coeff(a[12], a[11], cospi_26_64, cospi_6_64, &b[11], &b[12]); + + b[16] = vaddq_s16(a[16], a[17]); + b[17] = vsubq_s16(a[16], a[17]); + b[18] = vsubq_s16(a[19], a[18]); + b[19] = vaddq_s16(a[19], a[18]); + b[20] = vaddq_s16(a[20], a[21]); + b[21] = vsubq_s16(a[20], a[21]); + b[22] = vsubq_s16(a[23], a[22]); + b[23] = vaddq_s16(a[23], a[22]); + b[24] = vaddq_s16(a[24], a[25]); + b[25] = vsubq_s16(a[24], a[25]); + b[26] = vsubq_s16(a[27], a[26]); + b[27] = vaddq_s16(a[27], a[26]); + b[28] = vaddq_s16(a[28], a[29]); + b[29] = vsubq_s16(a[28], a[29]); + b[30] = vsubq_s16(a[31], a[30]); + b[31] = vaddq_s16(a[31], a[30]); + + // Final stage. 
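+  // Outputs are written in the interleaved fdct32 coefficient order: the
+  // even rows come straight from b[0]..b[15], the odd rows from one last set
+  // of two-coefficient butterflies on b[16]..b[31].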
+ // Also compute partial rounding shift: + // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + out[0] = sub_round_shift_s16(b[0]); + out[16] = sub_round_shift_s16(b[1]); + out[8] = sub_round_shift_s16(b[2]); + out[24] = sub_round_shift_s16(b[3]); + out[4] = sub_round_shift_s16(b[4]); + out[20] = sub_round_shift_s16(b[5]); + out[12] = sub_round_shift_s16(b[6]); + out[28] = sub_round_shift_s16(b[7]); + out[2] = sub_round_shift_s16(b[8]); + out[18] = sub_round_shift_s16(b[9]); + out[10] = sub_round_shift_s16(b[10]); + out[26] = sub_round_shift_s16(b[11]); + out[6] = sub_round_shift_s16(b[12]); + out[22] = sub_round_shift_s16(b[13]); + out[14] = sub_round_shift_s16(b[14]); + out[30] = sub_round_shift_s16(b[15]); + + butterfly_two_coeff(b[31], b[16], cospi_1_64, cospi_31_64, &a[1], &a[31]); + out[1] = sub_round_shift_s16(a[1]); + out[31] = sub_round_shift_s16(a[31]); + + butterfly_two_coeff(b[30], b[17], cospi_17_64, cospi_15_64, &a[17], &a[15]); + out[17] = sub_round_shift_s16(a[17]); + out[15] = sub_round_shift_s16(a[15]); + + butterfly_two_coeff(b[29], b[18], cospi_9_64, cospi_23_64, &a[9], &a[23]); + out[9] = sub_round_shift_s16(a[9]); + out[23] = sub_round_shift_s16(a[23]); + + butterfly_two_coeff(b[28], b[19], cospi_25_64, cospi_7_64, &a[25], &a[7]); + out[25] = sub_round_shift_s16(a[25]); + out[7] = sub_round_shift_s16(a[7]); + + butterfly_two_coeff(b[27], b[20], cospi_5_64, cospi_27_64, &a[5], &a[27]); + out[5] = sub_round_shift_s16(a[5]); + out[27] = sub_round_shift_s16(a[27]); + + butterfly_two_coeff(b[26], b[21], cospi_21_64, cospi_11_64, &a[21], &a[11]); + out[21] = sub_round_shift_s16(a[21]); + out[11] = sub_round_shift_s16(a[11]); + + butterfly_two_coeff(b[25], b[22], cospi_13_64, cospi_19_64, &a[13], &a[19]); + out[13] = sub_round_shift_s16(a[13]); + out[19] = sub_round_shift_s16(a[19]); + + butterfly_two_coeff(b[24], b[23], cospi_29_64, cospi_3_64, &a[29], &a[3]); + out[29] = sub_round_shift_s16(a[29]); + out[3] = sub_round_shift_s16(a[3]); +} + +#define PASS_THROUGH(src, dst, element) \ + do { \ + dst##_lo[element] = src##_lo[element]; \ + dst##_hi[element] = src##_hi[element]; \ + } while (0) + +#define ADD_S16_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = \ + vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ + b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \ + vget_high_s16(a[right_index])); \ + } while (0) + +#define SUB_S16_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = \ + vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ + b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]), \ + vget_high_s16(a[right_index])); \ + } while (0) + +#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \ + do { \ + c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \ + c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \ + } while (0) + +#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \ + do { \ + temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \ + temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \ + c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \ + c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \ + } while (0) + +#define ADD_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \ + b##_hi[b_index] = vaddq_s32(a##_hi[left_index], 
a##_hi[right_index]); \ + } while (0) + +#define SUB_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \ + b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \ + } while (0) + +#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \ + add_index, sub_index) \ + do { \ + butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \ + &b##_lo[add_index], &b##_hi[add_index], \ + &b##_lo[sub_index], &b##_hi[sub_index]); \ + } while (0) + +#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \ + sub_index) \ + do { \ + butterfly_one_coeff_s32_fast( \ + a##_lo[left_index], a##_hi[left_index], a##_lo[right_index], \ + a##_hi[right_index], constant, &b##_lo[add_index], &b##_hi[add_index], \ + &b##_lo[sub_index], &b##_hi[sub_index]); \ + } while (0) + +#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \ + right_constant, b, add_index, sub_index) \ + do { \ + butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \ + a##_lo[right_index], a##_hi[right_index], \ + left_constant, right_constant, &b##_lo[add_index], \ + &b##_hi[add_index], &b##_lo[sub_index], \ + &b##_hi[sub_index]); \ + } while (0) + +static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + int32x4_t c_lo[32]; + int32x4_t c_hi[32]; + int32x4_t d_lo[32]; + int32x4_t d_hi[32]; + + // Stage 1. Done as part of the load for the first pass. + a[0] = vaddq_s16(in[0], in[31]); + a[1] = vaddq_s16(in[1], in[30]); + a[2] = vaddq_s16(in[2], in[29]); + a[3] = vaddq_s16(in[3], in[28]); + a[4] = vaddq_s16(in[4], in[27]); + a[5] = vaddq_s16(in[5], in[26]); + a[6] = vaddq_s16(in[6], in[25]); + a[7] = vaddq_s16(in[7], in[24]); + a[8] = vaddq_s16(in[8], in[23]); + a[9] = vaddq_s16(in[9], in[22]); + a[10] = vaddq_s16(in[10], in[21]); + a[11] = vaddq_s16(in[11], in[20]); + a[12] = vaddq_s16(in[12], in[19]); + a[13] = vaddq_s16(in[13], in[18]); + a[14] = vaddq_s16(in[14], in[17]); + a[15] = vaddq_s16(in[15], in[16]); + a[16] = vsubq_s16(in[15], in[16]); + a[17] = vsubq_s16(in[14], in[17]); + a[18] = vsubq_s16(in[13], in[18]); + a[19] = vsubq_s16(in[12], in[19]); + a[20] = vsubq_s16(in[11], in[20]); + a[21] = vsubq_s16(in[10], in[21]); + a[22] = vsubq_s16(in[9], in[22]); + a[23] = vsubq_s16(in[8], in[23]); + a[24] = vsubq_s16(in[7], in[24]); + a[25] = vsubq_s16(in[6], in[25]); + a[26] = vsubq_s16(in[5], in[26]); + a[27] = vsubq_s16(in[4], in[27]); + a[28] = vsubq_s16(in[3], in[28]); + a[29] = vsubq_s16(in[2], in[29]); + a[30] = vsubq_s16(in[1], in[30]); + a[31] = vsubq_s16(in[0], in[31]); + + // Stage 2. 
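+  // Stage 2 is still safe in 16 bits; the widening to 32 bits starts in
+  // stage 3 below, where extreme inputs have been shown to overflow int16_t.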
+ b[0] = vaddq_s16(a[0], a[15]); + b[1] = vaddq_s16(a[1], a[14]); + b[2] = vaddq_s16(a[2], a[13]); + b[3] = vaddq_s16(a[3], a[12]); + b[4] = vaddq_s16(a[4], a[11]); + b[5] = vaddq_s16(a[5], a[10]); + b[6] = vaddq_s16(a[6], a[9]); + b[7] = vaddq_s16(a[7], a[8]); + + b[8] = vsubq_s16(a[7], a[8]); + b[9] = vsubq_s16(a[6], a[9]); + b[10] = vsubq_s16(a[5], a[10]); + b[11] = vsubq_s16(a[4], a[11]); + b[12] = vsubq_s16(a[3], a[12]); + b[13] = vsubq_s16(a[2], a[13]); + b[14] = vsubq_s16(a[1], a[14]); + b[15] = vsubq_s16(a[0], a[15]); + + b[16] = a[16]; + b[17] = a[17]; + b[18] = a[18]; + b[19] = a[19]; + + butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]); + butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]); + butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]); + butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]); + + b[28] = a[28]; + b[29] = a[29]; + b[30] = a[30]; + b[31] = a[31]; + + // Stage 3. With extreme values for input this calculation rolls over int16_t. + // The sources for b[0] get added multiple times and, through testing, have + // been shown to overflow starting here. + ADD_S16_S32(b, 0, 7, c, 0); + ADD_S16_S32(b, 1, 6, c, 1); + ADD_S16_S32(b, 2, 5, c, 2); + ADD_S16_S32(b, 3, 4, c, 3); + SUB_S16_S32(b, 3, 4, c, 4); + SUB_S16_S32(b, 2, 5, c, 5); + SUB_S16_S32(b, 1, 6, c, 6); + SUB_S16_S32(b, 0, 7, c, 7); + + a[8] = b[8]; + a[9] = b[9]; + + BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10); + BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11); + + a[14] = b[14]; + a[15] = b[15]; + + ADD_S16_S32(b, 16, 23, c, 16); + ADD_S16_S32(b, 17, 22, c, 17); + ADD_S16_S32(b, 18, 21, c, 18); + ADD_S16_S32(b, 19, 20, c, 19); + SUB_S16_S32(b, 19, 20, c, 20); + SUB_S16_S32(b, 18, 21, c, 21); + SUB_S16_S32(b, 17, 22, c, 22); + SUB_S16_S32(b, 16, 23, c, 23); + SUB_S16_S32(b, 31, 24, c, 24); + SUB_S16_S32(b, 30, 25, c, 25); + SUB_S16_S32(b, 29, 26, c, 26); + SUB_S16_S32(b, 28, 27, c, 27); + ADD_S16_S32(b, 28, 27, c, 28); + ADD_S16_S32(b, 29, 26, c, 29); + ADD_S16_S32(b, 30, 25, c, 30); + ADD_S16_S32(b, 31, 24, c, 31); + + // Stage 4. + ADD_S32(c, 0, 3, d, 0); + ADD_S32(c, 1, 2, d, 1); + SUB_S32(c, 1, 2, d, 2); + SUB_S32(c, 0, 3, d, 3); + + PASS_THROUGH(c, d, 4); + + BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5); + + PASS_THROUGH(c, d, 7); + + ADDW_S16_S32(c, 11, a, 8, d, 8); + ADDW_S16_S32(c, 10, a, 9, d, 9); + SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10); + SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11); + SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12); + SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13); + ADDW_S16_S32(c, 13, b, 14, d, 14); + ADDW_S16_S32(c, 12, b, 15, d, 15); + + PASS_THROUGH(c, d, 16); + PASS_THROUGH(c, d, 17); + + BUTTERFLY_TWO_S32(c, 29, 18, cospi_8_64, cospi_24_64, d, 29, 18); + BUTTERFLY_TWO_S32(c, 28, 19, cospi_8_64, cospi_24_64, d, 28, 19); + BUTTERFLY_TWO_S32(c, 27, 20, cospi_24_64, -cospi_8_64, d, 27, 20); + BUTTERFLY_TWO_S32(c, 26, 21, cospi_24_64, -cospi_8_64, d, 26, 21); + + PASS_THROUGH(c, d, 22); + PASS_THROUGH(c, d, 23); + PASS_THROUGH(c, d, 24); + PASS_THROUGH(c, d, 25); + + PASS_THROUGH(c, d, 30); + PASS_THROUGH(c, d, 31); + + // Stage 5. 
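+  // From stage 3 onward the data lives in 32-bit lo/hi halves; the stages
+  // simply alternate between the c_lo/c_hi and d_lo/d_hi arrays.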
+ BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1); + BUTTERFLY_TWO_S32(d, 3, 2, cospi_8_64, cospi_24_64, c, 2, 3); + + ADD_S32(d, 4, 5, c, 4); + SUB_S32(d, 4, 5, c, 5); + SUB_S32(d, 7, 6, c, 6); + ADD_S32(d, 7, 6, c, 7); + + PASS_THROUGH(d, c, 8); + + BUTTERFLY_TWO_S32(d, 14, 9, cospi_8_64, cospi_24_64, c, 14, 9); + BUTTERFLY_TWO_S32(d, 13, 10, cospi_24_64, -cospi_8_64, c, 13, 10); + + PASS_THROUGH(d, c, 11); + PASS_THROUGH(d, c, 12); + PASS_THROUGH(d, c, 15); + + ADD_S32(d, 16, 19, c, 16); + ADD_S32(d, 17, 18, c, 17); + SUB_S32(d, 17, 18, c, 18); + SUB_S32(d, 16, 19, c, 19); + SUB_S32(d, 23, 20, c, 20); + SUB_S32(d, 22, 21, c, 21); + ADD_S32(d, 22, 21, c, 22); + ADD_S32(d, 23, 20, c, 23); + ADD_S32(d, 24, 27, c, 24); + ADD_S32(d, 25, 26, c, 25); + SUB_S32(d, 25, 26, c, 26); + SUB_S32(d, 24, 27, c, 27); + SUB_S32(d, 31, 28, c, 28); + SUB_S32(d, 30, 29, c, 29); + ADD_S32(d, 30, 29, c, 30); + ADD_S32(d, 31, 28, c, 31); + + // Stage 6. + PASS_THROUGH(c, d, 0); + PASS_THROUGH(c, d, 1); + PASS_THROUGH(c, d, 2); + PASS_THROUGH(c, d, 3); + + BUTTERFLY_TWO_S32(c, 7, 4, cospi_4_64, cospi_28_64, d, 4, 7); + BUTTERFLY_TWO_S32(c, 6, 5, cospi_20_64, cospi_12_64, d, 5, 6); + + ADD_S32(c, 8, 9, d, 8); + SUB_S32(c, 8, 9, d, 9); + SUB_S32(c, 11, 10, d, 10); + ADD_S32(c, 11, 10, d, 11); + ADD_S32(c, 12, 13, d, 12); + SUB_S32(c, 12, 13, d, 13); + SUB_S32(c, 15, 14, d, 14); + ADD_S32(c, 15, 14, d, 15); + + PASS_THROUGH(c, d, 16); + PASS_THROUGH(c, d, 19); + PASS_THROUGH(c, d, 20); + PASS_THROUGH(c, d, 23); + PASS_THROUGH(c, d, 24); + PASS_THROUGH(c, d, 27); + PASS_THROUGH(c, d, 28); + PASS_THROUGH(c, d, 31); + + BUTTERFLY_TWO_S32(c, 30, 17, cospi_4_64, cospi_28_64, d, 30, 17); + BUTTERFLY_TWO_S32(c, 29, 18, cospi_28_64, -cospi_4_64, d, 29, 18); + BUTTERFLY_TWO_S32(c, 26, 21, cospi_20_64, cospi_12_64, d, 26, 21); + BUTTERFLY_TWO_S32(c, 25, 22, cospi_12_64, -cospi_20_64, d, 25, 22); + + // Stage 7. + PASS_THROUGH(d, c, 0); + PASS_THROUGH(d, c, 1); + PASS_THROUGH(d, c, 2); + PASS_THROUGH(d, c, 3); + PASS_THROUGH(d, c, 4); + PASS_THROUGH(d, c, 5); + PASS_THROUGH(d, c, 6); + PASS_THROUGH(d, c, 7); + + BUTTERFLY_TWO_S32(d, 15, 8, cospi_2_64, cospi_30_64, c, 8, 15); + BUTTERFLY_TWO_S32(d, 14, 9, cospi_18_64, cospi_14_64, c, 9, 14); + BUTTERFLY_TWO_S32(d, 13, 10, cospi_10_64, cospi_22_64, c, 10, 13); + BUTTERFLY_TWO_S32(d, 12, 11, cospi_26_64, cospi_6_64, c, 11, 12); + + ADD_S32(d, 16, 17, c, 16); + SUB_S32(d, 16, 17, c, 17); + SUB_S32(d, 19, 18, c, 18); + ADD_S32(d, 19, 18, c, 19); + ADD_S32(d, 20, 21, c, 20); + SUB_S32(d, 20, 21, c, 21); + SUB_S32(d, 23, 22, c, 22); + ADD_S32(d, 23, 22, c, 23); + ADD_S32(d, 24, 25, c, 24); + SUB_S32(d, 24, 25, c, 25); + SUB_S32(d, 27, 26, c, 26); + ADD_S32(d, 27, 26, c, 27); + ADD_S32(d, 28, 29, c, 28); + SUB_S32(d, 28, 29, c, 29); + SUB_S32(d, 31, 30, c, 30); + ADD_S32(d, 31, 30, c, 31); + + // Final stage. + // Roll rounding into this function so we can pass back int16x8. 
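+  // A scalar sketch of the narrowing below, assuming add_round_shift_s32_narrow
+  // mirrors the C reference's second-pass rounding:
+  //   int32_t t;  // one 32-bit lane
+  //   out_lane = (int16_t)((t + 1 + (t < 0)) >> 2);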
+ + out[0] = add_round_shift_s32_narrow(c_lo[0], c_hi[0]); + out[16] = add_round_shift_s32_narrow(c_lo[1], c_hi[1]); + + out[8] = add_round_shift_s32_narrow(c_lo[2], c_hi[2]); + out[24] = add_round_shift_s32_narrow(c_lo[3], c_hi[3]); + out[4] = add_round_shift_s32_narrow(c_lo[4], c_hi[4]); + out[20] = add_round_shift_s32_narrow(c_lo[5], c_hi[5]); + out[12] = add_round_shift_s32_narrow(c_lo[6], c_hi[6]); + + out[28] = add_round_shift_s32_narrow(c_lo[7], c_hi[7]); + out[2] = add_round_shift_s32_narrow(c_lo[8], c_hi[8]); + out[18] = add_round_shift_s32_narrow(c_lo[9], c_hi[9]); + out[10] = add_round_shift_s32_narrow(c_lo[10], c_hi[10]); + + out[26] = add_round_shift_s32_narrow(c_lo[11], c_hi[11]); + out[6] = add_round_shift_s32_narrow(c_lo[12], c_hi[12]); + out[22] = add_round_shift_s32_narrow(c_lo[13], c_hi[13]); + out[14] = add_round_shift_s32_narrow(c_lo[14], c_hi[14]); + out[30] = add_round_shift_s32_narrow(c_lo[15], c_hi[15]); + + BUTTERFLY_TWO_S32(c, 31, 16, cospi_1_64, cospi_31_64, d, 1, 31); + out[1] = add_round_shift_s32_narrow(d_lo[1], d_hi[1]); + out[31] = add_round_shift_s32_narrow(d_lo[31], d_hi[31]); + + BUTTERFLY_TWO_S32(c, 30, 17, cospi_17_64, cospi_15_64, d, 17, 15); + out[17] = add_round_shift_s32_narrow(d_lo[17], d_hi[17]); + out[15] = add_round_shift_s32_narrow(d_lo[15], d_hi[15]); + + BUTTERFLY_TWO_S32(c, 29, 18, cospi_9_64, cospi_23_64, d, 9, 23); + out[9] = add_round_shift_s32_narrow(d_lo[9], d_hi[9]); + out[23] = add_round_shift_s32_narrow(d_lo[23], d_hi[23]); + + BUTTERFLY_TWO_S32(c, 28, 19, cospi_25_64, cospi_7_64, d, 25, 7); + out[25] = add_round_shift_s32_narrow(d_lo[25], d_hi[25]); + out[7] = add_round_shift_s32_narrow(d_lo[7], d_hi[7]); + + BUTTERFLY_TWO_S32(c, 27, 20, cospi_5_64, cospi_27_64, d, 5, 27); + out[5] = add_round_shift_s32_narrow(d_lo[5], d_hi[5]); + out[27] = add_round_shift_s32_narrow(d_lo[27], d_hi[27]); + + BUTTERFLY_TWO_S32(c, 26, 21, cospi_21_64, cospi_11_64, d, 21, 11); + out[21] = add_round_shift_s32_narrow(d_lo[21], d_hi[21]); + out[11] = add_round_shift_s32_narrow(d_lo[11], d_hi[11]); + + BUTTERFLY_TWO_S32(c, 25, 22, cospi_13_64, cospi_19_64, d, 13, 19); + out[13] = add_round_shift_s32_narrow(d_lo[13], d_hi[13]); + out[19] = add_round_shift_s32_narrow(d_lo[19], d_hi[19]); + + BUTTERFLY_TWO_S32(c, 24, 23, cospi_29_64, cospi_3_64, d, 29, 3); + out[29] = add_round_shift_s32_narrow(d_lo[29], d_hi[29]); + out[3] = add_round_shift_s32_narrow(d_lo[3], d_hi[3]); +} + +static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + + // Stage 1. Done as part of the load for the first pass. 
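+  // Here the input is the transposed first-pass output rather than a fresh
+  // load, so the stage 1 cross is recomputed explicitly below.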
+ a[0] = vaddq_s16(in[0], in[31]); + a[1] = vaddq_s16(in[1], in[30]); + a[2] = vaddq_s16(in[2], in[29]); + a[3] = vaddq_s16(in[3], in[28]); + a[4] = vaddq_s16(in[4], in[27]); + a[5] = vaddq_s16(in[5], in[26]); + a[6] = vaddq_s16(in[6], in[25]); + a[7] = vaddq_s16(in[7], in[24]); + a[8] = vaddq_s16(in[8], in[23]); + a[9] = vaddq_s16(in[9], in[22]); + a[10] = vaddq_s16(in[10], in[21]); + a[11] = vaddq_s16(in[11], in[20]); + a[12] = vaddq_s16(in[12], in[19]); + a[13] = vaddq_s16(in[13], in[18]); + a[14] = vaddq_s16(in[14], in[17]); + a[15] = vaddq_s16(in[15], in[16]); + a[16] = vsubq_s16(in[15], in[16]); + a[17] = vsubq_s16(in[14], in[17]); + a[18] = vsubq_s16(in[13], in[18]); + a[19] = vsubq_s16(in[12], in[19]); + a[20] = vsubq_s16(in[11], in[20]); + a[21] = vsubq_s16(in[10], in[21]); + a[22] = vsubq_s16(in[9], in[22]); + a[23] = vsubq_s16(in[8], in[23]); + a[24] = vsubq_s16(in[7], in[24]); + a[25] = vsubq_s16(in[6], in[25]); + a[26] = vsubq_s16(in[5], in[26]); + a[27] = vsubq_s16(in[4], in[27]); + a[28] = vsubq_s16(in[3], in[28]); + a[29] = vsubq_s16(in[2], in[29]); + a[30] = vsubq_s16(in[1], in[30]); + a[31] = vsubq_s16(in[0], in[31]); + + // Stage 2. + // For the "rd" version, all the values are rounded down after stage 2 to keep + // the values in 16 bits. + b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15])); + b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14])); + b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13])); + b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12])); + b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11])); + b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10])); + b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9])); + b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8])); + + b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8])); + b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9])); + b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10])); + b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11])); + b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12])); + b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13])); + b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14])); + b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15])); + + b[16] = add_round_shift_s16(a[16]); + b[17] = add_round_shift_s16(a[17]); + b[18] = add_round_shift_s16(a[18]); + b[19] = add_round_shift_s16(a[19]); + + butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]); + butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]); + butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]); + butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]); + b[20] = add_round_shift_s16(b[20]); + b[21] = add_round_shift_s16(b[21]); + b[22] = add_round_shift_s16(b[22]); + b[23] = add_round_shift_s16(b[23]); + b[24] = add_round_shift_s16(b[24]); + b[25] = add_round_shift_s16(b[25]); + b[26] = add_round_shift_s16(b[26]); + b[27] = add_round_shift_s16(b[27]); + + b[28] = add_round_shift_s16(a[28]); + b[29] = add_round_shift_s16(a[29]); + b[30] = add_round_shift_s16(a[30]); + b[31] = add_round_shift_s16(a[31]); + + // Stage 3. 
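+  // Thanks to the stage 2 round-down above, the remaining stages keep their
+  // data in int16_t, so the rd path stays on the cheaper 16-bit butterflies.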
+ a[0] = vaddq_s16(b[0], b[7]); + a[1] = vaddq_s16(b[1], b[6]); + a[2] = vaddq_s16(b[2], b[5]); + a[3] = vaddq_s16(b[3], b[4]); + + a[4] = vsubq_s16(b[3], b[4]); + a[5] = vsubq_s16(b[2], b[5]); + a[6] = vsubq_s16(b[1], b[6]); + a[7] = vsubq_s16(b[0], b[7]); + + a[8] = b[8]; + a[9] = b[9]; + + butterfly_one_coeff_s16_s32_narrow(b[13], b[10], cospi_16_64, &a[13], &a[10]); + butterfly_one_coeff_s16_s32_narrow(b[12], b[11], cospi_16_64, &a[12], &a[11]); + + a[14] = b[14]; + a[15] = b[15]; + + a[16] = vaddq_s16(b[16], b[23]); + a[17] = vaddq_s16(b[17], b[22]); + a[18] = vaddq_s16(b[18], b[21]); + a[19] = vaddq_s16(b[19], b[20]); + + a[20] = vsubq_s16(b[19], b[20]); + a[21] = vsubq_s16(b[18], b[21]); + a[22] = vsubq_s16(b[17], b[22]); + a[23] = vsubq_s16(b[16], b[23]); + + a[24] = vsubq_s16(b[31], b[24]); + a[25] = vsubq_s16(b[30], b[25]); + a[26] = vsubq_s16(b[29], b[26]); + a[27] = vsubq_s16(b[28], b[27]); + + a[28] = vaddq_s16(b[28], b[27]); + a[29] = vaddq_s16(b[29], b[26]); + a[30] = vaddq_s16(b[30], b[25]); + a[31] = vaddq_s16(b[31], b[24]); + + // Stage 4. + b[0] = vaddq_s16(a[0], a[3]); + b[1] = vaddq_s16(a[1], a[2]); + b[2] = vsubq_s16(a[1], a[2]); + b[3] = vsubq_s16(a[0], a[3]); + + b[4] = a[4]; + + butterfly_one_coeff_s16_s32_narrow(a[6], a[5], cospi_16_64, &b[6], &b[5]); + + b[7] = a[7]; + + b[8] = vaddq_s16(a[8], a[11]); + b[9] = vaddq_s16(a[9], a[10]); + b[10] = vsubq_s16(a[9], a[10]); + b[11] = vsubq_s16(a[8], a[11]); + b[12] = vsubq_s16(a[15], a[12]); + b[13] = vsubq_s16(a[14], a[13]); + b[14] = vaddq_s16(a[14], a[13]); + b[15] = vaddq_s16(a[15], a[12]); + + b[16] = a[16]; + b[17] = a[17]; + + butterfly_two_coeff(a[29], a[18], cospi_8_64, cospi_24_64, &b[29], &b[18]); + butterfly_two_coeff(a[28], a[19], cospi_8_64, cospi_24_64, &b[28], &b[19]); + butterfly_two_coeff(a[27], a[20], cospi_24_64, -cospi_8_64, &b[27], &b[20]); + butterfly_two_coeff(a[26], a[21], cospi_24_64, -cospi_8_64, &b[26], &b[21]); + + b[22] = a[22]; + b[23] = a[23]; + b[24] = a[24]; + b[25] = a[25]; + + b[30] = a[30]; + b[31] = a[31]; + + // Stage 5. + butterfly_one_coeff_s16_s32_narrow(b[0], b[1], cospi_16_64, &a[0], &a[1]); + butterfly_two_coeff(b[3], b[2], cospi_8_64, cospi_24_64, &a[2], &a[3]); + + a[4] = vaddq_s16(b[4], b[5]); + a[5] = vsubq_s16(b[4], b[5]); + a[6] = vsubq_s16(b[7], b[6]); + a[7] = vaddq_s16(b[7], b[6]); + + a[8] = b[8]; + + butterfly_two_coeff(b[14], b[9], cospi_8_64, cospi_24_64, &a[14], &a[9]); + butterfly_two_coeff(b[13], b[10], cospi_24_64, -cospi_8_64, &a[13], &a[10]); + + a[11] = b[11]; + a[12] = b[12]; + + a[15] = b[15]; + + a[16] = vaddq_s16(b[19], b[16]); + a[17] = vaddq_s16(b[18], b[17]); + a[18] = vsubq_s16(b[17], b[18]); + a[19] = vsubq_s16(b[16], b[19]); + a[20] = vsubq_s16(b[23], b[20]); + a[21] = vsubq_s16(b[22], b[21]); + a[22] = vaddq_s16(b[21], b[22]); + a[23] = vaddq_s16(b[20], b[23]); + a[24] = vaddq_s16(b[27], b[24]); + a[25] = vaddq_s16(b[26], b[25]); + a[26] = vsubq_s16(b[25], b[26]); + a[27] = vsubq_s16(b[24], b[27]); + a[28] = vsubq_s16(b[31], b[28]); + a[29] = vsubq_s16(b[30], b[29]); + a[30] = vaddq_s16(b[29], b[30]); + a[31] = vaddq_s16(b[28], b[31]); + + // Stage 6. 
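+  // Negative coefficients (e.g. -cospi_4_64 below) are passed directly;
+  // butterfly_two_coeff folds the sign into its multiplies, so no separate
+  // negation step is needed.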
+ b[0] = a[0]; + b[1] = a[1]; + b[2] = a[2]; + b[3] = a[3]; + + butterfly_two_coeff(a[7], a[4], cospi_4_64, cospi_28_64, &b[4], &b[7]); + butterfly_two_coeff(a[6], a[5], cospi_20_64, cospi_12_64, &b[5], &b[6]); + + b[8] = vaddq_s16(a[8], a[9]); + b[9] = vsubq_s16(a[8], a[9]); + b[10] = vsubq_s16(a[11], a[10]); + b[11] = vaddq_s16(a[11], a[10]); + b[12] = vaddq_s16(a[12], a[13]); + b[13] = vsubq_s16(a[12], a[13]); + b[14] = vsubq_s16(a[15], a[14]); + b[15] = vaddq_s16(a[15], a[14]); + + b[16] = a[16]; + b[19] = a[19]; + b[20] = a[20]; + b[23] = a[23]; + b[24] = a[24]; + b[27] = a[27]; + b[28] = a[28]; + b[31] = a[31]; + + butterfly_two_coeff(a[30], a[17], cospi_4_64, cospi_28_64, &b[30], &b[17]); + butterfly_two_coeff(a[29], a[18], cospi_28_64, -cospi_4_64, &b[29], &b[18]); + + butterfly_two_coeff(a[26], a[21], cospi_20_64, cospi_12_64, &b[26], &b[21]); + butterfly_two_coeff(a[25], a[22], cospi_12_64, -cospi_20_64, &b[25], &b[22]); + + // Stage 7. + a[0] = b[0]; + a[1] = b[1]; + a[2] = b[2]; + a[3] = b[3]; + a[4] = b[4]; + a[5] = b[5]; + a[6] = b[6]; + a[7] = b[7]; + + butterfly_two_coeff(b[15], b[8], cospi_2_64, cospi_30_64, &a[8], &a[15]); + butterfly_two_coeff(b[14], b[9], cospi_18_64, cospi_14_64, &a[9], &a[14]); + butterfly_two_coeff(b[13], b[10], cospi_10_64, cospi_22_64, &a[10], &a[13]); + butterfly_two_coeff(b[12], b[11], cospi_26_64, cospi_6_64, &a[11], &a[12]); + + a[16] = vaddq_s16(b[16], b[17]); + a[17] = vsubq_s16(b[16], b[17]); + a[18] = vsubq_s16(b[19], b[18]); + a[19] = vaddq_s16(b[19], b[18]); + a[20] = vaddq_s16(b[20], b[21]); + a[21] = vsubq_s16(b[20], b[21]); + a[22] = vsubq_s16(b[23], b[22]); + a[23] = vaddq_s16(b[23], b[22]); + a[24] = vaddq_s16(b[24], b[25]); + a[25] = vsubq_s16(b[24], b[25]); + a[26] = vsubq_s16(b[27], b[26]); + a[27] = vaddq_s16(b[27], b[26]); + a[28] = vaddq_s16(b[28], b[29]); + a[29] = vsubq_s16(b[28], b[29]); + a[30] = vsubq_s16(b[31], b[30]); + a[31] = vaddq_s16(b[31], b[30]); + + // Final stage. 
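+  // No extra rounding here: the rd variant already took its >> 2 (rounding
+  // down) after stage 2, so the results are written out directly.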
+ out[0] = a[0]; + out[16] = a[1]; + out[8] = a[2]; + out[24] = a[3]; + out[4] = a[4]; + out[20] = a[5]; + out[12] = a[6]; + out[28] = a[7]; + out[2] = a[8]; + out[18] = a[9]; + out[10] = a[10]; + out[26] = a[11]; + out[6] = a[12]; + out[22] = a[13]; + out[14] = a[14]; + out[30] = a[15]; + + butterfly_two_coeff(a[31], a[16], cospi_1_64, cospi_31_64, &out[1], &out[31]); + butterfly_two_coeff(a[30], a[17], cospi_17_64, cospi_15_64, &out[17], + &out[15]); + butterfly_two_coeff(a[29], a[18], cospi_9_64, cospi_23_64, &out[9], &out[23]); + butterfly_two_coeff(a[28], a[19], cospi_25_64, cospi_7_64, &out[25], &out[7]); + butterfly_two_coeff(a[27], a[20], cospi_5_64, cospi_27_64, &out[5], &out[27]); + butterfly_two_coeff(a[26], a[21], cospi_21_64, cospi_11_64, &out[21], + &out[11]); + butterfly_two_coeff(a[25], a[22], cospi_13_64, cospi_19_64, &out[13], + &out[19]); + butterfly_two_coeff(a[24], a[23], cospi_29_64, cospi_3_64, &out[29], &out[3]); +} + +#undef PASS_THROUGH +#undef ADD_S16_S32 +#undef SUB_S16_S32 +#undef ADDW_S16_S32 +#undef SUBW_S16_S32 +#undef ADD_S32 +#undef SUB_S32 +#undef BUTTERFLY_ONE_S16_S32 +#undef BUTTERFLY_ONE_S32 +#undef BUTTERFLY_TWO_S32 + +#endif // VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_ diff --git a/vpx_dsp/arm/fdct4x4_neon.c b/vpx_dsp/arm/fdct4x4_neon.c index 11df7292d4..3b9196fae9 100644 --- a/vpx_dsp/arm/fdct4x4_neon.c +++ b/vpx_dsp/arm/fdct4x4_neon.c @@ -18,10 +18,10 @@ #include "vpx_dsp/arm/fdct_neon.h" #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/fdct4x4_neon.h" void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, int stride) { - int i; // input[M * stride] * 16 int16x4_t in[4]; in[0] = vshl_n_s16(vld1_s16(input + 0 * stride), 4); @@ -34,9 +34,8 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1)); in[0] = vadd_s16(in[0], one); } - for (i = 0; i < 2; ++i) { - vpx_fdct4x4_pass1_neon(in); - } + vpx_fdct4x4_pass1_neon(in); + vpx_fdct4x4_pass2_neon(in); { // Not quite a rounding shift. Only add 1 despite shifting by 2. const int16x8_t one = vdupq_n_s16(1); @@ -53,7 +52,6 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, int stride) { - int i; static const int32x4_t const_1000 = { 1, 0, 0, 0 }; const int32x4_t const_one = vdupq_n_s32(1); @@ -69,9 +67,8 @@ void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, in[0] = vaddq_s32(in[0], const_1000); } - for (i = 0; i < 2; ++i) { - vpx_highbd_fdct4x4_pass1_neon(in); - } + vpx_highbd_fdct4x4_pass1_neon(in); + vpx_highbd_fdct4x4_pass1_neon(in); { // Not quite a rounding shift. Only add 1 despite shifting by 2. in[0] = vshrq_n_s32(vaddq_s32(in[0], const_one), 2); diff --git a/vpx_dsp/arm/fdct4x4_neon.h b/vpx_dsp/arm/fdct4x4_neon.h new file mode 100644 index 0000000000..de3db9774c --- /dev/null +++ b/vpx_dsp/arm/fdct4x4_neon.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) {
+  int16x4_t out[4];
+
+  const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
+  const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
+
+  // in_0 +/- in_3, in_1 +/- in_2
+  const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+  const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+  // step_0 +/- step_1, step_2 +/- step_3
+  const int16x4_t s_0 = vget_low_s16(s_01);
+  const int16x4_t s_1 = vget_high_s16(s_01);
+  const int16x4_t s_2 = vget_high_s16(s_32);
+  const int16x4_t s_3 = vget_low_s16(s_32);
+
+  // fdct_round_shift(s_0 +/- s_1) * cospi_16_64
+  butterfly_one_coeff_s16_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]);
+
+  // s_3 * cospi_8_64 + s_2 * cospi_24_64
+  // s_3 * cospi_24_64 - s_2 * cospi_8_64
+  butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]);
+
+  transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]);
+
+  in[0] = out[0];
+  in[1] = out[1];
+  in[2] = out[2];
+  in[3] = out[3];
+}
+
+static INLINE void vpx_fdct4x4_pass2_neon(int16x4_t *in) {
+  int16x4_t out[4];
+
+  const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
+  const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
+
+  // in_0 +/- in_3, in_1 +/- in_2
+  const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+  const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+  // step_0 +/- step_1, step_2 +/- step_3
+  const int16x4_t s_0 = vget_low_s16(s_01);
+  const int16x4_t s_1 = vget_high_s16(s_01);
+  const int16x4_t s_2 = vget_high_s16(s_32);
+  const int16x4_t s_3 = vget_low_s16(s_32);
+
+  // fdct_round_shift(s_0 +/- s_1) * cospi_16_64
+  butterfly_one_coeff_s16_s32_fast_narrow_half(s_0, s_1, cospi_16_64, &out[0],
+                                               &out[2]);
+
+  // s_3 * cospi_8_64 + s_2 * cospi_24_64
+  // s_3 * cospi_24_64 - s_2 * cospi_8_64
+  butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]);
+
+  transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]);
+
+  in[0] = out[0];
+  in[1] = out[1];
+  in[2] = out[2];
+  in[3] = out[3];
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) {
+  int32x4_t out[4];
+  // in_0 +/- in_3, in_1 +/- in_2
+  const int32x4_t s_0 = vaddq_s32(in[0], in[3]);
+  const int32x4_t s_1 = vaddq_s32(in[1], in[2]);
+  const int32x4_t s_2 = vsubq_s32(in[1], in[2]);
+  const int32x4_t s_3 = vsubq_s32(in[0], in[3]);
+
+  butterfly_one_coeff_s32_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]);
+
+  // out[1] = s_3 * cospi_8_64 + s_2 * cospi_24_64
+  // out[3] = s_3 * cospi_24_64 - s_2 * cospi_8_64
+  butterfly_two_coeff_s32_s64_narrow_half(s_3, s_2, cospi_8_64, cospi_24_64,
+                                          &out[1], &out[3]);
+
+  transpose_s32_4x4(&out[0], &out[1], &out[2], &out[3]);
+
+  in[0] = out[0];
+  in[1] = out[1];
+  in[2] = out[2];
+  in[3] = out[3];
+}
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
diff --git a/vpx_dsp/arm/fdct8x8_neon.c b/vpx_dsp/arm/fdct8x8_neon.c
index 3fb15cc175..75ee6f2230 100644
--- a/vpx_dsp/arm/fdct8x8_neon.c
+++ b/vpx_dsp/arm/fdct8x8_neon.c
@@ -17,10 +17,10 @@
 #include "vpx_dsp/arm/idct_neon.h"
 #include "vpx_dsp/arm/fdct_neon.h"
 #include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/fdct8x8_neon.h"
 
 void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
                       int stride) {
-  int i;
   // stage 1
   int16x8_t in[8];
   in[0] = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
@@ -31,9 +31,9 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t
*final_output, in[5] = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2); in[6] = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2); in[7] = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2); - for (i = 0; i < 2; ++i) { - vpx_fdct8x8_pass1_neon(in); - } // for + + vpx_fdct8x8_pass1_neon(in); + vpx_fdct8x8_pass2_neon(in); { // from vpx_dct_sse2.c // Post-condition (division by two) @@ -71,8 +71,6 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, int stride) { - int i; - // input[M * stride] * 16 int32x4_t left[8], right[8]; int16x8_t in[8]; @@ -102,26 +100,25 @@ void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, right[6] = vshll_n_s16(vget_high_s16(in[6]), 2); right[7] = vshll_n_s16(vget_high_s16(in[7]), 2); - for (i = 0; i < 2; ++i) { - vpx_highbd_fdct8x8_pass1_neon(left, right); - } + vpx_highbd_fdct8x8_pass1_neon(left, right); + vpx_highbd_fdct8x8_pass2_neon(left, right); { - left[0] = highbd_add_round_shift_s32(left[0]); - left[1] = highbd_add_round_shift_s32(left[1]); - left[2] = highbd_add_round_shift_s32(left[2]); - left[3] = highbd_add_round_shift_s32(left[3]); - left[4] = highbd_add_round_shift_s32(left[4]); - left[5] = highbd_add_round_shift_s32(left[5]); - left[6] = highbd_add_round_shift_s32(left[6]); - left[7] = highbd_add_round_shift_s32(left[7]); - right[0] = highbd_add_round_shift_s32(right[0]); - right[1] = highbd_add_round_shift_s32(right[1]); - right[2] = highbd_add_round_shift_s32(right[2]); - right[3] = highbd_add_round_shift_s32(right[3]); - right[4] = highbd_add_round_shift_s32(right[4]); - right[5] = highbd_add_round_shift_s32(right[5]); - right[6] = highbd_add_round_shift_s32(right[6]); - right[7] = highbd_add_round_shift_s32(right[7]); + left[0] = add_round_shift_half_s32(left[0]); + left[1] = add_round_shift_half_s32(left[1]); + left[2] = add_round_shift_half_s32(left[2]); + left[3] = add_round_shift_half_s32(left[3]); + left[4] = add_round_shift_half_s32(left[4]); + left[5] = add_round_shift_half_s32(left[5]); + left[6] = add_round_shift_half_s32(left[6]); + left[7] = add_round_shift_half_s32(left[7]); + right[0] = add_round_shift_half_s32(right[0]); + right[1] = add_round_shift_half_s32(right[1]); + right[2] = add_round_shift_half_s32(right[2]); + right[3] = add_round_shift_half_s32(right[3]); + right[4] = add_round_shift_half_s32(right[4]); + right[5] = add_round_shift_half_s32(right[5]); + right[6] = add_round_shift_half_s32(right[6]); + right[7] = add_round_shift_half_s32(right[7]); // store results vst1q_s32(final_output, left[0]); diff --git a/vpx_dsp/arm/fdct8x8_neon.h b/vpx_dsp/arm/fdct8x8_neon.h new file mode 100644 index 0000000000..d8fa600448 --- /dev/null +++ b/vpx_dsp/arm/fdct8x8_neon.h @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in,
+                                                      int16x8_t *out) {
+  int16x8_t s[8], x[4], t[2];
+
+  s[0] = vaddq_s16(in[0], in[7]);
+  s[1] = vaddq_s16(in[1], in[6]);
+  s[2] = vaddq_s16(in[2], in[5]);
+  s[3] = vaddq_s16(in[3], in[4]);
+  s[4] = vsubq_s16(in[3], in[4]);
+  s[5] = vsubq_s16(in[2], in[5]);
+  s[6] = vsubq_s16(in[1], in[6]);
+  s[7] = vsubq_s16(in[0], in[7]);
+  // fdct4(step, step);
+  x[0] = vaddq_s16(s[0], s[3]);
+  x[1] = vaddq_s16(s[1], s[2]);
+  x[2] = vsubq_s16(s[1], s[2]);
+  x[3] = vsubq_s16(s[0], s[3]);
+
+  // fdct4(step, step);
+  // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+  // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+  butterfly_one_coeff_s16_fast(x[0], x[1], cospi_16_64, &out[0], &out[4]);
+  // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+  // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+  butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]);
+
+  // Stage 2
+  // t0 = (s6 - s5) * cospi_16_64;
+  // t1 = (s6 + s5) * cospi_16_64;
+  butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &t[1], &t[0]);
+
+  // Stage 3
+  x[0] = vaddq_s16(s[4], t[0]);
+  x[1] = vsubq_s16(s[4], t[0]);
+  x[2] = vsubq_s16(s[7], t[1]);
+  x[3] = vaddq_s16(s[7], t[1]);
+
+  // Stage 4
+  // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+  // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+  butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]);
+
+  // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+  // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+  butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]);
+}
+
+static INLINE void vpx_fdct8x8_pass2_notranspose_neon(int16x8_t *in,
+                                                      int16x8_t *out) {
+  int16x8_t s[8], x[4], t[2];
+
+  s[0] = vaddq_s16(in[0], in[7]);
+  s[1] = vaddq_s16(in[1], in[6]);
+  s[2] = vaddq_s16(in[2], in[5]);
+  s[3] = vaddq_s16(in[3], in[4]);
+  s[4] = vsubq_s16(in[3], in[4]);
+  s[5] = vsubq_s16(in[2], in[5]);
+  s[6] = vsubq_s16(in[1], in[6]);
+  s[7] = vsubq_s16(in[0], in[7]);
+  // fdct4(step, step);
+  x[0] = vaddq_s16(s[0], s[3]);
+  x[1] = vaddq_s16(s[1], s[2]);
+  x[2] = vsubq_s16(s[1], s[2]);
+  x[3] = vsubq_s16(s[0], s[3]);
+
+  // fdct4(step, step);
+  // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+  // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+  butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0],
+                                          &out[4]);
+  // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+  // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+  butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]);
+
+  // Stage 2
+  // t0 = (s6 - s5) * cospi_16_64;
+  // t1 = (s6 + s5) * cospi_16_64;
+  butterfly_one_coeff_s16_s32_fast_narrow(s[6], s[5], cospi_16_64, &t[1],
+                                          &t[0]);
+
+  // Stage 3
+  x[0] = vaddq_s16(s[4], t[0]);
+  x[1] = vsubq_s16(s[4], t[0]);
+  x[2] = vsubq_s16(s[7], t[1]);
+  x[3] = vaddq_s16(s[7], t[1]);
+
+  // Stage 4
+  // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+  // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+  butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]);
+
+  // out[5] =
(tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]); +} + +static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) { + int16x8_t out[8]; + vpx_fdct8x8_pass1_notranspose_neon(in, out); + // transpose 8x8 + transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; + in[4] = out[4]; + in[5] = out[5]; + in[6] = out[6]; + in[7] = out[7]; +} + +static INLINE void vpx_fdct8x8_pass2_neon(int16x8_t *in) { + int16x8_t out[8]; + vpx_fdct8x8_pass2_notranspose_neon(in, out); + // transpose 8x8 + transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; + in[4] = out[4]; + in[5] = out[5]; + in[6] = out[6]; + in[7] = out[7]; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t *left, + int32x4_t *right) { + int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4]; + + sl[0] = vaddq_s32(left[0], left[7]); + sl[1] = vaddq_s32(left[1], left[6]); + sl[2] = vaddq_s32(left[2], left[5]); + sl[3] = vaddq_s32(left[3], left[4]); + sl[4] = vsubq_s32(left[3], left[4]); + sl[5] = vsubq_s32(left[2], left[5]); + sl[6] = vsubq_s32(left[1], left[6]); + sl[7] = vsubq_s32(left[0], left[7]); + sr[0] = vaddq_s32(right[0], right[7]); + sr[1] = vaddq_s32(right[1], right[6]); + sr[2] = vaddq_s32(right[2], right[5]); + sr[3] = vaddq_s32(right[3], right[4]); + sr[4] = vsubq_s32(right[3], right[4]); + sr[5] = vsubq_s32(right[2], right[5]); + sr[6] = vsubq_s32(right[1], right[6]); + sr[7] = vsubq_s32(right[0], right[7]); + + // fdct4(step, step); + // x0 = s0 + s3; + xl[0] = vaddq_s32(sl[0], sl[3]); + xr[0] = vaddq_s32(sr[0], sr[3]); + // x1 = s1 + s2; + xl[1] = vaddq_s32(sl[1], sl[2]); + xr[1] = vaddq_s32(sr[1], sr[2]); + // x2 = s1 - s2; + xl[2] = vsubq_s32(sl[1], sl[2]); + xr[2] = vsubq_s32(sr[1], sr[2]); + // x3 = s0 - s3; + xl[3] = vsubq_s32(sl[0], sl[3]); + xr[3] = vsubq_s32(sr[0], sr[3]); + + // fdct4(step, step); + // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64) + // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64, + &left[0], &right[0], &left[4], &right[4]); + // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64) + // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64) + butterfly_two_coeff_s32(xl[3], xr[3], xl[2], xr[2], cospi_8_64, cospi_24_64, + &left[2], &right[2], &left[6], &right[6]); + + // Stage 2 + // t0 = (s6 - s5) * cospi_16_64; + // t1 = (s6 + s5) * cospi_16_64; + butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1], + &tr[1], &tl[0], &tr[0]); + + // Stage 3 + xl[0] = vaddq_s32(sl[4], tl[0]); + xr[0] = vaddq_s32(sr[4], tr[0]); + xl[1] = vsubq_s32(sl[4], tl[0]); + xr[1] = vsubq_s32(sr[4], tr[0]); + xl[2] = vsubq_s32(sl[7], tl[1]); + xr[2] = vsubq_s32(sr[7], tr[1]); + xl[3] = vaddq_s32(sl[7], tl[1]); + xr[3] = vaddq_s32(sr[7], tr[1]); + + // Stage 4 + // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff_s32(xl[3], xr[3], xl[0], xr[0], cospi_4_64, cospi_28_64, + &left[1], &right[1], &left[7], 
&right[7]); + + // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff_s32(xl[2], xr[2], xl[1], xr[1], cospi_20_64, cospi_12_64, + &left[5], &right[5], &left[3], &right[3]); +} + +static INLINE void vpx_highbd_fdct8x8_pass2_notranspose_neon(int32x4_t *left, + int32x4_t *right) { + int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4]; + + sl[0] = vaddq_s32(left[0], left[7]); + sl[1] = vaddq_s32(left[1], left[6]); + sl[2] = vaddq_s32(left[2], left[5]); + sl[3] = vaddq_s32(left[3], left[4]); + sl[4] = vsubq_s32(left[3], left[4]); + sl[5] = vsubq_s32(left[2], left[5]); + sl[6] = vsubq_s32(left[1], left[6]); + sl[7] = vsubq_s32(left[0], left[7]); + sr[0] = vaddq_s32(right[0], right[7]); + sr[1] = vaddq_s32(right[1], right[6]); + sr[2] = vaddq_s32(right[2], right[5]); + sr[3] = vaddq_s32(right[3], right[4]); + sr[4] = vsubq_s32(right[3], right[4]); + sr[5] = vsubq_s32(right[2], right[5]); + sr[6] = vsubq_s32(right[1], right[6]); + sr[7] = vsubq_s32(right[0], right[7]); + + // fdct4(step, step); + // x0 = s0 + s3; + xl[0] = vaddq_s32(sl[0], sl[3]); + xr[0] = vaddq_s32(sr[0], sr[3]); + // x1 = s1 + s2; + xl[1] = vaddq_s32(sl[1], sl[2]); + xr[1] = vaddq_s32(sr[1], sr[2]); + // x2 = s1 - s2; + xl[2] = vsubq_s32(sl[1], sl[2]); + xr[2] = vsubq_s32(sr[1], sr[2]); + // x3 = s0 - s3; + xl[3] = vsubq_s32(sl[0], sl[3]); + xr[3] = vsubq_s32(sr[0], sr[3]); + + // fdct4(step, step); + // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64) + // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64, + &left[0], &right[0], &left[4], &right[4]); + // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64) + // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64) + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64, + cospi_24_64, &left[2], &right[2], &left[6], + &right[6]); + + // Stage 2 + // t0 = (s6 - s5) * cospi_16_64; + // t1 = (s6 + s5) * cospi_16_64; + butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1], + &tr[1], &tl[0], &tr[0]); + + // Stage 3 + xl[0] = vaddq_s32(sl[4], tl[0]); + xr[0] = vaddq_s32(sr[4], tr[0]); + xl[1] = vsubq_s32(sl[4], tl[0]); + xr[1] = vsubq_s32(sr[4], tr[0]); + xl[2] = vsubq_s32(sl[7], tl[1]); + xr[2] = vsubq_s32(sr[7], tr[1]); + xl[3] = vaddq_s32(sl[7], tl[1]); + xr[3] = vaddq_s32(sr[7], tr[1]); + + // Stage 4 + // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64, + cospi_28_64, &left[1], &right[1], &left[7], + &right[7]); + + // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64, + cospi_12_64, &left[5], &right[5], &left[3], + &right[3]); +} + +static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left, + int32x4_t *right) { + int32x4x2_t out[8]; + vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right); + + out[0].val[0] = left[0]; + out[0].val[1] = right[0]; + out[1].val[0] = left[1]; + out[1].val[1] = right[1]; + out[2].val[0] = left[2]; + out[2].val[1] = right[2]; + out[3].val[0] = left[3]; + out[3].val[1] = right[3]; 
+ out[4].val[0] = left[4]; + out[4].val[1] = right[4]; + out[5].val[0] = left[5]; + out[5].val[1] = right[5]; + out[6].val[0] = left[6]; + out[6].val[1] = right[6]; + out[7].val[0] = left[7]; + out[7].val[1] = right[7]; + + transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); + + left[0] = out[0].val[0]; + right[0] = out[0].val[1]; + left[1] = out[1].val[0]; + right[1] = out[1].val[1]; + left[2] = out[2].val[0]; + right[2] = out[2].val[1]; + left[3] = out[3].val[0]; + right[3] = out[3].val[1]; + left[4] = out[4].val[0]; + right[4] = out[4].val[1]; + left[5] = out[5].val[0]; + right[5] = out[5].val[1]; + left[6] = out[6].val[0]; + right[6] = out[6].val[1]; + left[7] = out[7].val[0]; + right[7] = out[7].val[1]; +} + +static INLINE void vpx_highbd_fdct8x8_pass2_neon(int32x4_t *left, + int32x4_t *right) { + int32x4x2_t out[8]; + vpx_highbd_fdct8x8_pass2_notranspose_neon(left, right); + + out[0].val[0] = left[0]; + out[0].val[1] = right[0]; + out[1].val[0] = left[1]; + out[1].val[1] = right[1]; + out[2].val[0] = left[2]; + out[2].val[1] = right[2]; + out[3].val[0] = left[3]; + out[3].val[1] = right[3]; + out[4].val[0] = left[4]; + out[4].val[1] = right[4]; + out[5].val[0] = left[5]; + out[5].val[1] = right[5]; + out[6].val[0] = left[6]; + out[6].val[1] = right[6]; + out[7].val[0] = left[7]; + out[7].val[1] = right[7]; + + transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); + + left[0] = out[0].val[0]; + right[0] = out[0].val[1]; + left[1] = out[1].val[0]; + right[1] = out[1].val[1]; + left[2] = out[2].val[0]; + right[2] = out[2].val[1]; + left[3] = out[3].val[0]; + right[3] = out[3].val[1]; + left[4] = out[4].val[0]; + right[4] = out[4].val[1]; + left[5] = out[5].val[0]; + right[5] = out[5].val[1]; + left[6] = out[6].val[0]; + right[6] = out[6].val[1]; + left[7] = out[7].val[0]; + right[7] = out[7].val[1]; +} + +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_ diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h index ce669061d2..1ea948b3f7 100644 --- a/vpx_dsp/arm/fdct_neon.h +++ b/vpx_dsp/arm/fdct_neon.h @@ -14,56 +14,94 @@ #include // fdct_round_shift((a +/- b) * c) -static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b, - const tran_high_t constant, - int16x8_t *add, int16x8_t *sub) { - const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant); - const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant); - const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant); - const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant); - const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant); - const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant); - const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS); - const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS); - const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS); - const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS); - *add = vcombine_s16(rounded0, rounded1); - *sub = vcombine_s16(rounded2, rounded3); +// Variant that performs fast vqrdmulh_s16 operation on half vector +// can be slightly less accurate, adequate for pass1 +static INLINE void butterfly_one_coeff_s16_fast_half(const int16x4_t a, + const int16x4_t b, + const tran_coef_t constant, + int16x4_t *add, + int16x4_t *sub) { + int16x4_t c = vdup_n_s16(2 * constant); + *add = vqrdmulh_s16(vadd_s16(a, b), c); + *sub = vqrdmulh_s16(vsub_s16(a, b), c); } -// 
fdct_round_shift(a * c0 +/- b * c1)
-static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
-                                       const tran_coef_t constant0,
-                                       const tran_coef_t constant1,
-                                       int16x8_t *add, int16x8_t *sub) {
-  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant0);
-  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant0);
-  const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), constant1);
-  const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), constant1);
-  const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), constant0);
-  const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), constant0);
-  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant1);
-  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant1);
-  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
-  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
-  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
-  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
-  *add = vcombine_s16(rounded0, rounded1);
-  *sub = vcombine_s16(rounded2, rounded3);
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulh_s16 operation on full vector
+// can be slightly less accurate, adequate for pass1
+static INLINE void butterfly_one_coeff_s16_fast(const int16x8_t a,
+                                                const int16x8_t b,
+                                                const tran_coef_t constant,
+                                                int16x8_t *add,
+                                                int16x8_t *sub) {
+  int16x8_t c = vdupq_n_s16(2 * constant);
+  *add = vqrdmulhq_s16(vaddq_s16(a, b), c);
+  *sub = vqrdmulhq_s16(vsubq_s16(a, b), c);
 }

-// Add 2 if positive, 1 if negative, and shift by 2.
-// In practice, subtract the sign bit, then shift with rounding.
-static INLINE int16x8_t sub_round_shift(const int16x8_t a) {
-  const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
-  const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
-  const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
-  return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2);
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns full 32-bit values, high/low
+static INLINE void butterfly_one_coeff_s16_s32_fast(
+    const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+    int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
+    int32x4_t *sub_hi) {
+  int32x4_t c = vdupq_n_s32(constant << 17);
+  const int16x4_t a_lo = vget_low_s16(a);
+  const int16x4_t a_hi = vget_high_s16(a);
+  const int16x4_t b_lo = vget_low_s16(b);
+  const int16x4_t b_hi = vget_high_s16(b);
+  *add_lo = vqrdmulhq_s32(vaddl_s16(a_lo, b_lo), c);
+  *add_hi = vqrdmulhq_s32(vaddl_s16(a_hi, b_hi), c);
+  *sub_lo = vqrdmulhq_s32(vsubl_s16(a_lo, b_lo), c);
+  *sub_hi = vqrdmulhq_s32(vsubl_s16(a_hi, b_hi), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns narrowed down 16-bit values
+static INLINE void butterfly_one_coeff_s16_s32_fast_narrow(
+    const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+    int16x8_t *add, int16x8_t *sub) {
+  int32x4_t add_lo, add_hi, sub_lo, sub_hi;
+  butterfly_one_coeff_s16_s32_fast(a, b, constant, &add_lo, &add_hi, &sub_lo,
+                                   &sub_hi);
+  *add = vcombine_s16(vmovn_s32(add_lo), vmovn_s32(add_hi));
+  *sub = vcombine_s16(vmovn_s32(sub_lo), vmovn_s32(sub_hi));
 }

-// Like butterfly_one_coeff, but don't narrow results.
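+// NOTE: the "fast" variants above and below rely on the identity
+// vqrdmulh(x, c') == (2 * x * c' + (1 << (esize - 1))) >> esize, so with
+// c' == 2 * c (s16, esize 16) or c' == c << 17 (s32, esize 32) they return
+// exactly fdct_round_shift(x * c) == (x * c + (1 << 13)) >> 14, as long as
+// no saturation occurs (DCT_CONST_BITS == 14).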
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on half vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns full 32-bit values
+static INLINE void butterfly_one_coeff_s16_s32_fast_half(
+    const int16x4_t a, const int16x4_t b, const tran_coef_t constant,
+    int32x4_t *add, int32x4_t *sub) {
+  int32x4_t c = vdupq_n_s32(constant << 17);
+  *add = vqrdmulhq_s32(vaddl_s16(a, b), c);
+  *sub = vqrdmulhq_s32(vsubl_s16(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on half vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns narrowed down 16-bit values
+static INLINE void butterfly_one_coeff_s16_s32_fast_narrow_half(
+    const int16x4_t a, const int16x4_t b, const tran_coef_t constant,
+    int16x4_t *add, int16x4_t *sub) {
+  int32x4_t add32, sub32;
+  butterfly_one_coeff_s16_s32_fast_half(a, b, constant, &add32, &sub32);
+  *add = vmovn_s32(add32);
+  *sub = vmovn_s32(sub32);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Original Variant that performs normal implementation on full vector
+// fully accurate does 32-bit processing, takes 16-bit values
 static INLINE void butterfly_one_coeff_s16_s32(
-    const int16x8_t a, const int16x8_t b, const tran_high_t constant,
+    const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
     int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
     int32x4_t *sub_hi) {
   const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
@@ -78,37 +116,182 @@ static INLINE void butterfly_one_coeff_s16_s32(
   *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
 }

-// Like butterfly_one_coeff, but with s32.
-static INLINE void butterfly_one_coeff_s32(
+// fdct_round_shift((a +/- b) * c)
+// Original Variant that performs normal implementation on full vector
+// fully accurate does 32-bit processing, takes 16-bit values
+// returns narrowed down 16-bit values
+static INLINE void butterfly_one_coeff_s16_s32_narrow(
+    const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+    int16x8_t *add, int16x8_t *sub) {
+  int32x4_t add32_lo, add32_hi, sub32_lo, sub32_hi;
+  butterfly_one_coeff_s16_s32(a, b, constant, &add32_lo, &add32_hi, &sub32_lo,
+                              &sub32_hi);
+  *add = vcombine_s16(vmovn_s32(add32_lo), vmovn_s32(add32_hi));
+  *sub = vcombine_s16(vmovn_s32(sub32_lo), vmovn_s32(sub32_hi));
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on half vector
+// more accurate does 32-bit processing, takes and returns 32-bit values
+static INLINE void butterfly_one_coeff_s32_fast_half(const int32x4_t a,
+                                                     const int32x4_t b,
+                                                     const tran_coef_t constant,
+                                                     int32x4_t *add,
+                                                     int32x4_t *sub) {
+  const int32x4_t c = vdupq_n_s32(constant << 17);
+  *add = vqrdmulhq_s32(vaddq_s32(a, b), c);
+  *sub = vqrdmulhq_s32(vsubq_s32(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes and returns 32-bit values,
+// high/low
+static INLINE void butterfly_one_coeff_s32_fast(
     const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
-    const int32x4_t b_hi, const int32_t constant, int32x4_t *add_lo,
+    const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo,
     int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
-  const int32x4_t a_lo_0 = vmulq_n_s32(a_lo, constant);
-  const int32x4_t a_hi_0 = vmulq_n_s32(a_hi, constant);
-  const int32x4_t sum0
= vmlaq_n_s32(a_lo_0, b_lo, constant); - const int32x4_t sum1 = vmlaq_n_s32(a_hi_0, b_hi, constant); - const int32x4_t diff0 = vmlsq_n_s32(a_lo_0, b_lo, constant); - const int32x4_t diff1 = vmlsq_n_s32(a_hi_0, b_hi, constant); - *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); - *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); - *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); - *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); + const int32x4_t c = vdupq_n_s32(constant << 17); + *add_lo = vqrdmulhq_s32(vaddq_s32(a_lo, b_lo), c); + *add_hi = vqrdmulhq_s32(vaddq_s32(a_hi, b_hi), c); + *sub_lo = vqrdmulhq_s32(vsubq_s32(a_lo, b_lo), c); + *sub_hi = vqrdmulhq_s32(vsubq_s32(a_hi, b_hi), c); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Variant that performs normal implementation on half vector +// more accurate does 64-bit processing, takes and returns 32-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff_s32_s64_narrow_half( + const int32x4_t a, const int32x4_t b, const tran_coef_t constant1, + const tran_coef_t constant2, int32x4_t *add, int32x4_t *sub) { + const int32x2_t a_lo = vget_low_s32(a); + const int32x2_t a_hi = vget_high_s32(a); + const int32x2_t b_lo = vget_low_s32(b); + const int32x2_t b_hi = vget_high_s32(b); + + const int64x2_t axc0_64_lo = vmull_n_s32(a_lo, constant1); + const int64x2_t axc0_64_hi = vmull_n_s32(a_hi, constant1); + const int64x2_t axc1_64_lo = vmull_n_s32(a_lo, constant2); + const int64x2_t axc1_64_hi = vmull_n_s32(a_hi, constant2); + + const int64x2_t sum_lo = vmlal_n_s32(axc0_64_lo, b_lo, constant2); + const int64x2_t sum_hi = vmlal_n_s32(axc0_64_hi, b_hi, constant2); + const int64x2_t diff_lo = vmlsl_n_s32(axc1_64_lo, b_lo, constant1); + const int64x2_t diff_hi = vmlsl_n_s32(axc1_64_hi, b_hi, constant1); + + *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS), + vrshrn_n_s64(sum_hi, DCT_CONST_BITS)); + *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS), + vrshrn_n_s64(diff_hi, DCT_CONST_BITS)); } -// Like butterfly_two_coeff, but with s32. 
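+// NOTE: the _s64_narrow two-coefficient butterflies below widen to 64 bits
+// with vmull_n_s32/vmlal_n_s32 because a 32-bit highbd intermediate times a
+// 14-bit cospi constant can overflow 32 bits; the products are then rounded
+// and narrowed back to 32 bits with vrshrn_n_s64(..., DCT_CONST_BITS).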
+// fdct_round_shift(a * c1 +/- b * c2) +// Variant that performs normal implementation on full vector +// more accurate does 64-bit processing, takes and returns 32-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff_s32_s64_narrow( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant1, + const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi, + int32x4_t *sub_lo, int32x4_t *sub_hi) { + // ac1/ac2 hold the following values: + // ac1: vget_low_s32(a_lo) * c1, vget_high_s32(a_lo) * c1, + // vget_low_s32(a_hi) * c1, vget_high_s32(a_hi) * c1 + // ac2: vget_low_s32(a_lo) * c2, vget_high_s32(a_lo) * c2, + // vget_low_s32(a_hi) * c2, vget_high_s32(a_hi) * c2 + int64x2_t ac1[4]; + int64x2_t ac2[4]; + int64x2_t sum[4]; + int64x2_t diff[4]; + + ac1[0] = vmull_n_s32(vget_low_s32(a_lo), constant1); + ac1[1] = vmull_n_s32(vget_high_s32(a_lo), constant1); + ac1[2] = vmull_n_s32(vget_low_s32(a_hi), constant1); + ac1[3] = vmull_n_s32(vget_high_s32(a_hi), constant1); + ac2[0] = vmull_n_s32(vget_low_s32(a_lo), constant2); + ac2[1] = vmull_n_s32(vget_high_s32(a_lo), constant2); + ac2[2] = vmull_n_s32(vget_low_s32(a_hi), constant2); + ac2[3] = vmull_n_s32(vget_high_s32(a_hi), constant2); + + sum[0] = vmlal_n_s32(ac1[0], vget_low_s32(b_lo), constant2); + sum[1] = vmlal_n_s32(ac1[1], vget_high_s32(b_lo), constant2); + sum[2] = vmlal_n_s32(ac1[2], vget_low_s32(b_hi), constant2); + sum[3] = vmlal_n_s32(ac1[3], vget_high_s32(b_hi), constant2); + *add_lo = vcombine_s32(vrshrn_n_s64(sum[0], DCT_CONST_BITS), + vrshrn_n_s64(sum[1], DCT_CONST_BITS)); + *add_hi = vcombine_s32(vrshrn_n_s64(sum[2], DCT_CONST_BITS), + vrshrn_n_s64(sum[3], DCT_CONST_BITS)); + + diff[0] = vmlsl_n_s32(ac2[0], vget_low_s32(b_lo), constant1); + diff[1] = vmlsl_n_s32(ac2[1], vget_high_s32(b_lo), constant1); + diff[2] = vmlsl_n_s32(ac2[2], vget_low_s32(b_hi), constant1); + diff[3] = vmlsl_n_s32(ac2[3], vget_high_s32(b_hi), constant1); + *sub_lo = vcombine_s32(vrshrn_n_s64(diff[0], DCT_CONST_BITS), + vrshrn_n_s64(diff[1], DCT_CONST_BITS)); + *sub_hi = vcombine_s32(vrshrn_n_s64(diff[2], DCT_CONST_BITS), + vrshrn_n_s64(diff[3], DCT_CONST_BITS)); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Variant that performs normal implementation on half vector +// more accurate does 32-bit processing, takes and returns 16-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff_half(const int16x4_t a, + const int16x4_t b, + const tran_coef_t constant1, + const tran_coef_t constant2, + int16x4_t *add, int16x4_t *sub) { + const int32x4_t a1 = vmull_n_s16(a, constant1); + const int32x4_t a2 = vmull_n_s16(a, constant2); + const int32x4_t sum = vmlal_n_s16(a1, b, constant2); + const int32x4_t diff = vmlsl_n_s16(a2, b, constant1); + *add = vqrshrn_n_s32(sum, DCT_CONST_BITS); + *sub = vqrshrn_n_s32(diff, DCT_CONST_BITS); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Original Variant that performs normal implementation on full vector +// more accurate does 32-bit processing, takes and returns 16-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b, + const tran_coef_t constant1, + const tran_coef_t constant2, + int16x8_t *add, int16x8_t *sub) { + const int32x4_t a1 = vmull_n_s16(vget_low_s16(a), constant1); + const int32x4_t a2 = vmull_n_s16(vget_high_s16(a), constant1); + const int32x4_t a3 = vmull_n_s16(vget_low_s16(a), constant2); + const int32x4_t a4 = 
vmull_n_s16(vget_high_s16(a), constant2);
+  const int32x4_t sum0 = vmlal_n_s16(a1, vget_low_s16(b), constant2);
+  const int32x4_t sum1 = vmlal_n_s16(a2, vget_high_s16(b), constant2);
+  const int32x4_t diff0 = vmlsl_n_s16(a3, vget_low_s16(b), constant1);
+  const int32x4_t diff1 = vmlsl_n_s16(a4, vget_high_s16(b), constant1);
+  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
+  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
+  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
+  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
+  *add = vcombine_s16(rounded0, rounded1);
+  *sub = vcombine_s16(rounded2, rounded3);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Original Variant that performs normal implementation on full vector
+// more accurate does 32-bit processing, takes and returns 32-bit values,
+// high/low
 static INLINE void butterfly_two_coeff_s32(
     const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
-    const int32x4_t b_hi, const int32_t constant0, const int32_t constant1,
-    int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
-    int32x4_t *sub_hi) {
-  const int32x4_t a0 = vmulq_n_s32(a_lo, constant0);
-  const int32x4_t a1 = vmulq_n_s32(a_hi, constant0);
-  const int32x4_t a2 = vmulq_n_s32(a_lo, constant1);
-  const int32x4_t a3 = vmulq_n_s32(a_hi, constant1);
-  const int32x4_t sum0 = vmlaq_n_s32(a2, b_lo, constant0);
-  const int32x4_t sum1 = vmlaq_n_s32(a3, b_hi, constant0);
-  const int32x4_t diff0 = vmlsq_n_s32(a0, b_lo, constant1);
-  const int32x4_t diff1 = vmlsq_n_s32(a1, b_hi, constant1);
+    const int32x4_t b_hi, const tran_coef_t constant1,
+    const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+    int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  const int32x4_t a1 = vmulq_n_s32(a_lo, constant1);
+  const int32x4_t a2 = vmulq_n_s32(a_hi, constant1);
+  const int32x4_t a3 = vmulq_n_s32(a_lo, constant2);
+  const int32x4_t a4 = vmulq_n_s32(a_hi, constant2);
+  const int32x4_t sum0 = vmlaq_n_s32(a1, b_lo, constant2);
+  const int32x4_t sum1 = vmlaq_n_s32(a2, b_hi, constant2);
+  const int32x4_t diff0 = vmlsq_n_s32(a3, b_lo, constant1);
+  const int32x4_t diff1 = vmlsq_n_s32(a4, b_hi, constant1);
   *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
   *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
   *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
@@ -126,9 +309,10 @@ static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) {
 }

 // Add 1 if positive, 2 if negative, and shift by 2.
-// In practice, add 1, then add the sign bit, then shift without rounding.
-static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo, - const int32x4_t a_hi) { +// In practice, add 1, then add the sign bit, then shift and round, +// return narrowed results +static INLINE int16x8_t add_round_shift_s32_narrow(const int32x4_t a_lo, + const int32x4_t a_hi) { const int32x4_t one = vdupq_n_s32(1); const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo); const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31); @@ -143,419 +327,32 @@ static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo, return vcombine_s16(b_lo, b_hi); } -static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) { - const int16x8_t input_01 = vcombine_s16(in[0], in[1]); - const int16x8_t input_32 = vcombine_s16(in[3], in[2]); - - // in_0 +/- in_3, in_1 +/- in_2 - const int16x8_t s_01 = vaddq_s16(input_01, input_32); - const int16x8_t s_32 = vsubq_s16(input_01, input_32); - - // step_0 +/- step_1, step_2 +/- step_3 - const int16x4_t s_0 = vget_low_s16(s_01); - const int16x4_t s_1 = vget_high_s16(s_01); - const int16x4_t s_2 = vget_high_s16(s_32); - const int16x4_t s_3 = vget_low_s16(s_32); - - // (s_0 +/- s_1) * cospi_16_64 - // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c. - const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1); - const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1); - const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64); - const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64); - - // fdct_round_shift - int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS); - int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS); - - // s_3 * cospi_8_64 + s_2 * cospi_24_64 - // s_3 * cospi_24_64 - s_2 * cospi_8_64 - const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64); - const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64); - - const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64); - const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64); - - // fdct_round_shift - int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS); - int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS); - - transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3); - - in[0] = out_0; - in[1] = out_1; - in[2] = out_2; - in[3] = out_3; -} - -static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in, - int16x8_t *out) { - const int16x8_t v_s0 = vaddq_s16(in[0], in[7]); - const int16x8_t v_s1 = vaddq_s16(in[1], in[6]); - const int16x8_t v_s2 = vaddq_s16(in[2], in[5]); - const int16x8_t v_s3 = vaddq_s16(in[3], in[4]); - const int16x8_t v_s4 = vsubq_s16(in[3], in[4]); - const int16x8_t v_s5 = vsubq_s16(in[2], in[5]); - const int16x8_t v_s6 = vsubq_s16(in[1], in[6]); - const int16x8_t v_s7 = vsubq_s16(in[0], in[7]); - // fdct4(step, step); - int16x8_t v_x0 = vaddq_s16(v_s0, v_s3); - int16x8_t v_x1 = vaddq_s16(v_s1, v_s2); - int16x8_t v_x2 = vsubq_s16(v_s1, v_s2); - int16x8_t v_x3 = vsubq_s16(v_s0, v_s3); - // fdct4(step, step); - int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); - int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); - int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); - int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); - int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64); - int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64); - int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64); - int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64); - v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), 
cospi_8_64); - v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64); - v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64); - v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64); - v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64); - v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64); - v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64); - v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64); - { - const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); - const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); - const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); - const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); - const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); - const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); - const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); - const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); - out[0] = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43 - out[2] = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63 - out[4] = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47 - out[6] = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67 - } - // Stage 2 - v_x0 = vsubq_s16(v_s6, v_s5); - v_x1 = vaddq_s16(v_s6, v_s5); - v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64); - v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64); - v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64); - v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64); - { - const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); - const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); - const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); - const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); - const int16x8_t ab = vcombine_s16(a, b); - const int16x8_t cd = vcombine_s16(c, d); - // Stage 3 - v_x0 = vaddq_s16(v_s4, ab); - v_x1 = vsubq_s16(v_s4, ab); - v_x2 = vsubq_s16(v_s7, cd); - v_x3 = vaddq_s16(v_s7, cd); - } - // Stage 4 - v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64); - v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64); - v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64); - v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), cospi_28_64); - v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64); - v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64); - v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64); - v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64); - v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64); - v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64); - v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64); - v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64); - v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64); - v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64); - v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64); - v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64); - { - const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); - const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); - const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); - const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); - const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); - const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); - const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); - const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); - out[1] = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53 - out[3] = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73 - out[5] = 
vcombine_s16(b, d); // 14 15 16 17 54 55 56 57 - out[7] = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77 - } -} - -static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) { - int16x8_t out[8]; - vpx_fdct8x8_pass1_notranspose_neon(in, out); - // transpose 8x8 - // Can't use transpose_s16_8x8() because the values are arranged in two 4x8 - // columns. - { - // 00 01 02 03 40 41 42 43 - // 10 11 12 13 50 51 52 53 - // 20 21 22 23 60 61 62 63 - // 30 31 32 33 70 71 72 73 - // 04 05 06 07 44 45 46 47 - // 14 15 16 17 54 55 56 57 - // 24 25 26 27 64 65 66 67 - // 34 35 36 37 74 75 76 77 - const int32x4x2_t r02_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out[0]), vreinterpretq_s32_s16(out[2])); - const int32x4x2_t r13_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out[1]), vreinterpretq_s32_s16(out[3])); - const int32x4x2_t r46_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out[4]), vreinterpretq_s32_s16(out[6])); - const int32x4x2_t r57_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out[5]), vreinterpretq_s32_s16(out[7])); - const int16x8x2_t r01_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]), - vreinterpretq_s16_s32(r13_s32.val[0])); - const int16x8x2_t r23_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]), - vreinterpretq_s16_s32(r13_s32.val[1])); - const int16x8x2_t r45_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]), - vreinterpretq_s16_s32(r57_s32.val[0])); - const int16x8x2_t r67_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]), - vreinterpretq_s16_s32(r57_s32.val[1])); - in[0] = r01_s16.val[0]; - in[1] = r01_s16.val[1]; - in[2] = r23_s16.val[0]; - in[3] = r23_s16.val[1]; - in[4] = r45_s16.val[0]; - in[5] = r45_s16.val[1]; - in[6] = r67_s16.val[0]; - in[7] = r67_s16.val[1]; - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 - } -} - -#if CONFIG_VP9_HIGHBITDEPTH -static INLINE int32x4_t highbd_add_round_shift_s32(int32x4_t x) { - const int32x2_t x_lo = vget_low_s32(x); - const int32x2_t x_hi = vget_high_s32(x); - const int64x2_t x64_lo = vmovl_s32(x_lo); - const int64x2_t x64_hi = vmovl_s32(x_hi); - - const int64x2_t sign_lo = (int64x2_t)vshrq_n_u64((uint64x2_t)x64_lo, 63); - const int64x2_t sign_hi = (int64x2_t)vshrq_n_u64((uint64x2_t)x64_hi, 63); - - const int64x2_t sum_lo = vaddq_s64(x64_lo, sign_lo); - const int64x2_t sum_hi = vaddq_s64(x64_hi, sign_hi); - return vcombine_s32(vshrn_n_s64(sum_lo, 1), vshrn_n_s64(sum_hi, 1)); -} - -static INLINE void highbd_butterfly_one_coeff_s32(const int32x4_t a, - const int32x4_t b, - const tran_coef_t c, - int32x4_t *add, - int32x4_t *sub) { - const int32x2_t a_lo = vget_low_s32(a); - const int32x2_t a_hi = vget_high_s32(a); - const int32x2_t b_lo = vget_low_s32(b); - const int32x2_t b_hi = vget_high_s32(b); - - const int64x2_t a64_lo = vmull_n_s32(a_lo, c); - const int64x2_t a64_hi = vmull_n_s32(a_hi, c); - - const int64x2_t sum_lo = vmlal_n_s32(a64_lo, b_lo, c); - const int64x2_t sum_hi = vmlal_n_s32(a64_hi, b_hi, c); - const int64x2_t diff_lo = vmlsl_n_s32(a64_lo, b_lo, c); - const int64x2_t diff_hi = vmlsl_n_s32(a64_hi, b_hi, c); - - *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS), - vrshrn_n_s64(sum_hi, DCT_CONST_BITS)); - *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS), - vrshrn_n_s64(diff_hi, DCT_CONST_BITS)); -} - -static INLINE void highbd_butterfly_two_coeff_s32( - const int32x4_t a, const int32x4_t b, const tran_coef_t c0, - const 
tran_coef_t c1, int32x4_t *add, int32x4_t *sub) { - const int32x2_t a_lo = vget_low_s32(a); - const int32x2_t a_hi = vget_high_s32(a); - const int32x2_t b_lo = vget_low_s32(b); - const int32x2_t b_hi = vget_high_s32(b); - - const int64x2_t axc0_64_lo = vmull_n_s32(a_lo, c0); - const int64x2_t axc0_64_hi = vmull_n_s32(a_hi, c0); - const int64x2_t axc1_64_lo = vmull_n_s32(a_lo, c1); - const int64x2_t axc1_64_hi = vmull_n_s32(a_hi, c1); - - const int64x2_t sum_lo = vmlal_n_s32(axc0_64_lo, b_lo, c1); - const int64x2_t sum_hi = vmlal_n_s32(axc0_64_hi, b_hi, c1); - const int64x2_t diff_lo = vmlsl_n_s32(axc1_64_lo, b_lo, c0); - const int64x2_t diff_hi = vmlsl_n_s32(axc1_64_hi, b_hi, c0); - - *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS), - vrshrn_n_s64(sum_hi, DCT_CONST_BITS)); - *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS), - vrshrn_n_s64(diff_hi, DCT_CONST_BITS)); -} - -static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) { - int32x4_t out[4]; - // in_0 +/- in_3, in_1 +/- in_2 - const int32x4_t s_0 = vaddq_s32(in[0], in[3]); - const int32x4_t s_1 = vaddq_s32(in[1], in[2]); - const int32x4_t s_2 = vsubq_s32(in[1], in[2]); - const int32x4_t s_3 = vsubq_s32(in[0], in[3]); - - highbd_butterfly_one_coeff_s32(s_0, s_1, cospi_16_64, &out[0], &out[2]); - - // out[1] = s_3 * cospi_8_64 + s_2 * cospi_24_64 - // out[3] = s_3 * cospi_24_64 - s_2 * cospi_8_64 - highbd_butterfly_two_coeff_s32(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], - &out[3]); - - transpose_s32_4x4(&out[0], &out[1], &out[2], &out[3]); - - in[0] = out[0]; - in[1] = out[1]; - in[2] = out[2]; - in[3] = out[3]; +// Add 1 if negative, and shift by 1. +// In practice, add the sign bit, then shift and round +static INLINE int32x4_t add_round_shift_half_s32(const int32x4_t a) { + const uint32x4_t a_u32 = vreinterpretq_u32_s32(a); + const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31); + const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32); + return vshrq_n_s32(vaddq_s32(a, a_sign_s32), 1); } -static INLINE void vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t *left, - int32x4_t *right) { - int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4]; - - sl[0] = vaddq_s32(left[0], left[7]); - sl[1] = vaddq_s32(left[1], left[6]); - sl[2] = vaddq_s32(left[2], left[5]); - sl[3] = vaddq_s32(left[3], left[4]); - sl[4] = vsubq_s32(left[3], left[4]); - sl[5] = vsubq_s32(left[2], left[5]); - sl[6] = vsubq_s32(left[1], left[6]); - sl[7] = vsubq_s32(left[0], left[7]); - sr[0] = vaddq_s32(right[0], right[7]); - sr[1] = vaddq_s32(right[1], right[6]); - sr[2] = vaddq_s32(right[2], right[5]); - sr[3] = vaddq_s32(right[3], right[4]); - sr[4] = vsubq_s32(right[3], right[4]); - sr[5] = vsubq_s32(right[2], right[5]); - sr[6] = vsubq_s32(right[1], right[6]); - sr[7] = vsubq_s32(right[0], right[7]); - - // fdct4(step, step); - // x0 = s0 + s3; - xl[0] = vaddq_s32(sl[0], sl[3]); - xr[0] = vaddq_s32(sr[0], sr[3]); - // x1 = s1 + s2; - xl[1] = vaddq_s32(sl[1], sl[2]); - xr[1] = vaddq_s32(sr[1], sr[2]); - // x2 = s1 - s2; - xl[2] = vsubq_s32(sl[1], sl[2]); - xr[2] = vsubq_s32(sr[1], sr[2]); - // x3 = s0 - s3; - xl[3] = vsubq_s32(sl[0], sl[3]); - xr[3] = vsubq_s32(sr[0], sr[3]); - - // fdct4(step, step); - // t0 = (x0 + x1) * cospi_16_64; - // t1 = (x0 - x1) * cospi_16_64; - // out[0] = (tran_low_t)fdct_round_shift(t0); - // out[4] = (tran_low_t)fdct_round_shift(t1); - highbd_butterfly_one_coeff_s32(xl[0], xl[1], cospi_16_64, &left[0], &left[4]); - highbd_butterfly_one_coeff_s32(xr[0], xr[1], cospi_16_64, &right[0], - &right[4]); - 
// t2 = x2 * cospi_24_64 + x3 * cospi_8_64; - // t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; - // out[2] = (tran_low_t)fdct_round_shift(t2); - // out[6] = (tran_low_t)fdct_round_shift(t3); - highbd_butterfly_two_coeff_s32(xl[3], xl[2], cospi_8_64, cospi_24_64, - &left[2], &left[6]); - highbd_butterfly_two_coeff_s32(xr[3], xr[2], cospi_8_64, cospi_24_64, - &right[2], &right[6]); - - // Stage 2 - // t0 = (s6 - s5) * cospi_16_64; - highbd_butterfly_one_coeff_s32(sl[6], sl[5], cospi_16_64, &tl[1], &tl[0]); - highbd_butterfly_one_coeff_s32(sr[6], sr[5], cospi_16_64, &tr[1], &tr[0]); - - // Stage 3 - xl[0] = vaddq_s32(sl[4], tl[0]); - xr[0] = vaddq_s32(sr[4], tr[0]); - xl[1] = vsubq_s32(sl[4], tl[0]); - xr[1] = vsubq_s32(sr[4], tr[0]); - xl[2] = vsubq_s32(sl[7], tl[1]); - xr[2] = vsubq_s32(sr[7], tr[1]); - xl[3] = vaddq_s32(sl[7], tl[1]); - xr[3] = vaddq_s32(sr[7], tr[1]); - - // Stage 4 - // t0 = x0 * cospi_28_64 + x3 * cospi_4_64; - // out[1] = (tran_low_t)fdct_round_shift(t0); - // t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; - // out[7] = (tran_low_t)fdct_round_shift(t3); - highbd_butterfly_two_coeff_s32(xl[3], xl[0], cospi_4_64, cospi_28_64, - &left[1], &left[7]); - highbd_butterfly_two_coeff_s32(xr[3], xr[0], cospi_4_64, cospi_28_64, - &right[1], &right[7]); - - // t1 = x1 * cospi_12_64 + x2 * cospi_20_64; - // out[5] = (tran_low_t)fdct_round_shift(t1); - // t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; - // out[3] = (tran_low_t)fdct_round_shift(t2); - highbd_butterfly_two_coeff_s32(xl[2], xl[1], cospi_20_64, cospi_12_64, - &left[5], &left[3]); - highbd_butterfly_two_coeff_s32(xr[2], xr[1], cospi_20_64, cospi_12_64, - &right[5], &right[3]); +// Add 1 if positive, 2 if negative, and shift by 2. +// In practice, add 1, then add the sign bit, then shift without rounding. +static INLINE int32x4_t add_round_shift_s32(const int32x4_t a) { + const int32x4_t one = vdupq_n_s32(1); + const uint32x4_t a_u32 = vreinterpretq_u32_s32(a); + const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31); + const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32); + return vshrq_n_s32(vaddq_s32(vaddq_s32(a, a_sign_s32), one), 2); } -static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left, - int32x4_t *right) { - int32x4x2_t out[8]; - vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right); - - out[0].val[0] = left[0]; - out[0].val[1] = right[0]; - out[1].val[0] = left[1]; - out[1].val[1] = right[1]; - out[2].val[0] = left[2]; - out[2].val[1] = right[2]; - out[3].val[0] = left[3]; - out[3].val[1] = right[3]; - out[4].val[0] = left[4]; - out[4].val[1] = right[4]; - out[5].val[0] = left[5]; - out[5].val[1] = right[5]; - out[6].val[0] = left[6]; - out[6].val[1] = right[6]; - out[7].val[0] = left[7]; - out[7].val[1] = right[7]; - - transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], - &out[6], &out[7]); - - left[0] = out[0].val[0]; - right[0] = out[0].val[1]; - left[1] = out[1].val[0]; - right[1] = out[1].val[1]; - left[2] = out[2].val[0]; - right[2] = out[2].val[1]; - left[3] = out[3].val[0]; - right[3] = out[3].val[1]; - left[4] = out[4].val[0]; - right[4] = out[4].val[1]; - left[5] = out[5].val[0]; - right[5] = out[5].val[1]; - left[6] = out[6].val[0]; - right[6] = out[6].val[1]; - left[7] = out[7].val[0]; - right[7] = out[7].val[1]; +// Add 2 if positive, 1 if negative, and shift by 2. +// In practice, subtract the sign bit, then shift with rounding. 
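+// This is the scalar first-pass rounding, (a + 1 + (a > 0)) >> 2, in vector
+// form.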
+static INLINE int16x8_t sub_round_shift_s16(const int16x8_t a) {
+  const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
+  const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
+  const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
+  return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2);
 }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // VPX_VPX_DSP_ARM_FDCT_NEON_H_
diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h
index bf06d6abe2..41d44f2b1f 100644
--- a/vpx_dsp/arm/transpose_neon.h
+++ b/vpx_dsp/arm/transpose_neon.h
@@ -821,6 +821,51 @@ static INLINE void transpose_s32_8x8(int32x4x2_t *a0, int32x4x2_t *a1,
   a7->val[1] = c7.val[1];
 }

+// Helper transpose function for highbd FDCT variants
+static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/,
+                                       int32x4_t *right /*[8]*/,
+                                       int32x4_t *out_left /*[8]*/,
+                                       int32x4_t *out_right /*[8]*/) {
+  int32x4x2_t out[8];
+
+  out[0].val[0] = left[0];
+  out[0].val[1] = right[0];
+  out[1].val[0] = left[1];
+  out[1].val[1] = right[1];
+  out[2].val[0] = left[2];
+  out[2].val[1] = right[2];
+  out[3].val[0] = left[3];
+  out[3].val[1] = right[3];
+  out[4].val[0] = left[4];
+  out[4].val[1] = right[4];
+  out[5].val[0] = left[5];
+  out[5].val[1] = right[5];
+  out[6].val[0] = left[6];
+  out[6].val[1] = right[6];
+  out[7].val[0] = left[7];
+  out[7].val[1] = right[7];
+
+  transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+                    &out[6], &out[7]);
+
+  out_left[0] = out[0].val[0];
+  out_left[1] = out[1].val[0];
+  out_left[2] = out[2].val[0];
+  out_left[3] = out[3].val[0];
+  out_left[4] = out[4].val[0];
+  out_left[5] = out[5].val[0];
+  out_left[6] = out[6].val[0];
+  out_left[7] = out[7].val[0];
+  out_right[0] = out[0].val[1];
+  out_right[1] = out[1].val[1];
+  out_right[2] = out[2].val[1];
+  out_right[3] = out[3].val[1];
+  out_right[4] = out[4].val[1];
+  out_right[5] = out[5].val[1];
+  out_right[6] = out[6].val[1];
+  out_right[7] = out[7].val[1];
+}
+
 static INLINE void transpose_u8_16x8(
     const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
     const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,

From 3f08aa0d0b2828b670073f808ae079acb35902a4 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Wed, 26 Oct 2022 22:09:32 +0000
Subject: [PATCH 468/926] [NEON] Optimize highbd 32x32 DCT

For --best quality, the resulting function vpx_highbd_fdct32x32_rd_neon
takes 0.27% of CPU time in profiling, vs 6.27% for the sum of the scalar
functions (vpx_fdct32, vpx_fdct32.constprop.0, vpx_fdct32x32_rd_c) in
the rd path. For --rt quality, the function takes 0.19% vs 4.57% for the
scalar version. Overall, this improves highbd encoding time by ~6% for
--best and ~9% for --rt.
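For reference, the one-coefficient butterflies used on this path
(butterfly_one_coeff_s32_fast) pre-scale the 14-bit cospi constants by
1 << 17 so that the saturating rounding doubling multiply-high returns
exactly fdct_round_shift(x * c):

  (2 * x * (c << 17) + (1 << 31)) >> 32 == (x * c + (1 << 13)) >> 14

A minimal scalar sketch of that identity (illustrative only; the helper
names below are not part of the library):

  #include <assert.h>
  #include <stdint.h>

  /* Reference rounding with DCT_CONST_BITS == 14. */
  static int32_t round_shift14(int64_t x) {
    return (int32_t)((x + (1 << 13)) >> 14);
  }

  /* Scalar model of one lane of vqrdmulhq_s32 (saturation omitted; it
   * is not reached for DCT-sized inputs). */
  static int32_t sqrdmulh_s32(int32_t a, int32_t b) {
    return (int32_t)((2 * (int64_t)a * b + ((int64_t)1 << 31)) >> 32);
  }

  int main(void) {
    const int32_t cospi_16_64 = 11585; /* round(2^14 * cos(16*pi/64)) */
    int32_t x;
    for (x = -(1 << 20); x <= (1 << 20); ++x) {
      assert(sqrdmulh_s32(x, cospi_16_64 << 17) ==
             round_shift14((int64_t)x * cospi_16_64));
    }
    return 0;
  }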
Change-Id: I1ce4bbef6e364bbadc76264056aa3f86b1a8edc5 --- vpx_dsp/arm/fdct32x32_neon.c | 185 ++++ vpx_dsp/arm/fdct32x32_neon.h | 1820 +++++++++++++++++++++++++++++++++- vpx_dsp/arm/fdct_neon.h | 9 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 4 files changed, 2013 insertions(+), 5 deletions(-) diff --git a/vpx_dsp/arm/fdct32x32_neon.c b/vpx_dsp/arm/fdct32x32_neon.c index e2bf167604..d6818d2ec6 100644 --- a/vpx_dsp/arm/fdct32x32_neon.c +++ b/vpx_dsp/arm/fdct32x32_neon.c @@ -230,5 +230,190 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, &temp5[29], &temp5[30], &temp5[31]); store(output + 24 * 32, temp5); } + +#if CONFIG_VP9_HIGHBITDEPTH + +void vpx_highbd_fdct32x32_neon(const int16_t *input, tran_low_t *output, + int stride) { + int16x8_t temp0[32]; + int32x4_t left1[32], left2[32], left3[32], left4[32], right1[32], right2[32], + right3[32], right4[32]; + int32x4_t left5[32], right5[32], left6[32], right6[32], left7[32], right7[32], + left8[32], right8[32]; + int32x4_t temp1[32], temp2[32]; + + // Process in 8x32 columns. + load_cross(input, stride, temp0); + highbd_scale_input(temp0, left1, right1); + highbd_dct8x32_body_first_pass(left1, right1); + highbd_partial_sub_round_shift(left1, right1); + + load_cross(input + 8, stride, temp0); + highbd_scale_input(temp0, left2, right2); + highbd_dct8x32_body_first_pass(left2, right2); + highbd_partial_sub_round_shift(left2, right2); + + load_cross(input + 16, stride, temp0); + highbd_scale_input(temp0, left3, right3); + highbd_dct8x32_body_first_pass(left3, right3); + highbd_partial_sub_round_shift(left3, right3); + + load_cross(input + 24, stride, temp0); + highbd_scale_input(temp0, left4, right4); + highbd_dct8x32_body_first_pass(left4, right4); + highbd_partial_sub_round_shift(left4, right4); + + // Generate the top row by munging the first set of 8 from each one together. + transpose_s32_8x8_2(left1, right1, temp1, temp2); + transpose_s32_8x8_2(left2, right2, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3, right3, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4, right4, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left5, right5); + highbd_dct8x32_body_second_pass(left5, right5); + highbd_partial_add_round_shift(left5, right5); + + // Second row of 8x32. + transpose_s32_8x8_2(left1 + 8, right1 + 8, temp1, temp2); + transpose_s32_8x8_2(left2 + 8, right2 + 8, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 8, right3 + 8, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 8, right4 + 8, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left6, right6); + highbd_dct8x32_body_second_pass(left6, right6); + highbd_partial_add_round_shift(left6, right6); + + // Third row of 8x32 + transpose_s32_8x8_2(left1 + 16, right1 + 16, temp1, temp2); + transpose_s32_8x8_2(left2 + 16, right2 + 16, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 16, right3 + 16, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 16, right4 + 16, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left7, right7); + highbd_dct8x32_body_second_pass(left7, right7); + highbd_partial_add_round_shift(left7, right7); + + // Final row of 8x32. 
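+  // As above, each 8x32 strip is carried as two int32x4_t halves per row:
+  // "left" holds lanes 0-3 and "right" lanes 4-7 of the original int16x8_t
+  // rows, keeping full 32-bit precision between the passes.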
+ transpose_s32_8x8_2(left1 + 24, right1 + 24, temp1, temp2); + transpose_s32_8x8_2(left2 + 24, right2 + 24, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 24, right3 + 24, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 24, right4 + 24, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left8, right8); + highbd_dct8x32_body_second_pass(left8, right8); + highbd_partial_add_round_shift(left8, right8); + + // Final transpose + transpose_s32_8x8_2(left5, right5, left1, right1); + transpose_s32_8x8_2(left5 + 8, right5 + 8, left2, right2); + transpose_s32_8x8_2(left5 + 16, right5 + 16, left3, right3); + transpose_s32_8x8_2(left5 + 24, right5 + 24, left4, right4); + transpose_s32_8x8_2(left6, right6, left1 + 8, right1 + 8); + transpose_s32_8x8_2(left6 + 8, right6 + 8, left2 + 8, right2 + 8); + transpose_s32_8x8_2(left6 + 16, right6 + 16, left3 + 8, right3 + 8); + transpose_s32_8x8_2(left6 + 24, right6 + 24, left4 + 8, right4 + 8); + transpose_s32_8x8_2(left7, right7, left1 + 16, right1 + 16); + transpose_s32_8x8_2(left7 + 8, right7 + 8, left2 + 16, right2 + 16); + transpose_s32_8x8_2(left7 + 16, right7 + 16, left3 + 16, right3 + 16); + transpose_s32_8x8_2(left7 + 24, right7 + 24, left4 + 16, right4 + 16); + transpose_s32_8x8_2(left8, right8, left1 + 24, right1 + 24); + transpose_s32_8x8_2(left8 + 8, right8 + 8, left2 + 24, right2 + 24); + transpose_s32_8x8_2(left8 + 16, right8 + 16, left3 + 24, right3 + 24); + transpose_s32_8x8_2(left8 + 24, right8 + 24, left4 + 24, right4 + 24); + + store32x32_s32(output, left1, right1, left2, right2, left3, right3, left4, + right4); +} + +void vpx_highbd_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, + int stride) { + int16x8_t temp0[32]; + int32x4_t left1[32], left2[32], left3[32], left4[32], right1[32], right2[32], + right3[32], right4[32]; + int32x4_t left5[32], right5[32], left6[32], right6[32], left7[32], right7[32], + left8[32], right8[32]; + int32x4_t temp1[32], temp2[32]; + + // Process in 8x32 columns. + load_cross(input, stride, temp0); + highbd_scale_input(temp0, left1, right1); + highbd_dct8x32_body_first_pass(left1, right1); + highbd_partial_sub_round_shift(left1, right1); + + load_cross(input + 8, stride, temp0); + highbd_scale_input(temp0, left2, right2); + highbd_dct8x32_body_first_pass(left2, right2); + highbd_partial_sub_round_shift(left2, right2); + + load_cross(input + 16, stride, temp0); + highbd_scale_input(temp0, left3, right3); + highbd_dct8x32_body_first_pass(left3, right3); + highbd_partial_sub_round_shift(left3, right3); + + load_cross(input + 24, stride, temp0); + highbd_scale_input(temp0, left4, right4); + highbd_dct8x32_body_first_pass(left4, right4); + highbd_partial_sub_round_shift(left4, right4); + + // Generate the top row by munging the first set of 8 from each one together. + transpose_s32_8x8_2(left1, right1, temp1, temp2); + transpose_s32_8x8_2(left2, right2, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3, right3, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4, right4, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left5, right5); + highbd_dct8x32_body_second_pass_rd(left5, right5); + + // Second row of 8x32. 
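+  // Note: as in the first row above, the _rd body applies its own rounding,
+  // so no separate highbd_partial_add_round_shift() pass is needed.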
+ transpose_s32_8x8_2(left1 + 8, right1 + 8, temp1, temp2); + transpose_s32_8x8_2(left2 + 8, right2 + 8, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 8, right3 + 8, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 8, right4 + 8, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left6, right6); + highbd_dct8x32_body_second_pass_rd(left6, right6); + + // Third row of 8x32 + transpose_s32_8x8_2(left1 + 16, right1 + 16, temp1, temp2); + transpose_s32_8x8_2(left2 + 16, right2 + 16, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 16, right3 + 16, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 16, right4 + 16, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left7, right7); + highbd_dct8x32_body_second_pass_rd(left7, right7); + + // Final row of 8x32. + transpose_s32_8x8_2(left1 + 24, right1 + 24, temp1, temp2); + transpose_s32_8x8_2(left2 + 24, right2 + 24, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 24, right3 + 24, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 24, right4 + 24, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left8, right8); + highbd_dct8x32_body_second_pass_rd(left8, right8); + + // Final transpose + transpose_s32_8x8_2(left5, right5, left1, right1); + transpose_s32_8x8_2(left5 + 8, right5 + 8, left2, right2); + transpose_s32_8x8_2(left5 + 16, right5 + 16, left3, right3); + transpose_s32_8x8_2(left5 + 24, right5 + 24, left4, right4); + transpose_s32_8x8_2(left6, right6, left1 + 8, right1 + 8); + transpose_s32_8x8_2(left6 + 8, right6 + 8, left2 + 8, right2 + 8); + transpose_s32_8x8_2(left6 + 16, right6 + 16, left3 + 8, right3 + 8); + transpose_s32_8x8_2(left6 + 24, right6 + 24, left4 + 8, right4 + 8); + transpose_s32_8x8_2(left7, right7, left1 + 16, right1 + 16); + transpose_s32_8x8_2(left7 + 8, right7 + 8, left2 + 16, right2 + 16); + transpose_s32_8x8_2(left7 + 16, right7 + 16, left3 + 16, right3 + 16); + transpose_s32_8x8_2(left7 + 24, right7 + 24, left4 + 16, right4 + 16); + transpose_s32_8x8_2(left8, right8, left1 + 24, right1 + 24); + transpose_s32_8x8_2(left8 + 8, right8 + 8, left2 + 24, right2 + 24); + transpose_s32_8x8_2(left8 + 16, right8 + 16, left3 + 24, right3 + 24); + transpose_s32_8x8_2(left8 + 24, right8 + 24, left4 + 24, right4 + 24); + + store32x32_s32(output, left1, right1, left2, right2, left3, right3, left4, + right4); +} + +#endif // CONFIG_VP9_HIGHBITDEPTH + #endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && // __GNUC__ == 4 && __GNUC_MINOR__ <= 9 diff --git a/vpx_dsp/arm/fdct32x32_neon.h b/vpx_dsp/arm/fdct32x32_neon.h index dd647918b2..3b9e64c6df 100644 --- a/vpx_dsp/arm/fdct32x32_neon.h +++ b/vpx_dsp/arm/fdct32x32_neon.h @@ -143,7 +143,7 @@ static INLINE void scale_input(const int16x8_t *in /*32*/, out[31] = vshlq_n_s16(in[31], 2); } -static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) { +static INLINE void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) { int16x8_t a[32]; int16x8_t b[32]; @@ -494,7 +494,7 @@ static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) { &b##_hi[sub_index]); \ } while (0) -static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) { +static INLINE void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) { int16x8_t a[32]; int16x8_t b[32]; int32x4_t c_lo[32]; @@ -800,7 +800,8 @@ static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) { out[3] = add_round_shift_s32_narrow(d_lo[3], d_hi[3]); } -static void dct_body_second_pass_rd(const int16x8_t *in, 
int16x8_t *out) { +static INLINE void dct_body_second_pass_rd(const int16x8_t *in, + int16x8_t *out) { int16x8_t a[32]; int16x8_t b[32]; @@ -1102,4 +1103,1817 @@ static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) { #undef BUTTERFLY_ONE_S32 #undef BUTTERFLY_TWO_S32 +#if CONFIG_VP9_HIGHBITDEPTH + +// Store 32 32x4 vectors, assuming stride == 32. +static INLINE void store32x32_s32( + tran_low_t *a, const int32x4_t *l1 /*[16]*/, const int32x4_t *r1 /*[16]*/, + const int32x4_t *l2 /*[16]*/, const int32x4_t *r2 /*[16]*/, + const int32x4_t *l3 /*[16]*/, const int32x4_t *r3 /*[16]*/, + const int32x4_t *l4 /*[16]*/, const int32x4_t *r4 /*[16]*/) { + int i; + for (i = 0; i < 32; i++) { + vst1q_s32(a, l1[i]); + vst1q_s32(a + 4, r1[i]); + vst1q_s32(a + 8, l2[i]); + vst1q_s32(a + 12, r2[i]); + vst1q_s32(a + 16, l3[i]); + vst1q_s32(a + 20, r3[i]); + vst1q_s32(a + 24, l4[i]); + vst1q_s32(a + 28, r4[i]); + a += 32; + } +} + +static INLINE void highbd_scale_input(const int16x8_t *a /*[32]*/, + int32x4_t *left /*[32]*/, + int32x4_t *right /* [32] */) { + left[0] = vshll_n_s16(vget_low_s16(a[0]), 2); + left[1] = vshll_n_s16(vget_low_s16(a[1]), 2); + left[2] = vshll_n_s16(vget_low_s16(a[2]), 2); + left[3] = vshll_n_s16(vget_low_s16(a[3]), 2); + left[4] = vshll_n_s16(vget_low_s16(a[4]), 2); + left[5] = vshll_n_s16(vget_low_s16(a[5]), 2); + left[6] = vshll_n_s16(vget_low_s16(a[6]), 2); + left[7] = vshll_n_s16(vget_low_s16(a[7]), 2); + left[8] = vshll_n_s16(vget_low_s16(a[8]), 2); + left[9] = vshll_n_s16(vget_low_s16(a[9]), 2); + left[10] = vshll_n_s16(vget_low_s16(a[10]), 2); + left[11] = vshll_n_s16(vget_low_s16(a[11]), 2); + left[12] = vshll_n_s16(vget_low_s16(a[12]), 2); + left[13] = vshll_n_s16(vget_low_s16(a[13]), 2); + left[14] = vshll_n_s16(vget_low_s16(a[14]), 2); + left[15] = vshll_n_s16(vget_low_s16(a[15]), 2); + left[16] = vshll_n_s16(vget_low_s16(a[16]), 2); + left[17] = vshll_n_s16(vget_low_s16(a[17]), 2); + left[18] = vshll_n_s16(vget_low_s16(a[18]), 2); + left[19] = vshll_n_s16(vget_low_s16(a[19]), 2); + left[20] = vshll_n_s16(vget_low_s16(a[20]), 2); + left[21] = vshll_n_s16(vget_low_s16(a[21]), 2); + left[22] = vshll_n_s16(vget_low_s16(a[22]), 2); + left[23] = vshll_n_s16(vget_low_s16(a[23]), 2); + left[24] = vshll_n_s16(vget_low_s16(a[24]), 2); + left[25] = vshll_n_s16(vget_low_s16(a[25]), 2); + left[26] = vshll_n_s16(vget_low_s16(a[26]), 2); + left[27] = vshll_n_s16(vget_low_s16(a[27]), 2); + left[28] = vshll_n_s16(vget_low_s16(a[28]), 2); + left[29] = vshll_n_s16(vget_low_s16(a[29]), 2); + left[30] = vshll_n_s16(vget_low_s16(a[30]), 2); + left[31] = vshll_n_s16(vget_low_s16(a[31]), 2); + + right[0] = vshll_n_s16(vget_high_s16(a[0]), 2); + right[1] = vshll_n_s16(vget_high_s16(a[1]), 2); + right[2] = vshll_n_s16(vget_high_s16(a[2]), 2); + right[3] = vshll_n_s16(vget_high_s16(a[3]), 2); + right[4] = vshll_n_s16(vget_high_s16(a[4]), 2); + right[5] = vshll_n_s16(vget_high_s16(a[5]), 2); + right[6] = vshll_n_s16(vget_high_s16(a[6]), 2); + right[7] = vshll_n_s16(vget_high_s16(a[7]), 2); + right[8] = vshll_n_s16(vget_high_s16(a[8]), 2); + right[9] = vshll_n_s16(vget_high_s16(a[9]), 2); + right[10] = vshll_n_s16(vget_high_s16(a[10]), 2); + right[11] = vshll_n_s16(vget_high_s16(a[11]), 2); + right[12] = vshll_n_s16(vget_high_s16(a[12]), 2); + right[13] = vshll_n_s16(vget_high_s16(a[13]), 2); + right[14] = vshll_n_s16(vget_high_s16(a[14]), 2); + right[15] = vshll_n_s16(vget_high_s16(a[15]), 2); + right[16] = vshll_n_s16(vget_high_s16(a[16]), 2); + right[17] = 
vshll_n_s16(vget_high_s16(a[17]), 2); + right[18] = vshll_n_s16(vget_high_s16(a[18]), 2); + right[19] = vshll_n_s16(vget_high_s16(a[19]), 2); + right[20] = vshll_n_s16(vget_high_s16(a[20]), 2); + right[21] = vshll_n_s16(vget_high_s16(a[21]), 2); + right[22] = vshll_n_s16(vget_high_s16(a[22]), 2); + right[23] = vshll_n_s16(vget_high_s16(a[23]), 2); + right[24] = vshll_n_s16(vget_high_s16(a[24]), 2); + right[25] = vshll_n_s16(vget_high_s16(a[25]), 2); + right[26] = vshll_n_s16(vget_high_s16(a[26]), 2); + right[27] = vshll_n_s16(vget_high_s16(a[27]), 2); + right[28] = vshll_n_s16(vget_high_s16(a[28]), 2); + right[29] = vshll_n_s16(vget_high_s16(a[29]), 2); + right[30] = vshll_n_s16(vget_high_s16(a[30]), 2); + right[31] = vshll_n_s16(vget_high_s16(a[31]), 2); +} + +static INLINE void highbd_cross_input(const int32x4_t *a_left /*[32]*/, + int32x4_t *a_right /*[32]*/, + int32x4_t *b_left /*[32]*/, + int32x4_t *b_right /*[32]*/) { + // Stage 1. Done as part of the load for the first pass. + b_left[0] = vaddq_s32(a_left[0], a_left[31]); + b_left[1] = vaddq_s32(a_left[1], a_left[30]); + b_left[2] = vaddq_s32(a_left[2], a_left[29]); + b_left[3] = vaddq_s32(a_left[3], a_left[28]); + b_left[4] = vaddq_s32(a_left[4], a_left[27]); + b_left[5] = vaddq_s32(a_left[5], a_left[26]); + b_left[6] = vaddq_s32(a_left[6], a_left[25]); + b_left[7] = vaddq_s32(a_left[7], a_left[24]); + b_left[8] = vaddq_s32(a_left[8], a_left[23]); + b_left[9] = vaddq_s32(a_left[9], a_left[22]); + b_left[10] = vaddq_s32(a_left[10], a_left[21]); + b_left[11] = vaddq_s32(a_left[11], a_left[20]); + b_left[12] = vaddq_s32(a_left[12], a_left[19]); + b_left[13] = vaddq_s32(a_left[13], a_left[18]); + b_left[14] = vaddq_s32(a_left[14], a_left[17]); + b_left[15] = vaddq_s32(a_left[15], a_left[16]); + + b_right[0] = vaddq_s32(a_right[0], a_right[31]); + b_right[1] = vaddq_s32(a_right[1], a_right[30]); + b_right[2] = vaddq_s32(a_right[2], a_right[29]); + b_right[3] = vaddq_s32(a_right[3], a_right[28]); + b_right[4] = vaddq_s32(a_right[4], a_right[27]); + b_right[5] = vaddq_s32(a_right[5], a_right[26]); + b_right[6] = vaddq_s32(a_right[6], a_right[25]); + b_right[7] = vaddq_s32(a_right[7], a_right[24]); + b_right[8] = vaddq_s32(a_right[8], a_right[23]); + b_right[9] = vaddq_s32(a_right[9], a_right[22]); + b_right[10] = vaddq_s32(a_right[10], a_right[21]); + b_right[11] = vaddq_s32(a_right[11], a_right[20]); + b_right[12] = vaddq_s32(a_right[12], a_right[19]); + b_right[13] = vaddq_s32(a_right[13], a_right[18]); + b_right[14] = vaddq_s32(a_right[14], a_right[17]); + b_right[15] = vaddq_s32(a_right[15], a_right[16]); + + b_left[16] = vsubq_s32(a_left[15], a_left[16]); + b_left[17] = vsubq_s32(a_left[14], a_left[17]); + b_left[18] = vsubq_s32(a_left[13], a_left[18]); + b_left[19] = vsubq_s32(a_left[12], a_left[19]); + b_left[20] = vsubq_s32(a_left[11], a_left[20]); + b_left[21] = vsubq_s32(a_left[10], a_left[21]); + b_left[22] = vsubq_s32(a_left[9], a_left[22]); + b_left[23] = vsubq_s32(a_left[8], a_left[23]); + b_left[24] = vsubq_s32(a_left[7], a_left[24]); + b_left[25] = vsubq_s32(a_left[6], a_left[25]); + b_left[26] = vsubq_s32(a_left[5], a_left[26]); + b_left[27] = vsubq_s32(a_left[4], a_left[27]); + b_left[28] = vsubq_s32(a_left[3], a_left[28]); + b_left[29] = vsubq_s32(a_left[2], a_left[29]); + b_left[30] = vsubq_s32(a_left[1], a_left[30]); + b_left[31] = vsubq_s32(a_left[0], a_left[31]); + + b_right[16] = vsubq_s32(a_right[15], a_right[16]); + b_right[17] = vsubq_s32(a_right[14], a_right[17]); + b_right[18] = vsubq_s32(a_right[13], 
a_right[18]);
+  b_right[19] = vsubq_s32(a_right[12], a_right[19]);
+  b_right[20] = vsubq_s32(a_right[11], a_right[20]);
+  b_right[21] = vsubq_s32(a_right[10], a_right[21]);
+  b_right[22] = vsubq_s32(a_right[9], a_right[22]);
+  b_right[23] = vsubq_s32(a_right[8], a_right[23]);
+  b_right[24] = vsubq_s32(a_right[7], a_right[24]);
+  b_right[25] = vsubq_s32(a_right[6], a_right[25]);
+  b_right[26] = vsubq_s32(a_right[5], a_right[26]);
+  b_right[27] = vsubq_s32(a_right[4], a_right[27]);
+  b_right[28] = vsubq_s32(a_right[3], a_right[28]);
+  b_right[29] = vsubq_s32(a_right[2], a_right[29]);
+  b_right[30] = vsubq_s32(a_right[1], a_right[30]);
+  b_right[31] = vsubq_s32(a_right[0], a_right[31]);
+}
+
+static INLINE void highbd_partial_add_round_shift(int32x4_t *left /*[32]*/,
+                                                  int32x4_t *right /* [32] */) {
+  // Also compute partial rounding shift (second pass, note the < 0 test):
+  // out[j + i * 32] = (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+
+  left[0] = add_round_shift_s32(left[0]);
+  left[1] = add_round_shift_s32(left[1]);
+  left[2] = add_round_shift_s32(left[2]);
+  left[3] = add_round_shift_s32(left[3]);
+  left[4] = add_round_shift_s32(left[4]);
+  left[5] = add_round_shift_s32(left[5]);
+  left[6] = add_round_shift_s32(left[6]);
+  left[7] = add_round_shift_s32(left[7]);
+  left[8] = add_round_shift_s32(left[8]);
+  left[9] = add_round_shift_s32(left[9]);
+  left[10] = add_round_shift_s32(left[10]);
+  left[11] = add_round_shift_s32(left[11]);
+  left[12] = add_round_shift_s32(left[12]);
+  left[13] = add_round_shift_s32(left[13]);
+  left[14] = add_round_shift_s32(left[14]);
+  left[15] = add_round_shift_s32(left[15]);
+  left[16] = add_round_shift_s32(left[16]);
+  left[17] = add_round_shift_s32(left[17]);
+  left[18] = add_round_shift_s32(left[18]);
+  left[19] = add_round_shift_s32(left[19]);
+  left[20] = add_round_shift_s32(left[20]);
+  left[21] = add_round_shift_s32(left[21]);
+  left[22] = add_round_shift_s32(left[22]);
+  left[23] = add_round_shift_s32(left[23]);
+  left[24] = add_round_shift_s32(left[24]);
+  left[25] = add_round_shift_s32(left[25]);
+  left[26] = add_round_shift_s32(left[26]);
+  left[27] = add_round_shift_s32(left[27]);
+  left[28] = add_round_shift_s32(left[28]);
+  left[29] = add_round_shift_s32(left[29]);
+  left[30] = add_round_shift_s32(left[30]);
+  left[31] = add_round_shift_s32(left[31]);
+
+  right[0] = add_round_shift_s32(right[0]);
+  right[1] = add_round_shift_s32(right[1]);
+  right[2] = add_round_shift_s32(right[2]);
+  right[3] = add_round_shift_s32(right[3]);
+  right[4] = add_round_shift_s32(right[4]);
+  right[5] = add_round_shift_s32(right[5]);
+  right[6] = add_round_shift_s32(right[6]);
+  right[7] = add_round_shift_s32(right[7]);
+  right[8] = add_round_shift_s32(right[8]);
+  right[9] = add_round_shift_s32(right[9]);
+  right[10] = add_round_shift_s32(right[10]);
+  right[11] = add_round_shift_s32(right[11]);
+  right[12] = add_round_shift_s32(right[12]);
+  right[13] = add_round_shift_s32(right[13]);
+  right[14] = add_round_shift_s32(right[14]);
+  right[15] = add_round_shift_s32(right[15]);
+  right[16] = add_round_shift_s32(right[16]);
+  right[17] = add_round_shift_s32(right[17]);
+  right[18] = add_round_shift_s32(right[18]);
+  right[19] = add_round_shift_s32(right[19]);
+  right[20] = add_round_shift_s32(right[20]);
+  right[21] = add_round_shift_s32(right[21]);
+  right[22] = add_round_shift_s32(right[22]);
+  right[23] = add_round_shift_s32(right[23]);
+  right[24] = add_round_shift_s32(right[24]);
+  right[25] = add_round_shift_s32(right[25]);
+  right[26] = add_round_shift_s32(right[26]);
+  right[27] =
add_round_shift_s32(right[27]); + right[28] = add_round_shift_s32(right[28]); + right[29] = add_round_shift_s32(right[29]); + right[30] = add_round_shift_s32(right[30]); + right[31] = add_round_shift_s32(right[31]); +} + +static INLINE void highbd_partial_sub_round_shift(int32x4_t *left /*[32]*/, + int32x4_t *right /* [32] */) { + // Also compute partial rounding shift: + // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + + left[0] = sub_round_shift_s32(left[0]); + left[1] = sub_round_shift_s32(left[1]); + left[2] = sub_round_shift_s32(left[2]); + left[3] = sub_round_shift_s32(left[3]); + left[4] = sub_round_shift_s32(left[4]); + left[5] = sub_round_shift_s32(left[5]); + left[6] = sub_round_shift_s32(left[6]); + left[7] = sub_round_shift_s32(left[7]); + left[8] = sub_round_shift_s32(left[8]); + left[9] = sub_round_shift_s32(left[9]); + left[10] = sub_round_shift_s32(left[10]); + left[11] = sub_round_shift_s32(left[11]); + left[12] = sub_round_shift_s32(left[12]); + left[13] = sub_round_shift_s32(left[13]); + left[14] = sub_round_shift_s32(left[14]); + left[15] = sub_round_shift_s32(left[15]); + left[16] = sub_round_shift_s32(left[16]); + left[17] = sub_round_shift_s32(left[17]); + left[18] = sub_round_shift_s32(left[18]); + left[19] = sub_round_shift_s32(left[19]); + left[20] = sub_round_shift_s32(left[20]); + left[21] = sub_round_shift_s32(left[21]); + left[22] = sub_round_shift_s32(left[22]); + left[23] = sub_round_shift_s32(left[23]); + left[24] = sub_round_shift_s32(left[24]); + left[25] = sub_round_shift_s32(left[25]); + left[26] = sub_round_shift_s32(left[26]); + left[27] = sub_round_shift_s32(left[27]); + left[28] = sub_round_shift_s32(left[28]); + left[29] = sub_round_shift_s32(left[29]); + left[30] = sub_round_shift_s32(left[30]); + left[31] = sub_round_shift_s32(left[31]); + + right[0] = sub_round_shift_s32(right[0]); + right[1] = sub_round_shift_s32(right[1]); + right[2] = sub_round_shift_s32(right[2]); + right[3] = sub_round_shift_s32(right[3]); + right[4] = sub_round_shift_s32(right[4]); + right[5] = sub_round_shift_s32(right[5]); + right[6] = sub_round_shift_s32(right[6]); + right[7] = sub_round_shift_s32(right[7]); + right[8] = sub_round_shift_s32(right[8]); + right[9] = sub_round_shift_s32(right[9]); + right[10] = sub_round_shift_s32(right[10]); + right[11] = sub_round_shift_s32(right[11]); + right[12] = sub_round_shift_s32(right[12]); + right[13] = sub_round_shift_s32(right[13]); + right[14] = sub_round_shift_s32(right[14]); + right[15] = sub_round_shift_s32(right[15]); + right[16] = sub_round_shift_s32(right[16]); + right[17] = sub_round_shift_s32(right[17]); + right[18] = sub_round_shift_s32(right[18]); + right[19] = sub_round_shift_s32(right[19]); + right[20] = sub_round_shift_s32(right[20]); + right[21] = sub_round_shift_s32(right[21]); + right[22] = sub_round_shift_s32(right[22]); + right[23] = sub_round_shift_s32(right[23]); + right[24] = sub_round_shift_s32(right[24]); + right[25] = sub_round_shift_s32(right[25]); + right[26] = sub_round_shift_s32(right[26]); + right[27] = sub_round_shift_s32(right[27]); + right[28] = sub_round_shift_s32(right[28]); + right[29] = sub_round_shift_s32(right[29]); + right[30] = sub_round_shift_s32(right[30]); + right[31] = sub_round_shift_s32(right[31]); +} + +static INLINE void highbd_dct8x32_body_first_pass(int32x4_t *left /*32*/, + int32x4_t *right /*32*/) { + int32x4_t al[32], ar[32]; + int32x4_t bl[32], br[32]; + + // Stage 1: Done as part of the load. + + // Stage 2. + // Mini cross. 
X the first 16 values and the middle 8 of the second half. + al[0] = vaddq_s32(left[0], left[15]); + ar[0] = vaddq_s32(right[0], right[15]); + al[1] = vaddq_s32(left[1], left[14]); + ar[1] = vaddq_s32(right[1], right[14]); + al[2] = vaddq_s32(left[2], left[13]); + ar[2] = vaddq_s32(right[2], right[13]); + al[3] = vaddq_s32(left[3], left[12]); + ar[3] = vaddq_s32(right[3], right[12]); + al[4] = vaddq_s32(left[4], left[11]); + ar[4] = vaddq_s32(right[4], right[11]); + al[5] = vaddq_s32(left[5], left[10]); + ar[5] = vaddq_s32(right[5], right[10]); + al[6] = vaddq_s32(left[6], left[9]); + ar[6] = vaddq_s32(right[6], right[9]); + al[7] = vaddq_s32(left[7], left[8]); + ar[7] = vaddq_s32(right[7], right[8]); + + al[8] = vsubq_s32(left[7], left[8]); + ar[8] = vsubq_s32(right[7], right[8]); + al[9] = vsubq_s32(left[6], left[9]); + ar[9] = vsubq_s32(right[6], right[9]); + al[10] = vsubq_s32(left[5], left[10]); + ar[10] = vsubq_s32(right[5], right[10]); + al[11] = vsubq_s32(left[4], left[11]); + ar[11] = vsubq_s32(right[4], right[11]); + al[12] = vsubq_s32(left[3], left[12]); + ar[12] = vsubq_s32(right[3], right[12]); + al[13] = vsubq_s32(left[2], left[13]); + ar[13] = vsubq_s32(right[2], right[13]); + al[14] = vsubq_s32(left[1], left[14]); + ar[14] = vsubq_s32(right[1], right[14]); + al[15] = vsubq_s32(left[0], left[15]); + ar[15] = vsubq_s32(right[0], right[15]); + + al[16] = left[16]; + ar[16] = right[16]; + al[17] = left[17]; + ar[17] = right[17]; + al[18] = left[18]; + ar[18] = right[18]; + al[19] = left[19]; + ar[19] = right[19]; + + butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20], + cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]); + butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21], + cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]); + butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22], + cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]); + butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23], + cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]); + + al[28] = left[28]; + ar[28] = right[28]; + al[29] = left[29]; + ar[29] = right[29]; + al[30] = left[30]; + ar[30] = right[30]; + al[31] = left[31]; + ar[31] = right[31]; + + // Stage 3. 
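+  // Illustrative annotation (inferred from usage here, not part of the
+  // original change): butterfly_one_coeff_s32_fast(), used in the stages
+  // above and below, is a single-coefficient rotation. Per lane it computes,
+  // roughly,
+  //   add = fdct_round_shift((a + b) * c);
+  //   sub = fdct_round_shift((a - b) * c);
+  // where c is a Q14 cosine constant such as cospi_16_64.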
+ bl[0] = vaddq_s32(al[0], al[7]); + br[0] = vaddq_s32(ar[0], ar[7]); + bl[1] = vaddq_s32(al[1], al[6]); + br[1] = vaddq_s32(ar[1], ar[6]); + bl[2] = vaddq_s32(al[2], al[5]); + br[2] = vaddq_s32(ar[2], ar[5]); + bl[3] = vaddq_s32(al[3], al[4]); + br[3] = vaddq_s32(ar[3], ar[4]); + + bl[4] = vsubq_s32(al[3], al[4]); + br[4] = vsubq_s32(ar[3], ar[4]); + bl[5] = vsubq_s32(al[2], al[5]); + br[5] = vsubq_s32(ar[2], ar[5]); + bl[6] = vsubq_s32(al[1], al[6]); + br[6] = vsubq_s32(ar[1], ar[6]); + bl[7] = vsubq_s32(al[0], al[7]); + br[7] = vsubq_s32(ar[0], ar[7]); + + bl[8] = al[8]; + br[8] = ar[8]; + bl[9] = al[9]; + br[9] = ar[9]; + + butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64, + &bl[13], &br[13], &bl[10], &br[10]); + butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64, + &bl[12], &br[12], &bl[11], &br[11]); + + bl[14] = al[14]; + br[14] = ar[14]; + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(left[16], al[23]); + br[16] = vaddq_s32(right[16], ar[23]); + bl[17] = vaddq_s32(left[17], al[22]); + br[17] = vaddq_s32(right[17], ar[22]); + bl[18] = vaddq_s32(left[18], al[21]); + br[18] = vaddq_s32(right[18], ar[21]); + bl[19] = vaddq_s32(left[19], al[20]); + br[19] = vaddq_s32(right[19], ar[20]); + + bl[20] = vsubq_s32(left[19], al[20]); + br[20] = vsubq_s32(right[19], ar[20]); + bl[21] = vsubq_s32(left[18], al[21]); + br[21] = vsubq_s32(right[18], ar[21]); + bl[22] = vsubq_s32(left[17], al[22]); + br[22] = vsubq_s32(right[17], ar[22]); + bl[23] = vsubq_s32(left[16], al[23]); + br[23] = vsubq_s32(right[16], ar[23]); + + bl[24] = vsubq_s32(left[31], al[24]); + br[24] = vsubq_s32(right[31], ar[24]); + bl[25] = vsubq_s32(left[30], al[25]); + br[25] = vsubq_s32(right[30], ar[25]); + bl[26] = vsubq_s32(left[29], al[26]); + br[26] = vsubq_s32(right[29], ar[26]); + bl[27] = vsubq_s32(left[28], al[27]); + br[27] = vsubq_s32(right[28], ar[27]); + + bl[28] = vaddq_s32(left[28], al[27]); + br[28] = vaddq_s32(right[28], ar[27]); + bl[29] = vaddq_s32(left[29], al[26]); + br[29] = vaddq_s32(right[29], ar[26]); + bl[30] = vaddq_s32(left[30], al[25]); + br[30] = vaddq_s32(right[30], ar[25]); + bl[31] = vaddq_s32(left[31], al[24]); + br[31] = vaddq_s32(right[31], ar[24]); + + // Stage 4. 
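+  // Illustrative annotation (inferred, not part of the original change): the
+  // butterfly_two_coeff_s32_s64_narrow() calls below are two-coefficient
+  // rotations (paired a * c0 -/+ b * c1 forms; see fdct_neon.h). They widen
+  // to 64-bit accumulators and narrow back to s32, because the product of a
+  // Q14 coefficient and a high-bit-depth intermediate can overflow 32 bits.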
+ al[0] = vaddq_s32(bl[0], bl[3]); + ar[0] = vaddq_s32(br[0], br[3]); + al[1] = vaddq_s32(bl[1], bl[2]); + ar[1] = vaddq_s32(br[1], br[2]); + al[2] = vsubq_s32(bl[1], bl[2]); + ar[2] = vsubq_s32(br[1], br[2]); + al[3] = vsubq_s32(bl[0], bl[3]); + ar[3] = vsubq_s32(br[0], br[3]); + + al[4] = bl[4]; + ar[4] = br[4]; + + butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6], + &ar[6], &al[5], &ar[5]); + + al[7] = bl[7]; + ar[7] = br[7]; + + al[8] = vaddq_s32(bl[8], bl[11]); + ar[8] = vaddq_s32(br[8], br[11]); + al[9] = vaddq_s32(bl[9], bl[10]); + ar[9] = vaddq_s32(br[9], br[10]); + al[10] = vsubq_s32(bl[9], bl[10]); + ar[10] = vsubq_s32(br[9], br[10]); + al[11] = vsubq_s32(bl[8], bl[11]); + ar[11] = vsubq_s32(br[8], br[11]); + al[12] = vsubq_s32(bl[15], bl[12]); + ar[12] = vsubq_s32(br[15], br[12]); + al[13] = vsubq_s32(bl[14], bl[13]); + ar[13] = vsubq_s32(br[14], br[13]); + al[14] = vaddq_s32(bl[14], bl[13]); + ar[14] = vaddq_s32(br[14], br[13]); + al[15] = vaddq_s32(bl[15], bl[12]); + ar[15] = vaddq_s32(br[15], br[12]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[17] = bl[17]; + ar[17] = br[17]; + + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64, + cospi_24_64, &al[29], &ar[29], &al[18], + &ar[18]); + butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64, + cospi_24_64, &al[28], &ar[28], &al[19], + &ar[19]); + butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], + cospi_24_64, -cospi_8_64, &al[27], &ar[27], + &al[20], &ar[20]); + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_24_64, -cospi_8_64, &al[26], &ar[26], + &al[21], &ar[21]); + + al[22] = bl[22]; + ar[22] = br[22]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[25] = bl[25]; + ar[25] = br[25]; + + al[30] = bl[30]; + ar[30] = br[30]; + al[31] = bl[31]; + ar[31] = br[31]; + + // Stage 5. 
+ butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0], + &br[0], &bl[1], &br[1]); + butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64, + cospi_24_64, &bl[2], &br[2], &bl[3], + &br[3]); + + bl[4] = vaddq_s32(al[4], al[5]); + br[4] = vaddq_s32(ar[4], ar[5]); + bl[5] = vsubq_s32(al[4], al[5]); + br[5] = vsubq_s32(ar[4], ar[5]); + bl[6] = vsubq_s32(al[7], al[6]); + br[6] = vsubq_s32(ar[7], ar[6]); + bl[7] = vaddq_s32(al[7], al[6]); + br[7] = vaddq_s32(ar[7], ar[6]); + + bl[8] = al[8]; + br[8] = ar[8]; + + butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64, + cospi_24_64, &bl[14], &br[14], &bl[9], + &br[9]); + butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10], + cospi_24_64, -cospi_8_64, &bl[13], &br[13], + &bl[10], &br[10]); + + bl[11] = al[11]; + br[11] = ar[11]; + bl[12] = al[12]; + br[12] = ar[12]; + + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(al[19], al[16]); + br[16] = vaddq_s32(ar[19], ar[16]); + bl[17] = vaddq_s32(al[18], al[17]); + br[17] = vaddq_s32(ar[18], ar[17]); + bl[18] = vsubq_s32(al[17], al[18]); + br[18] = vsubq_s32(ar[17], ar[18]); + bl[19] = vsubq_s32(al[16], al[19]); + br[19] = vsubq_s32(ar[16], ar[19]); + bl[20] = vsubq_s32(al[23], al[20]); + br[20] = vsubq_s32(ar[23], ar[20]); + bl[21] = vsubq_s32(al[22], al[21]); + br[21] = vsubq_s32(ar[22], ar[21]); + bl[22] = vaddq_s32(al[21], al[22]); + br[22] = vaddq_s32(ar[21], ar[22]); + bl[23] = vaddq_s32(al[20], al[23]); + br[23] = vaddq_s32(ar[20], ar[23]); + bl[24] = vaddq_s32(al[27], al[24]); + br[24] = vaddq_s32(ar[27], ar[24]); + bl[25] = vaddq_s32(al[26], al[25]); + br[25] = vaddq_s32(ar[26], ar[25]); + bl[26] = vsubq_s32(al[25], al[26]); + br[26] = vsubq_s32(ar[25], ar[26]); + bl[27] = vsubq_s32(al[24], al[27]); + br[27] = vsubq_s32(ar[24], ar[27]); + bl[28] = vsubq_s32(al[31], al[28]); + br[28] = vsubq_s32(ar[31], ar[28]); + bl[29] = vsubq_s32(al[30], al[29]); + br[29] = vsubq_s32(ar[30], ar[29]); + bl[30] = vaddq_s32(al[29], al[30]); + br[30] = vaddq_s32(ar[29], ar[30]); + bl[31] = vaddq_s32(al[28], al[31]); + br[31] = vaddq_s32(ar[28], ar[31]); + + // Stage 6. 
+ al[0] = bl[0]; + ar[0] = br[0]; + al[1] = bl[1]; + ar[1] = br[1]; + al[2] = bl[2]; + ar[2] = br[2]; + al[3] = bl[3]; + ar[3] = br[3]; + + butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64, + cospi_28_64, &al[4], &ar[4], &al[7], + &ar[7]); + butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64, + cospi_12_64, &al[5], &ar[5], &al[6], + &ar[6]); + + al[8] = vaddq_s32(bl[8], bl[9]); + ar[8] = vaddq_s32(br[8], br[9]); + al[9] = vsubq_s32(bl[8], bl[9]); + ar[9] = vsubq_s32(br[8], br[9]); + al[10] = vsubq_s32(bl[11], bl[10]); + ar[10] = vsubq_s32(br[11], br[10]); + al[11] = vaddq_s32(bl[11], bl[10]); + ar[11] = vaddq_s32(br[11], br[10]); + al[12] = vaddq_s32(bl[12], bl[13]); + ar[12] = vaddq_s32(br[12], br[13]); + al[13] = vsubq_s32(bl[12], bl[13]); + ar[13] = vsubq_s32(br[12], br[13]); + al[14] = vsubq_s32(bl[15], bl[14]); + ar[14] = vsubq_s32(br[15], br[14]); + al[15] = vaddq_s32(bl[15], bl[14]); + ar[15] = vaddq_s32(br[15], br[14]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[19] = bl[19]; + ar[19] = br[19]; + al[20] = bl[20]; + ar[20] = br[20]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[27] = bl[27]; + ar[27] = br[27]; + al[28] = bl[28]; + ar[28] = br[28]; + al[31] = bl[31]; + ar[31] = br[31]; + + butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64, + cospi_28_64, &al[30], &ar[30], &al[17], + &ar[17]); + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], + cospi_28_64, -cospi_4_64, &al[29], &ar[29], + &al[18], &ar[18]); + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_20_64, cospi_12_64, &al[26], &ar[26], + &al[21], &ar[21]); + butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22], + cospi_12_64, -cospi_20_64, &al[25], + &ar[25], &al[22], &ar[22]); + + // Stage 7. 
+ bl[0] = al[0]; + br[0] = ar[0]; + bl[1] = al[1]; + br[1] = ar[1]; + bl[2] = al[2]; + br[2] = ar[2]; + bl[3] = al[3]; + br[3] = ar[3]; + bl[4] = al[4]; + br[4] = ar[4]; + bl[5] = al[5]; + br[5] = ar[5]; + bl[6] = al[6]; + br[6] = ar[6]; + bl[7] = al[7]; + br[7] = ar[7]; + + butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64, + cospi_30_64, &bl[8], &br[8], &bl[15], + &br[15]); + butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64, + cospi_14_64, &bl[9], &br[9], &bl[14], + &br[14]); + butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10], + cospi_10_64, cospi_22_64, &bl[10], &br[10], + &bl[13], &br[13]); + butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11], + cospi_26_64, cospi_6_64, &bl[11], &br[11], + &bl[12], &br[12]); + + bl[16] = vaddq_s32(al[16], al[17]); + br[16] = vaddq_s32(ar[16], ar[17]); + bl[17] = vsubq_s32(al[16], al[17]); + br[17] = vsubq_s32(ar[16], ar[17]); + bl[18] = vsubq_s32(al[19], al[18]); + br[18] = vsubq_s32(ar[19], ar[18]); + bl[19] = vaddq_s32(al[19], al[18]); + br[19] = vaddq_s32(ar[19], ar[18]); + bl[20] = vaddq_s32(al[20], al[21]); + br[20] = vaddq_s32(ar[20], ar[21]); + bl[21] = vsubq_s32(al[20], al[21]); + br[21] = vsubq_s32(ar[20], ar[21]); + bl[22] = vsubq_s32(al[23], al[22]); + br[22] = vsubq_s32(ar[23], ar[22]); + bl[23] = vaddq_s32(al[23], al[22]); + br[23] = vaddq_s32(ar[23], ar[22]); + bl[24] = vaddq_s32(al[24], al[25]); + br[24] = vaddq_s32(ar[24], ar[25]); + bl[25] = vsubq_s32(al[24], al[25]); + br[25] = vsubq_s32(ar[24], ar[25]); + bl[26] = vsubq_s32(al[27], al[26]); + br[26] = vsubq_s32(ar[27], ar[26]); + bl[27] = vaddq_s32(al[27], al[26]); + br[27] = vaddq_s32(ar[27], ar[26]); + bl[28] = vaddq_s32(al[28], al[29]); + br[28] = vaddq_s32(ar[28], ar[29]); + bl[29] = vsubq_s32(al[28], al[29]); + br[29] = vsubq_s32(ar[28], ar[29]); + bl[30] = vsubq_s32(al[31], al[30]); + br[30] = vsubq_s32(ar[31], ar[30]); + bl[31] = vaddq_s32(al[31], al[30]); + br[31] = vaddq_s32(ar[31], ar[30]); + + // Final stage. 
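+  // Annotation (not part of the original change): besides applying the last
+  // butterflies, the final stage writes results directly in the bit-reversed
+  // output order of the 32-point DCT: b[0..7] land at rows 0, 16, 8, 24, 4,
+  // 20, 12 and 28, and the remaining outputs are interleaved likewise.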
+ + left[0] = bl[0]; + right[0] = br[0]; + left[16] = bl[1]; + right[16] = br[1]; + left[8] = bl[2]; + right[8] = br[2]; + left[24] = bl[3]; + right[24] = br[3]; + left[4] = bl[4]; + right[4] = br[4]; + left[20] = bl[5]; + right[20] = br[5]; + left[12] = bl[6]; + right[12] = br[6]; + left[28] = bl[7]; + right[28] = br[7]; + left[2] = bl[8]; + right[2] = br[8]; + left[18] = bl[9]; + right[18] = br[9]; + left[10] = bl[10]; + right[10] = br[10]; + left[26] = bl[11]; + right[26] = br[11]; + left[6] = bl[12]; + right[6] = br[12]; + left[22] = bl[13]; + right[22] = br[13]; + left[14] = bl[14]; + right[14] = br[14]; + left[30] = bl[15]; + right[30] = br[15]; + + butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64, + cospi_31_64, &al[1], &ar[1], &al[31], + &ar[31]); + left[1] = al[1]; + right[1] = ar[1]; + left[31] = al[31]; + right[31] = ar[31]; + + butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], + cospi_17_64, cospi_15_64, &al[17], &ar[17], + &al[15], &ar[15]); + left[17] = al[17]; + right[17] = ar[17]; + left[15] = al[15]; + right[15] = ar[15]; + + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64, + cospi_23_64, &al[9], &ar[9], &al[23], + &ar[23]); + left[9] = al[9]; + right[9] = ar[9]; + left[23] = al[23]; + right[23] = ar[23]; + + butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], + cospi_25_64, cospi_7_64, &al[25], &ar[25], + &al[7], &ar[7]); + left[25] = al[25]; + right[25] = ar[25]; + left[7] = al[7]; + right[7] = ar[7]; + + butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64, + cospi_27_64, &al[5], &ar[5], &al[27], + &ar[27]); + left[5] = al[5]; + right[5] = ar[5]; + left[27] = al[27]; + right[27] = ar[27]; + + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_21_64, cospi_11_64, &al[21], &ar[21], + &al[11], &ar[11]); + left[21] = al[21]; + right[21] = ar[21]; + left[11] = al[11]; + right[11] = ar[11]; + + butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22], + cospi_13_64, cospi_19_64, &al[13], &ar[13], + &al[19], &ar[19]); + left[13] = al[13]; + right[13] = ar[13]; + left[19] = al[19]; + right[19] = ar[19]; + + butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23], + cospi_29_64, cospi_3_64, &al[29], &ar[29], + &al[3], &ar[3]); + left[29] = al[29]; + right[29] = ar[29]; + left[3] = al[3]; + right[3] = ar[3]; +} + +static INLINE void highbd_dct8x32_body_second_pass(int32x4_t *left /*32*/, + int32x4_t *right /*32*/) { + int32x4_t al[32], ar[32]; + int32x4_t bl[32], br[32]; + + // Stage 1: Done as part of the load. + + // Stage 2. + // Mini cross. X the first 16 values and the middle 8 of the second half. 
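+  // Annotation (not part of the original change): the "mini cross" is the
+  // length-16 butterfly a[i] = in[i] + in[15 - i] and a[8 + i] =
+  // in[7 - i] - in[8 + i] for i = 0..7; in the second half only the middle
+  // eight values, in[20..27], are rotated by cospi_16_64, while in[16..19]
+  // and in[28..31] pass through unchanged.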
+ al[0] = vaddq_s32(left[0], left[15]); + ar[0] = vaddq_s32(right[0], right[15]); + al[1] = vaddq_s32(left[1], left[14]); + ar[1] = vaddq_s32(right[1], right[14]); + al[2] = vaddq_s32(left[2], left[13]); + ar[2] = vaddq_s32(right[2], right[13]); + al[3] = vaddq_s32(left[3], left[12]); + ar[3] = vaddq_s32(right[3], right[12]); + al[4] = vaddq_s32(left[4], left[11]); + ar[4] = vaddq_s32(right[4], right[11]); + al[5] = vaddq_s32(left[5], left[10]); + ar[5] = vaddq_s32(right[5], right[10]); + al[6] = vaddq_s32(left[6], left[9]); + ar[6] = vaddq_s32(right[6], right[9]); + al[7] = vaddq_s32(left[7], left[8]); + ar[7] = vaddq_s32(right[7], right[8]); + + al[8] = vsubq_s32(left[7], left[8]); + ar[8] = vsubq_s32(right[7], right[8]); + al[9] = vsubq_s32(left[6], left[9]); + ar[9] = vsubq_s32(right[6], right[9]); + al[10] = vsubq_s32(left[5], left[10]); + ar[10] = vsubq_s32(right[5], right[10]); + al[11] = vsubq_s32(left[4], left[11]); + ar[11] = vsubq_s32(right[4], right[11]); + al[12] = vsubq_s32(left[3], left[12]); + ar[12] = vsubq_s32(right[3], right[12]); + al[13] = vsubq_s32(left[2], left[13]); + ar[13] = vsubq_s32(right[2], right[13]); + al[14] = vsubq_s32(left[1], left[14]); + ar[14] = vsubq_s32(right[1], right[14]); + al[15] = vsubq_s32(left[0], left[15]); + ar[15] = vsubq_s32(right[0], right[15]); + + al[16] = left[16]; + ar[16] = right[16]; + al[17] = left[17]; + ar[17] = right[17]; + al[18] = left[18]; + ar[18] = right[18]; + al[19] = left[19]; + ar[19] = right[19]; + + butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20], + cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]); + butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21], + cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]); + butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22], + cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]); + butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23], + cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]); + + al[28] = left[28]; + ar[28] = right[28]; + al[29] = left[29]; + ar[29] = right[29]; + al[30] = left[30]; + ar[30] = right[30]; + al[31] = left[31]; + ar[31] = right[31]; + + // Stage 3. 
+ bl[0] = vaddq_s32(al[0], al[7]); + br[0] = vaddq_s32(ar[0], ar[7]); + bl[1] = vaddq_s32(al[1], al[6]); + br[1] = vaddq_s32(ar[1], ar[6]); + bl[2] = vaddq_s32(al[2], al[5]); + br[2] = vaddq_s32(ar[2], ar[5]); + bl[3] = vaddq_s32(al[3], al[4]); + br[3] = vaddq_s32(ar[3], ar[4]); + + bl[4] = vsubq_s32(al[3], al[4]); + br[4] = vsubq_s32(ar[3], ar[4]); + bl[5] = vsubq_s32(al[2], al[5]); + br[5] = vsubq_s32(ar[2], ar[5]); + bl[6] = vsubq_s32(al[1], al[6]); + br[6] = vsubq_s32(ar[1], ar[6]); + bl[7] = vsubq_s32(al[0], al[7]); + br[7] = vsubq_s32(ar[0], ar[7]); + + bl[8] = al[8]; + br[8] = ar[8]; + bl[9] = al[9]; + br[9] = ar[9]; + + butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64, + &bl[13], &br[13], &bl[10], &br[10]); + butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64, + &bl[12], &br[12], &bl[11], &br[11]); + + bl[14] = al[14]; + br[14] = ar[14]; + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(left[16], al[23]); + br[16] = vaddq_s32(right[16], ar[23]); + bl[17] = vaddq_s32(left[17], al[22]); + br[17] = vaddq_s32(right[17], ar[22]); + bl[18] = vaddq_s32(left[18], al[21]); + br[18] = vaddq_s32(right[18], ar[21]); + bl[19] = vaddq_s32(left[19], al[20]); + br[19] = vaddq_s32(right[19], ar[20]); + + bl[20] = vsubq_s32(left[19], al[20]); + br[20] = vsubq_s32(right[19], ar[20]); + bl[21] = vsubq_s32(left[18], al[21]); + br[21] = vsubq_s32(right[18], ar[21]); + bl[22] = vsubq_s32(left[17], al[22]); + br[22] = vsubq_s32(right[17], ar[22]); + bl[23] = vsubq_s32(left[16], al[23]); + br[23] = vsubq_s32(right[16], ar[23]); + + bl[24] = vsubq_s32(left[31], al[24]); + br[24] = vsubq_s32(right[31], ar[24]); + bl[25] = vsubq_s32(left[30], al[25]); + br[25] = vsubq_s32(right[30], ar[25]); + bl[26] = vsubq_s32(left[29], al[26]); + br[26] = vsubq_s32(right[29], ar[26]); + bl[27] = vsubq_s32(left[28], al[27]); + br[27] = vsubq_s32(right[28], ar[27]); + + bl[28] = vaddq_s32(left[28], al[27]); + br[28] = vaddq_s32(right[28], ar[27]); + bl[29] = vaddq_s32(left[29], al[26]); + br[29] = vaddq_s32(right[29], ar[26]); + bl[30] = vaddq_s32(left[30], al[25]); + br[30] = vaddq_s32(right[30], ar[25]); + bl[31] = vaddq_s32(left[31], al[24]); + br[31] = vaddq_s32(right[31], ar[24]); + + // Stage 4. 
+ al[0] = vaddq_s32(bl[0], bl[3]); + ar[0] = vaddq_s32(br[0], br[3]); + al[1] = vaddq_s32(bl[1], bl[2]); + ar[1] = vaddq_s32(br[1], br[2]); + al[2] = vsubq_s32(bl[1], bl[2]); + ar[2] = vsubq_s32(br[1], br[2]); + al[3] = vsubq_s32(bl[0], bl[3]); + ar[3] = vsubq_s32(br[0], br[3]); + + al[4] = bl[4]; + ar[4] = br[4]; + + butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6], + &ar[6], &al[5], &ar[5]); + + al[7] = bl[7]; + ar[7] = br[7]; + + al[8] = vaddq_s32(bl[8], bl[11]); + ar[8] = vaddq_s32(br[8], br[11]); + al[9] = vaddq_s32(bl[9], bl[10]); + ar[9] = vaddq_s32(br[9], br[10]); + al[10] = vsubq_s32(bl[9], bl[10]); + ar[10] = vsubq_s32(br[9], br[10]); + al[11] = vsubq_s32(bl[8], bl[11]); + ar[11] = vsubq_s32(br[8], br[11]); + al[12] = vsubq_s32(bl[15], bl[12]); + ar[12] = vsubq_s32(br[15], br[12]); + al[13] = vsubq_s32(bl[14], bl[13]); + ar[13] = vsubq_s32(br[14], br[13]); + al[14] = vaddq_s32(bl[14], bl[13]); + ar[14] = vaddq_s32(br[14], br[13]); + al[15] = vaddq_s32(bl[15], bl[12]); + ar[15] = vaddq_s32(br[15], br[12]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[17] = bl[17]; + ar[17] = br[17]; + + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64, + cospi_24_64, &al[29], &ar[29], &al[18], + &ar[18]); + butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64, + cospi_24_64, &al[28], &ar[28], &al[19], + &ar[19]); + butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], + cospi_24_64, -cospi_8_64, &al[27], &ar[27], + &al[20], &ar[20]); + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_24_64, -cospi_8_64, &al[26], &ar[26], + &al[21], &ar[21]); + + al[22] = bl[22]; + ar[22] = br[22]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[25] = bl[25]; + ar[25] = br[25]; + + al[30] = bl[30]; + ar[30] = br[30]; + al[31] = bl[31]; + ar[31] = br[31]; + + // Stage 5. 
+ butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0], + &br[0], &bl[1], &br[1]); + butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64, + cospi_24_64, &bl[2], &br[2], &bl[3], + &br[3]); + + bl[4] = vaddq_s32(al[4], al[5]); + br[4] = vaddq_s32(ar[4], ar[5]); + bl[5] = vsubq_s32(al[4], al[5]); + br[5] = vsubq_s32(ar[4], ar[5]); + bl[6] = vsubq_s32(al[7], al[6]); + br[6] = vsubq_s32(ar[7], ar[6]); + bl[7] = vaddq_s32(al[7], al[6]); + br[7] = vaddq_s32(ar[7], ar[6]); + + bl[8] = al[8]; + br[8] = ar[8]; + + butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64, + cospi_24_64, &bl[14], &br[14], &bl[9], + &br[9]); + butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10], + cospi_24_64, -cospi_8_64, &bl[13], &br[13], + &bl[10], &br[10]); + + bl[11] = al[11]; + br[11] = ar[11]; + bl[12] = al[12]; + br[12] = ar[12]; + + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(al[19], al[16]); + br[16] = vaddq_s32(ar[19], ar[16]); + bl[17] = vaddq_s32(al[18], al[17]); + br[17] = vaddq_s32(ar[18], ar[17]); + bl[18] = vsubq_s32(al[17], al[18]); + br[18] = vsubq_s32(ar[17], ar[18]); + bl[19] = vsubq_s32(al[16], al[19]); + br[19] = vsubq_s32(ar[16], ar[19]); + bl[20] = vsubq_s32(al[23], al[20]); + br[20] = vsubq_s32(ar[23], ar[20]); + bl[21] = vsubq_s32(al[22], al[21]); + br[21] = vsubq_s32(ar[22], ar[21]); + bl[22] = vaddq_s32(al[21], al[22]); + br[22] = vaddq_s32(ar[21], ar[22]); + bl[23] = vaddq_s32(al[20], al[23]); + br[23] = vaddq_s32(ar[20], ar[23]); + bl[24] = vaddq_s32(al[27], al[24]); + br[24] = vaddq_s32(ar[27], ar[24]); + bl[25] = vaddq_s32(al[26], al[25]); + br[25] = vaddq_s32(ar[26], ar[25]); + bl[26] = vsubq_s32(al[25], al[26]); + br[26] = vsubq_s32(ar[25], ar[26]); + bl[27] = vsubq_s32(al[24], al[27]); + br[27] = vsubq_s32(ar[24], ar[27]); + bl[28] = vsubq_s32(al[31], al[28]); + br[28] = vsubq_s32(ar[31], ar[28]); + bl[29] = vsubq_s32(al[30], al[29]); + br[29] = vsubq_s32(ar[30], ar[29]); + bl[30] = vaddq_s32(al[29], al[30]); + br[30] = vaddq_s32(ar[29], ar[30]); + bl[31] = vaddq_s32(al[28], al[31]); + br[31] = vaddq_s32(ar[28], ar[31]); + + // Stage 6. 
+ al[0] = bl[0]; + ar[0] = br[0]; + al[1] = bl[1]; + ar[1] = br[1]; + al[2] = bl[2]; + ar[2] = br[2]; + al[3] = bl[3]; + ar[3] = br[3]; + + butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64, + cospi_28_64, &al[4], &ar[4], &al[7], + &ar[7]); + butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64, + cospi_12_64, &al[5], &ar[5], &al[6], + &ar[6]); + + al[8] = vaddq_s32(bl[8], bl[9]); + ar[8] = vaddq_s32(br[8], br[9]); + al[9] = vsubq_s32(bl[8], bl[9]); + ar[9] = vsubq_s32(br[8], br[9]); + al[10] = vsubq_s32(bl[11], bl[10]); + ar[10] = vsubq_s32(br[11], br[10]); + al[11] = vaddq_s32(bl[11], bl[10]); + ar[11] = vaddq_s32(br[11], br[10]); + al[12] = vaddq_s32(bl[12], bl[13]); + ar[12] = vaddq_s32(br[12], br[13]); + al[13] = vsubq_s32(bl[12], bl[13]); + ar[13] = vsubq_s32(br[12], br[13]); + al[14] = vsubq_s32(bl[15], bl[14]); + ar[14] = vsubq_s32(br[15], br[14]); + al[15] = vaddq_s32(bl[15], bl[14]); + ar[15] = vaddq_s32(br[15], br[14]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[19] = bl[19]; + ar[19] = br[19]; + al[20] = bl[20]; + ar[20] = br[20]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[27] = bl[27]; + ar[27] = br[27]; + al[28] = bl[28]; + ar[28] = br[28]; + al[31] = bl[31]; + ar[31] = br[31]; + + butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64, + cospi_28_64, &al[30], &ar[30], &al[17], + &ar[17]); + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], + cospi_28_64, -cospi_4_64, &al[29], &ar[29], + &al[18], &ar[18]); + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_20_64, cospi_12_64, &al[26], &ar[26], + &al[21], &ar[21]); + butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22], + cospi_12_64, -cospi_20_64, &al[25], + &ar[25], &al[22], &ar[22]); + + // Stage 7. 
+ bl[0] = al[0]; + br[0] = ar[0]; + bl[1] = al[1]; + br[1] = ar[1]; + bl[2] = al[2]; + br[2] = ar[2]; + bl[3] = al[3]; + br[3] = ar[3]; + bl[4] = al[4]; + br[4] = ar[4]; + bl[5] = al[5]; + br[5] = ar[5]; + bl[6] = al[6]; + br[6] = ar[6]; + bl[7] = al[7]; + br[7] = ar[7]; + + butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64, + cospi_30_64, &bl[8], &br[8], &bl[15], + &br[15]); + butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64, + cospi_14_64, &bl[9], &br[9], &bl[14], + &br[14]); + butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10], + cospi_10_64, cospi_22_64, &bl[10], &br[10], + &bl[13], &br[13]); + butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11], + cospi_26_64, cospi_6_64, &bl[11], &br[11], + &bl[12], &br[12]); + + bl[16] = vaddq_s32(al[16], al[17]); + br[16] = vaddq_s32(ar[16], ar[17]); + bl[17] = vsubq_s32(al[16], al[17]); + br[17] = vsubq_s32(ar[16], ar[17]); + bl[18] = vsubq_s32(al[19], al[18]); + br[18] = vsubq_s32(ar[19], ar[18]); + bl[19] = vaddq_s32(al[19], al[18]); + br[19] = vaddq_s32(ar[19], ar[18]); + bl[20] = vaddq_s32(al[20], al[21]); + br[20] = vaddq_s32(ar[20], ar[21]); + bl[21] = vsubq_s32(al[20], al[21]); + br[21] = vsubq_s32(ar[20], ar[21]); + bl[22] = vsubq_s32(al[23], al[22]); + br[22] = vsubq_s32(ar[23], ar[22]); + bl[23] = vaddq_s32(al[23], al[22]); + br[23] = vaddq_s32(ar[23], ar[22]); + bl[24] = vaddq_s32(al[24], al[25]); + br[24] = vaddq_s32(ar[24], ar[25]); + bl[25] = vsubq_s32(al[24], al[25]); + br[25] = vsubq_s32(ar[24], ar[25]); + bl[26] = vsubq_s32(al[27], al[26]); + br[26] = vsubq_s32(ar[27], ar[26]); + bl[27] = vaddq_s32(al[27], al[26]); + br[27] = vaddq_s32(ar[27], ar[26]); + bl[28] = vaddq_s32(al[28], al[29]); + br[28] = vaddq_s32(ar[28], ar[29]); + bl[29] = vsubq_s32(al[28], al[29]); + br[29] = vsubq_s32(ar[28], ar[29]); + bl[30] = vsubq_s32(al[31], al[30]); + br[30] = vsubq_s32(ar[31], ar[30]); + bl[31] = vaddq_s32(al[31], al[30]); + br[31] = vaddq_s32(ar[31], ar[30]); + + // Final stage. 
+ + left[0] = bl[0]; + right[0] = br[0]; + left[16] = bl[1]; + right[16] = br[1]; + left[8] = bl[2]; + right[8] = br[2]; + left[24] = bl[3]; + right[24] = br[3]; + left[4] = bl[4]; + right[4] = br[4]; + left[20] = bl[5]; + right[20] = br[5]; + left[12] = bl[6]; + right[12] = br[6]; + left[28] = bl[7]; + right[28] = br[7]; + left[2] = bl[8]; + right[2] = br[8]; + left[18] = bl[9]; + right[18] = br[9]; + left[10] = bl[10]; + right[10] = br[10]; + left[26] = bl[11]; + right[26] = br[11]; + left[6] = bl[12]; + right[6] = br[12]; + left[22] = bl[13]; + right[22] = br[13]; + left[14] = bl[14]; + right[14] = br[14]; + left[30] = bl[15]; + right[30] = br[15]; + + butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64, + cospi_31_64, &al[1], &ar[1], &al[31], + &ar[31]); + left[1] = al[1]; + right[1] = ar[1]; + left[31] = al[31]; + right[31] = ar[31]; + + butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], + cospi_17_64, cospi_15_64, &al[17], &ar[17], + &al[15], &ar[15]); + left[17] = al[17]; + right[17] = ar[17]; + left[15] = al[15]; + right[15] = ar[15]; + + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64, + cospi_23_64, &al[9], &ar[9], &al[23], + &ar[23]); + left[9] = al[9]; + right[9] = ar[9]; + left[23] = al[23]; + right[23] = ar[23]; + + butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], + cospi_25_64, cospi_7_64, &al[25], &ar[25], + &al[7], &ar[7]); + left[25] = al[25]; + right[25] = ar[25]; + left[7] = al[7]; + right[7] = ar[7]; + + butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64, + cospi_27_64, &al[5], &ar[5], &al[27], + &ar[27]); + left[5] = al[5]; + right[5] = ar[5]; + left[27] = al[27]; + right[27] = ar[27]; + + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_21_64, cospi_11_64, &al[21], &ar[21], + &al[11], &ar[11]); + left[21] = al[21]; + right[21] = ar[21]; + left[11] = al[11]; + right[11] = ar[11]; + + butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22], + cospi_13_64, cospi_19_64, &al[13], &ar[13], + &al[19], &ar[19]); + left[13] = al[13]; + right[13] = ar[13]; + left[19] = al[19]; + right[19] = ar[19]; + + butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23], + cospi_29_64, cospi_3_64, &al[29], &ar[29], + &al[3], &ar[3]); + left[29] = al[29]; + right[29] = ar[29]; + left[3] = al[3]; + right[3] = ar[3]; +} + +static INLINE void highbd_dct8x32_body_second_pass_rd(int32x4_t *left /*32*/, + int32x4_t *right /*32*/) { + int32x4_t al[32], ar[32]; + int32x4_t bl[32], br[32]; + + // Stage 1: Done as part of the load. + + // Stage 2. + // For the "rd" version, all the values are rounded down after stage 2 to keep + // the values in 16 bits. 
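+  // Annotation (assumed from add_round_shift_s32()'s definition elsewhere in
+  // this header, not part of the original change): each lane below
+  // conceptually performs a rounded shift right by two, so subsequent stages
+  // operate on roughly 16-bit magnitudes and the plain 32-bit butterfly
+  // helpers used later in this function stay overflow-free.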
+ al[0] = add_round_shift_s32(vaddq_s32(left[0], left[15])); + ar[0] = add_round_shift_s32(vaddq_s32(right[0], right[15])); + al[1] = add_round_shift_s32(vaddq_s32(left[1], left[14])); + ar[1] = add_round_shift_s32(vaddq_s32(right[1], right[14])); + al[2] = add_round_shift_s32(vaddq_s32(left[2], left[13])); + ar[2] = add_round_shift_s32(vaddq_s32(right[2], right[13])); + al[3] = add_round_shift_s32(vaddq_s32(left[3], left[12])); + ar[3] = add_round_shift_s32(vaddq_s32(right[3], right[12])); + al[4] = add_round_shift_s32(vaddq_s32(left[4], left[11])); + ar[4] = add_round_shift_s32(vaddq_s32(right[4], right[11])); + al[5] = add_round_shift_s32(vaddq_s32(left[5], left[10])); + ar[5] = add_round_shift_s32(vaddq_s32(right[5], right[10])); + al[6] = add_round_shift_s32(vaddq_s32(left[6], left[9])); + ar[6] = add_round_shift_s32(vaddq_s32(right[6], right[9])); + al[7] = add_round_shift_s32(vaddq_s32(left[7], left[8])); + ar[7] = add_round_shift_s32(vaddq_s32(right[7], right[8])); + + al[8] = add_round_shift_s32(vsubq_s32(left[7], left[8])); + ar[8] = add_round_shift_s32(vsubq_s32(right[7], right[8])); + al[9] = add_round_shift_s32(vsubq_s32(left[6], left[9])); + ar[9] = add_round_shift_s32(vsubq_s32(right[6], right[9])); + al[10] = add_round_shift_s32(vsubq_s32(left[5], left[10])); + ar[10] = add_round_shift_s32(vsubq_s32(right[5], right[10])); + al[11] = add_round_shift_s32(vsubq_s32(left[4], left[11])); + ar[11] = add_round_shift_s32(vsubq_s32(right[4], right[11])); + al[12] = add_round_shift_s32(vsubq_s32(left[3], left[12])); + ar[12] = add_round_shift_s32(vsubq_s32(right[3], right[12])); + al[13] = add_round_shift_s32(vsubq_s32(left[2], left[13])); + ar[13] = add_round_shift_s32(vsubq_s32(right[2], right[13])); + al[14] = add_round_shift_s32(vsubq_s32(left[1], left[14])); + ar[14] = add_round_shift_s32(vsubq_s32(right[1], right[14])); + al[15] = add_round_shift_s32(vsubq_s32(left[0], left[15])); + ar[15] = add_round_shift_s32(vsubq_s32(right[0], right[15])); + + al[16] = add_round_shift_s32(left[16]); + ar[16] = add_round_shift_s32(right[16]); + al[17] = add_round_shift_s32(left[17]); + ar[17] = add_round_shift_s32(right[17]); + al[18] = add_round_shift_s32(left[18]); + ar[18] = add_round_shift_s32(right[18]); + al[19] = add_round_shift_s32(left[19]); + ar[19] = add_round_shift_s32(right[19]); + + butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20], + cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]); + butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21], + cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]); + butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22], + cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]); + butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23], + cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]); + + al[20] = add_round_shift_s32(al[20]); + ar[20] = add_round_shift_s32(ar[20]); + al[21] = add_round_shift_s32(al[21]); + ar[21] = add_round_shift_s32(ar[21]); + al[22] = add_round_shift_s32(al[22]); + ar[22] = add_round_shift_s32(ar[22]); + al[23] = add_round_shift_s32(al[23]); + ar[23] = add_round_shift_s32(ar[23]); + al[24] = add_round_shift_s32(al[24]); + ar[24] = add_round_shift_s32(ar[24]); + al[25] = add_round_shift_s32(al[25]); + ar[25] = add_round_shift_s32(ar[25]); + al[26] = add_round_shift_s32(al[26]); + ar[26] = add_round_shift_s32(ar[26]); + al[27] = add_round_shift_s32(al[27]); + ar[27] = add_round_shift_s32(ar[27]); + + al[28] = add_round_shift_s32(left[28]); + ar[28] = 
add_round_shift_s32(right[28]); + al[29] = add_round_shift_s32(left[29]); + ar[29] = add_round_shift_s32(right[29]); + al[30] = add_round_shift_s32(left[30]); + ar[30] = add_round_shift_s32(right[30]); + al[31] = add_round_shift_s32(left[31]); + ar[31] = add_round_shift_s32(right[31]); + + // Stage 3. + bl[0] = vaddq_s32(al[0], al[7]); + br[0] = vaddq_s32(ar[0], ar[7]); + bl[1] = vaddq_s32(al[1], al[6]); + br[1] = vaddq_s32(ar[1], ar[6]); + bl[2] = vaddq_s32(al[2], al[5]); + br[2] = vaddq_s32(ar[2], ar[5]); + bl[3] = vaddq_s32(al[3], al[4]); + br[3] = vaddq_s32(ar[3], ar[4]); + + bl[4] = vsubq_s32(al[3], al[4]); + br[4] = vsubq_s32(ar[3], ar[4]); + bl[5] = vsubq_s32(al[2], al[5]); + br[5] = vsubq_s32(ar[2], ar[5]); + bl[6] = vsubq_s32(al[1], al[6]); + br[6] = vsubq_s32(ar[1], ar[6]); + bl[7] = vsubq_s32(al[0], al[7]); + br[7] = vsubq_s32(ar[0], ar[7]); + + bl[8] = al[8]; + br[8] = ar[8]; + bl[9] = al[9]; + br[9] = ar[9]; + + butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64, + &bl[13], &br[13], &bl[10], &br[10]); + butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64, + &bl[12], &br[12], &bl[11], &br[11]); + + bl[14] = al[14]; + br[14] = ar[14]; + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(al[16], al[23]); + br[16] = vaddq_s32(ar[16], ar[23]); + bl[17] = vaddq_s32(al[17], al[22]); + br[17] = vaddq_s32(ar[17], ar[22]); + bl[18] = vaddq_s32(al[18], al[21]); + br[18] = vaddq_s32(ar[18], ar[21]); + bl[19] = vaddq_s32(al[19], al[20]); + br[19] = vaddq_s32(ar[19], ar[20]); + + bl[20] = vsubq_s32(al[19], al[20]); + br[20] = vsubq_s32(ar[19], ar[20]); + bl[21] = vsubq_s32(al[18], al[21]); + br[21] = vsubq_s32(ar[18], ar[21]); + bl[22] = vsubq_s32(al[17], al[22]); + br[22] = vsubq_s32(ar[17], ar[22]); + bl[23] = vsubq_s32(al[16], al[23]); + br[23] = vsubq_s32(ar[16], ar[23]); + + bl[24] = vsubq_s32(al[31], al[24]); + br[24] = vsubq_s32(ar[31], ar[24]); + bl[25] = vsubq_s32(al[30], al[25]); + br[25] = vsubq_s32(ar[30], ar[25]); + bl[26] = vsubq_s32(al[29], al[26]); + br[26] = vsubq_s32(ar[29], ar[26]); + bl[27] = vsubq_s32(al[28], al[27]); + br[27] = vsubq_s32(ar[28], ar[27]); + + bl[28] = vaddq_s32(al[28], al[27]); + br[28] = vaddq_s32(ar[28], ar[27]); + bl[29] = vaddq_s32(al[29], al[26]); + br[29] = vaddq_s32(ar[29], ar[26]); + bl[30] = vaddq_s32(al[30], al[25]); + br[30] = vaddq_s32(ar[30], ar[25]); + bl[31] = vaddq_s32(al[31], al[24]); + br[31] = vaddq_s32(ar[31], ar[24]); + + // Stage 4. 
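+  // Annotation (an observation, not part of the original change): unlike the
+  // non-rd bodies above, stage 4 onward can use butterfly_two_coeff_s32()
+  // with 32-bit accumulators, since the stage-2 rounding already shrank the
+  // intermediate values.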
+ al[0] = vaddq_s32(bl[0], bl[3]); + ar[0] = vaddq_s32(br[0], br[3]); + al[1] = vaddq_s32(bl[1], bl[2]); + ar[1] = vaddq_s32(br[1], br[2]); + al[2] = vsubq_s32(bl[1], bl[2]); + ar[2] = vsubq_s32(br[1], br[2]); + al[3] = vsubq_s32(bl[0], bl[3]); + ar[3] = vsubq_s32(br[0], br[3]); + + al[4] = bl[4]; + ar[4] = br[4]; + + butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6], + &ar[6], &al[5], &ar[5]); + + al[7] = bl[7]; + ar[7] = br[7]; + + al[8] = vaddq_s32(bl[8], bl[11]); + ar[8] = vaddq_s32(br[8], br[11]); + al[9] = vaddq_s32(bl[9], bl[10]); + ar[9] = vaddq_s32(br[9], br[10]); + al[10] = vsubq_s32(bl[9], bl[10]); + ar[10] = vsubq_s32(br[9], br[10]); + al[11] = vsubq_s32(bl[8], bl[11]); + ar[11] = vsubq_s32(br[8], br[11]); + al[12] = vsubq_s32(bl[15], bl[12]); + ar[12] = vsubq_s32(br[15], br[12]); + al[13] = vsubq_s32(bl[14], bl[13]); + ar[13] = vsubq_s32(br[14], br[13]); + al[14] = vaddq_s32(bl[14], bl[13]); + ar[14] = vaddq_s32(br[14], br[13]); + al[15] = vaddq_s32(bl[15], bl[12]); + ar[15] = vaddq_s32(br[15], br[12]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[17] = bl[17]; + ar[17] = br[17]; + + butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_8_64, + cospi_24_64, &al[29], &ar[29], &al[18], &ar[18]); + butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_8_64, + cospi_24_64, &al[28], &ar[28], &al[19], &ar[19]); + butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_24_64, + -cospi_8_64, &al[27], &ar[27], &al[20], &ar[20]); + butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_24_64, + -cospi_8_64, &al[26], &ar[26], &al[21], &ar[21]); + + al[22] = bl[22]; + ar[22] = br[22]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[25] = bl[25]; + ar[25] = br[25]; + + al[30] = bl[30]; + ar[30] = br[30]; + al[31] = bl[31]; + ar[31] = br[31]; + + // Stage 5. 
+ butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0], + &br[0], &bl[1], &br[1]); + butterfly_two_coeff_s32(al[3], ar[3], al[2], ar[2], cospi_8_64, cospi_24_64, + &bl[2], &br[2], &bl[3], &br[3]); + + bl[4] = vaddq_s32(al[4], al[5]); + br[4] = vaddq_s32(ar[4], ar[5]); + bl[5] = vsubq_s32(al[4], al[5]); + br[5] = vsubq_s32(ar[4], ar[5]); + bl[6] = vsubq_s32(al[7], al[6]); + br[6] = vsubq_s32(ar[7], ar[6]); + bl[7] = vaddq_s32(al[7], al[6]); + br[7] = vaddq_s32(ar[7], ar[6]); + + bl[8] = al[8]; + br[8] = ar[8]; + + butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_8_64, cospi_24_64, + &bl[14], &br[14], &bl[9], &br[9]); + butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_24_64, + -cospi_8_64, &bl[13], &br[13], &bl[10], &br[10]); + + bl[11] = al[11]; + br[11] = ar[11]; + bl[12] = al[12]; + br[12] = ar[12]; + + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(al[19], al[16]); + br[16] = vaddq_s32(ar[19], ar[16]); + bl[17] = vaddq_s32(al[18], al[17]); + br[17] = vaddq_s32(ar[18], ar[17]); + bl[18] = vsubq_s32(al[17], al[18]); + br[18] = vsubq_s32(ar[17], ar[18]); + bl[19] = vsubq_s32(al[16], al[19]); + br[19] = vsubq_s32(ar[16], ar[19]); + bl[20] = vsubq_s32(al[23], al[20]); + br[20] = vsubq_s32(ar[23], ar[20]); + bl[21] = vsubq_s32(al[22], al[21]); + br[21] = vsubq_s32(ar[22], ar[21]); + bl[22] = vaddq_s32(al[21], al[22]); + br[22] = vaddq_s32(ar[21], ar[22]); + bl[23] = vaddq_s32(al[20], al[23]); + br[23] = vaddq_s32(ar[20], ar[23]); + bl[24] = vaddq_s32(al[27], al[24]); + br[24] = vaddq_s32(ar[27], ar[24]); + bl[25] = vaddq_s32(al[26], al[25]); + br[25] = vaddq_s32(ar[26], ar[25]); + bl[26] = vsubq_s32(al[25], al[26]); + br[26] = vsubq_s32(ar[25], ar[26]); + bl[27] = vsubq_s32(al[24], al[27]); + br[27] = vsubq_s32(ar[24], ar[27]); + bl[28] = vsubq_s32(al[31], al[28]); + br[28] = vsubq_s32(ar[31], ar[28]); + bl[29] = vsubq_s32(al[30], al[29]); + br[29] = vsubq_s32(ar[30], ar[29]); + bl[30] = vaddq_s32(al[29], al[30]); + br[30] = vaddq_s32(ar[29], ar[30]); + bl[31] = vaddq_s32(al[28], al[31]); + br[31] = vaddq_s32(ar[28], ar[31]); + + // Stage 6. 
+ al[0] = bl[0]; + ar[0] = br[0]; + al[1] = bl[1]; + ar[1] = br[1]; + al[2] = bl[2]; + ar[2] = br[2]; + al[3] = bl[3]; + ar[3] = br[3]; + + butterfly_two_coeff_s32(bl[7], br[7], bl[4], br[4], cospi_4_64, cospi_28_64, + &al[4], &ar[4], &al[7], &ar[7]); + butterfly_two_coeff_s32(bl[6], br[6], bl[5], br[5], cospi_20_64, cospi_12_64, + &al[5], &ar[5], &al[6], &ar[6]); + + al[8] = vaddq_s32(bl[8], bl[9]); + ar[8] = vaddq_s32(br[8], br[9]); + al[9] = vsubq_s32(bl[8], bl[9]); + ar[9] = vsubq_s32(br[8], br[9]); + al[10] = vsubq_s32(bl[11], bl[10]); + ar[10] = vsubq_s32(br[11], br[10]); + al[11] = vaddq_s32(bl[11], bl[10]); + ar[11] = vaddq_s32(br[11], br[10]); + al[12] = vaddq_s32(bl[12], bl[13]); + ar[12] = vaddq_s32(br[12], br[13]); + al[13] = vsubq_s32(bl[12], bl[13]); + ar[13] = vsubq_s32(br[12], br[13]); + al[14] = vsubq_s32(bl[15], bl[14]); + ar[14] = vsubq_s32(br[15], br[14]); + al[15] = vaddq_s32(bl[15], bl[14]); + ar[15] = vaddq_s32(br[15], br[14]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[19] = bl[19]; + ar[19] = br[19]; + al[20] = bl[20]; + ar[20] = br[20]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[27] = bl[27]; + ar[27] = br[27]; + al[28] = bl[28]; + ar[28] = br[28]; + al[31] = bl[31]; + ar[31] = br[31]; + + butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_4_64, + cospi_28_64, &al[30], &ar[30], &al[17], &ar[17]); + butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_28_64, + -cospi_4_64, &al[29], &ar[29], &al[18], &ar[18]); + butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_20_64, + cospi_12_64, &al[26], &ar[26], &al[21], &ar[21]); + butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_12_64, + -cospi_20_64, &al[25], &ar[25], &al[22], &ar[22]); + + // Stage 7. 
+ bl[0] = al[0]; + br[0] = ar[0]; + bl[1] = al[1]; + br[1] = ar[1]; + bl[2] = al[2]; + br[2] = ar[2]; + bl[3] = al[3]; + br[3] = ar[3]; + bl[4] = al[4]; + br[4] = ar[4]; + bl[5] = al[5]; + br[5] = ar[5]; + bl[6] = al[6]; + br[6] = ar[6]; + bl[7] = al[7]; + br[7] = ar[7]; + + butterfly_two_coeff_s32(al[15], ar[15], al[8], ar[8], cospi_2_64, cospi_30_64, + &bl[8], &br[8], &bl[15], &br[15]); + butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_18_64, + cospi_14_64, &bl[9], &br[9], &bl[14], &br[14]); + butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_10_64, + cospi_22_64, &bl[10], &br[10], &bl[13], &br[13]); + butterfly_two_coeff_s32(al[12], ar[12], al[11], ar[11], cospi_26_64, + cospi_6_64, &bl[11], &br[11], &bl[12], &br[12]); + + bl[16] = vaddq_s32(al[16], al[17]); + br[16] = vaddq_s32(ar[16], ar[17]); + bl[17] = vsubq_s32(al[16], al[17]); + br[17] = vsubq_s32(ar[16], ar[17]); + bl[18] = vsubq_s32(al[19], al[18]); + br[18] = vsubq_s32(ar[19], ar[18]); + bl[19] = vaddq_s32(al[19], al[18]); + br[19] = vaddq_s32(ar[19], ar[18]); + bl[20] = vaddq_s32(al[20], al[21]); + br[20] = vaddq_s32(ar[20], ar[21]); + bl[21] = vsubq_s32(al[20], al[21]); + br[21] = vsubq_s32(ar[20], ar[21]); + bl[22] = vsubq_s32(al[23], al[22]); + br[22] = vsubq_s32(ar[23], ar[22]); + bl[23] = vaddq_s32(al[23], al[22]); + br[23] = vaddq_s32(ar[23], ar[22]); + bl[24] = vaddq_s32(al[24], al[25]); + br[24] = vaddq_s32(ar[24], ar[25]); + bl[25] = vsubq_s32(al[24], al[25]); + br[25] = vsubq_s32(ar[24], ar[25]); + bl[26] = vsubq_s32(al[27], al[26]); + br[26] = vsubq_s32(ar[27], ar[26]); + bl[27] = vaddq_s32(al[27], al[26]); + br[27] = vaddq_s32(ar[27], ar[26]); + bl[28] = vaddq_s32(al[28], al[29]); + br[28] = vaddq_s32(ar[28], ar[29]); + bl[29] = vsubq_s32(al[28], al[29]); + br[29] = vsubq_s32(ar[28], ar[29]); + bl[30] = vsubq_s32(al[31], al[30]); + br[30] = vsubq_s32(ar[31], ar[30]); + bl[31] = vaddq_s32(al[31], al[30]); + br[31] = vaddq_s32(ar[31], ar[30]); + + // Final stage. 
+ left[0] = bl[0]; + right[0] = br[0]; + left[16] = bl[1]; + right[16] = br[1]; + left[8] = bl[2]; + right[8] = br[2]; + left[24] = bl[3]; + right[24] = br[3]; + left[4] = bl[4]; + right[4] = br[4]; + left[20] = bl[5]; + right[20] = br[5]; + left[12] = bl[6]; + right[12] = br[6]; + left[28] = bl[7]; + right[28] = br[7]; + left[2] = bl[8]; + right[2] = br[8]; + left[18] = bl[9]; + right[18] = br[9]; + left[10] = bl[10]; + right[10] = br[10]; + left[26] = bl[11]; + right[26] = br[11]; + left[6] = bl[12]; + right[6] = br[12]; + left[22] = bl[13]; + right[22] = br[13]; + left[14] = bl[14]; + right[14] = br[14]; + left[30] = bl[15]; + right[30] = br[15]; + + butterfly_two_coeff_s32(bl[31], br[31], bl[16], br[16], cospi_1_64, + cospi_31_64, &al[1], &ar[1], &al[31], &ar[31]); + left[1] = al[1]; + right[1] = ar[1]; + left[31] = al[31]; + right[31] = ar[31]; + + butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_17_64, + cospi_15_64, &al[17], &ar[17], &al[15], &ar[15]); + left[17] = al[17]; + right[17] = ar[17]; + left[15] = al[15]; + right[15] = ar[15]; + + butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_9_64, + cospi_23_64, &al[9], &ar[9], &al[23], &ar[23]); + left[9] = al[9]; + right[9] = ar[9]; + left[23] = al[23]; + right[23] = ar[23]; + + butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_25_64, + cospi_7_64, &al[25], &ar[25], &al[7], &ar[7]); + left[25] = al[25]; + right[25] = ar[25]; + left[7] = al[7]; + right[7] = ar[7]; + + butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_5_64, + cospi_27_64, &al[5], &ar[5], &al[27], &ar[27]); + left[5] = al[5]; + right[5] = ar[5]; + left[27] = al[27]; + right[27] = ar[27]; + + butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_21_64, + cospi_11_64, &al[21], &ar[21], &al[11], &ar[11]); + left[21] = al[21]; + right[21] = ar[21]; + left[11] = al[11]; + right[11] = ar[11]; + + butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_13_64, + cospi_19_64, &al[13], &ar[13], &al[19], &ar[19]); + left[13] = al[13]; + right[13] = ar[13]; + left[19] = al[19]; + right[19] = ar[19]; + + butterfly_two_coeff_s32(bl[24], br[24], bl[23], br[23], cospi_29_64, + cospi_3_64, &al[29], &ar[29], &al[3], &ar[3]); + left[29] = al[29]; + right[29] = ar[29]; + left[3] = al[3]; + right[3] = ar[3]; +} + +#endif // CONFIG_VP9_HIGHBITDEPTH + #endif // VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_ diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h index 1ea948b3f7..b33da427b4 100644 --- a/vpx_dsp/arm/fdct_neon.h +++ b/vpx_dsp/arm/fdct_neon.h @@ -355,4 +355,13 @@ static INLINE int16x8_t sub_round_shift_s16(const int16x8_t a) { return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2); } +// Add 2 if positive, 1 if negative, and shift by 2. +// In practice, subtract the sign bit, then shift with rounding. 
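+// Worked example (illustrative, not part of the original change):
+//   10 -> (10 + 2) >> 2 = 3,   -10 -> (-10 - 1 + 2) >> 2 = -9 >> 2 = -3,
+// i.e. the scalar equivalent (x + 1 + (x > 0)) >> 2 with an arithmetic shift.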
+static INLINE int32x4_t sub_round_shift_s32(const int32x4_t a) { + const uint32x4_t a_u32 = vreinterpretq_u32_s32(a); + const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31); + const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32); + return vrshrq_n_s32(vsubq_s32(a, a_sign_s32), 2); +} + #endif // VPX_VPX_DSP_ARM_FDCT_NEON_H_ diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 68244ea5a1..d55ab67ce1 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -568,10 +568,10 @@ () specialize qw/vpx_highbd_fdct16x16_1 neon/; add_proto qw/void vpx_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_highbd_fdct32x32 sse2/; + specialize qw/vpx_highbd_fdct32x32 sse2 neon/; add_proto qw/void vpx_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_highbd_fdct32x32_rd sse2/; + specialize qw/vpx_highbd_fdct32x32_rd sse2 neon/; add_proto qw/void vpx_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_highbd_fdct32x32_1 neon/; From 5d26626e7aabaa8cd0aeb01618191216d2db90b9 Mon Sep 17 00:00:00 2001 From: Andrew Salkeld Date: Thu, 13 Oct 2022 16:28:41 +0100 Subject: [PATCH 469/926] Add Neon implementation of vpx_hadamard_32x32 Add an Arm Neon implementation of vpx_hadamard_32x32 and use it instead of the scalar C implementation. Also add test coverage for the new Neon implementation. Change-Id: Iccc018eec4dbbe629fb0c6f8ad6ea8554e7a0b13 --- test/hadamard_test.cc | 3 ++- vpx_dsp/arm/hadamard_neon.c | 42 ++++++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 ++-- 3 files changed, 46 insertions(+), 3 deletions(-) diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc index 10b1e79c10..f904e814ad 100644 --- a/test/hadamard_test.cc +++ b/test/hadamard_test.cc @@ -264,7 +264,8 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P( NEON, HadamardLowbdTest, ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_neon, 8), - HadamardFuncWithSize(&vpx_hadamard_16x16_neon, 16))); + HadamardFuncWithSize(&vpx_hadamard_16x16_neon, 16), + HadamardFuncWithSize(&vpx_hadamard_32x32_neon, 32))); #endif // HAVE_NEON // TODO(jingning): Remove highbitdepth flag when the SIMD functions are diff --git a/vpx_dsp/arm/hadamard_neon.c b/vpx_dsp/arm/hadamard_neon.c index 523a63c6f7..f6b6d7e3ce 100644 --- a/vpx_dsp/arm/hadamard_neon.c +++ b/vpx_dsp/arm/hadamard_neon.c @@ -114,3 +114,45 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, coeff += 8; } } + +void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int i; + + /* Rearrange 32x32 to 16x64 and remove stride. + * Top left first. */ + vpx_hadamard_16x16_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0); + /* Top right. */ + vpx_hadamard_16x16_neon(src_diff + 16 + 0 * src_stride, src_stride, + coeff + 256); + /* Bottom left. */ + vpx_hadamard_16x16_neon(src_diff + 0 + 16 * src_stride, src_stride, + coeff + 512); + /* Bottom right. 
*/ + vpx_hadamard_16x16_neon(src_diff + 16 + 16 * src_stride, src_stride, + coeff + 768); + + for (i = 0; i < 256; i += 8) { + const int16x8_t a0 = load_tran_low_to_s16q(coeff + 0); + const int16x8_t a1 = load_tran_low_to_s16q(coeff + 256); + const int16x8_t a2 = load_tran_low_to_s16q(coeff + 512); + const int16x8_t a3 = load_tran_low_to_s16q(coeff + 768); + + const int16x8_t b0 = vhaddq_s16(a0, a1); + const int16x8_t b1 = vhsubq_s16(a0, a1); + const int16x8_t b2 = vhaddq_s16(a2, a3); + const int16x8_t b3 = vhsubq_s16(a2, a3); + + const int16x8_t c0 = vhaddq_s16(b0, b2); + const int16x8_t c1 = vhaddq_s16(b1, b3); + const int16x8_t c2 = vhsubq_s16(b0, b2); + const int16x8_t c3 = vhsubq_s16(b1, b3); + + store_s16q_to_tran_low(coeff + 0, c0); + store_s16q_to_tran_low(coeff + 256, c1); + store_s16q_to_tran_low(coeff + 512, c2); + store_s16q_to_tran_low(coeff + 768, c3); + + coeff += 8; + } +} diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index d55ab67ce1..51f5ebedd6 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -799,7 +799,7 @@ () specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx lsx/; add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; - specialize qw/vpx_hadamard_32x32 sse2 avx2/; + specialize qw/vpx_hadamard_32x32 sse2 avx2 neon/; add_proto qw/void vpx_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; specialize qw/vpx_highbd_hadamard_8x8 avx2/; @@ -823,7 +823,7 @@ () specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx lsx/; add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; - specialize qw/vpx_hadamard_32x32 sse2 avx2/; + specialize qw/vpx_hadamard_32x32 sse2 avx2 neon/; add_proto qw/int vpx_satd/, "const int16_t *coeff, int length"; specialize qw/vpx_satd avx2 sse2 neon msa/; From 62dee8012ea70a9f0628471609c5768f98a1e726 Mon Sep 17 00:00:00 2001 From: Sam James Date: Sun, 6 Nov 2022 04:11:59 +0000 Subject: [PATCH 470/926] build: fix -Wimplicit-int (Clang 16) Clang 16 will make -Wimplicit-int error by default which can, in addition to other things, lead to some configure tests silently failing/returning the wrong result. Fixes this error: ``` +/var/tmp/portage/media-libs/libvpx-1.12.0/temp/vpx-conf-1802-30624.c:1:15: error: type specifier missing, defaults to 'int'; ISO C99 and later do not support implicit int [-Wimplicit-int] ``` For more information, see LWN.net [0] or LLVM's Discourse [1], gentoo-dev@ [2], or the (new) c-std-porting mailing list [3]. [0] https://lwn.net/Articles/913505/ [1] https://discourse.llvm.org/t/configure-script-breakage-with-the-new-werror-implicit-function-declaration/65213 [2] https://archives.gentoo.org/gentoo-dev/message/dd9f2d3082b8b6f8dfbccb0639e6e240 [3] hosted at lists.linux.dev. 
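As an illustration (a representative sketch only, not the actual configure probe), this is the pre-C99 style that Clang 16 now rejects:

```
/* Pre-C99 definition style that Clang 16 rejects: */
foo() { return 0; }   /* error: type specifier missing, defaults to 'int' */
/* C99-clean equivalent: int foo(void) { return 0; } */
```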
Bug: https://bugs.gentoo.org/879705 Change-Id: Id73a98944ab3c99a368b9da7a5e902ddff9d937f Signed-off-by: Sam James --- build/make/configure.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/make/configure.sh b/build/make/configure.sh index e9b7fa9c1c..4bf090f006 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -1511,7 +1511,7 @@ EOF # Try to find which inline keywords are supported check_cc < Date: Thu, 10 Nov 2022 18:50:19 -0800 Subject: [PATCH 471/926] vp9-rc: Fix key frame setting in external RC Bug: b/257368998 Change-Id: I03e35915ac99b50cb6bdf7bce8b8f9ec5aef75b7 --- vp9/ratectrl_rtc.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index f4d7f7e9e7..1326456c44 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -158,6 +158,8 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { } vp9_set_mb_mi(cm, cm->width, cm->height); cm->frame_type = frame_params.frame_type; + // This is needed to ensure key frame does not get unset in rc_get_svc_params. + cpi_->frame_flags = (cm->frame_type == KEY_FRAME) ? FRAMEFLAGS_KEY : 0; cpi_->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0; cpi_->sf.use_nonrd_pick_mode = 1; if (cpi_->svc.number_spatial_layers == 1 && From f951514a40554e55715d7a31f182581cdd2bf971 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 9 Nov 2022 09:30:58 +0000 Subject: [PATCH 472/926] [NEON] Optimize FHT functions, add highbd FHT 4x4 Refactor & optimize FHT functions further, use new butterfly functions 4x4 5% faster, 8x8 & 16x16 10% faster than previous versions. Highbd 4x4 FHT version 2.27x faster than C version for --rt. Change-Id: I3ebcd26010f6c5c067026aa9353cde46669c5d94 --- test/dct_test.cc | 2 + vp9/common/vp9_rtcd_defs.pl | 1 + vp9/encoder/arm/neon/vp9_dct_neon.c | 1248 +++++++++++---------------- vpx_dsp/arm/fdct_neon.h | 56 ++ 4 files changed, 565 insertions(+), 742 deletions(-) diff --git a/test/dct_test.cc b/test/dct_test.cc index 910d288bd5..0304029bd2 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -667,6 +667,8 @@ static const FuncInfo ht_neon_func_info[] = { #if CONFIG_VP9_HIGHBITDEPTH { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper, 4, 2 }, + { &vp9_highbd_fht4x4_neon, &highbd_iht_wrapper, + 4, 2 }, { &vp9_highbd_fht8x8_c, &highbd_iht_wrapper, 8, 2 }, { &vp9_highbd_fht16x16_c, diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 871e4d0a35..f4bd9772c3 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -203,6 +203,7 @@ () # fdct functions add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; + specialize qw/vp9_highbd_fht4x4 neon/; add_proto qw/void vp9_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c index b8286a8dd5..5961be5f31 100644 --- a/vp9/encoder/arm/neon/vp9_dct_neon.c +++ b/vp9/encoder/arm/neon/vp9_dct_neon.c @@ -23,25 +23,25 @@ static INLINE void load_buffer_4x4(const int16_t *input, int16x8_t *in, int stride) { - // { 0, 1, 1, 1, 1, 1, 1, 1 }; - const int16x8_t nonzero_bias_a = vextq_s16(vdupq_n_s16(0), vdupq_n_s16(1), 7); - // { 1, 0, 0, 0, 0, 0, 0, 0 }; - const int16x8_t nonzero_bias_b = vextq_s16(vdupq_n_s16(1), vdupq_n_s16(0), 7); - int16x8_t mask; + // { 0, 1, 1, 1 }; + const int16x4_t nonzero_bias_a = vext_s16(vdup_n_s16(0), vdup_n_s16(1), 3); + // { 1, 0, 0, 0 }; + 
const int16x4_t nonzero_bias_b = vext_s16(vdup_n_s16(1), vdup_n_s16(0), 3); + int16x4_t mask; int16x4_t input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4); int16x4_t input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4); int16x4_t input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4); int16x4_t input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4); - in[0] = vcombine_s16(input_0, input_1); - in[1] = vcombine_s16(input_2, input_3); - // Copy the SSE method, use a mask to avoid an 'if' branch here to increase by // one non-zero first elements - mask = vreinterpretq_s16_u16(vceqq_s16(in[0], nonzero_bias_a)); - in[0] = vaddq_s16(in[0], mask); - in[0] = vaddq_s16(in[0], nonzero_bias_b); + mask = vreinterpret_s16_u16(vceq_s16(input_0, nonzero_bias_a)); + input_0 = vadd_s16(input_0, mask); + input_0 = vadd_s16(input_0, nonzero_bias_b); + + in[0] = vcombine_s16(input_0, input_1); + in[1] = vcombine_s16(input_2, input_3); } static INLINE void write_buffer_4x4(tran_low_t *output, int16x8_t *res) { @@ -55,72 +55,54 @@ static INLINE void write_buffer_4x4(tran_low_t *output, int16x8_t *res) { } static INLINE void fadst4x4_neon(int16x8_t *in) { - int32x4_t u0, u1, u2, u3; - int16x4_t out_0, out_1, out_2, out_3; - const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING); + int32x4_t u[4], t[4]; + int16x4_t s[4], out[4]; - const int16x4_t s0 = vget_low_s16(in[0]); // | x_00 | x_01 | x_02 | x_03 | - const int16x4_t s1 = vget_high_s16(in[0]); // | x_10 | x_11 | x_12 | x_13 | - const int16x4_t s2 = vget_low_s16(in[1]); // | x_20 | x_21 | x_22 | x_23 | - const int16x4_t s3 = vget_high_s16(in[1]); // | x_30 | x_31 | x_32 | x_33 | + s[0] = vget_low_s16(in[0]); // | x_00 | x_01 | x_02 | x_03 | + s[1] = vget_high_s16(in[0]); // | x_10 | x_11 | x_12 | x_13 | + s[2] = vget_low_s16(in[1]); // | x_20 | x_21 | x_22 | x_23 | + s[3] = vget_high_s16(in[1]); // | x_30 | x_31 | x_32 | x_33 | - // s0 * sinpi_1_9, s0 * sinpi_4_9 // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c. 
-  const int32x4_t s0s1_9 = vmull_n_s16(s0, sinpi_1_9);
-  const int32x4_t s0s4_9 = vmull_n_s16(s0, sinpi_4_9);
-  // s1 * sinpi_1_9, s1 * sinpi_2_9
-  const int32x4_t s1s1_9 = vmull_n_s16(s1, sinpi_1_9);
-  const int32x4_t s1s2_9 = vmull_n_s16(s1, sinpi_2_9);
-  // s2 * sinpi_3_9
-  const int32x4_t s2s3_9 = vmull_n_s16(s2, sinpi_3_9);
-  // s3 * sinpi_2_9, s3 * sinpi_4_9
-  const int32x4_t s3s2_9 = vmull_n_s16(s3, sinpi_2_9);
-  const int32x4_t s3s4_9 = vmull_n_s16(s3, sinpi_4_9);
-
-  // (s0 + s1) * sinpi_3_9
-  const int32x4_t s0_p_s1 = vaddl_s16(s0, s1);
-  const int32x4_t s0_p_s1_m_s3 = vsubw_s16(s0_p_s1, s3);
-
-  // s_0 * sinpi_1_9 + s_1 * sinpi_2_9
-  // s_0 * sinpi_4_9 - s_1 * sinpi_1_9
-  const int32x4_t s0s1_9_p_s1s2_9 = vaddq_s32(s0s1_9, s1s2_9);
-  const int32x4_t s0s4_9_m_s1s1_9 = vsubq_s32(s0s4_9, s1s1_9);
-  /*
-   * t0 = s0s1_9 + s1s2_9 + s3s4_9
-   * t1 = (s0 + s1) * sinpi_3_9 - s3 * sinpi_3_9
-   * t2 = s0s4_9 - s1s1_9 + s3s2_9
-   * t3 = s2s3_9
-   */
-  const int32x4_t t0 = vaddq_s32(s0s1_9_p_s1s2_9, s3s4_9);
-  const int32x4_t t1 = vmulq_n_s32(s0_p_s1_m_s3, sinpi_3_9);
-  const int32x4_t t2 = vaddq_s32(s0s4_9_m_s1s1_9, s3s2_9);
-  const int32x4_t t3 = s2s3_9;
+  // t0 = s0 * sinpi_1_9 + s1 * sinpi_2_9 + s3 * sinpi_4_9
+  t[0] = vmull_n_s16(s[0], sinpi_1_9);
+  t[0] = vmlal_n_s16(t[0], s[1], sinpi_2_9);
+  t[0] = vmlal_n_s16(t[0], s[3], sinpi_4_9);
+
+  // t1 = (s0 + s1) * sinpi_3_9 - s3 * sinpi_3_9
+  t[1] = vmull_n_s16(s[0], sinpi_3_9);
+  t[1] = vmlal_n_s16(t[1], s[1], sinpi_3_9);
+  t[1] = vmlsl_n_s16(t[1], s[3], sinpi_3_9);
+
+  // t2 = s0 * sinpi_4_9 - s1 * sinpi_1_9 + s3 * sinpi_2_9
+  t[2] = vmull_n_s16(s[0], sinpi_4_9);
+  t[2] = vmlsl_n_s16(t[2], s[1], sinpi_1_9);
+  t[2] = vmlal_n_s16(t[2], s[3], sinpi_2_9);
+
+  // t3 = s2 * sinpi_3_9
+  t[3] = vmull_n_s16(s[2], sinpi_3_9);
+
   /*
    * u0 = t0 + t3
    * u1 = t1
    * u2 = t2 - t3
    * u3 = t2 - t0 + t3
    */
-  u0 = vaddq_s32(t0, t3);
-  u1 = t1;
-  u2 = vsubq_s32(t2, t3);
-  u3 = vaddq_s32(vsubq_s32(t2, t0), t3);
+  u[0] = vaddq_s32(t[0], t[3]);
+  u[1] = t[1];
+  u[2] = vsubq_s32(t[2], t[3]);
+  u[3] = vaddq_s32(vsubq_s32(t[2], t[0]), t[3]);
   // fdct_round_shift
-  u0 = vaddq_s32(u0, k__DCT_CONST_ROUNDING);
-  u1 = vaddq_s32(u1, k__DCT_CONST_ROUNDING);
-  u2 = vaddq_s32(u2, k__DCT_CONST_ROUNDING);
-  u3 = vaddq_s32(u3, k__DCT_CONST_ROUNDING);
-
-  out_0 = vshrn_n_s32(u0, DCT_CONST_BITS);
-  out_1 = vshrn_n_s32(u1, DCT_CONST_BITS);
-  out_2 = vshrn_n_s32(u2, DCT_CONST_BITS);
-  out_3 = vshrn_n_s32(u3, DCT_CONST_BITS);
+  out[0] = vrshrn_n_s32(u[0], DCT_CONST_BITS);
+  out[1] = vrshrn_n_s32(u[1], DCT_CONST_BITS);
+  out[2] = vrshrn_n_s32(u[2], DCT_CONST_BITS);
+  out[3] = vrshrn_n_s32(u[3], DCT_CONST_BITS);

-  transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
+  transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]);

-  in[0] = vcombine_s16(out_0, out_1);
-  in[1] = vcombine_s16(out_2, out_3);
+  in[0] = vcombine_s16(out[0], out[1]);
+  in[1] = vcombine_s16(out[2], out[3]);
 }

 void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride,
@@ -239,245 +221,158 @@ static INLINE void write_buffer_8x8(tran_low_t *output, int16x8_t *res,
 }

 static INLINE void fadst8x8_neon(int16x8_t *in) {
-  int16x4_t x0_lo, x0_hi, x1_lo, x1_hi, x2_lo, x2_hi, x3_lo, x3_hi, x4_lo,
-      x4_hi, x5_lo, x5_hi, x6_lo, x6_hi, x7_lo, x7_hi;
-  int32x4_t s0_lo, s0_hi, s1_lo, s1_hi, s2_lo, s2_hi, s3_lo, s3_hi, s4_lo,
-      s4_hi, s5_lo, s5_hi, s6_lo, s6_hi, s7_lo, s7_hi;
-  int32x4_t t0_lo, t0_hi, t1_lo, t1_hi, t2_lo, t2_hi, t3_lo, t3_hi, t4_lo,
-      t4_hi, t5_lo, t5_hi, t6_lo, t6_hi, t7_lo, t7_hi;
-  const int32x4_t
k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING); - - x0_lo = vget_low_s16(in[7]); - x0_hi = vget_high_s16(in[7]); - x1_lo = vget_low_s16(in[0]); - x1_hi = vget_high_s16(in[0]); - x2_lo = vget_low_s16(in[5]); - x2_hi = vget_high_s16(in[5]); - x3_lo = vget_low_s16(in[2]); - x3_hi = vget_high_s16(in[2]); - x4_lo = vget_low_s16(in[3]); - x4_hi = vget_high_s16(in[3]); - x5_lo = vget_low_s16(in[4]); - x5_hi = vget_high_s16(in[4]); - x6_lo = vget_low_s16(in[1]); - x6_hi = vget_high_s16(in[1]); - x7_lo = vget_low_s16(in[6]); - x7_hi = vget_high_s16(in[6]); + int16x4_t x_lo[8], x_hi[8]; + int32x4_t s_lo[8], s_hi[8]; + int32x4_t t_lo[8], t_hi[8]; + + x_lo[0] = vget_low_s16(in[7]); + x_hi[0] = vget_high_s16(in[7]); + x_lo[1] = vget_low_s16(in[0]); + x_hi[1] = vget_high_s16(in[0]); + x_lo[2] = vget_low_s16(in[5]); + x_hi[2] = vget_high_s16(in[5]); + x_lo[3] = vget_low_s16(in[2]); + x_hi[3] = vget_high_s16(in[2]); + x_lo[4] = vget_low_s16(in[3]); + x_hi[4] = vget_high_s16(in[3]); + x_lo[5] = vget_low_s16(in[4]); + x_hi[5] = vget_high_s16(in[4]); + x_lo[6] = vget_low_s16(in[1]); + x_hi[6] = vget_high_s16(in[1]); + x_lo[7] = vget_low_s16(in[6]); + x_hi[7] = vget_high_s16(in[6]); // stage 1 // s0 = cospi_2_64 * x0 + cospi_30_64 * x1; - s0_lo = vaddq_s32(vmull_n_s16(x0_lo, cospi_2_64), - vmull_n_s16(x1_lo, cospi_30_64)); - s0_hi = vaddq_s32(vmull_n_s16(x0_hi, cospi_2_64), - vmull_n_s16(x1_hi, cospi_30_64)); // s1 = cospi_30_64 * x0 - cospi_2_64 * x1; - s1_lo = vsubq_s32(vmull_n_s16(x0_lo, cospi_30_64), - vmull_n_s16(x1_lo, cospi_2_64)); - s1_hi = vsubq_s32(vmull_n_s16(x0_hi, cospi_30_64), - vmull_n_s16(x1_hi, cospi_2_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[0], x_hi[0], x_lo[1], x_hi[1], + cospi_2_64, cospi_30_64, &s_lo[0], + &s_hi[0], &s_lo[1], &s_hi[1]); + // s2 = cospi_10_64 * x2 + cospi_22_64 * x3; - s2_lo = vaddq_s32(vmull_n_s16(x2_lo, cospi_10_64), - vmull_n_s16(x3_lo, cospi_22_64)); - s2_hi = vaddq_s32(vmull_n_s16(x2_hi, cospi_10_64), - vmull_n_s16(x3_hi, cospi_22_64)); // s3 = cospi_22_64 * x2 - cospi_10_64 * x3; - s3_lo = vsubq_s32(vmull_n_s16(x2_lo, cospi_22_64), - vmull_n_s16(x3_lo, cospi_10_64)); - s3_hi = vsubq_s32(vmull_n_s16(x2_hi, cospi_22_64), - vmull_n_s16(x3_hi, cospi_10_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[2], x_hi[2], x_lo[3], x_hi[3], + cospi_10_64, cospi_22_64, &s_lo[2], + &s_hi[2], &s_lo[3], &s_hi[3]); + // s4 = cospi_18_64 * x4 + cospi_14_64 * x5; - s4_lo = vaddq_s32(vmull_n_s16(x4_lo, cospi_18_64), - vmull_n_s16(x5_lo, cospi_14_64)); - s4_hi = vaddq_s32(vmull_n_s16(x4_hi, cospi_18_64), - vmull_n_s16(x5_hi, cospi_14_64)); // s5 = cospi_14_64 * x4 - cospi_18_64 * x5; - s5_lo = vsubq_s32(vmull_n_s16(x4_lo, cospi_14_64), - vmull_n_s16(x5_lo, cospi_18_64)); - s5_hi = vsubq_s32(vmull_n_s16(x4_hi, cospi_14_64), - vmull_n_s16(x5_hi, cospi_18_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[4], x_hi[4], x_lo[5], x_hi[5], + cospi_18_64, cospi_14_64, &s_lo[4], + &s_hi[4], &s_lo[5], &s_hi[5]); + // s6 = cospi_26_64 * x6 + cospi_6_64 * x7; - s6_lo = vaddq_s32(vmull_n_s16(x6_lo, cospi_26_64), - vmull_n_s16(x7_lo, cospi_6_64)); - s6_hi = vaddq_s32(vmull_n_s16(x6_hi, cospi_26_64), - vmull_n_s16(x7_hi, cospi_6_64)); // s7 = cospi_6_64 * x6 - cospi_26_64 * x7; - s7_lo = vsubq_s32(vmull_n_s16(x6_lo, cospi_6_64), - vmull_n_s16(x7_lo, cospi_26_64)); - s7_hi = vsubq_s32(vmull_n_s16(x6_hi, cospi_6_64), - vmull_n_s16(x7_hi, cospi_26_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[6], x_hi[6], x_lo[7], x_hi[7], + cospi_26_64, cospi_6_64, &s_lo[6], + &s_hi[6], &s_lo[7], 
&s_hi[7]); // fdct_round_shift - t0_lo = vaddq_s32(s0_lo, s4_lo); - t0_hi = vaddq_s32(s0_hi, s4_hi); - t1_lo = vaddq_s32(s1_lo, s5_lo); - t1_hi = vaddq_s32(s1_hi, s5_hi); - t2_lo = vaddq_s32(s2_lo, s6_lo); - t2_hi = vaddq_s32(s2_hi, s6_hi); - t3_lo = vaddq_s32(s3_lo, s7_lo); - t3_hi = vaddq_s32(s3_hi, s7_hi); - t4_lo = vsubq_s32(s0_lo, s4_lo); - t4_hi = vsubq_s32(s0_hi, s4_hi); - t5_lo = vsubq_s32(s1_lo, s5_lo); - t5_hi = vsubq_s32(s1_hi, s5_hi); - t6_lo = vsubq_s32(s2_lo, s6_lo); - t6_hi = vsubq_s32(s2_hi, s6_hi); - t7_lo = vsubq_s32(s3_lo, s7_lo); - t7_hi = vsubq_s32(s3_hi, s7_hi); - - t0_lo = vaddq_s32(t0_lo, k__DCT_CONST_ROUNDING); - t0_hi = vaddq_s32(t0_hi, k__DCT_CONST_ROUNDING); - t1_lo = vaddq_s32(t1_lo, k__DCT_CONST_ROUNDING); - t1_hi = vaddq_s32(t1_hi, k__DCT_CONST_ROUNDING); - t2_lo = vaddq_s32(t2_lo, k__DCT_CONST_ROUNDING); - t2_hi = vaddq_s32(t2_hi, k__DCT_CONST_ROUNDING); - t3_lo = vaddq_s32(t3_lo, k__DCT_CONST_ROUNDING); - t3_hi = vaddq_s32(t3_hi, k__DCT_CONST_ROUNDING); - t4_lo = vaddq_s32(t4_lo, k__DCT_CONST_ROUNDING); - t4_hi = vaddq_s32(t4_hi, k__DCT_CONST_ROUNDING); - t5_lo = vaddq_s32(t5_lo, k__DCT_CONST_ROUNDING); - t5_hi = vaddq_s32(t5_hi, k__DCT_CONST_ROUNDING); - t6_lo = vaddq_s32(t6_lo, k__DCT_CONST_ROUNDING); - t6_hi = vaddq_s32(t6_hi, k__DCT_CONST_ROUNDING); - t7_lo = vaddq_s32(t7_lo, k__DCT_CONST_ROUNDING); - t7_hi = vaddq_s32(t7_hi, k__DCT_CONST_ROUNDING); - - t0_lo = vshrq_n_s32(t0_lo, DCT_CONST_BITS); - t0_hi = vshrq_n_s32(t0_hi, DCT_CONST_BITS); - t1_lo = vshrq_n_s32(t1_lo, DCT_CONST_BITS); - t1_hi = vshrq_n_s32(t1_hi, DCT_CONST_BITS); - t2_lo = vshrq_n_s32(t2_lo, DCT_CONST_BITS); - t2_hi = vshrq_n_s32(t2_hi, DCT_CONST_BITS); - t3_lo = vshrq_n_s32(t3_lo, DCT_CONST_BITS); - t3_hi = vshrq_n_s32(t3_hi, DCT_CONST_BITS); - t4_lo = vshrq_n_s32(t4_lo, DCT_CONST_BITS); - t4_hi = vshrq_n_s32(t4_hi, DCT_CONST_BITS); - t5_lo = vshrq_n_s32(t5_lo, DCT_CONST_BITS); - t5_hi = vshrq_n_s32(t5_hi, DCT_CONST_BITS); - t6_lo = vshrq_n_s32(t6_lo, DCT_CONST_BITS); - t6_hi = vshrq_n_s32(t6_hi, DCT_CONST_BITS); - t7_lo = vshrq_n_s32(t7_lo, DCT_CONST_BITS); - t7_hi = vshrq_n_s32(t7_hi, DCT_CONST_BITS); + t_lo[0] = vrshrq_n_s32(vaddq_s32(s_lo[0], s_lo[4]), DCT_CONST_BITS); + t_hi[0] = vrshrq_n_s32(vaddq_s32(s_hi[0], s_hi[4]), DCT_CONST_BITS); + t_lo[1] = vrshrq_n_s32(vaddq_s32(s_lo[1], s_lo[5]), DCT_CONST_BITS); + t_hi[1] = vrshrq_n_s32(vaddq_s32(s_hi[1], s_hi[5]), DCT_CONST_BITS); + t_lo[2] = vrshrq_n_s32(vaddq_s32(s_lo[2], s_lo[6]), DCT_CONST_BITS); + t_hi[2] = vrshrq_n_s32(vaddq_s32(s_hi[2], s_hi[6]), DCT_CONST_BITS); + t_lo[3] = vrshrq_n_s32(vaddq_s32(s_lo[3], s_lo[7]), DCT_CONST_BITS); + t_hi[3] = vrshrq_n_s32(vaddq_s32(s_hi[3], s_hi[7]), DCT_CONST_BITS); + t_lo[4] = vrshrq_n_s32(vsubq_s32(s_lo[0], s_lo[4]), DCT_CONST_BITS); + t_hi[4] = vrshrq_n_s32(vsubq_s32(s_hi[0], s_hi[4]), DCT_CONST_BITS); + t_lo[5] = vrshrq_n_s32(vsubq_s32(s_lo[1], s_lo[5]), DCT_CONST_BITS); + t_hi[5] = vrshrq_n_s32(vsubq_s32(s_hi[1], s_hi[5]), DCT_CONST_BITS); + t_lo[6] = vrshrq_n_s32(vsubq_s32(s_lo[2], s_lo[6]), DCT_CONST_BITS); + t_hi[6] = vrshrq_n_s32(vsubq_s32(s_hi[2], s_hi[6]), DCT_CONST_BITS); + t_lo[7] = vrshrq_n_s32(vsubq_s32(s_lo[3], s_lo[7]), DCT_CONST_BITS); + t_hi[7] = vrshrq_n_s32(vsubq_s32(s_hi[3], s_hi[7]), DCT_CONST_BITS); // stage 2 - s0_lo = t0_lo; - s0_hi = t0_hi; - s1_lo = t1_lo; - s1_hi = t1_hi; - s2_lo = t2_lo; - s2_hi = t2_hi; - s3_lo = t3_lo; - s3_hi = t3_hi; - s4_lo = vaddq_s32(vmulq_n_s32(t4_lo, cospi_8_64), - vmulq_n_s32(t5_lo, cospi_24_64)); - s4_hi = 
vaddq_s32(vmulq_n_s32(t4_hi, cospi_8_64), - vmulq_n_s32(t5_hi, cospi_24_64)); - s5_lo = vsubq_s32(vmulq_n_s32(t4_lo, cospi_24_64), - vmulq_n_s32(t5_lo, cospi_8_64)); - s5_hi = vsubq_s32(vmulq_n_s32(t4_hi, cospi_24_64), - vmulq_n_s32(t5_hi, cospi_8_64)); - s6_lo = vaddq_s32(vmulq_n_s32(t6_lo, -cospi_24_64), - vmulq_n_s32(t7_lo, cospi_8_64)); - s6_hi = vaddq_s32(vmulq_n_s32(t6_hi, -cospi_24_64), - vmulq_n_s32(t7_hi, cospi_8_64)); - s7_lo = vaddq_s32(vmulq_n_s32(t6_lo, cospi_8_64), - vmulq_n_s32(t7_lo, cospi_24_64)); - s7_hi = vaddq_s32(vmulq_n_s32(t6_hi, cospi_8_64), - vmulq_n_s32(t7_hi, cospi_24_64)); + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + // s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + // s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + butterfly_two_coeff_s32_noround(t_lo[4], t_hi[4], t_lo[5], t_hi[5], + cospi_8_64, cospi_24_64, &s_lo[4], &s_hi[4], + &s_lo[5], &s_hi[5]); + + // s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + // s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + butterfly_two_coeff_s32_noround(t_lo[6], t_hi[6], t_lo[7], t_hi[7], + -cospi_24_64, cospi_8_64, &s_lo[6], &s_hi[6], + &s_lo[7], &s_hi[7]); + // fdct_round_shift // s0 + s2 - t0_lo = vaddq_s32(s0_lo, s2_lo); - t0_hi = vaddq_s32(s0_hi, s2_hi); + t_lo[0] = vaddq_s32(s_lo[0], s_lo[2]); + t_hi[0] = vaddq_s32(s_hi[0], s_hi[2]); // s1 + s3 - t1_lo = vaddq_s32(s1_lo, s3_lo); - t1_hi = vaddq_s32(s1_hi, s3_hi); + t_lo[1] = vaddq_s32(s_lo[1], s_lo[3]); + t_hi[1] = vaddq_s32(s_hi[1], s_hi[3]); // s0 - s2 - t2_lo = vsubq_s32(s0_lo, s2_lo); - t2_hi = vsubq_s32(s0_hi, s2_hi); + t_lo[2] = vsubq_s32(s_lo[0], s_lo[2]); + t_hi[2] = vsubq_s32(s_hi[0], s_hi[2]); // s1 - s3 - t3_lo = vsubq_s32(s1_lo, s3_lo); - t3_hi = vsubq_s32(s1_hi, s3_hi); + t_lo[3] = vsubq_s32(s_lo[1], s_lo[3]); + t_hi[3] = vsubq_s32(s_hi[1], s_hi[3]); // s4 + s6 - t4_lo = vaddq_s32(s4_lo, s6_lo); - t4_hi = vaddq_s32(s4_hi, s6_hi); + t_lo[4] = vrshrq_n_s32(vaddq_s32(s_lo[4], s_lo[6]), DCT_CONST_BITS); + t_hi[4] = vrshrq_n_s32(vaddq_s32(s_hi[4], s_hi[6]), DCT_CONST_BITS); // s5 + s7 - t5_lo = vaddq_s32(s5_lo, s7_lo); - t5_hi = vaddq_s32(s5_hi, s7_hi); + t_lo[5] = vrshrq_n_s32(vaddq_s32(s_lo[5], s_lo[7]), DCT_CONST_BITS); + t_hi[5] = vrshrq_n_s32(vaddq_s32(s_hi[5], s_hi[7]), DCT_CONST_BITS); // s4 - s6 - t6_lo = vsubq_s32(s4_lo, s6_lo); - t6_hi = vsubq_s32(s4_hi, s6_hi); + t_lo[6] = vrshrq_n_s32(vsubq_s32(s_lo[4], s_lo[6]), DCT_CONST_BITS); + t_hi[6] = vrshrq_n_s32(vsubq_s32(s_hi[4], s_hi[6]), DCT_CONST_BITS); // s5 - s7 - t7_lo = vsubq_s32(s5_lo, s7_lo); - t7_hi = vsubq_s32(s5_hi, s7_hi); - - // fdct_round_shift - t4_lo = vaddq_s32(t4_lo, k__DCT_CONST_ROUNDING); - t4_hi = vaddq_s32(t4_hi, k__DCT_CONST_ROUNDING); - t5_lo = vaddq_s32(t5_lo, k__DCT_CONST_ROUNDING); - t5_hi = vaddq_s32(t5_hi, k__DCT_CONST_ROUNDING); - t6_lo = vaddq_s32(t6_lo, k__DCT_CONST_ROUNDING); - t6_hi = vaddq_s32(t6_hi, k__DCT_CONST_ROUNDING); - t7_lo = vaddq_s32(t7_lo, k__DCT_CONST_ROUNDING); - t7_hi = vaddq_s32(t7_hi, k__DCT_CONST_ROUNDING); - t4_lo = vshrq_n_s32(t4_lo, DCT_CONST_BITS); - t4_hi = vshrq_n_s32(t4_hi, DCT_CONST_BITS); - t5_lo = vshrq_n_s32(t5_lo, DCT_CONST_BITS); - t5_hi = vshrq_n_s32(t5_hi, DCT_CONST_BITS); - t6_lo = vshrq_n_s32(t6_lo, DCT_CONST_BITS); - t6_hi = vshrq_n_s32(t6_hi, DCT_CONST_BITS); - t7_lo = vshrq_n_s32(t7_lo, DCT_CONST_BITS); - t7_hi = vshrq_n_s32(t7_hi, DCT_CONST_BITS); + t_lo[7] = vrshrq_n_s32(vsubq_s32(s_lo[5], s_lo[7]), DCT_CONST_BITS); + t_hi[7] = 
vrshrq_n_s32(vsubq_s32(s_hi[5], s_hi[7]), DCT_CONST_BITS);
   // stage 3
   // cospi_16_64 * (x2 + x3)
-  s2_lo = vmulq_n_s32(vaddq_s32(t2_lo, t3_lo), cospi_16_64);
-  s2_hi = vmulq_n_s32(vaddq_s32(t2_hi, t3_hi), cospi_16_64);
   // cospi_16_64 * (x2 - x3)
-  s3_lo = vmulq_n_s32(vsubq_s32(t2_lo, t3_lo), cospi_16_64);
-  s3_hi = vmulq_n_s32(vsubq_s32(t2_hi, t3_hi), cospi_16_64);
+  butterfly_one_coeff_s32_noround(t_lo[2], t_hi[2], t_lo[3], t_hi[3],
+                                  cospi_16_64, &s_lo[2], &s_hi[2], &s_lo[3],
+                                  &s_hi[3]);
+
   // cospi_16_64 * (x6 + x7)
-  s6_lo = vmulq_n_s32(vaddq_s32(t6_lo, t7_lo), cospi_16_64);
-  s6_hi = vmulq_n_s32(vaddq_s32(t6_hi, t7_hi), cospi_16_64);
   // cospi_16_64 * (x6 - x7)
-  s7_lo = vmulq_n_s32(vsubq_s32(t6_lo, t7_lo), cospi_16_64);
-  s7_hi = vmulq_n_s32(vsubq_s32(t6_hi, t7_hi), cospi_16_64);
+  butterfly_one_coeff_s32_noround(t_lo[6], t_hi[6], t_lo[7], t_hi[7],
+                                  cospi_16_64, &s_lo[6], &s_hi[6], &s_lo[7],
+                                  &s_hi[7]);

   // final fdct_round_shift
-  t2_lo = vaddq_s32(s2_lo, k__DCT_CONST_ROUNDING);
-  t2_hi = vaddq_s32(s2_hi, k__DCT_CONST_ROUNDING);
-  t3_lo = vaddq_s32(s3_lo, k__DCT_CONST_ROUNDING);
-  t3_hi = vaddq_s32(s3_hi, k__DCT_CONST_ROUNDING);
-  t6_lo = vaddq_s32(s6_lo, k__DCT_CONST_ROUNDING);
-  t6_hi = vaddq_s32(s6_hi, k__DCT_CONST_ROUNDING);
-  t7_lo = vaddq_s32(s7_lo, k__DCT_CONST_ROUNDING);
-  t7_hi = vaddq_s32(s7_hi, k__DCT_CONST_ROUNDING);
-
-  x2_lo = vshrn_n_s32(t2_lo, DCT_CONST_BITS);
-  x2_hi = vshrn_n_s32(t2_hi, DCT_CONST_BITS);
-  x3_lo = vshrn_n_s32(t3_lo, DCT_CONST_BITS);
-  x3_hi = vshrn_n_s32(t3_hi, DCT_CONST_BITS);
-  x6_lo = vshrn_n_s32(t6_lo, DCT_CONST_BITS);
-  x6_hi = vshrn_n_s32(t6_hi, DCT_CONST_BITS);
-  x7_lo = vshrn_n_s32(t7_lo, DCT_CONST_BITS);
-  x7_hi = vshrn_n_s32(t7_hi, DCT_CONST_BITS);
+  x_lo[2] = vrshrn_n_s32(s_lo[2], DCT_CONST_BITS);
+  x_hi[2] = vrshrn_n_s32(s_hi[2], DCT_CONST_BITS);
+  x_lo[3] = vrshrn_n_s32(s_lo[3], DCT_CONST_BITS);
+  x_hi[3] = vrshrn_n_s32(s_hi[3], DCT_CONST_BITS);
+  x_lo[6] = vrshrn_n_s32(s_lo[6], DCT_CONST_BITS);
+  x_hi[6] = vrshrn_n_s32(s_hi[6], DCT_CONST_BITS);
+  x_lo[7] = vrshrn_n_s32(s_lo[7], DCT_CONST_BITS);
+  x_hi[7] = vrshrn_n_s32(s_hi[7], DCT_CONST_BITS);

   // x0, x1, x4, x5 narrow down to 16-bits directly
-  x0_lo = vmovn_s32(t0_lo);
-  x0_hi = vmovn_s32(t0_hi);
-  x1_lo = vmovn_s32(t1_lo);
-  x1_hi = vmovn_s32(t1_hi);
-  x4_lo = vmovn_s32(t4_lo);
-  x4_hi = vmovn_s32(t4_hi);
-  x5_lo = vmovn_s32(t5_lo);
-  x5_hi = vmovn_s32(t5_hi);
-
-  in[0] = vcombine_s16(x0_lo, x0_hi);
-  in[1] = vnegq_s16(vcombine_s16(x4_lo, x4_hi));
-  in[2] = vcombine_s16(x6_lo, x6_hi);
-  in[3] = vnegq_s16(vcombine_s16(x2_lo, x2_hi));
-  in[4] = vcombine_s16(x3_lo, x3_hi);
-  in[5] = vnegq_s16(vcombine_s16(x7_lo, x7_hi));
-  in[6] = vcombine_s16(x5_lo, x5_hi);
-  in[7] = vnegq_s16(vcombine_s16(x1_lo, x1_hi));
+  x_lo[0] = vmovn_s32(t_lo[0]);
+  x_hi[0] = vmovn_s32(t_hi[0]);
+  x_lo[1] = vmovn_s32(t_lo[1]);
+  x_hi[1] = vmovn_s32(t_hi[1]);
+  x_lo[4] = vmovn_s32(t_lo[4]);
+  x_hi[4] = vmovn_s32(t_hi[4]);
+  x_lo[5] = vmovn_s32(t_lo[5]);
+  x_hi[5] = vmovn_s32(t_hi[5]);
+
+  in[0] = vcombine_s16(x_lo[0], x_hi[0]);
+  in[1] = vnegq_s16(vcombine_s16(x_lo[4], x_hi[4]));
+  in[2] = vcombine_s16(x_lo[6], x_hi[6]);
+  in[3] = vnegq_s16(vcombine_s16(x_lo[2], x_hi[2]));
+  in[4] = vcombine_s16(x_lo[3], x_hi[3]);
+  in[5] = vnegq_s16(vcombine_s16(x_lo[7], x_hi[7]));
+  in[6] = vcombine_s16(x_lo[5], x_hi[5]);
+  in[7] = vnegq_s16(vcombine_s16(x_lo[1], x_hi[1]));

   transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
                     &in[7]);
@@ -553,7 +448,6 @@ static void fdct16_8col(int16x8_t *in) {
   int16x8_t
i[8], s1[8], s2[8], s3[8], t[8]; int16x4_t t_lo[8], t_hi[8]; int32x4_t u_lo[8], u_hi[8]; - const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING); // stage 1 i[0] = vaddq_s16(in[0], in[15]); @@ -602,23 +496,14 @@ static void fdct16_8col(int16x8_t *in) { u_lo[5] = vmull_n_s16(t_lo[5], cospi_16_64); u_hi[5] = vmull_n_s16(t_hi[5], cospi_16_64); - u_lo[2] = vaddq_s32(u_lo[2], k__DCT_CONST_ROUNDING); - u_hi[2] = vaddq_s32(u_hi[2], k__DCT_CONST_ROUNDING); - u_lo[3] = vaddq_s32(u_lo[3], k__DCT_CONST_ROUNDING); - u_hi[3] = vaddq_s32(u_hi[3], k__DCT_CONST_ROUNDING); - u_lo[4] = vaddq_s32(u_lo[4], k__DCT_CONST_ROUNDING); - u_hi[4] = vaddq_s32(u_hi[4], k__DCT_CONST_ROUNDING); - u_lo[5] = vaddq_s32(u_lo[5], k__DCT_CONST_ROUNDING); - u_hi[5] = vaddq_s32(u_hi[5], k__DCT_CONST_ROUNDING); - - t_lo[2] = vshrn_n_s32(u_lo[2], DCT_CONST_BITS); - t_hi[2] = vshrn_n_s32(u_hi[2], DCT_CONST_BITS); - t_lo[3] = vshrn_n_s32(u_lo[3], DCT_CONST_BITS); - t_hi[3] = vshrn_n_s32(u_hi[3], DCT_CONST_BITS); - t_lo[4] = vshrn_n_s32(u_lo[4], DCT_CONST_BITS); - t_hi[4] = vshrn_n_s32(u_hi[4], DCT_CONST_BITS); - t_lo[5] = vshrn_n_s32(u_lo[5], DCT_CONST_BITS); - t_hi[5] = vshrn_n_s32(u_hi[5], DCT_CONST_BITS); + t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS); + t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS); + t_lo[3] = vrshrn_n_s32(u_lo[3], DCT_CONST_BITS); + t_hi[3] = vrshrn_n_s32(u_hi[3], DCT_CONST_BITS); + t_lo[4] = vrshrn_n_s32(u_lo[4], DCT_CONST_BITS); + t_hi[4] = vrshrn_n_s32(u_hi[4], DCT_CONST_BITS); + t_lo[5] = vrshrn_n_s32(u_lo[5], DCT_CONST_BITS); + t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS); s2[2] = vcombine_s16(t_lo[2], t_hi[2]); s2[3] = vcombine_s16(t_lo[3], t_hi[3]); @@ -653,40 +538,26 @@ static void fdct16_8col(int16x8_t *in) { t_lo[7] = vget_low_s16(s3[7]); t_hi[7] = vget_high_s16(s3[7]); - u_lo[1] = vaddq_s32(vmull_n_s16(t_lo[1], -cospi_8_64), - vmull_n_s16(t_lo[6], cospi_24_64)); - u_hi[1] = vaddq_s32(vmull_n_s16(t_hi[1], -cospi_8_64), - vmull_n_s16(t_hi[6], cospi_24_64)); - u_lo[2] = vaddq_s32(vmull_n_s16(t_lo[2], cospi_24_64), - vmull_n_s16(t_lo[5], cospi_8_64)); - u_hi[2] = vaddq_s32(vmull_n_s16(t_hi[2], cospi_24_64), - vmull_n_s16(t_hi[5], cospi_8_64)); - u_lo[5] = vaddq_s32(vmull_n_s16(t_lo[2], cospi_8_64), - vmull_n_s16(t_lo[5], -cospi_24_64)); - u_hi[5] = vaddq_s32(vmull_n_s16(t_hi[2], cospi_8_64), - vmull_n_s16(t_hi[5], -cospi_24_64)); - u_lo[6] = vaddq_s32(vmull_n_s16(t_lo[1], cospi_24_64), - vmull_n_s16(t_lo[6], cospi_8_64)); - u_hi[6] = vaddq_s32(vmull_n_s16(t_hi[1], cospi_24_64), - vmull_n_s16(t_hi[6], cospi_8_64)); - - u_lo[1] = vaddq_s32(u_lo[1], k__DCT_CONST_ROUNDING); - u_hi[1] = vaddq_s32(u_hi[1], k__DCT_CONST_ROUNDING); - u_lo[2] = vaddq_s32(u_lo[2], k__DCT_CONST_ROUNDING); - u_hi[2] = vaddq_s32(u_hi[2], k__DCT_CONST_ROUNDING); - u_lo[5] = vaddq_s32(u_lo[5], k__DCT_CONST_ROUNDING); - u_hi[5] = vaddq_s32(u_hi[5], k__DCT_CONST_ROUNDING); - u_lo[6] = vaddq_s32(u_lo[6], k__DCT_CONST_ROUNDING); - u_hi[6] = vaddq_s32(u_hi[6], k__DCT_CONST_ROUNDING); - - t_lo[1] = vshrn_n_s32(u_lo[1], DCT_CONST_BITS); - t_hi[1] = vshrn_n_s32(u_hi[1], DCT_CONST_BITS); - t_lo[2] = vshrn_n_s32(u_lo[2], DCT_CONST_BITS); - t_hi[2] = vshrn_n_s32(u_hi[2], DCT_CONST_BITS); - t_lo[5] = vshrn_n_s32(u_lo[5], DCT_CONST_BITS); - t_hi[5] = vshrn_n_s32(u_hi[5], DCT_CONST_BITS); - t_lo[6] = vshrn_n_s32(u_lo[6], DCT_CONST_BITS); - t_hi[6] = vshrn_n_s32(u_hi[6], DCT_CONST_BITS); + // u[1] = -cospi_8_64 * t[1] + cospi_24_64 * t[6] + // u[6] = cospi_24_64 * t[1] + cospi_8_64 * t[6] + 
butterfly_two_coeff_s16_s32_noround(t_lo[1], t_hi[1], t_lo[6], t_hi[6], + -cospi_8_64, cospi_24_64, &u_lo[1], + &u_hi[1], &u_lo[6], &u_hi[6]); + + // u[5] = -cospi_24_64 * t[5] + cospi_8_64 * t[2] + // u[2] = cospi_8_64 * t[5] + cospi_24_64 * t[2] + butterfly_two_coeff_s16_s32_noround(t_lo[5], t_hi[5], t_lo[2], t_hi[2], + -cospi_24_64, cospi_8_64, &u_lo[5], + &u_hi[5], &u_lo[2], &u_hi[2]); + + t_lo[1] = vrshrn_n_s32(u_lo[1], DCT_CONST_BITS); + t_hi[1] = vrshrn_n_s32(u_hi[1], DCT_CONST_BITS); + t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS); + t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS); + t_lo[5] = vrshrn_n_s32(u_lo[5], DCT_CONST_BITS); + t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS); + t_lo[6] = vrshrn_n_s32(u_lo[6], DCT_CONST_BITS); + t_hi[6] = vrshrn_n_s32(u_hi[6], DCT_CONST_BITS); s2[1] = vcombine_s16(t_lo[1], t_hi[1]); s2[2] = vcombine_s16(t_lo[2], t_hi[2]); @@ -721,88 +592,47 @@ static void fdct16_8col(int16x8_t *in) { t_lo[7] = vget_low_s16(s1[7]); t_hi[7] = vget_high_s16(s1[7]); - // step1[0] * cospi_30_64 + step1[7] * cospi_2_64; - u_lo[0] = vaddq_s32(vmull_n_s16(t_lo[0], cospi_30_64), - vmull_n_s16(t_lo[7], cospi_2_64)); - u_hi[0] = vaddq_s32(vmull_n_s16(t_hi[0], cospi_30_64), - vmull_n_s16(t_hi[7], cospi_2_64)); - - // step1[1] * cospi_14_64 + step1[6] * cospi_18_64; - u_lo[1] = vaddq_s32(vmull_n_s16(t_lo[1], cospi_14_64), - vmull_n_s16(t_lo[6], cospi_18_64)); - u_hi[1] = vaddq_s32(vmull_n_s16(t_hi[1], cospi_14_64), - vmull_n_s16(t_hi[6], cospi_18_64)); - - // step1[2] * cospi_22_64 + step1[5] * cospi_10_64; - u_lo[2] = vaddq_s32(vmull_n_s16(t_lo[2], cospi_22_64), - vmull_n_s16(t_lo[5], cospi_10_64)); - u_hi[2] = vaddq_s32(vmull_n_s16(t_hi[2], cospi_22_64), - vmull_n_s16(t_hi[5], cospi_10_64)); - - // step1[3] * cospi_6_64 + step1[4] * cospi_26_64; - u_lo[3] = vaddq_s32(vmull_n_s16(t_lo[3], cospi_6_64), - vmull_n_s16(t_lo[4], cospi_26_64)); - u_hi[3] = vaddq_s32(vmull_n_s16(t_hi[3], cospi_6_64), - vmull_n_s16(t_hi[4], cospi_26_64)); - - // step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; - u_lo[4] = vaddq_s32(vmull_n_s16(t_lo[3], -cospi_26_64), - vmull_n_s16(t_lo[4], cospi_6_64)); - u_hi[4] = vaddq_s32(vmull_n_s16(t_hi[3], -cospi_26_64), - vmull_n_s16(t_hi[4], cospi_6_64)); - - // step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; - u_lo[5] = vaddq_s32(vmull_n_s16(t_lo[2], -cospi_10_64), - vmull_n_s16(t_lo[5], cospi_22_64)); - u_hi[5] = vaddq_s32(vmull_n_s16(t_hi[2], -cospi_10_64), - vmull_n_s16(t_hi[5], cospi_22_64)); - - // step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; - u_lo[6] = vaddq_s32(vmull_n_s16(t_lo[1], -cospi_18_64), - vmull_n_s16(t_lo[6], cospi_14_64)); - u_hi[6] = vaddq_s32(vmull_n_s16(t_hi[1], -cospi_18_64), - vmull_n_s16(t_hi[6], cospi_14_64)); - - // step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; - u_lo[7] = vaddq_s32(vmull_n_s16(t_lo[0], -cospi_2_64), - vmull_n_s16(t_lo[7], cospi_30_64)); - u_hi[7] = vaddq_s32(vmull_n_s16(t_hi[0], -cospi_2_64), - vmull_n_s16(t_hi[7], cospi_30_64)); + // u[0] = step1[7] * cospi_2_64 + step1[0] * cospi_30_64 + // u[7] = step1[7] * cospi_30_64 - step1[0] * cospi_2_64 + butterfly_two_coeff_s16_s32_noround(t_lo[7], t_hi[7], t_lo[0], t_hi[0], + cospi_2_64, cospi_30_64, &u_lo[0], + &u_hi[0], &u_lo[7], &u_hi[7]); + + // u[1] = step1[6] * cospi_18_64 + step1[1] * cospi_14_64 + // u[6] = step1[6] * cospi_14_64 - step1[1] * cospi_18_64 + butterfly_two_coeff_s16_s32_noround(t_lo[6], t_hi[6], t_lo[1], t_hi[1], + cospi_18_64, cospi_14_64, &u_lo[1], + &u_hi[1], &u_lo[6], &u_hi[6]); + + // u[2] = step1[5] * cospi_10_64 + step1[2] * 
cospi_22_64 + // u[5] = step1[5] * cospi_22_64 - step1[2] * cospi_10_64 + butterfly_two_coeff_s16_s32_noround(t_lo[5], t_hi[5], t_lo[2], t_hi[2], + cospi_10_64, cospi_22_64, &u_lo[2], + &u_hi[2], &u_lo[5], &u_hi[5]); + + // u[3] = step1[4] * cospi_26_64 + step1[3] * cospi_6_64 + // u[4] = step1[4] * cospi_6_64 - step1[3] * cospi_26_64 + butterfly_two_coeff_s16_s32_noround(t_lo[4], t_hi[4], t_lo[3], t_hi[3], + cospi_26_64, cospi_6_64, &u_lo[3], + &u_hi[3], &u_lo[4], &u_hi[4]); // final fdct_round_shift - u_lo[0] = vaddq_s32(u_lo[0], k__DCT_CONST_ROUNDING); - u_hi[0] = vaddq_s32(u_hi[0], k__DCT_CONST_ROUNDING); - u_lo[1] = vaddq_s32(u_lo[1], k__DCT_CONST_ROUNDING); - u_hi[1] = vaddq_s32(u_hi[1], k__DCT_CONST_ROUNDING); - u_lo[2] = vaddq_s32(u_lo[2], k__DCT_CONST_ROUNDING); - u_hi[2] = vaddq_s32(u_hi[2], k__DCT_CONST_ROUNDING); - u_lo[3] = vaddq_s32(u_lo[3], k__DCT_CONST_ROUNDING); - u_hi[3] = vaddq_s32(u_hi[3], k__DCT_CONST_ROUNDING); - u_lo[4] = vaddq_s32(u_lo[4], k__DCT_CONST_ROUNDING); - u_hi[4] = vaddq_s32(u_hi[4], k__DCT_CONST_ROUNDING); - u_lo[5] = vaddq_s32(u_lo[5], k__DCT_CONST_ROUNDING); - u_hi[5] = vaddq_s32(u_hi[5], k__DCT_CONST_ROUNDING); - u_lo[6] = vaddq_s32(u_lo[6], k__DCT_CONST_ROUNDING); - u_hi[6] = vaddq_s32(u_hi[6], k__DCT_CONST_ROUNDING); - u_lo[7] = vaddq_s32(u_lo[7], k__DCT_CONST_ROUNDING); - u_hi[7] = vaddq_s32(u_hi[7], k__DCT_CONST_ROUNDING); - - t_lo[0] = vshrn_n_s32(u_lo[0], DCT_CONST_BITS); - t_hi[0] = vshrn_n_s32(u_hi[0], DCT_CONST_BITS); - t_lo[1] = vshrn_n_s32(u_lo[1], DCT_CONST_BITS); - t_hi[1] = vshrn_n_s32(u_hi[1], DCT_CONST_BITS); - t_lo[2] = vshrn_n_s32(u_lo[2], DCT_CONST_BITS); - t_hi[2] = vshrn_n_s32(u_hi[2], DCT_CONST_BITS); - t_lo[3] = vshrn_n_s32(u_lo[3], DCT_CONST_BITS); - t_hi[3] = vshrn_n_s32(u_hi[3], DCT_CONST_BITS); - t_lo[4] = vshrn_n_s32(u_lo[4], DCT_CONST_BITS); - t_hi[4] = vshrn_n_s32(u_hi[4], DCT_CONST_BITS); - t_lo[5] = vshrn_n_s32(u_lo[5], DCT_CONST_BITS); - t_hi[5] = vshrn_n_s32(u_hi[5], DCT_CONST_BITS); - t_lo[6] = vshrn_n_s32(u_lo[6], DCT_CONST_BITS); - t_hi[6] = vshrn_n_s32(u_hi[6], DCT_CONST_BITS); - t_lo[7] = vshrn_n_s32(u_lo[7], DCT_CONST_BITS); - t_hi[7] = vshrn_n_s32(u_hi[7], DCT_CONST_BITS); + t_lo[0] = vrshrn_n_s32(u_lo[0], DCT_CONST_BITS); + t_hi[0] = vrshrn_n_s32(u_hi[0], DCT_CONST_BITS); + t_lo[1] = vrshrn_n_s32(u_lo[1], DCT_CONST_BITS); + t_hi[1] = vrshrn_n_s32(u_hi[1], DCT_CONST_BITS); + t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS); + t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS); + t_lo[3] = vrshrn_n_s32(u_lo[3], DCT_CONST_BITS); + t_hi[3] = vrshrn_n_s32(u_hi[3], DCT_CONST_BITS); + t_lo[4] = vrshrn_n_s32(u_lo[4], DCT_CONST_BITS); + t_hi[4] = vrshrn_n_s32(u_hi[4], DCT_CONST_BITS); + t_lo[5] = vrshrn_n_s32(u_lo[5], DCT_CONST_BITS); + t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS); + t_lo[6] = vrshrn_n_s32(u_lo[6], DCT_CONST_BITS); + t_hi[6] = vrshrn_n_s32(u_hi[6], DCT_CONST_BITS); + t_lo[7] = vrshrn_n_s32(u_lo[7], DCT_CONST_BITS); + t_hi[7] = vrshrn_n_s32(u_hi[7], DCT_CONST_BITS); in[0] = i[0]; in[2] = i[1]; @@ -827,7 +657,6 @@ static void fadst16_8col(int16x8_t *in) { int16x4_t x_lo[16], x_hi[16]; int32x4_t s_lo[16], s_hi[16]; int32x4_t t_lo[16], t_hi[16]; - const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING); x_lo[0] = vget_low_s16(in[15]); x_hi[0] = vget_high_s16(in[15]); @@ -864,185 +693,79 @@ static void fadst16_8col(int16x8_t *in) { // stage 1 // s0 = cospi_1_64 * x0 + cospi_31_64 * x1; - s_lo[0] = vaddq_s32(vmull_n_s16(x_lo[0], cospi_1_64), - vmull_n_s16(x_lo[1], cospi_31_64)); - s_hi[0] = 
vaddq_s32(vmull_n_s16(x_hi[0], cospi_1_64), - vmull_n_s16(x_hi[1], cospi_31_64)); // s1 = cospi_31_64 * x0 - cospi_1_64 * x1; - s_lo[1] = vsubq_s32(vmull_n_s16(x_lo[0], cospi_31_64), - vmull_n_s16(x_lo[1], cospi_1_64)); - s_hi[1] = vsubq_s32(vmull_n_s16(x_hi[0], cospi_31_64), - vmull_n_s16(x_hi[1], cospi_1_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[0], x_hi[0], x_lo[1], x_hi[1], + cospi_1_64, cospi_31_64, &s_lo[0], + &s_hi[0], &s_lo[1], &s_hi[1]); // s2 = cospi_5_64 * x2 + cospi_27_64 * x3; - s_lo[2] = vaddq_s32(vmull_n_s16(x_lo[2], cospi_5_64), - vmull_n_s16(x_lo[3], cospi_27_64)); - s_hi[2] = vaddq_s32(vmull_n_s16(x_hi[2], cospi_5_64), - vmull_n_s16(x_hi[3], cospi_27_64)); // s3 = cospi_27_64 * x2 - cospi_5_64 * x3; - s_lo[3] = vsubq_s32(vmull_n_s16(x_lo[2], cospi_27_64), - vmull_n_s16(x_lo[3], cospi_5_64)); - s_hi[3] = vsubq_s32(vmull_n_s16(x_hi[2], cospi_27_64), - vmull_n_s16(x_hi[3], cospi_5_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[2], x_hi[2], x_lo[3], x_hi[3], + cospi_5_64, cospi_27_64, &s_lo[2], + &s_hi[2], &s_lo[3], &s_hi[3]); // s4 = cospi_9_64 * x4 + cospi_23_64 * x5; - s_lo[4] = vaddq_s32(vmull_n_s16(x_lo[4], cospi_9_64), - vmull_n_s16(x_lo[5], cospi_23_64)); - s_hi[4] = vaddq_s32(vmull_n_s16(x_hi[4], cospi_9_64), - vmull_n_s16(x_hi[5], cospi_23_64)); // s5 = cospi_23_64 * x4 - cospi_9_64 * x5; - s_lo[5] = vsubq_s32(vmull_n_s16(x_lo[4], cospi_23_64), - vmull_n_s16(x_lo[5], cospi_9_64)); - s_hi[5] = vsubq_s32(vmull_n_s16(x_hi[4], cospi_23_64), - vmull_n_s16(x_hi[5], cospi_9_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[4], x_hi[4], x_lo[5], x_hi[5], + cospi_9_64, cospi_23_64, &s_lo[4], + &s_hi[4], &s_lo[5], &s_hi[5]); // s6 = cospi_13_64 * x6 + cospi_19_64 * x7; - s_lo[6] = vaddq_s32(vmull_n_s16(x_lo[6], cospi_13_64), - vmull_n_s16(x_lo[7], cospi_19_64)); - s_hi[6] = vaddq_s32(vmull_n_s16(x_hi[6], cospi_13_64), - vmull_n_s16(x_hi[7], cospi_19_64)); // s7 = cospi_19_64 * x6 - cospi_13_64 * x7; - s_lo[7] = vsubq_s32(vmull_n_s16(x_lo[6], cospi_19_64), - vmull_n_s16(x_lo[7], cospi_13_64)); - s_hi[7] = vsubq_s32(vmull_n_s16(x_hi[6], cospi_19_64), - vmull_n_s16(x_hi[7], cospi_13_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[6], x_hi[6], x_lo[7], x_hi[7], + cospi_13_64, cospi_19_64, &s_lo[6], + &s_hi[6], &s_lo[7], &s_hi[7]); // s8 = cospi_17_64 * x8 + cospi_15_64 * x9; - s_lo[8] = vaddq_s32(vmull_n_s16(x_lo[8], cospi_17_64), - vmull_n_s16(x_lo[9], cospi_15_64)); - s_hi[8] = vaddq_s32(vmull_n_s16(x_hi[8], cospi_17_64), - vmull_n_s16(x_hi[9], cospi_15_64)); // s9 = cospi_15_64 * x8 - cospi_17_64 * x9; - s_lo[9] = vsubq_s32(vmull_n_s16(x_lo[8], cospi_15_64), - vmull_n_s16(x_lo[9], cospi_17_64)); - s_hi[9] = vsubq_s32(vmull_n_s16(x_hi[8], cospi_15_64), - vmull_n_s16(x_hi[9], cospi_17_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[8], x_hi[8], x_lo[9], x_hi[9], + cospi_17_64, cospi_15_64, &s_lo[8], + &s_hi[8], &s_lo[9], &s_hi[9]); // s10 = cospi_21_64 * x10 + cospi_11_64 * x11; - s_lo[10] = vaddq_s32(vmull_n_s16(x_lo[10], cospi_21_64), - vmull_n_s16(x_lo[11], cospi_11_64)); - s_hi[10] = vaddq_s32(vmull_n_s16(x_hi[10], cospi_21_64), - vmull_n_s16(x_hi[11], cospi_11_64)); // s11 = cospi_11_64 * x10 - cospi_21_64 * x11; - s_lo[11] = vsubq_s32(vmull_n_s16(x_lo[10], cospi_11_64), - vmull_n_s16(x_lo[11], cospi_21_64)); - s_hi[11] = vsubq_s32(vmull_n_s16(x_hi[10], cospi_11_64), - vmull_n_s16(x_hi[11], cospi_21_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[10], x_hi[10], x_lo[11], x_hi[11], + cospi_21_64, cospi_11_64, &s_lo[10], + &s_hi[10], &s_lo[11], &s_hi[11]); // s12 = 
cospi_25_64 * x12 + cospi_7_64 * x13; - s_lo[12] = vaddq_s32(vmull_n_s16(x_lo[12], cospi_25_64), - vmull_n_s16(x_lo[13], cospi_7_64)); - s_hi[12] = vaddq_s32(vmull_n_s16(x_hi[12], cospi_25_64), - vmull_n_s16(x_hi[13], cospi_7_64)); // s13 = cospi_7_64 * x12 - cospi_25_64 * x13; - s_lo[13] = vsubq_s32(vmull_n_s16(x_lo[12], cospi_7_64), - vmull_n_s16(x_lo[13], cospi_25_64)); - s_hi[13] = vsubq_s32(vmull_n_s16(x_hi[12], cospi_7_64), - vmull_n_s16(x_hi[13], cospi_25_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[12], x_hi[12], x_lo[13], x_hi[13], + cospi_25_64, cospi_7_64, &s_lo[12], + &s_hi[12], &s_lo[13], &s_hi[13]); // s14 = cospi_29_64 * x14 + cospi_3_64 * x15; - s_lo[14] = vaddq_s32(vmull_n_s16(x_lo[14], cospi_29_64), - vmull_n_s16(x_lo[15], cospi_3_64)); - s_hi[14] = vaddq_s32(vmull_n_s16(x_hi[14], cospi_29_64), - vmull_n_s16(x_hi[15], cospi_3_64)); // s15 = cospi_3_64 * x14 - cospi_29_64 * x15; - s_lo[15] = vsubq_s32(vmull_n_s16(x_lo[14], cospi_3_64), - vmull_n_s16(x_lo[15], cospi_29_64)); - s_hi[15] = vsubq_s32(vmull_n_s16(x_hi[14], cospi_3_64), - vmull_n_s16(x_hi[15], cospi_29_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[14], x_hi[14], x_lo[15], x_hi[15], + cospi_29_64, cospi_3_64, &s_lo[14], + &s_hi[14], &s_lo[15], &s_hi[15]); // fdct_round_shift - t_lo[0] = vaddq_s32(s_lo[0], s_lo[8]); - t_hi[0] = vaddq_s32(s_hi[0], s_hi[8]); - t_lo[1] = vaddq_s32(s_lo[1], s_lo[9]); - t_hi[1] = vaddq_s32(s_hi[1], s_hi[9]); - t_lo[2] = vaddq_s32(s_lo[2], s_lo[10]); - t_hi[2] = vaddq_s32(s_hi[2], s_hi[10]); - t_lo[3] = vaddq_s32(s_lo[3], s_lo[11]); - t_hi[3] = vaddq_s32(s_hi[3], s_hi[11]); - t_lo[4] = vaddq_s32(s_lo[4], s_lo[12]); - t_hi[4] = vaddq_s32(s_hi[4], s_hi[12]); - t_lo[5] = vaddq_s32(s_lo[5], s_lo[13]); - t_hi[5] = vaddq_s32(s_hi[5], s_hi[13]); - t_lo[6] = vaddq_s32(s_lo[6], s_lo[14]); - t_hi[6] = vaddq_s32(s_hi[6], s_hi[14]); - t_lo[7] = vaddq_s32(s_lo[7], s_lo[15]); - t_hi[7] = vaddq_s32(s_hi[7], s_hi[15]); - t_lo[8] = vsubq_s32(s_lo[0], s_lo[8]); - t_hi[8] = vsubq_s32(s_hi[0], s_hi[8]); - t_lo[9] = vsubq_s32(s_lo[1], s_lo[9]); - t_hi[9] = vsubq_s32(s_hi[1], s_hi[9]); - t_lo[10] = vsubq_s32(s_lo[2], s_lo[10]); - t_hi[10] = vsubq_s32(s_hi[2], s_hi[10]); - t_lo[11] = vsubq_s32(s_lo[3], s_lo[11]); - t_hi[11] = vsubq_s32(s_hi[3], s_hi[11]); - t_lo[12] = vsubq_s32(s_lo[4], s_lo[12]); - t_hi[12] = vsubq_s32(s_hi[4], s_hi[12]); - t_lo[13] = vsubq_s32(s_lo[5], s_lo[13]); - t_hi[13] = vsubq_s32(s_hi[5], s_hi[13]); - t_lo[14] = vsubq_s32(s_lo[6], s_lo[14]); - t_hi[14] = vsubq_s32(s_hi[6], s_hi[14]); - t_lo[15] = vsubq_s32(s_lo[7], s_lo[15]); - t_hi[15] = vsubq_s32(s_hi[7], s_hi[15]); - - t_lo[0] = vaddq_s32(t_lo[0], k__DCT_CONST_ROUNDING); - t_hi[0] = vaddq_s32(t_hi[0], k__DCT_CONST_ROUNDING); - t_lo[1] = vaddq_s32(t_lo[1], k__DCT_CONST_ROUNDING); - t_hi[1] = vaddq_s32(t_hi[1], k__DCT_CONST_ROUNDING); - t_lo[2] = vaddq_s32(t_lo[2], k__DCT_CONST_ROUNDING); - t_hi[2] = vaddq_s32(t_hi[2], k__DCT_CONST_ROUNDING); - t_lo[3] = vaddq_s32(t_lo[3], k__DCT_CONST_ROUNDING); - t_hi[3] = vaddq_s32(t_hi[3], k__DCT_CONST_ROUNDING); - t_lo[4] = vaddq_s32(t_lo[4], k__DCT_CONST_ROUNDING); - t_hi[4] = vaddq_s32(t_hi[4], k__DCT_CONST_ROUNDING); - t_lo[5] = vaddq_s32(t_lo[5], k__DCT_CONST_ROUNDING); - t_hi[5] = vaddq_s32(t_hi[5], k__DCT_CONST_ROUNDING); - t_lo[6] = vaddq_s32(t_lo[6], k__DCT_CONST_ROUNDING); - t_hi[6] = vaddq_s32(t_hi[6], k__DCT_CONST_ROUNDING); - t_lo[7] = vaddq_s32(t_lo[7], k__DCT_CONST_ROUNDING); - t_hi[7] = vaddq_s32(t_hi[7], k__DCT_CONST_ROUNDING); - t_lo[8] = vaddq_s32(t_lo[8], 
k__DCT_CONST_ROUNDING); - t_hi[8] = vaddq_s32(t_hi[8], k__DCT_CONST_ROUNDING); - t_lo[9] = vaddq_s32(t_lo[9], k__DCT_CONST_ROUNDING); - t_hi[9] = vaddq_s32(t_hi[9], k__DCT_CONST_ROUNDING); - t_lo[10] = vaddq_s32(t_lo[10], k__DCT_CONST_ROUNDING); - t_hi[10] = vaddq_s32(t_hi[10], k__DCT_CONST_ROUNDING); - t_lo[11] = vaddq_s32(t_lo[11], k__DCT_CONST_ROUNDING); - t_hi[11] = vaddq_s32(t_hi[11], k__DCT_CONST_ROUNDING); - t_lo[12] = vaddq_s32(t_lo[12], k__DCT_CONST_ROUNDING); - t_hi[12] = vaddq_s32(t_hi[12], k__DCT_CONST_ROUNDING); - t_lo[13] = vaddq_s32(t_lo[13], k__DCT_CONST_ROUNDING); - t_hi[13] = vaddq_s32(t_hi[13], k__DCT_CONST_ROUNDING); - t_lo[14] = vaddq_s32(t_lo[14], k__DCT_CONST_ROUNDING); - t_hi[14] = vaddq_s32(t_hi[14], k__DCT_CONST_ROUNDING); - t_lo[15] = vaddq_s32(t_lo[15], k__DCT_CONST_ROUNDING); - t_hi[15] = vaddq_s32(t_hi[15], k__DCT_CONST_ROUNDING); - - t_lo[0] = vshrq_n_s32(t_lo[0], DCT_CONST_BITS); - t_hi[0] = vshrq_n_s32(t_hi[0], DCT_CONST_BITS); - t_lo[1] = vshrq_n_s32(t_lo[1], DCT_CONST_BITS); - t_hi[1] = vshrq_n_s32(t_hi[1], DCT_CONST_BITS); - t_lo[2] = vshrq_n_s32(t_lo[2], DCT_CONST_BITS); - t_hi[2] = vshrq_n_s32(t_hi[2], DCT_CONST_BITS); - t_lo[3] = vshrq_n_s32(t_lo[3], DCT_CONST_BITS); - t_hi[3] = vshrq_n_s32(t_hi[3], DCT_CONST_BITS); - t_lo[4] = vshrq_n_s32(t_lo[4], DCT_CONST_BITS); - t_hi[4] = vshrq_n_s32(t_hi[4], DCT_CONST_BITS); - t_lo[5] = vshrq_n_s32(t_lo[5], DCT_CONST_BITS); - t_hi[5] = vshrq_n_s32(t_hi[5], DCT_CONST_BITS); - t_lo[6] = vshrq_n_s32(t_lo[6], DCT_CONST_BITS); - t_hi[6] = vshrq_n_s32(t_hi[6], DCT_CONST_BITS); - t_lo[7] = vshrq_n_s32(t_lo[7], DCT_CONST_BITS); - t_hi[7] = vshrq_n_s32(t_hi[7], DCT_CONST_BITS); - t_lo[8] = vshrq_n_s32(t_lo[8], DCT_CONST_BITS); - t_hi[8] = vshrq_n_s32(t_hi[8], DCT_CONST_BITS); - t_lo[9] = vshrq_n_s32(t_lo[9], DCT_CONST_BITS); - t_hi[9] = vshrq_n_s32(t_hi[9], DCT_CONST_BITS); - t_lo[10] = vshrq_n_s32(t_lo[10], DCT_CONST_BITS); - t_hi[10] = vshrq_n_s32(t_hi[10], DCT_CONST_BITS); - t_lo[11] = vshrq_n_s32(t_lo[11], DCT_CONST_BITS); - t_hi[11] = vshrq_n_s32(t_hi[11], DCT_CONST_BITS); - t_lo[12] = vshrq_n_s32(t_lo[12], DCT_CONST_BITS); - t_hi[12] = vshrq_n_s32(t_hi[12], DCT_CONST_BITS); - t_lo[13] = vshrq_n_s32(t_lo[13], DCT_CONST_BITS); - t_hi[13] = vshrq_n_s32(t_hi[13], DCT_CONST_BITS); - t_lo[14] = vshrq_n_s32(t_lo[14], DCT_CONST_BITS); - t_hi[14] = vshrq_n_s32(t_hi[14], DCT_CONST_BITS); - t_lo[15] = vshrq_n_s32(t_lo[15], DCT_CONST_BITS); - t_hi[15] = vshrq_n_s32(t_hi[15], DCT_CONST_BITS); + t_lo[0] = vrshrq_n_s32(vaddq_s32(s_lo[0], s_lo[8]), DCT_CONST_BITS); + t_hi[0] = vrshrq_n_s32(vaddq_s32(s_hi[0], s_hi[8]), DCT_CONST_BITS); + t_lo[1] = vrshrq_n_s32(vaddq_s32(s_lo[1], s_lo[9]), DCT_CONST_BITS); + t_hi[1] = vrshrq_n_s32(vaddq_s32(s_hi[1], s_hi[9]), DCT_CONST_BITS); + t_lo[2] = vrshrq_n_s32(vaddq_s32(s_lo[2], s_lo[10]), DCT_CONST_BITS); + t_hi[2] = vrshrq_n_s32(vaddq_s32(s_hi[2], s_hi[10]), DCT_CONST_BITS); + t_lo[3] = vrshrq_n_s32(vaddq_s32(s_lo[3], s_lo[11]), DCT_CONST_BITS); + t_hi[3] = vrshrq_n_s32(vaddq_s32(s_hi[3], s_hi[11]), DCT_CONST_BITS); + t_lo[4] = vrshrq_n_s32(vaddq_s32(s_lo[4], s_lo[12]), DCT_CONST_BITS); + t_hi[4] = vrshrq_n_s32(vaddq_s32(s_hi[4], s_hi[12]), DCT_CONST_BITS); + t_lo[5] = vrshrq_n_s32(vaddq_s32(s_lo[5], s_lo[13]), DCT_CONST_BITS); + t_hi[5] = vrshrq_n_s32(vaddq_s32(s_hi[5], s_hi[13]), DCT_CONST_BITS); + t_lo[6] = vrshrq_n_s32(vaddq_s32(s_lo[6], s_lo[14]), DCT_CONST_BITS); + t_hi[6] = vrshrq_n_s32(vaddq_s32(s_hi[6], s_hi[14]), DCT_CONST_BITS); + t_lo[7] = vrshrq_n_s32(vaddq_s32(s_lo[7], 
s_lo[15]), DCT_CONST_BITS); + t_hi[7] = vrshrq_n_s32(vaddq_s32(s_hi[7], s_hi[15]), DCT_CONST_BITS); + t_lo[8] = vrshrq_n_s32(vsubq_s32(s_lo[0], s_lo[8]), DCT_CONST_BITS); + t_hi[8] = vrshrq_n_s32(vsubq_s32(s_hi[0], s_hi[8]), DCT_CONST_BITS); + t_lo[9] = vrshrq_n_s32(vsubq_s32(s_lo[1], s_lo[9]), DCT_CONST_BITS); + t_hi[9] = vrshrq_n_s32(vsubq_s32(s_hi[1], s_hi[9]), DCT_CONST_BITS); + t_lo[10] = vrshrq_n_s32(vsubq_s32(s_lo[2], s_lo[10]), DCT_CONST_BITS); + t_hi[10] = vrshrq_n_s32(vsubq_s32(s_hi[2], s_hi[10]), DCT_CONST_BITS); + t_lo[11] = vrshrq_n_s32(vsubq_s32(s_lo[3], s_lo[11]), DCT_CONST_BITS); + t_hi[11] = vrshrq_n_s32(vsubq_s32(s_hi[3], s_hi[11]), DCT_CONST_BITS); + t_lo[12] = vrshrq_n_s32(vsubq_s32(s_lo[4], s_lo[12]), DCT_CONST_BITS); + t_hi[12] = vrshrq_n_s32(vsubq_s32(s_hi[4], s_hi[12]), DCT_CONST_BITS); + t_lo[13] = vrshrq_n_s32(vsubq_s32(s_lo[5], s_lo[13]), DCT_CONST_BITS); + t_hi[13] = vrshrq_n_s32(vsubq_s32(s_hi[5], s_hi[13]), DCT_CONST_BITS); + t_lo[14] = vrshrq_n_s32(vsubq_s32(s_lo[6], s_lo[14]), DCT_CONST_BITS); + t_hi[14] = vrshrq_n_s32(vsubq_s32(s_hi[6], s_hi[14]), DCT_CONST_BITS); + t_lo[15] = vrshrq_n_s32(vsubq_s32(s_lo[7], s_lo[15]), DCT_CONST_BITS); + t_hi[15] = vrshrq_n_s32(vsubq_s32(s_hi[7], s_hi[15]), DCT_CONST_BITS); // stage 2 s_lo[0] = t_lo[0]; @@ -1062,45 +785,25 @@ static void fadst16_8col(int16x8_t *in) { s_lo[7] = t_lo[7]; s_hi[7] = t_hi[7]; // s8 = x8 * cospi_4_64 + x9 * cospi_28_64; - s_lo[8] = vaddq_s32(vmulq_n_s32(t_lo[8], cospi_4_64), - vmulq_n_s32(t_lo[9], cospi_28_64)); - s_hi[8] = vaddq_s32(vmulq_n_s32(t_hi[8], cospi_4_64), - vmulq_n_s32(t_hi[9], cospi_28_64)); // s9 = x8 * cospi_28_64 - x9 * cospi_4_64; - s_lo[9] = vsubq_s32(vmulq_n_s32(t_lo[8], cospi_28_64), - vmulq_n_s32(t_lo[9], cospi_4_64)); - s_hi[9] = vsubq_s32(vmulq_n_s32(t_hi[8], cospi_28_64), - vmulq_n_s32(t_hi[9], cospi_4_64)); + butterfly_two_coeff_s32_noround(t_lo[8], t_hi[8], t_lo[9], t_hi[9], + cospi_4_64, cospi_28_64, &s_lo[8], &s_hi[8], + &s_lo[9], &s_hi[9]); // s10 = x10 * cospi_20_64 + x11 * cospi_12_64; - s_lo[10] = vaddq_s32(vmulq_n_s32(t_lo[10], cospi_20_64), - vmulq_n_s32(t_lo[11], cospi_12_64)); - s_hi[10] = vaddq_s32(vmulq_n_s32(t_hi[10], cospi_20_64), - vmulq_n_s32(t_hi[11], cospi_12_64)); // s11 = x10 * cospi_12_64 - x11 * cospi_20_64; - s_lo[11] = vsubq_s32(vmulq_n_s32(t_lo[10], cospi_12_64), - vmulq_n_s32(t_lo[11], cospi_20_64)); - s_hi[11] = vsubq_s32(vmulq_n_s32(t_hi[10], cospi_12_64), - vmulq_n_s32(t_hi[11], cospi_20_64)); + butterfly_two_coeff_s32_noround(t_lo[10], t_hi[10], t_lo[11], t_hi[11], + cospi_20_64, cospi_12_64, &s_lo[10], + &s_hi[10], &s_lo[11], &s_hi[11]); // s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; - s_lo[12] = vaddq_s32(vmulq_n_s32(t_lo[12], -cospi_28_64), - vmulq_n_s32(t_lo[13], cospi_4_64)); - s_hi[12] = vaddq_s32(vmulq_n_s32(t_hi[12], -cospi_28_64), - vmulq_n_s32(t_hi[13], cospi_4_64)); // s13 = x12 * cospi_4_64 + x13 * cospi_28_64; - s_lo[13] = vaddq_s32(vmulq_n_s32(t_lo[12], cospi_4_64), - vmulq_n_s32(t_lo[13], cospi_28_64)); - s_hi[13] = vaddq_s32(vmulq_n_s32(t_hi[12], cospi_4_64), - vmulq_n_s32(t_hi[13], cospi_28_64)); + butterfly_two_coeff_s32_noround(t_lo[13], t_hi[13], t_lo[12], t_hi[12], + cospi_28_64, cospi_4_64, &s_lo[13], &s_hi[13], + &s_lo[12], &s_hi[12]); // s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; - s_lo[14] = vaddq_s32(vmulq_n_s32(t_lo[14], -cospi_12_64), - vmulq_n_s32(t_lo[15], cospi_20_64)); - s_hi[14] = vaddq_s32(vmulq_n_s32(t_hi[14], -cospi_12_64), - vmulq_n_s32(t_hi[15], cospi_20_64)); // s15 = x14 * cospi_20_64 + x15 * 
cospi_12_64; - s_lo[15] = vaddq_s32(vmulq_n_s32(t_lo[14], cospi_20_64), - vmulq_n_s32(t_lo[15], cospi_12_64)); - s_hi[15] = vaddq_s32(vmulq_n_s32(t_hi[14], cospi_20_64), - vmulq_n_s32(t_hi[15], cospi_12_64)); + butterfly_two_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14], + cospi_12_64, cospi_20_64, &s_lo[15], + &s_hi[15], &s_lo[14], &s_hi[14]); // s0 + s4 t_lo[0] = vaddq_s32(s_lo[0], s_lo[4]); @@ -1151,38 +854,22 @@ static void fadst16_8col(int16x8_t *in) { t_lo[15] = vsubq_s32(s_lo[11], s_lo[15]); t_hi[15] = vsubq_s32(s_hi[11], s_hi[15]); - t_lo[8] = vaddq_s32(t_lo[8], k__DCT_CONST_ROUNDING); - t_hi[8] = vaddq_s32(t_hi[8], k__DCT_CONST_ROUNDING); - t_lo[9] = vaddq_s32(t_lo[9], k__DCT_CONST_ROUNDING); - t_hi[9] = vaddq_s32(t_hi[9], k__DCT_CONST_ROUNDING); - t_lo[10] = vaddq_s32(t_lo[10], k__DCT_CONST_ROUNDING); - t_hi[10] = vaddq_s32(t_hi[10], k__DCT_CONST_ROUNDING); - t_lo[11] = vaddq_s32(t_lo[11], k__DCT_CONST_ROUNDING); - t_hi[11] = vaddq_s32(t_hi[11], k__DCT_CONST_ROUNDING); - t_lo[12] = vaddq_s32(t_lo[12], k__DCT_CONST_ROUNDING); - t_hi[12] = vaddq_s32(t_hi[12], k__DCT_CONST_ROUNDING); - t_lo[13] = vaddq_s32(t_lo[13], k__DCT_CONST_ROUNDING); - t_hi[13] = vaddq_s32(t_hi[13], k__DCT_CONST_ROUNDING); - t_lo[14] = vaddq_s32(t_lo[14], k__DCT_CONST_ROUNDING); - t_hi[14] = vaddq_s32(t_hi[14], k__DCT_CONST_ROUNDING); - t_lo[15] = vaddq_s32(t_lo[15], k__DCT_CONST_ROUNDING); - t_hi[15] = vaddq_s32(t_hi[15], k__DCT_CONST_ROUNDING); - t_lo[8] = vshrq_n_s32(t_lo[8], DCT_CONST_BITS); - t_hi[8] = vshrq_n_s32(t_hi[8], DCT_CONST_BITS); - t_lo[9] = vshrq_n_s32(t_lo[9], DCT_CONST_BITS); - t_hi[9] = vshrq_n_s32(t_hi[9], DCT_CONST_BITS); - t_lo[10] = vshrq_n_s32(t_lo[10], DCT_CONST_BITS); - t_hi[10] = vshrq_n_s32(t_hi[10], DCT_CONST_BITS); - t_lo[11] = vshrq_n_s32(t_lo[11], DCT_CONST_BITS); - t_hi[11] = vshrq_n_s32(t_hi[11], DCT_CONST_BITS); - t_lo[12] = vshrq_n_s32(t_lo[12], DCT_CONST_BITS); - t_hi[12] = vshrq_n_s32(t_hi[12], DCT_CONST_BITS); - t_lo[13] = vshrq_n_s32(t_lo[13], DCT_CONST_BITS); - t_hi[13] = vshrq_n_s32(t_hi[13], DCT_CONST_BITS); - t_lo[14] = vshrq_n_s32(t_lo[14], DCT_CONST_BITS); - t_hi[14] = vshrq_n_s32(t_hi[14], DCT_CONST_BITS); - t_lo[15] = vshrq_n_s32(t_lo[15], DCT_CONST_BITS); - t_hi[15] = vshrq_n_s32(t_hi[15], DCT_CONST_BITS); + t_lo[8] = vrshrq_n_s32(t_lo[8], DCT_CONST_BITS); + t_hi[8] = vrshrq_n_s32(t_hi[8], DCT_CONST_BITS); + t_lo[9] = vrshrq_n_s32(t_lo[9], DCT_CONST_BITS); + t_hi[9] = vrshrq_n_s32(t_hi[9], DCT_CONST_BITS); + t_lo[10] = vrshrq_n_s32(t_lo[10], DCT_CONST_BITS); + t_hi[10] = vrshrq_n_s32(t_hi[10], DCT_CONST_BITS); + t_lo[11] = vrshrq_n_s32(t_lo[11], DCT_CONST_BITS); + t_hi[11] = vrshrq_n_s32(t_hi[11], DCT_CONST_BITS); + t_lo[12] = vrshrq_n_s32(t_lo[12], DCT_CONST_BITS); + t_hi[12] = vrshrq_n_s32(t_hi[12], DCT_CONST_BITS); + t_lo[13] = vrshrq_n_s32(t_lo[13], DCT_CONST_BITS); + t_hi[13] = vrshrq_n_s32(t_hi[13], DCT_CONST_BITS); + t_lo[14] = vrshrq_n_s32(t_lo[14], DCT_CONST_BITS); + t_hi[14] = vrshrq_n_s32(t_hi[14], DCT_CONST_BITS); + t_lo[15] = vrshrq_n_s32(t_lo[15], DCT_CONST_BITS); + t_hi[15] = vrshrq_n_s32(t_hi[15], DCT_CONST_BITS); // stage 3 s_lo[0] = t_lo[0]; @@ -1194,25 +881,15 @@ static void fadst16_8col(int16x8_t *in) { s_lo[3] = t_lo[3]; s_hi[3] = t_hi[3]; // s4 = x4 * cospi_8_64 + x5 * cospi_24_64; - s_lo[4] = vaddq_s32(vmulq_n_s32(t_lo[4], cospi_8_64), - vmulq_n_s32(t_lo[5], cospi_24_64)); - s_hi[4] = vaddq_s32(vmulq_n_s32(t_hi[4], cospi_8_64), - vmulq_n_s32(t_hi[5], cospi_24_64)); // s5 = x4 * cospi_24_64 - x5 * cospi_8_64; - s_lo[5] = 
vaddq_s32(vmulq_n_s32(t_lo[4], cospi_24_64), - vmulq_n_s32(t_lo[5], -cospi_8_64)); - s_hi[5] = vaddq_s32(vmulq_n_s32(t_hi[4], cospi_24_64), - vmulq_n_s32(t_hi[5], -cospi_8_64)); + butterfly_two_coeff_s32_noround(t_lo[4], t_hi[4], t_lo[5], t_hi[5], + cospi_8_64, cospi_24_64, &s_lo[4], &s_hi[4], + &s_lo[5], &s_hi[5]); // s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; - s_lo[6] = vaddq_s32(vmulq_n_s32(t_lo[6], -cospi_24_64), - vmulq_n_s32(t_lo[7], cospi_8_64)); - s_hi[6] = vaddq_s32(vmulq_n_s32(t_hi[6], -cospi_24_64), - vmulq_n_s32(t_hi[7], cospi_8_64)); // s7 = x6 * cospi_8_64 + x7 * cospi_24_64; - s_lo[7] = vaddq_s32(vmulq_n_s32(t_lo[6], cospi_8_64), - vmulq_n_s32(t_lo[7], cospi_24_64)); - s_hi[7] = vaddq_s32(vmulq_n_s32(t_hi[6], cospi_8_64), - vmulq_n_s32(t_hi[7], cospi_24_64)); + butterfly_two_coeff_s32_noround(t_lo[7], t_hi[7], t_lo[6], t_hi[6], + cospi_24_64, cospi_8_64, &s_lo[7], &s_hi[7], + &s_lo[6], &s_hi[6]); s_lo[8] = t_lo[8]; s_hi[8] = t_hi[8]; s_lo[9] = t_lo[9]; @@ -1222,25 +899,15 @@ static void fadst16_8col(int16x8_t *in) { s_lo[11] = t_lo[11]; s_hi[11] = t_hi[11]; // s12 = x12 * cospi_8_64 + x13 * cospi_24_64; - s_lo[12] = vaddq_s32(vmulq_n_s32(t_lo[12], cospi_8_64), - vmulq_n_s32(t_lo[13], cospi_24_64)); - s_hi[12] = vaddq_s32(vmulq_n_s32(t_hi[12], cospi_8_64), - vmulq_n_s32(t_hi[13], cospi_24_64)); // s13 = x12 * cospi_24_64 - x13 * cospi_8_64; - s_lo[13] = vaddq_s32(vmulq_n_s32(t_lo[12], cospi_24_64), - vmulq_n_s32(t_lo[13], -cospi_8_64)); - s_hi[13] = vaddq_s32(vmulq_n_s32(t_hi[12], cospi_24_64), - vmulq_n_s32(t_hi[13], -cospi_8_64)); + butterfly_two_coeff_s32_noround(t_lo[12], t_hi[12], t_lo[13], t_hi[13], + cospi_8_64, cospi_24_64, &s_lo[12], &s_hi[12], + &s_lo[13], &s_hi[13]); // s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; - s_lo[14] = vaddq_s32(vmulq_n_s32(t_lo[14], -cospi_24_64), - vmulq_n_s32(t_lo[15], cospi_8_64)); - s_hi[14] = vaddq_s32(vmulq_n_s32(t_hi[14], -cospi_24_64), - vmulq_n_s32(t_hi[15], cospi_8_64)); // s15 = x14 * cospi_8_64 + x15 * cospi_24_64; - s_lo[15] = vaddq_s32(vmulq_n_s32(t_lo[14], cospi_8_64), - vmulq_n_s32(t_lo[15], cospi_24_64)); - s_hi[15] = vaddq_s32(vmulq_n_s32(t_hi[14], cospi_8_64), - vmulq_n_s32(t_hi[15], cospi_24_64)); + butterfly_two_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14], + cospi_24_64, cospi_8_64, &s_lo[15], &s_hi[15], + &s_lo[14], &s_hi[14]); // s0 + s4 t_lo[0] = vaddq_s32(s_lo[0], s_lo[2]); @@ -1291,99 +958,62 @@ static void fadst16_8col(int16x8_t *in) { t_lo[15] = vsubq_s32(s_lo[13], s_lo[15]); t_hi[15] = vsubq_s32(s_hi[13], s_hi[15]); - t_lo[4] = vaddq_s32(t_lo[4], k__DCT_CONST_ROUNDING); - t_hi[4] = vaddq_s32(t_hi[4], k__DCT_CONST_ROUNDING); - t_lo[5] = vaddq_s32(t_lo[5], k__DCT_CONST_ROUNDING); - t_hi[5] = vaddq_s32(t_hi[5], k__DCT_CONST_ROUNDING); - t_lo[6] = vaddq_s32(t_lo[6], k__DCT_CONST_ROUNDING); - t_hi[6] = vaddq_s32(t_hi[6], k__DCT_CONST_ROUNDING); - t_lo[7] = vaddq_s32(t_lo[7], k__DCT_CONST_ROUNDING); - t_hi[7] = vaddq_s32(t_hi[7], k__DCT_CONST_ROUNDING); - t_lo[12] = vaddq_s32(t_lo[12], k__DCT_CONST_ROUNDING); - t_hi[12] = vaddq_s32(t_hi[12], k__DCT_CONST_ROUNDING); - t_lo[13] = vaddq_s32(t_lo[13], k__DCT_CONST_ROUNDING); - t_hi[13] = vaddq_s32(t_hi[13], k__DCT_CONST_ROUNDING); - t_lo[14] = vaddq_s32(t_lo[14], k__DCT_CONST_ROUNDING); - t_hi[14] = vaddq_s32(t_hi[14], k__DCT_CONST_ROUNDING); - t_lo[15] = vaddq_s32(t_lo[15], k__DCT_CONST_ROUNDING); - t_hi[15] = vaddq_s32(t_hi[15], k__DCT_CONST_ROUNDING); - t_lo[4] = vshrq_n_s32(t_lo[4], DCT_CONST_BITS); - t_hi[4] = vshrq_n_s32(t_hi[4], DCT_CONST_BITS); - 
t_lo[5] = vshrq_n_s32(t_lo[5], DCT_CONST_BITS); - t_hi[5] = vshrq_n_s32(t_hi[5], DCT_CONST_BITS); - t_lo[6] = vshrq_n_s32(t_lo[6], DCT_CONST_BITS); - t_hi[6] = vshrq_n_s32(t_hi[6], DCT_CONST_BITS); - t_lo[7] = vshrq_n_s32(t_lo[7], DCT_CONST_BITS); - t_hi[7] = vshrq_n_s32(t_hi[7], DCT_CONST_BITS); - t_lo[12] = vshrq_n_s32(t_lo[12], DCT_CONST_BITS); - t_hi[12] = vshrq_n_s32(t_hi[12], DCT_CONST_BITS); - t_lo[13] = vshrq_n_s32(t_lo[13], DCT_CONST_BITS); - t_hi[13] = vshrq_n_s32(t_hi[13], DCT_CONST_BITS); - t_lo[14] = vshrq_n_s32(t_lo[14], DCT_CONST_BITS); - t_hi[14] = vshrq_n_s32(t_hi[14], DCT_CONST_BITS); - t_lo[15] = vshrq_n_s32(t_lo[15], DCT_CONST_BITS); - t_hi[15] = vshrq_n_s32(t_hi[15], DCT_CONST_BITS); + t_lo[4] = vrshrq_n_s32(t_lo[4], DCT_CONST_BITS); + t_hi[4] = vrshrq_n_s32(t_hi[4], DCT_CONST_BITS); + t_lo[5] = vrshrq_n_s32(t_lo[5], DCT_CONST_BITS); + t_hi[5] = vrshrq_n_s32(t_hi[5], DCT_CONST_BITS); + t_lo[6] = vrshrq_n_s32(t_lo[6], DCT_CONST_BITS); + t_hi[6] = vrshrq_n_s32(t_hi[6], DCT_CONST_BITS); + t_lo[7] = vrshrq_n_s32(t_lo[7], DCT_CONST_BITS); + t_hi[7] = vrshrq_n_s32(t_hi[7], DCT_CONST_BITS); + t_lo[12] = vrshrq_n_s32(t_lo[12], DCT_CONST_BITS); + t_hi[12] = vrshrq_n_s32(t_hi[12], DCT_CONST_BITS); + t_lo[13] = vrshrq_n_s32(t_lo[13], DCT_CONST_BITS); + t_hi[13] = vrshrq_n_s32(t_hi[13], DCT_CONST_BITS); + t_lo[14] = vrshrq_n_s32(t_lo[14], DCT_CONST_BITS); + t_hi[14] = vrshrq_n_s32(t_hi[14], DCT_CONST_BITS); + t_lo[15] = vrshrq_n_s32(t_lo[15], DCT_CONST_BITS); + t_hi[15] = vrshrq_n_s32(t_hi[15], DCT_CONST_BITS); // stage 4 // s2 = (-cospi_16_64) * (x2 + x3); - s_lo[2] = vmulq_n_s32(vaddq_s32(t_lo[2], t_lo[3]), -cospi_16_64); - s_hi[2] = vmulq_n_s32(vaddq_s32(t_hi[2], t_hi[3]), -cospi_16_64); // s3 = cospi_16_64 * (x2 - x3); - s_lo[3] = vmulq_n_s32(vsubq_s32(t_lo[2], t_lo[3]), cospi_16_64); - s_hi[3] = vmulq_n_s32(vsubq_s32(t_hi[2], t_hi[3]), cospi_16_64); + butterfly_one_coeff_s32_noround(t_lo[3], t_hi[3], t_lo[2], t_hi[2], + -cospi_16_64, &s_lo[2], &s_hi[2], &s_lo[3], + &s_hi[3]); // s6 = cospi_16_64 * (x6 + x7); - s_lo[6] = vmulq_n_s32(vaddq_s32(t_lo[6], t_lo[7]), cospi_16_64); - s_hi[6] = vmulq_n_s32(vaddq_s32(t_hi[6], t_hi[7]), cospi_16_64); // s7 = cospi_16_64 * (-x6 + x7); - s_lo[7] = vmulq_n_s32(vsubq_s32(t_lo[7], t_lo[6]), cospi_16_64); - s_hi[7] = vmulq_n_s32(vsubq_s32(t_hi[7], t_hi[6]), cospi_16_64); + butterfly_one_coeff_s32_noround(t_lo[7], t_hi[7], t_lo[6], t_hi[6], + cospi_16_64, &s_lo[6], &s_hi[6], &s_lo[7], + &s_hi[7]); // s10 = cospi_16_64 * (x10 + x11); - s_lo[10] = vmulq_n_s32(vaddq_s32(t_lo[10], t_lo[11]), cospi_16_64); - s_hi[10] = vmulq_n_s32(vaddq_s32(t_hi[10], t_hi[11]), cospi_16_64); // s11 = cospi_16_64 * (-x10 + x11); - s_lo[11] = vmulq_n_s32(vsubq_s32(t_lo[11], t_lo[10]), cospi_16_64); - s_hi[11] = vmulq_n_s32(vsubq_s32(t_hi[11], t_hi[10]), cospi_16_64); + butterfly_one_coeff_s32_noround(t_lo[11], t_hi[11], t_lo[10], t_hi[10], + cospi_16_64, &s_lo[10], &s_hi[10], &s_lo[11], + &s_hi[11]); // s14 = (-cospi_16_64) * (x14 + x15); - s_lo[14] = vmulq_n_s32(vaddq_s32(t_lo[14], t_lo[15]), -cospi_16_64); - s_hi[14] = vmulq_n_s32(vaddq_s32(t_hi[14], t_hi[15]), -cospi_16_64); // s15 = cospi_16_64 * (x14 - x15); - s_lo[15] = vmulq_n_s32(vsubq_s32(t_lo[14], t_lo[15]), cospi_16_64); - s_hi[15] = vmulq_n_s32(vsubq_s32(t_hi[14], t_hi[15]), cospi_16_64); + butterfly_one_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14], + -cospi_16_64, &s_lo[14], &s_hi[14], &s_lo[15], + &s_hi[15]); // final fdct_round_shift - t_lo[2] = vaddq_s32(s_lo[2], k__DCT_CONST_ROUNDING); - 
t_hi[2] = vaddq_s32(s_hi[2], k__DCT_CONST_ROUNDING); - t_lo[3] = vaddq_s32(s_lo[3], k__DCT_CONST_ROUNDING); - t_hi[3] = vaddq_s32(s_hi[3], k__DCT_CONST_ROUNDING); - t_lo[6] = vaddq_s32(s_lo[6], k__DCT_CONST_ROUNDING); - t_hi[6] = vaddq_s32(s_hi[6], k__DCT_CONST_ROUNDING); - t_lo[7] = vaddq_s32(s_lo[7], k__DCT_CONST_ROUNDING); - t_hi[7] = vaddq_s32(s_hi[7], k__DCT_CONST_ROUNDING); - t_lo[10] = vaddq_s32(s_lo[10], k__DCT_CONST_ROUNDING); - t_hi[10] = vaddq_s32(s_hi[10], k__DCT_CONST_ROUNDING); - t_lo[11] = vaddq_s32(s_lo[11], k__DCT_CONST_ROUNDING); - t_hi[11] = vaddq_s32(s_hi[11], k__DCT_CONST_ROUNDING); - t_lo[14] = vaddq_s32(s_lo[14], k__DCT_CONST_ROUNDING); - t_hi[14] = vaddq_s32(s_hi[14], k__DCT_CONST_ROUNDING); - t_lo[15] = vaddq_s32(s_lo[15], k__DCT_CONST_ROUNDING); - t_hi[15] = vaddq_s32(s_hi[15], k__DCT_CONST_ROUNDING); - - x_lo[2] = vshrn_n_s32(t_lo[2], DCT_CONST_BITS); - x_hi[2] = vshrn_n_s32(t_hi[2], DCT_CONST_BITS); - x_lo[3] = vshrn_n_s32(t_lo[3], DCT_CONST_BITS); - x_hi[3] = vshrn_n_s32(t_hi[3], DCT_CONST_BITS); - x_lo[6] = vshrn_n_s32(t_lo[6], DCT_CONST_BITS); - x_hi[6] = vshrn_n_s32(t_hi[6], DCT_CONST_BITS); - x_lo[7] = vshrn_n_s32(t_lo[7], DCT_CONST_BITS); - x_hi[7] = vshrn_n_s32(t_hi[7], DCT_CONST_BITS); - x_lo[10] = vshrn_n_s32(t_lo[10], DCT_CONST_BITS); - x_hi[10] = vshrn_n_s32(t_hi[10], DCT_CONST_BITS); - x_lo[11] = vshrn_n_s32(t_lo[11], DCT_CONST_BITS); - x_hi[11] = vshrn_n_s32(t_hi[11], DCT_CONST_BITS); - x_lo[14] = vshrn_n_s32(t_lo[14], DCT_CONST_BITS); - x_hi[14] = vshrn_n_s32(t_hi[14], DCT_CONST_BITS); - x_lo[15] = vshrn_n_s32(t_lo[15], DCT_CONST_BITS); - x_hi[15] = vshrn_n_s32(t_hi[15], DCT_CONST_BITS); + x_lo[2] = vrshrn_n_s32(s_lo[2], DCT_CONST_BITS); + x_hi[2] = vrshrn_n_s32(s_hi[2], DCT_CONST_BITS); + x_lo[3] = vrshrn_n_s32(s_lo[3], DCT_CONST_BITS); + x_hi[3] = vrshrn_n_s32(s_hi[3], DCT_CONST_BITS); + x_lo[6] = vrshrn_n_s32(s_lo[6], DCT_CONST_BITS); + x_hi[6] = vrshrn_n_s32(s_hi[6], DCT_CONST_BITS); + x_lo[7] = vrshrn_n_s32(s_lo[7], DCT_CONST_BITS); + x_hi[7] = vrshrn_n_s32(s_hi[7], DCT_CONST_BITS); + x_lo[10] = vrshrn_n_s32(s_lo[10], DCT_CONST_BITS); + x_hi[10] = vrshrn_n_s32(s_hi[10], DCT_CONST_BITS); + x_lo[11] = vrshrn_n_s32(s_lo[11], DCT_CONST_BITS); + x_hi[11] = vrshrn_n_s32(s_hi[11], DCT_CONST_BITS); + x_lo[14] = vrshrn_n_s32(s_lo[14], DCT_CONST_BITS); + x_hi[14] = vrshrn_n_s32(s_hi[14], DCT_CONST_BITS); + x_lo[15] = vrshrn_n_s32(s_lo[15], DCT_CONST_BITS); + x_hi[15] = vrshrn_n_s32(s_hi[15], DCT_CONST_BITS); // x0, x1, x4, x5, x8, x9, x12, x13 narrow down to 16-bits directly x_lo[0] = vmovn_s32(t_lo[0]); @@ -1465,3 +1095,137 @@ void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride, break; } } + +#if CONFIG_VP9_HIGHBITDEPTH + +static INLINE void highbd_load_buffer_4x4(const int16_t *input, + int32x4_t *in /*[4]*/, int stride) { + // { 0, 1, 1, 1 }; + const int32x4_t nonzero_bias_a = vextq_s32(vdupq_n_s32(0), vdupq_n_s32(1), 3); + // { 1, 0, 0, 0 }; + const int32x4_t nonzero_bias_b = vextq_s32(vdupq_n_s32(1), vdupq_n_s32(0), 3); + int32x4_t mask; + + in[0] = vshll_n_s16(vld1_s16(input + 0 * stride), 4); + in[1] = vshll_n_s16(vld1_s16(input + 1 * stride), 4); + in[2] = vshll_n_s16(vld1_s16(input + 2 * stride), 4); + in[3] = vshll_n_s16(vld1_s16(input + 3 * stride), 4); + + // Copy the SSE method, use a mask to avoid an 'if' branch here to increase by + // one non-zero first elements + mask = vreinterpretq_s32_u32(vceqq_s32(in[0], nonzero_bias_a)); + in[0] = vaddq_s32(in[0], mask); + in[0] = vaddq_s32(in[0], nonzero_bias_b); +} + 
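Before the store helper, it helps to see the column arithmetic that
highbd_fadst4x4_neon() below vectorizes. The following is a scalar sketch
only, assuming the fixed-point sin(k*pi/9) constants and the DCT_CONST_BITS
value from vpx_dsp/txfm_common.h; the helper names and the sample inputs are
illustrative and not part of the patch:

#include <stdint.h>
#include <stdio.h>

/* Fixed-point constants from vpx_dsp/txfm_common.h. */
enum { DCT_CONST_BITS = 14 };
static const int64_t kSinPi[5] = { 0, 5283, 9929, 13377, 15212 };

/* Round-to-nearest right shift, the scalar equivalent of vrshrn_n_s64(). */
static int32_t round_shift(int64_t x) {
  return (int32_t)((x + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}

/* One column of the 4-point forward ADST, matching the t0..t3/u0..u3
 * comments in highbd_fadst4x4_neon() below. */
static void fadst4_column(const int32_t x[4], int32_t out[4]) {
  const int64_t t0 = kSinPi[1] * x[0] + kSinPi[2] * x[1] + kSinPi[4] * x[3];
  const int64_t t1 = kSinPi[3] * (x[0] + x[1] - x[3]);
  const int64_t t2 = kSinPi[4] * x[0] - kSinPi[1] * x[1] + kSinPi[2] * x[3];
  const int64_t t3 = kSinPi[3] * x[2];
  out[0] = round_shift(t0 + t3);
  out[1] = round_shift(t1);
  out[2] = round_shift(t2 - t3);
  out[3] = round_shift(t2 - t0 + t3);
}

int main(void) {
  const int32_t col[4] = { 16, 32, -48, 64 }; /* arbitrary <<4-scaled inputs */
  int32_t out[4];
  fadst4_column(col, out);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
  return 0;
}

round_shift() is the scalar counterpart of the vrshrn_n_s64() calls in
highbd_fadst4x4_neon(): VRSHRN folds the +(1 << (DCT_CONST_BITS - 1)) bias
and the narrowing shift into a single instruction, which is part of what the
vrshr-based refactor in this patch saves over the old add-then-shift pattern.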
+static INLINE void highbd_write_buffer_4x4(tran_low_t *output, int32x4_t *res) { + const int32x4_t one = vdupq_n_s32(1); + res[0] = vshrq_n_s32(vaddq_s32(res[0], one), 2); + res[1] = vshrq_n_s32(vaddq_s32(res[1], one), 2); + res[2] = vshrq_n_s32(vaddq_s32(res[2], one), 2); + res[3] = vshrq_n_s32(vaddq_s32(res[3], one), 2); + vst1q_s32(output + 0 * 4, res[0]); + vst1q_s32(output + 1 * 4, res[1]); + vst1q_s32(output + 2 * 4, res[2]); + vst1q_s32(output + 3 * 4, res[3]); +} + +static INLINE void highbd_fadst4x4_neon(int32x4_t *in /*[4]*/) { + int32x2_t s_lo[4], s_hi[4]; + int64x2_t u_lo[4], u_hi[4], t_lo[4], t_hi[4]; + + s_lo[0] = vget_low_s32(in[0]); + s_hi[0] = vget_high_s32(in[0]); + s_lo[1] = vget_low_s32(in[1]); + s_hi[1] = vget_high_s32(in[1]); + s_lo[2] = vget_low_s32(in[2]); + s_hi[2] = vget_high_s32(in[2]); + s_lo[3] = vget_low_s32(in[3]); + s_hi[3] = vget_high_s32(in[3]); + + // t0 = s0 * sinpi_1_9 + s1 * sinpi_2_9 + s3 * sinpi_4_9 + t_lo[0] = vmull_n_s32(s_lo[0], sinpi_1_9); + t_lo[0] = vmlal_n_s32(t_lo[0], s_lo[1], sinpi_2_9); + t_lo[0] = vmlal_n_s32(t_lo[0], s_lo[3], sinpi_4_9); + t_hi[0] = vmull_n_s32(s_hi[0], sinpi_1_9); + t_hi[0] = vmlal_n_s32(t_hi[0], s_hi[1], sinpi_2_9); + t_hi[0] = vmlal_n_s32(t_hi[0], s_hi[3], sinpi_4_9); + + // t1 = (s0 + s1) * sinpi_3_9 - s3 * sinpi_3_9 + t_lo[1] = vmull_n_s32(s_lo[0], sinpi_3_9); + t_lo[1] = vmlal_n_s32(t_lo[1], s_lo[1], sinpi_3_9); + t_lo[1] = vmlsl_n_s32(t_lo[1], s_lo[3], sinpi_3_9); + t_hi[1] = vmull_n_s32(s_hi[0], sinpi_3_9); + t_hi[1] = vmlal_n_s32(t_hi[1], s_hi[1], sinpi_3_9); + t_hi[1] = vmlsl_n_s32(t_hi[1], s_hi[3], sinpi_3_9); + + // t2 = s0 * sinpi_4_9 - s1* sinpi_1_9 + s3 * sinpi_2_9 + t_lo[2] = vmull_n_s32(s_lo[0], sinpi_4_9); + t_lo[2] = vmlsl_n_s32(t_lo[2], s_lo[1], sinpi_1_9); + t_lo[2] = vmlal_n_s32(t_lo[2], s_lo[3], sinpi_2_9); + t_hi[2] = vmull_n_s32(s_hi[0], sinpi_4_9); + t_hi[2] = vmlsl_n_s32(t_hi[2], s_hi[1], sinpi_1_9); + t_hi[2] = vmlal_n_s32(t_hi[2], s_hi[3], sinpi_2_9); + + // t3 = s2 * sinpi_3_9 + t_lo[3] = vmull_n_s32(s_lo[2], sinpi_3_9); + t_hi[3] = vmull_n_s32(s_hi[2], sinpi_3_9); + + /* + * u0 = t0 + t3 + * u1 = t1 + * u2 = t2 - t3 + * u3 = t2 - t0 + t3 + */ + u_lo[0] = vaddq_s64(t_lo[0], t_lo[3]); + u_hi[0] = vaddq_s64(t_hi[0], t_hi[3]); + u_lo[1] = t_lo[1]; + u_hi[1] = t_hi[1]; + u_lo[2] = vsubq_s64(t_lo[2], t_lo[3]); + u_hi[2] = vsubq_s64(t_hi[2], t_hi[3]); + u_lo[3] = vaddq_s64(vsubq_s64(t_lo[2], t_lo[0]), t_lo[3]); + u_hi[3] = vaddq_s64(vsubq_s64(t_hi[2], t_hi[0]), t_hi[3]); + + // fdct_round_shift + in[0] = vcombine_s32(vrshrn_n_s64(u_lo[0], DCT_CONST_BITS), + vrshrn_n_s64(u_hi[0], DCT_CONST_BITS)); + in[1] = vcombine_s32(vrshrn_n_s64(u_lo[1], DCT_CONST_BITS), + vrshrn_n_s64(u_hi[1], DCT_CONST_BITS)); + in[2] = vcombine_s32(vrshrn_n_s64(u_lo[2], DCT_CONST_BITS), + vrshrn_n_s64(u_hi[2], DCT_CONST_BITS)); + in[3] = vcombine_s32(vrshrn_n_s64(u_lo[3], DCT_CONST_BITS), + vrshrn_n_s64(u_hi[3], DCT_CONST_BITS)); + + transpose_s32_4x4(&in[0], &in[1], &in[2], &in[3]); +} + +void vp9_highbd_fht4x4_neon(const int16_t *input, tran_low_t *output, + int stride, int tx_type) { + int32x4_t in[4]; + // int i; + + switch (tx_type) { + case DCT_DCT: vpx_highbd_fdct4x4_neon(input, output, stride); break; + case ADST_DCT: + highbd_load_buffer_4x4(input, in, stride); + highbd_fadst4x4_neon(in); + vpx_highbd_fdct4x4_pass1_neon(in); + highbd_write_buffer_4x4(output, in); + break; + case DCT_ADST: + highbd_load_buffer_4x4(input, in, stride); + vpx_highbd_fdct4x4_pass1_neon(in); + highbd_fadst4x4_neon(in); + 
highbd_write_buffer_4x4(output, in); + break; + default: + assert(tx_type == ADST_ADST); + highbd_load_buffer_4x4(input, in, stride); + highbd_fadst4x4_neon(in); + highbd_fadst4x4_neon(in); + highbd_write_buffer_4x4(output, in); + break; + } +} + +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h index b33da427b4..193594e3dc 100644 --- a/vpx_dsp/arm/fdct_neon.h +++ b/vpx_dsp/arm/fdct_neon.h @@ -130,6 +130,24 @@ static INLINE void butterfly_one_coeff_s16_s32_narrow( *sub = vcombine_s16(vmovn_s32(sub32_lo), vmovn_s32(sub32_hi)); } +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on full vector +// more accurate does 32-bit processing, takes and returns 32-bit values, +// high/low +static INLINE void butterfly_one_coeff_s32_noround( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo, + int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) { + const int32x4_t a1 = vmulq_n_s32(a_lo, constant); + const int32x4_t a2 = vmulq_n_s32(a_hi, constant); + const int32x4_t a3 = vmulq_n_s32(a_lo, constant); + const int32x4_t a4 = vmulq_n_s32(a_hi, constant); + *add_lo = vmlaq_n_s32(a1, b_lo, constant); + *add_hi = vmlaq_n_s32(a2, b_hi, constant); + *sub_lo = vmlsq_n_s32(a3, b_lo, constant); + *sub_hi = vmlsq_n_s32(a4, b_hi, constant); +} + // fdct_round_shift((a +/- b) * c) // Variant that performs fast vqrdmulhq_s32 operation on full vector // more accurate does 32-bit processing, takes and returns 32-bit values, @@ -234,6 +252,44 @@ static INLINE void butterfly_two_coeff_s32_s64_narrow( vrshrn_n_s64(diff[3], DCT_CONST_BITS)); } +// fdct_round_shift(a * c1 +/- b * c2) +// Original Variant that performs normal implementation on full vector +// more accurate does 32-bit processing, takes and returns 32-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff_s16_s32_noround( + const int16x4_t a_lo, const int16x4_t a_hi, const int16x4_t b_lo, + const int16x4_t b_hi, const tran_coef_t constant1, + const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi, + int32x4_t *sub_lo, int32x4_t *sub_hi) { + const int32x4_t a1 = vmull_n_s16(a_lo, constant1); + const int32x4_t a2 = vmull_n_s16(a_hi, constant1); + const int32x4_t a3 = vmull_n_s16(a_lo, constant2); + const int32x4_t a4 = vmull_n_s16(a_hi, constant2); + *add_lo = vmlal_n_s16(a1, b_lo, constant2); + *add_hi = vmlal_n_s16(a2, b_hi, constant2); + *sub_lo = vmlsl_n_s16(a3, b_lo, constant1); + *sub_hi = vmlsl_n_s16(a4, b_hi, constant1); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Original Variant that performs normal implementation on full vector +// more accurate does 32-bit processing, takes and returns 32-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff_s32_noround( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant1, + const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi, + int32x4_t *sub_lo, int32x4_t *sub_hi) { + const int32x4_t a1 = vmulq_n_s32(a_lo, constant1); + const int32x4_t a2 = vmulq_n_s32(a_hi, constant1); + const int32x4_t a3 = vmulq_n_s32(a_lo, constant2); + const int32x4_t a4 = vmulq_n_s32(a_hi, constant2); + *add_lo = vmlaq_n_s32(a1, b_lo, constant2); + *add_hi = vmlaq_n_s32(a2, b_hi, constant2); + *sub_lo = vmlsq_n_s32(a3, b_lo, constant1); + *sub_hi = vmlsq_n_s32(a4, b_hi, constant1); +} + // fdct_round_shift(a * c1 +/- 
b * c2) // Variant that performs normal implementation on half vector // more accurate does 32-bit processing, takes and returns 16-bit values From aeb6ae7393f09e66478f5b800ea989ae95a85e98 Mon Sep 17 00:00:00 2001 From: Johann Date: Mon, 14 Nov 2022 17:59:45 +0900 Subject: [PATCH 473/926] quantize: remove vp9_regular_quantize_b_4x4 This was just a helper function which called vpx_quantize_b or vpx_highbd_quantize_b. It also checked for skip_block, which was necessary when webm:1439 was filed but does not appear to be necessary now. Removes a quantize variant and makes subsequent cleanups easier. Change-Id: Ibe545eccd19370f07ff26c8e151f290c642efd2a --- vp9/encoder/vp9_quantize.c | 28 ------------------------- vp9/encoder/vp9_quantize.h | 3 --- vp9/encoder/vp9_rdopt.c | 43 +++++++++++++++++++++++++++++++------- 3 files changed, 35 insertions(+), 39 deletions(-) diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index 9058997b0f..dcc44449fd 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -149,34 +149,6 @@ void vp9_highbd_quantize_fp_32x32_c( } #endif -void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, - const int16_t *scan, const int16_t *iscan) { - MACROBLOCKD *const xd = &x->e_mbd; - struct macroblock_plane *p = &x->plane[plane]; - struct macroblockd_plane *pd = &xd->plane[plane]; - tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block), - *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - const int n_coeffs = 4 * 4; - - if (x->skip_block) { - memset(qcoeff, 0, n_coeffs * sizeof(*qcoeff)); - memset(dqcoeff, 0, n_coeffs * sizeof(*dqcoeff)); - return; - } - -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, p->zbin, - p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, &p->eobs[block], scan, iscan); - return; - } -#endif - vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, - &p->eobs[block], scan, iscan); -} - static void invert_quant(int16_t *quant, int16_t *shift, int d) { unsigned t; int l, m; diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h index 2e6d7da2b6..f626f06566 100644 --- a/vp9/encoder/vp9_quantize.h +++ b/vp9/encoder/vp9_quantize.h @@ -37,9 +37,6 @@ typedef struct { DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]); } QUANTS; -void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, - const int16_t *scan, const int16_t *iscan); - struct VP9_COMP; struct VP9Common; diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index bfde5ab1a5..a464ce38f1 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -1108,6 +1108,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, xd->mi[0]->tx_size = TX_4X4; + assert(!x->skip_block); + #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { for (mode = DC_PRED; mode <= TM_PRED; ++mode) { @@ -1135,7 +1137,10 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); int16_t *const src_diff = vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff); - tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block); + tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint16_t *const 
eob = &p->eobs[block]; xd->mi[0]->bmi[block].as_mode = mode; vp9_predict_intra_block(xd, 1, TX_4X4, mode, x->skip_encode ? src : dst, @@ -1148,7 +1153,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_highbd_fwht4x4(src_diff, coeff, 8); - vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan); + vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, + eob, so->scan, so->iscan); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0 ? 1 : 0); @@ -1166,7 +1173,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, vpx_highbd_fdct4x4(src_diff, coeff, 8); else vp9_highbd_fht4x4(src_diff, coeff, 8, tx_type); - vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan); + vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, + eob, so->scan, so->iscan); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); distortion += vp9_highbd_block_error_dispatch( @@ -1236,7 +1245,10 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride]; int16_t *const src_diff = vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff); - tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block); + tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint16_t *const eob = &p->eobs[block]; xd->mi[0]->bmi[block].as_mode = mode; vp9_predict_intra_block(xd, 1, TX_4X4, mode, x->skip_encode ? src : dst, x->skip_encode ? src_stride : dst_stride, dst, @@ -1248,7 +1260,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_fwht4x4(src_diff, coeff, 8); - vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan); + vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + so->scan, so->iscan); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0; @@ -1263,7 +1277,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_fht4x4(src_diff, coeff, 8, tx_type); - vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan); + vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + so->scan, so->iscan); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 
1 : 0; @@ -1640,6 +1656,8 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, const int is_compound = has_second_ref(mi); const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter]; + assert(!x->skip_block); + for (ref = 0; ref < 1 + is_compound; ++ref) { const int bw = b_width_log2_lookup[BLOCK_8X8]; const int h = 4 * (i >> bw); @@ -1701,18 +1719,27 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8; #endif int64_t ssz, rd, rd1, rd2; - tran_low_t *coeff; + tran_low_t *coeff, *qcoeff, *dqcoeff; + uint16_t *eob; int coeff_ctx; k += (idy * 2 + idx); coeff_ctx = combine_entropy_contexts(ta[k & 1], tl[k >> 1]); coeff = BLOCK_OFFSET(p->coeff, k); + qcoeff = BLOCK_OFFSET(p->qcoeff, k); + dqcoeff = BLOCK_OFFSET(pd->dqcoeff, k); + eob = &p->eobs[k]; + x->fwd_txfm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff), coeff, 8); - vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan); #if CONFIG_VP9_HIGHBITDEPTH + vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + so->scan, so->iscan); thisdistortion += vp9_highbd_block_error_dispatch( coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz, bd); #else + vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, p->quant_shift, + qcoeff, dqcoeff, pd->dequant, eob, so->scan, so->iscan); thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz); #endif // CONFIG_VP9_HIGHBITDEPTH From 76e9bf7a184eb1caf979dd07e1107e3b74ac10b6 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Mon, 14 Nov 2022 22:11:19 -0800 Subject: [PATCH 474/926] vp9-svc: Fixes to make SVC work with VBR Prior to this CL SVC with VBR mode was broken. Fixes made here to make VBR rate control work for SVC. Rename is_one_pass_cbr_svc() --> is_one_pass_svc(), as it can be used now for both CBR and VBR. Added rate targetting unittest for (2SL, 3TL). Bug: chromium:1375111 Change-Id: I5a62ffe7fbea29dc5949c88a284768386b1907a9 --- test/svc_datarate_test.cc | 31 ++++++++++++++++++ vp9/encoder/vp9_aq_cyclicrefresh.c | 2 +- vp9/encoder/vp9_encodeframe.c | 6 ++-- vp9/encoder/vp9_encoder.c | 38 +++++++++++------------ vp9/encoder/vp9_encoder.h | 2 +- vp9/encoder/vp9_ratectrl.c | 50 ++++++++++++++++++++++++------ vp9/encoder/vp9_svc_layercontext.c | 6 ++-- vp9/encoder/vp9_svc_layercontext.h | 2 +- vp9/vp9_cx_iface.c | 5 ++- 9 files changed, 101 insertions(+), 41 deletions(-) diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc index 010c273421..484252ca43 100644 --- a/test/svc_datarate_test.cc +++ b/test/svc_datarate_test.cc @@ -1203,6 +1203,37 @@ TEST_P(DatarateOnePassCbrSvcMultiBR, OnePassCbrSvc2SL3TL) { #endif } +// Check basic rate targeting for 1 pass VBR SVC: 2 spatial layers and +// 3 temporal layers. Run VGA clip with 1 thread. 
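+// The pass criterion (a summary of the checks below, not new behavior):
+// after encoding, CheckLayerRateTargeting() requires each layer's measured
+// bitrate to land within [0.70 * target, 1.3 * target] of its layer target.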
+TEST_P(DatarateOnePassCbrSvcMultiBR, OnePassVbrSvc2SL3TL) { + SetSvcConfig(2, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + cfg_.rc_end_usage = VPX_VBR; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + const int bitrates[3] = { 200, 400, 600 }; + cfg_.rc_target_bitrate = bitrates[GET_PARAM(2)]; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.70, + 1.3); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + // Params: speed setting, layer framedrop control and index for bitrate array. class DatarateOnePassCbrSvcFrameDropMultiBR : public DatarateOnePassCbrSvc, diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c index 90792aebea..28ab10a13b 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -558,7 +558,7 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { cr->percent_refresh = 10; cr->rate_ratio_qdelta = 1.5; cr->rate_boost_fac = 10; - if (cpi->refresh_golden_frame == 1) { + if (cpi->refresh_golden_frame == 1 && !cpi->use_svc) { cr->percent_refresh = 0; cr->rate_ratio_qdelta = 1.0; } diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index a9f392bf51..a1ee9c6784 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -1299,7 +1299,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, // the reference (base layer frame) is key frame (i.e., is_key_frame == 1). int is_key_frame = (frame_is_intra_only(cm) || - (is_one_pass_cbr_svc(cpi) && + (is_one_pass_svc(cpi) && cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)); // Always use 4x4 partition for key frame. const int use_4x4_partition = frame_is_intra_only(cm); @@ -1406,7 +1406,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, assert(yv12 != NULL); - if (!(is_one_pass_cbr_svc(cpi) && cpi->svc.spatial_layer_id) || + if (!(is_one_pass_svc(cpi) && cpi->svc.spatial_layer_id) || cpi->svc.use_gf_temporal_ref_current_layer) { // For now, GOLDEN will not be used for non-zero spatial layers, since // it may not be a temporal reference. @@ -5381,7 +5381,7 @@ static void get_estimated_pred(VP9_COMP *cpi, const TileInfo *const tile, assert(yv12 != NULL); - if (!(is_one_pass_cbr_svc(cpi) && cpi->svc.spatial_layer_id) || + if (!(is_one_pass_svc(cpi) && cpi->svc.spatial_layer_id) || cpi->svc.use_gf_temporal_ref_current_layer) { // For now, GOLDEN will not be used for non-zero spatial layers, since // it may not be a temporal reference. diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index ca3439d7c0..87c5d7b67f 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1333,7 +1333,7 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) { // For 1 pass cbr: allocate scaled_frame that may be used as an intermediate // buffer for a 2 stage down-sampling: two stages of 1:2 down-sampling for a // target of 1/4x1/4. number_spatial_layers must be greater than 2. 
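  // (Illustrative example, not from the original comment: a 1280x720 top
  // layer is taken to 640x360 and then to 320x180 to reach the 1/4x1/4
  // target.)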
- if (is_one_pass_cbr_svc(cpi) && !cpi->svc.scaled_temp_is_alloc && + if (is_one_pass_svc(cpi) && !cpi->svc.scaled_temp_is_alloc && cpi->svc.number_spatial_layers > 2) { cpi->svc.scaled_temp_is_alloc = 1; if (vpx_realloc_frame_buffer( @@ -1511,7 +1511,7 @@ static void init_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { // Temporal scalability. cpi->svc.number_temporal_layers = oxcf->ts_number_layers; - if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) || + if ((cpi->svc.number_temporal_layers > 1) || ((cpi->svc.number_temporal_layers > 1 || cpi->svc.number_spatial_layers > 1) && cpi->oxcf.pass != 1)) { @@ -2077,7 +2077,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { rc->rc_2_frame = 0; } - if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) || + if ((cpi->svc.number_temporal_layers > 1) || ((cpi->svc.number_temporal_layers > 1 || cpi->svc.number_spatial_layers > 1) && cpi->oxcf.pass != 1)) { @@ -3263,7 +3263,7 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { vp9_denoiser_update_ref_frame(cpi); #endif - if (is_one_pass_cbr_svc(cpi)) vp9_svc_update_ref_frame(cpi); + if (is_one_pass_svc(cpi)) vp9_svc_update_ref_frame(cpi); } static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { @@ -3857,11 +3857,11 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, int q = 0, bottom_index = 0, top_index = 0; int no_drop_scene_change = 0; const INTERP_FILTER filter_scaler = - (is_one_pass_cbr_svc(cpi)) + (is_one_pass_svc(cpi)) ? svc->downsample_filter_type[svc->spatial_layer_id] : EIGHTTAP; const int phase_scaler = - (is_one_pass_cbr_svc(cpi)) + (is_one_pass_svc(cpi)) ? svc->downsample_filter_phase[svc->spatial_layer_id] : 0; @@ -3882,7 +3882,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, set_frame_size(cpi); - if (is_one_pass_cbr_svc(cpi) && + if (is_one_pass_svc(cpi) && cpi->un_scaled_source->y_width == cm->width << 2 && cpi->un_scaled_source->y_height == cm->height << 2 && svc->scaled_temp.y_width == cm->width << 1 && @@ -3896,7 +3896,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, cm, cpi->un_scaled_source, &cpi->scaled_source, &svc->scaled_temp, filter_scaler, phase_scaler, filter_scaler2, phase_scaler2); svc->scaled_one_half = 1; - } else if (is_one_pass_cbr_svc(cpi) && + } else if (is_one_pass_svc(cpi) && cpi->un_scaled_source->y_width == cm->width << 1 && cpi->un_scaled_source->y_height == cm->height << 1 && svc->scaled_one_half) { @@ -3911,7 +3911,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, } #ifdef OUTPUT_YUV_SVC_SRC // Write out at most 3 spatial layers. - if (is_one_pass_cbr_svc(cpi) && svc->spatial_layer_id < 3) { + if (is_one_pass_svc(cpi) && svc->spatial_layer_id < 3) { vpx_write_yuv_frame(yuv_svc_src[svc->spatial_layer_id], cpi->Source); } #endif @@ -4020,14 +4020,14 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, if (vp9_rc_drop_frame(cpi)) return 0; } - // For 1 pass CBR SVC, only ZEROMV is allowed for spatial reference frame + // For 1 pass SVC, only ZEROMV is allowed for spatial reference frame // when svc->force_zero_mode_spatial_ref = 1. Under those conditions we can // avoid this frame-level upsampling (for non intra_only frames). // For SVC single_layer mode, dynamic resize is allowed and we need to // scale references for this case. 
if (frame_is_intra_only(cm) == 0 && ((svc->single_layer_svc && cpi->oxcf.resize_mode == RESIZE_DYNAMIC) || - !(is_one_pass_cbr_svc(cpi) && svc->force_zero_mode_spatial_ref))) { + !(is_one_pass_svc(cpi) && svc->force_zero_mode_spatial_ref))) { vp9_scale_references(cpi); } @@ -7613,8 +7613,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, const int gf_group_index = cpi->twopass.gf_group.index; int i; - if (is_one_pass_cbr_svc(cpi)) { - vp9_one_pass_cbr_svc_start_layer(cpi); + if (is_one_pass_svc(cpi)) { + vp9_one_pass_svc_start_layer(cpi); } vpx_usec_timer_start(&cmptimer); @@ -7634,7 +7634,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // Normal defaults cm->reset_frame_context = 0; cm->refresh_frame_context = 1; - if (!is_one_pass_cbr_svc(cpi)) { + if (!is_one_pass_svc(cpi)) { cpi->refresh_last_frame = 1; cpi->refresh_golden_frame = 0; cpi->refresh_alt_ref_frame = 0; @@ -7767,7 +7767,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, adjust_frame_rate(cpi, source); } - if (is_one_pass_cbr_svc(cpi)) { + if (is_one_pass_svc(cpi)) { vp9_update_temporal_layer_framerate(cpi); vp9_restore_layer_context(cpi); } @@ -7901,9 +7901,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } // Save layer specific state. - if (is_one_pass_cbr_svc(cpi) || ((cpi->svc.number_temporal_layers > 1 || - cpi->svc.number_spatial_layers > 1) && - oxcf->pass == 2)) { + if (is_one_pass_svc(cpi) || ((cpi->svc.number_temporal_layers > 1 || + cpi->svc.number_spatial_layers > 1) && + oxcf->pass == 2)) { vp9_save_layer_context(cpi); } @@ -8077,7 +8077,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, #endif - if (is_one_pass_cbr_svc(cpi)) { + if (is_one_pass_svc(cpi)) { if (cm->show_frame) { ++cpi->svc.spatial_layer_to_encode; if (cpi->svc.spatial_layer_to_encode >= cpi->svc.number_spatial_layers) diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 1d58945250..3e0b80677e 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1305,7 +1305,7 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required( void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags); -static INLINE int is_one_pass_cbr_svc(const struct VP9_COMP *const cpi) { +static INLINE int is_one_pass_svc(const struct VP9_COMP *const cpi) { return (cpi->use_svc && cpi->oxcf.pass == 0); } diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 1ddf64d41a..d9207f7a2f 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -327,7 +327,7 @@ static void update_buffer_level_postencode(VP9_COMP *cpi, rc->buffer_level = rc->bits_off_target; - if (is_one_pass_cbr_svc(cpi)) { + if (is_one_pass_svc(cpi)) { update_layer_buffer_level_postencode(&cpi->svc, encoded_frame_size); } } @@ -910,7 +910,7 @@ static int calc_active_worst_quality_one_pass_vbr(const VP9_COMP *cpi) { active_worst_quality = curr_frame == 0 ? 
rc->worst_quality : rc->last_q[KEY_FRAME] << 1; } else { - if (!rc->is_src_frame_alt_ref && + if (!rc->is_src_frame_alt_ref && !cpi->use_svc && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { active_worst_quality = curr_frame == 1 @@ -1871,7 +1871,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { } } } else { - if ((cpi->use_svc && oxcf->rc_mode == VPX_CBR) || + if ((cpi->use_svc) || (!rc->is_src_frame_alt_ref && !(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) { rc->last_q[INTER_FRAME] = qindex; @@ -2021,6 +2021,11 @@ int vp9_calc_pframe_target_size_one_pass_vbr(const VP9_COMP *cpi) { (rc->baseline_gf_interval + af_ratio - 1) : ((int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval) / (rc->baseline_gf_interval + af_ratio - 1); + // For SVC: refresh flags are used to define the pattern, so we can't + // use that for boosting the target size here. + // TODO(marpan): Consider adding internal boost on TL0 for VBR-SVC. + // For now just use the CBR logic for setting target size. + if (cpi->use_svc) target = vp9_calc_pframe_target_size_one_pass_cbr(cpi); if (target > INT_MAX) target = INT_MAX; return vp9_rc_clamp_pframe_target_size(cpi, (int)target); } @@ -2147,7 +2152,7 @@ int vp9_calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { } else { target = rc->avg_frame_bandwidth; } - if (is_one_pass_cbr_svc(cpi)) { + if (is_one_pass_svc(cpi)) { // Note that for layers, avg_frame_bandwidth is the cumulative // per-frame-bandwidth. For the target size of this frame, use the // layer average frame size (i.e., non-cumulative per-frame-bw). @@ -2282,7 +2287,7 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { (svc->spatial_layer_sync[0] == 1 && svc->spatial_layer_id == 0)) { cm->frame_type = KEY_FRAME; rc->source_alt_ref_active = 0; - if (is_one_pass_cbr_svc(cpi)) { + if (is_one_pass_svc(cpi)) { if (cm->current_video_frame > 0) vp9_svc_reset_temporal_layers(cpi, 1); layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id, svc->number_temporal_layers); @@ -2290,11 +2295,14 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); // Assumption here is that LAST_FRAME is being updated for a keyframe. // Thus no change in update flags. - target = vp9_calc_iframe_target_size_one_pass_cbr(cpi); + if (cpi->oxcf.rc_mode == VPX_CBR) + target = vp9_calc_iframe_target_size_one_pass_cbr(cpi); + else + target = vp9_calc_iframe_target_size_one_pass_vbr(cpi); } } else { cm->frame_type = INTER_FRAME; - if (is_one_pass_cbr_svc(cpi)) { + if (is_one_pass_svc(cpi)) { LAYER_CONTEXT *lc = &svc->layer_context[layer]; // Add condition current_video_frame > 0 for the case where first frame // is intra only followed by overlay/copy frame. In this case we don't @@ -2303,7 +2311,23 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { (svc->spatial_layer_id == 0 && cm->current_video_frame > 0) ? 0 : svc->layer_context[svc->temporal_layer_id].is_key_frame; - target = vp9_calc_pframe_target_size_one_pass_cbr(cpi); + if (cpi->oxcf.rc_mode == VPX_CBR) { + target = vp9_calc_pframe_target_size_one_pass_cbr(cpi); + } else { + double rate_err = 0.0; + rc->fac_active_worst_inter = 140; + rc->fac_active_worst_gf = 100; + if (rc->rolling_target_bits > 0) { + rate_err = + (double)rc->rolling_actual_bits / (double)rc->rolling_target_bits; + if (rate_err < 1.0) + rc->fac_active_worst_inter = 120; + else if (rate_err > 2.0) + // Increase active_worst faster if rate fluctuation is high. 
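+          // (Worked example: rolling_actual_bits = 500 kb against
+          // rolling_target_bits = 200 kb gives rate_err = 2.5, lifting the
+          // factor from the default 140 set above to 160.)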
+ rc->fac_active_worst_inter = 160; + } + target = vp9_calc_pframe_target_size_one_pass_vbr(cpi); + } } } @@ -2312,7 +2336,10 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { svc->layer_context[layer].is_key_frame == 1) { cm->frame_type = KEY_FRAME; cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); - target = vp9_calc_iframe_target_size_one_pass_cbr(cpi); + if (cpi->oxcf.rc_mode == VPX_CBR) + target = vp9_calc_iframe_target_size_one_pass_cbr(cpi); + else + target = vp9_calc_iframe_target_size_one_pass_vbr(cpi); } // Set the buffer idx and refresh flags for key frames in simulcast mode. // Note the buffer slot for long-term reference is set below (line 2255), @@ -2397,7 +2424,10 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { } if (svc->set_intra_only_frame) { set_intra_only_frame(cpi); - target = vp9_calc_iframe_target_size_one_pass_cbr(cpi); + if (cpi->oxcf.rc_mode == VPX_CBR) + target = vp9_calc_iframe_target_size_one_pass_cbr(cpi); + else + target = vp9_calc_iframe_target_size_one_pass_vbr(cpi); } // Overlay frame predicts from LAST (intra-only) if (svc->previous_frame_is_intra_only) cpi->ref_frame_flags |= VP9_LAST_FLAG; diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index a57a70ab16..518c00b34a 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -290,7 +290,7 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, } static LAYER_CONTEXT *get_layer_context(VP9_COMP *const cpi) { - if (is_one_pass_cbr_svc(cpi)) + if (is_one_pass_svc(cpi)) return &cpi->svc.layer_context[cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers + cpi->svc.temporal_layer_id]; @@ -354,7 +354,7 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) { cpi->alt_ref_source = lc->alt_ref_source; // Check if it is one_pass_cbr_svc mode and lc->speed > 0 (real-time mode // does not use speed = 0). - if (is_one_pass_cbr_svc(cpi) && lc->speed > 0) { + if (is_one_pass_svc(cpi) && lc->speed > 0) { cpi->oxcf.speed = lc->speed; } cpi->loopfilter_ctrl = lc->loopfilter_ctrl; @@ -754,7 +754,7 @@ void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) { svc->reference_altref[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_ALT_FLAG); } -int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { +int vp9_one_pass_svc_start_layer(VP9_COMP *const cpi) { int width = 0, height = 0; SVC *const svc = &cpi->svc; LAYER_CONTEXT *lc = NULL; diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h index b2d1d1b98f..c7328cf571 100644 --- a/vp9/encoder/vp9_svc_layercontext.h +++ b/vp9/encoder/vp9_svc_layercontext.h @@ -255,7 +255,7 @@ int vp9_denoise_svc_non_key(struct VP9_COMP *const cpi); void vp9_copy_flags_ref_update_idx(struct VP9_COMP *const cpi); -int vp9_one_pass_cbr_svc_start_layer(struct VP9_COMP *const cpi); +int vp9_one_pass_svc_start_layer(struct VP9_COMP *const cpi); void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi); diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 02bd2e579b..695774e730 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1527,9 +1527,8 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, cx_data += size; cx_data_sz -= size; - if (is_one_pass_cbr_svc(cpi) && - (cpi->svc.spatial_layer_id == - cpi->svc.number_spatial_layers - 1)) { + if (is_one_pass_svc(cpi) && (cpi->svc.spatial_layer_id == + cpi->svc.number_spatial_layers - 1)) { // Encoded all spatial layers; exit loop. 
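            // (Each per-frame iteration encodes one spatial layer of the
            // superframe; reaching the top layer ends the loop.)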
break; } From 3fa698a6e855aad203517b8e71290b837ceda192 Mon Sep 17 00:00:00 2001 From: Hirokazu Honda Date: Thu, 17 Nov 2022 16:05:28 +0900 Subject: [PATCH 475/926] vp9/rate_ctrl_rtc: Improve get cyclic refresh data A client of the vp9 rate controller needs to know whether the segmentation is enabled and the size of delta_q. It is also nicer to know the size of map. This CL changes the interface to achieve these. Bug: b:259487065 Test: Build Change-Id: If05854530f97e1430a7b97788910f277ab673a87 --- vp9/ratectrl_rtc.cc | 16 ++++++++++------ vp9/ratectrl_rtc.h | 10 ++++++++-- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index f4d7f7e9e7..a9287b5a3e 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -205,12 +205,16 @@ int VP9RateControlRTC::GetLoopfilterLevel() const { return lf->filter_level; } -signed char *VP9RateControlRTC::GetCyclicRefreshMap() const { - return cpi_->cyclic_refresh->map; -} - -int *VP9RateControlRTC::GetDeltaQ() const { - return cpi_->cyclic_refresh->qindex_delta; +bool VP9RateControlRTC::GetSegmentationData( + VP9SegmentationData *segmentation_data) const { + if (!cpi_->cyclic_refresh->apply_cyclic_refresh) return false; + + segmentation_data->segmentation_map = cpi_->segmentation_map; + segmentation_data->segmentation_map_size = + cpi_->common.mi_cols * cpi_->common.mi_rows; + segmentation_data->delta_q = cpi_->cyclic_refresh->qindex_delta; + segmentation_data->delta_q_size = 3u; + return true; } void VP9RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) { diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index d2b9417aef..b209e4db66 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -58,6 +58,13 @@ struct VP9FrameParamsQpRTC { int temporal_layer_id; }; +struct VP9SegmentationData { + const uint8_t *segmentation_map; + size_t segmentation_map_size; + const int *delta_q; + size_t delta_q_size; +}; + // This interface allows using VP9 real-time rate control without initializing // the encoder. To use this interface, you need to link with libvpxrc.a. // @@ -110,8 +117,7 @@ class VP9RateControlRTC { // GetQP() needs to be called after ComputeQP() to get the latest QP int GetQP() const; int GetLoopfilterLevel() const; - signed char *GetCyclicRefreshMap() const; - int *GetDeltaQ() const; + bool GetSegmentationData(VP9SegmentationData *segmentation_data) const; void ComputeQP(const VP9FrameParamsQpRTC &frame_params); // Feedback to rate control with the size of current encoded frame void PostEncodeUpdate(uint64_t encoded_frame_size); From 2a8a25cf447914515dd7c27030f39b1cc06234f3 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Mon, 5 Dec 2022 11:54:33 -0800 Subject: [PATCH 476/926] rc-rtc: Remove frame_flags_ change in svc ratectril rtc test SVC test is only in CBR and the frame_flags are set by the SVC pattern, so we shouldn't undo them for svc mode. Change-Id: I5ffa65dd58a7b47f287d124d9e71ba1dc7c5a549 --- test/vp9_ratectrl_rtc_test.cc | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc index 03a58fa926..1f429b193a 100644 --- a/test/vp9_ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -181,11 +181,6 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, } frame_params_.frame_type = video->frame() == 0 ? KEY_FRAME : INTER_FRAME; - if (rc_cfg_.rc_mode == VPX_CBR && frame_params_.frame_type == INTER_FRAME) { - // Disable golden frame update. 
- frame_flags_ |= VP8_EFLAG_NO_UPD_GF; - frame_flags_ |= VP8_EFLAG_NO_UPD_ARF; - } encoder_exit_ = video->frame() == kNumFrames; current_superframe_ = video->frame(); } From cbb780ab0bd68dc60e822c3b2e51f37d2128e9cd Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Mon, 5 Dec 2022 14:30:40 -0800 Subject: [PATCH 477/926] rc-rtc: Test for periodic key in SVC external RC This test catches the fix merged in here: https://chromium-review.googlesource.com/c/webm/libvpx/+/4022904 Change-Id: Ib68fbcba694b5d465a9faf3ca7d6880bfe8eabb3 --- test/vp9_ratectrl_rtc_test.cc | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc index 1f429b193a..931e68c880 100644 --- a/test/vp9_ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -179,8 +179,8 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, encoder->Control(VP9E_SET_SVC, 1); encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); } - - frame_params_.frame_type = video->frame() == 0 ? KEY_FRAME : INTER_FRAME; + frame_params_.frame_type = + video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME; encoder_exit_ = video->frame() == kNumFrames; current_superframe_ = video->frame(); } @@ -214,6 +214,21 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, void RunSvc() { SetConfigSvc(); + // kNumFrames = 300, so no key frames in this test. + key_interval_ = 10000; + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + SetEncoderSvc(); + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunSvcPeriodicKey() { + SetConfigSvc(); + // kNumFrames = 300, so 3 key frames in this test. + key_interval_ = 100; rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); SetEncoderSvc(); @@ -297,6 +312,9 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, cfg_.layer_target_bitrate[6] = 450; cfg_.layer_target_bitrate[7] = 630; cfg_.layer_target_bitrate[8] = 900; + + cfg_.kf_min_dist = key_interval_; + cfg_.kf_max_dist = key_interval_; } void SetConfigSvc() { @@ -355,6 +373,7 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, bool encoder_exit_; int current_superframe_; uint32_t sizes_[8]; + int key_interval_; }; TEST_P(RcInterfaceTest, OneLayer) { RunOneLayer(); } @@ -363,6 +382,8 @@ TEST_P(RcInterfaceTest, OneLayerVBRPeriodicKey) { RunOneLayerVBRPeriodicKey(); } TEST_P(RcInterfaceSvcTest, Svc) { RunSvc(); } +TEST_P(RcInterfaceSvcTest, SvcPeriodicKey) { RunSvcPeriodicKey(); } + VP9_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0, 3), ::testing::Values(VPX_CBR, VPX_VBR)); VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0, 3)); From 5887bd234e5468be69f8e6e714623a152efeaf93 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Fri, 2 Dec 2022 18:04:32 -0800 Subject: [PATCH 478/926] L2E: Add a new interface to control rdmult Allow external model to control frame rdmult. A function is called per frame to get the value of rdmult from the external model. The external rdmult will overwrite libvpx's default rdmult unless a reserved value is selected. A unit test is added to test when the default rdmult value is set. 
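As a usage sketch (the model struct and its fields here are hypothetical;
only the callback signature and the reserved value come from this change):

    #include "vpx/vpx_ext_ratectrl.h"

    static vpx_rc_status_t my_get_frame_rdmult(
        vpx_rc_model_t rate_ctrl_model,
        const vpx_rc_encodeframe_info_t *frame_info, int *rdmult) {
      /* MyModel is a hypothetical model type, for illustration only. */
      const MyModel *model = (const MyModel *)rate_ctrl_model;
      if (!model->overrides_rdmult) {
        /* Reserved value: keep libvpx's default rdmult for this frame. */
        *rdmult = VPX_DEFAULT_RDMULT;
        return VPX_RC_OK;
      }
      *rdmult = model->rdmult_by_coding_index[frame_info->coding_index];
      return VPX_RC_OK;
    }

The callback is registered through vpx_rc_funcs_t with
rc_type = VPX_RC_GOP_QP_RDMULT and get_frame_rdmult set to the function
above.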
Change-Id: I2f17a036c188de66dc00709beef4bf2ed86a919a
---
 test/vp9_ext_ratectrl_test.cc  | 81 ++++++++++++++++++++++++++++++++++
 vp9/encoder/vp9_encoder.c      | 26 +++++++++++
 vp9/encoder/vp9_ext_ratectrl.c | 29 ++++++++++++
 vp9/encoder/vp9_ext_ratectrl.h |  7 +++
 vp9/encoder/vp9_rd.c           | 11 +++++
 vpx/vpx_ext_ratectrl.h         | 34 +++++++++++++-
 6 files changed, 186 insertions(+), 2 deletions(-)

diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc
index 16e3248f76..2bfa6281d7 100644
--- a/test/vp9_ext_ratectrl_test.cc
+++ b/test/vp9_ext_ratectrl_test.cc
@@ -41,6 +41,7 @@ constexpr int kDefaultMaxGfInterval = 16;
 constexpr int kReadMinGfInterval = 5;
 constexpr int kReadMaxGfInterval = 13;
 const char kTestFileName[] = "bus_352x288_420_f20_b8.yuv";
+const double kPsnrThreshold = 30.50;
 
 struct ToyRateCtrl {
   int magic_number;
@@ -642,6 +643,19 @@ vpx_rc_status_t rc_update_encodeframe_result_gop_short(
   return VPX_RC_OK;
 }
 
+vpx_rc_status_t rc_get_default_frame_rdmult(
+    vpx_rc_model_t rate_ctrl_model,
+    const vpx_rc_encodeframe_info_t *encode_frame_info, int *rdmult) {
+  const ToyRateCtrl *toy_rate_ctrl =
+      static_cast<const ToyRateCtrl *>(rate_ctrl_model);
+  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+  EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort);
+  EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
+
+  *rdmult = VPX_DEFAULT_RDMULT;
+  return VPX_RC_OK;
+}
+
 vpx_rc_status_t rc_delete_model(vpx_rc_model_t rate_ctrl_model) {
   ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
   EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
@@ -880,4 +894,71 @@ TEST_F(ExtRateCtrlTestGOPShortNoARF, EncodeTest) {
   ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
 }
 
+class ExtRateCtrlTestRdmult : public ::libvpx_test::EncoderTest,
+                              public ::testing::Test {
+ protected:
+  ExtRateCtrlTestRdmult() : EncoderTest(&::libvpx_test::kVP9) {}
+
+  ~ExtRateCtrlTestRdmult() override = default;
+
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kTwoPassGood);
+  }
+
+  void BeginPassHook(unsigned int) override {
+    psnr_ = 0.0;
+    nframes_ = 0;
+  }
+
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      vpx_rc_funcs_t rc_funcs;
+      rc_funcs.rc_type = VPX_RC_GOP_QP_RDMULT;
+      rc_funcs.create_model = rc_create_model_gop_short;
+      rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short;
+      rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop_short;
+      rc_funcs.get_gop_decision = rc_get_gop_decision_short;
+      rc_funcs.update_encodeframe_result =
+          rc_update_encodeframe_result_gop_short;
+      rc_funcs.get_frame_rdmult = rc_get_default_frame_rdmult;
+      rc_funcs.delete_model = rc_delete_model;
+      rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
+      encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
+    }
+  }
+
+  double GetAveragePsnr() const {
+    if (nframes_) return psnr_ / nframes_;
+    return 0.0;
+  }
+
+ private:
+  double psnr_;
+  unsigned int nframes_;
+};
+
+TEST_F(ExtRateCtrlTestRdmult, DefaultRdmult) {
+  cfg_.rc_target_bitrate = 500;
+  cfg_.g_lag_in_frames = kMaxLagInFrames - 1;
+  cfg_.rc_end_usage = VPX_VBR;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  std::unique_ptr<libvpx_test::VideoSource> video;
+  video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
+      kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort));
+
+  ASSERT_NE(video, nullptr);
+ 
ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + + const double psnr = GetAveragePsnr(); + EXPECT_GT(psnr, kPsnrThreshold); +} + } // namespace diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 87c5d7b67f..5cfd846dd0 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -5515,6 +5515,32 @@ static void encode_frame_to_data_rate( save_encode_params(cpi); } #endif + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0) { + vpx_codec_err_t codec_status; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; + const int ref_frame_flags = get_ref_frame_flags(cpi); + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES]; + const RefCntBuffer *curr_frame_buf = get_ref_cnt_buffer(cm, cm->new_fb_idx); + // index 0 of a gf group is always KEY/OVERLAY/GOLDEN. + // index 1 refers to the first encoding frame in a gf group. + // Therefore if it is ARF_UPDATE, it means this gf group uses alt ref. + // See function define_gf_group_structure(). + const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE; + int ext_rdmult = VPX_DEFAULT_RDMULT; + get_ref_frame_bufs(cpi, ref_frame_bufs); + codec_status = vp9_extrc_get_frame_rdmult( + &cpi->ext_ratectrl, curr_frame_buf->frame_index, + cm->current_frame_coding_index, gf_group->index, update_type, + gf_group->gf_group_size, use_alt_ref, ref_frame_bufs, ref_frame_flags, + &ext_rdmult); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_get_frame_rdmult() failed"); + } + cpi->ext_ratectrl.ext_rdmult = ext_rdmult; + } if (cpi->sf.recode_loop == DISALLOW_RECODE) { if (!encode_without_recode_loop(cpi, size, dest)) return; diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index b4ee574ff1..1d440442b5 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -230,3 +230,32 @@ vpx_codec_err_t vp9_extrc_get_gop_decision( } return VPX_CODEC_OK; } + +vpx_codec_err_t vp9_extrc_get_frame_rdmult( + EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, + FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref, + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, + int *rdmult) { + vpx_rc_status_t rc_status; + vpx_rc_encodeframe_info_t encode_frame_info; + if (ext_ratectrl == NULL || !ext_ratectrl->ready || + (ext_ratectrl->funcs.rc_type & VPX_RC_RDMULT) == 0) { + return VPX_CODEC_INVALID_PARAM; + } + encode_frame_info.show_index = show_index; + encode_frame_info.coding_index = coding_index; + encode_frame_info.gop_index = gop_index; + encode_frame_info.frame_type = extrc_get_frame_type(update_type); + encode_frame_info.gop_size = gop_size; + encode_frame_info.use_alt_ref = use_alt_ref; + + vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs, + encode_frame_info.ref_frame_coding_indexes, + encode_frame_info.ref_frame_valid_list); + rc_status = ext_ratectrl->funcs.get_frame_rdmult(ext_ratectrl->model, + &encode_frame_info, rdmult); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } + return VPX_CODEC_OK; +} diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index b8f3d0c834..7c38758833 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -16,6 +16,7 @@ typedef struct EXT_RATECTRL { int ready; + int ext_rdmult; vpx_rc_model_t model; vpx_rc_funcs_t funcs; vpx_rc_config_t ratectrl_config; @@ -49,4 
+50,10 @@ vpx_codec_err_t vp9_extrc_get_gop_decision( EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info, vpx_rc_gop_decision_t *gop_decision); +vpx_codec_err_t vp9_extrc_get_frame_rdmult( + EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, + FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref, + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, + int *rdmult); + #endif // VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_ diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 28f992f4b6..58dd75b441 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -244,6 +244,12 @@ int vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) { // largest dc_quant is 21387, therefore rdmult should fit in int32_t int rdmult = q * q; + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0 && + cpi->ext_ratectrl.ext_rdmult != VPX_DEFAULT_RDMULT) { + return cpi->ext_ratectrl.ext_rdmult; + } + // Make sure this function is floating point safe. vpx_clear_system_state(); @@ -287,6 +293,11 @@ static int modulate_rdmult(const VP9_COMP *cpi, int rdmult) { int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) { int rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, qindex); + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0 && + cpi->ext_ratectrl.ext_rdmult != VPX_DEFAULT_RDMULT) { + return cpi->ext_ratectrl.ext_rdmult; + } return modulate_rdmult(cpi, rdmult); } diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index 95b883413e..3c5fc8cfc3 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -25,20 +25,26 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures. */ -#define VPX_EXT_RATECTRL_ABI_VERSION (5) +#define VPX_EXT_RATECTRL_ABI_VERSION (6) /*!\brief The control type of the inference API. * In VPX_RC_QP mode, the external rate control model determines the * quantization parameter (QP) for each frame. * In VPX_RC_GOP mode, the external rate control model determines the * group of picture (GOP) of the video sequence. + * In VPX_RC_RDMULT mode, the external rate control model determines the + * rate-distortion multiplier (rdmult) for the current frame. * In VPX_RC_GOP_QP mode, the external rate control model determines * both the QP and the GOP. + * In VPX_RC_GOP_QP_RDMULT mode, the external rate control model determines + * the QP, GOP and the rdmult. */ typedef enum vpx_rc_type { VPX_RC_QP = 1 << 0, VPX_RC_GOP = 1 << 1, - VPX_RC_GOP_QP = VPX_RC_QP | VPX_RC_GOP + VPX_RC_RDMULT = 1 << 2, + VPX_RC_GOP_QP = VPX_RC_QP | VPX_RC_GOP, + VPX_RC_GOP_QP_RDMULT = VPX_RC_QP | VPX_RC_GOP | VPX_RC_RDMULT } vpx_rc_type_t; /*!\brief Abstract rate control model handler @@ -55,6 +61,13 @@ typedef void *vpx_rc_model_t; */ #define VPX_DEFAULT_Q -1 +/*!\brief A reserved value for the rdmult. + * If the external rate control model returns this value, + * the encoder will use the default rdmult selected by libvpx's rate control + * system. + */ +#define VPX_DEFAULT_RDMULT -1 + /*!\brief Encode frame decision made by the external rate control model * * The encoder will receive the decision from the external rate control model @@ -432,6 +445,19 @@ typedef vpx_rc_status_t (*vpx_rc_get_gop_decision_cb_fn_t)( vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info, vpx_rc_gop_decision_t *gop_decision); +/*!\brief Get the frame rdmult from the external rate control model. 
+ *
+ * This callback is invoked by the encoder to get rdmult from
+ * the external rate control model.
+ *
+ * \param[in]  rate_ctrl_model  rate control model
+ * \param[in]  frame_info       information collected from the encoder
+ * \param[out] rdmult           frame rate-distortion multiplier from the model
+ */
+typedef vpx_rc_status_t (*vpx_rc_get_frame_rdmult_cb_fn_t)(
+    vpx_rc_model_t rate_ctrl_model, const vpx_rc_encodeframe_info_t *frame_info,
+    int *rdmult);
+
 /*!\brief Delete the external rate control model callback prototype
  *
  * This callback is invoked by the encoder to delete the external rate control
@@ -473,6 +499,10 @@ typedef struct vpx_rc_funcs {
    * Get GOP decisions from the external rate control model.
    */
   vpx_rc_get_gop_decision_cb_fn_t get_gop_decision;
+  /*!
+   * Get rdmult for the frame from the external rate control model.
+   */
+  vpx_rc_get_frame_rdmult_cb_fn_t get_frame_rdmult;
   /*!
    * Delete the external rate control model.
    */

From 1450ec46e273b32234b036d7803aaae09423dd08 Mon Sep 17 00:00:00 2001
From: Scott LaVarnway
Date: Tue, 6 Dec 2022 14:18:03 -0800
Subject: [PATCH 479/926] Add vpx highbd subtract test.

Change-Id: I069ae0fe22bfc82ad5083df85a7fdf9058a285eb
---
 test/vp9_subtract_test.cc | 148 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 148 insertions(+)

diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc
index f634a032d8..7c69a317cc 100644
--- a/test/vp9_subtract_test.cc
+++ b/test/vp9_subtract_test.cc
@@ -7,6 +7,7 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include <tuple>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
@@ -17,9 +18,11 @@
 #include "test/bench.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
+#include "test/util.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vpx_ports/msvc.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_timer.h"
 
 typedef void (*SubtractFunc)(int rows, int cols, int16_t *diff_ptr,
                              ptrdiff_t diff_stride, const uint8_t *src_ptr,
@@ -161,4 +164,149 @@ INSTANTIATE_TEST_SUITE_P(LSX, VP9SubtractBlockTest,
                          ::testing::Values(vpx_subtract_block_lsx));
 #endif
 
+#if CONFIG_VP9_HIGHBITDEPTH
+
+typedef void (*HBDSubtractFunc)(int rows, int cols, int16_t *diff_ptr,
+                                ptrdiff_t diff_stride, const uint8_t *src_ptr,
+                                ptrdiff_t src_stride, const uint8_t *pred_ptr,
+                                ptrdiff_t pred_stride, int bd);
+
+// <BLOCK_SIZE, bit depth, optimized subtract func, reference subtract func>
+using Params = std::tuple<BLOCK_SIZE, int, HBDSubtractFunc, HBDSubtractFunc>;
+
+class VPXHBDSubtractBlockTest : public ::testing::TestWithParam<Params> {
+ public:
+  virtual void SetUp() {
+    block_width_ = 4 * num_4x4_blocks_wide_lookup[GET_PARAM(0)];
+    block_height_ = 4 * num_4x4_blocks_high_lookup[GET_PARAM(0)];
+    bit_depth_ = static_cast<vpx_bit_depth_t>(GET_PARAM(1));
+    func_ = GET_PARAM(2);
+    ref_func_ = GET_PARAM(3);
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+
+    constexpr size_t kMaxWidth = 128;
+    constexpr size_t kMaxBlockSize = kMaxWidth * kMaxWidth;
+    src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+        vpx_memalign(16, kMaxBlockSize * sizeof(uint16_t))));
+    ASSERT_NE(src_, nullptr);
+    pred_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+        vpx_memalign(16, kMaxBlockSize * sizeof(uint16_t))));
+    ASSERT_NE(pred_, nullptr);
+    diff_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, kMaxBlockSize * sizeof(int16_t)));
+    ASSERT_NE(diff_, nullptr);
+  }
+
+  virtual void TearDown() {
+    vpx_free(CONVERT_TO_SHORTPTR(src_));
+    vpx_free(CONVERT_TO_SHORTPTR(pred_));
+    vpx_free(diff_);
+  }
+
+ protected:
+  void CheckResult();
+  void RunForSpeed();
+
+ private:
+  ACMRandom rnd_;
+  int block_height_;
+  int block_width_;
+  vpx_bit_depth_t bit_depth_;
+  HBDSubtractFunc func_;
+  HBDSubtractFunc ref_func_;
+  uint8_t *src_;
+  uint8_t *pred_;
+  int16_t *diff_;
+};
+
+void VPXHBDSubtractBlockTest::CheckResult() {
+  constexpr int kTestNum = 100;
+  constexpr int kMaxWidth = 128;
+  constexpr int kMaxBlockSize = kMaxWidth * kMaxWidth;
+  const int mask = (1 << bit_depth_) - 1;
+  for (int i = 0; i < kTestNum; ++i) {
+    for (int j = 0; j < kMaxBlockSize; ++j) {
+      CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
+      CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
+    }
+
+    func_(block_height_, block_width_, diff_, block_width_, src_, block_width_,
+          pred_, block_width_, bit_depth_);
+
+    for (int r = 0; r < block_height_; ++r) {
+      for (int c = 0; c < block_width_; ++c) {
+        EXPECT_EQ(diff_[r * block_width_ + c],
+                  (CONVERT_TO_SHORTPTR(src_)[r * block_width_ + c] -
+                   CONVERT_TO_SHORTPTR(pred_)[r * block_width_ + c]))
+            << "r = " << r << ", c = " << c << ", test: " << i;
+      }
+    }
+  }
+}
+
+TEST_P(VPXHBDSubtractBlockTest, CheckResult) { CheckResult(); }
+
+void VPXHBDSubtractBlockTest::RunForSpeed() {
+  constexpr int kTestNum = 200000;
+  constexpr int kMaxWidth = 128;
+  constexpr int kMaxBlockSize = kMaxWidth * kMaxWidth;
+  const int mask = (1 << bit_depth_) - 1;
+
+  if (ref_func_ == func_) GTEST_SKIP();
+
+  for (int j = 0; j < kMaxBlockSize; ++j) {
+    CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
+    CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
+  }
+
+  vpx_usec_timer ref_timer;
+  vpx_usec_timer_start(&ref_timer);
+  for (int i = 0; i < kTestNum; ++i) {
+    ref_func_(block_height_, block_width_, diff_, block_width_, src_,
+              block_width_, pred_, block_width_, bit_depth_);
+  }
+  vpx_usec_timer_mark(&ref_timer);
+  const int64_t ref_elapsed_time = vpx_usec_timer_elapsed(&ref_timer);
+
+  for (int j = 0; j < kMaxBlockSize; ++j) {
+    CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
+    CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
+  }
+
+  vpx_usec_timer timer;
+  vpx_usec_timer_start(&timer);
+  for (int i = 0; i < kTestNum; ++i) {
+    func_(block_height_, block_width_, diff_, block_width_, src_, block_width_,
+          pred_, block_width_, bit_depth_);
+  }
+  vpx_usec_timer_mark(&timer);
+  const int64_t elapsed_time = vpx_usec_timer_elapsed(&timer);
+
+  printf(
+      "[%dx%d]: "
+      "ref_time=%6" PRId64 " \t simd_time=%6" PRId64
+      " \t "
+      "gain=%f \n",
+      block_width_, block_height_, ref_elapsed_time, elapsed_time,
+      static_cast<double>(ref_elapsed_time) /
+          static_cast<double>(elapsed_time));
+}
+
+TEST_P(VPXHBDSubtractBlockTest, DISABLED_Speed) { RunForSpeed(); }
+
+const BLOCK_SIZE kValidBlockSize[] = { BLOCK_4X4,   BLOCK_4X8,   BLOCK_8X4,
+                                       BLOCK_8X8,   BLOCK_8X16,  BLOCK_16X8,
+                                       BLOCK_16X16, BLOCK_16X32, BLOCK_32X16,
+                                       BLOCK_32X32, BLOCK_32X64, BLOCK_64X32,
+                                       BLOCK_64X64 };
+
+INSTANTIATE_TEST_SUITE_P(
+    C, VPXHBDSubtractBlockTest,
+    ::testing::Combine(::testing::ValuesIn(kValidBlockSize),
+                       ::testing::Values(12),
+                       ::testing::Values(&vpx_highbd_subtract_block_c),
+                       ::testing::Values(&vpx_highbd_subtract_block_c)));
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace vp9

From a7bb04b43598cb2dcefea2352a1fde4dbd269fe5 Mon Sep 17 00:00:00 2001
From: Scott LaVarnway
Date: Tue, 6 Dec 2022 13:13:30 -0800
Subject: [PATCH 480/926] [x86]: Add vpx_highbd_subtract_block_avx2().

Up to 4x faster than "sse2 vectorized C".
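For reference, the per-pixel operation being vectorized is a plain widening
difference over 16-bit samples (a sketch mirroring the C reference
vpx_highbd_subtract_block_c; variable names are illustrative):

    for (int r = 0; r < rows; ++r) {
      for (int c = 0; c < cols; ++c) {
        diff_ptr[r * diff_stride + c] =
            (int16_t)(src[r * src_stride + c] - pred[r * pred_stride + c]);
      }
    }

where src and pred are the CONVERT_TO_SHORTPTR() views of the high
bit-depth buffers; the AVX2 path below computes the same differences with
_mm256_sub_epi16 on 16 pixels per register, specializing on block width.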
Change-Id: Ie9b3c12a437c5cddf92c4d5349c4f659ca6b82ea --- test/vp9_subtract_test.cc | 9 +++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- vpx_dsp/x86/subtract_avx2.c | 107 +++++++++++++++++++++++++++++++++++ 3 files changed, 117 insertions(+), 1 deletion(-) diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc index 7c69a317cc..a57082f1eb 100644 --- a/test/vp9_subtract_test.cc +++ b/test/vp9_subtract_test.cc @@ -308,5 +308,14 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(&vpx_highbd_subtract_block_c), ::testing::Values(&vpx_highbd_subtract_block_c))); +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, VPXHBDSubtractBlockTest, + ::testing::Combine(::testing::ValuesIn(kValidBlockSize), + ::testing::Values(12), + ::testing::Values(&vpx_highbd_subtract_block_avx2), + ::testing::Values(&vpx_highbd_subtract_block_c))); +#endif // HAVE_AVX2 + #endif // CONFIG_VP9_HIGHBITDEPTH } // namespace vp9 diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 51f5ebedd6..b6d656820f 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -939,7 +939,7 @@ () # Block subtraction # add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd"; - specialize qw/vpx_highbd_subtract_block neon/; + specialize qw/vpx_highbd_subtract_block neon avx2/; # # Single block SAD diff --git a/vpx_dsp/x86/subtract_avx2.c b/vpx_dsp/x86/subtract_avx2.c index 4d259ef5c5..4849581ed4 100644 --- a/vpx_dsp/x86/subtract_avx2.c +++ b/vpx_dsp/x86/subtract_avx2.c @@ -94,3 +94,110 @@ void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, break; } } + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, + const uint8_t *src8_ptr, + ptrdiff_t src_stride, + const uint8_t *pred8_ptr, + ptrdiff_t pred_stride, int bd) { + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(pred8_ptr); + (void)bd; + if (cols == 64) { + int j = rows; + do { + const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr); + const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16)); + const __m256i s2 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 32)); + const __m256i s3 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 48)); + const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr); + const __m256i p1 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 16)); + const __m256i p2 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 32)); + const __m256i p3 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 48)); + const __m256i d0 = _mm256_sub_epi16(s0, p0); + const __m256i d1 = _mm256_sub_epi16(s1, p1); + const __m256i d2 = _mm256_sub_epi16(s2, p2); + const __m256i d3 = _mm256_sub_epi16(s3, p3); + _mm256_storeu_si256((__m256i *)diff_ptr, d0); + _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1); + _mm256_storeu_si256((__m256i *)(diff_ptr + 32), d2); + _mm256_storeu_si256((__m256i *)(diff_ptr + 48), d3); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } while (--j != 0); + } else if (cols == 32) { + int j = rows; + do { + const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr); + const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16)); + const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr); + const __m256i p1 = _mm256_lddqu_si256((const __m256i 
*)(pred_ptr + 16)); + const __m256i d0 = _mm256_sub_epi16(s0, p0); + const __m256i d1 = _mm256_sub_epi16(s1, p1); + _mm256_storeu_si256((__m256i *)diff_ptr, d0); + _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } while (--j != 0); + } else if (cols == 16) { + int j = rows; + do { + const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr); + const __m256i s1 = + _mm256_lddqu_si256((const __m256i *)(src_ptr + src_stride)); + const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr); + const __m256i p1 = + _mm256_lddqu_si256((const __m256i *)(pred_ptr + pred_stride)); + const __m256i d0 = _mm256_sub_epi16(s0, p0); + const __m256i d1 = _mm256_sub_epi16(s1, p1); + _mm256_storeu_si256((__m256i *)diff_ptr, d0); + _mm256_storeu_si256((__m256i *)(diff_ptr + diff_stride), d1); + src_ptr += src_stride << 1; + pred_ptr += pred_stride << 1; + diff_ptr += diff_stride << 1; + j -= 2; + } while (j != 0); + } else if (cols == 8) { + int j = rows; + do { + const __m128i s0 = _mm_lddqu_si128((const __m128i *)src_ptr); + const __m128i s1 = + _mm_lddqu_si128((const __m128i *)(src_ptr + src_stride)); + const __m128i p0 = _mm_lddqu_si128((const __m128i *)pred_ptr); + const __m128i p1 = + _mm_lddqu_si128((const __m128i *)(pred_ptr + pred_stride)); + const __m128i d0 = _mm_sub_epi16(s0, p0); + const __m128i d1 = _mm_sub_epi16(s1, p1); + _mm_storeu_si128((__m128i *)diff_ptr, d0); + _mm_storeu_si128((__m128i *)(diff_ptr + diff_stride), d1); + src_ptr += src_stride << 1; + pred_ptr += pred_stride << 1; + diff_ptr += diff_stride << 1; + j -= 2; + } while (j != 0); + } else { + int j = rows; + assert(cols == 4); + do { + const __m128i s0 = _mm_loadl_epi64((const __m128i *)src_ptr); + const __m128i s1 = + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride)); + const __m128i p0 = _mm_loadl_epi64((const __m128i *)pred_ptr); + const __m128i p1 = + _mm_loadl_epi64((const __m128i *)(pred_ptr + pred_stride)); + const __m128i d0 = _mm_sub_epi16(s0, p0); + const __m128i d1 = _mm_sub_epi16(s1, p1); + _mm_storel_epi64((__m128i *)diff_ptr, d0); + _mm_storel_epi64((__m128i *)(diff_ptr + diff_stride), d1); + src_ptr += src_stride << 1; + pred_ptr += pred_stride << 1; + diff_ptr += diff_stride << 1; + j -= 2; + } while (j != 0); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH From 89b8032ff5cbe1d0043587f2df7f5a5e858e6fdb Mon Sep 17 00:00:00 2001 From: Anton Venema Date: Tue, 13 Dec 2022 10:27:37 -0800 Subject: [PATCH 481/926] Add additional ARM targets for Visual Studio. 
configure: Add an armv7-win32-vs16 target
configure: Add an armv7-win32-vs17 target
configure: Add an arm64-win64-vs16 target
configure: Add an arm64-win64-vs17 target

Change-Id: I11d6cd6e51f7703939d6fd3fc6a7469591e3b09d
---
 AUTHORS   | 1 +
 configure | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/AUTHORS b/AUTHORS
index fffda63360..536e0e7cf0 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -23,6 +23,7 @@ Andrew Lewis
 Andrew Russell
 Angie Chen
 Angie Chiang
+Anton Venema
 Aron Rosenberg
 Attila Nagy
 Birk Magnussen
diff --git a/configure b/configure
index bf92e1ad1f..ae289f77b4 100755
--- a/configure
+++ b/configure
@@ -105,6 +105,8 @@ all_platforms="${all_platforms} arm64-darwin22-gcc"
 all_platforms="${all_platforms} arm64-linux-gcc"
 all_platforms="${all_platforms} arm64-win64-gcc"
 all_platforms="${all_platforms} arm64-win64-vs15"
+all_platforms="${all_platforms} arm64-win64-vs16"
+all_platforms="${all_platforms} arm64-win64-vs17"
 all_platforms="${all_platforms} armv7-android-gcc" #neon Cortex-A8
 all_platforms="${all_platforms} armv7-darwin-gcc"  #neon Cortex-A8
 all_platforms="${all_platforms} armv7-linux-rvct"  #neon Cortex-A8
@@ -113,6 +115,8 @@ all_platforms="${all_platforms} armv7-none-rvct"   #neon Cortex-A8
 all_platforms="${all_platforms} armv7-win32-gcc"
 all_platforms="${all_platforms} armv7-win32-vs14"
 all_platforms="${all_platforms} armv7-win32-vs15"
+all_platforms="${all_platforms} armv7-win32-vs16"
+all_platforms="${all_platforms} armv7-win32-vs17"
 all_platforms="${all_platforms} armv7s-darwin-gcc"
 all_platforms="${all_platforms} armv8-linux-gcc"
 all_platforms="${all_platforms} loongarch32-linux-gcc"

From 55d3184503edf36e01bbefd6a09a602b02f60986 Mon Sep 17 00:00:00 2001
From: Marco Paniconi
Date: Wed, 7 Dec 2022 00:17:22 -0800
Subject: [PATCH 482/926] rc-svc: Add tests for dynamic svc in external RC

Test to verify RC for going down and back up in spatial layers.
Going back up has an issue, so a TODO was added.

Make the test more flexible to handle dynamic layers.

Test for dynamic change in temporal layers to follow.

Change-Id: Ic5542f7b274135277429e116f56ba54e682e96a0
---
 test/vp9_ratectrl_rtc_test.cc | 246 +++++++++++++++++++++++++---------
 1 file changed, 186 insertions(+), 60 deletions(-)

diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc
index 931e68c880..1d1a78f43d 100644
--- a/test/vp9_ratectrl_rtc_test.cc
+++ b/test/vp9_ratectrl_rtc_test.cc
@@ -26,7 +26,11 @@ namespace {

 const size_t kNumFrames = 300;

-const int kTemporalId[4] = { 0, 2, 1, 2 };
+const int kTemporalId3Layer[4] = { 0, 2, 1, 2 };
+const int kTemporalId2Layer[2] = { 0, 1 };
+const int kTemporalRateAllocation3Layer[3] = { 50, 70, 100 };
+const int kTemporalRateAllocation2Layer[2] = { 60, 100 };
+const int kSpatialLayerBitrate[3] = { 200, 400, 1000 };

 class RcInterfaceTest
     : public ::libvpx_test::EncoderTest,
@@ -183,19 +187,69 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest,
         video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME;
     encoder_exit_ = video->frame() == kNumFrames;
     current_superframe_ = video->frame();
+    if (dynamic_spatial_layers_ == 1) {
+      if (video->frame() == 100) {
+        // Go down to 2 spatial layers: set top SL to 0 bitrate.
+        // Update the encoder config.
+        cfg_.rc_target_bitrate -= cfg_.layer_target_bitrate[8];
+        cfg_.layer_target_bitrate[6] = 0;
+        cfg_.layer_target_bitrate[7] = 0;
+        cfg_.layer_target_bitrate[8] = 0;
+        encoder->Config(&cfg_);
+        // Update the RC config.
+        rc_cfg_.target_bandwidth -= rc_cfg_.layer_target_bitrate[8];
+        rc_cfg_.layer_target_bitrate[6] = 0;
+        rc_cfg_.layer_target_bitrate[7] = 0;
+        rc_cfg_.layer_target_bitrate[8] = 0;
+        rc_api_->UpdateRateControl(rc_cfg_);
+      } else if (video->frame() == 200) {
+        // Go down to 1 spatial layer.
+        // Update the encoder config.
+        cfg_.rc_target_bitrate -= cfg_.layer_target_bitrate[5];
+        cfg_.layer_target_bitrate[3] = 0;
+        cfg_.layer_target_bitrate[4] = 0;
+        cfg_.layer_target_bitrate[5] = 0;
+        encoder->Config(&cfg_);
+        // Update the RC config.
+        rc_cfg_.target_bandwidth -= rc_cfg_.layer_target_bitrate[5];
+        rc_cfg_.layer_target_bitrate[3] = 0;
+        rc_cfg_.layer_target_bitrate[4] = 0;
+        rc_cfg_.layer_target_bitrate[5] = 0;
+        rc_api_->UpdateRateControl(rc_cfg_);
+      } else if (0 && video->frame() == 280) {
+        // TODO(marpan): Re-enable this going back up when issue is fixed.
+        // Go back up to 3 spatial layers.
+        // Update the encoder config: use the original bitrates.
+        SetEncoderConfigSvc(3, 3);
+        encoder->Config(&cfg_);
+        // Update the RC config.
+        SetRCConfigSvc(3, 3);
+        rc_api_->UpdateRateControl(rc_cfg_);
+      }
+    }
   }

   virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) {
     ::libvpx_test::CxDataIterator iter = encoder->GetCxData();
+    for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) sizes_[sl] = 0;
     while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) {
       ParseSuperframeSizes(static_cast<const uint8_t *>(pkt->data.frame.buf),
                            pkt->data.frame.sz);
       for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) {
-        frame_params_.spatial_layer_id = sl;
-        frame_params_.temporal_layer_id = kTemporalId[current_superframe_ % 4];
-        rc_api_->ComputeQP(frame_params_);
-        frame_params_.frame_type = INTER_FRAME;
-        rc_api_->PostEncodeUpdate(sizes_[sl]);
+        if (sizes_[sl] > 0) {
+          frame_params_.spatial_layer_id = sl;
+          if (rc_cfg_.ts_number_layers == 3)
+            frame_params_.temporal_layer_id =
+                kTemporalId3Layer[current_superframe_ % 4];
+          else if (rc_cfg_.ts_number_layers == 2)
+            frame_params_.temporal_layer_id =
+                kTemporalId2Layer[current_superframe_ % 2];
+          else
+            frame_params_.temporal_layer_id = 0;
+          rc_api_->ComputeQP(frame_params_);
+          frame_params_.frame_type = INTER_FRAME;
+          rc_api_->PostEncodeUpdate(sizes_[sl]);
+        }
       }
     }
     if (!encoder_exit_) {
@@ -213,11 +267,11 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest,
                                      const vpx_image_t * /*img2*/) {}

   void RunSvc() {
-    SetConfigSvc();
-    // kNumFrames = 300, so no key frames in this test.
+    dynamic_spatial_layers_ = 0;
+    SetRCConfigSvc(3, 3);
     key_interval_ = 10000;
     rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
-    SetEncoderSvc();
+    SetEncoderConfigSvc(3, 3);

     ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
                                          1280, 720, 30, 1, 0, kNumFrames);
@@ -226,11 +280,24 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest,
   }

   void RunSvcPeriodicKey() {
-    SetConfigSvc();
-    // kNumFrames = 300, so 3 key frames in this test.
+ dynamic_spatial_layers_ = 0; + SetRCConfigSvc(3, 3); key_interval_ = 100; rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); - SetEncoderSvc(); + SetEncoderConfigSvc(3, 3); + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunSvcDynamicSpatial() { + dynamic_spatial_layers_ = 1; + SetRCConfigSvc(3, 3); + key_interval_ = 10000; + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + SetEncoderConfigSvc(3, 3); ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", 1280, 720, 30, 1, 0, kNumFrames); @@ -266,17 +333,31 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, return VPX_CODEC_OK; } - void SetEncoderSvc() { - cfg_.ss_number_layers = 3; - cfg_.ts_number_layers = 3; + void SetEncoderConfigSvc(int number_spatial_layers, + int number_temporal_layers) { + cfg_.g_w = 1280; + cfg_.g_h = 720; + cfg_.ss_number_layers = number_spatial_layers; + cfg_.ts_number_layers = number_temporal_layers; cfg_.g_timebase.num = 1; cfg_.g_timebase.den = 30; - svc_params_.scaling_factor_num[0] = 72; - svc_params_.scaling_factor_den[0] = 288; - svc_params_.scaling_factor_num[1] = 144; - svc_params_.scaling_factor_den[1] = 288; - svc_params_.scaling_factor_num[2] = 288; - svc_params_.scaling_factor_den[2] = 288; + if (number_spatial_layers == 3) { + svc_params_.scaling_factor_num[0] = 1; + svc_params_.scaling_factor_den[0] = 4; + svc_params_.scaling_factor_num[1] = 2; + svc_params_.scaling_factor_den[1] = 4; + svc_params_.scaling_factor_num[2] = 4; + svc_params_.scaling_factor_den[2] = 4; + } else if (number_spatial_layers == 2) { + svc_params_.scaling_factor_num[0] = 1; + svc_params_.scaling_factor_den[0] = 2; + svc_params_.scaling_factor_num[1] = 2; + svc_params_.scaling_factor_den[1] = 2; + } else if (number_spatial_layers == 1) { + svc_params_.scaling_factor_num[0] = 1; + svc_params_.scaling_factor_den[0] = 1; + } + for (int i = 0; i < VPX_MAX_LAYERS; ++i) { svc_params_.max_quantizers[i] = 56; svc_params_.min_quantizers[i] = 2; @@ -286,11 +367,20 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, cfg_.rc_end_usage = VPX_CBR; cfg_.g_lag_in_frames = 0; cfg_.g_error_resilient = 0; - // 3 temporal layers - cfg_.ts_rate_decimator[0] = 4; - cfg_.ts_rate_decimator[1] = 2; - cfg_.ts_rate_decimator[2] = 1; - cfg_.temporal_layering_mode = 3; + + if (number_temporal_layers == 3) { + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.temporal_layering_mode = 3; + } else if (number_temporal_layers == 2) { + cfg_.ts_rate_decimator[0] = 2; + cfg_.ts_rate_decimator[1] = 1; + cfg_.temporal_layering_mode = 2; + } else if (number_temporal_layers == 1) { + cfg_.ts_rate_decimator[0] = 1; + cfg_.temporal_layering_mode = 0; + } cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 600; @@ -299,30 +389,39 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, cfg_.rc_max_quantizer = 56; cfg_.g_threads = 1; cfg_.kf_max_dist = 9999; - cfg_.rc_target_bitrate = 1600; cfg_.rc_overshoot_pct = 50; cfg_.rc_undershoot_pct = 50; - cfg_.layer_target_bitrate[0] = 100; - cfg_.layer_target_bitrate[1] = 140; - cfg_.layer_target_bitrate[2] = 200; - cfg_.layer_target_bitrate[3] = 250; - cfg_.layer_target_bitrate[4] = 350; - cfg_.layer_target_bitrate[5] = 500; - cfg_.layer_target_bitrate[6] = 450; - cfg_.layer_target_bitrate[7] = 630; - cfg_.layer_target_bitrate[8] = 900; + cfg_.rc_target_bitrate = 0; + for (int sl = 
0; sl < number_spatial_layers; sl++) { + int spatial_bitrate = 0; + if (number_spatial_layers <= 3) + spatial_bitrate = kSpatialLayerBitrate[sl]; + for (int tl = 0; tl < number_temporal_layers; tl++) { + int layer = sl * number_temporal_layers + tl; + if (number_temporal_layers == 3) + cfg_.layer_target_bitrate[layer] = + kTemporalRateAllocation3Layer[tl] * spatial_bitrate / 100; + else if (number_temporal_layers == 2) + cfg_.layer_target_bitrate[layer] = + kTemporalRateAllocation2Layer[tl] * spatial_bitrate / 100; + else if (number_temporal_layers == 1) + cfg_.layer_target_bitrate[layer] = spatial_bitrate; + } + cfg_.rc_target_bitrate += spatial_bitrate; + } cfg_.kf_min_dist = key_interval_; cfg_.kf_max_dist = key_interval_; } - void SetConfigSvc() { + void SetRCConfigSvc(int number_spatial_layers, int number_temporal_layers) { rc_cfg_.width = 1280; rc_cfg_.height = 720; + rc_cfg_.ss_number_layers = number_spatial_layers; + rc_cfg_.ts_number_layers = number_temporal_layers; rc_cfg_.max_quantizer = 56; rc_cfg_.min_quantizer = 2; - rc_cfg_.target_bandwidth = 1600; rc_cfg_.buf_initial_sz = 500; rc_cfg_.buf_optimal_sz = 600; rc_cfg_.buf_sz = 1000; @@ -330,31 +429,55 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, rc_cfg_.overshoot_pct = 50; rc_cfg_.max_intra_bitrate_pct = 900; rc_cfg_.framerate = 30.0; - rc_cfg_.ss_number_layers = 3; - rc_cfg_.ts_number_layers = 3; rc_cfg_.rc_mode = VPX_CBR; rc_cfg_.aq_mode = aq_mode_; - rc_cfg_.scaling_factor_num[0] = 1; - rc_cfg_.scaling_factor_den[0] = 4; - rc_cfg_.scaling_factor_num[1] = 2; - rc_cfg_.scaling_factor_den[1] = 4; - rc_cfg_.scaling_factor_num[2] = 4; - rc_cfg_.scaling_factor_den[2] = 4; - - rc_cfg_.ts_rate_decimator[0] = 4; - rc_cfg_.ts_rate_decimator[1] = 2; - rc_cfg_.ts_rate_decimator[2] = 1; - - rc_cfg_.layer_target_bitrate[0] = 100; - rc_cfg_.layer_target_bitrate[1] = 140; - rc_cfg_.layer_target_bitrate[2] = 200; - rc_cfg_.layer_target_bitrate[3] = 250; - rc_cfg_.layer_target_bitrate[4] = 350; - rc_cfg_.layer_target_bitrate[5] = 500; - rc_cfg_.layer_target_bitrate[6] = 450; - rc_cfg_.layer_target_bitrate[7] = 630; - rc_cfg_.layer_target_bitrate[8] = 900; + if (number_spatial_layers == 3) { + rc_cfg_.scaling_factor_num[0] = 1; + rc_cfg_.scaling_factor_den[0] = 4; + rc_cfg_.scaling_factor_num[1] = 2; + rc_cfg_.scaling_factor_den[1] = 4; + rc_cfg_.scaling_factor_num[2] = 4; + rc_cfg_.scaling_factor_den[2] = 4; + } else if (number_spatial_layers == 2) { + rc_cfg_.scaling_factor_num[0] = 1; + rc_cfg_.scaling_factor_den[0] = 2; + rc_cfg_.scaling_factor_num[1] = 2; + rc_cfg_.scaling_factor_den[1] = 2; + } else if (number_spatial_layers == 1) { + rc_cfg_.scaling_factor_num[0] = 1; + rc_cfg_.scaling_factor_den[0] = 1; + } + + if (number_temporal_layers == 3) { + rc_cfg_.ts_rate_decimator[0] = 4; + rc_cfg_.ts_rate_decimator[1] = 2; + rc_cfg_.ts_rate_decimator[2] = 1; + } else if (number_temporal_layers == 2) { + rc_cfg_.ts_rate_decimator[0] = 2; + rc_cfg_.ts_rate_decimator[1] = 1; + } else if (number_temporal_layers == 1) { + rc_cfg_.ts_rate_decimator[0] = 1; + } + + rc_cfg_.target_bandwidth = 0; + for (int sl = 0; sl < number_spatial_layers; sl++) { + int spatial_bitrate = 0; + if (number_spatial_layers <= 3) + spatial_bitrate = kSpatialLayerBitrate[sl]; + for (int tl = 0; tl < number_temporal_layers; tl++) { + int layer = sl * number_temporal_layers + tl; + if (number_temporal_layers == 3) + rc_cfg_.layer_target_bitrate[layer] = + kTemporalRateAllocation3Layer[tl] * spatial_bitrate / 100; + else if (number_temporal_layers == 2) + 
rc_cfg_.layer_target_bitrate[layer] =
+              kTemporalRateAllocation2Layer[tl] * spatial_bitrate / 100;
+        else if (number_temporal_layers == 1)
+          rc_cfg_.layer_target_bitrate[layer] = spatial_bitrate;
+      }
+      rc_cfg_.target_bandwidth += spatial_bitrate;
+    }

     for (int sl = 0; sl < rc_cfg_.ss_number_layers; ++sl) {
       for (int tl = 0; tl < rc_cfg_.ts_number_layers; ++tl) {
@@ -374,6 +497,7 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest,
   int current_superframe_;
   uint32_t sizes_[8];
   int key_interval_;
+  int dynamic_spatial_layers_;
 };

 TEST_P(RcInterfaceTest, OneLayer) { RunOneLayer(); }
@@ -384,6 +508,8 @@ TEST_P(RcInterfaceSvcTest, Svc) { RunSvc(); }

 TEST_P(RcInterfaceSvcTest, SvcPeriodicKey) { RunSvcPeriodicKey(); }

+TEST_P(RcInterfaceSvcTest, SvcDynamicSpatial) { RunSvcDynamicSpatial(); }
+
 VP9_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0, 3),
                            ::testing::Values(VPX_CBR, VPX_VBR));
 VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0, 3));

From 883863001652627f47dc1ecc6e42294687c8785b Mon Sep 17 00:00:00 2001
From: Scott LaVarnway
Date: Fri, 16 Dec 2022 10:21:00 -0800
Subject: [PATCH 483/926] Add vpx_highbd_comp_avg_pred_c() test.

Change-Id: I6b2c3379c49a62e56e5ac56fd4782a50b3c4e12a
---
 test/comp_avg_pred_test.cc | 163 ++++++++++++++++++++++++++-----------
 1 file changed, 115 insertions(+), 48 deletions(-)

diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc
index 3977a2d0b5..66dc4eb4e1 100644
--- a/test/comp_avg_pred_test.cc
+++ b/test/comp_avg_pred_test.cc
@@ -22,13 +22,14 @@ namespace {

 using ::libvpx_test::ACMRandom;
 using ::libvpx_test::Buffer;

-typedef void (*AvgPredFunc)(uint8_t *a, const uint8_t *b, int w, int h,
-                            const uint8_t *c, int c_stride);
-
-uint8_t avg_with_rounding(uint8_t a, uint8_t b) { return (a + b + 1) >> 1; }
+template <typename Pixel>
+Pixel avg_with_rounding(Pixel a, Pixel b) {
+  return (a + b + 1) >> 1;
+}

-void reference_pred(const Buffer<uint8_t> &pred, const Buffer<uint8_t> &ref,
-                    int width, int height, Buffer<uint8_t> *avg) {
+template <typename Pixel>
+void reference_pred(const Buffer<Pixel> &pred, const Buffer<Pixel> &ref,
+                    int width, int height, Buffer<Pixel> *avg) {
   ASSERT_NE(avg->TopLeftPixel(), nullptr);
   ASSERT_NE(pred.TopLeftPixel(), nullptr);
   ASSERT_NE(ref.TopLeftPixel(), nullptr);
@@ -36,12 +37,16 @@ void reference_pred(const Buffer<uint8_t> &pred, const Buffer<uint8_t> &ref,
   for (int y = 0; y < height; ++y) {
     for (int x = 0; x < width; ++x) {
       avg->TopLeftPixel()[y * avg->stride() + x] =
-          avg_with_rounding(pred.TopLeftPixel()[y * pred.stride() + x],
-                            ref.TopLeftPixel()[y * ref.stride() + x]);
+          avg_with_rounding<Pixel>(pred.TopLeftPixel()[y * pred.stride() + x],
+                                   ref.TopLeftPixel()[y * ref.stride() + x]);
     }
   }
 }

+using AvgPredFunc = void (*)(uint8_t *a, const uint8_t *b, int w, int h,
+                             const uint8_t *c, int c_stride);
+
+template <int bitdepth, typename Pixel>
 class AvgPredTest : public ::testing::TestWithParam<AvgPredFunc> {
  public:
   virtual void SetUp() {
@@ -49,15 +54,19 @@ class AvgPredTest : public ::testing::TestWithParam<AvgPredFunc> {
     rnd_.Reset(ACMRandom::DeterministicSeed());
   }

+  void TestSizeCombinations();
+  void TestCompareReferenceRandom();
+  void TestSpeed();
+
  protected:
   AvgPredFunc avg_pred_func_;
   ACMRandom rnd_;
 };

-TEST_P(AvgPredTest, SizeCombinations) {
+template <int bitdepth, typename Pixel>
+void AvgPredTest<bitdepth, Pixel>::TestSizeCombinations() {
   // This is called as part of the sub pixel variance. As such it must be one of
   // the variance block sizes.
-
   for (int width_pow = 2; width_pow <= 6; ++width_pow) {
     for (int height_pow = width_pow - 1; height_pow <= width_pow + 1;
          ++height_pow) {
@@ -70,23 +79,30 @@
         const int width = 1 << width_pow;
         const int height = 1 << height_pow;
         // Only the reference buffer may have a stride not equal to width.
-        Buffer<uint8_t> ref =
-            Buffer<uint8_t>(width, height, ref_padding ? 8 : 0);
+        Buffer<Pixel> ref = Buffer<Pixel>(width, height, ref_padding ? 8 : 0);
         ASSERT_TRUE(ref.Init());
-        Buffer<uint8_t> pred = Buffer<uint8_t>(width, height, 0, 16);
+        Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 16);
         ASSERT_TRUE(pred.Init());
-        Buffer<uint8_t> avg_ref = Buffer<uint8_t>(width, height, 0, 16);
+        Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 16);
         ASSERT_TRUE(avg_ref.Init());
-        Buffer<uint8_t> avg_chk = Buffer<uint8_t>(width, height, 0, 16);
+        Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 16);
         ASSERT_TRUE(avg_chk.Init());
+        const int bitdepth_mask = (1 << bitdepth) - 1;
+        for (int h = 0; h < height; ++h) {
+          for (int w = 0; w < width; ++w) {
+            ref.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+          }
+        }
+        for (int h = 0; h < height; ++h) {
+          for (int w = 0; w < width; ++w) {
+            pred.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+          }
+        }

-        ref.Set(&rnd_, &ACMRandom::Rand8);
-        pred.Set(&rnd_, &ACMRandom::Rand8);
-
-        reference_pred(pred, ref, width, height, &avg_ref);
-        ASM_REGISTER_STATE_CHECK(
-            avg_pred_func_(avg_chk.TopLeftPixel(), pred.TopLeftPixel(), width,
-                           height, ref.TopLeftPixel(), ref.stride()));
+        reference_pred<Pixel>(pred, ref, width, height, &avg_ref);
+        ASM_REGISTER_STATE_CHECK(avg_pred_func_(
+            (uint8_t *)avg_chk.TopLeftPixel(), (uint8_t *)pred.TopLeftPixel(),
+            width, height, (uint8_t *)ref.TopLeftPixel(), ref.stride()));

         EXPECT_TRUE(avg_chk.CheckValues(avg_ref));
         if (HasFailure()) {
@@ -99,26 +115,36 @@
   }
 }

-TEST_P(AvgPredTest, CompareReferenceRandom) {
+template <int bitdepth, typename Pixel>
+void AvgPredTest<bitdepth, Pixel>::TestCompareReferenceRandom() {
   const int width = 64;
   const int height = 32;
-  Buffer<uint8_t> ref = Buffer<uint8_t>(width, height, 8);
+  Buffer<Pixel> ref = Buffer<Pixel>(width, height, 8);
   ASSERT_TRUE(ref.Init());
-  Buffer<uint8_t> pred = Buffer<uint8_t>(width, height, 0, 16);
+  Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 16);
   ASSERT_TRUE(pred.Init());
-  Buffer<uint8_t> avg_ref = Buffer<uint8_t>(width, height, 0, 16);
+  Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 16);
   ASSERT_TRUE(avg_ref.Init());
-  Buffer<uint8_t> avg_chk = Buffer<uint8_t>(width, height, 0, 16);
+  Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 16);
   ASSERT_TRUE(avg_chk.Init());

   for (int i = 0; i < 500; ++i) {
-    ref.Set(&rnd_, &ACMRandom::Rand8);
-    pred.Set(&rnd_, &ACMRandom::Rand8);
+    const int bitdepth_mask = (1 << bitdepth) - 1;
+    for (int h = 0; h < height; ++h) {
+      for (int w = 0; w < width; ++w) {
+        ref.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+      }
+    }
+    for (int h = 0; h < height; ++h) {
+      for (int w = 0; w < width; ++w) {
+        pred.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+      }
+    }

-    reference_pred(pred, ref, width, height, &avg_ref);
-    ASM_REGISTER_STATE_CHECK(avg_pred_func_(avg_chk.TopLeftPixel(),
-                                            pred.TopLeftPixel(), width, height,
-                                            ref.TopLeftPixel(), ref.stride()));
+    reference_pred<Pixel>(pred, ref, width, height, &avg_ref);
+    ASM_REGISTER_STATE_CHECK(avg_pred_func_(
+        (uint8_t *)avg_chk.TopLeftPixel(), (uint8_t *)pred.TopLeftPixel(),
+        width, height, (uint8_t *)ref.TopLeftPixel(), ref.stride()));
     EXPECT_TRUE(avg_chk.CheckValues(avg_ref));
     if (HasFailure()) {
       printf("Width: %d Height: %d\n", width, height);
@@ -128,7 +154,8 @@
   }
 }

-TEST_P(AvgPredTest, DISABLED_Speed) {
+template <int bitdepth, typename Pixel>
+void AvgPredTest<bitdepth, Pixel>::TestSpeed() {
   for (int width_pow = 2; width_pow <= 6; ++width_pow) {
     for (int height_pow = width_pow - 1; height_pow <= width_pow + 1;
          ++height_pow) {
@@ -138,22 +165,30 @@
       for (int ref_padding = 0; ref_padding < 2; ref_padding++) {
         const int width = 1 << width_pow;
         const int height = 1 << height_pow;
-        Buffer<uint8_t> ref =
-            Buffer<uint8_t>(width, height, ref_padding ? 8 : 0);
+        Buffer<Pixel> ref = Buffer<Pixel>(width, height, ref_padding ? 8 : 0);
         ASSERT_TRUE(ref.Init());
-        Buffer<uint8_t> pred = Buffer<uint8_t>(width, height, 0, 16);
+        Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 16);
         ASSERT_TRUE(pred.Init());
-        Buffer<uint8_t> avg = Buffer<uint8_t>(width, height, 0, 16);
+        Buffer<Pixel> avg = Buffer<Pixel>(width, height, 0, 16);
         ASSERT_TRUE(avg.Init());
-
-        ref.Set(&rnd_, &ACMRandom::Rand8);
-        pred.Set(&rnd_, &ACMRandom::Rand8);
+        const int bitdepth_mask = (1 << bitdepth) - 1;
+        for (int h = 0; h < height; ++h) {
+          for (int w = 0; w < width; ++w) {
+            ref.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+          }
+        }
+        for (int h = 0; h < height; ++h) {
+          for (int w = 0; w < width; ++w) {
+            pred.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+          }
+        }

         vpx_usec_timer timer;
         vpx_usec_timer_start(&timer);
         for (int i = 0; i < 10000000 / (width * height); ++i) {
-          avg_pred_func_(avg.TopLeftPixel(), pred.TopLeftPixel(), width, height,
-                         ref.TopLeftPixel(), ref.stride());
+          avg_pred_func_((uint8_t *)avg.TopLeftPixel(),
+                         (uint8_t *)pred.TopLeftPixel(), width, height,
+                         (uint8_t *)ref.TopLeftPixel(), ref.stride());
         }
         vpx_usec_timer_mark(&timer);
@@ -166,26 +201,58 @@
     }
   }
 }

-INSTANTIATE_TEST_SUITE_P(C, AvgPredTest,
+using AvgPredTestLBD = AvgPredTest<8, uint8_t>;
+
+TEST_P(AvgPredTestLBD, SizeCombinations) { TestSizeCombinations(); }
+
+TEST_P(AvgPredTestLBD, CompareReferenceRandom) { TestCompareReferenceRandom(); }
+
+TEST_P(AvgPredTestLBD, DISABLED_Speed) { TestSpeed(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AvgPredTestLBD,
                          ::testing::Values(&vpx_comp_avg_pred_c));

 #if HAVE_SSE2
-INSTANTIATE_TEST_SUITE_P(SSE2, AvgPredTest,
+INSTANTIATE_TEST_SUITE_P(SSE2, AvgPredTestLBD,
                          ::testing::Values(&vpx_comp_avg_pred_sse2));
 #endif  // HAVE_SSE2

 #if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(NEON, AvgPredTest,
+INSTANTIATE_TEST_SUITE_P(NEON, AvgPredTestLBD,
                          ::testing::Values(&vpx_comp_avg_pred_neon));
 #endif  // HAVE_NEON

 #if HAVE_VSX
-INSTANTIATE_TEST_SUITE_P(VSX, AvgPredTest,
+INSTANTIATE_TEST_SUITE_P(VSX, AvgPredTestLBD,
                          ::testing::Values(&vpx_comp_avg_pred_vsx));
 #endif  // HAVE_VSX

 #if HAVE_LSX
-INSTANTIATE_TEST_SUITE_P(LSX, AvgPredTest,
+INSTANTIATE_TEST_SUITE_P(LSX, AvgPredTestLBD,
                          ::testing::Values(&vpx_comp_avg_pred_lsx));
 #endif  // HAVE_LSX
+
+#if CONFIG_VP9_HIGHBITDEPTH
+using HighbdAvgPredFunc = void (*)(uint16_t *a, const uint16_t *b, int w, int h,
+                                   const uint16_t *c, int c_stride);
+
+template <HighbdAvgPredFunc fn>
+void highbd_wrapper(uint8_t *a, const uint8_t *b, int w, int h,
+                    const uint8_t *c, int c_stride) {
+  fn((uint16_t *)a, (const uint16_t *)b, w, h, (const uint16_t *)c, c_stride);
+}
+
+using AvgPredTestHBD = AvgPredTest<12, uint16_t>;
+
+TEST_P(AvgPredTestHBD, SizeCombinations) { TestSizeCombinations(); }
+
+TEST_P(AvgPredTestHBD, CompareReferenceRandom) { TestCompareReferenceRandom(); }
+
+TEST_P(AvgPredTestHBD, DISABLED_Speed) { TestSpeed(); }
+
+INSTANTIATE_TEST_SUITE_P(
+    C, AvgPredTestHBD,
+    ::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_c>));
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace

From e022d5b71ffca486b5bc174702a9fe0e35038c75 Mon Sep 17 00:00:00 2001
From: Scott LaVarnway
Date: Tue, 20 Dec 2022 15:43:44 -0800
Subject: [PATCH 484/926] [x86]: Add vpx_highbd_comp_avg_pred_sse2().

C vs SSE2
4x4: 3.38x
8x8: 3.45x
16x16: 2.06x
32x32: 2.19x
64x64: 1.39x

Change-Id: I46638fe187b49a78fee554114fac51c485d74474
---
 test/comp_avg_pred_test.cc         |  8 ++++-
 vpx_dsp/vpx_dsp_rtcd_defs.pl       |  2 +-
 vpx_dsp/x86/highbd_variance_sse2.c | 47 ++++++++++++++++++++++++++++++
 3 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc
index 66dc4eb4e1..70aeab8d7e 100644
--- a/test/comp_avg_pred_test.cc
+++ b/test/comp_avg_pred_test.cc
@@ -185,7 +185,7 @@ void AvgPredTest<bitdepth, Pixel>::TestSpeed() {

       vpx_usec_timer timer;
       vpx_usec_timer_start(&timer);
-      for (int i = 0; i < 10000000 / (width * height); ++i) {
+      for (int i = 0; i < 100000000 / (width * height); ++i) {
         avg_pred_func_((uint8_t *)avg.TopLeftPixel(),
                        (uint8_t *)pred.TopLeftPixel(), width, height,
                        (uint8_t *)ref.TopLeftPixel(), ref.stride());
@@ -254,5 +254,11 @@ INSTANTIATE_TEST_SUITE_P(
     C, AvgPredTestHBD,
     ::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_c>));

+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, AvgPredTestHBD,
+    ::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_sse2>));
+#endif  // HAVE_SSE2
+
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index b6d656820f..8725821b67 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1400,7 +1400,7 @@ ()
   specialize qw/vpx_highbd_12_mse8x8 sse2 neon/;

   add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride";
-  specialize qw/vpx_highbd_comp_avg_pred neon/;
+  specialize qw/vpx_highbd_comp_avg_pred neon sse2/;

   #
   # Subpixel Variance
diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c
index 7c8d79b09e..381e0ad193 100644
--- a/vpx_dsp/x86/highbd_variance_sse2.c
+++ b/vpx_dsp/x86/highbd_variance_sse2.c
@@ -7,6 +7,7 @@
  * in the file PATENTS.  All contributing project authors may
  * be found in the AUTHORS file in the root of the source tree.
 */
+#include <emmintrin.h>  // SSE2

 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
@@ -559,3 +560,49 @@ FNS(sse2)

 #undef FNS
 #undef FN
+
+void vpx_highbd_comp_avg_pred_sse2(uint16_t *comp_pred, const uint16_t *pred,
+                                   int width, int height, const uint16_t *ref,
+                                   int ref_stride) {
+  int i, j;
+  if (width > 8) {
+    for (i = 0; i < height; ++i) {
+      for (j = 0; j < width; j += 16) {
+        const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[j]);
+        const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[j + 8]);
+        const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[j]);
+        const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[j + 8]);
+        _mm_storeu_si128((__m128i *)&comp_pred[j], _mm_avg_epu16(p0, r0));
+        _mm_storeu_si128((__m128i *)&comp_pred[j + 8], _mm_avg_epu16(p1, r1));
+      }
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    }
+  } else if (width == 8) {
+    for (i = 0; i < height; i += 2) {
+      const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[0]);
+      const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[8]);
+      const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[0]);
+      const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[ref_stride]);
+      _mm_storeu_si128((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0));
+      _mm_storeu_si128((__m128i *)&comp_pred[8], _mm_avg_epu16(p1, r1));
+      comp_pred += 8 << 1;
+      pred += 8 << 1;
+      ref += ref_stride << 1;
+    }
+  } else {
+    assert(width == 4);
+    for (i = 0; i < height; i += 2) {
+      const __m128i p0 = _mm_loadl_epi64((const __m128i *)&pred[0]);
+      const __m128i p1 = _mm_loadl_epi64((const __m128i *)&pred[4]);
+      const __m128i r0 = _mm_loadl_epi64((const __m128i *)&ref[0]);
+      const __m128i r1 = _mm_loadl_epi64((const __m128i *)&ref[ref_stride]);
+      _mm_storel_epi64((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0));
+      _mm_storel_epi64((__m128i *)&comp_pred[4], _mm_avg_epu16(p1, r1));
+      comp_pred += 4 << 1;
+      pred += 4 << 1;
+      ref += ref_stride << 1;
+    }
+  }
+}

From 11151943b1877824da6086ea0c89b5617caecb67 Mon Sep 17 00:00:00 2001
From: Jerome Jiang
Date: Wed, 21 Dec 2022 11:13:40 -0500
Subject: [PATCH 485/926] Remove references to deprecated NumPy type aliases

This change replaces references to a number of deprecated NumPy type
aliases (np.bool, np.int, np.float, np.complex, np.object, np.str) with
their recommended replacements (bool, int, float, complex, object, str).

NumPy 1.24 drops the deprecated aliases so we must remove uses before
updating NumPy.
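The substitution is mechanical: wherever the deprecated alias simply named a
Python builtin, the np. prefix is dropped. A small illustration (hypothetical
values, not code from this tree):

    import numpy as np

    # Deprecated alias, rejected by NumPy 1.24:
    #   mask = np.zeros((4, 4), dtype=np.bool)
    # Recommended replacement, using the plain builtin type:
    mask = np.zeros((4, 4), dtype=bool)

Sized types such as np.bool_, np.int64 and np.float32 are real NumPy scalar
types rather than aliases, so they are unaffected by this change.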
Change-Id: I9f5dfcbb11fe6534fce358054f210c7653f278c3 --- tools/3D-Reconstruction/MotionEST/Exhaust.py | 2 +- tools/3D-Reconstruction/MotionEST/GroundTruth.py | 4 ++-- tools/3D-Reconstruction/MotionEST/MotionEST.py | 4 ++-- tools/3D-Reconstruction/MotionEST/Util.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/3D-Reconstruction/MotionEST/Exhaust.py b/tools/3D-Reconstruction/MotionEST/Exhaust.py index 2d6a4d8114..d763de8562 100644 --- a/tools/3D-Reconstruction/MotionEST/Exhaust.py +++ b/tools/3D-Reconstruction/MotionEST/Exhaust.py @@ -83,7 +83,7 @@ def __init__(self, cur_f, ref_f, blk_size, wnd_size, beta, metric=MSE): self.beta = beta self.metric = metric super(ExhaustNeighbor, self).__init__(cur_f, ref_f, blk_size) - self.assign = np.zeros((self.num_row, self.num_col), dtype=np.bool) + self.assign = np.zeros((self.num_row, self.num_col), dtype=bool) """ estimate neighbor loss: diff --git a/tools/3D-Reconstruction/MotionEST/GroundTruth.py b/tools/3D-Reconstruction/MotionEST/GroundTruth.py index 12bc53ff73..37305898a7 100644 --- a/tools/3D-Reconstruction/MotionEST/GroundTruth.py +++ b/tools/3D-Reconstruction/MotionEST/GroundTruth.py @@ -29,7 +29,7 @@ class GroundTruth(MotionEST): def __init__(self, cur_f, ref_f, blk_sz, gt_path, mf=None, mask=None): self.name = 'ground truth' super(GroundTruth, self).__init__(cur_f, ref_f, blk_sz) - self.mask = np.zeros((self.num_row, self.num_col), dtype=np.bool) + self.mask = np.zeros((self.num_row, self.num_col), dtype=bool) if gt_path: with open(gt_path) as gt_file: lines = gt_file.readlines() @@ -42,7 +42,7 @@ def __init__(self, cur_f, ref_f, blk_sz, gt_path, mf=None, mask=None): self.mask[i, -j - 1] = True continue #the order of original file is flipped on the x axis - self.mf[i, -j - 1] = np.array([float(y), -float(x)], dtype=np.int) + self.mf[i, -j - 1] = np.array([float(y), -float(x)], dtype=int) else: self.mf = mf self.mask = mask diff --git a/tools/3D-Reconstruction/MotionEST/MotionEST.py b/tools/3D-Reconstruction/MotionEST/MotionEST.py index 0959530fa0..fc393818d9 100644 --- a/tools/3D-Reconstruction/MotionEST/MotionEST.py +++ b/tools/3D-Reconstruction/MotionEST/MotionEST.py @@ -28,8 +28,8 @@ def __init__(self, cur_f, ref_f, blk_sz): self.ref_f = ref_f self.blk_sz = blk_sz #convert RGB to YUV - self.cur_yuv = np.array(self.cur_f.convert('YCbCr'), dtype=np.int) - self.ref_yuv = np.array(self.ref_f.convert('YCbCr'), dtype=np.int) + self.cur_yuv = np.array(self.cur_f.convert('YCbCr'), dtype=int) + self.ref_yuv = np.array(self.ref_f.convert('YCbCr'), dtype=int) #frame size self.width = self.cur_f.size[0] self.height = self.cur_f.size[1] diff --git a/tools/3D-Reconstruction/MotionEST/Util.py b/tools/3D-Reconstruction/MotionEST/Util.py index 551881cfd7..c2416163be 100644 --- a/tools/3D-Reconstruction/MotionEST/Util.py +++ b/tools/3D-Reconstruction/MotionEST/Util.py @@ -18,7 +18,7 @@ def MSE(blk1, blk2): return np.mean( LA.norm( - np.array(blk1, dtype=np.int) - np.array(blk2, dtype=np.int), axis=2)) + np.array(blk1, dtype=int) - np.array(blk2, dtype=int), axis=2)) def drawMF(img, blk_sz, mf): From ab1192c2907185d59f0044230602ea6025a42844 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 5 Jan 2023 12:20:03 +0000 Subject: [PATCH 486/926] Use lane-referencing intrinsics in Neon convolution kernels The Neon convolution helper functions take a pointer to a filter and load the 8 values into a single Neon register. For some reason, filter values 3 and 4 are then duplicated into their own separate registers. 
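Concretely, the 4-lane helper used to broadcast taps 3 and 4 with an extra
vdup before a plain multiply; the lane-referencing multiply folds that away.
A before/after sketch mirroring the convolve8_4() change in the diff below:

    /* Before: broadcast taps 3 and 4 into dedicated registers first. */
    const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
    const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
    sum = vqadd_s16(sum, vmul_s16(s3, filter3));
    sum = vqadd_s16(sum, vmul_s16(s4, filter4));

    /* After: reference the tap by lane directly in the multiply. */
    sum = vqadd_s16(sum, vmul_lane_s16(s3, vget_low_s16(filters), 3));
    sum = vqadd_s16(sum, vmul_lane_s16(s4, vget_high_s16(filters), 0));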
This patch modifies these helper functions so that they access filter values 3 and 4 via the lane-referencing versions of the various Neon multiply instructions. This reduces register pressure and tidies up the source code quite a bit. Change-Id: Ia4aeee8b46fe218658fb8577dc07ff04a9324b3e --- vpx_dsp/arm/vpx_convolve8_neon.c | 166 +++++++----------------- vpx_dsp/arm/vpx_convolve8_neon.h | 20 +-- vpx_dsp/arm/vpx_scaled_convolve8_neon.c | 9 +- 3 files changed, 59 insertions(+), 136 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index ca5222fa07..28018398a5 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -807,16 +807,13 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, if (h == 4) { uint8x8_t d01, d23; - int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, - d1, d2, d3; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; int16x8_t tt0, tt1, tt2, tt3; __builtin_prefetch(src + 0 * src_stride); __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); - filter3 = vdup_lane_s16(vget_low_s16(filters), 3); - filter4 = vdup_lane_s16(vget_high_s16(filters), 0); load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); @@ -848,14 +845,10 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s9 = vget_low_s16(tt2); s10 = vget_low_s16(tt3); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); @@ -882,8 +875,6 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, w -= 4; } while (w != 0); } else { - const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); - const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); int width; const uint8_t *s; uint8x8_t t4, t5, t6, t7; @@ -926,14 +917,10 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); __builtin_prefetch(src + 7 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); transpose_u8_8x4(&t0, &t1, &t2, &t3); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 0); @@ -1001,22 +988,14 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s13 = 
vreinterpretq_s16_u16(vmovl_u8(t6)); s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); - t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3, - filter4); - t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3, - filter4); - t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3, - filter4); - t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, - filter3, filter4); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); + t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); + t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); + t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7); @@ -1060,8 +1039,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, if (h == 4) { uint8x8_t d01, d23; - int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, - d1, d2, d3; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; int16x8_t tt0, tt1, tt2, tt3; uint32x4_t d0123 = vdupq_n_u32(0); @@ -1069,8 +1047,6 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); - filter3 = vdup_lane_s16(vget_low_s16(filters), 3); - filter4 = vdup_lane_s16(vget_high_s16(filters), 0); load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); @@ -1102,14 +1078,10 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s9 = vget_low_s16(tt2); s10 = vget_low_s16(tt3); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); @@ -1139,8 +1111,6 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, w -= 4; } while (w != 0); } else { - const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); - const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); int width; const uint8_t *s; uint8x8_t t4, t5, t6, t7; @@ -1185,14 +1155,10 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); __builtin_prefetch(src + 7 * 
src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); transpose_u8_8x4(&t0, &t1, &t2, &t3); @@ -1275,22 +1241,14 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); - t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3, - filter4); - t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3, - filter4); - t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3, - filter4); - t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, - filter3, filter4); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); + t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); + t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); + t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); @@ -1348,8 +1306,6 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3 * src_stride; if (w == 4) { - const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3); - const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); uint8x8_t d01, d23; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; @@ -1386,14 +1342,10 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); @@ -1416,8 +1368,6 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h != 0); } else { - const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); - const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); int 
height; const uint8_t *s; uint8_t *d; @@ -1468,14 +1418,10 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); vst1_u8(d, t0); d += dst_stride; @@ -1520,8 +1466,6 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3 * src_stride; if (w == 4) { - const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3); - const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); uint8x8_t d01, d23; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; uint32x4_t d0123 = vdupq_n_u32(0); @@ -1559,14 +1503,10 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); @@ -1597,8 +1537,6 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h != 0); } else { - const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); - const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); int height; const uint8_t *s; uint8_t *d; @@ -1650,14 +1588,10 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); d01 = vcombine_u8(t0, t1); d23 = vcombine_u8(t2, t3); diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index b112cb249a..b8dfce71ea 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -176,9 +176,7 @@ static INLINE int16x4_t 
convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, - const int16x8_t filters, - const int16x4_t filter3, - const int16x4_t filter4) { + const int16x8_t filters) { const int16x4_t filters_lo = vget_low_s16(filters); const int16x4_t filters_hi = vget_high_s16(filters); int16x4_t sum; @@ -189,8 +187,8 @@ static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, sum = vmla_lane_s16(sum, s5, filters_hi, 1); sum = vmla_lane_s16(sum, s6, filters_hi, 2); sum = vmla_lane_s16(sum, s7, filters_hi, 3); - sum = vqadd_s16(sum, vmul_s16(s3, filter3)); - sum = vqadd_s16(sum, vmul_s16(s4, filter4)); + sum = vqadd_s16(sum, vmul_lane_s16(s3, filters_lo, 3)); + sum = vqadd_s16(sum, vmul_lane_s16(s4, filters_hi, 0)); return sum; } @@ -198,9 +196,7 @@ static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, - const int16x8_t filters, - const int16x8_t filter3, - const int16x8_t filter4) { + const int16x8_t filters) { const int16x4_t filters_lo = vget_low_s16(filters); const int16x4_t filters_hi = vget_high_s16(filters); int16x8_t sum; @@ -211,15 +207,13 @@ static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, sum = vmlaq_lane_s16(sum, s5, filters_hi, 1); sum = vmlaq_lane_s16(sum, s6, filters_hi, 2); sum = vmlaq_lane_s16(sum, s7, filters_hi, 3); - sum = vqaddq_s16(sum, vmulq_s16(s3, filter3)); - sum = vqaddq_s16(sum, vmulq_s16(s4, filter4)); + sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filters_lo, 3)); + sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filters_hi, 0)); return vqrshrun_n_s16(sum, 7); } static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, const int16x8_t filters) { - const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); - const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); int16x8_t ss[8]; ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0])); @@ -232,7 +226,7 @@ static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, ss[7] = vreinterpretq_s16_u16(vmovl_u8(s[7])); return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7], - filters, filter3, filter4); + filters); } #endif // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_ diff --git a/vpx_dsp/arm/vpx_scaled_convolve8_neon.c b/vpx_dsp/arm/vpx_scaled_convolve8_neon.c index 8edf8a66e6..8491ca7ac5 100644 --- a/vpx_dsp/arm/vpx_scaled_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_scaled_convolve8_neon.c @@ -38,8 +38,6 @@ static INLINE void scaledconvolve_horiz_w4( const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; if (x_q4 & SUBPEL_MASK) { const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); - const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3); - const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); uint8x8_t s[8], d; int16x8_t ss[4]; int16x4_t t[8], tt; @@ -61,7 +59,7 @@ static INLINE void scaledconvolve_horiz_w4( t[7] = vget_high_s16(ss[3]); tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], - filters, filter3, filter4); + filters); d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0); } else { @@ -167,8 +165,6 @@ static INLINE void scaledconvolve_vert_w4( if (y_q4 & SUBPEL_MASK) { const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); - const int16x4_t filter3 = 
vdup_lane_s16(vget_low_s16(filters), 3); - const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); uint8x8_t s[8], d; int16x4_t t[8], tt; @@ -183,8 +179,7 @@ static INLINE void scaledconvolve_vert_w4( t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6]))); t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7]))); - tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters, - filter3, filter4); + tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters); d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); } else { From 708c4aa8540ec81aa5f0d93edc2e1e4d6d4581ac Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 5 Jan 2023 15:04:53 +0000 Subject: [PATCH 487/926] Use Neon load/store helper functions consistently Define all Neon load/store helper functions in mem_neon.h and use them consistently in Neon convolution functions. Change-Id: I57905bc0a3574c77999cf4f4a73442c3420fa2be --- vp9/encoder/arm/neon/vp9_frame_scale_neon.c | 1 + vpx_dsp/arm/mem_neon.h | 157 +++++++++++++++++ vpx_dsp/arm/vpx_convolve8_neon.c | 177 +++++--------------- vpx_dsp/arm/vpx_convolve8_neon.h | 56 ------- vpx_dsp/arm/vpx_scaled_convolve8_neon.c | 1 + 5 files changed, 198 insertions(+), 194 deletions(-) diff --git a/vp9/encoder/arm/neon/vp9_frame_scale_neon.c b/vp9/encoder/arm/neon/vp9_frame_scale_neon.c index 69b8cfffd7..bc8dd4a341 100644 --- a/vp9/encoder/arm/neon/vp9_frame_scale_neon.c +++ b/vp9/encoder/arm/neon/vp9_frame_scale_neon.c @@ -14,6 +14,7 @@ #include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" #include "vp9/common/vp9_blockd.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/arm/vpx_convolve8_neon.h" #include "vpx_dsp/vpx_filter.h" diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index 84aae161b3..19cfc7c7f2 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -201,4 +201,161 @@ static INLINE void store_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) { buf += stride; vst1_lane_u32((uint32_t *)buf, a_u32, 1); } + +static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); +} + +static INLINE void store_u8_8x4(uint8_t *s, const ptrdiff_t p, + const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2, const uint8x8_t s3) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); + s += p; + vst1_u8(s, s3); +} + +static INLINE void load_u8_16x4(const uint8_t *s, const ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2, uint8x16_t *const s3) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); + s += p; + *s3 = vld1q_u8(s); +} + +static INLINE void store_u8_16x4(uint8_t *s, const ptrdiff_t p, + const uint8x16_t s0, const uint8x16_t s1, + const uint8x16_t s2, const uint8x16_t s3) { + vst1q_u8(s, s0); + s += p; + vst1q_u8(s, s1); + s += p; + vst1q_u8(s, s2); + s += p; + vst1q_u8(s, s3); +} + +static INLINE void load_u8_8x7(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3, + uint8x8_t *const s4, uint8x8_t *const s5, + uint8x8_t *const s6) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); + s += p; + *s4 = 
vld1_u8(s); + s += p; + *s5 = vld1_u8(s); + s += p; + *s6 = vld1_u8(s); +} + +static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3, + uint8x8_t *const s4, uint8x8_t *const s5, + uint8x8_t *const s6, uint8x8_t *const s7) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); + s += p; + *s4 = vld1_u8(s); + s += p; + *s5 = vld1_u8(s); + s += p; + *s6 = vld1_u8(s); + s += p; + *s7 = vld1_u8(s); +} + +static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p, + const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2, const uint8x8_t s3, + const uint8x8_t s4, const uint8x8_t s5, + const uint8x8_t s6, const uint8x8_t s7) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); + s += p; + vst1_u8(s, s3); + s += p; + vst1_u8(s, s4); + s += p; + vst1_u8(s, s5); + s += p; + vst1_u8(s, s6); + s += p; + vst1_u8(s, s7); +} + +static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2, uint8x16_t *const s3, + uint8x16_t *const s4, uint8x16_t *const s5, + uint8x16_t *const s6, uint8x16_t *const s7) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); + s += p; + *s3 = vld1q_u8(s); + s += p; + *s4 = vld1q_u8(s); + s += p; + *s5 = vld1q_u8(s); + s += p; + *s6 = vld1q_u8(s); + s += p; + *s7 = vld1q_u8(s); +} + +static INLINE void store_u8_16x8(uint8_t *s, const ptrdiff_t p, + const uint8x16_t s0, const uint8x16_t s1, + const uint8x16_t s2, const uint8x16_t s3, + const uint8x16_t s4, const uint8x16_t s5, + const uint8x16_t s6, const uint8x16_t s7) { + vst1q_u8(s, s0); + s += p; + vst1q_u8(s, s1); + s += p; + vst1q_u8(s, s2); + s += p; + vst1q_u8(s, s3); + s += p; + vst1q_u8(s, s4); + s += p; + vst1q_u8(s, s5); + s += p; + vst1q_u8(s, s6); + s += p; + vst1q_u8(s, s7); +} + #endif // VPX_VPX_DSP_ARM_MEM_NEON_H_ diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index 28018398a5..dba436b1a0 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -124,33 +124,22 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, int16x8_t t01, t23; uint8x8_t d01, d23; - s0 = vld1q_u8(src); - src += src_stride; - s1 = vld1q_u8(src); - src += src_stride; - s2 = vld1q_u8(src); - src += src_stride; - s3 = vld1q_u8(src); - src += src_stride; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); t0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl); t1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl); t2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl); t3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl); - t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); d01 = vqrshrun_n_s16(t01, 7); d23 = vqrshrun_n_s16(t23, 7); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); - dst += dst_stride; + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; h -= 4; } while (h > 0); } else { 
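/* Editorial sketch, not part of the patch: the helpers collected in
 * mem_neon.h above replace runs of strided vld1_u8()/vst1_u8() calls with a
 * single call each. A minimal, hypothetical example of the pattern (assumes
 * arm_neon.h and vpx_dsp/arm/mem_neon.h are included): */
static INLINE void copy_8x4(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride) {
  uint8x8_t s0, s1, s2, s3;
  /* One strided load pulls in four 8-byte rows... */
  load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);
  /* ...and one strided store writes them back out. */
  store_u8_8x4(dst, dst_stride, s0, s1, s2, s3);
}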
@@ -165,20 +154,14 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s = src; d = dst; do { - s0 = vld1q_u8(s + 0 * src_stride); - s1 = vld1q_u8(s + 1 * src_stride); - s2 = vld1q_u8(s + 2 * src_stride); - s3 = vld1q_u8(s + 3 * src_stride); + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl); d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl); d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl); d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl); - vst1_u8(d + 0 * dst_stride, d0); - vst1_u8(d + 1 * dst_stride, d1); - vst1_u8(d + 2 * dst_stride, d2); - vst1_u8(d + 3 * dst_stride, d3); + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; @@ -221,20 +204,12 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, dd01 = vdup_n_u8(0); dd23 = vdup_n_u8(0); - s0 = vld1q_u8(src); - src += src_stride; - s1 = vld1q_u8(src); - src += src_stride; - s2 = vld1q_u8(src); - src += src_stride; - s3 = vld1q_u8(src); - src += src_stride; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); t0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl); t1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl); t2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl); t3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl); - t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); d01 = vqrshrun_n_s16(t01, 7); @@ -242,17 +217,15 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, dd01 = load_u8(dst + 0 * dst_stride, dst_stride); dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + d01 = vrhadd_u8(d01, dd01); d23 = vrhadd_u8(d23, dd23); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); - dst += dst_stride; + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; h -= 4; } while (h > 0); } else { @@ -267,29 +240,21 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s = src; d = dst; do { - s0 = vld1q_u8(s + 0 * src_stride); - s1 = vld1q_u8(s + 1 * src_stride); - s2 = vld1q_u8(s + 2 * src_stride); - s3 = vld1q_u8(s + 3 * src_stride); + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl); d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl); d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl); d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl); - dd0 = vld1_u8(d + 0 * dst_stride); - dd1 = vld1_u8(d + 1 * dst_stride); - dd2 = vld1_u8(d + 2 * dst_stride); - dd3 = vld1_u8(d + 3 * dst_stride); + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + d0 = vrhadd_u8(d0, dd0); d1 = vrhadd_u8(d1, dd1); d2 = vrhadd_u8(d2, dd2); d3 = vrhadd_u8(d3, dd3); - vst1_u8(d + 0 * dst_stride, d0); - vst1_u8(d + 1 * dst_stride, d1); - vst1_u8(d + 2 * dst_stride, d2); - vst1_u8(d + 3 * dst_stride, d3); + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; @@ -332,14 +297,8 @@ void 
vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, int32x4_t d0, d1, d2, d3; uint8x8_t d01, d23; - load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); - src += 4 * src_stride; - t4 = vld1_u8(src); - src += src_stride; - t5 = vld1_u8(src); - src += src_stride; - t6 = vld1_u8(src); - src += src_stride; + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + src += 7 * src_stride; /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); @@ -387,18 +346,11 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters); d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters); d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters); - d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); - dst += dst_stride; + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); /* Prepare block for next iteration - re-using as much as possible. */ /* Shuffle everything up four rows. */ @@ -408,6 +360,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s3456 = s78910; src += 4 * src_stride; + dst += 4 * dst_stride; h -= 4; } while (h > 0); } else { @@ -425,14 +378,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s = src; d = dst; - load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); - s += 4 * src_stride; - t4 = vld1_u8(s); - s += src_stride; - t5 = vld1_u8(s); - s += src_stride; - t6 = vld1_u8(s); - s += src_stride; + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s += 7 * src_stride; /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); @@ -498,10 +445,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, correction, filters); d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, correction, filters); - vst1_u8(d + 0 * dst_stride, d0); - vst1_u8(d + 1 * dst_stride, d1); - vst1_u8(d + 2 * dst_stride, d2); - vst1_u8(d + 3 * dst_stride, d3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); /* Prepare block for next iteration - re-using as much as possible. */ /* Shuffle everything up four rows. */ @@ -555,14 +500,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, int32x4_t d0, d1, d2, d3; uint8x8_t d01, d23, dd01, dd23; - load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); - src += 4 * src_stride; - t4 = vld1_u8(src); - src += src_stride; - t5 = vld1_u8(src); - src += src_stride; - t6 = vld1_u8(src); - src += src_stride; + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + src += 7 * src_stride; /* Clamp sample range to [-128, 127] for 8-bit signed dot product. 
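 *
 * In detail (editorial note): sum((p - 128) * f) == sum(p * f) - 128 * sum(f),
 * so the 128 subtracted from each pixel here is restored later by folding
 * 128 * sum(f) into the 'correction' accumulator passed to the dot-product
 * helpers (16384 for these filters, whose taps sum to 128).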
*/ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); @@ -610,23 +549,17 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters); d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters); d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters); - d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); dd01 = load_u8(dst + 0 * dst_stride, dst_stride); dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + d01 = vrhadd_u8(d01, dd01); d23 = vrhadd_u8(d23, dd23); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); - dst += dst_stride; + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); /* Prepare block for next iteration - re-using as much as possible. */ /* Shuffle everything up four rows. */ @@ -636,6 +569,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s3456 = s78910; src += 4 * src_stride; + dst += 4 * dst_stride; h -= 4; } while (h > 0); } else { @@ -653,14 +587,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s = src; d = dst; - load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); - s += 4 * src_stride; - t4 = vld1_u8(s); - s += src_stride; - t5 = vld1_u8(s); - s += src_stride; - t6 = vld1_u8(s); - s += src_stride; + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s += 7 * src_stride; /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); @@ -727,19 +655,14 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, correction, filters); - dd0 = vld1_u8(d + 0 * dst_stride); - dd1 = vld1_u8(d + 1 * dst_stride); - dd2 = vld1_u8(d + 2 * dst_stride); - dd3 = vld1_u8(d + 3 * dst_stride); + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + d0 = vrhadd_u8(d0, dd0); d1 = vrhadd_u8(d1, dd1); d2 = vrhadd_u8(d2, dd2); d3 = vrhadd_u8(d3, dd3); - vst1_u8(d + 0 * dst_stride, d0); - vst1_u8(d + 1 * dst_stride, d1); - vst1_u8(d + 2 * dst_stride, d2); - vst1_u8(d + 3 * dst_stride, d3); + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); /* Prepare block for next iteration - re-using as much as possible. */ /* Shuffle everything up four rows. 
*/ @@ -765,28 +688,6 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, #else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) -static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p, - const uint8x8_t s0, const uint8x8_t s1, - const uint8x8_t s2, const uint8x8_t s3, - const uint8x8_t s4, const uint8x8_t s5, - const uint8x8_t s6, const uint8x8_t s7) { - vst1_u8(s, s0); - s += p; - vst1_u8(s, s1); - s += p; - vst1_u8(s, s2); - s += p; - vst1_u8(s, s3); - s += p; - vst1_u8(s, s4); - s += p; - vst1_u8(s, s5); - s += p; - vst1_u8(s, s6); - s += p; - vst1_u8(s, s7); -} - void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index b8dfce71ea..26a5fa688a 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -16,62 +16,6 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" -static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p, - uint8x8_t *const s0, uint8x8_t *const s1, - uint8x8_t *const s2, uint8x8_t *const s3) { - *s0 = vld1_u8(s); - s += p; - *s1 = vld1_u8(s); - s += p; - *s2 = vld1_u8(s); - s += p; - *s3 = vld1_u8(s); -} - -static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p, - uint8x8_t *const s0, uint8x8_t *const s1, - uint8x8_t *const s2, uint8x8_t *const s3, - uint8x8_t *const s4, uint8x8_t *const s5, - uint8x8_t *const s6, uint8x8_t *const s7) { - *s0 = vld1_u8(s); - s += p; - *s1 = vld1_u8(s); - s += p; - *s2 = vld1_u8(s); - s += p; - *s3 = vld1_u8(s); - s += p; - *s4 = vld1_u8(s); - s += p; - *s5 = vld1_u8(s); - s += p; - *s6 = vld1_u8(s); - s += p; - *s7 = vld1_u8(s); -} - -static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p, - uint8x16_t *const s0, uint8x16_t *const s1, - uint8x16_t *const s2, uint8x16_t *const s3, - uint8x16_t *const s4, uint8x16_t *const s5, - uint8x16_t *const s6, uint8x16_t *const s7) { - *s0 = vld1q_u8(s); - s += p; - *s1 = vld1q_u8(s); - s += p; - *s2 = vld1q_u8(s); - s += p; - *s3 = vld1q_u8(s); - s += p; - *s4 = vld1q_u8(s); - s += p; - *s5 = vld1q_u8(s); - s += p; - *s6 = vld1q_u8(s); - s += p; - *s7 = vld1q_u8(s); -} - #if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) static INLINE int32x4_t convolve8_4_dot_partial(const int8x16_t samples_lo, diff --git a/vpx_dsp/arm/vpx_scaled_convolve8_neon.c b/vpx_dsp/arm/vpx_scaled_convolve8_neon.c index 8491ca7ac5..b8e3c5e540 100644 --- a/vpx_dsp/arm/vpx_scaled_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_scaled_convolve8_neon.c @@ -15,6 +15,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/arm/vpx_convolve8_neon.h" #include "vpx_ports/mem.h" From e067469e77bec79f7f52d074c440f01bfa4c14af Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 10 Jan 2023 13:49:15 -0800 Subject: [PATCH 488/926] build: replace egrep with grep -E avoids a warning on some platforms: egrep: warning: egrep is obsolescent; using grep -E Bug: webm:1786 Change-Id: Ia434297731303aacb0b02cf3dcbfd8e03936485d Fixed: webm:1786 --- build/make/gen_asm_deps.sh | 2 +- libs.mk | 4 ++-- test/tools_common.sh | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/build/make/gen_asm_deps.sh b/build/make/gen_asm_deps.sh index 6a7bff9ebc..3bd4d125f1 100755 --- a/build/make/gen_asm_deps.sh +++ 
b/build/make/gen_asm_deps.sh @@ -42,7 +42,7 @@ done [ -n "$srcfile" ] || show_help sfx=${sfx:-asm} -includes=$(LC_ALL=C egrep -i "include +\"?[a-z0-9_/]+\.${sfx}" $srcfile | +includes=$(LC_ALL=C grep -E -i "include +\"?[a-z0-9_/]+\.${sfx}" $srcfile | perl -p -e "s;.*?([a-z0-9_/]+.${sfx}).*;\1;") #" restore editor state for inc in ${includes}; do diff --git a/libs.mk b/libs.mk index f65e99242c..fb6fbbeb20 100644 --- a/libs.mk +++ b/libs.mk @@ -446,13 +446,13 @@ ifeq ($(VPX_ARCH_X86)$(VPX_ARCH_X86_64),yes) # YASM $(BUILD_PFX)vpx_config.asm: $(BUILD_PFX)vpx_config.h @echo " [CREATE] $@" - @LC_ALL=C egrep "#define [A-Z0-9_]+ [01]" $< \ + @LC_ALL=C grep -E "#define [A-Z0-9_]+ [01]" $< \ | awk '{print $$2 " equ " $$3}' > $@ else ADS2GAS=$(if $(filter yes,$(CONFIG_GCC)),| $(ASM_CONVERSION)) $(BUILD_PFX)vpx_config.asm: $(BUILD_PFX)vpx_config.h @echo " [CREATE] $@" - @LC_ALL=C egrep "#define [A-Z0-9_]+ [01]" $< \ + @LC_ALL=C grep -E "#define [A-Z0-9_]+ [01]" $< \ | awk '{print $$2 " EQU " $$3}' $(ADS2GAS) > $@ @echo " END" $(ADS2GAS) >> $@ CLEAN-OBJS += $(BUILD_PFX)vpx_config.asm diff --git a/test/tools_common.sh b/test/tools_common.sh index 844a12534d..0e4a0a5c0e 100755 --- a/test/tools_common.sh +++ b/test/tools_common.sh @@ -133,7 +133,7 @@ vpx_config_option_enabled() { vpx_config_option="${1}" vpx_config_file="${LIBVPX_CONFIG_PATH}/vpx_config.h" config_line=$(grep "${vpx_config_option}" "${vpx_config_file}") - if echo "${config_line}" | egrep -q '1$'; then + if echo "${config_line}" | grep -E -q '1$'; then echo yes fi } @@ -222,7 +222,7 @@ filter_strings() { if [ -n "${filter}" ]; then for s in ${strings}; do - if echo "${s}" | egrep -q ${exclude} "${filter}" > /dev/null 2>&1; then + if echo "${s}" | grep -E -q ${exclude} "${filter}" > /dev/null 2>&1; then filtered_strings="${filtered_strings} ${s}" fi done From f952068691bcc397a17721d004ac84e63e46bb3c Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 18 May 2022 14:14:56 +0100 Subject: [PATCH 489/926] Implement horizontal convolutions using Neon USDOT instruction Add additional AArch64 paths for vpx_convolve8_horiz_neon and vpx_convolve8_avg_horiz_neon that use the Armv8.6-A USDOT (mixed-sign dot-product) instruction. The USDOT instruction takes an 8-bit unsigned operand vector and a signed 8-bit operand vector to produce a signed 32-bit result. This is helpful because convolution filters often have both positive and negative values, while the 8-bit pixel channel data being filtered is all unsigned. As a result, the USDOT convolution paths added here do not have to do the "transform the pixel channel data to [-128, 128) and correct for it later" dance that we have to do with the SDOT paths. The USDOT instruction is optional from Armv8.2 to Armv8.5 but mandatory from Armv8.6 onwards. The availability of the USDOT instruction is indicated by the feature macro __ARM_FEATURE_MATMUL_INT8. The SDOT paths are retained for use on target CPUs that do not implement the USDOT instructions. Change-Id: If19f5872c3453458a8cfb7c7d2be82a2c0eab46a --- vpx_dsp/arm/vpx_convolve8_neon.c | 271 ++++++++++++++++++++++++++----- vpx_dsp/arm/vpx_convolve8_neon.h | 91 ++++++++--- 2 files changed, 299 insertions(+), 63 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index dba436b1a0..81ceb518dd 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -31,7 +31,9 @@ // instructions. This optimization is much faster in speed unit test, but slowed // down the whole decoder by 5%. 
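/* Editorial sketch, not part of the patch: a scalar reference for what both
 * dot-product paths compute, assuming 8 pixels p[] and 8 filter taps f[].
 * The USDOT path evaluates this sum directly (unsigned * signed); the SDOT
 * path computes sum((p[i] - 128) * f[i]) and adds back 128 * sum(f[i]) via
 * its 'correction' term, which yields the same value. */
static int convolve8_scalar_ref(const uint8_t *p, const int8_t *f) {
  int i, acc = 0;
  for (i = 0; i < 8; ++i) acc += p[i] * f[i];
  return acc;
}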
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#if defined(__aarch64__) && \ + (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)) + DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, @@ -96,6 +98,175 @@ static INLINE void transpose_concat_8x4(int8x8_t *a0, int8x8_t *a1, *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); } +#if defined(__ARM_FEATURE_MATMUL_INT8) + +void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + uint8x16_t s0, s1, s2, s3; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int32x4_t t0, t1, t2, t3; + int16x8_t t01, t23; + uint8x8_t d01, d23; + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_usdot(s0, filters, permute_tbl); + t1 = convolve8_4_usdot(s1, filters, permute_tbl); + t2 = convolve8_4_usdot(s2, filters, permute_tbl); + t3 = convolve8_4_usdot(s3, filters, permute_tbl); + t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); + t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); + d01 = vqrshrun_n_s16(t01, 7); + d23 = vqrshrun_n_s16(t23, 7); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_usdot(s0, filters, permute_tbl); + d1 = convolve8_8_usdot(s1, filters, permute_tbl); + d2 = convolve8_8_usdot(s2, filters, permute_tbl); + d3 = convolve8_8_usdot(s3, filters, permute_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } +} + +void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + uint8x16_t s0, s1, s2, s3; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int32x4_t t0, t1, t2, t3; + int16x8_t t01, t23; + uint8x8_t d01, d23, dd01, dd23; + dd01 = vdup_n_u8(0); + dd23 = vdup_n_u8(0); + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_usdot(s0, filters, permute_tbl); + t1 = convolve8_4_usdot(s1, filters, permute_tbl); + t2 = convolve8_4_usdot(s2, filters, permute_tbl); + t3 = convolve8_4_usdot(s3, filters, permute_tbl); + t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); + t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); + d01 = vqrshrun_n_s16(t01, 7); + d23 = 
vqrshrun_n_s16(t23, 7); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_usdot(s0, filters, permute_tbl); + d1 = convolve8_8_usdot(s1, filters, permute_tbl); + d2 = convolve8_8_usdot(s2, filters, permute_tbl); + d3 = convolve8_8_usdot(s3, filters, permute_tbl); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } +} + +#else // !defined(__ARM_FEATURE_MATMUL_INT8) + void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -126,10 +297,10 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl); - t1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl); - t2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl); - t3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl); + t0 = convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl); + t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl); + t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl); + t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl); t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); d01 = vqrshrun_n_s16(t01, 7); @@ -156,10 +327,14 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, do { load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl); - d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl); - d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl); - d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl); + d0 = + convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl); + d1 = + convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl); + d2 = + convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl); + d3 = + convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -206,10 +381,10 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl); - t1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl); - t2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl); - t3 = convolve8_4_dot(s3, filters, correction, range_limit, 
permute_tbl); + t0 = convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl); + t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl); + t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl); + t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl); t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); d01 = vqrshrun_n_s16(t01, 7); @@ -242,10 +417,14 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, do { load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl); - d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl); - d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl); - d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl); + d0 = + convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl); + d1 = + convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl); + d2 = + convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl); + d3 = + convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl); load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); @@ -267,6 +446,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } +#endif // defined(__ARM_FEATURE_MATMUL_INT8) + void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -342,10 +523,10 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters); - d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters); - d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters); - d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters); + d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters); + d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); + d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); + d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); @@ -437,14 +618,14 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - correction, filters); - d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - correction, filters); - d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - correction, filters); - d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - correction, filters); + d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + correction, filters); + d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + correction, filters); + d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + correction, filters); + d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + correction, filters); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -545,10 +726,10 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, 
ptrdiff_t src_stride, s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters); - d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters); - d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters); - d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters); + d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters); + d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); + d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); + d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); @@ -646,14 +827,14 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - correction, filters); - d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - correction, filters); - d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - correction, filters); - d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - correction, filters); + d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + correction, filters); + d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + correction, filters); + d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + correction, filters); + d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + correction, filters); load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); @@ -686,7 +867,9 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, } } -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) +#else // !(defined(__aarch64__) && + // (defined(__ARM_FEATURE_DOTPROD) || + // defined(__ARM_FEATURE_MATMUL_INT8))) void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, @@ -1528,4 +1711,6 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, } } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#endif // #if defined(__aarch64__) && + // (defined(__ARM_FEATURE_DOTPROD) || + // defined(__ARM_FEATURE_MATMUL_INT8)) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index 26a5fa688a..a62e4f461c 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -18,10 +18,10 @@ #if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) -static INLINE int32x4_t convolve8_4_dot_partial(const int8x16_t samples_lo, - const int8x16_t samples_hi, - const int32x4_t correction, - const int8x8_t filters) { +static INLINE int32x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, + const int8x16_t samples_hi, + const int32x4_t correction, + const int8x8_t filters) { /* Sample range-clamping and permutation are performed by the caller. 
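 * ("Partial" here means the samples arrive already biased into int8 range
 * and permuted into dot-product order, so this helper only performs the
 * multiply-accumulate steps. Editorial note.)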
*/ int32x4_t sum; @@ -33,11 +33,11 @@ static INLINE int32x4_t convolve8_4_dot_partial(const int8x16_t samples_lo, return sum; } -static INLINE int32x4_t convolve8_4_dot(uint8x16_t samples, - const int8x8_t filters, - const int32x4_t correction, - const uint8x16_t range_limit, - const uint8x16x2_t permute_tbl) { +static INLINE int32x4_t convolve8_4_sdot(uint8x16_t samples, + const int8x8_t filters, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x2_t permute_tbl) { int8x16_t clamped_samples, permuted_samples[2]; int32x4_t sum; @@ -58,12 +58,12 @@ static INLINE int32x4_t convolve8_4_dot(uint8x16_t samples, return sum; } -static INLINE uint8x8_t convolve8_8_dot_partial(const int8x16_t samples0_lo, - const int8x16_t samples0_hi, - const int8x16_t samples1_lo, - const int8x16_t samples1_hi, - const int32x4_t correction, - const int8x8_t filters) { +static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo, + const int8x16_t samples0_hi, + const int8x16_t samples1_lo, + const int8x16_t samples1_hi, + const int32x4_t correction, + const int8x8_t filters) { /* Sample range-clamping and permutation are performed by the caller. */ int32x4_t sum0, sum1; int16x8_t sum; @@ -81,11 +81,11 @@ static INLINE uint8x8_t convolve8_8_dot_partial(const int8x16_t samples0_lo, return vqrshrun_n_s16(sum, 7); } -static INLINE uint8x8_t convolve8_8_dot(uint8x16_t samples, - const int8x8_t filters, - const int32x4_t correction, - const uint8x16_t range_limit, - const uint8x16x3_t permute_tbl) { +static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples, + const int8x8_t filters, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x3_t permute_tbl) { int8x16_t clamped_samples, permuted_samples[3]; int32x4_t sum0, sum1; int16x8_t sum; @@ -116,6 +116,57 @@ static INLINE uint8x8_t convolve8_8_dot(uint8x16_t samples, #endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8) + +static INLINE int32x4_t convolve8_4_usdot(uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + uint8x16_t permuted_samples[2]; + int32x4_t sum; + + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); + /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ + permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + + /* Accumulate the dot product starting from zero; unlike the SDOT path, no + * range-clamp correction is needed. */ + sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1); + + /* Narrowing and packing is performed by the caller. */ + return sum; +} + +static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples, + const int8x8_t filters, + const uint8x16x3_t permute_tbl) { + uint8x16_t permuted_samples[3]; + int32x4_t sum0, sum1; + int16x8_t sum; + + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); + /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ + permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */ + permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); + + /* First 4 output values. 
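+ * Each vusdotq_lane_s32() call accumulates, per int32 lane, the dot
+ * product of four unsigned samples with four signed taps, so the 8-tap
+ * filter takes two calls per group of four outputs (editorial note).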
*/ + sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1); + /* Second 4 output values. */ + sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0); + sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1); + + /* Narrow and re-pack. */ + sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); + return vqrshrun_n_s16(sum, 7); +} + +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8) + static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, From 5645938c36b9cd1fa4f7c97da0e8c0ef0330d45d Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 18 May 2022 16:58:50 +0100 Subject: [PATCH 490/926] Implement vertical convolutions using Neon USDOT instruction Add additional AArch64 paths for vpx_convolve8_vert_neon and vpx_convolve8_avg_vert_neon that use the Armv8.6-A USDOT (mixed-sign dot-product) instruction. The USDOT instruction takes an 8-bit unsigned operand vector and a signed 8-bit operand vector to produce a signed 32-bit result. This is helpful because convolution filters often have both positive and negative values, while the 8-bit pixel channel data being filtered is all unsigned. As a result, the USDOT convolution paths added here do not have to do the "transform the pixel channel data to [-128, 128) and correct for it later" dance that we have to do with the SDOT paths. The USDOT instruction is optional from Armv8.2 to Armv8.5 but mandatory from Armv8.6 onwards. The availability of the USDOT instruction is indicated by the feature macro __ARM_FEATURE_MATMUL_INT8. The SDOT paths are retained for use on target CPUs that do not implement the USDOT instructions. Change-Id: Ifbf467681dd53bb1d26e22359885e6edde3c5c72 --- vpx_dsp/arm/vpx_convolve8_neon.c | 548 ++++++++++++++++++++++++++----- vpx_dsp/arm/vpx_convolve8_neon.h | 34 ++ 2 files changed, 505 insertions(+), 77 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index 81ceb518dd..b4cdd58c70 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -54,50 +54,6 @@ DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; -static INLINE void transpose_concat_4x4(int8x8_t *a0, int8x8_t *a1, - int8x8_t *a2, int8x8_t *a3, - int8x16_t *b, - const uint8x16_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, XX, XX, XX, XX - * a1: 10, 11, 12, 13, XX, XX, XX, XX - * a2: 20, 21, 22, 23, XX, XX, XX, XX - * a3: 30, 31, 32, 33, XX, XX, XX, XX - * - * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. 
- */ - - int8x16x2_t samples = { { vcombine_s8(*a0, *a1), vcombine_s8(*a2, *a3) } }; - *b = vqtbl2q_s8(samples, permute_tbl); -} - -static INLINE void transpose_concat_8x4(int8x8_t *a0, int8x8_t *a1, - int8x8_t *a2, int8x8_t *a3, - int8x16_t *b0, int8x16_t *b1, - const uint8x16x2_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, 04, 05, 06, 07 - * a1: 10, 11, 12, 13, 14, 15, 16, 17 - * a2: 20, 21, 22, 23, 24, 25, 26, 27 - * a3: 30, 31, 32, 33, 34, 35, 36, 37 - * - * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ - - int8x16x2_t samples = { { vcombine_s8(*a0, *a1), vcombine_s8(*a2, *a3) } }; - *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]); - *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); -} - #if defined(__ARM_FEATURE_MATMUL_INT8) void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, @@ -265,6 +221,401 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } +static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x16_t *b, + const uint8x16_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, XX, XX, XX, XX + * a1: 10, 11, 12, 13, XX, XX, XX, XX + * a2: 20, 21, 22, 23, XX, XX, XX, XX + * a3: 30, 31, 32, 33, XX, XX, XX, XX + * + * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; + *b = vqtbl2q_u8(samples, permute_tbl); +} + +static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x16_t *b0, uint8x16_t *b1, + const uint8x16x2_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, 04, 05, 06, 07 + * a1: 10, 11, 12, 13, 14, 15, 16, 17 + * a2: 20, 21, 22, 23, 24, 25, 26, 27 + * a3: 30, 31, 32, 33, 34, 35, 36, 37 + * + * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. 
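+ *
+ * Editorial note: after this transpose, each four-byte group of b0/b1
+ * holds one column taken from four consecutive rows, which is exactly the
+ * four-sample granularity consumed by each vusdotq_lane_s32() step of the
+ * vertical filter.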
+ */ + + uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; + *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]); + *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]); +} + +void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint8x16x2_t samples_LUT; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int32x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); + + do { + load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_usdot_partial(s0123, s4567, filters); + d1 = convolve8_4_usdot_partial(s1234, s5678, filters); + d2 = convolve8_4_usdot_partial(s2345, s6789, filters); + d3 = convolve8_4_usdot_partial(s3456, s78910, filters); + d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); + d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. 
*/ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + filters); + d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + filters); + d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + filters); + d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + filters); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. 
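+ * Only four new source rows (s7..s10) need to be loaded on the next
+ * pass; the remaining transposed blocks are re-used (editorial note).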
*/ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height > 0); + src += 8; + dst += 8; + w -= 8; + } while (w > 0); + } +} + +void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint8x16x2_t samples_LUT; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int32x4_t d0, d1, d2, d3; + uint8x8_t d01, d23, dd01, dd23; + + load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); + + do { + load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_usdot_partial(s0123, s4567, filters); + d1 = convolve8_4_usdot_partial(s1234, s5678, filters); + d2 = convolve8_4_usdot_partial(s2345, s6789, filters); + d3 = convolve8_4_usdot_partial(s3456, s78910, filters); + d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); + d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. 
*/ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + filters); + d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + filters); + d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + filters); + d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + filters); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. 
*/ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height > 0); + src += 8; + dst += 8; + w -= 8; + } while (w > 0); + } +} + #else // !defined(__ARM_FEATURE_MATMUL_INT8) void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, @@ -446,7 +797,48 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } -#endif // defined(__ARM_FEATURE_MATMUL_INT8) +static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, + int8x8_t a3, int8x16_t *b, + const uint8x16_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, XX, XX, XX, XX + * a1: 10, 11, 12, 13, XX, XX, XX, XX + * a2: 20, 21, 22, 23, XX, XX, XX, XX + * a3: 30, 31, 32, 33, XX, XX, XX, XX + * + * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; + *b = vqtbl2q_s8(samples, permute_tbl); +} + +static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, + int8x8_t a3, int8x16_t *b0, + int8x16_t *b1, + const uint8x16x2_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, 04, 05, 06, 07 + * a1: 10, 11, 12, 13, 14, 15, 16, 17 + * a2: 20, 21, 22, 23, 24, 25, 26, 27 + * a3: 30, 31, 32, 33, 34, 35, 36, 37 + * + * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; + *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]); + *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); +} void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, @@ -496,13 +888,13 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. 
*/ - transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl); + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); do { uint8x8_t t7, t8, t9, t10; @@ -514,7 +906,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl); + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); /* Merge new data into block from previous iteration. */ samples_LUT.val[0] = s3456; @@ -577,19 +969,19 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. */ - transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi, + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, tran_concat_tbl); - transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi, + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, tran_concat_tbl); - transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi, + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, tran_concat_tbl); - transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi, + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, tran_concat_tbl); - transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi, + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, tran_concat_tbl); - transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi, + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, tran_concat_tbl); - transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi, + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, tran_concat_tbl); do { @@ -602,7 +994,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi, + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, tran_concat_tbl); /* Merge new data into block from previous iteration. */ @@ -699,13 +1091,13 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. 
*/ - transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl); + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); do { uint8x8_t t7, t8, t9, t10; @@ -717,7 +1109,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl); + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); /* Merge new data into block from previous iteration. */ samples_LUT.val[0] = s3456; @@ -786,19 +1178,19 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. */ - transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi, + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, tran_concat_tbl); - transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi, + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, tran_concat_tbl); - transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi, + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, tran_concat_tbl); - transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi, + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, tran_concat_tbl); - transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi, + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, tran_concat_tbl); - transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi, + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, tran_concat_tbl); - transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi, + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, tran_concat_tbl); do { @@ -811,7 +1203,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi, + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, tran_concat_tbl); /* Merge new data into block from previous iteration. 
*/ @@ -867,6 +1259,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, } } +#endif // defined(__ARM_FEATURE_MATMUL_INT8) + #else // !(defined(__aarch64__) && // (defined(__ARM_FEATURE_DOTPROD) || // defined(__ARM_FEATURE_MATMUL_INT8))) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index a62e4f461c..ed7f180538 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -118,6 +118,19 @@ static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples, #if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8) +static INLINE int32x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, + const uint8x16_t samples_hi, + const int8x8_t filters) { + /* Sample permutation is performed by the caller. */ + int32x4_t sum; + + sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0); + sum = vusdotq_lane_s32(sum, samples_hi, filters, 1); + + /* Narrowing and packing is performed by the caller. */ + return sum; +} + static INLINE int32x4_t convolve8_4_usdot(uint8x16_t samples, const int8x8_t filters, const uint8x16x2_t permute_tbl) { @@ -138,6 +151,27 @@ static INLINE int32x4_t convolve8_4_usdot(uint8x16_t samples, return sum; } +static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo, + const uint8x16_t samples0_hi, + const uint8x16_t samples1_lo, + const uint8x16_t samples1_hi, + const int8x8_t filters) { + /* Sample permutation is performed by the caller. */ + int32x4_t sum0, sum1; + int16x8_t sum; + + /* First 4 output values. */ + sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0); + sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1); + /* Second 4 output values. */ + sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0); + sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1); + + /* Narrow and re-pack. */ + sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); + return vqrshrun_n_s16(sum, 7); +} + static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples, const int8x8_t filters, const uint8x16x3_t permute_tbl) { From 32878bb1f3db472e642eb0c98d62b37f57b12f68 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Thu, 12 Jan 2023 11:03:28 -0800 Subject: [PATCH 491/926] variance_test.cc: Enable VpxHBDMseTest for C and SSE2. 
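Note: MseParams takes log2(width) and log2(height) as its first two arguments. The previously
commented-out code passed (4, 4) for every kernel; the re-enabled C instantiation below uses
(4, 3), (3, 4) and (3, 3) to match the 16x8, 8x16 and 8x8 functions, and adds the bit depth as a
fourth argument.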
Change-Id: I66c0db6c605876d6757684fd715614881ca261e7 --- test/variance_test.cc | 72 +++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 33 deletions(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index 8aed5d2ed9..ac6d226a5f 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -499,14 +499,21 @@ template void MainTestClass::RefTestMse() { for (int i = 0; i < 10; ++i) { for (int j = 0; j < block_size(); ++j) { - src_[j] = rnd_.Rand8(); - ref_[j] = rnd_.Rand8(); + if (!use_high_bit_depth()) { + src_[j] = rnd_.Rand8(); + ref_[j] = rnd_.Rand8(); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask(); + CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask(); +#endif // CONFIG_VP9_HIGHBITDEPTH + } } unsigned int sse1, sse2; const int stride = width(); ASM_REGISTER_STATE_CHECK(params_.func(src_, stride, ref_, stride, &sse1)); variance_ref(src_, ref_, params_.log2width, params_.log2height, stride, - stride, &sse2, false, VPX_BITS_8); + stride, &sse2, use_high_bit_depth(), params_.bit_depth); EXPECT_EQ(sse1, sse2); } } @@ -530,8 +537,15 @@ void MainTestClass::RefTestSse() { template void MainTestClass::MaxTestMse() { - memset(src_, 255, block_size()); - memset(ref_, 0, block_size()); + if (!use_high_bit_depth()) { + memset(src_, 255, block_size()); + memset(ref_, 0, block_size()); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + vpx_memset16(CONVERT_TO_SHORTPTR(src_), 255 << byte_shift(), block_size()); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 0, block_size()); +#endif // CONFIG_VP9_HIGHBITDEPTH + } unsigned int sse; ASM_REGISTER_STATE_CHECK(params_.func(src_, width(), ref_, width(), &sse)); const unsigned int expected = block_size() * 255 * 255; @@ -854,25 +868,24 @@ TEST_P(VpxHBDSubpelVarianceTest, Ref) { RefTest(); } TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); } -/* TODO(debargha): This test does not support the highbd version typedef MainTestClass VpxHBDMseTest; TEST_P(VpxHBDMseTest, RefMse) { RefTestMse(); } TEST_P(VpxHBDMseTest, MaxMse) { MaxTestMse(); } INSTANTIATE_TEST_SUITE_P( C, VpxHBDMseTest, - ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_c), - MseParams(4, 4, &vpx_highbd_12_mse16x8_c), - MseParams(4, 4, &vpx_highbd_12_mse8x16_c), - MseParams(4, 4, &vpx_highbd_12_mse8x8_c), - MseParams(4, 4, &vpx_highbd_10_mse16x16_c), - MseParams(4, 4, &vpx_highbd_10_mse16x8_c), - MseParams(4, 4, &vpx_highbd_10_mse8x16_c), - MseParams(4, 4, &vpx_highbd_10_mse8x8_c), - MseParams(4, 4, &vpx_highbd_8_mse16x16_c), - MseParams(4, 4, &vpx_highbd_8_mse16x8_c), - MseParams(4, 4, &vpx_highbd_8_mse8x16_c), - MseParams(4, 4, &vpx_highbd_8_mse8x8_c))); -*/ + ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_c, VPX_BITS_12), + MseParams(4, 3, &vpx_highbd_12_mse16x8_c, VPX_BITS_12), + MseParams(3, 4, &vpx_highbd_12_mse8x16_c, VPX_BITS_12), + MseParams(3, 3, &vpx_highbd_12_mse8x8_c, VPX_BITS_12), + MseParams(4, 4, &vpx_highbd_10_mse16x16_c, VPX_BITS_10), + MseParams(4, 3, &vpx_highbd_10_mse16x8_c, VPX_BITS_10), + MseParams(3, 4, &vpx_highbd_10_mse8x16_c, VPX_BITS_10), + MseParams(3, 3, &vpx_highbd_10_mse8x8_c, VPX_BITS_10), + MseParams(4, 4, &vpx_highbd_8_mse16x16_c, VPX_BITS_8), + MseParams(4, 3, &vpx_highbd_8_mse16x8_c, VPX_BITS_8), + MseParams(3, 4, &vpx_highbd_8_mse8x16_c, VPX_BITS_8), + MseParams(3, 3, &vpx_highbd_8_mse8x8_c, VPX_BITS_8))); + GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(VpxHBDMseTest); INSTANTIATE_TEST_SUITE_P( 
@@ -1138,22 +1151,15 @@ INSTANTIATE_TEST_SUITE_P( SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0)));
#if CONFIG_VP9_HIGHBITDEPTH -/* TODO(debargha): This test does not support the highbd version INSTANTIATE_TEST_SUITE_P( SSE2, VpxHBDMseTest, - ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_sse2), - MseParams(4, 3, &vpx_highbd_12_mse16x8_sse2), - MseParams(3, 4, &vpx_highbd_12_mse8x16_sse2), - MseParams(3, 3, &vpx_highbd_12_mse8x8_sse2), - MseParams(4, 4, &vpx_highbd_10_mse16x16_sse2), - MseParams(4, 3, &vpx_highbd_10_mse16x8_sse2), - MseParams(3, 4, &vpx_highbd_10_mse8x16_sse2), - MseParams(3, 3, &vpx_highbd_10_mse8x8_sse2), - MseParams(4, 4, &vpx_highbd_8_mse16x16_sse2), - MseParams(4, 3, &vpx_highbd_8_mse16x8_sse2), - MseParams(3, 4, &vpx_highbd_8_mse8x16_sse2), - MseParams(3, 3, &vpx_highbd_8_mse8x8_sse2))); -*/ + ::testing::Values( + MseParams(4, 4, &vpx_highbd_12_mse16x16_sse2, VPX_BITS_12), + MseParams(3, 3, &vpx_highbd_12_mse8x8_sse2, VPX_BITS_12), + MseParams(4, 4, &vpx_highbd_10_mse16x16_sse2, VPX_BITS_10), + MseParams(3, 3, &vpx_highbd_10_mse8x8_sse2, VPX_BITS_10), + MseParams(4, 4, &vpx_highbd_8_mse16x16_sse2, VPX_BITS_8), + MseParams(3, 3, &vpx_highbd_8_mse8x8_sse2, VPX_BITS_8)));
INSTANTIATE_TEST_SUITE_P( SSE2, VpxHBDVarianceTest,
From 59d4a686166e1017654fe47178371d7101528baa Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Fri, 13 Jan 2023 07:30:07 -0800 Subject: [PATCH 492/926] variance_test.cc: Enable HBDMse speed test.
Change-Id: If0226307a6efd704f8a35cb986f570304d698b95 --- test/variance_test.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/test/variance_test.cc b/test/variance_test.cc index ac6d226a5f..a6c8ef0480 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -488,8 +488,8 @@ void MainTestClass::SpeedTest() { } vpx_usec_timer_mark(&timer); const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); - printf("Variance %dx%d time: %5d ms\n", width(), height(), - elapsed_time / 1000); + printf("Variance %dx%d %dbpp time: %5d ms\n", width(), height(), + params_.bit_depth, elapsed_time / 1000); } //////////////////////////////////////////////////////////////////////////////// @@ -871,6 +871,7 @@ TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); } typedef MainTestClass VpxHBDMseTest; TEST_P(VpxHBDMseTest, RefMse) { RefTestMse(); } TEST_P(VpxHBDMseTest, MaxMse) { MaxTestMse(); } +TEST_P(VpxHBDMseTest, DISABLED_Speed) { SpeedTest(); } INSTANTIATE_TEST_SUITE_P( C, VpxHBDMseTest, ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_c, VPX_BITS_12),
From 71d01660cc40306c2c7c80c8ed510e520a0c4b93 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Fri, 13 Jan 2023 19:46:10 -0800 Subject: [PATCH 493/926] Fix to segfault for external resize test in vp9
Failure occurs for 1-pass non-realtime mode at speed 0, due to the speed feature rd_ml_partition.var_pruning, which doesn't check for a scaled reference in simple_motion_search().
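When the frame has just been resized, the raw reference buffer no longer matches the coded
frame dimensions, so the motion search must use the scaled reference when one exists: the fix
below falls back to get_ref_frame_buffer() only when vp9_get_scaled_ref_frame() returns NULL.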
Bug: webm:1768 Change-Id: Iddcb56033bac042faebb5196eed788317590b23f --- test/resize_test.cc | 5 +---- vp9/encoder/vp9_encodeframe.c | 8 +++++++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/test/resize_test.cc b/test/resize_test.cc index e122a74742..715bb9d70f 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -777,10 +777,7 @@ TEST_P(ResizeCspTest, TestResizeCspWorks) { } VP8_INSTANTIATE_TEST_SUITE(ResizeTest, ONE_PASS_TEST_MODES); -// TODO(https://crbug.com/webm/1768): VP9 should use ONE_PASS_TEST_MODES for -// the ResizeTest instantiation after segfault is fixed. -VP9_INSTANTIATE_TEST_SUITE(ResizeTest, - ::testing::Values(::libvpx_test::kRealTime)); +VP9_INSTANTIATE_TEST_SUITE(ResizeTest, ONE_PASS_TEST_MODES); VP9_INSTANTIATE_TEST_SUITE(ResizeInternalTest, ::testing::Values(::libvpx_test::kOnePassBest)); VP9_INSTANTIATE_TEST_SUITE(ResizeRealtimeTest, diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index a1ee9c6784..1483ac069d 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -3413,7 +3413,8 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x, const VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MODE_INFO *const mi = xd->mi[0]; - const YV12_BUFFER_CONFIG *const yv12 = get_ref_frame_buffer(cpi, ref); + YV12_BUFFER_CONFIG *yv12; + YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref); const int step_param = 1; const MvLimits tmp_mv_limits = x->mv_limits; const SEARCH_METHODS search_method = NSTEP; @@ -3422,6 +3423,11 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x, MV best_mv = { 0, 0 }; int cost_list[5]; + if (scaled_ref_frame) + yv12 = scaled_ref_frame; + else + yv12 = get_ref_frame_buffer(cpi, ref); + assert(yv12 != NULL); if (!yv12) return; vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, From 0ce866562fb9c70d5825a6279f3aa3a10f7a9289 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Mon, 16 Jan 2023 16:44:04 +0000 Subject: [PATCH 494/926] Refactor Neon implementation of variance functions Refactor and optimize the Neon implementation of variance functions - effectively backporting these libaom changes[1,2]. After this change, the only differences between the code in libvpx and libaom are due to libvpx being compiled with ISO C90, which forbids mixing declarations and code [-Wdeclaration-after-statement]. [1] https://aomedia-review.googlesource.com/c/aom/+/162241 [2] https://aomedia-review.googlesource.com/c/aom/+/162262 Change-Id: Ia4e8fff4d53297511d1a1e43bca8053bf811e551 --- vpx_dsp/arm/variance_neon.c | 538 ++++++++++++++++++------------------ 1 file changed, 275 insertions(+), 263 deletions(-) diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index f9969ed5a4..3ccc4e807b 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -22,236 +22,310 @@ #if defined(__ARM_FEATURE_DOTPROD) // Process a block of width 4 four rows at a time. 
-static void variance_neon_w4x4(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int h, - uint32_t *sse, int *sum) { - int i; - uint32x4_t sum_a = vdupq_n_u32(0); - uint32x4_t sum_b = vdupq_n_u32(0); +static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); uint32x4_t sse_u32 = vdupq_n_u32(0); - for (i = 0; i < h; i += 4) { - const uint8x16_t a = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t b = load_unaligned_u8q(ref_ptr, ref_stride); + int i = h; + do { + const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride); - const uint8x16_t abs_diff = vabdq_u8(a, b); + const uint8x16_t abs_diff = vabdq_u8(s, r); sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); - sum_a = vdotq_u32(sum_a, a, vdupq_n_u8(1)); - sum_b = vdotq_u32(sum_b, b, vdupq_n_u8(1)); + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); src_ptr += 4 * src_stride; ref_ptr += 4 * ref_stride; - } + i -= 4; + } while (i != 0); - *sum = horizontal_add_int32x4(vreinterpretq_s32_u32(vsubq_u32(sum_a, sum_b))); + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); *sse = horizontal_add_uint32x4(sse_u32); } -// Process a block of any size where the width is divisible by 16. -static void variance_neon_w16(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int w, - int h, uint32_t *sse, int *sum) { - int i, j; - uint32x4_t sum_a = vdupq_n_u32(0); - uint32x4_t sum_b = vdupq_n_u32(0); +// Process a block of width 8 two rows at a time. +static INLINE void variance_8xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); uint32x4_t sse_u32 = vdupq_n_u32(0); - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 16) { - const uint8x16_t a = vld1q_u8(src_ptr + j); - const uint8x16_t b = vld1q_u8(ref_ptr + j); + int i = h; + do { + const uint8x16_t s = + vcombine_u8(vld1_u8(src_ptr), vld1_u8(src_ptr + src_stride)); + const uint8x16_t r = + vcombine_u8(vld1_u8(ref_ptr), vld1_u8(ref_ptr + ref_stride)); - const uint8x16_t abs_diff = vabdq_u8(a, b); - sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + i -= 2; + } while (i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +// Process a block of width 16 one row at a time. 
+static INLINE void variance_16xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + const uint8x16_t s = vld1q_u8(src_ptr); + const uint8x16_t r = vld1q_u8(ref_ptr); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); - sum_a = vdotq_u32(sum_a, a, vdupq_n_u8(1)); - sum_b = vdotq_u32(sum_b, b, vdupq_n_u8(1)); - } src_ptr += src_stride; ref_ptr += ref_stride; - } + } while (--i != 0); - *sum = horizontal_add_int32x4(vreinterpretq_s32_u32(vsubq_u32(sum_a, sum_b))); + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); *sse = horizontal_add_uint32x4(sse_u32); } -// Process a block of width 8 two rows at a time. -static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int h, - uint32_t *sse, int *sum) { - int i = 0; - uint32x2_t sum_a = vdup_n_u32(0); - uint32x2_t sum_b = vdup_n_u32(0); - uint32x2_t sse_lo_u32 = vdup_n_u32(0); - uint32x2_t sse_hi_u32 = vdup_n_u32(0); +// Process a block of any size where the width is divisible by 16. +static INLINE void variance_large_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int w, int h, uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + int i = h; do { - const uint8x8_t a_0 = vld1_u8(src_ptr); - const uint8x8_t a_1 = vld1_u8(src_ptr + src_stride); - const uint8x8_t b_0 = vld1_u8(ref_ptr); - const uint8x8_t b_1 = vld1_u8(ref_ptr + ref_stride); - - const uint8x8_t abs_diff_0 = vabd_u8(a_0, b_0); - const uint8x8_t abs_diff_1 = vabd_u8(a_1, b_1); - sse_lo_u32 = vdot_u32(sse_lo_u32, abs_diff_0, abs_diff_0); - sse_hi_u32 = vdot_u32(sse_hi_u32, abs_diff_1, abs_diff_1); - - sum_a = vdot_u32(sum_a, a_0, vdup_n_u8(1)); - sum_b = vdot_u32(sum_b, b_0, vdup_n_u8(1)); - sum_a = vdot_u32(sum_a, a_1, vdup_n_u8(1)); - sum_b = vdot_u32(sum_b, b_1, vdup_n_u8(1)); - - src_ptr += src_stride + src_stride; - ref_ptr += ref_stride + ref_stride; - i += 2; - } while (i < h); + int j = 0; + do { + const uint8x16_t s = vld1q_u8(src_ptr + j); + const uint8x16_t r = vld1q_u8(ref_ptr + j); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + j += 16; + } while (j < w); - *sum = horizontal_add_int32x2(vreinterpret_s32_u32(vsub_u32(sum_a, sum_b))); - *sse = horizontal_add_uint32x2(vadd_u32(sse_lo_u32, sse_hi_u32)); + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); } -#else // !defined(__ARM_FEATURE_DOTPROD) +static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum); +} -// The variance helper functions use int16_t for sum. 8 values are accumulated -// and then added (at which point they expand up to int32_t). 
To avoid overflow, -// there can be no more than 32767 / 255 ~= 128 values accumulated in each -// column. For a 32x32 buffer, this results in 32 / 8 = 4 values per row * 32 -// rows = 128. Asserts have been added to each function to warn against reaching -// this limit. +static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum); +} -// Process a block of width 4 four rows at a time. -static void variance_neon_w4x4(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int h, - uint32_t *sse, int *sum) { - int i; +#else // !defined(__ARM_FEATURE_DOTPROD) + +// Process a block of width 4 two rows at a time. +static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { int16x8_t sum_s16 = vdupq_n_s16(0); - int32x4_t sse_lo_s32 = vdupq_n_s32(0); - int32x4_t sse_hi_s32 = vdupq_n_s32(0); + int32x4_t sse_s32 = vdupq_n_s32(0); + int i = h; - // Since width is only 4, sum_s16 only loads a half row per loop. + // Number of rows we can process before 'sum_s16' overflows: + // 32767 / 255 ~= 128, but we use an 8-wide accumulator; so 256 4-wide rows. assert(h <= 256); - for (i = 0; i < h; i += 4) { - const uint8x16_t a_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t b_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - const uint16x8_t diff_lo_u16 = - vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)); - const uint16x8_t diff_hi_u16 = - vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8)); - - const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(diff_lo_u16); - const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(diff_hi_u16); - - sum_s16 = vaddq_s16(sum_s16, diff_lo_s16); - sum_s16 = vaddq_s16(sum_s16, diff_hi_s16); + do { + const uint8x8_t s = load_unaligned_u8(src_ptr, src_stride); + const uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride); + const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r)); - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_lo_s16), - vget_low_s16(diff_lo_s16)); - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_high_s16(diff_lo_s16), - vget_high_s16(diff_lo_s16)); + sum_s16 = vaddq_s16(sum_s16, diff); - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_low_s16(diff_hi_s16), - vget_low_s16(diff_hi_s16)); - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16), - vget_high_s16(diff_hi_s16)); + sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff)); + sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff)); - src_ptr += 4 * src_stride; - ref_ptr += 4 * ref_stride; - } + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + i -= 2; + } while (i != 0); *sum = horizontal_add_int16x8(sum_s16); - *sse = horizontal_add_uint32x4( - vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32))); + *sse = (uint32_t)horizontal_add_int32x4(sse_s32); } -// Process a block of any size where the width is divisible by 16. -static void variance_neon_w16(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int w, - int h, uint32_t *sse, int *sum) { - int i, j; +// Process a block of width 8 one row at a time. 
+static INLINE void variance_8xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { int16x8_t sum_s16 = vdupq_n_s16(0); - int32x4_t sse_lo_s32 = vdupq_n_s32(0); - int32x4_t sse_hi_s32 = vdupq_n_s32(0); - - // The loop loads 16 values at a time but doubles them up when accumulating - // into sum_s16. - assert(w / 8 * h <= 128); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 16) { - const uint8x16_t a_u8 = vld1q_u8(src_ptr + j); - const uint8x16_t b_u8 = vld1q_u8(ref_ptr + j); - - const uint16x8_t diff_lo_u16 = - vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)); - const uint16x8_t diff_hi_u16 = - vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8)); - - const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(diff_lo_u16); - const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(diff_hi_u16); - - sum_s16 = vaddq_s16(sum_s16, diff_lo_s16); - sum_s16 = vaddq_s16(sum_s16, diff_hi_s16); - - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_lo_s16), - vget_low_s16(diff_lo_s16)); - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_high_s16(diff_lo_s16), - vget_high_s16(diff_lo_s16)); - - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_low_s16(diff_hi_s16), - vget_low_s16(diff_hi_s16)); - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16), - vget_high_s16(diff_hi_s16)); - } + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int i = h; + + // Number of rows we can process before 'sum_s16' overflows: + // 32767 / 255 ~= 128 + assert(h <= 128); + + do { + const uint8x8_t s = vld1_u8(src_ptr); + const uint8x8_t r = vld1_u8(ref_ptr); + const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r)); + + sum_s16 = vaddq_s16(sum_s16, diff); + + sse_s32[0] = vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff)); + src_ptr += src_stride; ref_ptr += ref_stride; - } + } while (--i != 0); *sum = horizontal_add_int16x8(sum_s16); - *sse = horizontal_add_uint32x4( - vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32))); + *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1])); } -// Process a block of width 8 two rows at a time. -static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int h, - uint32_t *sse, int *sum) { - int i = 0; - int16x8_t sum_s16 = vdupq_n_s16(0); - int32x4_t sse_lo_s32 = vdupq_n_s32(0); - int32x4_t sse_hi_s32 = vdupq_n_s32(0); +// Process a block of width 16 one row at a time. +static INLINE void variance_16xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { + int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) }; + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int i = h; - // Each column has it's own accumulator entry in sum_s16. + // Number of rows we can process before 'sum_s16' accumulators overflow: + // 32767 / 255 ~= 128, so 128 16-wide rows. 
assert(h <= 128); do { - const uint8x8_t a_0_u8 = vld1_u8(src_ptr); - const uint8x8_t a_1_u8 = vld1_u8(src_ptr + src_stride); - const uint8x8_t b_0_u8 = vld1_u8(ref_ptr); - const uint8x8_t b_1_u8 = vld1_u8(ref_ptr + ref_stride); - const uint16x8_t diff_0_u16 = vsubl_u8(a_0_u8, b_0_u8); - const uint16x8_t diff_1_u16 = vsubl_u8(a_1_u8, b_1_u8); - const int16x8_t diff_0_s16 = vreinterpretq_s16_u16(diff_0_u16); - const int16x8_t diff_1_s16 = vreinterpretq_s16_u16(diff_1_u16); - sum_s16 = vaddq_s16(sum_s16, diff_0_s16); - sum_s16 = vaddq_s16(sum_s16, diff_1_s16); - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_0_s16), - vget_low_s16(diff_0_s16)); - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_1_s16), - vget_low_s16(diff_1_s16)); - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_0_s16), - vget_high_s16(diff_0_s16)); - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_1_s16), - vget_high_s16(diff_1_s16)); - src_ptr += src_stride + src_stride; - ref_ptr += ref_stride + ref_stride; - i += 2; + const uint8x16_t s = vld1q_u8(src_ptr); + const uint8x16_t r = vld1q_u8(ref_ptr); + + const int16x8_t diff_l = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r))); + const int16x8_t diff_h = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r))); + + sum_s16[0] = vaddq_s16(sum_s16[0], diff_l); + sum_s16[1] = vaddq_s16(sum_s16[1], diff_h); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l)); + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int16x8(vaddq_s16(sum_s16[0], sum_s16[1])); + *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1])); +} + +// Process a block of any size where the width is divisible by 16. +static INLINE void variance_large_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int w, int h, int h_limit, + unsigned int *sse, int *sum) { + int32x4_t sum_s32 = vdupq_n_s32(0); + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + + // 'h_limit' is the number of 'w'-width rows we can process before our 16-bit + // accumulator overflows. After hitting this limit we accumulate into 32-bit + // elements. + int h_tmp = h > h_limit ? 
h_limit : h; + + int i = 0; + do { + int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) }; + do { + int j = 0; + do { + const uint8x16_t s = vld1q_u8(src_ptr + j); + const uint8x16_t r = vld1q_u8(ref_ptr + j); + + const int16x8_t diff_l = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r))); + const int16x8_t diff_h = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r))); + + sum_s16[0] = vaddq_s16(sum_s16[0], diff_l); + sum_s16[1] = vaddq_s16(sum_s16[1], diff_h); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l)); + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h)); + + j += 16; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + i++; + } while (i < h_tmp); + + sum_s32 = vpadalq_s16(sum_s32, sum_s16[0]); + sum_s32 = vpadalq_s16(sum_s32, sum_s16[1]); + + h_tmp += h_limit; } while (i < h); - *sum = horizontal_add_int16x8(sum_s16); - *sse = horizontal_add_uint32x4( - vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32))); + *sum = horizontal_add_int32x4(sum_s32); + *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1])); +} + +static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon(src, src_stride, ref, ref_stride, 32, h, 64, sse, sum); +} + +static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon(src, src_stride, ref, ref_stride, 64, h, 32, sse, sum); } #endif // defined(__ARM_FEATURE_DOTPROD) @@ -259,103 +333,41 @@ static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum) { - variance_neon_w8x2(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, sum); + variance_8xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, sum); } void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum) { - variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16, sse, sum); + variance_16xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, sse, sum); } -#define VARIANCENXM(n, m, shift) \ - unsigned int vpx_variance##n##x##m##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, unsigned int *sse) { \ - int sum; \ - if (n == 4) \ - variance_neon_w4x4(src_ptr, src_stride, ref_ptr, ref_stride, m, sse, \ - &sum); \ - else if (n == 8) \ - variance_neon_w8x2(src_ptr, src_stride, ref_ptr, ref_stride, m, sse, \ - &sum); \ - else \ - variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, n, m, sse, \ - &sum); \ - if (n * m < 16 * 16) \ - return *sse - ((sum * sum) >> shift); \ - else \ - return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ +#define VARIANCE_WXH_NEON(w, h, shift) \ + unsigned int vpx_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ } -VARIANCENXM(4, 4, 4) 
-VARIANCENXM(4, 8, 5) -VARIANCENXM(8, 4, 5) -VARIANCENXM(8, 8, 6) -VARIANCENXM(8, 16, 7) -VARIANCENXM(16, 8, 7) -VARIANCENXM(16, 16, 8) -VARIANCENXM(16, 32, 9) -VARIANCENXM(32, 16, 9) -VARIANCENXM(32, 32, 10) -
-unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32, &sse1, - &sum1); - variance_neon_w16(src_ptr + (32 * src_stride), src_stride, - ref_ptr + (32 * ref_stride), ref_stride, 32, 32, &sse2, - &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); -}
+VARIANCE_WXH_NEON(4, 4, 4) +VARIANCE_WXH_NEON(4, 8, 5)
-unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 64, 16, &sse1, - &sum1); - variance_neon_w16(src_ptr + (16 * src_stride), src_stride, - ref_ptr + (16 * ref_stride), ref_stride, 64, 16, &sse2, - &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); -}
+VARIANCE_WXH_NEON(8, 4, 5) +VARIANCE_WXH_NEON(8, 8, 6) +VARIANCE_WXH_NEON(8, 16, 7)
-unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - - variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 64, 16, &sse1, - &sum1); - variance_neon_w16(src_ptr + (16 * src_stride), src_stride, - ref_ptr + (16 * ref_stride), ref_stride, 64, 16, &sse2, - &sum2); - sse1 += sse2; - sum1 += sum2; - - variance_neon_w16(src_ptr + (16 * 2 * src_stride), src_stride, - ref_ptr + (16 * 2 * ref_stride), ref_stride, 64, 16, &sse2, - &sum2); - sse1 += sse2; - sum1 += sum2; - - variance_neon_w16(src_ptr + (16 * 3 * src_stride), src_stride, - ref_ptr + (16 * 3 * ref_stride), ref_stride, 64, 16, &sse2, - &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12); -}
+VARIANCE_WXH_NEON(16, 8, 7) +VARIANCE_WXH_NEON(16, 16, 8) +VARIANCE_WXH_NEON(16, 32, 9) + +VARIANCE_WXH_NEON(32, 16, 9) +VARIANCE_WXH_NEON(32, 32, 10) +VARIANCE_WXH_NEON(32, 64, 11) + +VARIANCE_WXH_NEON(64, 32, 11) +VARIANCE_WXH_NEON(64, 64, 12)
#if defined(__ARM_FEATURE_DOTPROD)
From 5e861795339a7c196a381c3aaf5209b5d4c8c468 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 18 Jan 2023 19:19:01 -0800 Subject: [PATCH 495/926] */Android.mk: add a check for NDK_ROOT
This simplifies integration with the Android platform and prevents the files from being used when a non-NDK build is performed. In that case Android.bp is preferred.
Change-Id: I803912146dac788b7f0af27199c7613cabbc9fa0 --- build/make/Android.mk | 3 +++ test/android/Android.mk | 4 ++++ 2 files changed, 7 insertions(+)
diff --git a/build/make/Android.mk b/build/make/Android.mk index b8032e67aa..ba24f541b1 100644 --- a/build/make/Android.mk +++ b/build/make/Android.mk @@ -8,6 +8,8 @@ ## be found in the AUTHORS file in the root of the source tree. ## +# Ignore this file during non-NDK builds. +ifdef NDK_ROOT # # This file is to be used for compiling libvpx for Android using the NDK. # In an Android project place a libvpx checkout in the jni directory.
@@ -212,3 +214,4 @@ endif ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes) $(call import-module,android/cpufeatures) endif +endif # NDK_ROOT
diff --git a/test/android/Android.mk b/test/android/Android.mk index 87155fcb58..9a7533ebba 100644 --- a/test/android/Android.mk +++ b/test/android/Android.mk @@ -10,6 +10,9 @@ # The test app itself runs on the command line through adb shell # The paths are really messed up as the libvpx make file # expects to be made from a parent directory. + +# Ignore this file during non-NDK builds. +ifdef NDK_ROOT CUR_WD := $(call my-dir) BINDINGS_DIR := $(CUR_WD)/../../.. LOCAL_PATH := $(CUR_WD)/../../.. @@ -61,3 +64,4 @@ LOCAL_SRC_FILES := $(addprefix ./test/, $(FILTERED_SRC)) # some test files depend on *_rtcd.h, ensure they're generated first. $(eval $(call rtcd_dep_template)) include $(BUILD_EXECUTABLE) +endif # NDK_ROOT
From ae4240edc7879b30eddaa9ba38525fcd2a1514a0 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 12 Jan 2023 15:58:00 -0500 Subject: [PATCH 496/926] Add codec control to set per frame QP
Use case is for 1 pass encoding. Forces max_quantizer = min_quantizer and aq-mode = 0. Applicable to spatial layers, where the user may set the QP per spatial layer.
Change-Id: Idfcb7daefde94c475ed1bc0eb8af47c9f309110b --- test/vp9_datarate_test.cc | 95 +++++++++++++++++++++++++++++++++++++++ vp9/encoder/vp9_encoder.c | 4 ++ vp9/encoder/vp9_encoder.h | 2 + vp9/vp9_cx_iface.c | 20 +++++++++ vpx/vp8cx.h | 12 +++++ 5 files changed, 133 insertions(+)
diff --git a/test/vp9_datarate_test.cc b/test/vp9_datarate_test.cc index 286fa335a1..eccb001071 100644 --- a/test/vp9_datarate_test.cc +++ b/test/vp9_datarate_test.cc @@ -9,6 +9,7 @@ */ #include "./vpx_config.h" #include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/acm_random.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" @@ -809,6 +810,93 @@ TEST_P(DatarateTestVP9PostEncodeDrop, PostEncodeDropScreenContent) { << " The datarate for the file is greater than target by too much!"; } +using libvpx_test::ACMRandom; + +class DatarateTestVP9FrameQp + : public DatarateTestVP9, + public ::testing::TestWithParam { + public: + DatarateTestVP9FrameQp() : DatarateTestVP9(GetParam()), frame_(0) {} + virtual ~DatarateTestVP9FrameQp() {} + + protected: + virtual void SetUp() { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + ResetModel(); + } + + virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) { + set_cpu_used_ = 7; + DatarateTestVP9::PreEncodeFrameHook(video, encoder); + ACMRandom rnd; + frame_qp_ = static_cast(rnd.RandRange(64)); + encoder->Control(VP9E_SET_QUANTIZER_ONE_PASS, frame_qp_); + frame_++; + } + + virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + int qp = 0; + if (frame_ >= total_frame_) return; + encoder->Control(VP8E_GET_LAST_QUANTIZER_64, &qp); + ASSERT_EQ(frame_qp_, qp); + } + + protected: + int total_frame_; + + private: + int frame_qp_; + int frame_; +}; + +TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + total_frame_ = 400; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, total_frame_); + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} +
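As a usage sketch (hypothetical caller code, not part of this patch), an application drives the
new control once per frame through the public C API, mirroring the encoder->Control() call in the
test above. Only vpx_codec_control(), vpx_codec_encode() and VP9E_SET_QUANTIZER_ONE_PASS are taken
from the diff; `codec`, `raw` and `pts` are assumed encoder state, and die_codec() is the error
helper used by the libvpx examples.

  /* Hypothetical 1-pass loop body: pin the next frame to a fixed QP. */
  int qp = 40; /* ctrl_set_quantizer_one_pass() rejects values outside 0..63 */
  if (vpx_codec_control(&codec, VP9E_SET_QUANTIZER_ONE_PASS, qp))
    die_codec(&codec, "Failed to set frame QP");
  if (vpx_codec_encode(&codec, &raw, pts, 1 /* duration */, 0 /* flags */,
                       VPX_DL_REALTIME))
    die_codec(&codec, "Failed to encode frame");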
+TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayers) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_min_quantizer = 0; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1). + cfg_.ss_number_layers = 1; + cfg_.ts_number_layers = 3; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + cfg_.rc_target_bitrate = 200; + total_frame_ = 400; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, total_frame_); + ResetModel(); + cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + #if CONFIG_VP9_TEMPORAL_DENOISING // Params: speed setting. class DatarateTestVP9RealTimeDenoiser : public DatarateTestVP9RealTime { @@ -943,6 +1031,13 @@ VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9LargeVBR, ::testing::Range(5, 9), VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9RealTime, ::testing::Range(5, 10)); +#if CONFIG_VP9 +INSTANTIATE_TEST_SUITE_P( + VP9, DatarateTestVP9FrameQp, + ::testing::Values( + static_cast(&libvpx_test::kVP9))); +#endif + VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9RealTimeDeltaQUV, ::testing::Range(5, 10), ::testing::Values(-5, -10, -15)); diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 5cfd846dd0..1c5c9fc923 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1527,6 +1527,7 @@ static void init_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { init_buffer_indices(cpi); vp9_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height); + cpi->fixed_qp_onepass = 0; } void vp9_check_reset_rc_flag(VP9_COMP *cpi) { @@ -7933,6 +7934,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, vp9_save_layer_context(cpi); } + if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) + cpi->fixed_qp_onepass = 0; + vpx_usec_timer_mark(&cmptimer); cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer); diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 3e0b80677e..cca8b53f8e 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -971,6 +971,8 @@ typedef struct VP9_COMP { RATE_QSTEP_MODEL rq_model[ENCODE_FRAME_TYPES]; #endif EXT_RATECTRL ext_ratectrl; + + int fixed_qp_onepass; } VP9_COMP; #if CONFIG_RATE_CTRL diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 695774e730..dee175dc09 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1014,6 +1014,7 @@ static vpx_codec_err_t ctrl_set_aq_mode(vpx_codec_alg_priv_t *ctx, va_list args) { struct vp9_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.aq_mode = CAST(VP9E_SET_AQ_MODE, args); + if (ctx->cpi->fixed_qp_onepass) extra_cfg.aq_mode = 0; return update_extra_cfg(ctx, &extra_cfg); } @@ -1951,6 +1952,24 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_set_quantizer_one_pass(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + const int qp = va_arg(args, int); + vpx_codec_enc_cfg_t *cfg = &ctx->cfg; + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + vpx_codec_err_t res; + 
+ if (qp < 0 || qp > 63) return VPX_CODEC_INVALID_PARAM; + + cfg->rc_min_quantizer = cfg->rc_max_quantizer = qp; + extra_cfg.aq_mode = 0; + cpi->fixed_qp_onepass = 1; + + res = update_extra_cfg(ctx, &extra_cfg); + return res; +} + static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP8_COPY_REFERENCE, ctrl_copy_reference }, @@ -2005,6 +2024,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP9E_SET_DISABLE_LOOPFILTER, ctrl_set_disable_loopfilter }, { VP9E_SET_RTC_EXTERNAL_RATECTRL, ctrl_set_rtc_external_ratectrl }, { VP9E_SET_EXTERNAL_RATE_CONTROL, ctrl_set_external_rate_control }, + { VP9E_SET_QUANTIZER_ONE_PASS, ctrl_set_quantizer_one_pass }, // Getters { VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer }, diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index a61238cb10..e0b679fbb7 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -757,6 +757,16 @@ enum vp8e_enc_control_id { * Supported in codecs: VP8 */ VP8E_SET_RTC_EXTERNAL_RATECTRL, + + /*!\brief Codec control to set quantizer for the next frame. + * + * This will turn off cyclic refresh. Only applicable to 1-pass without + * spatial layers. + * + * Supported in codecs: VP9 + * + */ + VP9E_SET_QUANTIZER_ONE_PASS, }; /*!\brief vpx 1-D scaling mode @@ -1085,6 +1095,8 @@ VPX_CTRL_USE_TYPE(VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, int *) #define VPX_CTRL_VP9E_GET_LAST_QUANTIZER_SVC_LAYERS VPX_CTRL_USE_TYPE(VP8E_SET_RTC_EXTERNAL_RATECTRL, int) #define VPX_CTRL_VP8E_SET_RTC_EXTERNAL_RATECTRL +VPX_CTRL_USE_TYPE(VP9E_SET_QUANTIZER_ONE_PASS, int) +#define VPX_CTRL_VP9E_SET_QUANTIZER_ONE_PASS /*!\endcond */ /*! @} - end defgroup vp8_encoder */ From fcfb471ce2a413e760bdff805c5ae66778cb4169 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Thu, 19 Jan 2023 18:02:52 +0000 Subject: [PATCH 497/926] Refactor Neon subpel variance functions Refactor the Neon implementation of the sub-pixel variance bilinear filter helper functions - effectively backporting this libaom patch[1]. [1] https://aomedia-review.googlesource.com/c/aom/+/162462 Change-Id: I3dee32e8125250bbeffeb63d1fef5da559bacbf1 --- vpx_dsp/arm/subpel_variance_neon.c | 271 ++++++++++++++--------------- 1 file changed, 135 insertions(+), 136 deletions(-) diff --git a/vpx_dsp/arm/subpel_variance_neon.c b/vpx_dsp/arm/subpel_variance_neon.c index a3befdc348..3fb0acd544 100644 --- a/vpx_dsp/arm/subpel_variance_neon.c +++ b/vpx_dsp/arm/subpel_variance_neon.c @@ -17,156 +17,155 @@ #include "vpx_dsp/variance.h" #include "vpx_dsp/arm/mem_neon.h" -static const uint8_t bilinear_filters[8][2] = { - { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, - { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, -}; - // Process a block exactly 4 wide and a multiple of 2 high. 
-static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - const uint8_t *filter) { - const uint8x8_t f0 = vdup_n_u8(filter[0]); - const uint8x8_t f1 = vdup_n_u8(filter[1]); - unsigned int i; - for (i = 0; i < output_height; i += 2) { - const uint8x8_t src_0 = load_unaligned_u8(src_ptr, src_pixels_per_line); - const uint8x8_t src_1 = - load_unaligned_u8(src_ptr + pixel_step, src_pixels_per_line); - const uint16x8_t a = vmull_u8(src_0, f0); - const uint16x8_t b = vmlal_u8(a, src_1, f1); - const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); - vst1_u8(output_ptr, out); - src_ptr += 2 * src_pixels_per_line; - output_ptr += 8; - } +static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride); + uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + vst1_u8(dst_ptr, blend_u8); + + src_ptr += 2 * src_stride; + dst_ptr += 2 * 4; + i -= 2; + } while (i != 0); } // Process a block exactly 8 wide and any height. -static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - const uint8_t *filter) { - const uint8x8_t f0 = vdup_n_u8(filter[0]); - const uint8x8_t f1 = vdup_n_u8(filter[1]); - unsigned int i; - for (i = 0; i < output_height; ++i) { - const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); - const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]); - const uint16x8_t a = vmull_u8(src_0, f0); - const uint16x8_t b = vmlal_u8(a, src_1, f1); - const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); - vst1_u8(output_ptr, out); - src_ptr += src_pixels_per_line; - output_ptr += 8; - } +static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = vld1_u8(src_ptr); + uint8x8_t s1 = vld1_u8(src_ptr + pixel_step); + uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + vst1_u8(dst_ptr, blend_u8); + + src_ptr += src_stride; + dst_ptr += 8; + } while (--i != 0); } // Process a block which is a mutiple of 16 wide and any height. 
-static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { - const uint8x8_t f0 = vdup_n_u8(filter[0]); - const uint8x8_t f1 = vdup_n_u8(filter[1]); - unsigned int i, j; - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 16) { - const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]); - const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]); - const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0); - const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1); - const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS); - const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0); - const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1); - const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS); - vst1q_u8(output_ptr + j, vcombine_u8(out_lo, out_hi)); - } - src_ptr += src_pixels_per_line; - output_ptr += output_width; - } +static void var_filter_block2d_bil_large(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_width, + int dst_height, int filter_offset) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint16x8_t blend_l = + vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1); + uint16x8_t blend_h = + vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1); + uint8x8_t out_lo = vrshrn_n_u16(blend_l, 3); + uint8x8_t out_hi = vrshrn_n_u16(blend_h, 3); + vst1q_u8(dst_ptr + j, vcombine_u8(out_lo, out_hi)); + + j += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); } -// 4xM filter writes an extra row to fdata because it processes two rows at a -// time. -#define SUB_PIXEL_VARIANCENXM(n, m) \ - uint32_t vpx_sub_pixel_variance##n##x##m##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ - uint8_t temp0[n * (m + (n == 4 ? 
2 : 1))]; \ - uint8_t temp1[n * m]; \ - \ - if (n == 4) { \ - var_filter_block2d_bil_w4(src_ptr, temp0, src_stride, 1, (m + 2), \ - bilinear_filters[x_offset]); \ - var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \ - bilinear_filters[y_offset]); \ - } else if (n == 8) { \ - var_filter_block2d_bil_w8(src_ptr, temp0, src_stride, 1, (m + 1), \ - bilinear_filters[x_offset]); \ - var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \ - bilinear_filters[y_offset]); \ - } else { \ - var_filter_block2d_bil_w16(src_ptr, temp0, src_stride, 1, (m + 1), n, \ - bilinear_filters[x_offset]); \ - var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \ - bilinear_filters[y_offset]); \ - } \ - return vpx_variance##n##x##m(temp1, n, ref_ptr, ref_stride, sse); \ +static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16, + dst_height, filter_offset); +} +static void var_filter_block2d_bil_w32(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32, + dst_height, filter_offset); +} +static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64, + dst_height, filter_offset); +} + +#define SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse) { \ + uint8_t tmp0[w * (h + padding)]; \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } -SUB_PIXEL_VARIANCENXM(4, 4) -SUB_PIXEL_VARIANCENXM(4, 8) -SUB_PIXEL_VARIANCENXM(8, 4) -SUB_PIXEL_VARIANCENXM(8, 8) -SUB_PIXEL_VARIANCENXM(8, 16) -SUB_PIXEL_VARIANCENXM(16, 8) -SUB_PIXEL_VARIANCENXM(16, 16) -SUB_PIXEL_VARIANCENXM(16, 32) -SUB_PIXEL_VARIANCENXM(32, 16) -SUB_PIXEL_VARIANCENXM(32, 32) -SUB_PIXEL_VARIANCENXM(32, 64) -SUB_PIXEL_VARIANCENXM(64, 32) -SUB_PIXEL_VARIANCENXM(64, 64) +// 4x blocks are processed two rows at a time, so require an extra row of +// padding. +SUBPEL_VARIANCE_WXH_NEON(4, 4, 2) +SUBPEL_VARIANCE_WXH_NEON(4, 8, 2) + +SUBPEL_VARIANCE_WXH_NEON(8, 4, 1) +SUBPEL_VARIANCE_WXH_NEON(8, 8, 1) +SUBPEL_VARIANCE_WXH_NEON(8, 16, 1) + +SUBPEL_VARIANCE_WXH_NEON(16, 8, 1) +SUBPEL_VARIANCE_WXH_NEON(16, 16, 1) +SUBPEL_VARIANCE_WXH_NEON(16, 32, 1) + +SUBPEL_VARIANCE_WXH_NEON(32, 16, 1) +SUBPEL_VARIANCE_WXH_NEON(32, 32, 1) +SUBPEL_VARIANCE_WXH_NEON(32, 64, 1) + +SUBPEL_VARIANCE_WXH_NEON(64, 32, 1) +SUBPEL_VARIANCE_WXH_NEON(64, 64, 1) // 4xM filter writes an extra row to fdata because it processes two rows at a // time. -#define SUB_PIXEL_AVG_VARIANCENXM(n, m) \ - uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint8_t temp0[n * (m + (n == 4 ? 
2 : 1))]; \
-    uint8_t temp1[n * m]; \
-    \
-    if (n == 4) { \
-      var_filter_block2d_bil_w4(src_ptr, temp0, src_stride, 1, (m + 2), \
-                                bilinear_filters[x_offset]); \
-      var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \
-                                bilinear_filters[y_offset]); \
-    } else if (n == 8) { \
-      var_filter_block2d_bil_w8(src_ptr, temp0, src_stride, 1, (m + 1), \
-                                bilinear_filters[x_offset]); \
-      var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \
-                                bilinear_filters[y_offset]); \
-    } else { \
-      var_filter_block2d_bil_w16(src_ptr, temp0, src_stride, 1, (m + 1), n, \
-                                 bilinear_filters[x_offset]); \
-      var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \
-                                 bilinear_filters[y_offset]); \
-    } \
-    \
-    vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n); \
-    \
-    return vpx_variance##n##x##m(temp0, n, ref_ptr, ref_stride, sse); \
+#define SUB_PIXEL_AVG_VARIANCENXM(n, m) \
+  uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon( \
+      const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+      const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
+      const uint8_t *second_pred) { \
+    uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \
+    uint8_t temp1[n * m]; \
+    \
+    if (n == 4) { \
+      var_filter_block2d_bil_w4(src_ptr, temp0, src_stride, 1, (m + 2), \
+                                x_offset); \
+      var_filter_block2d_bil_w4(temp0, temp1, n, n, m, y_offset); \
+    } else if (n == 8) { \
+      var_filter_block2d_bil_w8(src_ptr, temp0, src_stride, 1, (m + 1), \
+                                x_offset); \
+      var_filter_block2d_bil_w8(temp0, temp1, n, n, m, y_offset); \
+    } else { \
+      var_filter_block2d_bil_large(src_ptr, temp0, src_stride, 1, n, (m + 1), \
+                                   x_offset); \
+      var_filter_block2d_bil_large(temp0, temp1, n, n, n, m, y_offset); \
+    } \
+    \
+    vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n); \
+    \
+    return vpx_variance##n##x##m(temp0, n, ref_ptr, ref_stride, sse); \
  }

 SUB_PIXEL_AVG_VARIANCENXM(4, 4)

From ae5b60cb4730639fc7742df577600ce71ddb5936 Mon Sep 17 00:00:00 2001
From: Salome Thirot
Date: Fri, 20 Jan 2023 10:35:34 +0000
Subject: [PATCH 498/926] Specialize Neon subpel variance by filter value for
 large blocks

The optimal implementation of the bilinear interpolation depends on the
filter values being used. For both horizontal and vertical interpolation
this can simplify to just taking the source values, or averaging the
source and reference values - which can be computed more easily than a
bilinear interpolation with arbitrary filter values.

This patch introduces run-time checks to select the optimal bilinear
interpolation implementation based on the filter values being used. This
new specialization is only used for larger block sizes (>= 16x16) as we
need to be doing enough work to make the cost of finding the optimal
implementation worth it.

This is a backport of this libaom change[1].

After this change, the only differences between the code in libvpx and
libaom are due to libvpx being compiled with ISO C90, which forbids
mixing declarations and code [-Wdeclaration-after-statement].
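To make the simplification concrete, here is a scalar sketch of the 2-tap
filter these kernels implement (illustrative only; bilin_tap is a
hypothetical helper, not part of the patch; requires <stdint.h>):

  /* Taps are {8 - offset, offset} with a rounding shift by 3. */
  static uint8_t bilin_tap(uint8_t a, uint8_t b, int offset) {
    return (uint8_t)((a * (8 - offset) + b * offset + 4) >> 3);
  }

For offset == 0 this reduces to (a * 8 + 4) >> 3 == a, a plain copy, and for
offset == 4 (the half-pel case) to (4 * a + 4 * b + 4) >> 3 ==
(a + b + 1) >> 1, a rounding average - exactly what the Neon vrhaddq_u8
instruction computes in the var_filter_block2d_avg helper below.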
[1] https://aomedia-review.googlesource.com/c/aom/+/162463 Change-Id: Ia818e148f6fd126656e8411d59c184b55dd43094 --- vpx_dsp/arm/subpel_variance_neon.c | 95 +++++++++++++++++++++++++++--- 1 file changed, 87 insertions(+), 8 deletions(-) diff --git a/vpx_dsp/arm/subpel_variance_neon.c b/vpx_dsp/arm/subpel_variance_neon.c index 3fb0acd544..60650b703a 100644 --- a/vpx_dsp/arm/subpel_variance_neon.c +++ b/vpx_dsp/arm/subpel_variance_neon.c @@ -107,6 +107,30 @@ static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr, dst_height, filter_offset); } +static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_width, int dst_height) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint8x16_t avg = vrhaddq_u8(s0, s1); + vst1q_u8(dst_ptr + j, avg); + + j += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + #define SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \ const uint8_t *src, int src_stride, int xoffset, int yoffset, \ @@ -119,6 +143,61 @@ static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr, return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } +#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse) { \ + if (xoffset == 0) { \ + if (yoffset == 0) { \ + return vpx_variance##w##x##h##_neon(src, src_stride, ref, ref_stride, \ + sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp[w * h]; \ + var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h); \ + return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h, \ + yoffset); \ + return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \ + return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ + var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } \ + } else { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \ + return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * 
h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } \ + } \ + } + // 4x blocks are processed two rows at a time, so require an extra row of // padding. SUBPEL_VARIANCE_WXH_NEON(4, 4, 2) @@ -128,16 +207,16 @@ SUBPEL_VARIANCE_WXH_NEON(8, 4, 1) SUBPEL_VARIANCE_WXH_NEON(8, 8, 1) SUBPEL_VARIANCE_WXH_NEON(8, 16, 1) -SUBPEL_VARIANCE_WXH_NEON(16, 8, 1) -SUBPEL_VARIANCE_WXH_NEON(16, 16, 1) -SUBPEL_VARIANCE_WXH_NEON(16, 32, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1) -SUBPEL_VARIANCE_WXH_NEON(32, 16, 1) -SUBPEL_VARIANCE_WXH_NEON(32, 32, 1) -SUBPEL_VARIANCE_WXH_NEON(32, 64, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1) -SUBPEL_VARIANCE_WXH_NEON(64, 32, 1) -SUBPEL_VARIANCE_WXH_NEON(64, 64, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1) // 4xM filter writes an extra row to fdata because it processes two rows at a // time. From b7f6c641397eb1ddac6fcaf34ec6db8fa0cbd7e7 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Fri, 20 Jan 2023 11:21:02 +0000 Subject: [PATCH 499/926] Refactor Neon averaging subpel variance functions Merge the computation of vpx_comp_avg_pred into the second pass of the bilinear filter - avoiding the overhead of loading and storing the entire block again. This is a backport of this libaom change[1]. [1] https://aomedia-review.googlesource.com/c/aom/+/166961 Change-Id: I9327ff7382a46d50c42a5213a11379b957146372 --- vpx_dsp/arm/subpel_variance_neon.c | 188 +++++++++++++++++++++++------ 1 file changed, 148 insertions(+), 40 deletions(-) diff --git a/vpx_dsp/arm/subpel_variance_neon.c b/vpx_dsp/arm/subpel_variance_neon.c index 60650b703a..237f7fad25 100644 --- a/vpx_dsp/arm/subpel_variance_neon.c +++ b/vpx_dsp/arm/subpel_variance_neon.c @@ -218,45 +218,153 @@ SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1) SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1) SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1) -// 4xM filter writes an extra row to fdata because it processes two rows at a -// time. -#define SUB_PIXEL_AVG_VARIANCENXM(n, m) \ - uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \ - uint8_t temp1[n * m]; \ - \ - if (n == 4) { \ - var_filter_block2d_bil_w4(src_ptr, temp0, src_stride, 1, (m + 2), \ - x_offset); \ - var_filter_block2d_bil_w4(temp0, temp1, n, n, m, y_offset); \ - } else if (n == 8) { \ - var_filter_block2d_bil_w8(src_ptr, temp0, src_stride, 1, (m + 1), \ - x_offset); \ - var_filter_block2d_bil_w8(temp0, temp1, n, n, m, y_offset); \ - } else { \ - var_filter_block2d_bil_large(src_ptr, temp0, src_stride, 1, n, (m + 1), \ - x_offset); \ - var_filter_block2d_bil_large(temp0, temp1, n, n, n, m, y_offset); \ - } \ - \ - vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n); \ - \ - return vpx_variance##n##x##m(temp0, n, ref_ptr, ref_stride, sse); \ +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 4. 
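+// Note: load_unaligned_u8() fetches two 4-byte rows per call, so each
+// iteration of the loop below filters, averages against second_pred and
+// stores two rows at once.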
+static void avg_pred_var_filter_block2d_bil_w4(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset, + const uint8_t *second_pred) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride); + uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + + uint8x8_t p = vld1_u8(second_pred); + uint8x8_t avg = vrhadd_u8(blend_u8, p); + + vst1_u8(dst_ptr, avg); + + src_ptr += 2 * src_stride; + dst_ptr += 2 * 4; + second_pred += 2 * 4; + i -= 2; + } while (i != 0); +} + +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 8. +static void avg_pred_var_filter_block2d_bil_w8(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset, + const uint8_t *second_pred) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = vld1_u8(src_ptr); + uint8x8_t s1 = vld1_u8(src_ptr + pixel_step); + uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + + uint8x8_t p = vld1_u8(second_pred); + uint8x8_t avg = vrhadd_u8(blend_u8, p); + + vst1_u8(dst_ptr, avg); + + src_ptr += src_stride; + dst_ptr += 8; + second_pred += 8; + } while (--i > 0); +} + +// Combine bilinear filter with vpx_comp_avg_pred for large blocks. +static void avg_pred_var_filter_block2d_bil_large( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_width, int dst_height, int filter_offset, + const uint8_t *second_pred) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint16x8_t blend_l = + vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1); + uint16x8_t blend_h = + vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1); + uint8x16_t blend_u8 = + vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3)); + + uint8x16_t p = vld1q_u8(second_pred); + uint8x16_t avg = vrhaddq_u8(blend_u8, p); + + vst1q_u8(dst_ptr + j, avg); + + j += 16; + second_pred += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 16. +static void avg_pred_var_filter_block2d_bil_w16( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint8_t *second_pred) { + avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 16, dst_height, + filter_offset, second_pred); +} + +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 32. +static void avg_pred_var_filter_block2d_bil_w32( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint8_t *second_pred) { + avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 32, dst_height, + filter_offset, second_pred); +} + +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 64. 
+static void avg_pred_var_filter_block2d_bil_w64(
+    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint8_t *second_pred) {
+  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+                                        pixel_step, 64, dst_height,
+                                        filter_offset, second_pred);
+}
+
+#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
+  unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon( \
+      const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+      const uint8_t *ref, int ref_stride, uint32_t *sse, \
+      const uint8_t *second_pred) { \
+    uint8_t tmp0[w * (h + padding)]; \
+    uint8_t tmp1[w * h]; \
+    var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding), \
+                                xoffset); \
+    avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \
+                                         second_pred); \
+    return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
  }

-SUB_PIXEL_AVG_VARIANCENXM(4, 4)
-SUB_PIXEL_AVG_VARIANCENXM(4, 8)
-SUB_PIXEL_AVG_VARIANCENXM(8, 4)
-SUB_PIXEL_AVG_VARIANCENXM(8, 8)
-SUB_PIXEL_AVG_VARIANCENXM(8, 16)
-SUB_PIXEL_AVG_VARIANCENXM(16, 8)
-SUB_PIXEL_AVG_VARIANCENXM(16, 16)
-SUB_PIXEL_AVG_VARIANCENXM(16, 32)
-SUB_PIXEL_AVG_VARIANCENXM(32, 16)
-SUB_PIXEL_AVG_VARIANCENXM(32, 32)
-SUB_PIXEL_AVG_VARIANCENXM(32, 64)
-SUB_PIXEL_AVG_VARIANCENXM(64, 32)
-SUB_PIXEL_AVG_VARIANCENXM(64, 64)
+// 4x blocks are processed two rows at a time, so require an extra row of
+// padding.
+SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2)
+SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2)
+
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1)
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1)
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1)
+
+SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1)
+SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1)
+SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1)
+
+SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1)
+SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1)
+SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1)
+
+SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1)
+SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1)

From 67abc6738942fff8299919e736138679d4a08016 Mon Sep 17 00:00:00 2001
From: Salome Thirot
Date: Fri, 20 Jan 2023 11:42:06 +0000
Subject: [PATCH 500/926] Specialize Neon averaging subpel variance by filter
 value

Use the same specialization for averaging subpel variance functions as
used for the non-averaging variants.

The rationale for the specialization is as follows: The optimal
implementation of the bilinear interpolation depends on the filter
values being used. For both horizontal and vertical interpolation this
can simplify to just taking the source values, or averaging the source
and reference values - which can be computed more easily than a
bilinear interpolation with arbitrary filter values.

This patch introduces run-time checks to select the optimal bilinear
interpolation implementation based on the filter values being used.
This new specialization is only used for larger block sizes (>= 16x16).

This is a backport of this libaom change[1].

After this change, the only differences between the code in libvpx and
libaom are due to libvpx being compiled with ISO C90, which forbids
mixing declarations and code [-Wdeclaration-after-statement].
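As an example of the savings, when xoffset == 0 and yoffset == 0 both
filter stages disappear and only the rounding average with second_pred
remains. A scalar model of that path (illustrative only; the patch
implements it with vrhaddq_u8 in the avg_pred helper below; requires
<stdint.h>):

  static void avg_pred_model(const uint8_t *src, uint8_t *dst,
                             int src_stride, int width, int height,
                             const uint8_t *second_pred) {
    int i, j;
    for (i = 0; i < height; ++i) {
      for (j = 0; j < width; ++j) {
        /* Rounding average, matching the vrhaddq_u8 semantics. */
        dst[j] = (uint8_t)((src[j] + second_pred[j] + 1) >> 1);
      }
      src += src_stride;
      dst += width;
      second_pred += width;
    }
  }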
[1] https://aomedia-review.googlesource.com/c/aom/+/166962 Change-Id: I7860c852db94a7c9c3d72ae4411316685f3800a4 --- vpx_dsp/arm/subpel_variance_neon.c | 136 +++++++++++++++++++++++++++-- 1 file changed, 128 insertions(+), 8 deletions(-) diff --git a/vpx_dsp/arm/subpel_variance_neon.c b/vpx_dsp/arm/subpel_variance_neon.c index 237f7fad25..9328c3ed89 100644 --- a/vpx_dsp/arm/subpel_variance_neon.c +++ b/vpx_dsp/arm/subpel_variance_neon.c @@ -335,6 +335,66 @@ static void avg_pred_var_filter_block2d_bil_w64( filter_offset, second_pred); } +// Combine averaging subpel filter with vpx_comp_avg_pred. +static void avg_pred_var_filter_block2d_avg(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_width, + int dst_height, + const uint8_t *second_pred) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint8x16_t avg = vrhaddq_u8(s0, s1); + + uint8x16_t p = vld1q_u8(second_pred); + avg = vrhaddq_u8(avg, p); + + vst1q_u8(dst_ptr + j, avg); + + j += 16; + second_pred += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +// Implementation of vpx_comp_avg_pred for blocks having width >= 16. +static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, + int dst_width, int dst_height, + const uint8_t *second_pred) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint8x16_t s = vld1q_u8(src_ptr + j); + uint8x16_t p = vld1q_u8(second_pred); + + uint8x16_t avg = vrhaddq_u8(s, p); + + vst1q_u8(dst_ptr + j, avg); + + j += 16; + second_pred += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + #define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \ unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon( \ const uint8_t *src, int source_stride, int xoffset, int yoffset, \ @@ -349,6 +409,66 @@ static void avg_pred_var_filter_block2d_bil_w64( return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } +#define SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int source_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse, \ + const uint8_t *second_pred) { \ + if (xoffset == 0) { \ + uint8_t tmp[w * h]; \ + if (yoffset == 0) { \ + avg_pred(src, tmp, source_stride, w, h, second_pred); \ + return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + avg_pred_var_filter_block2d_avg(src, tmp, source_stride, \ + source_stride, w, h, second_pred); \ + return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + } else { \ + avg_pred_var_filter_block2d_bil_w##w( \ + src, tmp, source_stride, source_stride, h, yoffset, second_pred); \ + return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h, \ + second_pred); \ + return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * (h + 
padding)]; \ + var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \ + avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \ + avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ + second_pred); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } \ + } else { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h, \ + xoffset, second_pred); \ + return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \ + (h + padding), xoffset); \ + avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \ + (h + padding), xoffset); \ + avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ + second_pred); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } \ + } \ + } + // 4x blocks are processed two rows at a time, so require an extra row of // padding. SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2) @@ -358,13 +478,13 @@ SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1) SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1) SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1) From 72cfcdd95ab0d17c4b8b13d9da00d1458105bf80 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 24 Jan 2023 14:08:17 -0500 Subject: [PATCH 501/926] Skip calculating internal stats when frame dropped Bug: webm:1771 Change-Id: I30cd5b7ec0945b521a1cc03999d39ec6a25f1696 --- vp9/encoder/vp9_encoder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 1c5c9fc923..b66fdc0bca 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -7945,7 +7945,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, #if CONFIG_INTERNAL_STATS - if (oxcf->pass != 1) { + if (oxcf->pass != 1 && !cpi->last_frame_dropped) { double samples = 0.0; cpi->bytes += (int)(*size); From 3384b83da0856df86bab0811e7b5a3495925ac70 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 24 Jan 2023 20:48:06 +0000 Subject: [PATCH 502/926] [NEON] Add Highbd FHT 8x8/16x16 functions In total this gives about 9% extra performance for both rt/best profiles. 
Furthermore, add a transpose_s32 16x16 function.

Change-Id: Ib6f368bbb9af7f03c9ce0deba1664cef77632fe2
---
 test/dct_test.cc                    |   4 +
 vp9/common/vp9_rtcd_defs.pl         |   2 +
 vp9/encoder/arm/neon/vp9_dct_neon.c | 942 ++++++++++++++++++++++++++++
 vpx_dsp/arm/fdct16x16_neon.c        | 306 +++++++++
 vpx_dsp/arm/fdct16x16_neon.h        | 306 ---------
 vpx_dsp/arm/fdct8x8_neon.h          |  78 +--
 vpx_dsp/arm/fdct_neon.h             | 119 ++++
 vpx_dsp/arm/transpose_neon.h        |  62 ++
 8 files changed, 1437 insertions(+), 382 deletions(-)

diff --git a/test/dct_test.cc b/test/dct_test.cc
index 0304029bd2..9a150a24f1 100644
--- a/test/dct_test.cc
+++ b/test/dct_test.cc
@@ -671,8 +671,12 @@ static const FuncInfo ht_neon_func_info[] = {
     4, 2 },
   { &vp9_highbd_fht8x8_c, &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_c>, 8,
     2 },
+  { &vp9_highbd_fht8x8_neon, &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_neon>,
+    8, 2 },
   { &vp9_highbd_fht16x16_c, &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>,
     16, 2 },
+  { &vp9_highbd_fht16x16_neon,
+    &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_neon>, 16, 2 },
 #endif
   { &vp9_fht4x4_c, &iht_wrapper<vp9_iht4x4_16_add_c>, 4, 1 },
   { &vp9_fht4x4_neon, &iht_wrapper<vp9_iht4x4_16_add_neon>, 4, 1 },
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index f4bd9772c3..20a482c85f 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -206,8 +206,10 @@ ()
   specialize qw/vp9_highbd_fht4x4 neon/;

   add_proto qw/void vp9_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_highbd_fht8x8 neon/;

   add_proto qw/void vp9_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_highbd_fht16x16 neon/;

   add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c
index 5961be5f31..997b5477e1 100644
--- a/vp9/encoder/arm/neon/vp9_dct_neon.c
+++ b/vp9/encoder/arm/neon/vp9_dct_neon.c
@@ -20,6 +20,7 @@
 #include "vpx_dsp/arm/fdct_neon.h"
 #include "vpx_dsp/arm/fdct4x4_neon.h"
 #include "vpx_dsp/arm/fdct8x8_neon.h"
+#include "vpx_dsp/arm/fdct16x16_neon.h"

 static INLINE void load_buffer_4x4(const int16_t *input, int16x8_t *in,
                                    int stride) {
@@ -1228,4 +1229,945 @@ void vp9_highbd_fht4x4_neon(const int16_t *input, tran_low_t *output,
   }
 }

+static INLINE void highbd_load_buffer_8x8(const int16_t *input,
+                                          int32x4_t *lo /*[8]*/,
+                                          int32x4_t *hi /*[8]*/, int stride) {
+  int16x8_t in[8];
+  in[0] = vld1q_s16(input + 0 * stride);
+  in[1] = vld1q_s16(input + 1 * stride);
+  in[2] = vld1q_s16(input + 2 * stride);
+  in[3] = vld1q_s16(input + 3 * stride);
+  in[4] = vld1q_s16(input + 4 * stride);
+  in[5] = vld1q_s16(input + 5 * stride);
+  in[6] = vld1q_s16(input + 6 * stride);
+  in[7] = vld1q_s16(input + 7 * stride);
+  lo[0] = vshll_n_s16(vget_low_s16(in[0]), 2);
+  hi[0] = vshll_n_s16(vget_high_s16(in[0]), 2);
+  lo[1] = vshll_n_s16(vget_low_s16(in[1]), 2);
+  hi[1] = vshll_n_s16(vget_high_s16(in[1]), 2);
+  lo[2] = vshll_n_s16(vget_low_s16(in[2]), 2);
+  hi[2] = vshll_n_s16(vget_high_s16(in[2]), 2);
+  lo[3] = vshll_n_s16(vget_low_s16(in[3]), 2);
+  hi[3] = vshll_n_s16(vget_high_s16(in[3]), 2);
+  lo[4] = vshll_n_s16(vget_low_s16(in[4]), 2);
+  hi[4] = vshll_n_s16(vget_high_s16(in[4]), 2);
+  lo[5] = vshll_n_s16(vget_low_s16(in[5]), 2);
+  hi[5] = vshll_n_s16(vget_high_s16(in[5]), 2);
+  lo[6] = vshll_n_s16(vget_low_s16(in[6]), 2);
+  hi[6] = vshll_n_s16(vget_high_s16(in[6]), 2);
+  lo[7] = vshll_n_s16(vget_low_s16(in[7]), 2);
+  hi[7] = vshll_n_s16(vget_high_s16(in[7]), 2);
+}
+
+/* right shift and rounding
+ * first get the sign bit (bit 31).
+ * If bit == 1, it's the simple case of shifting right by one bit. + * If bit == 2, it essentially computes the expression: + * + * out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; + * + * for each row. + */ +static INLINE void highbd_right_shift_8x8(int32x4_t *lo, int32x4_t *hi, + const int bit) { + int32x4_t sign_lo[8], sign_hi[8]; + sign_lo[0] = vshrq_n_s32(lo[0], 31); + sign_hi[0] = vshrq_n_s32(hi[0], 31); + sign_lo[1] = vshrq_n_s32(lo[1], 31); + sign_hi[1] = vshrq_n_s32(hi[1], 31); + sign_lo[2] = vshrq_n_s32(lo[2], 31); + sign_hi[2] = vshrq_n_s32(hi[2], 31); + sign_lo[3] = vshrq_n_s32(lo[3], 31); + sign_hi[3] = vshrq_n_s32(hi[3], 31); + sign_lo[4] = vshrq_n_s32(lo[4], 31); + sign_hi[4] = vshrq_n_s32(hi[4], 31); + sign_lo[5] = vshrq_n_s32(lo[5], 31); + sign_hi[5] = vshrq_n_s32(hi[5], 31); + sign_lo[6] = vshrq_n_s32(lo[6], 31); + sign_hi[6] = vshrq_n_s32(hi[6], 31); + sign_lo[7] = vshrq_n_s32(lo[7], 31); + sign_hi[7] = vshrq_n_s32(hi[7], 31); + + if (bit == 2) { + const int32x4_t const_rounding = vdupq_n_s32(1); + lo[0] = vaddq_s32(lo[0], const_rounding); + hi[0] = vaddq_s32(hi[0], const_rounding); + lo[1] = vaddq_s32(lo[1], const_rounding); + hi[1] = vaddq_s32(hi[1], const_rounding); + lo[2] = vaddq_s32(lo[2], const_rounding); + hi[2] = vaddq_s32(hi[2], const_rounding); + lo[3] = vaddq_s32(lo[3], const_rounding); + hi[3] = vaddq_s32(hi[3], const_rounding); + lo[4] = vaddq_s32(lo[4], const_rounding); + hi[4] = vaddq_s32(hi[4], const_rounding); + lo[5] = vaddq_s32(lo[5], const_rounding); + hi[5] = vaddq_s32(hi[5], const_rounding); + lo[6] = vaddq_s32(lo[6], const_rounding); + hi[6] = vaddq_s32(hi[6], const_rounding); + lo[7] = vaddq_s32(lo[7], const_rounding); + hi[7] = vaddq_s32(hi[7], const_rounding); + } + + lo[0] = vsubq_s32(lo[0], sign_lo[0]); + hi[0] = vsubq_s32(hi[0], sign_hi[0]); + lo[1] = vsubq_s32(lo[1], sign_lo[1]); + hi[1] = vsubq_s32(hi[1], sign_hi[1]); + lo[2] = vsubq_s32(lo[2], sign_lo[2]); + hi[2] = vsubq_s32(hi[2], sign_hi[2]); + lo[3] = vsubq_s32(lo[3], sign_lo[3]); + hi[3] = vsubq_s32(hi[3], sign_hi[3]); + lo[4] = vsubq_s32(lo[4], sign_lo[4]); + hi[4] = vsubq_s32(hi[4], sign_hi[4]); + lo[5] = vsubq_s32(lo[5], sign_lo[5]); + hi[5] = vsubq_s32(hi[5], sign_hi[5]); + lo[6] = vsubq_s32(lo[6], sign_lo[6]); + hi[6] = vsubq_s32(hi[6], sign_hi[6]); + lo[7] = vsubq_s32(lo[7], sign_lo[7]); + hi[7] = vsubq_s32(hi[7], sign_hi[7]); + + if (bit == 1) { + lo[0] = vshrq_n_s32(lo[0], 1); + hi[0] = vshrq_n_s32(hi[0], 1); + lo[1] = vshrq_n_s32(lo[1], 1); + hi[1] = vshrq_n_s32(hi[1], 1); + lo[2] = vshrq_n_s32(lo[2], 1); + hi[2] = vshrq_n_s32(hi[2], 1); + lo[3] = vshrq_n_s32(lo[3], 1); + hi[3] = vshrq_n_s32(hi[3], 1); + lo[4] = vshrq_n_s32(lo[4], 1); + hi[4] = vshrq_n_s32(hi[4], 1); + lo[5] = vshrq_n_s32(lo[5], 1); + hi[5] = vshrq_n_s32(hi[5], 1); + lo[6] = vshrq_n_s32(lo[6], 1); + hi[6] = vshrq_n_s32(hi[6], 1); + lo[7] = vshrq_n_s32(lo[7], 1); + hi[7] = vshrq_n_s32(hi[7], 1); + } else { + lo[0] = vshrq_n_s32(lo[0], 2); + hi[0] = vshrq_n_s32(hi[0], 2); + lo[1] = vshrq_n_s32(lo[1], 2); + hi[1] = vshrq_n_s32(hi[1], 2); + lo[2] = vshrq_n_s32(lo[2], 2); + hi[2] = vshrq_n_s32(hi[2], 2); + lo[3] = vshrq_n_s32(lo[3], 2); + hi[3] = vshrq_n_s32(hi[3], 2); + lo[4] = vshrq_n_s32(lo[4], 2); + hi[4] = vshrq_n_s32(hi[4], 2); + lo[5] = vshrq_n_s32(lo[5], 2); + hi[5] = vshrq_n_s32(hi[5], 2); + lo[6] = vshrq_n_s32(lo[6], 2); + hi[6] = vshrq_n_s32(hi[6], 2); + lo[7] = vshrq_n_s32(lo[7], 2); + hi[7] = vshrq_n_s32(hi[7], 2); + } +} + +static INLINE void highbd_write_buffer_8x8(tran_low_t *output, 
int32x4_t *lo, + int32x4_t *hi, int stride) { + vst1q_s32(output + 0 * stride, lo[0]); + vst1q_s32(output + 0 * stride + 4, hi[0]); + vst1q_s32(output + 1 * stride, lo[1]); + vst1q_s32(output + 1 * stride + 4, hi[1]); + vst1q_s32(output + 2 * stride, lo[2]); + vst1q_s32(output + 2 * stride + 4, hi[2]); + vst1q_s32(output + 3 * stride, lo[3]); + vst1q_s32(output + 3 * stride + 4, hi[3]); + vst1q_s32(output + 4 * stride, lo[4]); + vst1q_s32(output + 4 * stride + 4, hi[4]); + vst1q_s32(output + 5 * stride, lo[5]); + vst1q_s32(output + 5 * stride + 4, hi[5]); + vst1q_s32(output + 6 * stride, lo[6]); + vst1q_s32(output + 6 * stride + 4, hi[6]); + vst1q_s32(output + 7 * stride, lo[7]); + vst1q_s32(output + 7 * stride + 4, hi[7]); +} + +static INLINE void highbd_fadst8x8_neon(int32x4_t *lo /*[8]*/, + int32x4_t *hi /*[8]*/) { + int32x4_t s_lo[8], s_hi[8]; + int32x4_t t_lo[8], t_hi[8]; + int32x4_t x_lo[8], x_hi[8]; + int64x2_t s64_lo[16], s64_hi[16]; + + x_lo[0] = lo[7]; + x_hi[0] = hi[7]; + x_lo[1] = lo[0]; + x_hi[1] = hi[0]; + x_lo[2] = lo[5]; + x_hi[2] = hi[5]; + x_lo[3] = lo[2]; + x_hi[3] = hi[2]; + x_lo[4] = lo[3]; + x_hi[4] = hi[3]; + x_lo[5] = lo[4]; + x_hi[5] = hi[4]; + x_lo[6] = lo[1]; + x_hi[6] = hi[1]; + x_lo[7] = lo[6]; + x_hi[7] = hi[6]; + + // stage 1 + // s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + // s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + butterfly_two_coeff_s32_s64_noround( + x_lo[0], x_hi[0], x_lo[1], x_hi[1], cospi_2_64, cospi_30_64, + &s64_lo[2 * 0], &s64_hi[2 * 0], &s64_lo[2 * 1], &s64_hi[2 * 1]); + // s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + // s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + butterfly_two_coeff_s32_s64_noround( + x_lo[2], x_hi[2], x_lo[3], x_hi[3], cospi_10_64, cospi_22_64, + &s64_lo[2 * 2], &s64_hi[2 * 2], &s64_lo[2 * 3], &s64_hi[2 * 3]); + + // s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + // s5 = cospi_14_64 * x4 - cospi_18_64 * x5; + butterfly_two_coeff_s32_s64_noround( + x_lo[4], x_hi[4], x_lo[5], x_hi[5], cospi_18_64, cospi_14_64, + &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]); + + // s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + // s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + butterfly_two_coeff_s32_s64_noround( + x_lo[6], x_hi[6], x_lo[7], x_hi[7], cospi_26_64, cospi_6_64, + &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]); + + // fdct_round_shift, indices are doubled + t_lo[0] = add_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 4]); + t_hi[0] = add_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 4]); + t_lo[1] = add_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 5]); + t_hi[1] = add_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 5]); + t_lo[2] = add_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 6]); + t_hi[2] = add_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 6]); + t_lo[3] = add_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 7]); + t_hi[3] = add_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 7]); + t_lo[4] = sub_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 4]); + t_hi[4] = sub_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 4]); + t_lo[5] = sub_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 5]); + t_hi[5] = sub_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 5]); + t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 6]); + t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 6]); + t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 7]); + t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 7]); + + // stage 2 + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + 
s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + // s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + // s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + butterfly_two_coeff_s32_s64_noround( + t_lo[4], t_hi[4], t_lo[5], t_hi[5], cospi_8_64, cospi_24_64, + &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]); + + // s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + // s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + butterfly_two_coeff_s32_s64_noround( + t_lo[6], t_hi[6], t_lo[7], t_hi[7], -cospi_24_64, cospi_8_64, + &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]); + + // fdct_round_shift + // s0 + s2 + t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[2]); + t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[2]); + // s0 - s2 + t_lo[2] = sub_s32_s64_narrow(s_lo[0], s_lo[2]); + t_hi[2] = sub_s32_s64_narrow(s_hi[0], s_hi[2]); + + // s1 + s3 + t_lo[1] = add_s32_s64_narrow(s_lo[1], s_lo[3]); + t_hi[1] = add_s32_s64_narrow(s_hi[1], s_hi[3]); + // s1 - s3 + t_lo[3] = sub_s32_s64_narrow(s_lo[1], s_lo[3]); + t_hi[3] = sub_s32_s64_narrow(s_hi[1], s_hi[3]); + + // s4 + s6 + t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]); + t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]); + // s4 - s6 + t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]); + t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]); + + // s5 + s7 + t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]); + t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]); + // s5 - s7 + t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]); + t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]); + + // stage 3 + // s2 = cospi_16_64 * (x2 + x3) + // s3 = cospi_16_64 * (x2 - x3) + butterfly_one_coeff_s32_fast(t_lo[2], t_hi[2], t_lo[3], t_hi[3], cospi_16_64, + &s_lo[2], &s_hi[2], &s_lo[3], &s_hi[3]); + + // s6 = cospi_16_64 * (x6 + x7) + // s7 = cospi_16_64 * (x6 - x7) + butterfly_one_coeff_s32_fast(t_lo[6], t_hi[6], t_lo[7], t_hi[7], cospi_16_64, + &s_lo[6], &s_hi[6], &s_lo[7], &s_hi[7]); + + // x0, x2, x4, x6 pass through + lo[0] = t_lo[0]; + hi[0] = t_hi[0]; + lo[2] = s_lo[6]; + hi[2] = s_hi[6]; + lo[4] = s_lo[3]; + hi[4] = s_hi[3]; + lo[6] = t_lo[5]; + hi[6] = t_hi[5]; + + lo[1] = vnegq_s32(t_lo[4]); + hi[1] = vnegq_s32(t_hi[4]); + lo[3] = vnegq_s32(s_lo[2]); + hi[3] = vnegq_s32(s_hi[2]); + lo[5] = vnegq_s32(s_lo[7]); + hi[5] = vnegq_s32(s_hi[7]); + lo[7] = vnegq_s32(t_lo[1]); + hi[7] = vnegq_s32(t_hi[1]); + + transpose_s32_8x8_2(lo, hi, lo, hi); +} + +void vp9_highbd_fht8x8_neon(const int16_t *input, tran_low_t *output, + int stride, int tx_type) { + int32x4_t lo[8], hi[8]; + + switch (tx_type) { + case DCT_DCT: vpx_highbd_fdct8x8_neon(input, output, stride); break; + case ADST_DCT: + highbd_load_buffer_8x8(input, lo, hi, stride); + highbd_fadst8x8_neon(lo, hi); + // pass1 variant is not precise enough + vpx_highbd_fdct8x8_pass2_neon(lo, hi); + highbd_right_shift_8x8(lo, hi, 1); + highbd_write_buffer_8x8(output, lo, hi, 8); + break; + case DCT_ADST: + highbd_load_buffer_8x8(input, lo, hi, stride); + // pass1 variant is not precise enough + vpx_highbd_fdct8x8_pass2_neon(lo, hi); + highbd_fadst8x8_neon(lo, hi); + highbd_right_shift_8x8(lo, hi, 1); + highbd_write_buffer_8x8(output, lo, hi, 8); + break; + default: + assert(tx_type == ADST_ADST); + highbd_load_buffer_8x8(input, lo, hi, stride); + highbd_fadst8x8_neon(lo, hi); + highbd_fadst8x8_neon(lo, hi); + highbd_right_shift_8x8(lo, hi, 1); + highbd_write_buffer_8x8(output, lo, hi, 8); + break; + } +} + +static 
INLINE void highbd_load_buffer_16x16( + const int16_t *input, int32x4_t *left1 /*[16]*/, int32x4_t *right1 /*[16]*/, + int32x4_t *left2 /*[16]*/, int32x4_t *right2 /*[16]*/, int stride) { + // load first 8 columns + highbd_load_buffer_8x8(input, left1, right1, stride); + highbd_load_buffer_8x8(input + 8 * stride, left1 + 8, right1 + 8, stride); + + input += 8; + // load second 8 columns + highbd_load_buffer_8x8(input, left2, right2, stride); + highbd_load_buffer_8x8(input + 8 * stride, left2 + 8, right2 + 8, stride); +} + +static INLINE void highbd_write_buffer_16x16( + tran_low_t *output, int32x4_t *left1 /*[16]*/, int32x4_t *right1 /*[16]*/, + int32x4_t *left2 /*[16]*/, int32x4_t *right2 /*[16]*/, int stride) { + // write first 8 columns + highbd_write_buffer_8x8(output, left1, right1, stride); + highbd_write_buffer_8x8(output + 8 * stride, left1 + 8, right1 + 8, stride); + + // write second 8 columns + output += 8; + highbd_write_buffer_8x8(output, left2, right2, stride); + highbd_write_buffer_8x8(output + 8 * stride, left2 + 8, right2 + 8, stride); +} + +static INLINE void highbd_right_shift_16x16(int32x4_t *left1 /*[16]*/, + int32x4_t *right1 /*[16]*/, + int32x4_t *left2 /*[16]*/, + int32x4_t *right2 /*[16]*/, + const int bit) { + // perform rounding operations + highbd_right_shift_8x8(left1, right1, bit); + highbd_right_shift_8x8(left1 + 8, right1 + 8, bit); + highbd_right_shift_8x8(left2, right2, bit); + highbd_right_shift_8x8(left2 + 8, right2 + 8, bit); +} + +static void highbd_fdct16_8col(int32x4_t *left, int32x4_t *right) { + // perform 16x16 1-D DCT for 8 columns + int32x4_t s1_lo[8], s1_hi[8], s2_lo[8], s2_hi[8], s3_lo[8], s3_hi[8]; + int32x4_t left8[8], right8[8]; + + // stage 1 + left8[0] = vaddq_s32(left[0], left[15]); + right8[0] = vaddq_s32(right[0], right[15]); + left8[1] = vaddq_s32(left[1], left[14]); + right8[1] = vaddq_s32(right[1], right[14]); + left8[2] = vaddq_s32(left[2], left[13]); + right8[2] = vaddq_s32(right[2], right[13]); + left8[3] = vaddq_s32(left[3], left[12]); + right8[3] = vaddq_s32(right[3], right[12]); + left8[4] = vaddq_s32(left[4], left[11]); + right8[4] = vaddq_s32(right[4], right[11]); + left8[5] = vaddq_s32(left[5], left[10]); + right8[5] = vaddq_s32(right[5], right[10]); + left8[6] = vaddq_s32(left[6], left[9]); + right8[6] = vaddq_s32(right[6], right[9]); + left8[7] = vaddq_s32(left[7], left[8]); + right8[7] = vaddq_s32(right[7], right[8]); + + // step 1 + s1_lo[0] = vsubq_s32(left[7], left[8]); + s1_hi[0] = vsubq_s32(right[7], right[8]); + s1_lo[1] = vsubq_s32(left[6], left[9]); + s1_hi[1] = vsubq_s32(right[6], right[9]); + s1_lo[2] = vsubq_s32(left[5], left[10]); + s1_hi[2] = vsubq_s32(right[5], right[10]); + s1_lo[3] = vsubq_s32(left[4], left[11]); + s1_hi[3] = vsubq_s32(right[4], right[11]); + s1_lo[4] = vsubq_s32(left[3], left[12]); + s1_hi[4] = vsubq_s32(right[3], right[12]); + s1_lo[5] = vsubq_s32(left[2], left[13]); + s1_hi[5] = vsubq_s32(right[2], right[13]); + s1_lo[6] = vsubq_s32(left[1], left[14]); + s1_hi[6] = vsubq_s32(right[1], right[14]); + s1_lo[7] = vsubq_s32(left[0], left[15]); + s1_hi[7] = vsubq_s32(right[0], right[15]); + + // pass1 variant is not accurate enough + vpx_highbd_fdct8x8_pass2_notranspose_neon(left8, right8); + + // step 2 + // step2[2] = (step1[5] - step1[2]) * cospi_16_64; + // step2[5] = (step1[5] + step1[2]) * cospi_16_64; + butterfly_one_coeff_s32_s64_narrow(s1_lo[5], s1_hi[5], s1_lo[2], s1_hi[2], + cospi_16_64, &s2_lo[5], &s2_hi[5], + &s2_lo[2], &s2_hi[2]); + // step2[3] = (step1[4] - step1[3]) * 
cospi_16_64; + // step2[4] = (step1[4] + step1[3]) * cospi_16_64; + butterfly_one_coeff_s32_s64_narrow(s1_lo[4], s1_hi[4], s1_lo[3], s1_hi[3], + cospi_16_64, &s2_lo[4], &s2_hi[4], + &s2_lo[3], &s2_hi[3]); + + // step 3 + s3_lo[0] = vaddq_s32(s1_lo[0], s2_lo[3]); + s3_hi[0] = vaddq_s32(s1_hi[0], s2_hi[3]); + s3_lo[1] = vaddq_s32(s1_lo[1], s2_lo[2]); + s3_hi[1] = vaddq_s32(s1_hi[1], s2_hi[2]); + s3_lo[2] = vsubq_s32(s1_lo[1], s2_lo[2]); + s3_hi[2] = vsubq_s32(s1_hi[1], s2_hi[2]); + s3_lo[3] = vsubq_s32(s1_lo[0], s2_lo[3]); + s3_hi[3] = vsubq_s32(s1_hi[0], s2_hi[3]); + s3_lo[4] = vsubq_s32(s1_lo[7], s2_lo[4]); + s3_hi[4] = vsubq_s32(s1_hi[7], s2_hi[4]); + s3_lo[5] = vsubq_s32(s1_lo[6], s2_lo[5]); + s3_hi[5] = vsubq_s32(s1_hi[6], s2_hi[5]); + s3_lo[6] = vaddq_s32(s1_lo[6], s2_lo[5]); + s3_hi[6] = vaddq_s32(s1_hi[6], s2_hi[5]); + s3_lo[7] = vaddq_s32(s1_lo[7], s2_lo[4]); + s3_hi[7] = vaddq_s32(s1_hi[7], s2_hi[4]); + + // step 4 + // s2[1] = cospi_24_64 * s3[6] - cospi_8_64 * s3[1] + // s2[6] = cospi_8_64 * s3[6] + cospi_24_64 * s3[1] + butterfly_two_coeff_s32_s64_narrow(s3_lo[6], s3_hi[6], s3_lo[1], s3_hi[1], + cospi_8_64, cospi_24_64, &s2_lo[6], + &s2_hi[6], &s2_lo[1], &s2_hi[1]); + + // s2[5] = cospi_8_64 * s3[2] - cospi_24_64 * s3[5] + // s2[2] = cospi_24_64 * s3[2] + cospi_8_64 * s3[5] + butterfly_two_coeff_s32_s64_narrow(s3_lo[2], s3_hi[2], s3_lo[5], s3_hi[5], + cospi_24_64, cospi_8_64, &s2_lo[2], + &s2_hi[2], &s2_lo[5], &s2_hi[5]); + + // step 5 + s1_lo[0] = vaddq_s32(s3_lo[0], s2_lo[1]); + s1_hi[0] = vaddq_s32(s3_hi[0], s2_hi[1]); + s1_lo[1] = vsubq_s32(s3_lo[0], s2_lo[1]); + s1_hi[1] = vsubq_s32(s3_hi[0], s2_hi[1]); + s1_lo[2] = vaddq_s32(s3_lo[3], s2_lo[2]); + s1_hi[2] = vaddq_s32(s3_hi[3], s2_hi[2]); + s1_lo[3] = vsubq_s32(s3_lo[3], s2_lo[2]); + s1_hi[3] = vsubq_s32(s3_hi[3], s2_hi[2]); + s1_lo[4] = vsubq_s32(s3_lo[4], s2_lo[5]); + s1_hi[4] = vsubq_s32(s3_hi[4], s2_hi[5]); + s1_lo[5] = vaddq_s32(s3_lo[4], s2_lo[5]); + s1_hi[5] = vaddq_s32(s3_hi[4], s2_hi[5]); + s1_lo[6] = vsubq_s32(s3_lo[7], s2_lo[6]); + s1_hi[6] = vsubq_s32(s3_hi[7], s2_hi[6]); + s1_lo[7] = vaddq_s32(s3_lo[7], s2_lo[6]); + s1_hi[7] = vaddq_s32(s3_hi[7], s2_hi[6]); + + // step 6 + // out[1] = step1[7] * cospi_2_64 + step1[0] * cospi_30_64 + // out[15] = step1[7] * cospi_30_64 - step1[0] * cospi_2_64 + butterfly_two_coeff_s32_s64_narrow(s1_lo[7], s1_hi[7], s1_lo[0], s1_hi[0], + cospi_2_64, cospi_30_64, &left[1], + &right[1], &left[15], &right[15]); + + // out[9] = step1[6] * cospi_18_64 + step1[1] * cospi_14_64 + // out[7] = step1[6] * cospi_14_64 - step1[1] * cospi_18_64 + butterfly_two_coeff_s32_s64_narrow(s1_lo[6], s1_hi[6], s1_lo[1], s1_hi[1], + cospi_18_64, cospi_14_64, &left[9], + &right[9], &left[7], &right[7]); + + // out[5] = step1[5] * cospi_10_64 + step1[2] * cospi_22_64 + // out[11] = step1[5] * cospi_22_64 - step1[2] * cospi_10_64 + butterfly_two_coeff_s32_s64_narrow(s1_lo[5], s1_hi[5], s1_lo[2], s1_hi[2], + cospi_10_64, cospi_22_64, &left[5], + &right[5], &left[11], &right[11]); + + // out[13] = step1[4] * cospi_26_64 + step1[3] * cospi_6_64 + // out[3] = step1[4] * cospi_6_64 - step1[3] * cospi_26_64 + butterfly_two_coeff_s32_s64_narrow(s1_lo[4], s1_hi[4], s1_lo[3], s1_hi[3], + cospi_26_64, cospi_6_64, &left[13], + &right[13], &left[3], &right[3]); + + left[0] = left8[0]; + right[0] = right8[0]; + left[2] = left8[1]; + right[2] = right8[1]; + left[4] = left8[2]; + right[4] = right8[2]; + left[6] = left8[3]; + right[6] = right8[3]; + left[8] = left8[4]; + right[8] = right8[4]; + left[10] = left8[5]; + 
right[10] = right8[5]; + left[12] = left8[6]; + right[12] = right8[6]; + left[14] = left8[7]; + right[14] = right8[7]; +} + +static void highbd_fadst16_8col(int32x4_t *left, int32x4_t *right) { + // perform 16x16 1-D ADST for 8 columns + int32x4_t x_lo[16], x_hi[16]; + int32x4_t s_lo[16], s_hi[16]; + int32x4_t t_lo[16], t_hi[16]; + int64x2_t s64_lo[32], s64_hi[32]; + + x_lo[0] = left[15]; + x_hi[0] = right[15]; + x_lo[1] = left[0]; + x_hi[1] = right[0]; + x_lo[2] = left[13]; + x_hi[2] = right[13]; + x_lo[3] = left[2]; + x_hi[3] = right[2]; + x_lo[4] = left[11]; + x_hi[4] = right[11]; + x_lo[5] = left[4]; + x_hi[5] = right[4]; + x_lo[6] = left[9]; + x_hi[6] = right[9]; + x_lo[7] = left[6]; + x_hi[7] = right[6]; + x_lo[8] = left[7]; + x_hi[8] = right[7]; + x_lo[9] = left[8]; + x_hi[9] = right[8]; + x_lo[10] = left[5]; + x_hi[10] = right[5]; + x_lo[11] = left[10]; + x_hi[11] = right[10]; + x_lo[12] = left[3]; + x_hi[12] = right[3]; + x_lo[13] = left[12]; + x_hi[13] = right[12]; + x_lo[14] = left[1]; + x_hi[14] = right[1]; + x_lo[15] = left[14]; + x_hi[15] = right[14]; + + // stage 1, indices are doubled + // s0 = cospi_1_64 * x0 + cospi_31_64 * x1; + // s1 = cospi_31_64 * x0 - cospi_1_64 * x1; + butterfly_two_coeff_s32_s64_noround( + x_lo[0], x_hi[0], x_lo[1], x_hi[1], cospi_1_64, cospi_31_64, + &s64_lo[2 * 0], &s64_hi[2 * 0], &s64_lo[2 * 1], &s64_hi[2 * 1]); + // s2 = cospi_5_64 * x2 + cospi_27_64 * x3; + // s3 = cospi_27_64 * x2 - cospi_5_64 * x3; + butterfly_two_coeff_s32_s64_noround( + x_lo[2], x_hi[2], x_lo[3], x_hi[3], cospi_5_64, cospi_27_64, + &s64_lo[2 * 2], &s64_hi[2 * 2], &s64_lo[2 * 3], &s64_hi[2 * 3]); + // s4 = cospi_9_64 * x4 + cospi_23_64 * x5; + // s5 = cospi_23_64 * x4 - cospi_9_64 * x5; + butterfly_two_coeff_s32_s64_noround( + x_lo[4], x_hi[4], x_lo[5], x_hi[5], cospi_9_64, cospi_23_64, + &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]); + // s6 = cospi_13_64 * x6 + cospi_19_64 * x7; + // s7 = cospi_19_64 * x6 - cospi_13_64 * x7; + butterfly_two_coeff_s32_s64_noround( + x_lo[6], x_hi[6], x_lo[7], x_hi[7], cospi_13_64, cospi_19_64, + &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]); + // s8 = cospi_17_64 * x8 + cospi_15_64 * x9; + // s9 = cospi_15_64 * x8 - cospi_17_64 * x9; + butterfly_two_coeff_s32_s64_noround( + x_lo[8], x_hi[8], x_lo[9], x_hi[9], cospi_17_64, cospi_15_64, + &s64_lo[2 * 8], &s64_hi[2 * 8], &s64_lo[2 * 9], &s64_hi[2 * 9]); + // s10 = cospi_21_64 * x10 + cospi_11_64 * x11; + // s11 = cospi_11_64 * x10 - cospi_21_64 * x11; + butterfly_two_coeff_s32_s64_noround( + x_lo[10], x_hi[10], x_lo[11], x_hi[11], cospi_21_64, cospi_11_64, + &s64_lo[2 * 10], &s64_hi[2 * 10], &s64_lo[2 * 11], &s64_hi[2 * 11]); + // s12 = cospi_25_64 * x12 + cospi_7_64 * x13; + // s13 = cospi_7_64 * x12 - cospi_25_64 * x13; + butterfly_two_coeff_s32_s64_noround( + x_lo[12], x_hi[12], x_lo[13], x_hi[13], cospi_25_64, cospi_7_64, + &s64_lo[2 * 12], &s64_hi[2 * 12], &s64_lo[2 * 13], &s64_hi[2 * 13]); + // s14 = cospi_29_64 * x14 + cospi_3_64 * x15; + // s15 = cospi_3_64 * x14 - cospi_29_64 * x15; + butterfly_two_coeff_s32_s64_noround( + x_lo[14], x_hi[14], x_lo[15], x_hi[15], cospi_29_64, cospi_3_64, + &s64_lo[2 * 14], &s64_hi[2 * 14], &s64_lo[2 * 15], &s64_hi[2 * 15]); + + // fdct_round_shift, indices are doubled + t_lo[0] = add_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 8]); + t_hi[0] = add_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 8]); + t_lo[1] = add_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 9]); + t_hi[1] = add_s64_round_narrow(&s64_hi[2 * 1], 
&s64_hi[2 * 9]); + t_lo[2] = add_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 10]); + t_hi[2] = add_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 10]); + t_lo[3] = add_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 11]); + t_hi[3] = add_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 11]); + t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 12]); + t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 12]); + t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 13]); + t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 13]); + t_lo[6] = add_s64_round_narrow(&s64_lo[2 * 6], &s64_lo[2 * 14]); + t_hi[6] = add_s64_round_narrow(&s64_hi[2 * 6], &s64_hi[2 * 14]); + t_lo[7] = add_s64_round_narrow(&s64_lo[2 * 7], &s64_lo[2 * 15]); + t_hi[7] = add_s64_round_narrow(&s64_hi[2 * 7], &s64_hi[2 * 15]); + t_lo[8] = sub_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 8]); + t_hi[8] = sub_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 8]); + t_lo[9] = sub_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 9]); + t_hi[9] = sub_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 9]); + t_lo[10] = sub_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 10]); + t_hi[10] = sub_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 10]); + t_lo[11] = sub_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 11]); + t_hi[11] = sub_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 11]); + t_lo[12] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 12]); + t_hi[12] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 12]); + t_lo[13] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 13]); + t_hi[13] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 13]); + t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 6], &s64_lo[2 * 14]); + t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 6], &s64_hi[2 * 14]); + t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 7], &s64_lo[2 * 15]); + t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 7], &s64_hi[2 * 15]); + + // stage 2 + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + s_lo[4] = t_lo[4]; + s_hi[4] = t_hi[4]; + s_lo[5] = t_lo[5]; + s_hi[5] = t_hi[5]; + s_lo[6] = t_lo[6]; + s_hi[6] = t_hi[6]; + s_lo[7] = t_lo[7]; + s_hi[7] = t_hi[7]; + // s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + // s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[8], t_hi[8], t_lo[9], t_hi[9], cospi_4_64, cospi_28_64, + &s64_lo[2 * 8], &s64_hi[2 * 8], &s64_lo[2 * 9], &s64_hi[2 * 9]); + // s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + // s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[10], t_hi[10], t_lo[11], t_hi[11], cospi_20_64, cospi_12_64, + &s64_lo[2 * 10], &s64_hi[2 * 10], &s64_lo[2 * 11], &s64_hi[2 * 11]); + // s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + // s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[13], t_hi[13], t_lo[12], t_hi[12], cospi_28_64, cospi_4_64, + &s64_lo[2 * 13], &s64_hi[2 * 13], &s64_lo[2 * 12], &s64_hi[2 * 12]); + // s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + // s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[15], t_hi[15], t_lo[14], t_hi[14], cospi_12_64, cospi_20_64, + &s64_lo[2 * 15], &s64_hi[2 * 15], &s64_lo[2 * 14], &s64_hi[2 * 14]); + + // s0 + s4 + t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[4]); + t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[4]); + // s1 + s5 + t_lo[1] = add_s32_s64_narrow(s_lo[1], s_lo[5]); + t_hi[1] = 
add_s32_s64_narrow(s_hi[1], s_hi[5]); + // s2 + s6 + t_lo[2] = add_s32_s64_narrow(s_lo[2], s_lo[6]); + t_hi[2] = add_s32_s64_narrow(s_hi[2], s_hi[6]); + // s3 + s7 + t_lo[3] = add_s32_s64_narrow(s_lo[3], s_lo[7]); + t_hi[3] = add_s32_s64_narrow(s_hi[3], s_hi[7]); + + // s0 - s4 + t_lo[4] = sub_s32_s64_narrow(s_lo[0], s_lo[4]); + t_hi[4] = sub_s32_s64_narrow(s_hi[0], s_hi[4]); + // s1 - s5 + t_lo[5] = sub_s32_s64_narrow(s_lo[1], s_lo[5]); + t_hi[5] = sub_s32_s64_narrow(s_hi[1], s_hi[5]); + // s2 - s6 + t_lo[6] = sub_s32_s64_narrow(s_lo[2], s_lo[6]); + t_hi[6] = sub_s32_s64_narrow(s_hi[2], s_hi[6]); + // s3 - s7 + t_lo[7] = sub_s32_s64_narrow(s_lo[3], s_lo[7]); + t_hi[7] = sub_s32_s64_narrow(s_hi[3], s_hi[7]); + + // fdct_round_shift() + // s8 + s12 + t_lo[8] = add_s64_round_narrow(&s64_lo[2 * 8], &s64_lo[2 * 12]); + t_hi[8] = add_s64_round_narrow(&s64_hi[2 * 8], &s64_hi[2 * 12]); + // s9 + s13 + t_lo[9] = add_s64_round_narrow(&s64_lo[2 * 9], &s64_lo[2 * 13]); + t_hi[9] = add_s64_round_narrow(&s64_hi[2 * 9], &s64_hi[2 * 13]); + // s10 + s14 + t_lo[10] = add_s64_round_narrow(&s64_lo[2 * 10], &s64_lo[2 * 14]); + t_hi[10] = add_s64_round_narrow(&s64_hi[2 * 10], &s64_hi[2 * 14]); + // s11 + s15 + t_lo[11] = add_s64_round_narrow(&s64_lo[2 * 11], &s64_lo[2 * 15]); + t_hi[11] = add_s64_round_narrow(&s64_hi[2 * 11], &s64_hi[2 * 15]); + + // s8 - s12 + t_lo[12] = sub_s64_round_narrow(&s64_lo[2 * 8], &s64_lo[2 * 12]); + t_hi[12] = sub_s64_round_narrow(&s64_hi[2 * 8], &s64_hi[2 * 12]); + // s9 - s13 + t_lo[13] = sub_s64_round_narrow(&s64_lo[2 * 9], &s64_lo[2 * 13]); + t_hi[13] = sub_s64_round_narrow(&s64_hi[2 * 9], &s64_hi[2 * 13]); + // s10 - s14 + t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 10], &s64_lo[2 * 14]); + t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 10], &s64_hi[2 * 14]); + // s11 - s15 + t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 11], &s64_lo[2 * 15]); + t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 11], &s64_hi[2 * 15]); + + // stage 3 + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + // s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + // s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[4], t_hi[4], t_lo[5], t_hi[5], cospi_8_64, cospi_24_64, + &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]); + // s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + // s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[7], t_hi[7], t_lo[6], t_hi[6], cospi_24_64, cospi_8_64, + &s64_lo[2 * 7], &s64_hi[2 * 7], &s64_lo[2 * 6], &s64_hi[2 * 6]); + s_lo[8] = t_lo[8]; + s_hi[8] = t_hi[8]; + s_lo[9] = t_lo[9]; + s_hi[9] = t_hi[9]; + s_lo[10] = t_lo[10]; + s_hi[10] = t_hi[10]; + s_lo[11] = t_lo[11]; + s_hi[11] = t_hi[11]; + // s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + // s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[12], t_hi[12], t_lo[13], t_hi[13], cospi_8_64, cospi_24_64, + &s64_lo[2 * 12], &s64_hi[2 * 12], &s64_lo[2 * 13], &s64_hi[2 * 13]); + // s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + // s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[15], t_hi[15], t_lo[14], t_hi[14], cospi_24_64, cospi_8_64, + &s64_lo[2 * 15], &s64_hi[2 * 15], &s64_lo[2 * 14], &s64_hi[2 * 14]); + + // s0 + s2 + t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[2]); + t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[2]); + // s1 + s3 + t_lo[1] = add_s32_s64_narrow(s_lo[1], 
s_lo[3]); + t_hi[1] = add_s32_s64_narrow(s_hi[1], s_hi[3]); + // s0 - s2 + t_lo[2] = sub_s32_s64_narrow(s_lo[0], s_lo[2]); + t_hi[2] = sub_s32_s64_narrow(s_hi[0], s_hi[2]); + // s1 - s3 + t_lo[3] = sub_s32_s64_narrow(s_lo[1], s_lo[3]); + t_hi[3] = sub_s32_s64_narrow(s_hi[1], s_hi[3]); + // fdct_round_shift() + // s4 + s6 + t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]); + t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]); + // s5 + s7 + t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]); + t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]); + // s4 - s6 + t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]); + t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]); + // s5 - s7 + t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]); + t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]); + // s8 + s10 + t_lo[8] = add_s32_s64_narrow(s_lo[8], s_lo[10]); + t_hi[8] = add_s32_s64_narrow(s_hi[8], s_hi[10]); + // s9 + s11 + t_lo[9] = add_s32_s64_narrow(s_lo[9], s_lo[11]); + t_hi[9] = add_s32_s64_narrow(s_hi[9], s_hi[11]); + // s8 - s10 + t_lo[10] = sub_s32_s64_narrow(s_lo[8], s_lo[10]); + t_hi[10] = sub_s32_s64_narrow(s_hi[8], s_hi[10]); + // s9 - s11 + t_lo[11] = sub_s32_s64_narrow(s_lo[9], s_lo[11]); + t_hi[11] = sub_s32_s64_narrow(s_hi[9], s_hi[11]); + // fdct_round_shift() + // s12 + s14 + t_lo[12] = add_s64_round_narrow(&s64_lo[2 * 12], &s64_lo[2 * 14]); + t_hi[12] = add_s64_round_narrow(&s64_hi[2 * 12], &s64_hi[2 * 14]); + // s13 + s15 + t_lo[13] = add_s64_round_narrow(&s64_lo[2 * 13], &s64_lo[2 * 15]); + t_hi[13] = add_s64_round_narrow(&s64_hi[2 * 13], &s64_hi[2 * 15]); + // s12 - s14 + t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 12], &s64_lo[2 * 14]); + t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 12], &s64_hi[2 * 14]); + // s13 - s15 + t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 13], &s64_lo[2 * 15]); + t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 13], &s64_hi[2 * 15]); + + // stage 4, with fdct_round_shift + // s2 = (-cospi_16_64) * (x2 + x3); + // s3 = cospi_16_64 * (x2 - x3); + butterfly_one_coeff_s32_s64_narrow(t_lo[3], t_hi[3], t_lo[2], t_hi[2], + -cospi_16_64, &x_lo[2], &x_hi[2], &x_lo[3], + &x_hi[3]); + // s6 = cospi_16_64 * (x6 + x7); + // s7 = cospi_16_64 * (-x6 + x7); + butterfly_one_coeff_s32_s64_narrow(t_lo[7], t_hi[7], t_lo[6], t_hi[6], + cospi_16_64, &x_lo[6], &x_hi[6], &x_lo[7], + &x_hi[7]); + // s10 = cospi_16_64 * (x10 + x11); + // s11 = cospi_16_64 * (-x10 + x11); + butterfly_one_coeff_s32_s64_narrow(t_lo[11], t_hi[11], t_lo[10], t_hi[10], + cospi_16_64, &x_lo[10], &x_hi[10], + &x_lo[11], &x_hi[11]); + // s14 = (-cospi_16_64) * (x14 + x15); + // s15 = cospi_16_64 * (x14 - x15); + butterfly_one_coeff_s32_s64_narrow(t_lo[15], t_hi[15], t_lo[14], t_hi[14], + -cospi_16_64, &x_lo[14], &x_hi[14], + &x_lo[15], &x_hi[15]); + + // Just copy x0, x1, x4, x5, x8, x9, x12, x13 + x_lo[0] = t_lo[0]; + x_hi[0] = t_hi[0]; + x_lo[1] = t_lo[1]; + x_hi[1] = t_hi[1]; + x_lo[4] = t_lo[4]; + x_hi[4] = t_hi[4]; + x_lo[5] = t_lo[5]; + x_hi[5] = t_hi[5]; + x_lo[8] = t_lo[8]; + x_hi[8] = t_hi[8]; + x_lo[9] = t_lo[9]; + x_hi[9] = t_hi[9]; + x_lo[12] = t_lo[12]; + x_hi[12] = t_hi[12]; + x_lo[13] = t_lo[13]; + x_hi[13] = t_hi[13]; + + left[0] = x_lo[0]; + right[0] = x_hi[0]; + left[1] = vnegq_s32(x_lo[8]); + right[1] = vnegq_s32(x_hi[8]); + left[2] = x_lo[12]; + right[2] = x_hi[12]; + left[3] = vnegq_s32(x_lo[4]); + right[3] = vnegq_s32(x_hi[4]); + left[4] = x_lo[6]; + right[4] = x_hi[6]; + left[5] = x_lo[14]; + 
right[5] = x_hi[14]; + left[6] = x_lo[10]; + right[6] = x_hi[10]; + left[7] = x_lo[2]; + right[7] = x_hi[2]; + left[8] = x_lo[3]; + right[8] = x_hi[3]; + left[9] = x_lo[11]; + right[9] = x_hi[11]; + left[10] = x_lo[15]; + right[10] = x_hi[15]; + left[11] = x_lo[7]; + right[11] = x_hi[7]; + left[12] = x_lo[5]; + right[12] = x_hi[5]; + left[13] = vnegq_s32(x_lo[13]); + right[13] = vnegq_s32(x_hi[13]); + left[14] = x_lo[9]; + right[14] = x_hi[9]; + left[15] = vnegq_s32(x_lo[1]); + right[15] = vnegq_s32(x_hi[1]); +} + +static void highbd_fdct16x16_neon(int32x4_t *left1, int32x4_t *right1, + int32x4_t *left2, int32x4_t *right2) { + // Left half. + highbd_fdct16_8col(left1, right1); + // Right half. + highbd_fdct16_8col(left2, right2); + transpose_s32_16x16(left1, right1, left2, right2); +} + +static void highbd_fadst16x16_neon(int32x4_t *left1, int32x4_t *right1, + int32x4_t *left2, int32x4_t *right2) { + // Left half. + highbd_fadst16_8col(left1, right1); + // Right half. + highbd_fadst16_8col(left2, right2); + transpose_s32_16x16(left1, right1, left2, right2); +} + +void vp9_highbd_fht16x16_neon(const int16_t *input, tran_low_t *output, + int stride, int tx_type) { + int32x4_t left1[16], right1[16], left2[16], right2[16]; + + switch (tx_type) { + case DCT_DCT: vpx_highbd_fdct16x16_neon(input, output, stride); break; + case ADST_DCT: + highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride); + highbd_fadst16x16_neon(left1, right1, left2, right2); + highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16); + highbd_right_shift_16x16(left1, right1, left2, right2, 2); + highbd_fdct16x16_neon(left1, right1, left2, right2); + highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16); + break; + case DCT_ADST: + highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride); + highbd_fdct16x16_neon(left1, right1, left2, right2); + highbd_right_shift_16x16(left1, right1, left2, right2, 2); + highbd_fadst16x16_neon(left1, right1, left2, right2); + highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16); + break; + default: + assert(tx_type == ADST_ADST); + highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride); + highbd_fadst16x16_neon(left1, right1, left2, right2); + highbd_right_shift_16x16(left1, right1, left2, right2, 2); + highbd_fadst16x16_neon(left1, right1, left2, right2); + highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16); + break; + } +} + #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/arm/fdct16x16_neon.c b/vpx_dsp/arm/fdct16x16_neon.c index a458ecaa41..8a8aaa1ed4 100644 --- a/vpx_dsp/arm/fdct16x16_neon.c +++ b/vpx_dsp/arm/fdct16x16_neon.c @@ -28,6 +28,124 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { #else +// Main body of fdct16x16. 
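+//
+// The butterfly_* helpers used below are defined in fdct_neon.h. As a rough
+// per-lane scalar model (shorthand only, not the actual vector code):
+//
+//   butterfly_one_coeff(a, b, c, &add, &sub):
+//     add = fdct_round_shift((a + b) * c);
+//     sub = fdct_round_shift((a - b) * c);
+//   butterfly_two_coeff(a, b, c1, c2, &add, &sub):
+//     add = fdct_round_shift(a * c1 + b * c2);
+//     sub = fdct_round_shift(a * c2 - b * c1);
+//
+// where fdct_round_shift(x) rounds and shifts by DCT_CONST_BITS, applied to
+// all eight lanes of the int16x8_t inputs at once.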
+static void vpx_fdct8x16_body(const int16x8_t *in /*[16]*/, + int16x8_t *out /*[16]*/) { + int16x8_t s[8]; + int16x8_t x[4]; + int16x8_t step[8]; + + // stage 1 + // From fwd_txfm.c: Work on the first eight values; fdct8(input, + // even_results);" + s[0] = vaddq_s16(in[0], in[7]); + s[1] = vaddq_s16(in[1], in[6]); + s[2] = vaddq_s16(in[2], in[5]); + s[3] = vaddq_s16(in[3], in[4]); + s[4] = vsubq_s16(in[3], in[4]); + s[5] = vsubq_s16(in[2], in[5]); + s[6] = vsubq_s16(in[1], in[6]); + s[7] = vsubq_s16(in[0], in[7]); + + // fdct4(step, step); + x[0] = vaddq_s16(s[0], s[3]); + x[1] = vaddq_s16(s[1], s[2]); + x[2] = vsubq_s16(s[1], s[2]); + x[3] = vsubq_s16(s[0], s[3]); + + // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) + // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0], + &out[8]); + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); + butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[4], &out[12]); + + // Stage 2 + // Re-using source s5/s6 + // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) + // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) + butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &s[6], &s[5]); + + // Stage 3 + x[0] = vaddq_s16(s[4], s[5]); + x[1] = vsubq_s16(s[4], s[5]); + x[2] = vsubq_s16(s[7], s[6]); + x[3] = vaddq_s16(s[7], s[6]); + + // Stage 4 + // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) + butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[2], &out[14]); + // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) + butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[10], &out[6]); + + // step 2 + // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" + // That file distinguished between "in_high" and "step1" but the only + // difference is that "in_high" is the first 8 values and "step 1" is the + // second. Here, since they are all in one array, "step1" values are += 8. 
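+  // For example, the scalar code's step1[2] corresponds to in[10] and
+  // step1[5] to in[13] in the butterflies below.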
+ + // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) + // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) + // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) + // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) + butterfly_one_coeff_s16_fast(in[13], in[10], cospi_16_64, &s[5], &s[2]); + butterfly_one_coeff_s16_fast(in[12], in[11], cospi_16_64, &s[4], &s[3]); + + // step 3 + s[0] = vaddq_s16(in[8], s[3]); + s[1] = vaddq_s16(in[9], s[2]); + x[0] = vsubq_s16(in[9], s[2]); + x[1] = vsubq_s16(in[8], s[3]); + x[2] = vsubq_s16(in[15], s[4]); + x[3] = vsubq_s16(in[14], s[5]); + s[6] = vaddq_s16(in[14], s[5]); + s[7] = vaddq_s16(in[15], s[4]); + + // step 4 + // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * + // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] + // * cospi_8_64) + butterfly_two_coeff(s[6], s[1], cospi_8_64, cospi_24_64, &s[6], &s[1]); + + // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * + // cospi_24_64) + butterfly_two_coeff(x[0], x[3], cospi_24_64, cospi_8_64, &s[2], &s[5]); + + // step 5 + step[0] = vaddq_s16(s[0], s[1]); + step[1] = vsubq_s16(s[0], s[1]); + step[2] = vaddq_s16(x[1], s[2]); + step[3] = vsubq_s16(x[1], s[2]); + step[4] = vsubq_s16(x[2], s[5]); + step[5] = vaddq_s16(x[2], s[5]); + step[6] = vsubq_s16(s[7], s[6]); + step[7] = vaddq_s16(s[7], s[6]); + + // step 6 + // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64) + // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) + butterfly_two_coeff(step[6], step[1], cospi_18_64, cospi_14_64, &out[9], + &out[7]); + // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) + // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) + butterfly_two_coeff(step[7], step[0], cospi_2_64, cospi_30_64, &out[1], + &out[15]); + + // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) + // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64) + butterfly_two_coeff(step[4], step[3], cospi_26_64, cospi_6_64, &out[13], + &out[3]); + + // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) + // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) + butterfly_two_coeff(step[5], step[2], cospi_10_64, cospi_22_64, &out[5], + &out[11]); +} + void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { int16x8_t temp0[16]; int16x8_t temp1[16]; @@ -79,6 +197,194 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { #if CONFIG_VP9_HIGHBITDEPTH +// Main body of fdct8x16 column +static void vpx_highbd_fdct8x16_body(int32x4_t *left /*[16]*/, + int32x4_t *right /* [16] */) { + int32x4_t sl[8]; + int32x4_t sr[8]; + int32x4_t xl[4]; + int32x4_t xr[4]; + int32x4_t inl[8]; + int32x4_t inr[8]; + int32x4_t stepl[8]; + int32x4_t stepr[8]; + + // stage 1 + // From fwd_txfm.c: Work on the first eight values; fdct8(input, + // even_results);" + sl[0] = vaddq_s32(left[0], left[7]); + sr[0] = vaddq_s32(right[0], right[7]); + sl[1] = vaddq_s32(left[1], left[6]); + sr[1] = vaddq_s32(right[1], right[6]); + sl[2] = vaddq_s32(left[2], left[5]); + sr[2] = vaddq_s32(right[2], right[5]); + sl[3] = vaddq_s32(left[3], left[4]); + sr[3] = vaddq_s32(right[3], right[4]); + sl[4] = vsubq_s32(left[3], left[4]); + sr[4] = vsubq_s32(right[3], 
right[4]); + sl[5] = vsubq_s32(left[2], left[5]); + sr[5] = vsubq_s32(right[2], right[5]); + sl[6] = vsubq_s32(left[1], left[6]); + sr[6] = vsubq_s32(right[1], right[6]); + sl[7] = vsubq_s32(left[0], left[7]); + sr[7] = vsubq_s32(right[0], right[7]); + + // Copy values 8-15 as we're storing in-place + inl[0] = left[8]; + inr[0] = right[8]; + inl[1] = left[9]; + inr[1] = right[9]; + inl[2] = left[10]; + inr[2] = right[10]; + inl[3] = left[11]; + inr[3] = right[11]; + inl[4] = left[12]; + inr[4] = right[12]; + inl[5] = left[13]; + inr[5] = right[13]; + inl[6] = left[14]; + inr[6] = right[14]; + inl[7] = left[15]; + inr[7] = right[15]; + + // fdct4(step, step); + xl[0] = vaddq_s32(sl[0], sl[3]); + xr[0] = vaddq_s32(sr[0], sr[3]); + xl[1] = vaddq_s32(sl[1], sl[2]); + xr[1] = vaddq_s32(sr[1], sr[2]); + xl[2] = vsubq_s32(sl[1], sl[2]); + xr[2] = vsubq_s32(sr[1], sr[2]); + xl[3] = vsubq_s32(sl[0], sl[3]); + xr[3] = vsubq_s32(sr[0], sr[3]); + + // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) + // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64, + &left[0], &right[0], &left[8], &right[8]); + + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64, + cospi_24_64, &left[4], &right[4], + &left[12], &right[12]); + + // Stage 2 + // Re-using source s5/s6 + // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) + // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) + butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &sl[6], + &sr[6], &sl[5], &sr[5]); + + // Stage 3 + xl[0] = vaddq_s32(sl[4], sl[5]); + xr[0] = vaddq_s32(sr[4], sr[5]); + xl[1] = vsubq_s32(sl[4], sl[5]); + xr[1] = vsubq_s32(sr[4], sr[5]); + xl[2] = vsubq_s32(sl[7], sl[6]); + xr[2] = vsubq_s32(sr[7], sr[6]); + xl[3] = vaddq_s32(sl[7], sl[6]); + xr[3] = vaddq_s32(sr[7], sr[6]); + + // Stage 4 + // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64, + cospi_28_64, &left[2], &right[2], + &left[14], &right[14]); + // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) + butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64, + cospi_12_64, &left[10], &right[10], + &left[6], &right[6]); + + // step 2 + // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" + // That file distinguished between "in_high" and "step1" but the only + // difference is that "in_high" is the first 8 values and "step 1" is the + // second. Here, since they are all in one array, "step1" values are += 8. 
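+  // The single-coefficient butterflies below stay in 32 bits
+  // (butterfly_one_coeff_s32_fast, built on vqrdmulhq_s32), while the
+  // two-coefficient ones widen to 64 bits and narrow back
+  // (butterfly_two_coeff_s32_s64_narrow), since a 32-bit intermediate
+  // multiplied by a 14-bit cospi constant may not fit in 32 bits for
+  // high-bitdepth input.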
+ + // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) + // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) + // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) + // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) + butterfly_one_coeff_s32_fast(inl[5], inr[5], inl[2], inr[2], cospi_16_64, + &sl[5], &sr[5], &sl[2], &sr[2]); + butterfly_one_coeff_s32_fast(inl[4], inr[4], inl[3], inr[3], cospi_16_64, + &sl[4], &sr[4], &sl[3], &sr[3]); + + // step 3 + sl[0] = vaddq_s32(inl[0], sl[3]); + sr[0] = vaddq_s32(inr[0], sr[3]); + sl[1] = vaddq_s32(inl[1], sl[2]); + sr[1] = vaddq_s32(inr[1], sr[2]); + xl[0] = vsubq_s32(inl[1], sl[2]); + xr[0] = vsubq_s32(inr[1], sr[2]); + xl[1] = vsubq_s32(inl[0], sl[3]); + xr[1] = vsubq_s32(inr[0], sr[3]); + xl[2] = vsubq_s32(inl[7], sl[4]); + xr[2] = vsubq_s32(inr[7], sr[4]); + xl[3] = vsubq_s32(inl[6], sl[5]); + xr[3] = vsubq_s32(inr[6], sr[5]); + sl[6] = vaddq_s32(inl[6], sl[5]); + sr[6] = vaddq_s32(inr[6], sr[5]); + sl[7] = vaddq_s32(inl[7], sl[4]); + sr[7] = vaddq_s32(inr[7], sr[4]); + + // step 4 + // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * + // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] + // * cospi_8_64) + butterfly_two_coeff_s32_s64_narrow(sl[6], sr[6], sl[1], sr[1], cospi_8_64, + cospi_24_64, &sl[6], &sr[6], &sl[1], + &sr[1]); + // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * + // cospi_24_64) + butterfly_two_coeff_s32_s64_narrow(xl[0], xr[0], xl[3], xr[3], cospi_24_64, + cospi_8_64, &sl[2], &sr[2], &sl[5], + &sr[5]); + + // step 5 + stepl[0] = vaddq_s32(sl[0], sl[1]); + stepr[0] = vaddq_s32(sr[0], sr[1]); + stepl[1] = vsubq_s32(sl[0], sl[1]); + stepr[1] = vsubq_s32(sr[0], sr[1]); + stepl[2] = vaddq_s32(xl[1], sl[2]); + stepr[2] = vaddq_s32(xr[1], sr[2]); + stepl[3] = vsubq_s32(xl[1], sl[2]); + stepr[3] = vsubq_s32(xr[1], sr[2]); + stepl[4] = vsubq_s32(xl[2], sl[5]); + stepr[4] = vsubq_s32(xr[2], sr[5]); + stepl[5] = vaddq_s32(xl[2], sl[5]); + stepr[5] = vaddq_s32(xr[2], sr[5]); + stepl[6] = vsubq_s32(sl[7], sl[6]); + stepr[6] = vsubq_s32(sr[7], sr[6]); + stepl[7] = vaddq_s32(sl[7], sl[6]); + stepr[7] = vaddq_s32(sr[7], sr[6]); + + // step 6 + // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64) + // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) + butterfly_two_coeff_s32_s64_narrow(stepl[6], stepr[6], stepl[1], stepr[1], + cospi_18_64, cospi_14_64, &left[9], + &right[9], &left[7], &right[7]); + // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) + // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) + butterfly_two_coeff_s32_s64_narrow(stepl[7], stepr[7], stepl[0], stepr[0], + cospi_2_64, cospi_30_64, &left[1], + &right[1], &left[15], &right[15]); + // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) + // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64) + butterfly_two_coeff_s32_s64_narrow(stepl[4], stepr[4], stepl[3], stepr[3], + cospi_26_64, cospi_6_64, &left[13], + &right[13], &left[3], &right[3]); + // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) + // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) + butterfly_two_coeff_s32_s64_narrow(stepl[5], stepr[5], stepl[2], stepr[2], + cospi_10_64, cospi_22_64, &left[5], + &right[5], 
&left[11], &right[11]); +} + void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { int16x8_t temp0[16]; diff --git a/vpx_dsp/arm/fdct16x16_neon.h b/vpx_dsp/arm/fdct16x16_neon.h index 43d820b6bd..cd58675ca4 100644 --- a/vpx_dsp/arm/fdct16x16_neon.h +++ b/vpx_dsp/arm/fdct16x16_neon.h @@ -159,124 +159,6 @@ static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) { a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2); } -// Main body of fdct16x16. -static void vpx_fdct8x16_body(const int16x8_t *in /*[16]*/, - int16x8_t *out /*[16]*/) { - int16x8_t s[8]; - int16x8_t x[4]; - int16x8_t step[8]; - - // stage 1 - // From fwd_txfm.c: Work on the first eight values; fdct8(input, - // even_results);" - s[0] = vaddq_s16(in[0], in[7]); - s[1] = vaddq_s16(in[1], in[6]); - s[2] = vaddq_s16(in[2], in[5]); - s[3] = vaddq_s16(in[3], in[4]); - s[4] = vsubq_s16(in[3], in[4]); - s[5] = vsubq_s16(in[2], in[5]); - s[6] = vsubq_s16(in[1], in[6]); - s[7] = vsubq_s16(in[0], in[7]); - - // fdct4(step, step); - x[0] = vaddq_s16(s[0], s[3]); - x[1] = vaddq_s16(s[1], s[2]); - x[2] = vsubq_s16(s[1], s[2]); - x[3] = vsubq_s16(s[0], s[3]); - - // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) - // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) - butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0], - &out[8]); - // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); - // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); - butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[4], &out[12]); - - // Stage 2 - // Re-using source s5/s6 - // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) - // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) - butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &s[6], &s[5]); - - // Stage 3 - x[0] = vaddq_s16(s[4], s[5]); - x[1] = vsubq_s16(s[4], s[5]); - x[2] = vsubq_s16(s[7], s[6]); - x[3] = vaddq_s16(s[7], s[6]); - - // Stage 4 - // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) - // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) - butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[2], &out[14]); - // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) - // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) - butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[10], &out[6]); - - // step 2 - // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" - // That file distinguished between "in_high" and "step1" but the only - // difference is that "in_high" is the first 8 values and "step 1" is the - // second. Here, since they are all in one array, "step1" values are += 8. 
- - // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) - // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) - // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) - // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) - butterfly_one_coeff_s16_fast(in[13], in[10], cospi_16_64, &s[5], &s[2]); - butterfly_one_coeff_s16_fast(in[12], in[11], cospi_16_64, &s[4], &s[3]); - - // step 3 - s[0] = vaddq_s16(in[8], s[3]); - s[1] = vaddq_s16(in[9], s[2]); - x[0] = vsubq_s16(in[9], s[2]); - x[1] = vsubq_s16(in[8], s[3]); - x[2] = vsubq_s16(in[15], s[4]); - x[3] = vsubq_s16(in[14], s[5]); - s[6] = vaddq_s16(in[14], s[5]); - s[7] = vaddq_s16(in[15], s[4]); - - // step 4 - // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * - // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] - // * cospi_8_64) - butterfly_two_coeff(s[6], s[1], cospi_8_64, cospi_24_64, &s[6], &s[1]); - - // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) - // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * - // cospi_24_64) - butterfly_two_coeff(x[0], x[3], cospi_24_64, cospi_8_64, &s[2], &s[5]); - - // step 5 - step[0] = vaddq_s16(s[0], s[1]); - step[1] = vsubq_s16(s[0], s[1]); - step[2] = vaddq_s16(x[1], s[2]); - step[3] = vsubq_s16(x[1], s[2]); - step[4] = vsubq_s16(x[2], s[5]); - step[5] = vaddq_s16(x[2], s[5]); - step[6] = vsubq_s16(s[7], s[6]); - step[7] = vaddq_s16(s[7], s[6]); - - // step 6 - // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64) - // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) - butterfly_two_coeff(step[6], step[1], cospi_18_64, cospi_14_64, &out[9], - &out[7]); - // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) - // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) - butterfly_two_coeff(step[7], step[0], cospi_2_64, cospi_30_64, &out[1], - &out[15]); - - // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) - // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64) - butterfly_two_coeff(step[4], step[3], cospi_26_64, cospi_6_64, &out[13], - &out[3]); - - // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) - // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) - butterfly_two_coeff(step[5], step[2], cospi_10_64, cospi_22_64, &out[5], - &out[11]); -} - #if CONFIG_VP9_HIGHBITDEPTH static INLINE void highbd_scale_input(const int16x8_t *a /*[16]*/, @@ -431,194 +313,6 @@ static INLINE void store16_s32(tran_low_t *a, const int32x4_t *b /*[32]*/) { vst1q_s32(a, b[15]); } -// Main body of fdct8x16 column -static void vpx_highbd_fdct8x16_body(int32x4_t *left /*[16]*/, - int32x4_t *right /* [16] */) { - int32x4_t sl[8]; - int32x4_t sr[8]; - int32x4_t xl[4]; - int32x4_t xr[4]; - int32x4_t inl[8]; - int32x4_t inr[8]; - int32x4_t stepl[8]; - int32x4_t stepr[8]; - - // stage 1 - // From fwd_txfm.c: Work on the first eight values; fdct8(input, - // even_results);" - sl[0] = vaddq_s32(left[0], left[7]); - sr[0] = vaddq_s32(right[0], right[7]); - sl[1] = vaddq_s32(left[1], left[6]); - sr[1] = vaddq_s32(right[1], right[6]); - sl[2] = vaddq_s32(left[2], left[5]); - sr[2] = vaddq_s32(right[2], right[5]); - sl[3] = vaddq_s32(left[3], left[4]); - sr[3] = vaddq_s32(right[3], right[4]); - sl[4] = vsubq_s32(left[3], left[4]); - sr[4] = vsubq_s32(right[3], right[4]); - sl[5] = 
vsubq_s32(left[2], left[5]); - sr[5] = vsubq_s32(right[2], right[5]); - sl[6] = vsubq_s32(left[1], left[6]); - sr[6] = vsubq_s32(right[1], right[6]); - sl[7] = vsubq_s32(left[0], left[7]); - sr[7] = vsubq_s32(right[0], right[7]); - - // Copy values 8-15 as we're storing in-place - inl[0] = left[8]; - inr[0] = right[8]; - inl[1] = left[9]; - inr[1] = right[9]; - inl[2] = left[10]; - inr[2] = right[10]; - inl[3] = left[11]; - inr[3] = right[11]; - inl[4] = left[12]; - inr[4] = right[12]; - inl[5] = left[13]; - inr[5] = right[13]; - inl[6] = left[14]; - inr[6] = right[14]; - inl[7] = left[15]; - inr[7] = right[15]; - - // fdct4(step, step); - xl[0] = vaddq_s32(sl[0], sl[3]); - xr[0] = vaddq_s32(sr[0], sr[3]); - xl[1] = vaddq_s32(sl[1], sl[2]); - xr[1] = vaddq_s32(sr[1], sr[2]); - xl[2] = vsubq_s32(sl[1], sl[2]); - xr[2] = vsubq_s32(sr[1], sr[2]); - xl[3] = vsubq_s32(sl[0], sl[3]); - xr[3] = vsubq_s32(sr[0], sr[3]); - - // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) - // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) - butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64, - &left[0], &right[0], &left[8], &right[8]); - - // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); - // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); - butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64, - cospi_24_64, &left[4], &right[4], - &left[12], &right[12]); - - // Stage 2 - // Re-using source s5/s6 - // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) - // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) - butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &sl[6], - &sr[6], &sl[5], &sr[5]); - - // Stage 3 - xl[0] = vaddq_s32(sl[4], sl[5]); - xr[0] = vaddq_s32(sr[4], sr[5]); - xl[1] = vsubq_s32(sl[4], sl[5]); - xr[1] = vsubq_s32(sr[4], sr[5]); - xl[2] = vsubq_s32(sl[7], sl[6]); - xr[2] = vsubq_s32(sr[7], sr[6]); - xl[3] = vaddq_s32(sl[7], sl[6]); - xr[3] = vaddq_s32(sr[7], sr[6]); - - // Stage 4 - // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) - // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) - butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64, - cospi_28_64, &left[2], &right[2], - &left[14], &right[14]); - // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) - // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) - butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64, - cospi_12_64, &left[10], &right[10], - &left[6], &right[6]); - - // step 2 - // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" - // That file distinguished between "in_high" and "step1" but the only - // difference is that "in_high" is the first 8 values and "step 1" is the - // second. Here, since they are all in one array, "step1" values are += 8. 
- - // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) - // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) - // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) - // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) - butterfly_one_coeff_s32_fast(inl[5], inr[5], inl[2], inr[2], cospi_16_64, - &sl[5], &sr[5], &sl[2], &sr[2]); - butterfly_one_coeff_s32_fast(inl[4], inr[4], inl[3], inr[3], cospi_16_64, - &sl[4], &sr[4], &sl[3], &sr[3]); - - // step 3 - sl[0] = vaddq_s32(inl[0], sl[3]); - sr[0] = vaddq_s32(inr[0], sr[3]); - sl[1] = vaddq_s32(inl[1], sl[2]); - sr[1] = vaddq_s32(inr[1], sr[2]); - xl[0] = vsubq_s32(inl[1], sl[2]); - xr[0] = vsubq_s32(inr[1], sr[2]); - xl[1] = vsubq_s32(inl[0], sl[3]); - xr[1] = vsubq_s32(inr[0], sr[3]); - xl[2] = vsubq_s32(inl[7], sl[4]); - xr[2] = vsubq_s32(inr[7], sr[4]); - xl[3] = vsubq_s32(inl[6], sl[5]); - xr[3] = vsubq_s32(inr[6], sr[5]); - sl[6] = vaddq_s32(inl[6], sl[5]); - sr[6] = vaddq_s32(inr[6], sr[5]); - sl[7] = vaddq_s32(inl[7], sl[4]); - sr[7] = vaddq_s32(inr[7], sr[4]); - - // step 4 - // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * - // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] - // * cospi_8_64) - butterfly_two_coeff_s32_s64_narrow(sl[6], sr[6], sl[1], sr[1], cospi_8_64, - cospi_24_64, &sl[6], &sr[6], &sl[1], - &sr[1]); - // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) - // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * - // cospi_24_64) - butterfly_two_coeff_s32_s64_narrow(xl[0], xr[0], xl[3], xr[3], cospi_24_64, - cospi_8_64, &sl[2], &sr[2], &sl[5], - &sr[5]); - - // step 5 - stepl[0] = vaddq_s32(sl[0], sl[1]); - stepr[0] = vaddq_s32(sr[0], sr[1]); - stepl[1] = vsubq_s32(sl[0], sl[1]); - stepr[1] = vsubq_s32(sr[0], sr[1]); - stepl[2] = vaddq_s32(xl[1], sl[2]); - stepr[2] = vaddq_s32(xr[1], sr[2]); - stepl[3] = vsubq_s32(xl[1], sl[2]); - stepr[3] = vsubq_s32(xr[1], sr[2]); - stepl[4] = vsubq_s32(xl[2], sl[5]); - stepr[4] = vsubq_s32(xr[2], sr[5]); - stepl[5] = vaddq_s32(xl[2], sl[5]); - stepr[5] = vaddq_s32(xr[2], sr[5]); - stepl[6] = vsubq_s32(sl[7], sl[6]); - stepr[6] = vsubq_s32(sr[7], sr[6]); - stepl[7] = vaddq_s32(sl[7], sl[6]); - stepr[7] = vaddq_s32(sr[7], sr[6]); - - // step 6 - // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64) - // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) - butterfly_two_coeff_s32_s64_narrow(stepl[6], stepr[6], stepl[1], stepr[1], - cospi_18_64, cospi_14_64, &left[9], - &right[9], &left[7], &right[7]); - // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) - // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) - butterfly_two_coeff_s32_s64_narrow(stepl[7], stepr[7], stepl[0], stepr[0], - cospi_2_64, cospi_30_64, &left[1], - &right[1], &left[15], &right[15]); - // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) - // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64) - butterfly_two_coeff_s32_s64_narrow(stepl[4], stepr[4], stepl[3], stepr[3], - cospi_26_64, cospi_6_64, &left[13], - &right[13], &left[3], &right[3]); - // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) - // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) - butterfly_two_coeff_s32_s64_narrow(stepl[5], stepr[5], stepl[2], stepr[2], - cospi_10_64, cospi_22_64, &left[5], - &right[5], 
&left[11], &right[11]); -} - #endif // CONFIG_VP9_HIGHBITDEPTH #endif // VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_ diff --git a/vpx_dsp/arm/fdct8x8_neon.h b/vpx_dsp/arm/fdct8x8_neon.h index d8fa600448..cc65157430 100644 --- a/vpx_dsp/arm/fdct8x8_neon.h +++ b/vpx_dsp/arm/fdct8x8_neon.h @@ -293,88 +293,14 @@ static INLINE void vpx_highbd_fdct8x8_pass2_notranspose_neon(int32x4_t *left, static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left, int32x4_t *right) { - int32x4x2_t out[8]; vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right); - - out[0].val[0] = left[0]; - out[0].val[1] = right[0]; - out[1].val[0] = left[1]; - out[1].val[1] = right[1]; - out[2].val[0] = left[2]; - out[2].val[1] = right[2]; - out[3].val[0] = left[3]; - out[3].val[1] = right[3]; - out[4].val[0] = left[4]; - out[4].val[1] = right[4]; - out[5].val[0] = left[5]; - out[5].val[1] = right[5]; - out[6].val[0] = left[6]; - out[6].val[1] = right[6]; - out[7].val[0] = left[7]; - out[7].val[1] = right[7]; - - transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], - &out[6], &out[7]); - - left[0] = out[0].val[0]; - right[0] = out[0].val[1]; - left[1] = out[1].val[0]; - right[1] = out[1].val[1]; - left[2] = out[2].val[0]; - right[2] = out[2].val[1]; - left[3] = out[3].val[0]; - right[3] = out[3].val[1]; - left[4] = out[4].val[0]; - right[4] = out[4].val[1]; - left[5] = out[5].val[0]; - right[5] = out[5].val[1]; - left[6] = out[6].val[0]; - right[6] = out[6].val[1]; - left[7] = out[7].val[0]; - right[7] = out[7].val[1]; + transpose_s32_8x8_2(left, right, left, right); } static INLINE void vpx_highbd_fdct8x8_pass2_neon(int32x4_t *left, int32x4_t *right) { - int32x4x2_t out[8]; vpx_highbd_fdct8x8_pass2_notranspose_neon(left, right); - - out[0].val[0] = left[0]; - out[0].val[1] = right[0]; - out[1].val[0] = left[1]; - out[1].val[1] = right[1]; - out[2].val[0] = left[2]; - out[2].val[1] = right[2]; - out[3].val[0] = left[3]; - out[3].val[1] = right[3]; - out[4].val[0] = left[4]; - out[4].val[1] = right[4]; - out[5].val[0] = left[5]; - out[5].val[1] = right[5]; - out[6].val[0] = left[6]; - out[6].val[1] = right[6]; - out[7].val[0] = left[7]; - out[7].val[1] = right[7]; - - transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], - &out[6], &out[7]); - - left[0] = out[0].val[0]; - right[0] = out[0].val[1]; - left[1] = out[1].val[0]; - right[1] = out[1].val[1]; - left[2] = out[2].val[0]; - right[2] = out[2].val[1]; - left[3] = out[3].val[0]; - right[3] = out[3].val[1]; - left[4] = out[4].val[0]; - right[4] = out[4].val[1]; - left[5] = out[5].val[0]; - right[5] = out[5].val[1]; - left[6] = out[6].val[0]; - right[6] = out[6].val[1]; - left[7] = out[7].val[0]; - right[7] = out[7].val[1]; + transpose_s32_8x8_2(left, right, left, right); } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h index 193594e3dc..16f5c5fc0e 100644 --- a/vpx_dsp/arm/fdct_neon.h +++ b/vpx_dsp/arm/fdct_neon.h @@ -177,6 +177,45 @@ static INLINE void butterfly_one_coeff_s32_fast( *sub_hi = vqrdmulhq_s32(vsubq_s32(a_hi, b_hi), c); } +// fdct_round_shift((a +/- b) * c) +// Variant that performs normal implementation on full vector +// more accurate does 64-bit processing, takes and returns 32-bit values +// returns narrowed results +static INLINE void butterfly_one_coeff_s32_s64_narrow( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo, + int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) { + // ac 
holds the following values: + // ac: vget_low_s32(a_lo) * c, vget_high_s32(a_lo) * c, + // vget_low_s32(a_hi) * c, vget_high_s32(a_hi) * c + int64x2_t ac[4]; + int64x2_t sum[4]; + int64x2_t diff[4]; + + ac[0] = vmull_n_s32(vget_low_s32(a_lo), constant); + ac[1] = vmull_n_s32(vget_high_s32(a_lo), constant); + ac[2] = vmull_n_s32(vget_low_s32(a_hi), constant); + ac[3] = vmull_n_s32(vget_high_s32(a_hi), constant); + + sum[0] = vmlal_n_s32(ac[0], vget_low_s32(b_lo), constant); + sum[1] = vmlal_n_s32(ac[1], vget_high_s32(b_lo), constant); + sum[2] = vmlal_n_s32(ac[2], vget_low_s32(b_hi), constant); + sum[3] = vmlal_n_s32(ac[3], vget_high_s32(b_hi), constant); + *add_lo = vcombine_s32(vrshrn_n_s64(sum[0], DCT_CONST_BITS), + vrshrn_n_s64(sum[1], DCT_CONST_BITS)); + *add_hi = vcombine_s32(vrshrn_n_s64(sum[2], DCT_CONST_BITS), + vrshrn_n_s64(sum[3], DCT_CONST_BITS)); + + diff[0] = vmlsl_n_s32(ac[0], vget_low_s32(b_lo), constant); + diff[1] = vmlsl_n_s32(ac[1], vget_high_s32(b_lo), constant); + diff[2] = vmlsl_n_s32(ac[2], vget_low_s32(b_hi), constant); + diff[3] = vmlsl_n_s32(ac[3], vget_high_s32(b_hi), constant); + *sub_lo = vcombine_s32(vrshrn_n_s64(diff[0], DCT_CONST_BITS), + vrshrn_n_s64(diff[1], DCT_CONST_BITS)); + *sub_hi = vcombine_s32(vrshrn_n_s64(diff[2], DCT_CONST_BITS), + vrshrn_n_s64(diff[3], DCT_CONST_BITS)); +} + // fdct_round_shift(a * c1 +/- b * c2) // Variant that performs normal implementation on half vector // more accurate does 64-bit processing, takes and returns 32-bit values @@ -205,6 +244,44 @@ static INLINE void butterfly_two_coeff_s32_s64_narrow_half( vrshrn_n_s64(diff_hi, DCT_CONST_BITS)); } +// fdct_round_shift(a * c1 +/- b * c2) +// Variant that performs normal implementation on full vector +// more accurate does 64-bit processing, takes and returns 64-bit values +// returns results without rounding +static INLINE void butterfly_two_coeff_s32_s64_noround( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant1, + const tran_coef_t constant2, int64x2_t *add_lo /*[2]*/, + int64x2_t *add_hi /*[2]*/, int64x2_t *sub_lo /*[2]*/, + int64x2_t *sub_hi /*[2]*/) { + // ac1/ac2 hold the following values: + // ac1: vget_low_s32(a_lo) * c1, vget_high_s32(a_lo) * c1, + // vget_low_s32(a_hi) * c1, vget_high_s32(a_hi) * c1 + // ac2: vget_low_s32(a_lo) * c2, vget_high_s32(a_lo) * c2, + // vget_low_s32(a_hi) * c2, vget_high_s32(a_hi) * c2 + int64x2_t ac1[4]; + int64x2_t ac2[4]; + + ac1[0] = vmull_n_s32(vget_low_s32(a_lo), constant1); + ac1[1] = vmull_n_s32(vget_high_s32(a_lo), constant1); + ac1[2] = vmull_n_s32(vget_low_s32(a_hi), constant1); + ac1[3] = vmull_n_s32(vget_high_s32(a_hi), constant1); + ac2[0] = vmull_n_s32(vget_low_s32(a_lo), constant2); + ac2[1] = vmull_n_s32(vget_high_s32(a_lo), constant2); + ac2[2] = vmull_n_s32(vget_low_s32(a_hi), constant2); + ac2[3] = vmull_n_s32(vget_high_s32(a_hi), constant2); + + add_lo[0] = vmlal_n_s32(ac1[0], vget_low_s32(b_lo), constant2); + add_lo[1] = vmlal_n_s32(ac1[1], vget_high_s32(b_lo), constant2); + add_hi[0] = vmlal_n_s32(ac1[2], vget_low_s32(b_hi), constant2); + add_hi[1] = vmlal_n_s32(ac1[3], vget_high_s32(b_hi), constant2); + + sub_lo[0] = vmlsl_n_s32(ac2[0], vget_low_s32(b_lo), constant1); + sub_lo[1] = vmlsl_n_s32(ac2[1], vget_high_s32(b_lo), constant1); + sub_hi[0] = vmlsl_n_s32(ac2[2], vget_low_s32(b_hi), constant1); + sub_hi[1] = vmlsl_n_s32(ac2[3], vget_high_s32(b_hi), constant1); +} + // fdct_round_shift(a * c1 +/- b * c2) // Variant that performs normal 
implementation on full vector // more accurate does 64-bit processing, takes and returns 32-bit values @@ -420,4 +497,46 @@ static INLINE int32x4_t sub_round_shift_s32(const int32x4_t a) { return vrshrq_n_s32(vsubq_s32(a, a_sign_s32), 2); } +static INLINE int32x4_t add_s64_round_narrow(const int64x2_t *a /*[2]*/, + const int64x2_t *b /*[2]*/) { + int64x2_t result[2]; + result[0] = vaddq_s64(a[0], b[0]); + result[1] = vaddq_s64(a[1], b[1]); + return vcombine_s32(vrshrn_n_s64(result[0], DCT_CONST_BITS), + vrshrn_n_s64(result[1], DCT_CONST_BITS)); +} + +static INLINE int32x4_t sub_s64_round_narrow(const int64x2_t *a /*[2]*/, + const int64x2_t *b /*[2]*/) { + int64x2_t result[2]; + result[0] = vsubq_s64(a[0], b[0]); + result[1] = vsubq_s64(a[1], b[1]); + return vcombine_s32(vrshrn_n_s64(result[0], DCT_CONST_BITS), + vrshrn_n_s64(result[1], DCT_CONST_BITS)); +} + +static INLINE int32x4_t add_s32_s64_narrow(const int32x4_t a, + const int32x4_t b) { + int64x2_t a64[2], b64[2], result[2]; + a64[0] = vmovl_s32(vget_low_s32(a)); + a64[1] = vmovl_s32(vget_high_s32(a)); + b64[0] = vmovl_s32(vget_low_s32(b)); + b64[1] = vmovl_s32(vget_high_s32(b)); + result[0] = vaddq_s64(a64[0], b64[0]); + result[1] = vaddq_s64(a64[1], b64[1]); + return vcombine_s32(vmovn_s64(result[0]), vmovn_s64(result[1])); +} + +static INLINE int32x4_t sub_s32_s64_narrow(const int32x4_t a, + const int32x4_t b) { + int64x2_t a64[2], b64[2], result[2]; + a64[0] = vmovl_s32(vget_low_s32(a)); + a64[1] = vmovl_s32(vget_high_s32(a)); + b64[0] = vmovl_s32(vget_low_s32(b)); + b64[1] = vmovl_s32(vget_high_s32(b)); + result[0] = vsubq_s64(a64[0], b64[0]); + result[1] = vsubq_s64(a64[1], b64[1]); + return vcombine_s32(vmovn_s64(result[0]), vmovn_s64(result[1])); +} + #endif // VPX_VPX_DSP_ARM_FDCT_NEON_H_ diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h index 41d44f2b1f..6c0bd08f77 100644 --- a/vpx_dsp/arm/transpose_neon.h +++ b/vpx_dsp/arm/transpose_neon.h @@ -866,6 +866,68 @@ static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/, out_right[7] = out[7].val[1]; } +static INLINE void transpose_s32_16x16(int32x4_t *left1, int32x4_t *right1, + int32x4_t *left2, int32x4_t *right2) { + int32x4_t tl[16], tr[16]; + + // transpose the 4 8x8 quadrants separately but first swap quadrants 2 and 3. 
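+  // In block-matrix terms this uses the identity
+  //   [ A B ]^T = [ A^T C^T ]
+  //   [ C D ]     [ B^T D^T ]
+  // i.e. swap the two off-diagonal 8x8 quadrants, then transpose each 8x8
+  // quadrant in place.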
+ tl[0] = left1[8]; + tl[1] = left1[9]; + tl[2] = left1[10]; + tl[3] = left1[11]; + tl[4] = left1[12]; + tl[5] = left1[13]; + tl[6] = left1[14]; + tl[7] = left1[15]; + tr[0] = right1[8]; + tr[1] = right1[9]; + tr[2] = right1[10]; + tr[3] = right1[11]; + tr[4] = right1[12]; + tr[5] = right1[13]; + tr[6] = right1[14]; + tr[7] = right1[15]; + + left1[8] = left2[0]; + left1[9] = left2[1]; + left1[10] = left2[2]; + left1[11] = left2[3]; + left1[12] = left2[4]; + left1[13] = left2[5]; + left1[14] = left2[6]; + left1[15] = left2[7]; + right1[8] = right2[0]; + right1[9] = right2[1]; + right1[10] = right2[2]; + right1[11] = right2[3]; + right1[12] = right2[4]; + right1[13] = right2[5]; + right1[14] = right2[6]; + right1[15] = right2[7]; + + left2[0] = tl[0]; + left2[1] = tl[1]; + left2[2] = tl[2]; + left2[3] = tl[3]; + left2[4] = tl[4]; + left2[5] = tl[5]; + left2[6] = tl[6]; + left2[7] = tl[7]; + right2[0] = tr[0]; + right2[1] = tr[1]; + right2[2] = tr[2]; + right2[3] = tr[3]; + right2[4] = tr[4]; + right2[5] = tr[5]; + right2[6] = tr[6]; + right2[7] = tr[7]; + + transpose_s32_8x8_2(left1, right1, left1, right1); + transpose_s32_8x8_2(left2, right2, left2, right2); + transpose_s32_8x8_2(left1 + 8, right1 + 8, left1 + 8, right1 + 8); + transpose_s32_8x8_2(left2 + 8, right2 + 8, left2 + 8, right2 + 8); +} + static INLINE void transpose_u8_16x8( const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2, const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5, From 7fed9187c4fb0c97dd2df9b165f640baad001fdc Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Tue, 24 Jan 2023 14:27:14 +0000 Subject: [PATCH 503/926] Refactor Neon implementation of SAD functions Refactor and optimize the Neon implementation of SAD functions - effectively backporting these libaom changes[1,2,3]. 
[1] https://aomedia-review.googlesource.com/c/aom/+/161921 [2] https://aomedia-review.googlesource.com/c/aom/+/161923 [3] https://aomedia-review.googlesource.com/c/aom/+/166963 Change-Id: I2d72fd0f27d61a3e31a78acd33172e2afb044cb8 --- vpx_dsp/arm/sad_neon.c | 1011 ++++++++++++++++++---------------------- 1 file changed, 462 insertions(+), 549 deletions(-) diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index ad575d4aae..7336edb694 100644 --- a/vpx_dsp/arm/sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -17,635 +17,548 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" -uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride) { - const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); #if defined(__ARM_FEATURE_DOTPROD) - const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8); - const uint32x4_t dp = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1)); - return horizontal_add_uint32x4(dp); -#else - uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8)); - abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); - return horizontal_add_uint16x8(abs); -#endif -} -uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred) { - const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - const uint8x16_t second_pred_u8 = vld1q_u8(second_pred); - const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8); -#if defined(__ARM_FEATURE_DOTPROD) - const uint8x16_t sad_u8 = vabdq_u8(src_u8, avg); - const uint32x4_t prod = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1)); - return horizontal_add_uint32x4(prod); -#else - uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(avg)); - abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg)); - return horizontal_add_uint16x8(abs); -#endif -} +static INLINE unsigned int sadwxh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int w, int h) { + // Only two accumulators are required for optimal instruction throughput of + // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes. 
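+  // Each ABD, UDOT pair below reduces 16 pixels per instruction: vabdq_u8
+  // forms 16 byte-wise absolute differences, and vdotq_u32 against a vector
+  // of all ones adds each group of four of them into one of the four uint32
+  // accumulator lanes, i.e. per lane:
+  //   sum[i] += |s[4i+0] - r[4i+0]| + ... + |s[4i+3] - r[4i+3]|
+  // The uint32 accumulators cannot overflow: the largest block handled here
+  // is 64x64, so the total SAD is at most 64 * 64 * 255.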
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; -uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride) { -#if defined(__ARM_FEATURE_DOTPROD) - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref1_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - const uint8x16_t src2_u8 = - load_unaligned_u8q(src_ptr + 4 * src_stride, src_stride); - const uint8x16_t ref2_u8 = - load_unaligned_u8q(ref_ptr + 4 * ref_stride, ref_stride); - const uint8x16_t sad1_u8 = vabdq_u8(src1_u8, ref1_u8); - const uint8x16_t sad2_u8 = vabdq_u8(src2_u8, ref2_u8); - prod = vdotq_u32(prod, sad1_u8, ones); - prod = vdotq_u32(prod, sad2_u8, ones); - return horizontal_add_uint32x4(prod); -#else - int i; - uint16x8_t abs = vdupq_n_u16(0); - for (i = 0; i < 8; i += 4) { - const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - src_ptr += 4 * src_stride; - ref_ptr += 4 * ref_stride; - abs = vabal_u8(abs, vget_low_u8(src_u8), vget_low_u8(ref_u8)); - abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); - } + int i = h; + do { + int j = 0; + do { + uint8x16_t s0, s1, r0, r1, diff0, diff1; - return horizontal_add_uint16x8(abs); -#endif + s0 = vld1q_u8(src_ptr + j); + r0 = vld1q_u8(ref_ptr + j); + diff0 = vabdq_u8(s0, r0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + s1 = vld1q_u8(src_ptr + j + 16); + r1 = vld1q_u8(ref_ptr + j + 16); + diff1 = vabdq_u8(s1, r1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + j += 32; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); } -uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred) { -#if defined(__ARM_FEATURE_DOTPROD) - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref1_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - const uint8x16_t src2_u8 = - load_unaligned_u8q(src_ptr + 4 * src_stride, src_stride); - const uint8x16_t ref2_u8 = - load_unaligned_u8q(ref_ptr + 4 * ref_stride, ref_stride); - const uint8x16_t second_pred1_u8 = vld1q_u8(second_pred); - const uint8x16_t second_pred2_u8 = vld1q_u8(second_pred + 16); - const uint8x16_t avg1 = vrhaddq_u8(ref1_u8, second_pred1_u8); - const uint8x16_t avg2 = vrhaddq_u8(ref2_u8, second_pred2_u8); - const uint8x16_t sad1_u8 = vabdq_u8(src1_u8, avg1); - const uint8x16_t sad2_u8 = vabdq_u8(src2_u8, avg2); - prod = vdotq_u32(prod, sad1_u8, ones); - prod = vdotq_u32(prod, sad2_u8, ones); - return horizontal_add_uint32x4(prod); -#else - int i; - uint16x8_t abs = vdupq_n_u16(0); - for (i = 0; i < 8; i += 4) { - const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - const uint8x16_t second_pred_u8 = vld1q_u8(second_pred); - const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8); - src_ptr += 4 * src_stride; - ref_ptr += 4 * ref_stride; - second_pred += 16; - abs = vabal_u8(abs, vget_low_u8(src_u8), vget_low_u8(avg)); - abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg)); - } +static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t 
*ref_ptr, int ref_stride, + int h) { + return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h); +} - return horizontal_add_uint16x8(abs); -#endif +static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h); } -#if defined(__ARM_FEATURE_DOTPROD) -static INLINE uint32x2_t sad8x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint32x2_t prod = vdup_n_u32(0); - const uint8x8_t ones = vdup_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x8_t a_u8 = vld1_u8(src_ptr); - const uint8x8_t b_u8 = vld1_u8(ref_ptr); - const uint8x8_t sad_u8 = vabd_u8(a_u8, b_u8); +static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h / 2; + do { + uint8x16_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + diff0 = vabdq_u8(s0, r0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + src_ptr += src_stride; ref_ptr += ref_stride; - prod = vdot_u32(prod, sad_u8, ones); - } - return prod; -} -static INLINE uint32x2_t sad8x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint32x2_t prod = vdup_n_u32(0); - const uint8x8_t ones = vdup_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x8_t a_u8 = vld1_u8(src_ptr); - const uint8x8_t b_u8 = vld1_u8(ref_ptr); - const uint8x8_t c_u8 = vld1_u8(second_pred); - const uint8x8_t avg = vrhadd_u8(b_u8, c_u8); - const uint8x8_t sad_u8 = vabd_u8(a_u8, avg); + s1 = vld1q_u8(src_ptr); + r1 = vld1q_u8(ref_ptr); + diff1 = vabdq_u8(s1, r1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + src_ptr += src_stride; ref_ptr += ref_stride; - second_pred += 8; - prod = vdot_u32(prod, sad_u8, ones); - } - return prod; -} + } while (--i != 0); -#define SAD8XN(n) \ - uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint32x2_t prod = \ - sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint32x2(prod); \ - } \ - \ - uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint32x2_t prod = \ - sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint32x2(prod); \ - } + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); +} #else // !defined(__ARM_FEATURE_DOTPROD) -static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x8_t a_u8 = vld1_u8(src_ptr); - const uint8x8_t b_u8 = vld1_u8(ref_ptr); + +static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint32x4_t sum_u32; + + int i = h; + do { + uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3; + uint8x16_t diff0, diff1, diff2, diff3; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + diff0 = vabdq_u8(s0, r0); + sum[0] = vpadalq_u8(sum[0], diff0); + + 
s1 = vld1q_u8(src_ptr + 16); + r1 = vld1q_u8(ref_ptr + 16); + diff1 = vabdq_u8(s1, r1); + sum[1] = vpadalq_u8(sum[1], diff1); + + s2 = vld1q_u8(src_ptr + 32); + r2 = vld1q_u8(ref_ptr + 32); + diff2 = vabdq_u8(s2, r2); + sum[2] = vpadalq_u8(sum[2], diff2); + + s3 = vld1q_u8(src_ptr + 48); + r3 = vld1q_u8(ref_ptr + 48); + diff3 = vabdq_u8(s3, r3); + sum[3] = vpadalq_u8(sum[3], diff3); + src_ptr += src_stride; ref_ptr += ref_stride; - abs = vabal_u8(abs, a_u8, b_u8); - } - return abs; + } while (--i != 0); + + sum_u32 = vpaddlq_u16(sum[0]); + sum_u32 = vpadalq_u16(sum_u32, sum[1]); + sum_u32 = vpadalq_u16(sum_u32, sum[2]); + sum_u32 = vpadalq_u16(sum_u32, sum[3]); + + return horizontal_add_uint32x4(sum_u32); } -static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x8_t a_u8 = vld1_u8(src_ptr); - const uint8x8_t b_u8 = vld1_u8(ref_ptr); - const uint8x8_t c_u8 = vld1_u8(second_pred); - const uint8x8_t avg = vrhadd_u8(b_u8, c_u8); +static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + uint8x16_t r0 = vld1q_u8(ref_ptr); + uint8x16_t diff0 = vabdq_u8(s0, r0); + uint16x8_t sum0 = vpaddlq_u8(diff0); + + uint8x16_t s1 = vld1q_u8(src_ptr + 16); + uint8x16_t r1 = vld1q_u8(ref_ptr + 16); + uint8x16_t diff1 = vabdq_u8(s1, r1); + uint16x8_t sum1 = vpaddlq_u8(diff1); + + sum = vpadalq_u16(sum, sum0); + sum = vpadalq_u16(sum, sum1); + src_ptr += src_stride; ref_ptr += ref_stride; - second_pred += 8; - abs = vabal_u8(abs, a_u8, avg); - } - return abs; + } while (--i != 0); + + return horizontal_add_uint32x4(sum); } -#define SAD8XN(n) \ - uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint16x8_t abs = sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint16x8(abs); \ - } \ - \ - uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint16x8_t abs = \ - sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint16x8(abs); \ - } -#endif // defined(__ARM_FEATURE_DOTPROD) +static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum = vdupq_n_u16(0); -SAD8XN(4) -SAD8XN(8) -SAD8XN(16) + int i = h; + do { + uint8x16_t s = vld1q_u8(src_ptr); + uint8x16_t r = vld1q_u8(ref_ptr); + + uint8x16_t diff = vabdq_u8(s, r); + sum = vpadalq_u8(sum, diff); -#if defined(__ARM_FEATURE_DOTPROD) -static INLINE uint32x4_t sad16x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t src_u8 = vld1q_u8(src_ptr); - const uint8x16_t ref_u8 = vld1q_u8(ref_ptr); - const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8); src_ptr += src_stride; ref_ptr += ref_stride; - prod = vdotq_u32(prod, sad_u8, ones); - } - return prod; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); } -static INLINE uint32x4_t sad16x_avg(const uint8_t *src_ptr, int 
src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t a_u8 = vld1q_u8(src_ptr); - const uint8x16_t b_u8 = vld1q_u8(ref_ptr); - const uint8x16_t c_u8 = vld1q_u8(second_pred); - const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8); - const uint8x16_t sad_u8 = vabdq_u8(a_u8, avg); +#endif // defined(__ARM_FEATURE_DOTPROD) + +static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h; + do { + uint8x8_t s = vld1_u8(src_ptr); + uint8x8_t r = vld1_u8(ref_ptr); + + sum = vabal_u8(sum, s, r); + src_ptr += src_stride; ref_ptr += ref_stride; - second_pred += 16; - prod = vdotq_u32(prod, sad_u8, ones); - } - return prod; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); } -#define SAD16XN(n) \ - uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint32x4_t prod = \ - sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint32x4(prod); \ - } \ - \ - uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint32x4_t prod = \ - sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint32x4(prod); \ - } -#else // !defined(__ARM_FEATURE_DOTPROD) -static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_u8 = vld1q_u8(src_ptr); - const uint8x16_t b_u8 = vld1q_u8(ref_ptr); +static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h / 2; + do { + uint32x2_t s, r; + uint32_t s0, s1, r0, r1; + + memcpy(&s0, src_ptr, 4); + memcpy(&r0, ref_ptr, 4); + s = vdup_n_u32(s0); + r = vdup_n_u32(r0); src_ptr += src_stride; ref_ptr += ref_stride; - abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(b_u8)); - abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(b_u8)); - } - return abs; -} -static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_u8 = vld1q_u8(src_ptr); - const uint8x16_t b_u8 = vld1q_u8(ref_ptr); - const uint8x16_t c_u8 = vld1q_u8(second_pred); - const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8); + memcpy(&s1, src_ptr, 4); + memcpy(&r1, ref_ptr, 4); + s = vset_lane_u32(s1, s, 1); + r = vset_lane_u32(r1, r, 1); src_ptr += src_stride; ref_ptr += ref_stride; - second_pred += 16; - abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(avg)); - abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(avg)); - } - return abs; + + sum = vabal_u8(sum, vreinterpret_u8_u32(s), vreinterpret_u8_u32(r)); + } while (--i != 0); + + return horizontal_add_uint16x8(sum); } -#define SAD16XN(n) \ - uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint16x8_t abs = \ - sad16x(src_ptr, src_stride, 
ref_ptr, ref_stride, n); \ - return horizontal_add_uint16x8(abs); \ - } \ - \ - uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint16x8_t abs = \ - sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint16x8(abs); \ +#define SAD_WXH_NEON(w, h) \ + unsigned int vpx_sad##w##x##h##_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return sad##w##xh_neon(src, src_stride, ref, ref_stride, (h)); \ } -#endif // defined(__ARM_FEATURE_DOTPROD) -SAD16XN(8) -SAD16XN(16) -SAD16XN(32) +SAD_WXH_NEON(4, 4) +SAD_WXH_NEON(4, 8) + +SAD_WXH_NEON(8, 4) +SAD_WXH_NEON(8, 8) +SAD_WXH_NEON(8, 16) + +SAD_WXH_NEON(16, 8) +SAD_WXH_NEON(16, 16) +SAD_WXH_NEON(16, 32) + +SAD_WXH_NEON(32, 16) +SAD_WXH_NEON(32, 32) +SAD_WXH_NEON(32, 64) + +SAD_WXH_NEON(64, 32) +SAD_WXH_NEON(64, 64) #if defined(__ARM_FEATURE_DOTPROD) -static INLINE uint32x4_t sad32x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t a_lo = vld1q_u8(src_ptr); - const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); - const uint8x16_t b_lo = vld1q_u8(ref_ptr); - const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); - const uint8x16_t sad_lo_u8 = vabdq_u8(a_lo, b_lo); - const uint8x16_t sad_hi_u8 = vabdq_u8(a_hi, b_hi); + +static INLINE unsigned int sadwxh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h, + const uint8_t *second_pred) { + // Only two accumulators are required for optimal instruction throughput of + // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes. 
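+  // vrhaddq_u8 below computes the rounding halving add (r + p + 1) >> 1,
+  // which matches the rounding used by the scalar comp-avg predictor, so
+  // the SAD is taken against the same averaged reference as the C code.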
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1; + + s0 = vld1q_u8(src_ptr + j); + r0 = vld1q_u8(ref_ptr + j); + p0 = vld1q_u8(second_pred); + avg0 = vrhaddq_u8(r0, p0); + diff0 = vabdq_u8(s0, avg0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + s1 = vld1q_u8(src_ptr + j + 16); + r1 = vld1q_u8(ref_ptr + j + 16); + p1 = vld1q_u8(second_pred + 16); + avg1 = vrhaddq_u8(r1, p1); + diff1 = vabdq_u8(s1, avg1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + j += 32; + second_pred += 32; + } while (j < w); + src_ptr += src_stride; ref_ptr += ref_stride; - prod = vdotq_u32(prod, sad_lo_u8, ones); - prod = vdotq_u32(prod, sad_hi_u8, ones); - } - return prod; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); +} + +static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h, + second_pred); } -static INLINE uint32x4_t sad32x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t a_lo = vld1q_u8(src_ptr); - const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); - const uint8x16_t b_lo = vld1q_u8(ref_ptr); - const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); - const uint8x16_t c_lo = vld1q_u8(second_pred); - const uint8x16_t c_hi = vld1q_u8(second_pred + 16); - const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo); - const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi); - const uint8x16_t sad_lo_u8 = vabdq_u8(a_lo, avg_lo); - const uint8x16_t sad_hi_u8 = vabdq_u8(a_hi, avg_hi); +static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h, + second_pred); +} + +static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h / 2; + do { + uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + p0 = vld1q_u8(second_pred); + avg0 = vrhaddq_u8(r0, p0); + diff0 = vabdq_u8(s0, avg0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + src_ptr += src_stride; ref_ptr += ref_stride; - second_pred += 32; - prod = vdotq_u32(prod, sad_lo_u8, ones); - prod = vdotq_u32(prod, sad_hi_u8, ones); - } - return prod; + second_pred += 16; + + s1 = vld1q_u8(src_ptr); + r1 = vld1q_u8(ref_ptr); + p1 = vld1q_u8(second_pred); + avg1 = vrhaddq_u8(r1, p1); + diff1 = vabdq_u8(s1, avg1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 16; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); } -#define SAD32XN(n) \ - uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint32x4_t prod = \ - sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint32x4(prod); \ - } \ - \ - 
uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint32x4_t prod = \ - sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint32x4(prod); \ - } +#else // !defined(__ARM_FEATURE_DOTPROD) + +static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint32x4_t sum_u32; + + int i = h; + do { + uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3; + uint8x16_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + p0 = vld1q_u8(second_pred); + avg0 = vrhaddq_u8(r0, p0); + diff0 = vabdq_u8(s0, avg0); + sum[0] = vpadalq_u8(sum[0], diff0); + + s1 = vld1q_u8(src_ptr + 16); + r1 = vld1q_u8(ref_ptr + 16); + p1 = vld1q_u8(second_pred + 16); + avg1 = vrhaddq_u8(r1, p1); + diff1 = vabdq_u8(s1, avg1); + sum[1] = vpadalq_u8(sum[1], diff1); + + s2 = vld1q_u8(src_ptr + 32); + r2 = vld1q_u8(ref_ptr + 32); + p2 = vld1q_u8(second_pred + 32); + avg2 = vrhaddq_u8(r2, p2); + diff2 = vabdq_u8(s2, avg2); + sum[2] = vpadalq_u8(sum[2], diff2); + + s3 = vld1q_u8(src_ptr + 48); + r3 = vld1q_u8(ref_ptr + 48); + p3 = vld1q_u8(second_pred + 48); + avg3 = vrhaddq_u8(r3, p3); + diff3 = vabdq_u8(s3, avg3); + sum[3] = vpadalq_u8(sum[3], diff3); -#else // defined(__ARM_FEATURE_DOTPROD) -static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_lo = vld1q_u8(src_ptr); - const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); - const uint8x16_t b_lo = vld1q_u8(ref_ptr); - const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); src_ptr += src_stride; ref_ptr += ref_stride; - abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(b_lo)); - abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(b_lo)); - abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(b_hi)); - abs = vabal_u8(abs, vget_high_u8(a_hi), vget_high_u8(b_hi)); - } - return abs; + second_pred += 64; + } while (--i != 0); + + sum_u32 = vpaddlq_u16(sum[0]); + sum_u32 = vpadalq_u16(sum_u32, sum[1]); + sum_u32 = vpadalq_u16(sum_u32, sum[2]); + sum_u32 = vpadalq_u16(sum_u32, sum[3]); + + return horizontal_add_uint32x4(sum_u32); } -static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_lo = vld1q_u8(src_ptr); - const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); - const uint8x16_t b_lo = vld1q_u8(ref_ptr); - const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); - const uint8x16_t c_lo = vld1q_u8(second_pred); - const uint8x16_t c_hi = vld1q_u8(second_pred + 16); - const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo); - const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi); +static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + uint8x16_t r0 = vld1q_u8(ref_ptr); + uint8x16_t p0 = vld1q_u8(second_pred); + 
uint8x16_t avg0 = vrhaddq_u8(r0, p0); + uint8x16_t diff0 = vabdq_u8(s0, avg0); + uint16x8_t sum0 = vpaddlq_u8(diff0); + + uint8x16_t s1 = vld1q_u8(src_ptr + 16); + uint8x16_t r1 = vld1q_u8(ref_ptr + 16); + uint8x16_t p1 = vld1q_u8(second_pred + 16); + uint8x16_t avg1 = vrhaddq_u8(r1, p1); + uint8x16_t diff1 = vabdq_u8(s1, avg1); + uint16x8_t sum1 = vpaddlq_u8(diff1); + + sum = vpadalq_u16(sum, sum0); + sum = vpadalq_u16(sum, sum1); + src_ptr += src_stride; ref_ptr += ref_stride; second_pred += 32; - abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(avg_lo)); - abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(avg_lo)); - abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(avg_hi)); - abs = vabal_u8(abs, vget_high_u8(a_hi), vget_high_u8(avg_hi)); - } - return abs; + } while (--i != 0); + + return horizontal_add_uint32x4(sum); } -#define SAD32XN(n) \ - uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint16x8_t abs = \ - sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint16x8(abs); \ - } \ - \ - uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint16x8_t abs = \ - sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint16x8(abs); \ - } -#endif // defined(__ARM_FEATURE_DOTPROD) +static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum = vdupq_n_u16(0); -SAD32XN(16) -SAD32XN(32) -SAD32XN(64) + int i = h; + do { + uint8x16_t s = vld1q_u8(src_ptr); + uint8x16_t r = vld1q_u8(ref_ptr); + uint8x16_t p = vld1q_u8(second_pred); + + uint8x16_t avg = vrhaddq_u8(r, p); + uint8x16_t diff = vabdq_u8(s, avg); + sum = vpadalq_u8(sum, diff); -#if defined(__ARM_FEATURE_DOTPROD) -static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(src_ptr); - const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); - const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); - const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); - const uint8x16_t b_0 = vld1q_u8(ref_ptr); - const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); - const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); - const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); - const uint8x16_t sad_0_u8 = vabdq_u8(a_0, b_0); - const uint8x16_t sad_1_u8 = vabdq_u8(a_1, b_1); - const uint8x16_t sad_2_u8 = vabdq_u8(a_2, b_2); - const uint8x16_t sad_3_u8 = vabdq_u8(a_3, b_3); src_ptr += src_stride; ref_ptr += ref_stride; - prod = vdotq_u32(prod, sad_0_u8, ones); - prod = vdotq_u32(prod, sad_1_u8, ones); - prod = vdotq_u32(prod, sad_2_u8, ones); - prod = vdotq_u32(prod, sad_3_u8, ones); - } - return prod; + second_pred += 16; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); } -static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(src_ptr); - const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); - const uint8x16_t a_2 = 
vld1q_u8(src_ptr + 32); - const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); - const uint8x16_t b_0 = vld1q_u8(ref_ptr); - const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); - const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); - const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); - const uint8x16_t c_0 = vld1q_u8(second_pred); - const uint8x16_t c_1 = vld1q_u8(second_pred + 16); - const uint8x16_t c_2 = vld1q_u8(second_pred + 32); - const uint8x16_t c_3 = vld1q_u8(second_pred + 48); - const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0); - const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1); - const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2); - const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3); - const uint8x16_t sad_0_u8 = vabdq_u8(a_0, avg_0); - const uint8x16_t sad_1_u8 = vabdq_u8(a_1, avg_1); - const uint8x16_t sad_2_u8 = vabdq_u8(a_2, avg_2); - const uint8x16_t sad_3_u8 = vabdq_u8(a_3, avg_3); +#endif // defined(__ARM_FEATURE_DOTPROD) + +static INLINE unsigned int sad8xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h; + do { + uint8x8_t s = vld1_u8(src_ptr); + uint8x8_t r = vld1_u8(ref_ptr); + uint8x8_t p = vld1_u8(second_pred); + + uint8x8_t avg = vrhadd_u8(r, p); + sum = vabal_u8(sum, s, avg); + src_ptr += src_stride; ref_ptr += ref_stride; - second_pred += 64; - prod = vdotq_u32(prod, sad_0_u8, ones); - prod = vdotq_u32(prod, sad_1_u8, ones); - prod = vdotq_u32(prod, sad_2_u8, ones); - prod = vdotq_u32(prod, sad_3_u8, ones); - } - return prod; + second_pred += 8; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); } -#else // !defined(__ARM_FEATURE_DOTPROD) -static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint16x8_t abs_0 = vdupq_n_u16(0); - uint16x8_t abs_1 = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(src_ptr); - const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); - const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); - const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); - const uint8x16_t b_0 = vld1q_u8(ref_ptr); - const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); - const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); - const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); + +static INLINE unsigned int sad4xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h / 2; + do { + uint32x2_t s, r; + uint32_t s0, s1, r0, r1; + uint8x8_t p, avg; + + memcpy(&s0, src_ptr, 4); + memcpy(&r0, ref_ptr, 4); + s = vdup_n_u32(s0); + r = vdup_n_u32(r0); src_ptr += src_stride; ref_ptr += ref_stride; - abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(b_0)); - abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(b_0)); - abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(b_1)); - abs_0 = vabal_u8(abs_0, vget_high_u8(a_1), vget_high_u8(b_1)); - abs_1 = vabal_u8(abs_1, vget_low_u8(a_2), vget_low_u8(b_2)); - abs_1 = vabal_u8(abs_1, vget_high_u8(a_2), vget_high_u8(b_2)); - abs_1 = vabal_u8(abs_1, vget_low_u8(a_3), vget_low_u8(b_3)); - abs_1 = vabal_u8(abs_1, vget_high_u8(a_3), vget_high_u8(b_3)); - } - - { - const uint32x4_t sum = vpaddlq_u16(abs_0); - return vpadalq_u16(sum, abs_1); - } -} -static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int 
height) { - int i; - uint16x8_t abs_0 = vdupq_n_u16(0); - uint16x8_t abs_1 = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(src_ptr); - const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); - const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); - const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); - const uint8x16_t b_0 = vld1q_u8(ref_ptr); - const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); - const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); - const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); - const uint8x16_t c_0 = vld1q_u8(second_pred); - const uint8x16_t c_1 = vld1q_u8(second_pred + 16); - const uint8x16_t c_2 = vld1q_u8(second_pred + 32); - const uint8x16_t c_3 = vld1q_u8(second_pred + 48); - const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0); - const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1); - const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2); - const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3); + memcpy(&s1, src_ptr, 4); + memcpy(&r1, ref_ptr, 4); + s = vset_lane_u32(s1, s, 1); + r = vset_lane_u32(r1, r, 1); src_ptr += src_stride; ref_ptr += ref_stride; - second_pred += 64; - abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(avg_0)); - abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(avg_0)); - abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(avg_1)); - abs_0 = vabal_u8(abs_0, vget_high_u8(a_1), vget_high_u8(avg_1)); - abs_1 = vabal_u8(abs_1, vget_low_u8(a_2), vget_low_u8(avg_2)); - abs_1 = vabal_u8(abs_1, vget_high_u8(a_2), vget_high_u8(avg_2)); - abs_1 = vabal_u8(abs_1, vget_low_u8(a_3), vget_low_u8(avg_3)); - abs_1 = vabal_u8(abs_1, vget_high_u8(a_3), vget_high_u8(avg_3)); - } - { - const uint32x4_t sum = vpaddlq_u16(abs_0); - return vpadalq_u16(sum, abs_1); - } + p = vld1_u8(second_pred); + avg = vrhadd_u8(vreinterpret_u8_u32(r), p); + + sum = vabal_u8(sum, vreinterpret_u8_u32(s), avg); + second_pred += 8; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); } -#endif // defined(__ARM_FEATURE_DOTPROD) -#define SAD64XN(n) \ - uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint32x4_t abs = \ - sad64x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint32x4(abs); \ - } \ - \ - uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint32x4_t abs = \ - sad64x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint32x4(abs); \ +#define SAD_WXH_AVG_NEON(w, h) \ + uint32_t vpx_sad##w##x##h##_avg_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \ + second_pred); \ } -SAD64XN(32) -SAD64XN(64) +SAD_WXH_AVG_NEON(4, 4) +SAD_WXH_AVG_NEON(4, 8) + +SAD_WXH_AVG_NEON(8, 4) +SAD_WXH_AVG_NEON(8, 8) +SAD_WXH_AVG_NEON(8, 16) + +SAD_WXH_AVG_NEON(16, 8) +SAD_WXH_AVG_NEON(16, 16) +SAD_WXH_AVG_NEON(16, 32) + +SAD_WXH_AVG_NEON(32, 16) +SAD_WXH_AVG_NEON(32, 32) +SAD_WXH_AVG_NEON(32, 64) + +SAD_WXH_AVG_NEON(64, 32) +SAD_WXH_AVG_NEON(64, 64) From db69ce6aea278bee88668fd9cc2af2e544516fdb Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 25 Jan 2023 19:25:12 -0500 Subject: [PATCH 504/926] Fix per frame qp for temporal layers Also add tests with fixed temporal layering mode. 
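
For illustration only (not part of this change): in one-pass fixed-QP use,
an application drives the per-frame quantizer through the
VP9E_SET_QUANTIZER_ONE_PASS control. A minimal sketch, assuming an
initialized one-pass VP9 encoder and the die_codec() helper from libvpx's
tools_common.h; encode_frame_with_qp() is a hypothetical wrapper, not an
API in this patch:

  // Sketch: set a fixed QP (0-63) for the next frame, then encode it.
  static void encode_frame_with_qp(vpx_codec_ctx_t *codec, vpx_image_t *img,
                                   vpx_codec_pts_t pts, int qp) {
    if (vpx_codec_control(codec, VP9E_SET_QUANTIZER_ONE_PASS, qp))
      die_codec(codec, "Failed to set frame QP.");
    if (vpx_codec_encode(codec, img, pts, 1, 0, VPX_DL_REALTIME))
      die_codec(codec, "Failed to encode frame.");
  }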
Change-Id: If516fe94e3fb7f5a745821d1788bfe6cf90edaac
---
 test/vp9_datarate_test.cc          | 66 +++++++++++++++++++++++++-----
 vp9/encoder/vp9_svc_layercontext.c |  4 ++
 2 files changed, 59 insertions(+), 11 deletions(-)

diff --git a/test/vp9_datarate_test.cc b/test/vp9_datarate_test.cc
index eccb001071..7e91807492 100644
--- a/test/vp9_datarate_test.cc
+++ b/test/vp9_datarate_test.cc
@@ -148,14 +148,16 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest {
       if (video->frame() == 0) {
         encoder->Control(VP9E_SET_SVC, 1);
       }
-      vpx_svc_layer_id_t layer_id;
-      layer_id.spatial_layer_id = 0;
-      frame_flags_ = GetFrameFlags(video->frame(), cfg_.ts_number_layers);
-      layer_id.temporal_layer_id =
-          SetLayerId(video->frame(), cfg_.ts_number_layers);
-      layer_id.temporal_layer_id_per_spatial[0] =
-          SetLayerId(video->frame(), cfg_.ts_number_layers);
-      encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
+      if (cfg_.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+        vpx_svc_layer_id_t layer_id;
+        frame_flags_ = GetFrameFlags(video->frame(), cfg_.ts_number_layers);
+        layer_id.spatial_layer_id = 0;
+        layer_id.temporal_layer_id =
+            SetLayerId(video->frame(), cfg_.ts_number_layers);
+        layer_id.temporal_layer_id_per_spatial[0] =
+            SetLayerId(video->frame(), cfg_.ts_number_layers);
+        encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
+      }
     }
     const vpx_rational_t tb = video->timebase();
     timebase_ = static_cast<double>(tb.num) / tb.den;
@@ -830,25 +832,37 @@ class DatarateTestVP9FrameQp
                                   ::libvpx_test::Encoder *encoder) {
     set_cpu_used_ = 7;
     DatarateTestVP9::PreEncodeFrameHook(video, encoder);
-    ACMRandom rnd;
-    frame_qp_ = static_cast<int>(rnd.RandRange(64));
+    frame_qp_ = static_cast<int>(rnd_.RandRange(64));
     encoder->Control(VP9E_SET_QUANTIZER_ONE_PASS, frame_qp_);
     frame_++;
   }

   virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) {
     int qp = 0;
+    vpx_svc_layer_id_t layer_id;
     if (frame_ >= total_frame_) return;
     encoder->Control(VP8E_GET_LAST_QUANTIZER_64, &qp);
     ASSERT_EQ(frame_qp_, qp);
+    encoder->Control(VP9E_GET_SVC_LAYER_ID, &layer_id);
+    temporal_layer_id_ = layer_id.temporal_layer_id;
+  }
+
+  virtual void MismatchHook(const vpx_image_t * /*img1*/,
+                            const vpx_image_t * /*img2*/) {
+    if (frame_ >= total_frame_) return;
+    ASSERT_TRUE(cfg_.temporal_layering_mode ==
+                    VP9E_TEMPORAL_LAYERING_MODE_0212 &&
+                temporal_layer_id_ == 2);
   }

  protected:
   int total_frame_;

  private:
+  ACMRandom rnd_;
   int frame_qp_;
   int frame_;
+  int temporal_layer_id_;
 };

 TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp) {
@@ -868,7 +882,7 @@ TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp) {
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }

-TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayers) {
+TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayersBypass) {
   cfg_.rc_buf_initial_sz = 500;
   cfg_.rc_buf_optimal_sz = 500;
   cfg_.rc_buf_sz = 1000;
@@ -897,6 +911,36 @@ TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayers) {
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }

+TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayersFixedMode) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1).
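+  // With VP9E_TEMPORAL_LAYERING_MODE_0212 the temporal layer ids follow the
+  // fixed repeating 0-2-1-2 pattern, so unlike the bypass test above no
+  // per-frame VP9E_SET_SVC_LAYER_ID control calls are needed.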
+ cfg_.ss_number_layers = 1; + cfg_.ts_number_layers = 3; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_0212; + cfg_.rc_target_bitrate = 200; + cfg_.g_error_resilient = 1; + total_frame_ = 400; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, total_frame_); + ResetModel(); + cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + #if CONFIG_VP9_TEMPORAL_DENOISING // Params: speed setting. class DatarateTestVP9RealTimeDenoiser : public DatarateTestVP9RealTime { diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 518c00b34a..7e9435fb5f 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -894,6 +894,10 @@ int vp9_one_pass_svc_start_layer(VP9_COMP *const cpi) { RATE_CONTROL *const lrc = &lc->rc; lrc->worst_quality = vp9_quantizer_to_qindex(lc->max_q); lrc->best_quality = vp9_quantizer_to_qindex(lc->min_q); + if (cpi->fixed_qp_onepass) { + lrc->worst_quality = cpi->rc.worst_quality; + lrc->best_quality = cpi->rc.best_quality; + } } if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && svc->single_layer_svc == 1 && From 5e92d6d103e923a28cd56ee2c7efd6b48a0611b4 Mon Sep 17 00:00:00 2001 From: Gerda Zsejke More Date: Thu, 26 Jan 2023 16:12:55 +0100 Subject: [PATCH 505/926] Refactor 8x8 16-bit Neon transpose functions Refactor the Neon implementation of transpose_s16_8x8(q) and transpose_u16_8x8 so that the final step compiles to 8 ZIP1/ZIP2 instructions as opposed to 8 EXT, MOV pairs. This change removes 8 instructions per call to transpose_s16_8x8(q), transpose_u16_8x8 where the result stays in registers for further processing - rather than being stored to memory - like in vpx_hadamard_8x8_neon, for example. This is a backport of this libaom patch[1]. [1] https://aomedia-review.googlesource.com/c/aom/+/169426 Change-Id: Icef3e51d40efeca7008e1c4fc701bf39bd319c88 --- vpx_dsp/arm/fdct16x16_neon.c | 6 +- vpx_dsp/arm/fdct32x32_neon.c | 64 ++++++++++---------- vpx_dsp/arm/transpose_neon.h | 114 +++++++++++++++++++++++++---------- 3 files changed, 118 insertions(+), 66 deletions(-) diff --git a/vpx_dsp/arm/fdct16x16_neon.c b/vpx_dsp/arm/fdct16x16_neon.c index a458ecaa41..0628acb750 100644 --- a/vpx_dsp/arm/fdct16x16_neon.c +++ b/vpx_dsp/arm/fdct16x16_neon.c @@ -47,8 +47,8 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { // Transpose top left and top right quarters into one contiguous location to // process to the top half. - transpose_s16_8x8_new(&temp0[0], &temp2[0]); - transpose_s16_8x8_new(&temp1[0], &temp2[8]); + transpose_s16_8x8q(&temp0[0], &temp2[0]); + transpose_s16_8x8q(&temp1[0], &temp2[8]); partial_round_shift(temp2); cross_input(temp2, temp3); vpx_fdct8x16_body(temp3, temp2); @@ -62,7 +62,7 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { // Transpose bottom left and bottom right quarters into one contiguous // location to process to the bottom half. 
- transpose_s16_8x8_new(&temp0[8], &temp1[0]); + transpose_s16_8x8q(&temp0[8], &temp1[0]); transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], &temp1[13], &temp1[14], &temp1[15]); diff --git a/vpx_dsp/arm/fdct32x32_neon.c b/vpx_dsp/arm/fdct32x32_neon.c index d6818d2ec6..a91730ce8b 100644 --- a/vpx_dsp/arm/fdct32x32_neon.c +++ b/vpx_dsp/arm/fdct32x32_neon.c @@ -60,10 +60,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { dct_body_first_pass(temp5, temp4); // Generate the top row by munging the first set of 8 from each one together. - transpose_s16_8x8_new(&temp1[0], &temp0[0]); - transpose_s16_8x8_new(&temp2[0], &temp0[8]); - transpose_s16_8x8_new(&temp3[0], &temp0[16]); - transpose_s16_8x8_new(&temp4[0], &temp0[24]); + transpose_s16_8x8q(&temp1[0], &temp0[0]); + transpose_s16_8x8q(&temp2[0], &temp0[8]); + transpose_s16_8x8q(&temp3[0], &temp0[16]); + transpose_s16_8x8q(&temp4[0], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -78,10 +78,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output, temp5); // Second row of 8x32. - transpose_s16_8x8_new(&temp1[8], &temp0[0]); - transpose_s16_8x8_new(&temp2[8], &temp0[8]); - transpose_s16_8x8_new(&temp3[8], &temp0[16]); - transpose_s16_8x8_new(&temp4[8], &temp0[24]); + transpose_s16_8x8q(&temp1[8], &temp0[0]); + transpose_s16_8x8q(&temp2[8], &temp0[8]); + transpose_s16_8x8q(&temp3[8], &temp0[16]); + transpose_s16_8x8q(&temp4[8], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -96,10 +96,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output + 8 * 32, temp5); // Third row of 8x32 - transpose_s16_8x8_new(&temp1[16], &temp0[0]); - transpose_s16_8x8_new(&temp2[16], &temp0[8]); - transpose_s16_8x8_new(&temp3[16], &temp0[16]); - transpose_s16_8x8_new(&temp4[16], &temp0[24]); + transpose_s16_8x8q(&temp1[16], &temp0[0]); + transpose_s16_8x8q(&temp2[16], &temp0[8]); + transpose_s16_8x8q(&temp3[16], &temp0[16]); + transpose_s16_8x8q(&temp4[16], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -114,10 +114,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output + 16 * 32, temp5); // Final row of 8x32. - transpose_s16_8x8_new(&temp1[24], &temp0[0]); - transpose_s16_8x8_new(&temp2[24], &temp0[8]); - transpose_s16_8x8_new(&temp3[24], &temp0[16]); - transpose_s16_8x8_new(&temp4[24], &temp0[24]); + transpose_s16_8x8q(&temp1[24], &temp0[0]); + transpose_s16_8x8q(&temp2[24], &temp0[8]); + transpose_s16_8x8q(&temp3[24], &temp0[16]); + transpose_s16_8x8q(&temp4[24], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -159,10 +159,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, dct_body_first_pass(temp5, temp4); // Generate the top row by munging the first set of 8 from each one together. - transpose_s16_8x8_new(&temp1[0], &temp0[0]); - transpose_s16_8x8_new(&temp2[0], &temp0[8]); - transpose_s16_8x8_new(&temp3[0], &temp0[16]); - transpose_s16_8x8_new(&temp4[0], &temp0[24]); + transpose_s16_8x8q(&temp1[0], &temp0[0]); + transpose_s16_8x8q(&temp2[0], &temp0[8]); + transpose_s16_8x8q(&temp3[0], &temp0[16]); + transpose_s16_8x8q(&temp4[0], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -177,10 +177,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output, temp5); // Second row of 8x32. 
- transpose_s16_8x8_new(&temp1[8], &temp0[0]); - transpose_s16_8x8_new(&temp2[8], &temp0[8]); - transpose_s16_8x8_new(&temp3[8], &temp0[16]); - transpose_s16_8x8_new(&temp4[8], &temp0[24]); + transpose_s16_8x8q(&temp1[8], &temp0[0]); + transpose_s16_8x8q(&temp2[8], &temp0[8]); + transpose_s16_8x8q(&temp3[8], &temp0[16]); + transpose_s16_8x8q(&temp4[8], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -195,10 +195,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output + 8 * 32, temp5); // Third row of 8x32 - transpose_s16_8x8_new(&temp1[16], &temp0[0]); - transpose_s16_8x8_new(&temp2[16], &temp0[8]); - transpose_s16_8x8_new(&temp3[16], &temp0[16]); - transpose_s16_8x8_new(&temp4[16], &temp0[24]); + transpose_s16_8x8q(&temp1[16], &temp0[0]); + transpose_s16_8x8q(&temp2[16], &temp0[8]); + transpose_s16_8x8q(&temp3[16], &temp0[16]); + transpose_s16_8x8q(&temp4[16], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -213,10 +213,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output + 16 * 32, temp5); // Final row of 8x32. - transpose_s16_8x8_new(&temp1[24], &temp0[0]); - transpose_s16_8x8_new(&temp2[24], &temp0[8]); - transpose_s16_8x8_new(&temp3[24], &temp0[16]); - transpose_s16_8x8_new(&temp4[24], &temp0[24]); + transpose_s16_8x8q(&temp1[24], &temp0[0]); + transpose_s16_8x8q(&temp2[24], &temp0[8]); + transpose_s16_8x8q(&temp3[24], &temp0[16]); + transpose_s16_8x8q(&temp4[24], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h index 41d44f2b1f..9d13132502 100644 --- a/vpx_dsp/arm/transpose_neon.h +++ b/vpx_dsp/arm/transpose_neon.h @@ -23,10 +23,17 @@ // b0.val[1]: 04 05 06 07 20 21 22 23 static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { int16x8x2_t b0; +#if defined(__aarch64__) + b0.val[0] = vreinterpretq_s16_s64( + vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); + b0.val[1] = vreinterpretq_s16_s64( + vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); +#else b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)), vreinterpret_s16_s32(vget_low_s32(a1))); b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)), vreinterpret_s16_s32(vget_high_s32(a1))); +#endif return b0; } @@ -57,10 +64,17 @@ static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) { static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) { uint16x8x2_t b0; +#if defined(__aarch64__) + b0.val[0] = vreinterpretq_u16_u64( + vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); + b0.val[1] = vreinterpretq_u16_u64( + vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); +#else b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)), vreinterpret_u16_u32(vget_low_u32(a1))); b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)), vreinterpret_u16_u32(vget_high_u32(a1))); +#endif return b0; } @@ -569,37 +583,73 @@ static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, } // Transpose 8x8 to a new location. -static INLINE void transpose_s16_8x8_new(const int16x8_t *a, int16x8_t *b) { - // Swap 16 bit elements. - const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); - const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); - const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); - const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); - - // Swap 32 bit elements. 
- const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), - vreinterpretq_s32_s16(c1.val[0])); - const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), - vreinterpretq_s32_s16(c1.val[1])); - const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), - vreinterpretq_s32_s16(c3.val[0])); - const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), - vreinterpretq_s32_s16(c3.val[1])); - - // Swap 64 bit elements - const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); - const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); - const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); - const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); - - b[0] = e0.val[0]; - b[1] = e1.val[0]; - b[2] = e2.val[0]; - b[3] = e3.val[0]; - b[4] = e0.val[1]; - b[5] = e1.val[1]; - b[6] = e2.val[1]; - b[7] = e3.val[1]; +static INLINE void transpose_s16_8x8q(int16x8_t *a, int16x8_t *out) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + // b2.val[0]: 40 50 42 52 44 54 46 56 + // b2.val[1]: 41 51 43 53 45 55 47 57 + // b3.val[0]: 60 70 62 72 64 74 66 76 + // b3.val[1]: 61 71 63 73 65 75 67 77 + + const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]); + const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]); + const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]); + const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + // c2.val[0]: 40 50 60 70 44 54 64 74 + // c2.val[1]: 42 52 62 72 46 56 66 76 + // c3.val[0]: 41 51 61 71 45 55 65 75 + // c3.val[1]: 43 53 63 73 47 57 67 77 + + const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), + vreinterpretq_s32_s16(b1.val[0])); + const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), + vreinterpretq_s32_s16(b1.val[1])); + const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), + vreinterpretq_s32_s16(b3.val[0])); + const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]), + vreinterpretq_s32_s16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 04 14 24 34 44 54 64 74 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 05 15 25 35 45 55 65 75 + // d2.val[0]: 02 12 22 32 42 52 62 72 + // d2.val[1]: 06 16 26 36 46 56 66 76 + // d3.val[0]: 03 13 23 33 43 53 63 73 + // d3.val[1]: 07 17 27 37 47 57 67 77 + + const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); + const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); + const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); + const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]); + + out[0] = d0.val[0]; + out[1] = d1.val[0]; + out[2] = d2.val[0]; + out[3] = d3.val[0]; + out[4] = d0.val[1]; + out[5] = d1.val[1]; + out[6] = d2.val[1]; + out[7] = d3.val[1]; } static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, @@ -658,6 +708,7 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, // 
d2.val[1]: 06 16 26 36 46 56 66 76 // d3.val[0]: 03 13 23 33 43 53 63 73 // d3.val[1]: 07 17 27 37 47 57 67 77 + const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); @@ -729,6 +780,7 @@ static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1, // d2.val[1]: 06 16 26 36 46 56 66 76 // d3.val[0]: 03 13 23 33 43 53 63 73 // d3.val[1]: 07 17 27 37 47 57 67 77 + const uint16x8x2_t d0 = vpx_vtrnq_u64_to_u16(c0.val[0], c2.val[0]); const uint16x8x2_t d1 = vpx_vtrnq_u64_to_u16(c1.val[0], c3.val[0]); const uint16x8x2_t d2 = vpx_vtrnq_u64_to_u16(c0.val[1], c2.val[1]); From 5dd3d70a4f7256cfb48aa925e44c42d80abe93b1 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Thu, 26 Jan 2023 17:20:54 -0800 Subject: [PATCH 506/926] Add encoder component timing information Change-Id: Iaa5b73a9593ecfd74b6426ed47d2b529ec7ae2b5 --- configure | 1 + vp9/encoder/vp9_encodeframe.c | 37 +++++++++++++ vp9/encoder/vp9_encoder.c | 90 +++++++++++++++++++++++++++++- vp9/encoder/vp9_encoder.h | 101 ++++++++++++++++++++++++++++++++++ vp9/encoder/vp9_rdopt.c | 37 ++++++++++++- 5 files changed, 264 insertions(+), 2 deletions(-) diff --git a/configure b/configure index ae289f77b4..18f0ea798b 100755 --- a/configure +++ b/configure @@ -293,6 +293,7 @@ EXPERIMENT_LIST=" emulate_hardware non_greedy_mv rate_ctrl + collect_component_timing " CONFIG_LIST=" dependency_tracking diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 1483ac069d..a22c00bd8f 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -1980,6 +1980,9 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, int64_t best_rd = INT64_MAX; vpx_clear_system_state(); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rd_pick_sb_modes_time); +#endif // Use the lower precision, but faster, 32x32 fdct for mode selection. 
x->use_lp32x32fdct = 1; @@ -2047,15 +2050,27 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd); } else { if (bsize >= BLOCK_8X8) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_rd_pick_inter_mode_sb_time); +#endif if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP)) vp9_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, rd_cost, bsize, ctx, best_rd); else vp9_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost, bsize, ctx, best_rd); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_rd_pick_inter_mode_sb_time); +#endif } else { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_rd_pick_inter_mode_sub8x8_time); +#endif vp9_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col, rd_cost, bsize, ctx, best_rd); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_rd_pick_inter_mode_sub8x8_time); +#endif } } @@ -2078,6 +2093,9 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, ctx->rate = rd_cost->rate; ctx->dist = rd_cost->dist; +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, rd_pick_sb_modes_time); +#endif } #endif // !CONFIG_REALTIME_ONLY @@ -4411,8 +4429,14 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, if (should_encode_sb && pc_tree->index != 3) { int output_enabled = (bsize == BLOCK_64X64); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_sb_time); +#endif encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize, pc_tree); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_sb_time); +#endif #if CONFIG_RATE_CTRL if (oxcf->use_simple_encode_api) { // Store partition, motion vector of the superblock. @@ -4539,8 +4563,15 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, &x->min_partition_size, &x->max_partition_size); } td->pc_root->none.rdcost = 0; + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rd_pick_partition_time); +#endif rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rdc, dummy_rdc, td->pc_root); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, rd_pick_partition_time); +#endif } (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row, sb_col_in_tile, num_sb_cols); @@ -6283,7 +6314,13 @@ void vp9_encode_frame(VP9_COMP *cpi) { if (cm->interp_filter == SWITCHABLE) cm->interp_filter = get_interp_filter(filter_thrs, is_alt_ref); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_frame_internal_time); +#endif encode_frame_internal(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_frame_internal_time); +#endif for (i = 0; i < REFERENCE_MODES; ++i) mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index b66fdc0bca..b9fc148d7b 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4415,6 +4415,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest (cpi->twopass.gf_group.index == 1) : 0; +#if CONFIG_COLLECT_COMPONENT_TIMING + printf("\n Encoding a frame: \n"); +#endif do { vpx_clear_system_state(); @@ -4802,6 +4805,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) if (loop) restore_coding_context(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + if (loop) printf("\n Recoding:"); +#endif } while (loop); rc->max_frame_bandwidth = 
orig_rc_max_frame_bandwidth; @@ -5549,8 +5555,14 @@ static void encode_frame_to_data_rate( #if !CONFIG_REALTIME_ONLY #if CONFIG_RATE_CTRL encode_with_recode_loop(cpi, size, dest, &encode_frame_result->rq_history); -#else // CONFIG_RATE_CTRL +#else // CONFIG_RATE_CTRL +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_with_recode_loop_time); +#endif encode_with_recode_loop(cpi, size, dest); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_with_recode_loop_time); +#endif #endif // CONFIG_RATE_CTRL #endif // !CONFIG_REALTIME_ONLY } @@ -5609,13 +5621,25 @@ static void encode_frame_to_data_rate( cm->frame_to_show->render_width = cm->render_width; cm->frame_to_show->render_height = cm->render_height; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, loopfilter_frame_time); +#endif // Pick the loop filter level for the frame. loopfilter_frame(cpi, cm); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, loopfilter_frame_time); +#endif if (cpi->rc.use_post_encode_drop) save_coding_context(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_pack_bitstream_time); +#endif // build the bitstream vp9_pack_bitstream(cpi, dest, size); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_pack_bitstream_time); +#endif if (cpi->ext_ratectrl.ready) { const RefCntBuffer *coded_frame_buf = @@ -7640,6 +7664,10 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, const int gf_group_index = cpi->twopass.gf_group.index; int i; +#if CONFIG_COLLECT_COMPONENT_TIMING + if (oxcf->pass == 2) start_timing(cpi, vp9_get_compressed_data_time); +#endif + if (is_one_pass_svc(cpi)) { vp9_one_pass_svc_start_layer(cpi); } @@ -7704,9 +7732,15 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, int not_last_frame = (cpi->lookahead->sz - arf_src_index > 1); not_last_frame |= ALT_REF_AQ_APPLY_TO_LAST_FRAME; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_temporal_filter_time); +#endif // Produce the filtered ARF frame. vp9_temporal_filter(cpi, arf_src_index); vpx_extend_frame_borders(&cpi->alt_ref_buffer); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_temporal_filter_time); +#endif // for small bitrates segmentation overhead usually // eats all bitrate gain from enabling delta quantizers @@ -7820,7 +7854,13 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, #if !CONFIG_REALTIME_ONLY if ((oxcf->pass == 2) && !cpi->use_svc) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_rc_get_second_pass_params_time); +#endif vp9_rc_get_second_pass_params(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_rc_get_second_pass_params_time); +#endif } else if (oxcf->pass == 1) { set_frame_size(cpi); } @@ -7860,6 +7900,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } #endif // CONFIG_NON_GREEDY_MV +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, setup_tpl_stats_time); +#endif if (gf_group_index == 1 && cpi->twopass.gf_group.update_type[gf_group_index] == ARF_UPDATE && cpi->sf.enable_tpl_model) { @@ -7867,6 +7910,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, vp9_estimate_qp_gop(cpi); setup_tpl_stats(cpi); } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, setup_tpl_stats_time); +#endif #if CONFIG_BITSTREAM_DEBUG assert(cpi->oxcf.max_threads == 0 && @@ -7903,8 +7949,15 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cpi->td.mb.inv_txfm_add = lossless ? 
vp9_iwht4x4_add : vp9_idct4x4_add; vp9_first_pass(cpi, source); } else if (oxcf->pass == 2 && !cpi->use_svc) { +#if CONFIG_COLLECT_COMPONENT_TIMING + // Accumulate 2nd pass time in 2-pass case. + start_timing(cpi, Pass2Encode_time); +#endif Pass2Encode(cpi, size, dest, frame_flags, encode_frame_result); vp9_twopass_postencode_update(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, Pass2Encode_time); +#endif } else if (cpi->use_svc) { SvcEncode(cpi, size, dest, frame_flags); } else { @@ -8107,6 +8160,41 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, #endif +#if CONFIG_COLLECT_COMPONENT_TIMING + if (oxcf->pass == 2) end_timing(cpi, vp9_get_compressed_data_time); + + // Print out timing information. + // Note: Use "cpi->frame_component_time[0] > 100 us" to avoid showing of + // show_existing_frame and lag-in-frames. + // if (cpi->frame_component_time[0] > 100) + if (oxcf->pass == 2) { + uint64_t frame_total = 0, total = 0; + int i; + + fprintf(stderr, + "\n Frame number: %d, Frame type: %s, Show Frame: %d, Q: %d\n", + cm->current_video_frame, get_frame_type_enum(cm->frame_type), + cm->show_frame, cm->base_qindex); + for (i = 0; i < kTimingComponents; i++) { + cpi->component_time[i] += cpi->frame_component_time[i]; + // Use vp9_get_compressed_data_time (i = 0) as the total time. + if (i == 0) { + frame_total = cpi->frame_component_time[0]; + total = cpi->component_time[0]; + } + fprintf(stderr, + " %50s: %15" PRId64 " us [%6.2f%%] (total: %15" PRId64 + " us [%6.2f%%])\n", + get_component_name(i), cpi->frame_component_time[i], + (float)((float)cpi->frame_component_time[i] * 100.0 / + (float)frame_total), + cpi->component_time[i], + (float)((float)cpi->component_time[i] * 100.0 / (float)total)); + cpi->frame_component_time[i] = 0; + } + } +#endif + if (is_one_pass_svc(cpi)) { if (cm->show_frame) { ++cpi->svc.spatial_layer_to_encode; diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index cca8b53f8e..33a2844d3e 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -659,6 +659,72 @@ static INLINE int get_num_unit_4x4(int size) { return (size + 3) >> 2; } static INLINE int get_num_unit_16x16(int size) { return (size + 15) >> 4; } #endif // CONFIG_RATE_CTRL +#if CONFIG_COLLECT_COMPONENT_TIMING +#include "vpx_ports/vpx_timer.h" +// Adjust the following to add new components. 
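+// Each enumerator needs a matching case in get_component_name() below; a
+// region of interest is then timed by bracketing it with
+// start_timing(cpi, <component>) and end_timing(cpi, <component>).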
+typedef enum { + vp9_get_compressed_data_time, + vp9_temporal_filter_time, + vp9_rc_get_second_pass_params_time, + setup_tpl_stats_time, + Pass2Encode_time, + + encode_with_recode_loop_time, + loopfilter_frame_time, + vp9_pack_bitstream_time, + + encode_frame_internal_time, + rd_pick_partition_time, + rd_pick_sb_modes_time, + encode_sb_time, + + vp9_rd_pick_inter_mode_sb_time, + vp9_rd_pick_inter_mode_sub8x8_time, + + intra_mode_search_time, + handle_inter_mode_time, + single_motion_search_time, + joint_motion_search_time, + interp_filter_time, + + kTimingComponents, +} TIMING_COMPONENT; + +static INLINE char const *get_component_name(int index) { + switch (index) { + case vp9_get_compressed_data_time: return "vp9_get_compressed_data_time"; + case vp9_temporal_filter_time: return "vp9_temporal_filter_time"; + case vp9_rc_get_second_pass_params_time: + return "vp9_rc_get_second_pass_params_time"; + case setup_tpl_stats_time: return "setup_tpl_stats_time"; + case Pass2Encode_time: return "Pass2Encode_time"; + + case encode_with_recode_loop_time: return "encode_with_recode_loop_time"; + case loopfilter_frame_time: return "loopfilter_frame_time"; + case vp9_pack_bitstream_time: return "vp9_pack_bitstream_time"; + + case encode_frame_internal_time: return "encode_frame_internal_time"; + case rd_pick_partition_time: return "rd_pick_partition_time"; + case rd_pick_sb_modes_time: return "rd_pick_sb_modes_time"; + case encode_sb_time: return "encode_sb_time"; + + case vp9_rd_pick_inter_mode_sb_time: + return "vp9_rd_pick_inter_mode_sb_time"; + case vp9_rd_pick_inter_mode_sub8x8_time: + return "vp9_rd_pick_inter_mode_sub8x8_time"; + + case intra_mode_search_time: return "intra_mode_search_time"; + case handle_inter_mode_time: return "handle_inter_mode_time"; + case single_motion_search_time: return "single_motion_search_time"; + case joint_motion_search_time: return "joint_motion_search_time"; + case interp_filter_time: return "interp_filter_time"; + + default: assert(0); + } + return "error"; +} +#endif + typedef struct VP9_COMP { FRAME_INFO frame_info; QUANTS quants; @@ -973,6 +1039,22 @@ typedef struct VP9_COMP { EXT_RATECTRL ext_ratectrl; int fixed_qp_onepass; + +#if CONFIG_COLLECT_COMPONENT_TIMING + /*! + * component_time[] are initialized to zero while encoder starts. + */ + uint64_t component_time[kTimingComponents]; + /*! + * Stores timing for individual components between calls of start_timing() + * and end_timing(). + */ + struct vpx_usec_timer component_timer[kTimingComponents]; + /*! + * frame_component_time[] are initialized to zero at beginning of each frame. 
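+   * They accumulate into component_time[] and are reset to zero once the
+   * per-frame timings have been printed.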
+ */ + uint64_t frame_component_time[kTimingComponents]; +#endif } VP9_COMP; #if CONFIG_RATE_CTRL @@ -1392,6 +1474,25 @@ int vp9_get_psnr(const VP9_COMP *cpi, PSNR_STATS *psnr); #define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl)) +#if CONFIG_COLLECT_COMPONENT_TIMING +static INLINE void start_timing(VP9_COMP *cpi, int component) { + vpx_usec_timer_start(&cpi->component_timer[component]); +} +static INLINE void end_timing(VP9_COMP *cpi, int component) { + vpx_usec_timer_mark(&cpi->component_timer[component]); + cpi->frame_component_time[component] += + vpx_usec_timer_elapsed(&cpi->component_timer[component]); +} +static INLINE char const *get_frame_type_enum(int type) { + switch (type) { + case 0: return "KEY_FRAME"; + case 1: return "INTER_FRAME"; + default: assert(0); + } + return "error"; +} +#endif + #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index a464ce38f1..d9b031cdc8 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -2832,8 +2832,14 @@ static int64_t handle_inter_mode( frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int; if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, joint_motion_search_time); +#endif joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, single_newmv, &rate_mv); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, joint_motion_search_time); +#endif } else { rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv, &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv, @@ -2845,7 +2851,13 @@ static int64_t handle_inter_mode( *rate2 += rate_mv; } else { int_mv tmp_mv; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, single_motion_search_time); +#endif single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv, &rate_mv); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, single_motion_search_time); +#endif if (tmp_mv.as_int == INVALID_MV) return INT64_MAX; frame_mv[refs[0]].as_int = xd->mi[0]->bmi[0].as_mv[0].as_int = @@ -2908,6 +2920,9 @@ static int64_t handle_inter_mode( intpel_mv = !mv_has_subpel(&mi->mv[0].as_mv); if (is_comp_pred) intpel_mv &= !mv_has_subpel(&mi->mv[1].as_mv); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, interp_filter_time); +#endif // Search for best switchable filter by checking the variance of // pred error irrespective of whether the filter will be used for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) filter_cache[i] = INT64_MAX; @@ -3005,6 +3020,9 @@ static int64_t handle_inter_mode( restore_dst_buf(xd, orig_dst, orig_dst_stride); } } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, interp_filter_time); +#endif // Set the appropriate filter mi->interp_filter = cm->interp_filter != SWITCHABLE ? 
cm->interp_filter : best_filter; @@ -3707,19 +3725,30 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, if (ref_frame == INTRA_FRAME) { TX_SIZE uv_tx; struct macroblockd_plane *const pd = &xd->plane[1]; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, intra_mode_search_time); +#endif memset(x->skip_txfm, 0, sizeof(x->skip_txfm)); super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, bsize, best_rd, recon); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, intra_mode_search_time); +#endif if (rate_y == INT_MAX) continue; uv_tx = uv_txsize_lookup[bsize][mi->tx_size][pd->subsampling_x] [pd->subsampling_y]; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, intra_mode_search_time); +#endif if (rate_uv_intra[uv_tx] == INT_MAX) { choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx], &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]); } - +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, intra_mode_search_time); +#endif rate_uv = rate_uv_tokenonly[uv_tx]; distortion_uv = dist_uv[uv_tx]; skippable = skippable && skip_uv[uv_tx]; @@ -3730,11 +3759,17 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, rate2 += intra_cost_penalty; distortion2 = distortion_y + distortion_uv; } else { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, handle_inter_mode_time); +#endif this_rd = handle_inter_mode( cpi, x, bsize, &rate2, &distortion2, &skippable, &rate_y, &rate_uv, recon, &disable_skip, frame_mv, mi_row, mi_col, single_newmv, single_inter_filter, single_skippable, &total_sse, best_rd, &mask_filter, filter_cache); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, handle_inter_mode_time); +#endif if (this_rd == INT64_MAX) continue; compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred); From 8047e6f2b3b1c325fae106951c3aee747fde7884 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Fri, 27 Jan 2023 16:16:16 +0000 Subject: [PATCH 507/926] Refactor Neon implementation of SAD4D functions Refactor and optimize the Neon implementation of SAD4D functions - effectively backporting these libaom changes[1,2]. 
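
The core of the dot-product path, mirroring the new sad16_neon() helper in
this patch: each row of absolute differences is reduced with a single UDOT
against a vector of ones, accumulating four byte differences per 32-bit
lane.

  static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
                                uint32x4_t *const sad_sum) {
    uint8x16_t abs_diff = vabdq_u8(src, ref);
    *sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1));
  }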
[1] https://aomedia-review.googlesource.com/c/aom/+/162181 [2] https://aomedia-review.googlesource.com/c/aom/+/162183 Change-Id: Icb04bd841d86f2d0e2596aa7ba86b74f8d2d360b --- vpx_dsp/arm/sad4d_neon.c | 881 +++++++++++++-------------------------- vpx_dsp/arm/sum_neon.h | 17 + 2 files changed, 309 insertions(+), 589 deletions(-) diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 5fc621aee1..5064770ee6 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -17,633 +17,336 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" -static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0, - const void *const buf1) { - uint32_t a; - uint32x2_t aa; - memcpy(&a, buf0, 4); - aa = vdup_n_u32(a); - memcpy(&a, buf1, 4); - aa = vset_lane_u32(a, aa, 1); - return vreinterpret_u8_u32(aa); -} - -static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride, - const uint8_t *const ref_array[4], - const int ref_stride, const int height, - uint32_t sad_array[4]) { - int i; - uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) }; -#if !defined(__aarch64__) - uint16x4_t a[2]; -#endif - uint32x4_t r; - - assert(!((intptr_t)src_ptr % sizeof(uint32_t))); - assert(!(src_stride % sizeof(uint32_t))); - - for (i = 0; i < height; ++i) { - const uint8x8_t s = vreinterpret_u8_u32( - vld1_dup_u32((const uint32_t *)(src_ptr + i * src_stride))); - const uint8x8_t ref01 = load_unaligned_2_buffers( - ref_array[0] + i * ref_stride, ref_array[1] + i * ref_stride); - const uint8x8_t ref23 = load_unaligned_2_buffers( - ref_array[2] + i * ref_stride, ref_array[3] + i * ref_stride); - abs[0] = vabal_u8(abs[0], s, ref01); - abs[1] = vabal_u8(abs[1], s, ref23); - } - -#if defined(__aarch64__) - abs[0] = vpaddq_u16(abs[0], abs[1]); - r = vpaddlq_u16(abs[0]); -#else - a[0] = vpadd_u16(vget_low_u16(abs[0]), vget_high_u16(abs[0])); - a[1] = vpadd_u16(vget_low_u16(abs[1]), vget_high_u16(abs[1])); - r = vpaddlq_u16(vcombine_u16(a[0], a[1])); -#endif - vst1q_u32(sad_array, r); -} - -void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 4, sad_array); -} - -void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 8, sad_array); -} - -//////////////////////////////////////////////////////////////////////////////// - -// Can handle 512 pixels' sad sum (such as 16x32 or 32x16) -static INLINE void sad_512_pel_final_neon(const uint16x8_t sum[4], - uint32_t sad_array[4]) { -#if defined(__aarch64__) - const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); - const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); - const uint16x8_t b0 = vpaddq_u16(a0, a1); - const uint32x4_t r = vpaddlq_u16(b0); -#else - const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); - const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); - const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); - const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3])); - const uint16x4_t b0 = vpadd_u16(a0, a1); - const uint16x4_t b1 = vpadd_u16(a2, a3); - const uint32x4_t r = vpaddlq_u16(vcombine_u16(b0, b1)); -#endif - vst1q_u32(sad_array, r); -} +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) -#if defined(__arm__) || 
!defined(__ARM_FEATURE_DOTPROD) - -// Can handle 1024 pixels' sad sum (such as 32x32) -static INLINE void sad_1024_pel_final_neon(const uint16x8_t sum[4], - uint32_t sad_array[4]) { -#if defined(__aarch64__) - const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); - const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); - const uint32x4_t b0 = vpaddlq_u16(a0); - const uint32x4_t b1 = vpaddlq_u16(a1); - const uint32x4_t r = vpaddq_u32(b0, b1); - vst1q_u32(sad_array, r); -#else - const uint16x4_t a0 = vpadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); - const uint16x4_t a1 = vpadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); - const uint16x4_t a2 = vpadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); - const uint16x4_t a3 = vpadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3])); - const uint32x4_t b0 = vpaddlq_u16(vcombine_u16(a0, a1)); - const uint32x4_t b1 = vpaddlq_u16(vcombine_u16(a2, a3)); - const uint32x2_t c0 = vpadd_u32(vget_low_u32(b0), vget_high_u32(b0)); - const uint32x2_t c1 = vpadd_u32(vget_low_u32(b1), vget_high_u32(b1)); - vst1q_u32(sad_array, vcombine_u32(c0, c1)); -#endif +static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref, + uint32x4_t *const sad_sum) { + uint8x16_t abs_diff = vabdq_u8(src, ref); + *sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1)); } -// Can handle 2048 pixels' sad sum (such as 32x64 or 64x32) -static INLINE void sad_2048_pel_final_neon(const uint16x8_t sum[4], - uint32_t sad_array[4]) { -#if defined(__aarch64__) - const uint32x4_t a0 = vpaddlq_u16(sum[0]); - const uint32x4_t a1 = vpaddlq_u16(sum[1]); - const uint32x4_t a2 = vpaddlq_u16(sum[2]); - const uint32x4_t a3 = vpaddlq_u16(sum[3]); - const uint32x4_t b0 = vpaddq_u32(a0, a1); - const uint32x4_t b1 = vpaddq_u32(a2, a3); - const uint32x4_t r = vpaddq_u32(b0, b1); - vst1q_u32(sad_array, r); -#else - const uint32x4_t a0 = vpaddlq_u16(sum[0]); - const uint32x4_t a1 = vpaddlq_u16(sum[1]); - const uint32x4_t a2 = vpaddlq_u16(sum[2]); - const uint32x4_t a3 = vpaddlq_u16(sum[3]); - const uint32x2_t b0 = vadd_u32(vget_low_u32(a0), vget_high_u32(a0)); - const uint32x2_t b1 = vadd_u32(vget_low_u32(a1), vget_high_u32(a1)); - const uint32x2_t b2 = vadd_u32(vget_low_u32(a2), vget_high_u32(a2)); - const uint32x2_t b3 = vadd_u32(vget_low_u32(a3), vget_high_u32(a3)); - const uint32x2_t c0 = vpadd_u32(b0, b1); - const uint32x2_t c1 = vpadd_u32(b2, b3); - vst1q_u32(sad_array, vcombine_u32(c0, c1)); -#endif +static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint32x4_t res0, res1; + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = 0; + do { + uint8x16_t s0, s1, s2, s3; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + s2 = vld1q_u8(src + i * src_stride + 32); + sad16_neon(s2, 
vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]); + sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]); + sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]); + sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]); + + s3 = vld1q_u8(src + i * src_stride + 48); + sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]); + sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]); + sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]); + sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]); + + i++; + } while (i < h); + + res0 = vpaddq_u32(vaddq_u32(sum_lo[0], sum_hi[0]), + vaddq_u32(sum_lo[1], sum_hi[1])); + res1 = vpaddq_u32(vaddq_u32(sum_lo[2], sum_hi[2]), + vaddq_u32(sum_lo[3], sum_hi[3])); + vst1q_u32(res, vpaddq_u32(res0, res1)); } -// Can handle 4096 pixels' sad sum (such as 64x64) -static INLINE void sad_4096_pel_final_neon(const uint16x8_t sum[8], - uint32_t sad_array[4]) { -#if defined(__aarch64__) - const uint32x4_t a0 = vpaddlq_u16(sum[0]); - const uint32x4_t a1 = vpaddlq_u16(sum[1]); - const uint32x4_t a2 = vpaddlq_u16(sum[2]); - const uint32x4_t a3 = vpaddlq_u16(sum[3]); - const uint32x4_t a4 = vpaddlq_u16(sum[4]); - const uint32x4_t a5 = vpaddlq_u16(sum[5]); - const uint32x4_t a6 = vpaddlq_u16(sum[6]); - const uint32x4_t a7 = vpaddlq_u16(sum[7]); - const uint32x4_t b0 = vaddq_u32(a0, a1); - const uint32x4_t b1 = vaddq_u32(a2, a3); - const uint32x4_t b2 = vaddq_u32(a4, a5); - const uint32x4_t b3 = vaddq_u32(a6, a7); - const uint32x4_t c0 = vpaddq_u32(b0, b1); - const uint32x4_t c1 = vpaddq_u32(b2, b3); - const uint32x4_t r = vpaddq_u32(c0, c1); - vst1q_u32(sad_array, r); -#else - const uint32x4_t a0 = vpaddlq_u16(sum[0]); - const uint32x4_t a1 = vpaddlq_u16(sum[1]); - const uint32x4_t a2 = vpaddlq_u16(sum[2]); - const uint32x4_t a3 = vpaddlq_u16(sum[3]); - const uint32x4_t a4 = vpaddlq_u16(sum[4]); - const uint32x4_t a5 = vpaddlq_u16(sum[5]); - const uint32x4_t a6 = vpaddlq_u16(sum[6]); - const uint32x4_t a7 = vpaddlq_u16(sum[7]); - const uint32x4_t b0 = vaddq_u32(a0, a1); - const uint32x4_t b1 = vaddq_u32(a2, a3); - const uint32x4_t b2 = vaddq_u32(a4, a5); - const uint32x4_t b3 = vaddq_u32(a6, a7); - const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0)); - const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1)); - const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2)); - const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3)); - const uint32x2_t d0 = vpadd_u32(c0, c1); - const uint32x2_t d1 = vpadd_u32(c2, c3); - vst1q_u32(sad_array, vcombine_u32(d0, d1)); -#endif +static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint32x4_t res0, res1; + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = 0; + do { + uint8x16_t s0, s1; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), 
&sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + i++; + } while (i < h); + + res0 = vpaddq_u32(vaddq_u32(sum_lo[0], sum_hi[0]), + vaddq_u32(sum_lo[1], sum_hi[1])); + res1 = vpaddq_u32(vaddq_u32(sum_lo[2], sum_hi[2]), + vaddq_u32(sum_lo[3], sum_hi[3])); + vst1q_u32(res, vpaddq_u32(res0, res1)); } -#endif - -static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4], const int height) { - int i, j; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; - - for (i = 0; i < height; ++i) { - const uint8x8_t s = vld1_u8(src_ptr); - src_ptr += src_stride; - for (j = 0; j < 4; ++j) { - const uint8x8_t b_u8 = vld1_u8(ref_loop[j]); - ref_loop[j] += ref_stride; - sum[j] = vabal_u8(sum[j], s, b_u8); - } - } - - sad_512_pel_final_neon(sum, sad_array); -} +static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint32x4_t res0, res1; + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; -void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 4); -} + int i = 0; + do { + const uint8x16_t s = vld1q_u8(src + i * src_stride); + sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]); + sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]); + sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]); + sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]); -void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 8); -} + i++; + } while (i < h); -void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16); + res0 = vpaddq_u32(sum[0], sum[1]); + res1 = vpaddq_u32(sum[2], sum[3]); + vst1q_u32(res, vpaddq_u32(res0, res1)); } -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) -static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, - uint32x4_t *const sum) { - const uint8x16_t r = vld1q_u8(ref_ptr); - const uint8x16_t diff = vabdq_u8(src_ptr, r); - *sum = vdotq_u32(*sum, diff, vdupq_n_u8(1)); +static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref, + uint16x8_t *const sad_sum) { + uint8x16_t abs_diff = vabdq_u8(src, ref); + *sad_sum = vpadalq_u8(*sad_sum, abs_diff); } -static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4], const int height) { - int i; - uint32x4_t r0, r1; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0) }; - - for (i = 0; i < height; ++i) { - const uint8x16_t s = 
vld1q_u8(src_ptr + i * src_stride); - sad16_neon(ref_loop[0] + i * ref_stride, s, &sum[0]); - sad16_neon(ref_loop[1] + i * ref_stride, s, &sum[1]); - sad16_neon(ref_loop[2] + i * ref_stride, s, &sum[2]); - sad16_neon(ref_loop[3] + i * ref_stride, s, &sum[3]); - } - - r0 = vpaddq_u32(sum[0], sum[1]); - r1 = vpaddq_u32(sum[2], sum[3]); - vst1q_u32(sad_array, vpaddq_u32(r0, r1)); +static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + int h_tmp = h > 64 ? 64 : h; + int i = 0; + vst1q_u32(res, vdupq_n_u32(0)); + + do { + uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + do { + uint8x16_t s0, s1, s2, s3; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + s2 = vld1q_u8(src + i * src_stride + 32); + sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]); + sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]); + sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]); + sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]); + + s3 = vld1q_u8(src + i * src_stride + 48); + sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]); + sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]); + sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]); + sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]); + + i++; + } while (i < h_tmp); + + res[0] += horizontal_long_add_uint16x8(sum_lo[0], sum_hi[0]); + res[1] += horizontal_long_add_uint16x8(sum_lo[1], sum_hi[1]); + res[2] += horizontal_long_add_uint16x8(sum_lo[2], sum_hi[2]); + res[3] += horizontal_long_add_uint16x8(sum_lo[3], sum_hi[3]); + + h_tmp += 64; + } while (i < h); } -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) - -static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, - uint16x8_t *const sum) { - const uint8x16_t r = vld1q_u8(ref_ptr); - *sum = vabal_u8(*sum, vget_low_u8(src_ptr), vget_low_u8(r)); - *sum = vabal_u8(*sum, vget_high_u8(src_ptr), vget_high_u8(r)); +static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + int i = 0; + do { + uint8x16_t s0, s1; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, 
vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + i++; + } while (i < h); + + res[0] = horizontal_long_add_uint16x8(sum_lo[0], sum_hi[0]); + res[1] = horizontal_long_add_uint16x8(sum_lo[1], sum_hi[1]); + res[2] = horizontal_long_add_uint16x8(sum_lo[2], sum_hi[2]); + res[3] = horizontal_long_add_uint16x8(sum_lo[3], sum_hi[3]); } -static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4], const int height) { - int i; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; +static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; - for (i = 0; i < height; ++i) { - const uint8x16_t s = vld1q_u8(src_ptr); - src_ptr += src_stride; - /* Manual unrolling here stops the compiler from getting confused. */ - sad16_neon(ref_loop[0], s, &sum[0]); - ref_loop[0] += ref_stride; - sad16_neon(ref_loop[1], s, &sum[1]); - ref_loop[1] += ref_stride; - sad16_neon(ref_loop[2], s, &sum[2]); - ref_loop[2] += ref_stride; - sad16_neon(ref_loop[3], s, &sum[3]); - ref_loop[3] += ref_stride; - } - - sad_512_pel_final_neon(sum, sad_array); + int i = 0; + do { + const uint8x16_t s = vld1q_u8(src + i * src_stride); + sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]); + sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]); + sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]); + sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]); + + i++; + } while (i < h); + + res[0] = horizontal_add_uint16x8(sum[0]); + res[1] = horizontal_add_uint16x8(sum[1]); + res[2] = horizontal_add_uint16x8(sum[2]); + res[3] = horizontal_add_uint16x8(sum[3]); } #endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) -void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 8); -} - -void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16); -} - -void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 32); -} - -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) - -static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4], const int height) { - int i; - uint32x4_t r0, r1; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - - uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0) }; - - for (i = 0; i < height; ++i) { - uint8x16_t s; - - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 0 * 16, 
s, &sum[2]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - ref_loop[2] += ref_stride; - ref_loop[3] += ref_stride; - } - - r0 = vpaddq_u32(sum[0], sum[1]); - r1 = vpaddq_u32(sum[2], sum[3]); - vst1q_u32(sad_array, vpaddq_u32(r0, r1)); -} - -void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16); -} - -void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 32); -} - -void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 64); -} - -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) - -static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - const int height, uint16x8_t *const sum) { - int i; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - - sum[0] = sum[1] = sum[2] = sum[3] = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - uint8x16_t s; - - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - ref_loop[2] += ref_stride; - ref_loop[3] += ref_stride; - } +static INLINE void sad8_neon(uint8x8_t src, uint8x8_t ref, + uint16x8_t *const sad_sum) { + uint8x8_t abs_diff = vabd_u8(src, ref); + *sad_sum = vaddw_u8(*sad_sum, abs_diff); } -void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - uint16x8_t sum[4]; - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 16, sum); - sad_512_pel_final_neon(sum, sad_array); -} - -void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - uint16x8_t sum[4]; - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 32, sum); - sad_1024_pel_final_neon(sum, sad_array); -} +static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; -void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - uint16x8_t sum[4]; - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 64, sum); - sad_2048_pel_final_neon(sum, sad_array); + int i = 0; + do { + 
const uint8x8_t s = vld1_u8(src + i * src_stride); + sad8_neon(s, vld1_u8(ref[0] + i * ref_stride), &sum[0]); + sad8_neon(s, vld1_u8(ref[1] + i * ref_stride), &sum[1]); + sad8_neon(s, vld1_u8(ref[2] + i * ref_stride), &sum[2]); + sad8_neon(s, vld1_u8(ref[3] + i * ref_stride), &sum[3]); + + i++; + } while (i < h); + + res[0] = horizontal_add_uint16x8(sum[0]); + res[1] = horizontal_add_uint16x8(sum[1]); + res[2] = horizontal_add_uint16x8(sum[2]); + res[3] = horizontal_add_uint16x8(sum[3]); } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) - -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) - -void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - int i; - uint32x4_t r0, r1; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0) }; - - for (i = 0; i < 32; ++i) { - uint8x16_t s; - - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 2 * 16); - sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 3 * 16); - sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - ref_loop[2] += ref_stride; - ref_loop[3] += ref_stride; - } +static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; - r0 = vpaddq_u32(sum[0], sum[1]); - r1 = vpaddq_u32(sum[2], sum[3]); - vst1q_u32(sad_array, vpaddq_u32(r0, r1)); + int i = 0; + do { + uint32x2_t s, r0, r1, r2, r3; + uint32_t s_lo, s_hi, r0_lo, r0_hi, r1_lo, r1_hi, r2_lo, r2_hi, r3_lo, r3_hi; + + memcpy(&s_lo, src + i * src_stride, 4); + memcpy(&r0_lo, ref[0] + i * ref_stride, 4); + memcpy(&r1_lo, ref[1] + i * ref_stride, 4); + memcpy(&r2_lo, ref[2] + i * ref_stride, 4); + memcpy(&r3_lo, ref[3] + i * ref_stride, 4); + s = vdup_n_u32(s_lo); + r0 = vdup_n_u32(r0_lo); + r1 = vdup_n_u32(r1_lo); + r2 = vdup_n_u32(r2_lo); + r3 = vdup_n_u32(r3_lo); + + memcpy(&s_hi, src + (i + 1) * src_stride, 4); + memcpy(&r0_hi, ref[0] + (i + 1) * ref_stride, 4); + memcpy(&r1_hi, ref[1] + (i + 1) * ref_stride, 4); + memcpy(&r2_hi, ref[2] + (i + 1) * ref_stride, 4); + memcpy(&r3_hi, ref[3] + (i + 1) * ref_stride, 4); + s = vset_lane_u32(s_hi, s, 1); + r0 = vset_lane_u32(r0_hi, r0, 1); + r1 = vset_lane_u32(r1_hi, r1, 1); + r2 = vset_lane_u32(r2_hi, r2, 1); + r3 = vset_lane_u32(r3_hi, r3, 1); + + sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r0), &sum[0]); + 
sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r1), &sum[1]); + sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r2), &sum[2]); + sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r3), &sum[3]); + + i += 2; + } while (i < h); + + res[0] = horizontal_add_uint16x8(sum[0]); + res[1] = horizontal_add_uint16x8(sum[1]); + res[2] = horizontal_add_uint16x8(sum[2]); + res[3] = horizontal_add_uint16x8(sum[3]); } -void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - int i; - uint32x4_t r0, r1, r2, r3; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - uint32x4_t sum[8] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0), vdupq_n_u32(0) }; - - for (i = 0; i < 64; ++i) { - uint8x16_t s; - - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]); - sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]); - - s = vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]); - sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]); - - s = vld1q_u8(src_ptr + 2 * 16); - sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]); - sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]); - sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]); - sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]); - - s = vld1q_u8(src_ptr + 3 * 16); - sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]); - sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]); - sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]); - sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - ref_loop[2] += ref_stride; - ref_loop[3] += ref_stride; +#define SAD_WXH_4D_NEON(w, h) \ + void vpx_sad##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], int ref_stride, \ + uint32_t res[4]) { \ + sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, res, (h)); \ } - r0 = vpaddq_u32(sum[0], sum[1]); - r1 = vpaddq_u32(sum[2], sum[3]); - r2 = vpaddq_u32(sum[4], sum[5]); - r3 = vpaddq_u32(sum[6], sum[7]); - r0 = vpaddq_u32(r0, r1); - r1 = vpaddq_u32(r2, r3); - vst1q_u32(sad_array, vpaddq_u32(r0, r1)); -} +SAD_WXH_4D_NEON(4, 4) +SAD_WXH_4D_NEON(4, 8) -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) +SAD_WXH_4D_NEON(8, 4) +SAD_WXH_4D_NEON(8, 8) +SAD_WXH_4D_NEON(8, 16) -void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - int i; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; +SAD_WXH_4D_NEON(16, 8) +SAD_WXH_4D_NEON(16, 16) +SAD_WXH_4D_NEON(16, 32) - for (i = 0; i < 32; ++i) { - uint8x16_t s; - - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); - - s 
= vld1q_u8(src_ptr + 2 * 16); - sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 3 * 16); - sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - ref_loop[2] += ref_stride; - ref_loop[3] += ref_stride; - } +SAD_WXH_4D_NEON(32, 16) +SAD_WXH_4D_NEON(32, 32) +SAD_WXH_4D_NEON(32, 64) - sad_2048_pel_final_neon(sum, sad_array); -} +SAD_WXH_4D_NEON(64, 32) +SAD_WXH_4D_NEON(64, 64) -void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - int i; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0), vdupq_n_u16(0) }; - - for (i = 0; i < 64; ++i) { - uint8x16_t s; - - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]); - sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]); - - s = vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]); - sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]); - - s = vld1q_u8(src_ptr + 2 * 16); - sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]); - sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]); - sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]); - sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]); - - s = vld1q_u8(src_ptr + 3 * 16); - sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]); - sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]); - sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]); - sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - ref_loop[2] += ref_stride; - ref_loop[3] += ref_stride; - } - - sad_4096_pel_final_neon(sum, sad_array); -} - -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#undef SAD_WXH_4D_NEON diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index 9a7c424e8e..5f20f9d99a 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -40,6 +40,23 @@ static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) { #endif } +static INLINE uint32_t horizontal_long_add_uint16x8(const uint16x8_t vec_lo, + const uint16x8_t vec_hi) { +#if defined(__aarch64__) + return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi); +#else + const uint32x4_t vec_l_lo = + vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); + const uint32x4_t vec_l_hi = + vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); + const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +#endif +} + static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) { #if defined(__aarch64__) return vaddv_s32(a); From a94cdd57ffd95ee7beb48d2794dae538f25da46c Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Mon, 30 Jan 2023 11:51:58 -0800 Subject: [PATCH 508/926] 
Fix unsigned integer overflow in sse computation Basically port the fix from libaom: https://aomedia-review.googlesource.com/c/aom/+/169361 Change-Id: Id06a5db91372037832399200ded75d514e096726 --- vpx_dsp/psnr.c | 67 ++++++++++++++++++-------------------------------- 1 file changed, 24 insertions(+), 43 deletions(-) diff --git a/vpx_dsp/psnr.c b/vpx_dsp/psnr.c index 48bac04508..f0d4e927ae 100644 --- a/vpx_dsp/psnr.c +++ b/vpx_dsp/psnr.c @@ -26,57 +26,44 @@ double vpx_sse_to_psnr(double samples, double peak, double sse) { /* TODO(yaowu): The block_variance calls the unoptimized versions of variance() * and highbd_8_variance(). It should not. */ -static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int w, int h, unsigned int *sse, - int *sum) { +static int64_t encoder_sse(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h) { int i, j; - - *sum = 0; - *sse = 0; + int64_t sse = 0; for (i = 0; i < h; i++) { for (j = 0; j < w; j++) { const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; + sse += diff * diff; } a += a_stride; b += b_stride; } + + return sse; } #if CONFIG_VP9_HIGHBITDEPTH -static void encoder_highbd_variance64(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, - int h, uint64_t *sse, int64_t *sum) { +static int64_t encoder_highbd_8_sse(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, + int h) { int i, j; + int64_t sse = 0; uint16_t *a = CONVERT_TO_SHORTPTR(a8); uint16_t *b = CONVERT_TO_SHORTPTR(b8); - *sum = 0; - *sse = 0; for (i = 0; i < h; i++) { for (j = 0; j < w; j++) { const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; + sse += diff * diff; } a += a_stride; b += b_stride; } -} -static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, - int h, unsigned int *sse, int *sum) { - uint64_t sse_long = 0; - int64_t sum_long = 0; - encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, - &sum_long); - *sse = (unsigned int)sse_long; - *sum = (int)sum_long; + return sse; } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -85,26 +72,23 @@ static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b, const int dw = width % 16; const int dh = height % 16; int64_t total_sse = 0; - unsigned int sse = 0; - int sum = 0; int x, y; if (dw > 0) { - encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, dw, - height, &sse, &sum); - total_sse += sse; + total_sse += encoder_sse(&a[width - dw], a_stride, &b[width - dw], b_stride, + dw, height); } if (dh > 0) { - encoder_variance(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, width - dw, dh, - &sse, &sum); - total_sse += sse; + total_sse += + encoder_sse(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, width - dw, dh); } for (y = 0; y < height / 16; ++y) { const uint8_t *pa = a; const uint8_t *pb = b; + unsigned int sse; for (x = 0; x < width / 16; ++x) { vpx_mse16x16(pa, a_stride, pb, b_stride, &sse); total_sse += sse; @@ -146,22 +130,19 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b, int x, y; const int dw = width % 16; const int dh = height % 16; - unsigned int sse = 0; - int sum = 0; if (dw > 0) { - encoder_highbd_8_variance(&a[width - dw], a_stride, &b[width - dw], - b_stride, dw, height, &sse, &sum); - total_sse += sse; + total_sse += encoder_highbd_8_sse(&a[width - dw], a_stride, &b[width - dw], + b_stride, 
dw, height); } if (dh > 0) { - encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, - width - dw, dh, &sse, &sum); - total_sse += sse; + total_sse += encoder_highbd_8_sse(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, + width - dw, dh); } for (y = 0; y < height / 16; ++y) { const uint8_t *pa = a; const uint8_t *pb = b; + unsigned int sse; for (x = 0; x < width / 16; ++x) { vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse); total_sse += sse; From 472c839c9f6e88e976faebdc05848e64e7f3945d Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Tue, 31 Jan 2023 13:32:33 +0000 Subject: [PATCH 509/926] Use load_unaligned mem_neon.h helpers in SAD and SAD4D Use the load_unaligned helper functions in mem_neon.h to load strided sequences of 4 bytes where alignment is not guaranteed in the Neon SAD and SAD4D paths. Change-Id: I941d226ef94fd7a633b09fc92165a00ba68a1501 --- vpx_dsp/arm/sad4d_neon.c | 39 +++++++++----------------------- vpx_dsp/arm/sad_neon.c | 48 ++++++++++------------------------------ 2 files changed, 22 insertions(+), 65 deletions(-) diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 5064770ee6..85f6c1e5b1 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -285,35 +285,16 @@ static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride, int i = 0; do { - uint32x2_t s, r0, r1, r2, r3; - uint32_t s_lo, s_hi, r0_lo, r0_hi, r1_lo, r1_hi, r2_lo, r2_hi, r3_lo, r3_hi; - - memcpy(&s_lo, src + i * src_stride, 4); - memcpy(&r0_lo, ref[0] + i * ref_stride, 4); - memcpy(&r1_lo, ref[1] + i * ref_stride, 4); - memcpy(&r2_lo, ref[2] + i * ref_stride, 4); - memcpy(&r3_lo, ref[3] + i * ref_stride, 4); - s = vdup_n_u32(s_lo); - r0 = vdup_n_u32(r0_lo); - r1 = vdup_n_u32(r1_lo); - r2 = vdup_n_u32(r2_lo); - r3 = vdup_n_u32(r3_lo); - - memcpy(&s_hi, src + (i + 1) * src_stride, 4); - memcpy(&r0_hi, ref[0] + (i + 1) * ref_stride, 4); - memcpy(&r1_hi, ref[1] + (i + 1) * ref_stride, 4); - memcpy(&r2_hi, ref[2] + (i + 1) * ref_stride, 4); - memcpy(&r3_hi, ref[3] + (i + 1) * ref_stride, 4); - s = vset_lane_u32(s_hi, s, 1); - r0 = vset_lane_u32(r0_hi, r0, 1); - r1 = vset_lane_u32(r1_hi, r1, 1); - r2 = vset_lane_u32(r2_hi, r2, 1); - r3 = vset_lane_u32(r3_hi, r3, 1); - - sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r0), &sum[0]); - sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r1), &sum[1]); - sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r2), &sum[2]); - sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r3), &sum[3]); + uint8x8_t s = load_unaligned_u8(src + i * src_stride, src_stride); + uint8x8_t r0 = load_unaligned_u8(ref[0] + i * ref_stride, ref_stride); + uint8x8_t r1 = load_unaligned_u8(ref[1] + i * ref_stride, ref_stride); + uint8x8_t r2 = load_unaligned_u8(ref[2] + i * ref_stride, ref_stride); + uint8x8_t r3 = load_unaligned_u8(ref[3] + i * ref_stride, ref_stride); + + sad8_neon(s, r0, &sum[0]); + sad8_neon(s, r1, &sum[1]); + sad8_neon(s, r2, &sum[2]); + sad8_neon(s, r3, &sum[3]); i += 2; } while (i < h); diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index 7336edb694..9382b80626 100644 --- a/vpx_dsp/arm/sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -214,24 +214,13 @@ static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride, int i = h / 2; do { - uint32x2_t s, r; - uint32_t s0, s1, r0, r1; + uint8x8_t s = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride); - 
memcpy(&s0, src_ptr, 4); - memcpy(&r0, ref_ptr, 4); - s = vdup_n_u32(s0); - r = vdup_n_u32(r0); - src_ptr += src_stride; - ref_ptr += ref_stride; - - memcpy(&s1, src_ptr, 4); - memcpy(&r1, ref_ptr, 4); - s = vset_lane_u32(s1, s, 1); - r = vset_lane_u32(r1, r, 1); - src_ptr += src_stride; - ref_ptr += ref_stride; + sum = vabal_u8(sum, s, r); - sum = vabal_u8(sum, vreinterpret_u8_u32(s), vreinterpret_u8_u32(r)); + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; } while (--i != 0); return horizontal_add_uint16x8(sum); @@ -509,28 +498,15 @@ static INLINE unsigned int sad4xh_avg_neon(const uint8_t *src_ptr, int i = h / 2; do { - uint32x2_t s, r; - uint32_t s0, s1, r0, r1; - uint8x8_t p, avg; - - memcpy(&s0, src_ptr, 4); - memcpy(&r0, ref_ptr, 4); - s = vdup_n_u32(s0); - r = vdup_n_u32(r0); - src_ptr += src_stride; - ref_ptr += ref_stride; - - memcpy(&s1, src_ptr, 4); - memcpy(&r1, ref_ptr, 4); - s = vset_lane_u32(s1, s, 1); - r = vset_lane_u32(r1, r, 1); - src_ptr += src_stride; - ref_ptr += ref_stride; + uint8x8_t s = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride); + uint8x8_t p = vld1_u8(second_pred); - p = vld1_u8(second_pred); - avg = vrhadd_u8(vreinterpret_u8_u32(r), p); + uint8x8_t avg = vrhadd_u8(r, p); + sum = vabal_u8(sum, s, avg); - sum = vabal_u8(sum, vreinterpret_u8_u32(s), avg); + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; second_pred += 8; } while (--i != 0); From 3f109f786a91a40bd6feb87ce133f351afeff63b Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 31 Jan 2023 12:16:38 -0500 Subject: [PATCH 510/926] Update CHANGELOG Bug: webm:1780 Change-Id: I3ab4729bff1d27ef7127ef26e780a469e9278c21 --- CHANGELOG | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index 4f5dcbd44e..3fb2d19bbe 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,39 @@ +2023-01-31 v1.13.0 "Ugly Duckling" + This release includes more Neon and AVX2 optimizations, adds a new codec + control to set per frame QP, upgrades GoogleTest to v1.12.1, and includes + numerous bug fixes. + + - Upgrading: + This release is ABI incompatible with the previous release. + + New codec control VP9E_SET_QUANTIZER_ONE_PASS to set per frame QP. + + GoogleTest is upgraded to v1.12.1. + + .clang-format is upgraded to clang-format-11. + + VPX_EXT_RATECTRL_ABI_VERSION was bumped due to incompatible changes to the + feature of using external rate control models for vp9. + + - Enhancement: + Numerous improvements on Neon optimizations. + Numerous improvements on AVX2 optimizations. + Additional ARM targets added for Visual Studio. + + - Bug fixes: + Fix to calculating internal stats when frame dropped. + Fix to segfault for external resize test in vp9. + Fix to build system with replacing egrep with grep -E. + Fix to a few bugs with external RTC rate control library. + Fix to make SVC work with VBR. + Fix to key frame setting in VP9 external RC. + Fix to -Wimplicit-int (Clang 16). + Fix to VP8 external RC for buffer levels. + Fix to VP8 external RC for dynamic update of layers. + Fix to VP9 auto level. + Fix to off-by-one error of max w/h in validate_config. + Fix to make SVC work for Profile 1. 
+ 2022-06-17 v1.12.0 "Torrent Duck" This release adds optimizations for Loongarch, adds support for vp8 in the real-time rate control library, upgrades GoogleTest to v1.11.0, updates From aa5b62236a4d40679372c28ce37b95f965260aec Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 25 Jan 2023 19:25:12 -0500 Subject: [PATCH 511/926] Fix per frame qp for temporal layers Also add tests with fixed temporal layering mode. Change-Id: If516fe94e3fb7f5a745821d1788bfe6cf90edaac (cherry picked from commit db69ce6aea278bee88668fd9cc2af2e544516fdb) --- test/vp9_datarate_test.cc | 66 +++++++++++++++++++++++++----- vp9/encoder/vp9_svc_layercontext.c | 4 ++ 2 files changed, 59 insertions(+), 11 deletions(-) diff --git a/test/vp9_datarate_test.cc b/test/vp9_datarate_test.cc index eccb001071..7e91807492 100644 --- a/test/vp9_datarate_test.cc +++ b/test/vp9_datarate_test.cc @@ -148,14 +148,16 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest { if (video->frame() == 0) { encoder->Control(VP9E_SET_SVC, 1); } - vpx_svc_layer_id_t layer_id; - layer_id.spatial_layer_id = 0; - frame_flags_ = GetFrameFlags(video->frame(), cfg_.ts_number_layers); - layer_id.temporal_layer_id = - SetLayerId(video->frame(), cfg_.ts_number_layers); - layer_id.temporal_layer_id_per_spatial[0] = - SetLayerId(video->frame(), cfg_.ts_number_layers); - encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); + if (cfg_.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + vpx_svc_layer_id_t layer_id; + frame_flags_ = GetFrameFlags(video->frame(), cfg_.ts_number_layers); + layer_id.spatial_layer_id = 0; + layer_id.temporal_layer_id = + SetLayerId(video->frame(), cfg_.ts_number_layers); + layer_id.temporal_layer_id_per_spatial[0] = + SetLayerId(video->frame(), cfg_.ts_number_layers); + encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); + } } const vpx_rational_t tb = video->timebase(); timebase_ = static_cast(tb.num) / tb.den; @@ -830,25 +832,37 @@ class DatarateTestVP9FrameQp ::libvpx_test::Encoder *encoder) { set_cpu_used_ = 7; DatarateTestVP9::PreEncodeFrameHook(video, encoder); - ACMRandom rnd; - frame_qp_ = static_cast(rnd.RandRange(64)); + frame_qp_ = static_cast(rnd_.RandRange(64)); encoder->Control(VP9E_SET_QUANTIZER_ONE_PASS, frame_qp_); frame_++; } virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { int qp = 0; + vpx_svc_layer_id_t layer_id; if (frame_ >= total_frame_) return; encoder->Control(VP8E_GET_LAST_QUANTIZER_64, &qp); ASSERT_EQ(frame_qp_, qp); + encoder->Control(VP9E_GET_SVC_LAYER_ID, &layer_id); + temporal_layer_id_ = layer_id.temporal_layer_id; + } + + virtual void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) { + if (frame_ >= total_frame_) return; + ASSERT_TRUE(cfg_.temporal_layering_mode == + VP9E_TEMPORAL_LAYERING_MODE_0212 && + temporal_layer_id_ == 2); } protected: int total_frame_; private: + ACMRandom rnd_; int frame_qp_; int frame_; + int temporal_layer_id_; }; TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp) { @@ -868,7 +882,7 @@ TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } -TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayers) { +TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayersBypass) { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; cfg_.rc_buf_sz = 1000; @@ -897,6 +911,36 @@ TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayers) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } +TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayersFixedMode) { + 
cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_min_quantizer = 0; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1). + cfg_.ss_number_layers = 1; + cfg_.ts_number_layers = 3; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_0212; + cfg_.rc_target_bitrate = 200; + cfg_.g_error_resilient = 1; + total_frame_ = 400; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, total_frame_); + ResetModel(); + cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + #if CONFIG_VP9_TEMPORAL_DENOISING // Params: speed setting. class DatarateTestVP9RealTimeDenoiser : public DatarateTestVP9RealTime { diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 518c00b34a..7e9435fb5f 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -894,6 +894,10 @@ int vp9_one_pass_svc_start_layer(VP9_COMP *const cpi) { RATE_CONTROL *const lrc = &lc->rc; lrc->worst_quality = vp9_quantizer_to_qindex(lc->max_q); lrc->best_quality = vp9_quantizer_to_qindex(lc->min_q); + if (cpi->fixed_qp_onepass) { + lrc->worst_quality = cpi->rc.worst_quality; + lrc->best_quality = cpi->rc.best_quality; + } } if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && svc->single_layer_svc == 1 && From b5a2b3a92948a52f03719557a97d2c5e3617b0bb Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 1 Feb 2023 11:38:42 -0500 Subject: [PATCH 512/926] Update AUTHORS .mailmap and version Bug: webm:1780 Change-Id: I75a24bdd076dc1746b23bababfaafccbce3b4214 --- .mailmap | 1 + AUTHORS | 3 +++ libs.mk | 4 ++-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.mailmap b/.mailmap index 3593de4b9c..bb0ddd95b2 100644 --- a/.mailmap +++ b/.mailmap @@ -25,6 +25,7 @@ Johann Koenig Johann John Koleszar Joshua Litt +Konstantinos Margaritis Marco Paniconi Marco Paniconi Martin Storsjö diff --git a/AUTHORS b/AUTHORS index 536e0e7cf0..2db4a113e4 100644 --- a/AUTHORS +++ b/AUTHORS @@ -21,6 +21,7 @@ Andoni Morales Alastruey Andres Mejia Andrew Lewis Andrew Russell +Andrew Salkeld Angie Chen Angie Chiang Anton Venema @@ -175,7 +176,9 @@ Rob Bradford Ronald S. 
Bultje Rui Ueyama Sai Deng +Salome Thirot Sami Pietilä +Sam James Sarah Parker Sasi Inguva Scott Graham diff --git a/libs.mk b/libs.mk index fb6fbbeb20..1f7f03aa38 100644 --- a/libs.mk +++ b/libs.mk @@ -312,8 +312,8 @@ $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS) # To determine SO_VERSION_{MAJOR,MINOR,PATCH}, calculate c,a,r with current # SO_VERSION_* then follow the rules in the link to detemine the new version # (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1 -SO_VERSION_MAJOR := 7 -SO_VERSION_MINOR := 1 +SO_VERSION_MAJOR := 8 +SO_VERSION_MINOR := 0 SO_VERSION_PATCH := 0 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS)) LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib From 858a8c611f4c965078485860a6820e2135e6611b Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 1 Feb 2023 13:27:06 -0800 Subject: [PATCH 513/926] vp9_diamond_search_sad_neon: use DECLARE_ALIGNED rather than the gcc specific __attribute__((aligned())); fixes build targeting ARM64 windows. Bug: webm:1788 Change-Id: I2210fc215f44d90c1ce9dee9b54888eb1b78c99e --- vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c index 33753f77b0..997775a668 100644 --- a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c +++ b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c @@ -220,7 +220,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, // Look up the component cost of the residual motion vector { uint32_t cost[4]; - int16_t __attribute__((aligned(16))) rowcol[8]; + DECLARE_ALIGNED(16, int16_t, rowcol[8]); vst1q_s16(rowcol, v_diff_mv_w); // Note: This is a use case for gather instruction From d6382e4469e8864477139636207d0c056066e526 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Thu, 2 Feb 2023 16:30:09 -0800 Subject: [PATCH 514/926] Fix uninitialized mesh feature for BEST mode At BEST encoding mode, the mesh search range wasn't initialized for non FC_GRAPHICS_ANIMATION content type, which actually/mistakenly used speed 0's setting. Fixed it by adding the initialization. There were 2 ways to fix this. Patchset 1 set to use speed 0's setting for non FC_GRAPHICS_ANIMATION type. This didn't change BEST mode's encoding results much, and only a couple of clips' results were changed. Borg result for BEST mode: avg_psnr: ovr_psnr: ssim: encoding_spdup: lowres2: -0.004 -0.003 -0.000 0.030 midres2: -0.006 -0.009 -0.012 0.033 hdres2: 0.002 0.002 0.004 0.015 Patchset 2 set to use BEST's setting for non FC_GRAPHICS_ANIMATION type. However, the majority of test clips' BDrate got changed up to ~0.5% (gain or loss), and overall it didn't give better performance than patchset 1. So, we chose to use patchset 1. Change-Id: Ibbf578dad04420e6ba22cb9a3ddec137a7e4deef --- vp9/encoder/vp9_speed_features.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 0431d8a452..41a742c5a1 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -16,8 +16,11 @@ #include "vpx_dsp/vpx_dsp_common.h" // Mesh search patters for various speed settings -static MESH_PATTERN best_quality_mesh_pattern[MAX_MESH_STEP] = { - { 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 } +// Define 2 mesh density levels for FC_GRAPHICS_ANIMATION content type and non +// FC_GRAPHICS_ANIMATION content type. 
+static MESH_PATTERN best_quality_mesh_pattern[2][MAX_MESH_STEP] = { + { { 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 } }, + { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } }, }; #if !CONFIG_REALTIME_ONLY @@ -991,10 +994,14 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) { sf->exhaustive_searches_thresh = (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 20) : INT_MAX; - if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { + { + const int mesh_density_level = + (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? 0 : 1; for (i = 0; i < MAX_MESH_STEP; ++i) { - sf->mesh_patterns[i].range = best_quality_mesh_pattern[i].range; - sf->mesh_patterns[i].interval = best_quality_mesh_pattern[i].interval; + sf->mesh_patterns[i].range = + best_quality_mesh_pattern[mesh_density_level][i].range; + sf->mesh_patterns[i].interval = + best_quality_mesh_pattern[mesh_density_level][i].interval; } } From 18a3421b7df75468b2f3d6f69c8a1ae906dde3ae Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Fri, 3 Feb 2023 14:07:09 -0800 Subject: [PATCH 515/926] Set _img->bit_depth in y4m_input_fetch_frame() This is a port of https://aomedia-review.googlesource.com/c/aom/+/169961. Change-Id: I2aa0d12cafde0c73448bf8c57eab0cd92e846468 --- y4minput.c | 1 + 1 file changed, 1 insertion(+) diff --git a/y4minput.c b/y4minput.c index 745e2f1cd6..210ce52fce 100644 --- a/y4minput.c +++ b/y4minput.c @@ -1148,6 +1148,7 @@ int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *_img) { _img->fmt = _y4m->vpx_fmt; _img->w = _img->d_w = _y4m->pic_w; _img->h = _img->d_h = _y4m->pic_h; + _img->bit_depth = _y4m->bit_depth; _img->x_chroma_shift = _y4m->dst_c_dec_h >> 1; _img->y_chroma_shift = _y4m->dst_c_dec_v >> 1; _img->bps = _y4m->bps; From e3028ddbb408381601ab8d2c67be37124a9726e5 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Wed, 1 Feb 2023 16:37:24 +0000 Subject: [PATCH 516/926] Optimize Neon implementation of high bitdepth SAD functions Optimizations take a similar form to those implemented for standard bitdepth SAD: - Use ABD, UADALP instead of ABAL, ABAL2 (double the throughput on modern out-of-order Arm-designed cores.) - Use more accumulator registers to make better use of Neon pipeline resources on Arm CPUs that have four Neon pipes. 
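As a sketch of the first point above, the per-row kernel for an 8-lane chunk
of 16-bit pixels becomes a single ABD feeding a single UADALP, replacing the
previous ABAL/ABAL2 pair (hbd_sad8_row is an illustrative name, not code from
this patch):

    #include <arm_neon.h>

    static inline uint32x4_t hbd_sad8_row(uint16x8_t s, uint16x8_t r,
                                          uint32x4_t sum) {
      /* Absolute difference of eight 16-bit pixels, then a widening
       * pairwise add-accumulate into four u32 lanes. */
      return vpadalq_u16(sum, vabdq_u16(s, r));
    }

The wider block sizes then keep four such accumulators live per reference
block, matching the second point above.
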
Change-Id: I9e626d7fa0e271908dc43448405a7985b80e6230 --- vpx_dsp/arm/highbd_sad_neon.c | 209 ++++++++++++++++++++++++---------- 1 file changed, 149 insertions(+), 60 deletions(-) diff --git a/vpx_dsp/arm/highbd_sad_neon.c b/vpx_dsp/arm/highbd_sad_neon.c index ecb52ce5a5..8415481f00 100644 --- a/vpx_dsp/arm/highbd_sad_neon.c +++ b/vpx_dsp/arm/highbd_sad_neon.c @@ -17,53 +17,169 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" -static VPX_FORCE_INLINE uint32_t highbd_sad4_neon(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, int width, - int height) { - int i, j; - uint32x4_t sum_abs_diff = vdupq_n_u32(0); +static INLINE uint32_t highbd_sad4xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 4) { - const uint16x4_t src_u16 = vld1_u16(src16_ptr + j); - const uint16x4_t ref_u16 = vld1_u16(ref16_ptr + j); - sum_abs_diff = vabal_u16(sum_abs_diff, src_u16, ref_u16); - } + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint16x4_t s = vld1_u16(src16_ptr); + uint16x4_t r = vld1_u16(ref16_ptr); + sum = vabal_u16(sum, s, r); + src16_ptr += src_stride; ref16_ptr += ref_stride; - } + } while (--i != 0); - return horizontal_add_uint32x4(sum_abs_diff); + return horizontal_add_uint32x4(sum); } -static VPX_FORCE_INLINE uint32_t highbd_sad8_neon(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, int width, - int height) { - int i, j; - uint32x4_t sum_abs_diff = vdupq_n_u32(0); +static INLINE uint32_t highbd_sad8xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 8) { - const uint16x8_t src_u16 = vld1q_u16(src16_ptr + j); - const uint16x8_t ref_u16 = vld1q_u16(ref16_ptr + j); - sum_abs_diff = - vabal_u16(sum_abs_diff, vget_low_u16(src_u16), vget_low_u16(ref_u16)); - sum_abs_diff = vabal_u16(sum_abs_diff, vget_high_u16(src_u16), - vget_high_u16(ref_u16)); - } + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint16x8_t s = vld1q_u16(src16_ptr); + uint16x8_t r = vld1q_u16(ref16_ptr); + uint16x8_t diff = vabdq_u16(s, r); + sum = vpadalq_u16(sum, diff); + src16_ptr += src_stride; ref16_ptr += ref_stride; - } + } while (--i != 0); - return horizontal_add_uint32x4(sum_abs_diff); + return horizontal_add_uint32x4(sum); +} + +static INLINE uint32_t highbd_sad16xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + uint16x8_t s0, s1, r0, r1; + uint16x8_t diff0, diff1; + + s0 = vld1q_u16(src16_ptr); + r0 = vld1q_u16(ref16_ptr); + diff0 = vabdq_u16(s0, r0); + sum[0] = vpadalq_u16(sum[0], diff0); + + s1 = vld1q_u16(src16_ptr + 8); + r1 = vld1q_u16(ref16_ptr + 8); + diff1 = vabdq_u16(s1, r1); + sum[1] = vpadalq_u16(sum[1], diff1); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + } while (--i != 0); + + sum[0] = vaddq_u32(sum[0], sum[1]); + return horizontal_add_uint32x4(sum[0]); } +static INLINE uint32_t 
highbd_sadwxh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3; + uint16x8_t diff0, diff1, diff2, diff3; + + s0 = vld1q_u16(src16_ptr + j); + r0 = vld1q_u16(ref16_ptr + j); + diff0 = vabdq_u16(s0, r0); + sum[0] = vpadalq_u16(sum[0], diff0); + + s1 = vld1q_u16(src16_ptr + j + 8); + r1 = vld1q_u16(ref16_ptr + j + 8); + diff1 = vabdq_u16(s1, r1); + sum[1] = vpadalq_u16(sum[1], diff1); + + s2 = vld1q_u16(src16_ptr + j + 16); + r2 = vld1q_u16(ref16_ptr + j + 16); + diff2 = vabdq_u16(s2, r2); + sum[2] = vpadalq_u16(sum[2], diff2); + + s3 = vld1q_u16(src16_ptr + j + 24); + r3 = vld1q_u16(ref16_ptr + j + 24); + diff3 = vabdq_u16(s3, r3); + sum[3] = vpadalq_u16(sum[3], diff3); + + j += 32; + } while (j < w); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + } while (--i != 0); + + sum[0] = vaddq_u32(sum[0], sum[1]); + sum[2] = vaddq_u32(sum[2], sum[3]); + sum[0] = vaddq_u32(sum[0], sum[2]); + + return horizontal_add_uint32x4(sum[0]); +} + +static INLINE unsigned int highbd_sad64xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h); +} + +static INLINE unsigned int highbd_sad32xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h); +} + +#define HBD_SAD_WXH_NEON(w, h) \ + unsigned int vpx_highbd_sad##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad##w##xh_neon(src, src_stride, ref, ref_stride, (h)); \ + } + +HBD_SAD_WXH_NEON(4, 4) +HBD_SAD_WXH_NEON(4, 8) + +HBD_SAD_WXH_NEON(8, 4) +HBD_SAD_WXH_NEON(8, 8) +HBD_SAD_WXH_NEON(8, 16) + +HBD_SAD_WXH_NEON(16, 8) +HBD_SAD_WXH_NEON(16, 16) +HBD_SAD_WXH_NEON(16, 32) + +HBD_SAD_WXH_NEON(32, 16) +HBD_SAD_WXH_NEON(32, 32) +HBD_SAD_WXH_NEON(32, 64) + +HBD_SAD_WXH_NEON(64, 32) +HBD_SAD_WXH_NEON(64, 64) + static VPX_FORCE_INLINE uint32_t highbd_sad4_avg_neon( const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, int width, int height) { @@ -115,20 +231,6 @@ static VPX_FORCE_INLINE uint32_t highbd_sad8_avg_neon( return horizontal_add_uint32x4(sum_abs_diff); } -#define highbd_sad4MxN(m, n) \ - unsigned int vpx_highbd_sad##m##x##n##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride) { \ - return highbd_sad4_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ - } - -#define highbd_sadMxN(m, n) \ - unsigned int vpx_highbd_sad##m##x##n##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride) { \ - return highbd_sad8_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ - } - #define highbd_sad4MxN_avg(m, n) \ unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ @@ -159,67 +261,54 @@ static VPX_FORCE_INLINE uint32_t highbd_sad8_avg_neon( /* clang-format off */ // 4x4 -highbd_sad4MxN(4, 4) highbd_sad4MxN_avg(4, 4) highbd_sadMxNx4D(4, 4) // 4x8 -highbd_sad4MxN(4, 8) highbd_sad4MxN_avg(4, 8) 
highbd_sadMxNx4D(4, 8) // 8x4 -highbd_sadMxN(8, 4) highbd_sadMxN_avg(8, 4) highbd_sadMxNx4D(8, 4) // 8x8 -highbd_sadMxN(8, 8) highbd_sadMxN_avg(8, 8) highbd_sadMxNx4D(8, 8) // 8x16 -highbd_sadMxN(8, 16) highbd_sadMxN_avg(8, 16) highbd_sadMxNx4D(8, 16) // 16x8 -highbd_sadMxN(16, 8) highbd_sadMxN_avg(16, 8) highbd_sadMxNx4D(16, 8) // 16x16 -highbd_sadMxN(16, 16) highbd_sadMxN_avg(16, 16) highbd_sadMxNx4D(16, 16) // 16x32 -highbd_sadMxN(16, 32) highbd_sadMxN_avg(16, 32) highbd_sadMxNx4D(16, 32) // 32x16 -highbd_sadMxN(32, 16) highbd_sadMxN_avg(32, 16) highbd_sadMxNx4D(32, 16) // 32x32 -highbd_sadMxN(32, 32) highbd_sadMxN_avg(32, 32) highbd_sadMxNx4D(32, 32) // 32x64 -highbd_sadMxN(32, 64) highbd_sadMxN_avg(32, 64) highbd_sadMxNx4D(32, 64) // 64x32 -highbd_sadMxN(64, 32) highbd_sadMxN_avg(64, 32) highbd_sadMxNx4D(64, 32) // 64x64 -highbd_sadMxN(64, 64) highbd_sadMxN_avg(64, 64) highbd_sadMxNx4D(64, 64) /* clang-format on */ From 9a5cbfbc087210eabfac5b0c2d72d12852ac56ae Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Fri, 3 Feb 2023 11:00:19 +0000 Subject: [PATCH 517/926] Optimize Neon implementation of high bitdepth avg SAD functions Optimizations take a similar form to those implemented for standard bitdepth averaging SAD: - Use ABD, UADALP instead of ABAL, ABAL2 (double the throughput on modern out-of-order Arm-designed cores.) - Use more accumulator registers to make better use of Neon pipeline resources on Arm CPUs that have four Neon pipes. Change-Id: I75c5f09948f6bf17200f82e00e7a827a80451108 --- vpx_dsp/arm/highbd_sad_neon.c | 244 +++++++++++++++++++++++++--------- 1 file changed, 181 insertions(+), 63 deletions(-) diff --git a/vpx_dsp/arm/highbd_sad_neon.c b/vpx_dsp/arm/highbd_sad_neon.c index 8415481f00..c76eb12b94 100644 --- a/vpx_dsp/arm/highbd_sad_neon.c +++ b/vpx_dsp/arm/highbd_sad_neon.c @@ -180,73 +180,204 @@ HBD_SAD_WXH_NEON(32, 64) HBD_SAD_WXH_NEON(64, 32) HBD_SAD_WXH_NEON(64, 64) -static VPX_FORCE_INLINE uint32_t highbd_sad4_avg_neon( - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, - int ref_stride, const uint8_t *second_pred, int width, int height) { - int i, j; - uint32x4_t sum_abs_diff = vdupq_n_u32(0); +static INLINE uint32_t highbd_sad4xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); - const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred); - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 4) { - const uint16x4_t a_u16 = vld1_u16(src16_ptr + j); - const uint16x4_t b_u16 = vld1_u16(ref16_ptr + j); - const uint16x4_t c_u16 = vld1_u16(pred_ptr + j); - const uint16x4_t avg = vrhadd_u16(b_u16, c_u16); - sum_abs_diff = vabal_u16(sum_abs_diff, a_u16, avg); - } + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint16x4_t s = vld1_u16(src16_ptr); + uint16x4_t r = vld1_u16(ref16_ptr); + uint16x4_t p = vld1_u16(pred16_ptr); + + uint16x4_t avg = vrhadd_u16(r, p); + sum = vabal_u16(sum, s, avg); + src16_ptr += src_stride; ref16_ptr += ref_stride; - pred_ptr += width; - } + pred16_ptr += 4; + } while (--i != 0); - return horizontal_add_uint32x4(sum_abs_diff); + return horizontal_add_uint32x4(sum); } -static VPX_FORCE_INLINE uint32_t highbd_sad8_avg_neon( - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, - int ref_stride, const uint8_t *second_pred, int width, 
int height) { - int i, j; - uint32x4_t sum_abs_diff = vdupq_n_u32(0); +static INLINE uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); - const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred); - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 8) { - const uint16x8_t a_u16 = vld1q_u16(src16_ptr + j); - const uint16x8_t b_u16 = vld1q_u16(ref16_ptr + j); - const uint16x8_t c_u16 = vld1q_u16(pred_ptr + j); - const uint16x8_t avg = vrhaddq_u16(b_u16, c_u16); - sum_abs_diff = - vabal_u16(sum_abs_diff, vget_low_u16(a_u16), vget_low_u16(avg)); - sum_abs_diff = - vabal_u16(sum_abs_diff, vget_high_u16(a_u16), vget_high_u16(avg)); - } + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint16x8_t s = vld1q_u16(src16_ptr); + uint16x8_t r = vld1q_u16(ref16_ptr); + uint16x8_t p = vld1q_u16(pred16_ptr); + + uint16x8_t avg = vrhaddq_u16(r, p); + uint16x8_t diff = vabdq_u16(s, avg); + sum = vpadalq_u16(sum, diff); + src16_ptr += src_stride; ref16_ptr += ref_stride; - pred_ptr += width; - } + pred16_ptr += 8; + } while (--i != 0); - return horizontal_add_uint32x4(sum_abs_diff); + return horizontal_add_uint32x4(sum); } -#define highbd_sad4MxN_avg(m, n) \ - unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, const uint8_t *second_pred) { \ - return highbd_sad4_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \ - second_pred, m, n); \ - } +static INLINE uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + uint16x8_t s0, s1, r0, r1, p0, p1; + uint16x8_t avg0, avg1, diff0, diff1; + + s0 = vld1q_u16(src16_ptr); + r0 = vld1q_u16(ref16_ptr); + p0 = vld1q_u16(pred16_ptr); + avg0 = vrhaddq_u16(r0, p0); + diff0 = vabdq_u16(s0, avg0); + sum[0] = vpadalq_u16(sum[0], diff0); + + s1 = vld1q_u16(src16_ptr + 8); + r1 = vld1q_u16(ref16_ptr + 8); + p1 = vld1q_u16(pred16_ptr + 8); + avg1 = vrhaddq_u16(r1, p1); + diff1 = vabdq_u16(s1, avg1); + sum[1] = vpadalq_u16(sum[1], diff1); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred16_ptr += 16; + } while (--i != 0); + + sum[0] = vaddq_u32(sum[0], sum[1]); + return horizontal_add_uint32x4(sum[0]); +} -#define highbd_sadMxN_avg(m, n) \ - unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, const uint8_t *second_pred) { \ - return highbd_sad8_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \ - second_pred, m, n); \ +static INLINE uint32_t highbd_sadwxh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h, + const uint8_t *second_pred) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), 
vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3; + uint16x8_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3; + + s0 = vld1q_u16(src16_ptr + j); + r0 = vld1q_u16(ref16_ptr + j); + p0 = vld1q_u16(pred16_ptr + j); + avg0 = vrhaddq_u16(r0, p0); + diff0 = vabdq_u16(s0, avg0); + sum[0] = vpadalq_u16(sum[0], diff0); + + s1 = vld1q_u16(src16_ptr + j + 8); + r1 = vld1q_u16(ref16_ptr + j + 8); + p1 = vld1q_u16(pred16_ptr + j + 8); + avg1 = vrhaddq_u16(r1, p1); + diff1 = vabdq_u16(s1, avg1); + sum[1] = vpadalq_u16(sum[1], diff1); + + s2 = vld1q_u16(src16_ptr + j + 16); + r2 = vld1q_u16(ref16_ptr + j + 16); + p2 = vld1q_u16(pred16_ptr + j + 16); + avg2 = vrhaddq_u16(r2, p2); + diff2 = vabdq_u16(s2, avg2); + sum[2] = vpadalq_u16(sum[2], diff2); + + s3 = vld1q_u16(src16_ptr + j + 24); + r3 = vld1q_u16(ref16_ptr + j + 24); + p3 = vld1q_u16(pred16_ptr + j + 24); + avg3 = vrhaddq_u16(r3, p3); + diff3 = vabdq_u16(s3, avg3); + sum[3] = vpadalq_u16(sum[3], diff3); + + j += 32; + } while (j < w); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred16_ptr += w; + } while (--i != 0); + + sum[0] = vaddq_u32(sum[0], sum[1]); + sum[2] = vaddq_u32(sum[2], sum[3]); + sum[0] = vaddq_u32(sum[0], sum[2]); + + return horizontal_add_uint32x4(sum[0]); +} + +static INLINE unsigned int highbd_sad64xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h, + second_pred); +} + +static INLINE unsigned int highbd_sad32xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h, + second_pred); +} + +#define HBD_SAD_WXH_AVG_NEON(w, h) \ + uint32_t vpx_highbd_sad##w##x##h##_avg_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return highbd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \ + second_pred); \ } +HBD_SAD_WXH_AVG_NEON(4, 4) +HBD_SAD_WXH_AVG_NEON(4, 8) + +HBD_SAD_WXH_AVG_NEON(8, 4) +HBD_SAD_WXH_AVG_NEON(8, 8) +HBD_SAD_WXH_AVG_NEON(8, 16) + +HBD_SAD_WXH_AVG_NEON(16, 8) +HBD_SAD_WXH_AVG_NEON(16, 16) +HBD_SAD_WXH_AVG_NEON(16, 32) + +HBD_SAD_WXH_AVG_NEON(32, 16) +HBD_SAD_WXH_AVG_NEON(32, 32) +HBD_SAD_WXH_AVG_NEON(32, 64) + +HBD_SAD_WXH_AVG_NEON(64, 32) +HBD_SAD_WXH_AVG_NEON(64, 64) + #define highbd_sadMxNx4D(m, n) \ void vpx_highbd_sad##m##x##n##x4d_neon( \ const uint8_t *src_ptr, int src_stride, \ @@ -261,54 +392,41 @@ static VPX_FORCE_INLINE uint32_t highbd_sad8_avg_neon( /* clang-format off */ // 4x4 -highbd_sad4MxN_avg(4, 4) highbd_sadMxNx4D(4, 4) // 4x8 -highbd_sad4MxN_avg(4, 8) highbd_sadMxNx4D(4, 8) // 8x4 -highbd_sadMxN_avg(8, 4) highbd_sadMxNx4D(8, 4) // 8x8 -highbd_sadMxN_avg(8, 8) highbd_sadMxNx4D(8, 8) // 8x16 -highbd_sadMxN_avg(8, 16) highbd_sadMxNx4D(8, 16) // 16x8 -highbd_sadMxN_avg(16, 8) highbd_sadMxNx4D(16, 8) // 16x16 -highbd_sadMxN_avg(16, 16) highbd_sadMxNx4D(16, 16) // 16x32 -highbd_sadMxN_avg(16, 32) highbd_sadMxNx4D(16, 32) // 32x16 -highbd_sadMxN_avg(32, 16) highbd_sadMxNx4D(32, 16) // 32x32 -highbd_sadMxN_avg(32, 32) highbd_sadMxNx4D(32, 32) // 32x64 -highbd_sadMxN_avg(32, 64) highbd_sadMxNx4D(32, 64) // 64x32 -highbd_sadMxN_avg(64, 32) highbd_sadMxNx4D(64, 32) // 64x64 -highbd_sadMxN_avg(64, 64) 
highbd_sadMxNx4D(64, 64) /* clang-format on */ From 5eea5c76669fefb07e5dcc4677942c28a5de4257 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 6 Feb 2023 13:29:58 -0500 Subject: [PATCH 518/926] Remove duplicated VPX_SCALING declaration Use VPX_SCALING_MODE instead Change-Id: Iab9d29f20838703e00bd9f7641035d8ebd69af53 --- vp8/common/onyx.h | 19 ++++++------------- vp8/encoder/firstpass.c | 4 ++-- vp8/encoder/onyx_if.c | 20 +++++++++++--------- vp8/vp8_cx_iface.c | 4 ++-- vp9/encoder/vp9_encoder.c | 16 ++++++++-------- vp9/encoder/vp9_encoder.h | 11 ++--------- vp9/vp9_cx_iface.c | 5 ++--- 7 files changed, 33 insertions(+), 46 deletions(-) diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h index 05c72df3fa..8c35e433e7 100644 --- a/vp8/common/onyx.h +++ b/vp8/common/onyx.h @@ -26,13 +26,6 @@ struct VP8_COMP; /* Create/destroy static data structures. */ -typedef enum { - NORMAL = 0, - FOURFIVE = 1, - THREEFIVE = 2, - ONETWO = 3 -} VPX_SCALING; - typedef enum { USAGE_LOCAL_FILE_PLAYBACK = 0x0, USAGE_STREAM_FROM_SERVER = 0x1, @@ -58,19 +51,19 @@ typedef enum { #include <assert.h> static INLINE void Scale2Ratio(int mode, int *hr, int *hs) { switch (mode) { - case NORMAL: + case VP8E_NORMAL: *hr = 1; *hs = 1; break; - case FOURFIVE: + case VP8E_FOURFIVE: *hr = 4; *hs = 5; break; - case THREEFIVE: + case VP8E_THREEFIVE: *hr = 3; *hs = 5; break; - case ONETWO: + case VP8E_ONETWO: *hr = 1; *hs = 2; break; @@ -273,8 +266,8 @@ int vp8_set_roimap(struct VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int threshold[4]); int vp8_set_active_map(struct VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols); -int vp8_set_internal_size(struct VP8_COMP *cpi, VPX_SCALING horiz_mode, - VPX_SCALING vert_mode); +int vp8_set_internal_size(struct VP8_COMP *cpi, VPX_SCALING_MODE horiz_mode, + VPX_SCALING_MODE vert_mode); int vp8_get_quantizer(struct VP8_COMP *cpi); #ifdef __cplusplus diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index 65d2681c91..4149fb4bf8 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -2990,8 +2990,8 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { } /* Set back to unscaled by defaults */ - cpi->common.horiz_scale = NORMAL; - cpi->common.vert_scale = NORMAL; + cpi->common.horiz_scale = VP8E_NORMAL; + cpi->common.vert_scale = VP8E_NORMAL; /* Calculate Average bits per frame. */ av_bits_per_frame = cpi->oxcf.target_bandwidth / diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 4bbeadef01..bcf5227029 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -1667,7 +1667,7 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { cm->sharpness_level = cpi->oxcf.Sharpness; - if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL) { + if (cm->horiz_scale != VP8E_NORMAL || cm->vert_scale != VP8E_NORMAL) { int hr, hs, vr, vs; Scale2Ratio(cm->horiz_scale, &hr, &hs); @@ -2504,15 +2504,17 @@ static int resize_key_frame(VP8_COMP *cpi) { if (cpi->buffer_level < (cpi->oxcf.resample_down_water_mark * cpi->oxcf.optimal_buffer_level / 100)) { cm->horiz_scale = - (cm->horiz_scale < ONETWO) ? cm->horiz_scale + 1 : ONETWO; - cm->vert_scale = (cm->vert_scale < ONETWO) ? cm->vert_scale + 1 : ONETWO; + (cm->horiz_scale < VP8E_ONETWO) ? cm->horiz_scale + 1 : VP8E_ONETWO; + cm->vert_scale = + (cm->vert_scale < VP8E_ONETWO) ?
cm->vert_scale + 1 : VP8E_ONETWO; } /* Should we now start scaling back up */ else if (cpi->buffer_level > (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100)) { cm->horiz_scale = - (cm->horiz_scale > NORMAL) ? cm->horiz_scale - 1 : NORMAL; - cm->vert_scale = (cm->vert_scale > NORMAL) ? cm->vert_scale - 1 : NORMAL; + (cm->horiz_scale > VP8E_NORMAL) ? cm->horiz_scale - 1 : VP8E_NORMAL; + cm->vert_scale = + (cm->vert_scale > VP8E_NORMAL) ? cm->vert_scale - 1 : VP8E_NORMAL; } /* Get the new height and width */ @@ -5380,15 +5382,15 @@ int vp8_set_active_map(VP8_COMP *cpi, unsigned char *map, unsigned int rows, } } -int vp8_set_internal_size(VP8_COMP *cpi, VPX_SCALING horiz_mode, - VPX_SCALING vert_mode) { - if (horiz_mode <= ONETWO) { +int vp8_set_internal_size(VP8_COMP *cpi, VPX_SCALING_MODE horiz_mode, + VPX_SCALING_MODE vert_mode) { + if (horiz_mode <= VP8E_ONETWO) { cpi->common.horiz_scale = horiz_mode; } else { return -1; } - if (vert_mode <= ONETWO) { + if (vert_mode <= VP8E_ONETWO) { cpi->common.vert_scale = vert_mode; } else { return -1; diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 340f3e6638..a9d1f8005d 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -1224,8 +1224,8 @@ static vpx_codec_err_t vp8e_set_scalemode(vpx_codec_alg_priv_t *ctx, if (data) { int res; vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data; - res = vp8_set_internal_size(ctx->cpi, (VPX_SCALING)scalemode.h_scaling_mode, - (VPX_SCALING)scalemode.v_scaling_mode); + res = vp8_set_internal_size(ctx->cpi, scalemode.h_scaling_mode, + scalemode.v_scaling_mode); if (!res) { /*force next frame a key frame to effect scaling mode */ diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index b9fc148d7b..c14b56e24b 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -502,22 +502,22 @@ static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = { "Too many reference buffers are used." }; -static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) { +static INLINE void Scale2Ratio(VPX_SCALING_MODE mode, int *hr, int *hs) { switch (mode) { - case NORMAL: + case VP8E_NORMAL: *hr = 1; *hs = 1; break; - case FOURFIVE: + case VP8E_FOURFIVE: *hr = 4; *hs = 5; break; - case THREEFIVE: + case VP8E_THREEFIVE: *hr = 3; *hs = 5; break; default: - assert(mode == ONETWO); + assert(mode == VP8E_ONETWO); *hr = 1; *hs = 2; break; @@ -8237,12 +8237,12 @@ int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest, } } -int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING horiz_mode, - VPX_SCALING vert_mode) { +int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING_MODE horiz_mode, + VPX_SCALING_MODE vert_mode) { VP9_COMMON *cm = &cpi->common; int hr = 0, hs = 0, vr = 0, vs = 0; - if (horiz_mode > ONETWO || vert_mode > ONETWO) return -1; + if (horiz_mode > VP8E_ONETWO || vert_mode > VP8E_ONETWO) return -1; Scale2Ratio(horiz_mode, &hr, &hs); Scale2Ratio(vert_mode, &vr, &vs); diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 33a2844d3e..bdca8e58bf 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -90,13 +90,6 @@ typedef enum { ENCODE_BREAKOUT_LIMITED = 2 } ENCODE_BREAKOUT_TYPE; -typedef enum { - NORMAL = 0, - FOURFIVE = 1, - THREEFIVE = 2, - ONETWO = 3 -} VPX_SCALING; - typedef enum { // Good Quality Fast Encoding. The encoder balances quality with the amount of // time it takes to encode the output. Speed setting controls how fast. 
@@ -1236,8 +1229,8 @@ int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, int vp9_get_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, int cols); -int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING horiz_mode, - VPX_SCALING vert_mode); +int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING_MODE horiz_mode, + VPX_SCALING_MODE vert_mode); int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width, unsigned int height); diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index dee175dc09..4c7eaed725 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1675,9 +1675,8 @@ static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx, vpx_scaling_mode_t *const mode = va_arg(args, vpx_scaling_mode_t *); if (mode) { - const int res = - vp9_set_internal_size(ctx->cpi, (VPX_SCALING)mode->h_scaling_mode, - (VPX_SCALING)mode->v_scaling_mode); + const int res = vp9_set_internal_size(ctx->cpi, mode->h_scaling_mode, + mode->v_scaling_mode); return (res == 0) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM; } return VPX_CODEC_INVALID_PARAM; From 6b8e9e1f3eb5fe8420d49d0b4df146fb1e91e1cf Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Thu, 2 Feb 2023 16:06:38 +0000 Subject: [PATCH 519/926] Optimize Neon implementation of high bitdepth SAD4D functions Optimizations take a similar form to those implemented for Armv8.0 standard bitdepth SAD4D: - Use ABD, UADALP instead of ABAL, ABAL2 (double the throughput on modern out-of-order Arm-designed cores.) - Use more accumulator registers to make better use of Neon pipeline resources on Arm CPUs that have four Neon pipes. - Compute the four SAD sums in parallel so that we only load the source block once - instead of four times. Change-Id: Ica45c44fd167e5fcc83871d8c138fc72ed3a9723 --- vpx_dsp/arm/highbd_sad4d_neon.c | 248 ++++++++++++++++++++++++++++++++ vpx_dsp/arm/highbd_sad_neon.c | 53 ------- vpx_dsp/vpx_dsp.mk | 1 + 3 files changed, 249 insertions(+), 53 deletions(-) create mode 100644 vpx_dsp/arm/highbd_sad4d_neon.c diff --git a/vpx_dsp/arm/highbd_sad4d_neon.c b/vpx_dsp/arm/highbd_sad4d_neon.c new file mode 100644 index 0000000000..d5e9e8ad22 --- /dev/null +++ b/vpx_dsp/arm/highbd_sad4d_neon.c @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE uint32x4_t horizontal_add_4d_u32(uint32x4_t sum[4]) { +#if defined(__aarch64__) + uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]); + uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]); + return vpaddq_u32(res01, res23); +#else + uint32x4_t res = vdupq_n_u32(0); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[0]), res, 0); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[1]), res, 1); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[2]), res, 2); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[3]), res, 3); + return res; +#endif +} + +static INLINE void highbd_sad4xhx4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = 0; + do { + uint16x4_t s = vld1_u16(src16_ptr + i * src_stride); + uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride); + uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride); + uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride); + uint16x4_t r3 = vld1_u16(ref16_ptr3 + i * ref_stride); + + sum[0] = vabal_u16(sum[0], s, r0); + sum[1] = vabal_u16(sum[1], s, r1); + sum[2] = vabal_u16(sum[2], s, r2); + sum[3] = vabal_u16(sum[3], s, r3); + + } while (++i < h); + + vst1q_u32(res, horizontal_add_4d_u32(sum)); +} + +static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref, + uint32x4_t *const sad_sum) { + uint16x8_t abs_diff = vabdq_u16(src, ref); + *sad_sum = vpadalq_u16(*sad_sum, abs_diff); +} + +static INLINE void highbd_sad8xhx4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = 0; + do { + uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride); + + sad8_neon(s, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum[0]); + sad8_neon(s, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum[1]); + sad8_neon(s, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum[2]); + sad8_neon(s, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum[3]); + + } while (++i < h); + + vst1q_u32(res, horizontal_add_4d_u32(sum)); +} + +static INLINE void highbd_sad16xhx4d_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint32x4_t sum_lo[4] = { vdupq_n_u32(0),
vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum[4]; + + int i = 0; + do { + uint16x8_t s0, s1; + + s0 = vld1q_u16(src16_ptr + i * src_stride); + sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]); + sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]); + sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]); + sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u16(src16_ptr + i * src_stride + 8); + sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]); + sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]); + sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]); + sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + 8), &sum_hi[3]); + + } while (++i < h); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_u32(sum)); +} + +static INLINE void highbd_sadwxhx4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], int w, + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum[4]; + + int i = 0; + do { + int j = 0; + do { + uint16x8_t s0, s1, s2, s3; + + s0 = vld1q_u16(src16_ptr + i * src_stride + j); + sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]); + sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]); + sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]); + sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride + j), &sum_lo[3]); + + s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8); + sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]); + sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]); + sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]); + sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 8), &sum_hi[3]); + + s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16); + sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16), + &sum_lo[0]); + sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16), + &sum_lo[1]); + sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16), + &sum_lo[2]); + sad8_neon(s2, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 16), + &sum_lo[3]); + + s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24); + sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24), + &sum_hi[0]); + sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24), + &sum_hi[1]); + sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24), + &sum_hi[2]); + sad8_neon(s3, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 24), + &sum_hi[3]); + + j += 32; + } while (j < w); + + } while (++i < h); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); 
+ sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_u32(sum)); +} + +static INLINE void highbd_sad64xhx4d_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + highbd_sadwxhx4d_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64, h); +} + +static INLINE void highbd_sad32xhx4d_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + highbd_sadwxhx4d_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32, h); +} + +#define HBD_SAD_WXH_4D_NEON(w, h) \ + void vpx_highbd_sad##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], \ + int ref_stride, uint32_t res[4]) { \ + highbd_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, res, (h)); \ + } + +HBD_SAD_WXH_4D_NEON(4, 4) +HBD_SAD_WXH_4D_NEON(4, 8) + +HBD_SAD_WXH_4D_NEON(8, 4) +HBD_SAD_WXH_4D_NEON(8, 8) +HBD_SAD_WXH_4D_NEON(8, 16) + +HBD_SAD_WXH_4D_NEON(16, 8) +HBD_SAD_WXH_4D_NEON(16, 16) +HBD_SAD_WXH_4D_NEON(16, 32) + +HBD_SAD_WXH_4D_NEON(32, 16) +HBD_SAD_WXH_4D_NEON(32, 32) +HBD_SAD_WXH_4D_NEON(32, 64) + +HBD_SAD_WXH_4D_NEON(64, 32) +HBD_SAD_WXH_4D_NEON(64, 64) diff --git a/vpx_dsp/arm/highbd_sad_neon.c b/vpx_dsp/arm/highbd_sad_neon.c index c76eb12b94..90971f6009 100644 --- a/vpx_dsp/arm/highbd_sad_neon.c +++ b/vpx_dsp/arm/highbd_sad_neon.c @@ -377,56 +377,3 @@ HBD_SAD_WXH_AVG_NEON(32, 64) HBD_SAD_WXH_AVG_NEON(64, 32) HBD_SAD_WXH_AVG_NEON(64, 64) - -#define highbd_sadMxNx4D(m, n) \ - void vpx_highbd_sad##m##x##n##x4d_neon( \ - const uint8_t *src_ptr, int src_stride, \ - const uint8_t *const ref_array[4], int ref_stride, \ - uint32_t sad_array[4]) { \ - int i; \ - for (i = 0; i < 4; ++i) { \ - sad_array[i] = vpx_highbd_sad##m##x##n##_neon(src_ptr, src_stride, \ - ref_array[i], ref_stride); \ - } \ - } - -/* clang-format off */ -// 4x4 -highbd_sadMxNx4D(4, 4) - -// 4x8 -highbd_sadMxNx4D(4, 8) - -// 8x4 -highbd_sadMxNx4D(8, 4) - -// 8x8 -highbd_sadMxNx4D(8, 8) - -// 8x16 -highbd_sadMxNx4D(8, 16) - -// 16x8 -highbd_sadMxNx4D(16, 8) - -// 16x16 -highbd_sadMxNx4D(16, 16) - -// 16x32 -highbd_sadMxNx4D(16, 32) - -// 32x16 -highbd_sadMxNx4D(32, 16) - -// 32x32 -highbd_sadMxNx4D(32, 32) - -// 32x64 -highbd_sadMxNx4D(32, 64) - -// 64x32 -highbd_sadMxNx4D(64, 32) - -// 64x64 -highbd_sadMxNx4D(64, 64) - /* clang-format on */ diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 1fd9495cf9..3b04e97651 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -392,6 +392,7 @@ DSP_SRCS-$(HAVE_LSX) += loongarch/subtract_lsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad4d_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad4d_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad_avx2.c From ec8e2fe1cfda967b78c7068404455348357c7e7d Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Mon, 6 Feb 2023 14:48:34 -0800 Subject: [PATCH 520/926] Move TPL to a new file This is a refactoring CL. 
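As context for readers following the move: the core of the relocated model
(unchanged by this CL) is the dependency propagation in tpl_model_update_b()
below. Each block's accumulated cost is discounted by the fraction of its
energy that inter prediction explains,

  mc_flow = mc_dep_cost - (mc_dep_cost * inter_cost) / intra_cost

and that flow is then scattered over the up to four grid blocks that the
block's motion vector overlaps in the reference frame, weighted by
overlap_area / pix_num.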
Change-Id: Ic8c1575601d27f14ecd1b1bf0a038e447eaae458 --- vp9/encoder/vp9_encoder.c | 1429 +---------------------------------- vp9/encoder/vp9_encoder.h | 13 + vp9/encoder/vp9_tpl_model.c | 1400 ++++++++++++++++++++++++++++++++++ vp9/encoder/vp9_tpl_model.h | 44 ++ vp9/vp9cx.mk | 2 + 5 files changed, 1466 insertions(+), 1422 deletions(-) create mode 100644 vp9/encoder/vp9_tpl_model.c create mode 100644 vp9/encoder/vp9_tpl_model.h diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index b9fc148d7b..f8d1d64672 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -34,16 +34,12 @@ #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_idct.h" -#if CONFIG_NON_GREEDY_MV -#include "vp9/common/vp9_mvref_common.h" -#endif #if CONFIG_VP9_POSTPROC #include "vp9/common/vp9_postproc.h" #endif #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_tile_common.h" -#include "vp9/common/vp9_scan.h" #if !CONFIG_REALTIME_ONLY #include "vp9/encoder/vp9_alt_ref_aq.h" @@ -81,6 +77,7 @@ #include "vp9/encoder/vp9_speed_features.h" #include "vp9/encoder/vp9_svc_layercontext.h" #include "vp9/encoder/vp9_temporal_filter.h" +#include "vp9/encoder/vp9_tpl_model.h" #include "vp9/vp9_cx_iface.h" #define AM_SEGMENT_ID_INACTIVE 7 @@ -126,13 +123,6 @@ static int is_spatial_denoise_enabled(VP9_COMP *cpi) { } #endif -#if CONFIG_VP9_HIGHBITDEPTH -void highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, - TX_SIZE tx_size); -#endif -void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, - TX_SIZE tx_size); - #if !CONFIG_REALTIME_ONLY // compute adaptive threshold for skip recoding static int compute_context_model_thresh(const VP9_COMP *const cpi) { @@ -2109,11 +2099,6 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { vp9_set_row_mt(cpi); } -#ifndef M_LOG2_E -#define M_LOG2_E 0.693147180559945309417 -#endif -#define log2f(x) (log(x) / (float)M_LOG2_E) - /*********************************************************************** * Read before modifying 'cal_nmvjointsadcost' or 'cal_nmvsadcosts' * *********************************************************************** @@ -2666,8 +2651,6 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V)) #endif // CONFIG_INTERNAL_STATS -static void free_tpl_buffer(VP9_COMP *cpi); - void vp9_remove_compressor(VP9_COMP *cpi) { VP9_COMMON *cm; unsigned int i; @@ -2781,7 +2764,7 @@ void vp9_remove_compressor(VP9_COMP *cpi) { vpx_free(cpi->kmeans_data_arr); } - free_tpl_buffer(cpi); + vp9_free_tpl_buffer(cpi); vp9_loop_filter_dealloc(&cpi->lf_row_sync); vp9_bitstream_encode_tiles_buffer_dealloc(cpi); @@ -3329,19 +3312,6 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { vpx_extend_frame_inner_borders(cm->frame_to_show); } -static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) { - RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx]; - if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows || - new_fb_ptr->mi_cols < cm->mi_cols) { - vpx_free(new_fb_ptr->mvs); - CHECK_MEM_ERROR(cm, new_fb_ptr->mvs, - (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, - sizeof(*new_fb_ptr->mvs))); - new_fb_ptr->mi_rows = cm->mi_rows; - new_fb_ptr->mi_cols = cm->mi_cols; - } -} - void vp9_scale_references(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; MV_REFERENCE_FRAME ref_frame; @@ -5302,16 +5272,16 @@ static void 
set_mb_wiener_variance(VP9_COMP *cpi) { vpx_highbd_subtract_block(block_size, block_size, src_diff, block_size, mb_buffer, buf_stride, zero_pred, block_size, xd->bd); - highbd_wht_fwd_txfm(src_diff, block_size, coeff, tx_size); + vp9_highbd_wht_fwd_txfm(src_diff, block_size, coeff, tx_size); } else { vpx_subtract_block(block_size, block_size, src_diff, block_size, mb_buffer, buf_stride, zero_pred, block_size); - wht_fwd_txfm(src_diff, block_size, coeff, tx_size); + vp9_wht_fwd_txfm(src_diff, block_size, coeff, tx_size); } #else vpx_subtract_block(block_size, block_size, src_diff, block_size, mb_buffer, buf_stride, zero_pred, block_size); - wht_fwd_txfm(src_diff, block_size, coeff, tx_size); + vp9_wht_fwd_txfm(src_diff, block_size, coeff, tx_size); #endif // CONFIG_VP9_HIGHBITDEPTH coeff[0] = 0; @@ -6229,1391 +6199,6 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) { } } -typedef struct GF_PICTURE { - YV12_BUFFER_CONFIG *frame; - int ref_frame[3]; - FRAME_UPDATE_TYPE update_type; -} GF_PICTURE; - -static void init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture, - const GF_GROUP *gf_group, int *tpl_group_frames) { - VP9_COMMON *cm = &cpi->common; - int frame_idx = 0; - int i; - int gld_index = -1; - int alt_index = -1; - int lst_index = -1; - int arf_index_stack[MAX_ARF_LAYERS]; - int arf_stack_size = 0; - int extend_frame_count = 0; - int pframe_qindex = cpi->tpl_stats[2].base_qindex; - int frame_gop_offset = 0; - - RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs; - int8_t recon_frame_index[REFS_PER_FRAME + MAX_ARF_LAYERS]; - - memset(recon_frame_index, -1, sizeof(recon_frame_index)); - stack_init(arf_index_stack, MAX_ARF_LAYERS); - - // TODO(jingning): To be used later for gf frame type parsing. - (void)gf_group; - - for (i = 0; i < FRAME_BUFFERS; ++i) { - if (frame_bufs[i].ref_count == 0) { - alloc_frame_mvs(cm, i); - if (vpx_realloc_frame_buffer(&frame_bufs[i].buf, cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, -#if CONFIG_VP9_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, - NULL, NULL, NULL)) - vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to allocate frame buffer"); - - recon_frame_index[frame_idx] = i; - ++frame_idx; - - if (frame_idx >= REFS_PER_FRAME + cpi->oxcf.enable_auto_arf) break; - } - } - - for (i = 0; i < REFS_PER_FRAME + 1; ++i) { - assert(recon_frame_index[i] >= 0); - cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf; - } - - *tpl_group_frames = 0; - - // Initialize Golden reference frame. 
- gf_picture[0].frame = get_ref_frame_buffer(cpi, GOLDEN_FRAME); - for (i = 0; i < 3; ++i) gf_picture[0].ref_frame[i] = -1; - gf_picture[0].update_type = gf_group->update_type[0]; - gld_index = 0; - ++*tpl_group_frames; - - // Initialize base layer ARF frame - gf_picture[1].frame = cpi->Source; - gf_picture[1].ref_frame[0] = gld_index; - gf_picture[1].ref_frame[1] = lst_index; - gf_picture[1].ref_frame[2] = alt_index; - gf_picture[1].update_type = gf_group->update_type[1]; - alt_index = 1; - ++*tpl_group_frames; - - // Initialize P frames - for (frame_idx = 2; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { - struct lookahead_entry *buf; - frame_gop_offset = gf_group->frame_gop_index[frame_idx]; - buf = vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); - - if (buf == NULL) break; - - gf_picture[frame_idx].frame = &buf->img; - gf_picture[frame_idx].ref_frame[0] = gld_index; - gf_picture[frame_idx].ref_frame[1] = lst_index; - gf_picture[frame_idx].ref_frame[2] = alt_index; - gf_picture[frame_idx].update_type = gf_group->update_type[frame_idx]; - - switch (gf_group->update_type[frame_idx]) { - case ARF_UPDATE: - stack_push(arf_index_stack, alt_index, arf_stack_size); - ++arf_stack_size; - alt_index = frame_idx; - break; - case LF_UPDATE: lst_index = frame_idx; break; - case OVERLAY_UPDATE: - gld_index = frame_idx; - alt_index = stack_pop(arf_index_stack, arf_stack_size); - --arf_stack_size; - break; - case USE_BUF_FRAME: - lst_index = alt_index; - alt_index = stack_pop(arf_index_stack, arf_stack_size); - --arf_stack_size; - break; - default: break; - } - - ++*tpl_group_frames; - - // The length of group of pictures is baseline_gf_interval, plus the - // beginning golden frame from last GOP, plus the last overlay frame in - // the same GOP. - if (frame_idx == gf_group->gf_group_size) break; - } - - alt_index = -1; - ++frame_idx; - ++frame_gop_offset; - - // Extend two frames outside the current gf group. - for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) { - struct lookahead_entry *buf = - vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); - - if (buf == NULL) break; - - cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex; - - gf_picture[frame_idx].frame = &buf->img; - gf_picture[frame_idx].ref_frame[0] = gld_index; - gf_picture[frame_idx].ref_frame[1] = lst_index; - gf_picture[frame_idx].ref_frame[2] = alt_index; - gf_picture[frame_idx].update_type = LF_UPDATE; - lst_index = frame_idx; - ++*tpl_group_frames; - ++extend_frame_count; - ++frame_gop_offset; - } -} - -static void init_tpl_stats(VP9_COMP *cpi) { - int frame_idx; - for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { - TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - memset(tpl_frame->tpl_stats_ptr, 0, - tpl_frame->height * tpl_frame->width * - sizeof(*tpl_frame->tpl_stats_ptr)); - tpl_frame->is_valid = 0; - } -} - -#if CONFIG_NON_GREEDY_MV -static uint32_t full_pixel_motion_search(VP9_COMP *cpi, ThreadData *td, - MotionField *motion_field, - int frame_idx, uint8_t *cur_frame_buf, - uint8_t *ref_frame_buf, int stride, - BLOCK_SIZE bsize, int mi_row, - int mi_col, MV *mv) { - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; - int step_param; - uint32_t bestsme = UINT_MAX; - const MvLimits tmp_mv_limits = x->mv_limits; - // lambda is used to adjust the importance of motion vector consistency. - // TODO(angiebird): Figure out lambda's proper value. 
- const int lambda = cpi->tpl_stats[frame_idx].lambda; - int_mv nb_full_mvs[NB_MVS_NUM]; - int nb_full_mv_num; - - MV best_ref_mv1 = { 0, 0 }; - MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ - - best_ref_mv1_full.col = best_ref_mv1.col >> 3; - best_ref_mv1_full.row = best_ref_mv1.row >> 3; - - // Setup frame pointers - x->plane[0].src.buf = cur_frame_buf; - x->plane[0].src.stride = stride; - xd->plane[0].pre[0].buf = ref_frame_buf; - xd->plane[0].pre[0].stride = stride; - - step_param = mv_sf->reduce_first_step_size; - step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); - - vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); - - nb_full_mv_num = - vp9_prepare_nb_full_mvs(motion_field, mi_row, mi_col, nb_full_mvs); - vp9_full_pixel_diamond_new(cpi, x, bsize, &best_ref_mv1_full, step_param, - lambda, 1, nb_full_mvs, nb_full_mv_num, mv); - - /* restore UMV window */ - x->mv_limits = tmp_mv_limits; - - return bestsme; -} - -static uint32_t sub_pixel_motion_search(VP9_COMP *cpi, ThreadData *td, - uint8_t *cur_frame_buf, - uint8_t *ref_frame_buf, int stride, - BLOCK_SIZE bsize, MV *mv) { - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; - uint32_t bestsme = UINT_MAX; - uint32_t distortion; - uint32_t sse; - int cost_list[5]; - - MV best_ref_mv1 = { 0, 0 }; - - // Setup frame pointers - x->plane[0].src.buf = cur_frame_buf; - x->plane[0].src.stride = stride; - xd->plane[0].pre[0].buf = ref_frame_buf; - xd->plane[0].pre[0].stride = stride; - - // TODO(yunqing): may use higher tap interp filter than 2 taps. - // Ignore mv costing by sending NULL pointer instead of cost array - bestsme = cpi->find_fractional_mv_step( - x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, - &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level, - cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0, - USE_2_TAPS); - - return bestsme; -} - -#else // CONFIG_NON_GREEDY_MV -static uint32_t motion_compensated_prediction(VP9_COMP *cpi, ThreadData *td, - uint8_t *cur_frame_buf, - uint8_t *ref_frame_buf, - int stride, BLOCK_SIZE bsize, - MV *mv) { - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; - const SEARCH_METHODS search_method = NSTEP; - int step_param; - int sadpb = x->sadperbit16; - uint32_t bestsme = UINT_MAX; - uint32_t distortion; - uint32_t sse; - int cost_list[5]; - const MvLimits tmp_mv_limits = x->mv_limits; - - MV best_ref_mv1 = { 0, 0 }; - MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ - - best_ref_mv1_full.col = best_ref_mv1.col >> 3; - best_ref_mv1_full.row = best_ref_mv1.row >> 3; - - // Setup frame pointers - x->plane[0].src.buf = cur_frame_buf; - x->plane[0].src.stride = stride; - xd->plane[0].pre[0].buf = ref_frame_buf; - xd->plane[0].pre[0].stride = stride; - - step_param = mv_sf->reduce_first_step_size; - step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); - - vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); - - vp9_full_pixel_search(cpi, x, bsize, &best_ref_mv1_full, step_param, - search_method, sadpb, cond_cost_list(cpi, cost_list), - &best_ref_mv1, mv, 0, 0); - - /* restore UMV window */ - x->mv_limits = tmp_mv_limits; - - // TODO(yunqing): may use higher tap interp filter than 2 taps. 
- // Ignore mv costing by sending NULL pointer instead of cost array - bestsme = cpi->find_fractional_mv_step( - x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, - &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level, - cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0, - USE_2_TAPS); - - return bestsme; -} -#endif - -static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row, - int ref_pos_col, int block, BLOCK_SIZE bsize) { - int width = 0, height = 0; - int bw = 4 << b_width_log2_lookup[bsize]; - int bh = 4 << b_height_log2_lookup[bsize]; - - switch (block) { - case 0: - width = grid_pos_col + bw - ref_pos_col; - height = grid_pos_row + bh - ref_pos_row; - break; - case 1: - width = ref_pos_col + bw - grid_pos_col; - height = grid_pos_row + bh - ref_pos_row; - break; - case 2: - width = grid_pos_col + bw - ref_pos_col; - height = ref_pos_row + bh - grid_pos_row; - break; - case 3: - width = ref_pos_col + bw - grid_pos_col; - height = ref_pos_row + bh - grid_pos_row; - break; - default: assert(0); - } - - return width * height; -} - -static int round_floor(int ref_pos, int bsize_pix) { - int round; - if (ref_pos < 0) - round = -(1 + (-ref_pos - 1) / bsize_pix); - else - round = ref_pos / bsize_pix; - - return round; -} - -static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col, - BLOCK_SIZE bsize, int stride) { - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col]; - int idx, idy; - - for (idy = 0; idy < mi_height; ++idy) { - for (idx = 0; idx < mi_width; ++idx) { - TplDepStats *tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col + idx]; - const int64_t mc_flow = tpl_ptr->mc_flow; - const int64_t mc_ref_cost = tpl_ptr->mc_ref_cost; - *tpl_ptr = *src_stats; - tpl_ptr->mc_flow = mc_flow; - tpl_ptr->mc_ref_cost = mc_ref_cost; - tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow; - } - } -} - -static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, - int mi_row, int mi_col, const BLOCK_SIZE bsize) { - TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index]; - TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr; - MV mv = tpl_stats->mv.as_mv; - int mv_row = mv.row >> 3; - int mv_col = mv.col >> 3; - - int ref_pos_row = mi_row * MI_SIZE + mv_row; - int ref_pos_col = mi_col * MI_SIZE + mv_col; - - const int bw = 4 << b_width_log2_lookup[bsize]; - const int bh = 4 << b_height_log2_lookup[bsize]; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - const int pix_num = bw * bh; - - // top-left on grid block location in pixel - int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh; - int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw; - int block; - - for (block = 0; block < 4; ++block) { - int grid_pos_row = grid_pos_row_base + bh * (block >> 1); - int grid_pos_col = grid_pos_col_base + bw * (block & 0x01); - - if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE && - grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) { - int overlap_area = get_overlap_area( - grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize); - int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height; - int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width; - - int64_t mc_flow = tpl_stats->mc_dep_cost - - 
(tpl_stats->mc_dep_cost * tpl_stats->inter_cost) / - tpl_stats->intra_cost; - - int idx, idy; - - for (idy = 0; idy < mi_height; ++idy) { - for (idx = 0; idx < mi_width; ++idx) { - TplDepStats *des_stats = - &ref_stats[(ref_mi_row + idy) * ref_tpl_frame->stride + - (ref_mi_col + idx)]; - - des_stats->mc_flow += (mc_flow * overlap_area) / pix_num; - des_stats->mc_ref_cost += - ((tpl_stats->intra_cost - tpl_stats->inter_cost) * overlap_area) / - pix_num; - assert(overlap_area >= 0); - } - } - } - } -} - -static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, - int mi_row, int mi_col, const BLOCK_SIZE bsize) { - int idx, idy; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - - for (idy = 0; idy < mi_height; ++idy) { - for (idx = 0; idx < mi_width; ++idx) { - TplDepStats *tpl_ptr = - &tpl_stats[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)]; - tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx, - BLOCK_8X8); - } - } -} - -static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, - tran_low_t *qcoeff, tran_low_t *dqcoeff, - TX_SIZE tx_size, int64_t *recon_error, - int64_t *sse) { - MACROBLOCKD *const xd = &x->e_mbd; - const struct macroblock_plane *const p = &x->plane[plane]; - const struct macroblockd_plane *const pd = &xd->plane[plane]; - const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; - uint16_t eob; - int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; - const int shift = tx_size == TX_32X32 ? 0 : 2; - - // skip block condition should be handled before this is called. - assert(!x->skip_block); - -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, &eob, - scan_order->scan, scan_order->iscan); - } else { - vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, &eob, scan_order->scan, - scan_order->iscan); - } -#else - vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, &eob, scan_order->scan, - scan_order->iscan); -#endif // CONFIG_VP9_HIGHBITDEPTH - - *recon_error = vp9_block_error(coeff, dqcoeff, pix_num, sse) >> shift; - *recon_error = VPXMAX(*recon_error, 1); - - *sse = (*sse) >> shift; - *sse = VPXMAX(*sse, 1); -} - -#if CONFIG_VP9_HIGHBITDEPTH -void highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, - TX_SIZE tx_size) { - // TODO(sdeng): Implement SIMD based high bit-depth Hadamard transforms. 
- switch (tx_size) { - case TX_8X8: vpx_highbd_hadamard_8x8(src_diff, bw, coeff); break; - case TX_16X16: vpx_highbd_hadamard_16x16(src_diff, bw, coeff); break; - case TX_32X32: vpx_highbd_hadamard_32x32(src_diff, bw, coeff); break; - default: assert(0); - } -} -#endif // CONFIG_VP9_HIGHBITDEPTH - -void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, - TX_SIZE tx_size) { - switch (tx_size) { - case TX_8X8: vpx_hadamard_8x8(src_diff, bw, coeff); break; - case TX_16X16: vpx_hadamard_16x16(src_diff, bw, coeff); break; - case TX_32X32: vpx_hadamard_32x32(src_diff, bw, coeff); break; - default: assert(0); - } -} - -static void set_mv_limits(const VP9_COMMON *cm, MACROBLOCK *x, int mi_row, - int mi_col) { - x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND)); - x->mv_limits.row_max = - (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * VP9_INTERP_EXTEND); - x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND)); - x->mv_limits.col_max = - ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND); -} - -static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, - struct scale_factors *sf, GF_PICTURE *gf_picture, - int frame_idx, TplDepFrame *tpl_frame, - int16_t *src_diff, tran_low_t *coeff, - tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row, - int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size, - YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor, - int64_t *recon_error, int64_t *sse) { - VP9_COMMON *cm = &cpi->common; - ThreadData *td = &cpi->td; - - const int bw = 4 << b_width_log2_lookup[bsize]; - const int bh = 4 << b_height_log2_lookup[bsize]; - const int pix_num = bw * bh; - int best_rf_idx = -1; - int_mv best_mv; - int64_t best_inter_cost = INT64_MAX; - int64_t inter_cost; - int rf_idx; - const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP]; - - int64_t best_intra_cost = INT64_MAX; - int64_t intra_cost; - PREDICTION_MODE mode; - int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; - MODE_INFO mi_above, mi_left; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - TplDepStats *tpl_stats = - &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; - - xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); - xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8; - xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); - xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8; - xd->above_mi = (mi_row > 0) ? &mi_above : NULL; - xd->left_mi = (mi_col > 0) ? 
&mi_left : NULL; - - // Intra prediction search - for (mode = DC_PRED; mode <= TM_PRED; ++mode) { - uint8_t *src, *dst; - int src_stride, dst_stride; - - src = xd->cur_buf->y_buffer + mb_y_offset; - src_stride = xd->cur_buf->y_stride; - - dst = &predictor[0]; - dst_stride = bw; - - xd->mi[0]->sb_type = bsize; - xd->mi[0]->ref_frame[0] = INTRA_FRAME; - - vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode, src, - src_stride, dst, dst_stride, 0, 0, 0); - -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vpx_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, - dst_stride, xd->bd); - highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); - intra_cost = vpx_highbd_satd(coeff, pix_num); - } else { - vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, - dst_stride); - wht_fwd_txfm(src_diff, bw, coeff, tx_size); - intra_cost = vpx_satd(coeff, pix_num); - } -#else - vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, dst_stride); - wht_fwd_txfm(src_diff, bw, coeff, tx_size); - intra_cost = vpx_satd(coeff, pix_num); -#endif // CONFIG_VP9_HIGHBITDEPTH - - if (intra_cost < best_intra_cost) best_intra_cost = intra_cost; - } - - // Motion compensated prediction - best_mv.as_int = 0; - - set_mv_limits(cm, x, mi_row, mi_col); - - for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { - int_mv mv; -#if CONFIG_NON_GREEDY_MV - MotionField *motion_field; -#endif - if (ref_frame[rf_idx] == NULL) continue; - -#if CONFIG_NON_GREEDY_MV - (void)td; - motion_field = vp9_motion_field_info_get_motion_field( - &cpi->motion_field_info, frame_idx, rf_idx, bsize); - mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); -#else - motion_compensated_prediction(cpi, td, xd->cur_buf->y_buffer + mb_y_offset, - ref_frame[rf_idx]->y_buffer + mb_y_offset, - xd->cur_buf->y_stride, bsize, &mv.as_mv); -#endif - -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_build_inter_predictor( - CONVERT_TO_SHORTPTR(ref_frame[rf_idx]->y_buffer + mb_y_offset), - ref_frame[rf_idx]->y_stride, CONVERT_TO_SHORTPTR(&predictor[0]), bw, - &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, - mi_row * MI_SIZE, xd->bd); - vpx_highbd_subtract_block( - bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset, - xd->cur_buf->y_stride, &predictor[0], bw, xd->bd); - highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); - inter_cost = vpx_highbd_satd(coeff, pix_num); - } else { - vp9_build_inter_predictor( - ref_frame[rf_idx]->y_buffer + mb_y_offset, - ref_frame[rf_idx]->y_stride, &predictor[0], bw, &mv.as_mv, sf, bw, bh, - 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE); - vpx_subtract_block(bh, bw, src_diff, bw, - xd->cur_buf->y_buffer + mb_y_offset, - xd->cur_buf->y_stride, &predictor[0], bw); - wht_fwd_txfm(src_diff, bw, coeff, tx_size); - inter_cost = vpx_satd(coeff, pix_num); - } -#else - vp9_build_inter_predictor(ref_frame[rf_idx]->y_buffer + mb_y_offset, - ref_frame[rf_idx]->y_stride, &predictor[0], bw, - &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, - mi_col * MI_SIZE, mi_row * MI_SIZE); - vpx_subtract_block(bh, bw, src_diff, bw, - xd->cur_buf->y_buffer + mb_y_offset, - xd->cur_buf->y_stride, &predictor[0], bw); - wht_fwd_txfm(src_diff, bw, coeff, tx_size); - inter_cost = vpx_satd(coeff, pix_num); -#endif - - if (inter_cost < best_inter_cost) { - best_rf_idx = rf_idx; - best_inter_cost = inter_cost; - best_mv.as_int = mv.as_int; - get_quantize_error(x, 0, coeff, qcoeff, 
dqcoeff, tx_size, recon_error, - sse); - } - } - best_intra_cost = VPXMAX(best_intra_cost, 1); - best_inter_cost = VPXMIN(best_intra_cost, best_inter_cost); - tpl_stats->inter_cost = VPXMAX( - 1, (best_inter_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); - tpl_stats->intra_cost = VPXMAX( - 1, (best_intra_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); - tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx]; - tpl_stats->mv.as_int = best_mv.as_int; -} - -#if CONFIG_NON_GREEDY_MV -static int get_block_src_pred_buf(MACROBLOCKD *xd, GF_PICTURE *gf_picture, - int frame_idx, int rf_idx, int mi_row, - int mi_col, struct buf_2d *src, - struct buf_2d *pre) { - const int mb_y_offset = - mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; - YV12_BUFFER_CONFIG *ref_frame = NULL; - int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; - if (ref_frame_idx != -1) { - ref_frame = gf_picture[ref_frame_idx].frame; - src->buf = xd->cur_buf->y_buffer + mb_y_offset; - src->stride = xd->cur_buf->y_stride; - pre->buf = ref_frame->y_buffer + mb_y_offset; - pre->stride = ref_frame->y_stride; - assert(src->stride == pre->stride); - return 1; - } else { - printf("invalid ref_frame_idx"); - assert(ref_frame_idx != -1); - return 0; - } -} - -#define kMvPreCheckLines 5 -#define kMvPreCheckSize 15 - -#define MV_REF_POS_NUM 3 -POSITION mv_ref_pos[MV_REF_POS_NUM] = { - { -1, 0 }, - { 0, -1 }, - { -1, -1 }, -}; - -static int_mv *get_select_mv(VP9_COMP *cpi, TplDepFrame *tpl_frame, int mi_row, - int mi_col) { - return &cpi->select_mv_arr[mi_row * tpl_frame->stride + mi_col]; -} - -static int_mv find_ref_mv(int mv_mode, VP9_COMP *cpi, TplDepFrame *tpl_frame, - BLOCK_SIZE bsize, int mi_row, int mi_col) { - int i; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - int_mv nearest_mv, near_mv, invalid_mv; - nearest_mv.as_int = INVALID_MV; - near_mv.as_int = INVALID_MV; - invalid_mv.as_int = INVALID_MV; - for (i = 0; i < MV_REF_POS_NUM; ++i) { - int nb_row = mi_row + mv_ref_pos[i].row * mi_height; - int nb_col = mi_col + mv_ref_pos[i].col * mi_width; - assert(mv_ref_pos[i].row <= 0); - assert(mv_ref_pos[i].col <= 0); - if (nb_row >= 0 && nb_col >= 0) { - if (nearest_mv.as_int == INVALID_MV) { - nearest_mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col); - } else { - int_mv mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col); - if (mv.as_int == nearest_mv.as_int) { - continue; - } else { - near_mv = mv; - break; - } - } - } - } - if (nearest_mv.as_int == INVALID_MV) { - nearest_mv.as_mv.row = 0; - nearest_mv.as_mv.col = 0; - } - if (near_mv.as_int == INVALID_MV) { - near_mv.as_mv.row = 0; - near_mv.as_mv.col = 0; - } - if (mv_mode == NEAREST_MV_MODE) { - return nearest_mv; - } - if (mv_mode == NEAR_MV_MODE) { - return near_mv; - } - assert(0); - return invalid_mv; -} - -static int_mv get_mv_from_mv_mode(int mv_mode, VP9_COMP *cpi, - MotionField *motion_field, - TplDepFrame *tpl_frame, BLOCK_SIZE bsize, - int mi_row, int mi_col) { - int_mv mv; - switch (mv_mode) { - case ZERO_MV_MODE: - mv.as_mv.row = 0; - mv.as_mv.col = 0; - break; - case NEW_MV_MODE: - mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); - break; - case NEAREST_MV_MODE: - mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col); - break; - case NEAR_MV_MODE: - mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col); - break; - default: - mv.as_int = INVALID_MV; - assert(0); - break; - } - return mv; -} - 
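/*
 * Editorial worked example (not part of the patch), assuming BLOCK_32X32 so
 * mi_height = mi_width = 4: for a block at (mi_row, mi_col) = (8, 12),
 * find_ref_mv() above scans the causal neighbors given by mv_ref_pos in
 * block-size units:
 *   { -1,  0 } -> (4, 12)   above
 *   {  0, -1 } -> (8,  8)   left
 *   { -1, -1 } -> (4,  8)   above-left
 * The first in-frame neighbor supplies NEAREST_MV_MODE; the first later
 * neighbor whose MV differs from it supplies NEAR_MV_MODE; any mode left
 * unresolved falls back to the zero vector.
 */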
-static double get_mv_dist(int mv_mode, VP9_COMP *cpi, MACROBLOCKD *xd, - GF_PICTURE *gf_picture, MotionField *motion_field, - int frame_idx, TplDepFrame *tpl_frame, int rf_idx, - BLOCK_SIZE bsize, int mi_row, int mi_col, - int_mv *mv) { - uint32_t sse; - struct buf_2d src; - struct buf_2d pre; - MV full_mv; - *mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame, bsize, - mi_row, mi_col); - full_mv = get_full_mv(&mv->as_mv); - if (get_block_src_pred_buf(xd, gf_picture, frame_idx, rf_idx, mi_row, mi_col, - &src, &pre)) { - // TODO(angiebird): Consider subpixel when computing the sse. - cpi->fn_ptr[bsize].vf(src.buf, src.stride, get_buf_from_mv(&pre, &full_mv), - pre.stride, &sse); - return (double)(sse << VP9_DIST_SCALE_LOG2); - } else { - assert(0); - return 0; - } -} - -static int get_mv_mode_cost(int mv_mode) { - // TODO(angiebird): The probabilities are roughly inferred from - // default_inter_mode_probs. Check if there is a better way to set the - // probabilities. - const int zero_mv_prob = 16; - const int new_mv_prob = 24 * 1; - const int ref_mv_prob = 256 - zero_mv_prob - new_mv_prob; - assert(zero_mv_prob + new_mv_prob + ref_mv_prob == 256); - switch (mv_mode) { - case ZERO_MV_MODE: return vp9_prob_cost[zero_mv_prob]; break; - case NEW_MV_MODE: return vp9_prob_cost[new_mv_prob]; break; - case NEAREST_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break; - case NEAR_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break; - default: assert(0); return -1; - } -} - -static INLINE double get_mv_diff_cost(MV *new_mv, MV *ref_mv) { - double mv_diff_cost = log2(1 + abs(new_mv->row - ref_mv->row)) + - log2(1 + abs(new_mv->col - ref_mv->col)); - mv_diff_cost *= (1 << VP9_PROB_COST_SHIFT); - return mv_diff_cost; -} -static double get_mv_cost(int mv_mode, VP9_COMP *cpi, MotionField *motion_field, - TplDepFrame *tpl_frame, BLOCK_SIZE bsize, int mi_row, - int mi_col) { - double mv_cost = get_mv_mode_cost(mv_mode); - if (mv_mode == NEW_MV_MODE) { - MV new_mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame, - bsize, mi_row, mi_col) - .as_mv; - MV nearest_mv = get_mv_from_mv_mode(NEAREST_MV_MODE, cpi, motion_field, - tpl_frame, bsize, mi_row, mi_col) - .as_mv; - MV near_mv = get_mv_from_mv_mode(NEAR_MV_MODE, cpi, motion_field, tpl_frame, - bsize, mi_row, mi_col) - .as_mv; - double nearest_cost = get_mv_diff_cost(&new_mv, &nearest_mv); - double near_cost = get_mv_diff_cost(&new_mv, &near_mv); - mv_cost += nearest_cost < near_cost ? 
nearest_cost : near_cost; - } - return mv_cost; -} - -static double eval_mv_mode(int mv_mode, VP9_COMP *cpi, MACROBLOCK *x, - GF_PICTURE *gf_picture, MotionField *motion_field, - int frame_idx, TplDepFrame *tpl_frame, int rf_idx, - BLOCK_SIZE bsize, int mi_row, int mi_col, - int_mv *mv) { - MACROBLOCKD *xd = &x->e_mbd; - double mv_dist = - get_mv_dist(mv_mode, cpi, xd, gf_picture, motion_field, frame_idx, - tpl_frame, rf_idx, bsize, mi_row, mi_col, mv); - double mv_cost = - get_mv_cost(mv_mode, cpi, motion_field, tpl_frame, bsize, mi_row, mi_col); - double mult = 180; - - return mv_cost + mult * log2f(1 + mv_dist); -} - -static int find_best_ref_mv_mode(VP9_COMP *cpi, MACROBLOCK *x, - GF_PICTURE *gf_picture, - MotionField *motion_field, int frame_idx, - TplDepFrame *tpl_frame, int rf_idx, - BLOCK_SIZE bsize, int mi_row, int mi_col, - double *rd, int_mv *mv) { - int best_mv_mode = ZERO_MV_MODE; - int update = 0; - int mv_mode; - *rd = 0; - for (mv_mode = 0; mv_mode < MAX_MV_MODE; ++mv_mode) { - double this_rd; - int_mv this_mv; - if (mv_mode == NEW_MV_MODE) { - continue; - } - this_rd = eval_mv_mode(mv_mode, cpi, x, gf_picture, motion_field, frame_idx, - tpl_frame, rf_idx, bsize, mi_row, mi_col, &this_mv); - if (update == 0) { - *rd = this_rd; - *mv = this_mv; - best_mv_mode = mv_mode; - update = 1; - } else { - if (this_rd < *rd) { - *rd = this_rd; - *mv = this_mv; - best_mv_mode = mv_mode; - } - } - } - return best_mv_mode; -} - -static void predict_mv_mode(VP9_COMP *cpi, MACROBLOCK *x, - GF_PICTURE *gf_picture, MotionField *motion_field, - int frame_idx, TplDepFrame *tpl_frame, int rf_idx, - BLOCK_SIZE bsize, int mi_row, int mi_col) { - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - int tmp_mv_mode_arr[kMvPreCheckSize]; - int *mv_mode_arr = tpl_frame->mv_mode_arr[rf_idx]; - double *rd_diff_arr = tpl_frame->rd_diff_arr[rf_idx]; - int_mv *select_mv_arr = cpi->select_mv_arr; - int_mv tmp_select_mv_arr[kMvPreCheckSize]; - int stride = tpl_frame->stride; - double new_mv_rd = 0; - double no_new_mv_rd = 0; - double this_new_mv_rd = 0; - double this_no_new_mv_rd = 0; - int idx; - int tmp_idx; - assert(kMvPreCheckSize == (kMvPreCheckLines * (kMvPreCheckLines + 1)) >> 1); - - // no new mv - // diagonal scan order - tmp_idx = 0; - for (idx = 0; idx < kMvPreCheckLines; ++idx) { - int r; - for (r = 0; r <= idx; ++r) { - int c = idx - r; - int nb_row = mi_row + r * mi_height; - int nb_col = mi_col + c * mi_width; - if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { - double this_rd; - int_mv *mv = &select_mv_arr[nb_row * stride + nb_col]; - mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode( - cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx, - bsize, nb_row, nb_col, &this_rd, mv); - if (r == 0 && c == 0) { - this_no_new_mv_rd = this_rd; - } - no_new_mv_rd += this_rd; - tmp_mv_mode_arr[tmp_idx] = mv_mode_arr[nb_row * stride + nb_col]; - tmp_select_mv_arr[tmp_idx] = select_mv_arr[nb_row * stride + nb_col]; - ++tmp_idx; - } - } - } - - // new mv - mv_mode_arr[mi_row * stride + mi_col] = NEW_MV_MODE; - this_new_mv_rd = eval_mv_mode( - NEW_MV_MODE, cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, - rf_idx, bsize, mi_row, mi_col, &select_mv_arr[mi_row * stride + mi_col]); - new_mv_rd = this_new_mv_rd; - // We start from idx = 1 because idx = 0 is evaluated as NEW_MV_MODE - // beforehand. 
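/*
 * Editorial note (not part of the patch): with kMvPreCheckLines = 5 the
 * diagonal scan above and below visits the (r, c) offsets
 *   idx 0: (0,0)
 *   idx 1: (0,1) (1,0)
 *   idx 2: (0,2) (1,1) (2,0)
 *   idx 3: (0,3) (1,2) (2,1) (3,0)
 *   idx 4: (0,4) (1,3) (2,2) (3,1) (4,0)
 * for 1 + 2 + 3 + 4 + 5 = (5 * 6) / 2 = 15 blocks, which is exactly the
 * kMvPreCheckSize the assert above verifies.
 */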
- for (idx = 1; idx < kMvPreCheckLines; ++idx) { - int r; - for (r = 0; r <= idx; ++r) { - int c = idx - r; - int nb_row = mi_row + r * mi_height; - int nb_col = mi_col + c * mi_width; - if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { - double this_rd; - int_mv *mv = &select_mv_arr[nb_row * stride + nb_col]; - mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode( - cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx, - bsize, nb_row, nb_col, &this_rd, mv); - new_mv_rd += this_rd; - } - } - } - - // update best_mv_mode - tmp_idx = 0; - if (no_new_mv_rd < new_mv_rd) { - for (idx = 0; idx < kMvPreCheckLines; ++idx) { - int r; - for (r = 0; r <= idx; ++r) { - int c = idx - r; - int nb_row = mi_row + r * mi_height; - int nb_col = mi_col + c * mi_width; - if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { - mv_mode_arr[nb_row * stride + nb_col] = tmp_mv_mode_arr[tmp_idx]; - select_mv_arr[nb_row * stride + nb_col] = tmp_select_mv_arr[tmp_idx]; - ++tmp_idx; - } - } - } - rd_diff_arr[mi_row * stride + mi_col] = 0; - } else { - rd_diff_arr[mi_row * stride + mi_col] = - (no_new_mv_rd - this_no_new_mv_rd) - (new_mv_rd - this_new_mv_rd); - } -} - -static void predict_mv_mode_arr(VP9_COMP *cpi, MACROBLOCK *x, - GF_PICTURE *gf_picture, - MotionField *motion_field, int frame_idx, - TplDepFrame *tpl_frame, int rf_idx, - BLOCK_SIZE bsize) { - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - const int unit_rows = tpl_frame->mi_rows / mi_height; - const int unit_cols = tpl_frame->mi_cols / mi_width; - const int max_diagonal_lines = unit_rows + unit_cols - 1; - int idx; - for (idx = 0; idx < max_diagonal_lines; ++idx) { - int r; - for (r = VPXMAX(idx - unit_cols + 1, 0); r <= VPXMIN(idx, unit_rows - 1); - ++r) { - int c = idx - r; - int mi_row = r * mi_height; - int mi_col = c * mi_width; - assert(c >= 0 && c < unit_cols); - assert(mi_row >= 0 && mi_row < tpl_frame->mi_rows); - assert(mi_col >= 0 && mi_col < tpl_frame->mi_cols); - predict_mv_mode(cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, - rf_idx, bsize, mi_row, mi_col); - } - } -} - -static void do_motion_search(VP9_COMP *cpi, ThreadData *td, - MotionField *motion_field, int frame_idx, - YV12_BUFFER_CONFIG *ref_frame, BLOCK_SIZE bsize, - int mi_row, int mi_col) { - VP9_COMMON *cm = &cpi->common; - MACROBLOCK *x = &td->mb; - MACROBLOCKD *xd = &x->e_mbd; - const int mb_y_offset = - mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; - assert(ref_frame != NULL); - set_mv_limits(cm, x, mi_row, mi_col); - { - int_mv mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); - uint8_t *cur_frame_buf = xd->cur_buf->y_buffer + mb_y_offset; - uint8_t *ref_frame_buf = ref_frame->y_buffer + mb_y_offset; - const int stride = xd->cur_buf->y_stride; - full_pixel_motion_search(cpi, td, motion_field, frame_idx, cur_frame_buf, - ref_frame_buf, stride, bsize, mi_row, mi_col, - &mv.as_mv); - sub_pixel_motion_search(cpi, td, cur_frame_buf, ref_frame_buf, stride, - bsize, &mv.as_mv); - vp9_motion_field_mi_set_mv(motion_field, mi_row, mi_col, mv); - } -} - -static void build_motion_field( - VP9_COMP *cpi, int frame_idx, - YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES], BLOCK_SIZE bsize) { - VP9_COMMON *cm = &cpi->common; - ThreadData *td = &cpi->td; - TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - const int 
pw = num_4x4_blocks_wide_lookup[bsize] << 2; - const int ph = num_4x4_blocks_high_lookup[bsize] << 2; - int mi_row, mi_col; - int rf_idx; - - tpl_frame->lambda = (pw * ph) >> 2; - assert(pw * ph == tpl_frame->lambda << 2); - - for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { - MotionField *motion_field = vp9_motion_field_info_get_motion_field( - &cpi->motion_field_info, frame_idx, rf_idx, bsize); - if (ref_frame[rf_idx] == NULL) { - continue; - } - vp9_motion_field_reset_mvs(motion_field); - for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { - do_motion_search(cpi, td, motion_field, frame_idx, ref_frame[rf_idx], - bsize, mi_row, mi_col); - } - } - } -} -#endif // CONFIG_NON_GREEDY_MV - -static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, - int frame_idx, BLOCK_SIZE bsize) { - TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame; - YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES] = { NULL, NULL, NULL }; - - VP9_COMMON *cm = &cpi->common; - struct scale_factors sf; - int rdmult, idx; - ThreadData *td = &cpi->td; - MACROBLOCK *x = &td->mb; - MACROBLOCKD *xd = &x->e_mbd; - int mi_row, mi_col; - -#if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, predictor16[32 * 32 * 3]); - DECLARE_ALIGNED(16, uint8_t, predictor8[32 * 32 * 3]); - uint8_t *predictor; -#else - DECLARE_ALIGNED(16, uint8_t, predictor[32 * 32 * 3]); -#endif - DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]); - DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]); - DECLARE_ALIGNED(16, tran_low_t, qcoeff[32 * 32]); - DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); - - const TX_SIZE tx_size = max_txsize_lookup[bsize]; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - int64_t recon_error, sse; -#if CONFIG_NON_GREEDY_MV - int square_block_idx; - int rf_idx; -#endif - - // Setup scaling factor -#if CONFIG_VP9_HIGHBITDEPTH - vp9_setup_scale_factors_for_frame( - &sf, this_frame->y_crop_width, this_frame->y_crop_height, - this_frame->y_crop_width, this_frame->y_crop_height, - cpi->common.use_highbitdepth); - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - predictor = CONVERT_TO_BYTEPTR(predictor16); - else - predictor = predictor8; -#else - vp9_setup_scale_factors_for_frame( - &sf, this_frame->y_crop_width, this_frame->y_crop_height, - this_frame->y_crop_width, this_frame->y_crop_height); -#endif // CONFIG_VP9_HIGHBITDEPTH - - // Prepare reference frame pointers. If any reference frame slot is - // unavailable, the pointer will be set to Null. - for (idx = 0; idx < MAX_INTER_REF_FRAMES; ++idx) { - int rf_idx = gf_picture[frame_idx].ref_frame[idx]; - if (rf_idx != -1) ref_frame[idx] = gf_picture[rf_idx].frame; - } - - xd->mi = cm->mi_grid_visible; - xd->mi[0] = cm->mi; - xd->cur_buf = this_frame; - - // Get rd multiplier set up. 
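/*
 * Editorial note (general RD background, not code from this patch): rdmult
 * below plays the role of lambda in the usual rate-distortion cost
 *   J = D + lambda * R,
 * and deriving it from tpl_frame->base_qindex keeps the TPL pass's
 * rate/distortion trade-off consistent with the quantizer the frame will
 * actually be coded with.
 */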
- rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, tpl_frame->base_qindex); - set_error_per_bit(&cpi->td.mb, rdmult); - vp9_initialize_me_consts(cpi, &cpi->td.mb, tpl_frame->base_qindex); - - tpl_frame->is_valid = 1; - - cm->base_qindex = tpl_frame->base_qindex; - vp9_frame_init_quantizer(cpi); - -#if CONFIG_NON_GREEDY_MV - for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES; - ++square_block_idx) { - BLOCK_SIZE square_bsize = square_block_idx_to_bsize(square_block_idx); - build_motion_field(cpi, frame_idx, ref_frame, square_bsize); - } - for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { - int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; - if (ref_frame_idx != -1) { - MotionField *motion_field = vp9_motion_field_info_get_motion_field( - &cpi->motion_field_info, frame_idx, rf_idx, bsize); - predict_mv_mode_arr(cpi, x, gf_picture, motion_field, frame_idx, - tpl_frame, rf_idx, bsize); - } - } -#endif - - for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { - mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame, - src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize, - tx_size, ref_frame, predictor, &recon_error, &sse); - // Motion flow dependency dispenser. - tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, - tpl_frame->stride); - - tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col, - bsize); - } - } -} - -#if CONFIG_NON_GREEDY_MV -#define DUMP_TPL_STATS 0 -#if DUMP_TPL_STATS -static void dump_buf(uint8_t *buf, int stride, int row, int col, int h, int w) { - int i, j; - printf("%d %d\n", h, w); - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) { - printf("%d ", buf[(row + i) * stride + col + j]); - } - } - printf("\n"); -} - -static void dump_frame_buf(const YV12_BUFFER_CONFIG *frame_buf) { - dump_buf(frame_buf->y_buffer, frame_buf->y_stride, 0, 0, frame_buf->y_height, - frame_buf->y_width); - dump_buf(frame_buf->u_buffer, frame_buf->uv_stride, 0, 0, - frame_buf->uv_height, frame_buf->uv_width); - dump_buf(frame_buf->v_buffer, frame_buf->uv_stride, 0, 0, - frame_buf->uv_height, frame_buf->uv_width); -} - -static void dump_tpl_stats(const VP9_COMP *cpi, int tpl_group_frames, - const GF_GROUP *gf_group, - const GF_PICTURE *gf_picture, BLOCK_SIZE bsize) { - int frame_idx; - const VP9_COMMON *cm = &cpi->common; - int rf_idx; - for (frame_idx = 1; frame_idx < tpl_group_frames; ++frame_idx) { - for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { - const TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - int mi_row, mi_col; - int ref_frame_idx; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; - if (ref_frame_idx != -1) { - YV12_BUFFER_CONFIG *ref_frame_buf = gf_picture[ref_frame_idx].frame; - const int gf_frame_offset = gf_group->frame_gop_index[frame_idx]; - const int ref_gf_frame_offset = - gf_group->frame_gop_index[ref_frame_idx]; - printf("=\n"); - printf( - "frame_idx %d mi_rows %d mi_cols %d bsize %d ref_frame_idx %d " - "rf_idx %d gf_frame_offset %d ref_gf_frame_offset %d\n", - frame_idx, cm->mi_rows, cm->mi_cols, mi_width * MI_SIZE, - ref_frame_idx, rf_idx, gf_frame_offset, ref_gf_frame_offset); - for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) { - for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { - if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) { - int_mv mv = 
vp9_motion_field_info_get_mv(&cpi->motion_field_info, - frame_idx, rf_idx, bsize, - mi_row, mi_col); - printf("%d %d %d %d\n", mi_row, mi_col, mv.as_mv.row, - mv.as_mv.col); - } - } - } - for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) { - for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { - if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) { - const TplDepStats *tpl_ptr = - &tpl_frame - ->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; - printf("%f ", tpl_ptr->feature_score); - } - } - } - printf("\n"); - - for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { - const int mv_mode = - tpl_frame - ->mv_mode_arr[rf_idx][mi_row * tpl_frame->stride + mi_col]; - printf("%d ", mv_mode); - } - } - printf("\n"); - - dump_frame_buf(gf_picture[frame_idx].frame); - dump_frame_buf(ref_frame_buf); - } - } - } -} -#endif // DUMP_TPL_STATS -#endif // CONFIG_NON_GREEDY_MV - -static void init_tpl_buffer(VP9_COMP *cpi) { - VP9_COMMON *cm = &cpi->common; - int frame; - - const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); - const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows); -#if CONFIG_NON_GREEDY_MV - int rf_idx; - - vpx_free(cpi->select_mv_arr); - CHECK_MEM_ERROR( - cm, cpi->select_mv_arr, - vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->select_mv_arr))); -#endif - - // TODO(jingning): Reduce the actual memory use for tpl model build up. - for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) { - if (cpi->tpl_stats[frame].width >= mi_cols && - cpi->tpl_stats[frame].height >= mi_rows && - cpi->tpl_stats[frame].tpl_stats_ptr) - continue; - -#if CONFIG_NON_GREEDY_MV - for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { - vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]); - CHECK_MEM_ERROR( - cm, cpi->tpl_stats[frame].mv_mode_arr[rf_idx], - vpx_calloc(mi_rows * mi_cols * 4, - sizeof(*cpi->tpl_stats[frame].mv_mode_arr[rf_idx]))); - vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]); - CHECK_MEM_ERROR( - cm, cpi->tpl_stats[frame].rd_diff_arr[rf_idx], - vpx_calloc(mi_rows * mi_cols * 4, - sizeof(*cpi->tpl_stats[frame].rd_diff_arr[rf_idx]))); - } -#endif - vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); - CHECK_MEM_ERROR(cm, cpi->tpl_stats[frame].tpl_stats_ptr, - vpx_calloc(mi_rows * mi_cols, - sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr))); - cpi->tpl_stats[frame].is_valid = 0; - cpi->tpl_stats[frame].width = mi_cols; - cpi->tpl_stats[frame].height = mi_rows; - cpi->tpl_stats[frame].stride = mi_cols; - cpi->tpl_stats[frame].mi_rows = cm->mi_rows; - cpi->tpl_stats[frame].mi_cols = cm->mi_cols; - } - - for (frame = 0; frame < REF_FRAMES; ++frame) { - cpi->enc_frame_buf[frame].mem_valid = 0; - cpi->enc_frame_buf[frame].released = 1; - } -} - -static void free_tpl_buffer(VP9_COMP *cpi) { - int frame; -#if CONFIG_NON_GREEDY_MV - vp9_free_motion_field_info(&cpi->motion_field_info); - vpx_free(cpi->select_mv_arr); -#endif - for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) { -#if CONFIG_NON_GREEDY_MV - int rf_idx; - for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { - vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]); - vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]); - } -#endif - vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); - cpi->tpl_stats[frame].is_valid = 0; - } -} - -#if CONFIG_RATE_CTRL -static void accumulate_frame_tpl_stats(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - const GF_GROUP *gf_group = &cpi->twopass.gf_group; - int show_frame_count = 0; - int frame_idx; - // Accumulate tpl 
stats for each frame in the current group of picture. - for (frame_idx = 1; frame_idx < gf_group->gf_group_size; ++frame_idx) { - TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; - const int tpl_stride = tpl_frame->stride; - int64_t intra_cost_base = 0; - int64_t inter_cost_base = 0; - int64_t mc_dep_cost_base = 0; - int64_t mc_ref_cost_base = 0; - int64_t mc_flow_base = 0; - int row, col; - - if (!tpl_frame->is_valid) continue; - - for (row = 0; row < cm->mi_rows && tpl_frame->is_valid; ++row) { - for (col = 0; col < cm->mi_cols; ++col) { - TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col]; - intra_cost_base += this_stats->intra_cost; - inter_cost_base += this_stats->inter_cost; - mc_dep_cost_base += this_stats->mc_dep_cost; - mc_ref_cost_base += this_stats->mc_ref_cost; - mc_flow_base += this_stats->mc_flow; - } - } - - cpi->tpl_stats_info[show_frame_count].intra_cost = intra_cost_base; - cpi->tpl_stats_info[show_frame_count].inter_cost = inter_cost_base; - cpi->tpl_stats_info[show_frame_count].mc_dep_cost = mc_dep_cost_base; - cpi->tpl_stats_info[show_frame_count].mc_ref_cost = mc_ref_cost_base; - cpi->tpl_stats_info[show_frame_count].mc_flow = mc_flow_base; - - ++show_frame_count; - } -} -#endif // CONFIG_RATE_CTRL - -static void setup_tpl_stats(VP9_COMP *cpi) { - GF_PICTURE gf_picture[MAX_ARF_GOP_SIZE]; - const GF_GROUP *gf_group = &cpi->twopass.gf_group; - int tpl_group_frames = 0; - int frame_idx; - cpi->tpl_bsize = BLOCK_32X32; - - init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames); - - init_tpl_stats(cpi); - - // Backward propagation from tpl_group_frames to 1. - for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx) { - if (gf_picture[frame_idx].update_type == USE_BUF_FRAME) continue; - mc_flow_dispenser(cpi, gf_picture, frame_idx, cpi->tpl_bsize); - } -#if CONFIG_NON_GREEDY_MV - cpi->tpl_ready = 1; -#if DUMP_TPL_STATS - dump_tpl_stats(cpi, tpl_group_frames, gf_group, gf_picture, cpi->tpl_bsize); -#endif // DUMP_TPL_STATS -#endif // CONFIG_NON_GREEDY_MV - -#if CONFIG_RATE_CTRL - if (cpi->oxcf.use_simple_encode_api) { - accumulate_frame_tpl_stats(cpi); - } -#endif // CONFIG_RATE_CTRL -} - void vp9_get_ref_frame_info(FRAME_UPDATE_TYPE update_type, int ref_frame_flags, RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int *ref_frame_coding_indexes, @@ -7906,9 +6491,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, if (gf_group_index == 1 && cpi->twopass.gf_group.update_type[gf_group_index] == ARF_UPDATE && cpi->sf.enable_tpl_model) { - init_tpl_buffer(cpi); + vp9_init_tpl_buffer(cpi); vp9_estimate_qp_gop(cpi); - setup_tpl_stats(cpi); + vp9_setup_tpl_stats(cpi); } #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, setup_tpl_stats_time); diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 33a2844d3e..02c7400fbc 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1474,6 +1474,19 @@ int vp9_get_psnr(const VP9_COMP *cpi, PSNR_STATS *psnr); #define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl)) +static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) { + RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx]; + if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows || + new_fb_ptr->mi_cols < cm->mi_cols) { + vpx_free(new_fb_ptr->mvs); + CHECK_MEM_ERROR(cm, new_fb_ptr->mvs, + (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, + sizeof(*new_fb_ptr->mvs))); + new_fb_ptr->mi_rows 
= cm->mi_rows; + new_fb_ptr->mi_cols = cm->mi_cols; + } +} + #if CONFIG_COLLECT_COMPONENT_TIMING static INLINE void start_timing(VP9_COMP *cpi, int component) { vpx_usec_timer_start(&cpi->component_timer[component]); diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c new file mode 100644 index 0000000000..b0c735167e --- /dev/null +++ b/vp9/encoder/vp9_tpl_model.c @@ -0,0 +1,1400 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <math.h> + +#include "./vpx_dsp_rtcd.h" +#if CONFIG_NON_GREEDY_MV +#include "vp9/common/vp9_mvref_common.h" +#endif +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_tpl_model.h" + +static void init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture, + const GF_GROUP *gf_group, int *tpl_group_frames) { + VP9_COMMON *cm = &cpi->common; + int frame_idx = 0; + int i; + int gld_index = -1; + int alt_index = -1; + int lst_index = -1; + int arf_index_stack[MAX_ARF_LAYERS]; + int arf_stack_size = 0; + int extend_frame_count = 0; + int pframe_qindex = cpi->tpl_stats[2].base_qindex; + int frame_gop_offset = 0; + + RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs; + int8_t recon_frame_index[REFS_PER_FRAME + MAX_ARF_LAYERS]; + + memset(recon_frame_index, -1, sizeof(recon_frame_index)); + stack_init(arf_index_stack, MAX_ARF_LAYERS); + + // TODO(jingning): To be used later for gf frame type parsing. + (void)gf_group; + + for (i = 0; i < FRAME_BUFFERS; ++i) { + if (frame_bufs[i].ref_count == 0) { + alloc_frame_mvs(cm, i); + if (vpx_realloc_frame_buffer(&frame_bufs[i].buf, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + + recon_frame_index[frame_idx] = i; + ++frame_idx; + + if (frame_idx >= REFS_PER_FRAME + cpi->oxcf.enable_auto_arf) break; + } + } + + for (i = 0; i < REFS_PER_FRAME + 1; ++i) { + assert(recon_frame_index[i] >= 0); + cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf; + } + + *tpl_group_frames = 0; + + // Initialize Golden reference frame.
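/*
 * Editorial note (not part of the patch): ref_frame[0..2] store positions
 * within gf_picture[] itself (gld_index / lst_index / alt_index), not VP9
 * reference-frame enums, and -1 marks an empty slot. For example, after the
 * two initializations below, gf_picture[1] (the base-layer ARF) references
 * only gf_picture[0] (the golden frame); its other two slots are still -1.
 */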
+ gf_picture[0].frame = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + for (i = 0; i < 3; ++i) gf_picture[0].ref_frame[i] = -1; + gf_picture[0].update_type = gf_group->update_type[0]; + gld_index = 0; + ++*tpl_group_frames; + + // Initialize base layer ARF frame + gf_picture[1].frame = cpi->Source; + gf_picture[1].ref_frame[0] = gld_index; + gf_picture[1].ref_frame[1] = lst_index; + gf_picture[1].ref_frame[2] = alt_index; + gf_picture[1].update_type = gf_group->update_type[1]; + alt_index = 1; + ++*tpl_group_frames; + + // Initialize P frames + for (frame_idx = 2; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { + struct lookahead_entry *buf; + frame_gop_offset = gf_group->frame_gop_index[frame_idx]; + buf = vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); + + if (buf == NULL) break; + + gf_picture[frame_idx].frame = &buf->img; + gf_picture[frame_idx].ref_frame[0] = gld_index; + gf_picture[frame_idx].ref_frame[1] = lst_index; + gf_picture[frame_idx].ref_frame[2] = alt_index; + gf_picture[frame_idx].update_type = gf_group->update_type[frame_idx]; + + switch (gf_group->update_type[frame_idx]) { + case ARF_UPDATE: + stack_push(arf_index_stack, alt_index, arf_stack_size); + ++arf_stack_size; + alt_index = frame_idx; + break; + case LF_UPDATE: lst_index = frame_idx; break; + case OVERLAY_UPDATE: + gld_index = frame_idx; + alt_index = stack_pop(arf_index_stack, arf_stack_size); + --arf_stack_size; + break; + case USE_BUF_FRAME: + lst_index = alt_index; + alt_index = stack_pop(arf_index_stack, arf_stack_size); + --arf_stack_size; + break; + default: break; + } + + ++*tpl_group_frames; + + // The length of group of pictures is baseline_gf_interval, plus the + // beginning golden frame from last GOP, plus the last overlay frame in + // the same GOP. + if (frame_idx == gf_group->gf_group_size) break; + } + + alt_index = -1; + ++frame_idx; + ++frame_gop_offset; + + // Extend two frames outside the current gf group. + for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) { + struct lookahead_entry *buf = + vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); + + if (buf == NULL) break; + + cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex; + + gf_picture[frame_idx].frame = &buf->img; + gf_picture[frame_idx].ref_frame[0] = gld_index; + gf_picture[frame_idx].ref_frame[1] = lst_index; + gf_picture[frame_idx].ref_frame[2] = alt_index; + gf_picture[frame_idx].update_type = LF_UPDATE; + lst_index = frame_idx; + ++*tpl_group_frames; + ++extend_frame_count; + ++frame_gop_offset; + } +} + +static void init_tpl_stats(VP9_COMP *cpi) { + int frame_idx; + for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + memset(tpl_frame->tpl_stats_ptr, 0, + tpl_frame->height * tpl_frame->width * + sizeof(*tpl_frame->tpl_stats_ptr)); + tpl_frame->is_valid = 0; + } +} + +#if CONFIG_NON_GREEDY_MV +static uint32_t full_pixel_motion_search(VP9_COMP *cpi, ThreadData *td, + MotionField *motion_field, + int frame_idx, uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, int stride, + BLOCK_SIZE bsize, int mi_row, + int mi_col, MV *mv) { + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + int step_param; + uint32_t bestsme = UINT_MAX; + const MvLimits tmp_mv_limits = x->mv_limits; + // lambda is used to adjust the importance of motion vector consistency. + // TODO(angiebird): Figure out lambda's proper value. 
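/*
 * Editorial worked value (not part of the patch): build_motion_field() sets
 * tpl_frame->lambda = (pw * ph) >> 2, so a 32x32 TPL block gets
 * lambda = (32 * 32) >> 2 = 256; larger blocks therefore pay proportionally
 * more for full-pixel candidates that stray from their neighbors' motion.
 */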
+ const int lambda = cpi->tpl_stats[frame_idx].lambda; + int_mv nb_full_mvs[NB_MVS_NUM]; + int nb_full_mv_num; + + MV best_ref_mv1 = { 0, 0 }; + MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ + + best_ref_mv1_full.col = best_ref_mv1.col >> 3; + best_ref_mv1_full.row = best_ref_mv1.row >> 3; + + // Setup frame pointers + x->plane[0].src.buf = cur_frame_buf; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = ref_frame_buf; + xd->plane[0].pre[0].stride = stride; + + step_param = mv_sf->reduce_first_step_size; + step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); + + vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); + + nb_full_mv_num = + vp9_prepare_nb_full_mvs(motion_field, mi_row, mi_col, nb_full_mvs); + vp9_full_pixel_diamond_new(cpi, x, bsize, &best_ref_mv1_full, step_param, + lambda, 1, nb_full_mvs, nb_full_mv_num, mv); + + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + + return bestsme; +} + +static uint32_t sub_pixel_motion_search(VP9_COMP *cpi, ThreadData *td, + uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, int stride, + BLOCK_SIZE bsize, MV *mv) { + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + uint32_t bestsme = UINT_MAX; + uint32_t distortion; + uint32_t sse; + int cost_list[5]; + + MV best_ref_mv1 = { 0, 0 }; + + // Setup frame pointers + x->plane[0].src.buf = cur_frame_buf; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = ref_frame_buf; + xd->plane[0].pre[0].stride = stride; + + // TODO(yunqing): may use higher tap interp filter than 2 taps. + // Ignore mv costing by sending NULL pointer instead of cost array + bestsme = cpi->find_fractional_mv_step( + x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level, + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0, + USE_2_TAPS); + + return bestsme; +} + +#else // CONFIG_NON_GREEDY_MV +static uint32_t motion_compensated_prediction(VP9_COMP *cpi, ThreadData *td, + uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, + int stride, BLOCK_SIZE bsize, + MV *mv) { + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + const SEARCH_METHODS search_method = NSTEP; + int step_param; + int sadpb = x->sadperbit16; + uint32_t bestsme = UINT_MAX; + uint32_t distortion; + uint32_t sse; + int cost_list[5]; + const MvLimits tmp_mv_limits = x->mv_limits; + + MV best_ref_mv1 = { 0, 0 }; + MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ + + best_ref_mv1_full.col = best_ref_mv1.col >> 3; + best_ref_mv1_full.row = best_ref_mv1.row >> 3; + + // Setup frame pointers + x->plane[0].src.buf = cur_frame_buf; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = ref_frame_buf; + xd->plane[0].pre[0].stride = stride; + + step_param = mv_sf->reduce_first_step_size; + step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); + + vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); + + vp9_full_pixel_search(cpi, x, bsize, &best_ref_mv1_full, step_param, + search_method, sadpb, cond_cost_list(cpi, cost_list), + &best_ref_mv1, mv, 0, 0); + + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + + // TODO(yunqing): may use higher tap interp filter than 2 taps. 
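/*
 * Editorial note (not part of the patch): MVs here are in 1/8-pel units
 * (MV_PRECISION_Q3), so the >> 3 above floors to full-pel; e.g. a 1/8-pel
 * MV of (13, -5) becomes the full-pel start (1, -1) that the fractional
 * step below then refines.
 */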
+ // Ignore mv costing by sending NULL pointer instead of cost array + bestsme = cpi->find_fractional_mv_step( + x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level, + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0, + USE_2_TAPS); + + return bestsme; +} +#endif + +static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row, + int ref_pos_col, int block, BLOCK_SIZE bsize) { + int width = 0, height = 0; + int bw = 4 << b_width_log2_lookup[bsize]; + int bh = 4 << b_height_log2_lookup[bsize]; + + switch (block) { + case 0: + width = grid_pos_col + bw - ref_pos_col; + height = grid_pos_row + bh - ref_pos_row; + break; + case 1: + width = ref_pos_col + bw - grid_pos_col; + height = grid_pos_row + bh - ref_pos_row; + break; + case 2: + width = grid_pos_col + bw - ref_pos_col; + height = ref_pos_row + bh - grid_pos_row; + break; + case 3: + width = ref_pos_col + bw - grid_pos_col; + height = ref_pos_row + bh - grid_pos_row; + break; + default: assert(0); + } + + return width * height; +} + +static int round_floor(int ref_pos, int bsize_pix) { + int round; + if (ref_pos < 0) + round = -(1 + (-ref_pos - 1) / bsize_pix); + else + round = ref_pos / bsize_pix; + + return round; +} + +static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col, + BLOCK_SIZE bsize, int stride) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col]; + int idx, idy; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col + idx]; + const int64_t mc_flow = tpl_ptr->mc_flow; + const int64_t mc_ref_cost = tpl_ptr->mc_ref_cost; + *tpl_ptr = *src_stats; + tpl_ptr->mc_flow = mc_flow; + tpl_ptr->mc_ref_cost = mc_ref_cost; + tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow; + } + } +} + +static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, + int mi_row, int mi_col, const BLOCK_SIZE bsize) { + TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index]; + TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr; + MV mv = tpl_stats->mv.as_mv; + int mv_row = mv.row >> 3; + int mv_col = mv.col >> 3; + + int ref_pos_row = mi_row * MI_SIZE + mv_row; + int ref_pos_col = mi_col * MI_SIZE + mv_col; + + const int bw = 4 << b_width_log2_lookup[bsize]; + const int bh = 4 << b_height_log2_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int pix_num = bw * bh; + + // top-left on grid block location in pixel + int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh; + int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw; + int block; + + for (block = 0; block < 4; ++block) { + int grid_pos_row = grid_pos_row_base + bh * (block >> 1); + int grid_pos_col = grid_pos_col_base + bw * (block & 0x01); + + if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE && + grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) { + int overlap_area = get_overlap_area( + grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize); + int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height; + int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width; + + int64_t mc_flow = tpl_stats->mc_dep_cost - + 
(tpl_stats->mc_dep_cost * tpl_stats->inter_cost) / + tpl_stats->intra_cost; + + int idx, idy; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *des_stats = + &ref_stats[(ref_mi_row + idy) * ref_tpl_frame->stride + + (ref_mi_col + idx)]; + + des_stats->mc_flow += (mc_flow * overlap_area) / pix_num; + des_stats->mc_ref_cost += + ((tpl_stats->intra_cost - tpl_stats->inter_cost) * overlap_area) / + pix_num; + assert(overlap_area >= 0); + } + } + } + } +} + +static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, + int mi_row, int mi_col, const BLOCK_SIZE bsize) { + int idx, idy; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *tpl_ptr = + &tpl_stats[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)]; + tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx, + BLOCK_8X8); + } + } +} + +static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + TX_SIZE tx_size, int64_t *recon_error, + int64_t *sse) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; + uint16_t eob; + int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; + const int shift = tx_size == TX_32X32 ? 0 : 2; + + // skip block condition should be handled before this is called. + assert(!x->skip_block); + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, + qcoeff, dqcoeff, pd->dequant, &eob, + scan_order->scan, scan_order->iscan); + } else { + vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, + dqcoeff, pd->dequant, &eob, scan_order->scan, + scan_order->iscan); + } +#else + vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, + dqcoeff, pd->dequant, &eob, scan_order->scan, + scan_order->iscan); +#endif // CONFIG_VP9_HIGHBITDEPTH + + *recon_error = vp9_block_error(coeff, dqcoeff, pix_num, sse) >> shift; + *recon_error = VPXMAX(*recon_error, 1); + + *sse = (*sse) >> shift; + *sse = VPXMAX(*sse, 1); +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size) { + // TODO(sdeng): Implement SIMD based high bit-depth Hadamard transforms. 
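/*
 * Editorial note (not part of the patch): the Hadamard transform dispatched
 * below is a cheap frequency-domain proxy for the codec's DCT; its 2-point
 * butterfly is (a, b) -> (a + b, a - b), applied recursively across rows
 * and columns for the 8x8/16x16/32x32 sizes. mode_estimation() then takes
 * the SATD, sum(|coeff[i]|), via vpx_satd()/vpx_highbd_satd() as the
 * intra/inter cost.
 */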
+ switch (tx_size) { + case TX_8X8: vpx_highbd_hadamard_8x8(src_diff, bw, coeff); break; + case TX_16X16: vpx_highbd_hadamard_16x16(src_diff, bw, coeff); break; + case TX_32X32: vpx_highbd_hadamard_32x32(src_diff, bw, coeff); break; + default: assert(0); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size) { + switch (tx_size) { + case TX_8X8: vpx_hadamard_8x8(src_diff, bw, coeff); break; + case TX_16X16: vpx_hadamard_16x16(src_diff, bw, coeff); break; + case TX_32X32: vpx_hadamard_32x32(src_diff, bw, coeff); break; + default: assert(0); + } +} + +static void set_mv_limits(const VP9_COMMON *cm, MACROBLOCK *x, int mi_row, + int mi_col) { + x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND)); + x->mv_limits.row_max = + (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * VP9_INTERP_EXTEND); + x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND)); + x->mv_limits.col_max = + ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND); +} + +static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, + struct scale_factors *sf, GF_PICTURE *gf_picture, + int frame_idx, TplDepFrame *tpl_frame, + int16_t *src_diff, tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row, + int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size, + YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor, + int64_t *recon_error, int64_t *sse) { + VP9_COMMON *cm = &cpi->common; + ThreadData *td = &cpi->td; + + const int bw = 4 << b_width_log2_lookup[bsize]; + const int bh = 4 << b_height_log2_lookup[bsize]; + const int pix_num = bw * bh; + int best_rf_idx = -1; + int_mv best_mv; + int64_t best_inter_cost = INT64_MAX; + int64_t inter_cost; + int rf_idx; + const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP]; + + int64_t best_intra_cost = INT64_MAX; + int64_t intra_cost; + PREDICTION_MODE mode; + int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + MODE_INFO mi_above, mi_left; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + TplDepStats *tpl_stats = + &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + + xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); + xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8; + xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); + xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8; + xd->above_mi = (mi_row > 0) ? &mi_above : NULL; + xd->left_mi = (mi_col > 0) ? 
&mi_left : NULL; + + // Intra prediction search + for (mode = DC_PRED; mode <= TM_PRED; ++mode) { + uint8_t *src, *dst; + int src_stride, dst_stride; + + src = xd->cur_buf->y_buffer + mb_y_offset; + src_stride = xd->cur_buf->y_stride; + + dst = &predictor[0]; + dst_stride = bw; + + xd->mi[0]->sb_type = bsize; + xd->mi[0]->ref_frame[0] = INTRA_FRAME; + + vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode, src, + src_stride, dst, dst_stride, 0, 0, 0); + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, + dst_stride, xd->bd); + vp9_highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_highbd_satd(coeff, pix_num); + } else { + vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, + dst_stride); + vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_satd(coeff, pix_num); + } +#else + vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, dst_stride); + vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_satd(coeff, pix_num); +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (intra_cost < best_intra_cost) best_intra_cost = intra_cost; + } + + // Motion compensated prediction + best_mv.as_int = 0; + + set_mv_limits(cm, x, mi_row, mi_col); + + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + int_mv mv; +#if CONFIG_NON_GREEDY_MV + MotionField *motion_field; +#endif + if (ref_frame[rf_idx] == NULL) continue; + +#if CONFIG_NON_GREEDY_MV + (void)td; + motion_field = vp9_motion_field_info_get_motion_field( + &cpi->motion_field_info, frame_idx, rf_idx, bsize); + mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); +#else + motion_compensated_prediction(cpi, td, xd->cur_buf->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, bsize, &mv.as_mv); +#endif + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(ref_frame[rf_idx]->y_buffer + mb_y_offset), + ref_frame[rf_idx]->y_stride, CONVERT_TO_SHORTPTR(&predictor[0]), bw, + &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, + mi_row * MI_SIZE, xd->bd); + vpx_highbd_subtract_block( + bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw, xd->bd); + vp9_highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_highbd_satd(coeff, pix_num); + } else { + vp9_build_inter_predictor( + ref_frame[rf_idx]->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_stride, &predictor[0], bw, &mv.as_mv, sf, bw, bh, + 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE); + vpx_subtract_block(bh, bw, src_diff, bw, + xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw); + vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_satd(coeff, pix_num); + } +#else + vp9_build_inter_predictor(ref_frame[rf_idx]->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_stride, &predictor[0], bw, + &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, + mi_col * MI_SIZE, mi_row * MI_SIZE); + vpx_subtract_block(bh, bw, src_diff, bw, + xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw); + vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_satd(coeff, pix_num); +#endif + + if (inter_cost < best_inter_cost) { + best_rf_idx = rf_idx; + best_inter_cost = inter_cost; + best_mv.as_int = mv.as_int; + 
get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error, + sse); + } + } + best_intra_cost = VPXMAX(best_intra_cost, 1); + best_inter_cost = VPXMIN(best_intra_cost, best_inter_cost); + tpl_stats->inter_cost = VPXMAX( + 1, (best_inter_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); + tpl_stats->intra_cost = VPXMAX( + 1, (best_intra_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); + tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx]; + tpl_stats->mv.as_int = best_mv.as_int; +} + +#if CONFIG_NON_GREEDY_MV +static int get_block_src_pred_buf(MACROBLOCKD *xd, GF_PICTURE *gf_picture, + int frame_idx, int rf_idx, int mi_row, + int mi_col, struct buf_2d *src, + struct buf_2d *pre) { + const int mb_y_offset = + mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + YV12_BUFFER_CONFIG *ref_frame = NULL; + int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + ref_frame = gf_picture[ref_frame_idx].frame; + src->buf = xd->cur_buf->y_buffer + mb_y_offset; + src->stride = xd->cur_buf->y_stride; + pre->buf = ref_frame->y_buffer + mb_y_offset; + pre->stride = ref_frame->y_stride; + assert(src->stride == pre->stride); + return 1; + } else { + printf("invalid ref_frame_idx"); + assert(ref_frame_idx != -1); + return 0; + } +} + +#define kMvPreCheckLines 5 +#define kMvPreCheckSize 15 + +#define MV_REF_POS_NUM 3 +POSITION mv_ref_pos[MV_REF_POS_NUM] = { + { -1, 0 }, + { 0, -1 }, + { -1, -1 }, +}; + +static int_mv *get_select_mv(VP9_COMP *cpi, TplDepFrame *tpl_frame, int mi_row, + int mi_col) { + return &cpi->select_mv_arr[mi_row * tpl_frame->stride + mi_col]; +} + +static int_mv find_ref_mv(int mv_mode, VP9_COMP *cpi, TplDepFrame *tpl_frame, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + int i; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int_mv nearest_mv, near_mv, invalid_mv; + nearest_mv.as_int = INVALID_MV; + near_mv.as_int = INVALID_MV; + invalid_mv.as_int = INVALID_MV; + for (i = 0; i < MV_REF_POS_NUM; ++i) { + int nb_row = mi_row + mv_ref_pos[i].row * mi_height; + int nb_col = mi_col + mv_ref_pos[i].col * mi_width; + assert(mv_ref_pos[i].row <= 0); + assert(mv_ref_pos[i].col <= 0); + if (nb_row >= 0 && nb_col >= 0) { + if (nearest_mv.as_int == INVALID_MV) { + nearest_mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col); + } else { + int_mv mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col); + if (mv.as_int == nearest_mv.as_int) { + continue; + } else { + near_mv = mv; + break; + } + } + } + } + if (nearest_mv.as_int == INVALID_MV) { + nearest_mv.as_mv.row = 0; + nearest_mv.as_mv.col = 0; + } + if (near_mv.as_int == INVALID_MV) { + near_mv.as_mv.row = 0; + near_mv.as_mv.col = 0; + } + if (mv_mode == NEAREST_MV_MODE) { + return nearest_mv; + } + if (mv_mode == NEAR_MV_MODE) { + return near_mv; + } + assert(0); + return invalid_mv; +} + +static int_mv get_mv_from_mv_mode(int mv_mode, VP9_COMP *cpi, + MotionField *motion_field, + TplDepFrame *tpl_frame, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + int_mv mv; + switch (mv_mode) { + case ZERO_MV_MODE: + mv.as_mv.row = 0; + mv.as_mv.col = 0; + break; + case NEW_MV_MODE: + mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); + break; + case NEAREST_MV_MODE: + mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col); + break; + case NEAR_MV_MODE: + mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col); + break; + default: + mv.as_int = INVALID_MV; + 
assert(0); + break; + } + return mv; +} + +static double get_mv_dist(int mv_mode, VP9_COMP *cpi, MACROBLOCKD *xd, + GF_PICTURE *gf_picture, MotionField *motion_field, + int frame_idx, TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int_mv *mv) { + uint32_t sse; + struct buf_2d src; + struct buf_2d pre; + MV full_mv; + *mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame, bsize, + mi_row, mi_col); + full_mv = get_full_mv(&mv->as_mv); + if (get_block_src_pred_buf(xd, gf_picture, frame_idx, rf_idx, mi_row, mi_col, + &src, &pre)) { + // TODO(angiebird): Consider subpixel when computing the sse. + cpi->fn_ptr[bsize].vf(src.buf, src.stride, get_buf_from_mv(&pre, &full_mv), + pre.stride, &sse); + return (double)(sse << VP9_DIST_SCALE_LOG2); + } else { + assert(0); + return 0; + } +} + +static int get_mv_mode_cost(int mv_mode) { + // TODO(angiebird): The probabilities are roughly inferred from + // default_inter_mode_probs. Check if there is a better way to set the + // probabilities. + const int zero_mv_prob = 16; + const int new_mv_prob = 24 * 1; + const int ref_mv_prob = 256 - zero_mv_prob - new_mv_prob; + assert(zero_mv_prob + new_mv_prob + ref_mv_prob == 256); + switch (mv_mode) { + case ZERO_MV_MODE: return vp9_prob_cost[zero_mv_prob]; break; + case NEW_MV_MODE: return vp9_prob_cost[new_mv_prob]; break; + case NEAREST_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break; + case NEAR_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break; + default: assert(0); return -1; + } +} + +static INLINE double get_mv_diff_cost(MV *new_mv, MV *ref_mv) { + double mv_diff_cost = log2(1 + abs(new_mv->row - ref_mv->row)) + + log2(1 + abs(new_mv->col - ref_mv->col)); + mv_diff_cost *= (1 << VP9_PROB_COST_SHIFT); + return mv_diff_cost; +} +static double get_mv_cost(int mv_mode, VP9_COMP *cpi, MotionField *motion_field, + TplDepFrame *tpl_frame, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + double mv_cost = get_mv_mode_cost(mv_mode); + if (mv_mode == NEW_MV_MODE) { + MV new_mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame, + bsize, mi_row, mi_col) + .as_mv; + MV nearest_mv = get_mv_from_mv_mode(NEAREST_MV_MODE, cpi, motion_field, + tpl_frame, bsize, mi_row, mi_col) + .as_mv; + MV near_mv = get_mv_from_mv_mode(NEAR_MV_MODE, cpi, motion_field, tpl_frame, + bsize, mi_row, mi_col) + .as_mv; + double nearest_cost = get_mv_diff_cost(&new_mv, &nearest_mv); + double near_cost = get_mv_diff_cost(&new_mv, &near_mv); + mv_cost += nearest_cost < near_cost ? 
nearest_cost : near_cost; + } + return mv_cost; +} + +static double eval_mv_mode(int mv_mode, VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, MotionField *motion_field, + int frame_idx, TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int_mv *mv) { + MACROBLOCKD *xd = &x->e_mbd; + double mv_dist = + get_mv_dist(mv_mode, cpi, xd, gf_picture, motion_field, frame_idx, + tpl_frame, rf_idx, bsize, mi_row, mi_col, mv); + double mv_cost = + get_mv_cost(mv_mode, cpi, motion_field, tpl_frame, bsize, mi_row, mi_col); + double mult = 180; + + return mv_cost + mult * log2f(1 + mv_dist); +} + +static int find_best_ref_mv_mode(VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, + MotionField *motion_field, int frame_idx, + TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col, + double *rd, int_mv *mv) { + int best_mv_mode = ZERO_MV_MODE; + int update = 0; + int mv_mode; + *rd = 0; + for (mv_mode = 0; mv_mode < MAX_MV_MODE; ++mv_mode) { + double this_rd; + int_mv this_mv; + if (mv_mode == NEW_MV_MODE) { + continue; + } + this_rd = eval_mv_mode(mv_mode, cpi, x, gf_picture, motion_field, frame_idx, + tpl_frame, rf_idx, bsize, mi_row, mi_col, &this_mv); + if (update == 0) { + *rd = this_rd; + *mv = this_mv; + best_mv_mode = mv_mode; + update = 1; + } else { + if (this_rd < *rd) { + *rd = this_rd; + *mv = this_mv; + best_mv_mode = mv_mode; + } + } + } + return best_mv_mode; +} + +static void predict_mv_mode(VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, MotionField *motion_field, + int frame_idx, TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int tmp_mv_mode_arr[kMvPreCheckSize]; + int *mv_mode_arr = tpl_frame->mv_mode_arr[rf_idx]; + double *rd_diff_arr = tpl_frame->rd_diff_arr[rf_idx]; + int_mv *select_mv_arr = cpi->select_mv_arr; + int_mv tmp_select_mv_arr[kMvPreCheckSize]; + int stride = tpl_frame->stride; + double new_mv_rd = 0; + double no_new_mv_rd = 0; + double this_new_mv_rd = 0; + double this_no_new_mv_rd = 0; + int idx; + int tmp_idx; + assert(kMvPreCheckSize == (kMvPreCheckLines * (kMvPreCheckLines + 1)) >> 1); + + // no new mv + // diagonal scan order + tmp_idx = 0; + for (idx = 0; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + double this_rd; + int_mv *mv = &select_mv_arr[nb_row * stride + nb_col]; + mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode( + cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx, + bsize, nb_row, nb_col, &this_rd, mv); + if (r == 0 && c == 0) { + this_no_new_mv_rd = this_rd; + } + no_new_mv_rd += this_rd; + tmp_mv_mode_arr[tmp_idx] = mv_mode_arr[nb_row * stride + nb_col]; + tmp_select_mv_arr[tmp_idx] = select_mv_arr[nb_row * stride + nb_col]; + ++tmp_idx; + } + } + } + + // new mv + mv_mode_arr[mi_row * stride + mi_col] = NEW_MV_MODE; + this_new_mv_rd = eval_mv_mode( + NEW_MV_MODE, cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, + rf_idx, bsize, mi_row, mi_col, &select_mv_arr[mi_row * stride + mi_col]); + new_mv_rd = this_new_mv_rd; + // We start from idx = 1 because idx = 0 is evaluated as NEW_MV_MODE + // beforehand. 
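/*
 * Editorial worked equation (not part of the patch): letting
 *   no_new_mv_rd = RD of the 15-block neighborhood when this block keeps a
 *                  reference MV mode, and
 *   new_mv_rd    = RD of the same neighborhood when it switches to
 *                  NEW_MV_MODE,
 * the value stored at the end of this function,
 *   rd_diff = (no_new_mv_rd - this_no_new_mv_rd)
 *           - (new_mv_rd - this_new_mv_rd),
 * subtracts the current block's own RD from both sides, so it measures only
 * the cost the NEW_MV choice imposes on the downstream neighbors.
 */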
+ for (idx = 1; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + double this_rd; + int_mv *mv = &select_mv_arr[nb_row * stride + nb_col]; + mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode( + cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx, + bsize, nb_row, nb_col, &this_rd, mv); + new_mv_rd += this_rd; + } + } + } + + // update best_mv_mode + tmp_idx = 0; + if (no_new_mv_rd < new_mv_rd) { + for (idx = 0; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + mv_mode_arr[nb_row * stride + nb_col] = tmp_mv_mode_arr[tmp_idx]; + select_mv_arr[nb_row * stride + nb_col] = tmp_select_mv_arr[tmp_idx]; + ++tmp_idx; + } + } + } + rd_diff_arr[mi_row * stride + mi_col] = 0; + } else { + rd_diff_arr[mi_row * stride + mi_col] = + (no_new_mv_rd - this_no_new_mv_rd) - (new_mv_rd - this_new_mv_rd); + } +} + +static void predict_mv_mode_arr(VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, + MotionField *motion_field, int frame_idx, + TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int unit_rows = tpl_frame->mi_rows / mi_height; + const int unit_cols = tpl_frame->mi_cols / mi_width; + const int max_diagonal_lines = unit_rows + unit_cols - 1; + int idx; + for (idx = 0; idx < max_diagonal_lines; ++idx) { + int r; + for (r = VPXMAX(idx - unit_cols + 1, 0); r <= VPXMIN(idx, unit_rows - 1); + ++r) { + int c = idx - r; + int mi_row = r * mi_height; + int mi_col = c * mi_width; + assert(c >= 0 && c < unit_cols); + assert(mi_row >= 0 && mi_row < tpl_frame->mi_rows); + assert(mi_col >= 0 && mi_col < tpl_frame->mi_cols); + predict_mv_mode(cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, + rf_idx, bsize, mi_row, mi_col); + } + } +} + +static void do_motion_search(VP9_COMP *cpi, ThreadData *td, + MotionField *motion_field, int frame_idx, + YV12_BUFFER_CONFIG *ref_frame, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + VP9_COMMON *cm = &cpi->common; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + const int mb_y_offset = + mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + assert(ref_frame != NULL); + set_mv_limits(cm, x, mi_row, mi_col); + { + int_mv mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); + uint8_t *cur_frame_buf = xd->cur_buf->y_buffer + mb_y_offset; + uint8_t *ref_frame_buf = ref_frame->y_buffer + mb_y_offset; + const int stride = xd->cur_buf->y_stride; + full_pixel_motion_search(cpi, td, motion_field, frame_idx, cur_frame_buf, + ref_frame_buf, stride, bsize, mi_row, mi_col, + &mv.as_mv); + sub_pixel_motion_search(cpi, td, cur_frame_buf, ref_frame_buf, stride, + bsize, &mv.as_mv); + vp9_motion_field_mi_set_mv(motion_field, mi_row, mi_col, mv); + } +} + +static void build_motion_field( + VP9_COMP *cpi, int frame_idx, + YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES], BLOCK_SIZE bsize) { + VP9_COMMON *cm = &cpi->common; + ThreadData *td = &cpi->td; + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int 
pw = num_4x4_blocks_wide_lookup[bsize] << 2; + const int ph = num_4x4_blocks_high_lookup[bsize] << 2; + int mi_row, mi_col; + int rf_idx; + + tpl_frame->lambda = (pw * ph) >> 2; + assert(pw * ph == tpl_frame->lambda << 2); + + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + MotionField *motion_field = vp9_motion_field_info_get_motion_field( + &cpi->motion_field_info, frame_idx, rf_idx, bsize); + if (ref_frame[rf_idx] == NULL) { + continue; + } + vp9_motion_field_reset_mvs(motion_field); + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + do_motion_search(cpi, td, motion_field, frame_idx, ref_frame[rf_idx], + bsize, mi_row, mi_col); + } + } + } +} +#endif // CONFIG_NON_GREEDY_MV + +static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, + int frame_idx, BLOCK_SIZE bsize) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame; + YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES] = { NULL, NULL, NULL }; + + VP9_COMMON *cm = &cpi->common; + struct scale_factors sf; + int rdmult, idx; + ThreadData *td = &cpi->td; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + int mi_row, mi_col; + +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, predictor16[32 * 32 * 3]); + DECLARE_ALIGNED(16, uint8_t, predictor8[32 * 32 * 3]); + uint8_t *predictor; +#else + DECLARE_ALIGNED(16, uint8_t, predictor[32 * 32 * 3]); +#endif + DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, qcoeff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); + + const TX_SIZE tx_size = max_txsize_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int64_t recon_error, sse; +#if CONFIG_NON_GREEDY_MV + int square_block_idx; + int rf_idx; +#endif + + // Setup scaling factor +#if CONFIG_VP9_HIGHBITDEPTH + vp9_setup_scale_factors_for_frame( + &sf, this_frame->y_crop_width, this_frame->y_crop_height, + this_frame->y_crop_width, this_frame->y_crop_height, + cpi->common.use_highbitdepth); + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + predictor = CONVERT_TO_BYTEPTR(predictor16); + else + predictor = predictor8; +#else + vp9_setup_scale_factors_for_frame( + &sf, this_frame->y_crop_width, this_frame->y_crop_height, + this_frame->y_crop_width, this_frame->y_crop_height); +#endif // CONFIG_VP9_HIGHBITDEPTH + + // Prepare reference frame pointers. If any reference frame slot is + // unavailable, the pointer will be set to Null. + for (idx = 0; idx < MAX_INTER_REF_FRAMES; ++idx) { + int rf_idx = gf_picture[frame_idx].ref_frame[idx]; + if (rf_idx != -1) ref_frame[idx] = gf_picture[rf_idx].frame; + } + + xd->mi = cm->mi_grid_visible; + xd->mi[0] = cm->mi; + xd->cur_buf = this_frame; + + // Get rd multiplier set up. 
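
  // Illustrative note, not part of this change: predict_mv_mode_arr() above
  // visits blocks in anti-diagonal (wavefront) order: every block on diagonal
  // r + c == d is decided before any block on diagonal d + 1. For a 2x3 grid
  // of units the order is (0,0), (0,1), (1,0), (0,2), (1,1), (1,2), so when a
  // block is reached its above and left neighbours, which sit on the previous
  // diagonal, already hold selected MVs, presumably what the NEAREST/NEAR
  // candidates are drawn from.
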
+ rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, tpl_frame->base_qindex); + set_error_per_bit(&cpi->td.mb, rdmult); + vp9_initialize_me_consts(cpi, &cpi->td.mb, tpl_frame->base_qindex); + + tpl_frame->is_valid = 1; + + cm->base_qindex = tpl_frame->base_qindex; + vp9_frame_init_quantizer(cpi); + +#if CONFIG_NON_GREEDY_MV + for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES; + ++square_block_idx) { + BLOCK_SIZE square_bsize = square_block_idx_to_bsize(square_block_idx); + build_motion_field(cpi, frame_idx, ref_frame, square_bsize); + } + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + MotionField *motion_field = vp9_motion_field_info_get_motion_field( + &cpi->motion_field_info, frame_idx, rf_idx, bsize); + predict_mv_mode_arr(cpi, x, gf_picture, motion_field, frame_idx, + tpl_frame, rf_idx, bsize); + } + } +#endif + + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame, + src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize, + tx_size, ref_frame, predictor, &recon_error, &sse); + // Motion flow dependency dispenser. + tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, + tpl_frame->stride); + + tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col, + bsize); + } + } +} + +#if CONFIG_NON_GREEDY_MV +#define DUMP_TPL_STATS 0 +#if DUMP_TPL_STATS +static void dump_buf(uint8_t *buf, int stride, int row, int col, int h, int w) { + int i, j; + printf("%d %d\n", h, w); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + printf("%d ", buf[(row + i) * stride + col + j]); + } + } + printf("\n"); +} + +static void dump_frame_buf(const YV12_BUFFER_CONFIG *frame_buf) { + dump_buf(frame_buf->y_buffer, frame_buf->y_stride, 0, 0, frame_buf->y_height, + frame_buf->y_width); + dump_buf(frame_buf->u_buffer, frame_buf->uv_stride, 0, 0, + frame_buf->uv_height, frame_buf->uv_width); + dump_buf(frame_buf->v_buffer, frame_buf->uv_stride, 0, 0, + frame_buf->uv_height, frame_buf->uv_width); +} + +static void dump_tpl_stats(const VP9_COMP *cpi, int tpl_group_frames, + const GF_GROUP *gf_group, + const GF_PICTURE *gf_picture, BLOCK_SIZE bsize) { + int frame_idx; + const VP9_COMMON *cm = &cpi->common; + int rf_idx; + for (frame_idx = 1; frame_idx < tpl_group_frames; ++frame_idx) { + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + const TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + int mi_row, mi_col; + int ref_frame_idx; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + YV12_BUFFER_CONFIG *ref_frame_buf = gf_picture[ref_frame_idx].frame; + const int gf_frame_offset = gf_group->frame_gop_index[frame_idx]; + const int ref_gf_frame_offset = + gf_group->frame_gop_index[ref_frame_idx]; + printf("=\n"); + printf( + "frame_idx %d mi_rows %d mi_cols %d bsize %d ref_frame_idx %d " + "rf_idx %d gf_frame_offset %d ref_gf_frame_offset %d\n", + frame_idx, cm->mi_rows, cm->mi_cols, mi_width * MI_SIZE, + ref_frame_idx, rf_idx, gf_frame_offset, ref_gf_frame_offset); + for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) { + for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { + if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) { + int_mv mv = 
vp9_motion_field_info_get_mv(&cpi->motion_field_info, + frame_idx, rf_idx, bsize, + mi_row, mi_col); + printf("%d %d %d %d\n", mi_row, mi_col, mv.as_mv.row, + mv.as_mv.col); + } + } + } + for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) { + for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { + if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) { + const TplDepStats *tpl_ptr = + &tpl_frame + ->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + printf("%f ", tpl_ptr->feature_score); + } + } + } + printf("\n"); + + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + const int mv_mode = + tpl_frame + ->mv_mode_arr[rf_idx][mi_row * tpl_frame->stride + mi_col]; + printf("%d ", mv_mode); + } + } + printf("\n"); + + dump_frame_buf(gf_picture[frame_idx].frame); + dump_frame_buf(ref_frame_buf); + } + } + } +} +#endif // DUMP_TPL_STATS +#endif // CONFIG_NON_GREEDY_MV + +void vp9_init_tpl_buffer(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + int frame; + + const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows); +#if CONFIG_NON_GREEDY_MV + int rf_idx; + + vpx_free(cpi->select_mv_arr); + CHECK_MEM_ERROR( + cm, cpi->select_mv_arr, + vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->select_mv_arr))); +#endif + + // TODO(jingning): Reduce the actual memory use for tpl model build up. + for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) { + if (cpi->tpl_stats[frame].width >= mi_cols && + cpi->tpl_stats[frame].height >= mi_rows && + cpi->tpl_stats[frame].tpl_stats_ptr) + continue; + +#if CONFIG_NON_GREEDY_MV + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]); + CHECK_MEM_ERROR( + cm, cpi->tpl_stats[frame].mv_mode_arr[rf_idx], + vpx_calloc(mi_rows * mi_cols * 4, + sizeof(*cpi->tpl_stats[frame].mv_mode_arr[rf_idx]))); + vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]); + CHECK_MEM_ERROR( + cm, cpi->tpl_stats[frame].rd_diff_arr[rf_idx], + vpx_calloc(mi_rows * mi_cols * 4, + sizeof(*cpi->tpl_stats[frame].rd_diff_arr[rf_idx]))); + } +#endif + vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); + CHECK_MEM_ERROR(cm, cpi->tpl_stats[frame].tpl_stats_ptr, + vpx_calloc(mi_rows * mi_cols, + sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr))); + cpi->tpl_stats[frame].is_valid = 0; + cpi->tpl_stats[frame].width = mi_cols; + cpi->tpl_stats[frame].height = mi_rows; + cpi->tpl_stats[frame].stride = mi_cols; + cpi->tpl_stats[frame].mi_rows = cm->mi_rows; + cpi->tpl_stats[frame].mi_cols = cm->mi_cols; + } + + for (frame = 0; frame < REF_FRAMES; ++frame) { + cpi->enc_frame_buf[frame].mem_valid = 0; + cpi->enc_frame_buf[frame].released = 1; + } +} + +void vp9_free_tpl_buffer(VP9_COMP *cpi) { + int frame; +#if CONFIG_NON_GREEDY_MV + vp9_free_motion_field_info(&cpi->motion_field_info); + vpx_free(cpi->select_mv_arr); +#endif + for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) { +#if CONFIG_NON_GREEDY_MV + int rf_idx; + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]); + vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]); + } +#endif + vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); + cpi->tpl_stats[frame].is_valid = 0; + } +} + +#if CONFIG_RATE_CTRL +static void accumulate_frame_tpl_stats(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + int show_frame_count = 0; + int frame_idx; + // Accumulate tpl stats 
for each frame in the current group of picture. + for (frame_idx = 1; frame_idx < gf_group->gf_group_size; ++frame_idx) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + const int tpl_stride = tpl_frame->stride; + int64_t intra_cost_base = 0; + int64_t inter_cost_base = 0; + int64_t mc_dep_cost_base = 0; + int64_t mc_ref_cost_base = 0; + int64_t mc_flow_base = 0; + int row, col; + + if (!tpl_frame->is_valid) continue; + + for (row = 0; row < cm->mi_rows && tpl_frame->is_valid; ++row) { + for (col = 0; col < cm->mi_cols; ++col) { + TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col]; + intra_cost_base += this_stats->intra_cost; + inter_cost_base += this_stats->inter_cost; + mc_dep_cost_base += this_stats->mc_dep_cost; + mc_ref_cost_base += this_stats->mc_ref_cost; + mc_flow_base += this_stats->mc_flow; + } + } + + cpi->tpl_stats_info[show_frame_count].intra_cost = intra_cost_base; + cpi->tpl_stats_info[show_frame_count].inter_cost = inter_cost_base; + cpi->tpl_stats_info[show_frame_count].mc_dep_cost = mc_dep_cost_base; + cpi->tpl_stats_info[show_frame_count].mc_ref_cost = mc_ref_cost_base; + cpi->tpl_stats_info[show_frame_count].mc_flow = mc_flow_base; + + ++show_frame_count; + } +} +#endif // CONFIG_RATE_CTRL + +void vp9_setup_tpl_stats(VP9_COMP *cpi) { + GF_PICTURE gf_picture[MAX_ARF_GOP_SIZE]; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + int tpl_group_frames = 0; + int frame_idx; + cpi->tpl_bsize = BLOCK_32X32; + + init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames); + + init_tpl_stats(cpi); + + // Backward propagation from tpl_group_frames to 1. + for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx) { + if (gf_picture[frame_idx].update_type == USE_BUF_FRAME) continue; + mc_flow_dispenser(cpi, gf_picture, frame_idx, cpi->tpl_bsize); + } +#if CONFIG_NON_GREEDY_MV + cpi->tpl_ready = 1; +#if DUMP_TPL_STATS + dump_tpl_stats(cpi, tpl_group_frames, gf_group, gf_picture, cpi->tpl_bsize); +#endif // DUMP_TPL_STATS +#endif // CONFIG_NON_GREEDY_MV + +#if CONFIG_RATE_CTRL + if (cpi->oxcf.use_simple_encode_api) { + accumulate_frame_tpl_stats(cpi); + } +#endif // CONFIG_RATE_CTRL +} diff --git a/vp9/encoder/vp9_tpl_model.h b/vp9/encoder/vp9_tpl_model.h new file mode 100644 index 0000000000..86a7734f82 --- /dev/null +++ b/vp9/encoder/vp9_tpl_model.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_VP9_ENCODER_VP9_TPL_MODEL_H_ +#define VPX_VP9_ENCODER_VP9_TPL_MODEL_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef M_LOG2_E +#define M_LOG2_E 0.693147180559945309417 +#endif +#define log2f(x) (log(x) / (float)M_LOG2_E) + +typedef struct GF_PICTURE { + YV12_BUFFER_CONFIG *frame; + int ref_frame[3]; + FRAME_UPDATE_TYPE update_type; +} GF_PICTURE; + +void vp9_init_tpl_buffer(VP9_COMP *cpi); +void vp9_setup_tpl_stats(VP9_COMP *cpi); +void vp9_free_tpl_buffer(VP9_COMP *cpi); + +void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size); +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size); +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_TPL_MODEL_H_ diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 9072628f23..ae8fb85d87 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -104,6 +104,8 @@ VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c endif VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.c VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h +VP9_CX_SRCS-yes += encoder/vp9_tpl_model.c +VP9_CX_SRCS-yes += encoder/vp9_tpl_model.h VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h From 25a6b2b181db9896070b2fc1239dc28ba6f3115d Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Tue, 7 Feb 2023 11:28:15 +0000 Subject: [PATCH 521/926] Use 4D reduction Neon helper for standard bitdepth SAD4D Move the 4D reduction helper function to sum_neon.h and use this for both standard and high bitdepth SAD4D paths. This also removes the AArch64 requirement for using the UDOT Neon SAD4D paths. Change-Id: I207f76b3d42aa541809b0672c3b3d86e54d133ff --- vpx_dsp/arm/highbd_sad4d_neon.c | 23 ++++---------------- vpx_dsp/arm/sad4d_neon.c | 37 ++++++++++++++++----------------- vpx_dsp/arm/sum_neon.h | 16 ++++++++++++++ 3 files changed, 38 insertions(+), 38 deletions(-) diff --git a/vpx_dsp/arm/highbd_sad4d_neon.c b/vpx_dsp/arm/highbd_sad4d_neon.c index d5e9e8ad22..f731d38cc1 100644 --- a/vpx_dsp/arm/highbd_sad4d_neon.c +++ b/vpx_dsp/arm/highbd_sad4d_neon.c @@ -17,21 +17,6 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" -static INLINE uint32x4_t horizontal_add_4d_u32(uint32x4_t sum[4]) { -#if defined(__aarch64__) - uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]); - uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]); - return vpaddq_u32(res01, res23); -#else - uint32x4_t res = vdupq_n_u32(0); - res = vsetq_lane_u32(horizontal_add_uint32x4(sum[0]), res, 0); - res = vsetq_lane_u32(horizontal_add_uint32x4(sum[1]), res, 1); - res = vsetq_lane_u32(horizontal_add_uint32x4(sum[2]), res, 2); - res = vsetq_lane_u32(horizontal_add_uint32x4(sum[3]), res, 3); - return res; -#endif -} - static INLINE void highbd_sad4xhx4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], @@ -60,7 +45,7 @@ static INLINE void highbd_sad4xhx4d_neon(const uint8_t *src_ptr, int src_stride, } while (++i < h); - vst1q_u32(res, horizontal_add_4d_u32(sum)); + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); } static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref, @@ -93,7 +78,7 @@ static INLINE void highbd_sad8xhx4d_neon(const uint8_t *src_ptr, int src_stride, } while (++i < h); - vst1q_u32(res, horizontal_add_4d_u32(sum)); + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); } static INLINE void highbd_sad16xhx4d_neon(const uint8_t *src_ptr, @@ -136,7 +121,7 @@ static INLINE void 
highbd_sad16xhx4d_neon(const uint8_t *src_ptr, sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); - vst1q_u32(res, horizontal_add_4d_u32(sum)); + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); } static INLINE void highbd_sadwxhx4d_neon(const uint8_t *src_ptr, int src_stride, @@ -203,7 +188,7 @@ static INLINE void highbd_sadwxhx4d_neon(const uint8_t *src_ptr, int src_stride, sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); - vst1q_u32(res, horizontal_add_4d_u32(sum)); + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); } static INLINE void highbd_sad64xhx4d_neon(const uint8_t *src_ptr, diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 85f6c1e5b1..9509573939 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -17,7 +17,7 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#if defined(__ARM_FEATURE_DOTPROD) static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref, uint32x4_t *const sad_sum) { @@ -28,11 +28,11 @@ static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref, static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4], int h) { - uint32x4_t res0, res1; uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; + uint32x4_t sum[4]; int i = 0; do { @@ -65,21 +65,22 @@ static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride, i++; } while (i < h); - res0 = vpaddq_u32(vaddq_u32(sum_lo[0], sum_hi[0]), - vaddq_u32(sum_lo[1], sum_hi[1])); - res1 = vpaddq_u32(vaddq_u32(sum_lo[2], sum_hi[2]), - vaddq_u32(sum_lo[3], sum_hi[3])); - vst1q_u32(res, vpaddq_u32(res0, res1)); + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); } static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4], int h) { - uint32x4_t res0, res1; uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; + uint32x4_t sum[4]; int i = 0; do { @@ -100,17 +101,17 @@ static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride, i++; } while (i < h); - res0 = vpaddq_u32(vaddq_u32(sum_lo[0], sum_hi[0]), - vaddq_u32(sum_lo[1], sum_hi[1])); - res1 = vpaddq_u32(vaddq_u32(sum_lo[2], sum_hi[2]), - vaddq_u32(sum_lo[3], sum_hi[3])); - vst1q_u32(res, vpaddq_u32(res0, res1)); + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); } static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4], int h) { - uint32x4_t res0, res1; uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; @@ -125,12 +126,10 @@ static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride, i++; } while (i < h); - res0 = vpaddq_u32(sum[0], sum[1]); - res1 = vpaddq_u32(sum[2], sum[3]); - vst1q_u32(res, 
vpaddq_u32(res0, res1)); + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); } -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) +#else // !defined(__ARM_FEATURE_DOTPROD)) static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref, uint16x8_t *const sad_sum) { @@ -246,7 +245,7 @@ static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride, res[3] = horizontal_add_uint16x8(sum[3]); } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#endif // defined(__ARM_FEATURE_DOTPROD) static INLINE void sad8_neon(uint8x8_t src, uint8x8_t ref, uint16x8_t *const sad_sum) { diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index 5f20f9d99a..21560837ae 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -94,4 +94,20 @@ static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) { return vget_lane_u32(c, 0); #endif } + +static INLINE uint32x4_t horizontal_add_4d_uint32x4(const uint32x4_t sum[4]) { +#if defined(__aarch64__) + uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]); + uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]); + return vpaddq_u32(res01, res23); +#else + uint32x4_t res = vdupq_n_u32(0); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[0]), res, 0); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[1]), res, 1); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[2]), res, 2); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[3]), res, 3); + return res; +#endif +} + #endif // VPX_VPX_DSP_ARM_SUM_NEON_H_ From 03ddac40dfc9533a3038219b0bc819a5e6375227 Mon Sep 17 00:00:00 2001 From: chiyotsai Date: Tue, 7 Feb 2023 11:11:35 -0800 Subject: [PATCH 522/926] Enable some speed features on speed 0 Performance: | SPD_SET | TESTSET | AVG_PSNR | OVR_PSNR | SSIM | ENC_T | |---------|---------|----------|----------|---------|-------| | 0 | hdres2 | +0.069% | +0.067% | +0.100% | -8.6% | | 0 | midres2 | +0.116% | +0.103% | +0.062% | -9.6% | | 0 | lowres2 | +0.276% | +0.283% | +0.214% |-11.9% | STATS_CHANGED Change-Id: I8b26c0be2312fcd0f8c9e889367682e80ea8de4b --- test/vp9_ext_ratectrl_test.cc | 2 +- vp9/encoder/vp9_speed_features.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index 2bfa6281d7..739c0b7f8e 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -41,7 +41,7 @@ constexpr int kDefaultMaxGfInterval = 16; constexpr int kReadMinGfInterval = 5; constexpr int kReadMaxGfInterval = 13; const char kTestFileName[] = "bus_352x288_420_f20_b8.yuv"; -const double kPsnrThreshold = 30.50; +const double kPsnrThreshold = 30.4; struct ToyRateCtrl { int magic_number; diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 0431d8a452..6ce90c0887 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -209,15 +209,18 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, const int boosted = frame_is_boosted(cpi); int i; - sf->tx_size_search_breakout = 1; + sf->adaptive_pred_interp_filter = 1; sf->adaptive_rd_thresh = 1; sf->adaptive_rd_thresh_row_mt = 0; sf->allow_skip_recode = 1; sf->less_rectangular_check = 1; - sf->use_square_partition_only = !boosted; + sf->mv.auto_mv_step_size = 1; sf->prune_ref_frame_for_rect_partitions = 1; - sf->rd_ml_partition.var_pruning = 1; + sf->temporal_filter_search_method = NSTEP; + sf->tx_size_search_breakout = 1; + sf->use_square_partition_only = !boosted; + sf->rd_ml_partition.var_pruning = 1; 
sf->rd_ml_partition.prune_rect_thresh[0] = -1; sf->rd_ml_partition.prune_rect_thresh[1] = 350; sf->rd_ml_partition.prune_rect_thresh[2] = 325; @@ -238,7 +241,6 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, } if (speed >= 1) { - sf->temporal_filter_search_method = NSTEP; sf->rd_ml_partition.var_pruning = !boosted; sf->rd_ml_partition.prune_rect_thresh[1] = 225; sf->rd_ml_partition.prune_rect_thresh[2] = 225; @@ -263,11 +265,9 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->less_rectangular_check = 1; sf->use_rd_breakout = 1; sf->adaptive_motion_search = 1; - sf->mv.auto_mv_step_size = 1; sf->adaptive_rd_thresh = 2; sf->mv.subpel_search_level = 1; if (cpi->oxcf.content != VP9E_CONTENT_FILM) sf->mode_skip_start = 10; - sf->adaptive_pred_interp_filter = 1; sf->allow_acl = 0; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; From d6eb9696aa72473c1a11d34d928d35a3acc0c9a9 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Mon, 30 Jan 2023 11:51:58 -0800 Subject: [PATCH 523/926] Fix unsigned integer overflow in sse computation Basically port the fix from libaom: https://aomedia-review.googlesource.com/c/aom/+/169361 Change-Id: Id06a5db91372037832399200ded75d514e096726 (cherry picked from commit a94cdd57ffd95ee7beb48d2794dae538f25da46c) --- vpx_dsp/psnr.c | 67 ++++++++++++++++++-------------------------------- 1 file changed, 24 insertions(+), 43 deletions(-) diff --git a/vpx_dsp/psnr.c b/vpx_dsp/psnr.c index 48bac04508..f0d4e927ae 100644 --- a/vpx_dsp/psnr.c +++ b/vpx_dsp/psnr.c @@ -26,57 +26,44 @@ double vpx_sse_to_psnr(double samples, double peak, double sse) { /* TODO(yaowu): The block_variance calls the unoptimized versions of variance() * and highbd_8_variance(). It should not. */ -static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int w, int h, unsigned int *sse, - int *sum) { +static int64_t encoder_sse(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h) { int i, j; - - *sum = 0; - *sse = 0; + int64_t sse = 0; for (i = 0; i < h; i++) { for (j = 0; j < w; j++) { const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; + sse += diff * diff; } a += a_stride; b += b_stride; } + + return sse; } #if CONFIG_VP9_HIGHBITDEPTH -static void encoder_highbd_variance64(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, - int h, uint64_t *sse, int64_t *sum) { +static int64_t encoder_highbd_8_sse(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, + int h) { int i, j; + int64_t sse = 0; uint16_t *a = CONVERT_TO_SHORTPTR(a8); uint16_t *b = CONVERT_TO_SHORTPTR(b8); - *sum = 0; - *sse = 0; for (i = 0; i < h; i++) { for (j = 0; j < w; j++) { const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; + sse += diff * diff; } a += a_stride; b += b_stride; } -} -static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, - int h, unsigned int *sse, int *sum) { - uint64_t sse_long = 0; - int64_t sum_long = 0; - encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, - &sum_long); - *sse = (unsigned int)sse_long; - *sum = (int)sum_long; + return sse; } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -85,26 +72,23 @@ static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b, const int dw = width % 16; const int dh = height % 16; int64_t total_sse = 0; - unsigned int sse = 0; - int sum = 0; int x, y; if (dw > 0) { - encoder_variance(&a[width - dw], 
a_stride, &b[width - dw], b_stride, dw, - height, &sse, &sum); - total_sse += sse; + total_sse += encoder_sse(&a[width - dw], a_stride, &b[width - dw], b_stride, + dw, height); } if (dh > 0) { - encoder_variance(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, width - dw, dh, - &sse, &sum); - total_sse += sse; + total_sse += + encoder_sse(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, width - dw, dh); } for (y = 0; y < height / 16; ++y) { const uint8_t *pa = a; const uint8_t *pb = b; + unsigned int sse; for (x = 0; x < width / 16; ++x) { vpx_mse16x16(pa, a_stride, pb, b_stride, &sse); total_sse += sse; @@ -146,22 +130,19 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b, int x, y; const int dw = width % 16; const int dh = height % 16; - unsigned int sse = 0; - int sum = 0; if (dw > 0) { - encoder_highbd_8_variance(&a[width - dw], a_stride, &b[width - dw], - b_stride, dw, height, &sse, &sum); - total_sse += sse; + total_sse += encoder_highbd_8_sse(&a[width - dw], a_stride, &b[width - dw], + b_stride, dw, height); } if (dh > 0) { - encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, - width - dw, dh, &sse, &sum); - total_sse += sse; + total_sse += encoder_highbd_8_sse(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, + width - dw, dh); } for (y = 0; y < height / 16; ++y) { const uint8_t *pa = a; const uint8_t *pb = b; + unsigned int sse; for (x = 0; x < width / 16; ++x) { vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse); total_sse += sse; From bb065c6c6dbdfc24678e926501e9db13afb2ec12 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Wed, 8 Feb 2023 17:05:25 +0000 Subject: [PATCH 524/926] Add missing high bitdepth Neon subpel variance tests Add missing 4x4 and 4x8 tests for both high bitdepth sub-pixel variance and high bitdepth averaging sub-pixel variance. 
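
As a back-of-envelope check on the sse overflow fixed in the psnr.c patch
above: with 12-bit input a single squared difference can be as large as
4095^2, roughly 2^24, so a 64x64 block of worst-case differences sums to about
2^36, far beyond what the old 32-bit accumulator (or the truncating cast in
encoder_highbd_8_variance()) could hold. Accumulating directly into an
int64_t, as encoder_sse() now does, leaves ample headroom for any block size
and bit depth the library supports.
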
Change-Id: I042752c5b7ccc14f58075694d0bb1d36f144ad06 --- test/variance_test.cc | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index a6c8ef0480..33f09209f4 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1572,6 +1572,10 @@ INSTANTIATE_TEST_SUITE_P( 12), SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_neon, 12), + SubpelVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_neon, + 12), + SubpelVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_neon, + 12), SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_neon, 10), SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_neon, @@ -1594,6 +1598,10 @@ INSTANTIATE_TEST_SUITE_P( 10), SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_neon, 10), + SubpelVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_neon, + 10), + SubpelVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_neon, + 10), SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_neon, 8), SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_neon, @@ -1613,7 +1621,9 @@ INSTANTIATE_TEST_SUITE_P( SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_neon, 8), SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_neon, 8), - SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon, + SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon, 8), + SubpelVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_neon, 8), + SubpelVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_neon, 8))); INSTANTIATE_TEST_SUITE_P( @@ -1652,6 +1662,12 @@ INSTANTIATE_TEST_SUITE_P( SubpelAvgVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_neon, 12), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_12_sub_pixel_avg_variance4x8_neon, + 12), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_12_sub_pixel_avg_variance4x4_neon, + 12), SubpelAvgVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_neon, 10), @@ -1685,6 +1701,12 @@ INSTANTIATE_TEST_SUITE_P( SubpelAvgVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_neon, 10), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_10_sub_pixel_avg_variance4x8_neon, + 10), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_10_sub_pixel_avg_variance4x4_neon, + 10), SubpelAvgVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_neon, 8), @@ -1717,6 +1739,12 @@ INSTANTIATE_TEST_SUITE_P( 8), SubpelAvgVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_neon, + 8), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_8_sub_pixel_avg_variance4x8_neon, + 8), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_8_sub_pixel_avg_variance4x4_neon, 8))); #endif // CONFIG_VP9_HIGHBITDEPTH From b6951d2b0f44c6e52e981929a2632022b101dc1c Mon Sep 17 00:00:00 2001 From: chiyotsai Date: Wed, 8 Feb 2023 14:01:19 -0800 Subject: [PATCH 525/926] Copy BLOCK_8X8's mi to PICK_MODE_CONTEXT::mic STATS_CHANGED BUG=webm:1789 Change-Id: I74efe28bdf90a179c59fe3d1f5a15d497f57080d --- vp9/encoder/vp9_rdopt.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index d9b031cdc8..498bc0fbd5 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -4005,13 +4005,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, } if (best_mode_index < 0 || best_rd >= best_rd_so_far) { -// If adaptive interp filter is enabled, then the 
current leaf node of 8x8 -// data is needed for sub8x8. Hence preserve the context. -#if CONFIG_CONSISTENT_RECODE + // If adaptive interp filter is enabled, then the current leaf node of 8x8 + // data is needed for sub8x8. Hence preserve the context. if (bsize == BLOCK_8X8) ctx->mic = *xd->mi[0]; -#else - if (cpi->row_mt && bsize == BLOCK_8X8) ctx->mic = *xd->mi[0]; -#endif rd_cost->rate = INT_MAX; rd_cost->rdcost = INT64_MAX; return; From 459cfc8bae26afde6a16421b6f0e5ff5269ebb80 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 9 Feb 2023 11:57:10 +0000 Subject: [PATCH 526/926] Optimize Neon high bitdepth convolve copy Use standard loads and stores instead of the significantly slower interleaving/de-interleaving variants. Also move all loads in loop bodies above all stores as a mitigation against the compiler thinking that the src and dst pointers alias (since we can't use restrict in C89.) Change-Id: Idd59dca51387f553f8db27144a2b8f2377c937d3 --- vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c | 106 +++++++++++--------- 1 file changed, 59 insertions(+), 47 deletions(-) diff --git a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c index 9d2752e097..7751082083 100644 --- a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c +++ b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c @@ -26,76 +26,88 @@ void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, (void)bd; if (w < 8) { // copy4 + uint16x4_t s0, s1; do { - vst1_u16(dst, vld1_u16(src)); + s0 = vld1_u16(src); src += src_stride; - dst += dst_stride; - vst1_u16(dst, vld1_u16(src)); + s1 = vld1_u16(src); src += src_stride; + + vst1_u16(dst, s0); + dst += dst_stride; + vst1_u16(dst, s1); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w == 8) { // copy8 + uint16x8_t s0, s1; do { - vst1q_u16(dst, vld1q_u16(src)); + s0 = vld1q_u16(src); src += src_stride; - dst += dst_stride; - vst1q_u16(dst, vld1q_u16(src)); + s1 = vld1q_u16(src); src += src_stride; + + vst1q_u16(dst, s0); + dst += dst_stride; + vst1q_u16(dst, s1); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w < 32) { // copy16 + uint16x8_t s0, s1, s2, s3; do { - vst2q_u16(dst, vld2q_u16(src)); - src += src_stride; - dst += dst_stride; - vst2q_u16(dst, vld2q_u16(src)); + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); src += src_stride; - dst += dst_stride; - vst2q_u16(dst, vld2q_u16(src)); + s2 = vld1q_u16(src); + s3 = vld1q_u16(src + 8); src += src_stride; + + vst1q_u16(dst, s0); + vst1q_u16(dst + 8, s1); dst += dst_stride; - vst2q_u16(dst, vld2q_u16(src)); - src += src_stride; + vst1q_u16(dst, s2); + vst1q_u16(dst + 8, s3); dst += dst_stride; - h -= 4; - } while (h > 0); + h -= 2; + } while (h != 0); } else if (w == 32) { // copy32 + uint16x8_t s0, s1, s2, s3; do { - vst4q_u16(dst, vld4q_u16(src)); - src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); src += src_stride; + + vst1q_u16(dst, s0); + vst1q_u16(dst + 8, s1); + vst1q_u16(dst + 16, s2); + vst1q_u16(dst + 24, s3); dst += dst_stride; - h -= 4; - } while (h > 0); + } while (--h != 0); } else { // copy64 + uint16x8_t s0, s1, s2, s3, s4, s5, s6, s7; do { - vst4q_u16(dst, vld4q_u16(src)); - vst4q_u16(dst + 32, vld4q_u16(src + 32)); - src += 
src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - vst4q_u16(dst + 32, vld4q_u16(src + 32)); + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); + s4 = vld1q_u16(src + 32); + s5 = vld1q_u16(src + 40); + s6 = vld1q_u16(src + 48); + s7 = vld1q_u16(src + 56); src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - vst4q_u16(dst + 32, vld4q_u16(src + 32)); - src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - vst4q_u16(dst + 32, vld4q_u16(src + 32)); - src += src_stride; - dst += dst_stride; - h -= 4; - } while (h > 0); + + vst1q_u16(dst, s0); + vst1q_u16(dst + 8, s1); + vst1q_u16(dst + 16, s2); + vst1q_u16(dst + 24, s3); + vst1q_u16(dst + 32, s4); + vst1q_u16(dst + 40, s5); + vst1q_u16(dst + 48, s6); + vst1q_u16(dst + 56, s7); + dst += dst_stride; + } while (--h != 0); } } From 5edaa583e14473464b86c6a5ab29fb793bd805e7 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 7 Feb 2023 17:22:12 -0500 Subject: [PATCH 527/926] Remove onyx_int.h from vp8 rc header Also move the FRAME_TYPE declaration to common.h Bug: webm:1766 Change-Id: Ic3016bd16548a5d2e0ae828a7fd7ad8adda8b8f6 --- test/vp8_ratectrl_rtc_test.cc | 7 ++++--- test/vp9_ratectrl_rtc_test.cc | 15 +++++++++------ vp8/vp8_ratectrl_rtc.cc | 11 ++++++++++- vp8/vp8_ratectrl_rtc.h | 20 ++++++++------------ vp9/ratectrl_rtc.cc | 25 ++++++++++++++++++++++++- vp9/ratectrl_rtc.h | 31 +++++-------------------------- vpx/internal/vpx_ratectrl_rtc.h | 3 +++ 7 files changed, 63 insertions(+), 49 deletions(-) diff --git a/test/vp8_ratectrl_rtc_test.cc b/test/vp8_ratectrl_rtc_test.cc index 7410f3c01d..56c26a99f4 100644 --- a/test/vp8_ratectrl_rtc_test.cc +++ b/test/vp8_ratectrl_rtc_test.cc @@ -127,14 +127,15 @@ class Vp8RcInterfaceTest encoder->Control(VP8E_SET_CPUUSED, -6); encoder->Control(VP8E_SET_RTC_EXTERNAL_RATECTRL, 1); encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000); - } else if (frame_params_.frame_type == INTER_FRAME) { + } else if (frame_params_.frame_type == libvpx::RcFrameType::kInterFrame) { // Disable golden frame update. frame_flags_ |= VP8_EFLAG_NO_UPD_GF; frame_flags_ |= VP8_EFLAG_NO_UPD_ARF; } } - frame_params_.frame_type = - video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME; + frame_params_.frame_type = video->frame() % key_interval_ == 0 + ? libvpx::RcFrameType::kKeyFrame + : libvpx::RcFrameType::kInterFrame; encoder_exit_ = video->frame() == test_video_.frames; } diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc index 1d1a78f43d..cce73fcce2 100644 --- a/test/vp9_ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -57,9 +57,11 @@ class RcInterfaceTest encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000); encoder->Control(VP9E_SET_RTC_EXTERNAL_RATECTRL, 1); } - frame_params_.frame_type = - video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME; - if (rc_cfg_.rc_mode == VPX_CBR && frame_params_.frame_type == INTER_FRAME) { + frame_params_.frame_type = video->frame() % key_interval_ == 0 + ? libvpx::RcFrameType::kKeyFrame + : libvpx::RcFrameType::kInterFrame; + if (rc_cfg_.rc_mode == VPX_CBR && + frame_params_.frame_type == libvpx::RcFrameType::kInterFrame) { // Disable golden frame update. 
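
A note on the header decoupling in this patch (the sketch below is
illustrative and not part of the diff): once onyx_int.h is no longer included,
VP8_COMP is an incomplete type inside vp8_ratectrl_rtc.h, so the destructor
body that frees cpi_ can no longer live in the class definition and moves to
vp8_ratectrl_rtc.cc, where the full definition is visible; the VP9 interface
gets the same treatment, as the header diffs below show. The pattern in
miniature (names here are hypothetical):

    struct VP8_COMP;  // forward declaration: no encoder internals leak out

    class RateCtl {
     public:
      ~RateCtl();  // declared here, defined where VP8_COMP is complete
     private:
      struct VP8_COMP *cpi_;
    };

    // In the .cc file, after including the full encoder headers:
    // RateCtl::~RateCtl() { vpx_free(cpi_); }
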
frame_flags_ |= VP8_EFLAG_NO_UPD_GF; frame_flags_ |= VP8_EFLAG_NO_UPD_ARF; @@ -183,8 +185,9 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, encoder->Control(VP9E_SET_SVC, 1); encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); } - frame_params_.frame_type = - video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME; + frame_params_.frame_type = video->frame() % key_interval_ == 0 + ? libvpx::RcFrameType::kKeyFrame + : libvpx::RcFrameType::kInterFrame; encoder_exit_ = video->frame() == kNumFrames; current_superframe_ = video->frame(); if (dynamic_spatial_layers_ == 1) { @@ -247,7 +250,7 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, else frame_params_.temporal_layer_id = 0; rc_api_->ComputeQP(frame_params_); - frame_params_.frame_type = INTER_FRAME; + frame_params_.frame_type = libvpx::RcFrameType::kInterFrame; rc_api_->PostEncodeUpdate(sizes_[sl]); } } diff --git a/vp8/vp8_ratectrl_rtc.cc b/vp8/vp8_ratectrl_rtc.cc index f3f42529db..c36cfea485 100644 --- a/vp8/vp8_ratectrl_rtc.cc +++ b/vp8/vp8_ratectrl_rtc.cc @@ -10,7 +10,9 @@ #include #include +#include "vp8/common/common.h" #include "vp8/vp8_ratectrl_rtc.h" +#include "vp8/encoder/onyx_int.h" #include "vp8/encoder/ratectrl.h" #include "vpx_ports/system_state.h" @@ -65,6 +67,13 @@ std::unique_ptr VP8RateControlRTC::Create( return rc_api; } +VP8RateControlRTC::~VP8RateControlRTC() { + if (cpi_) { + vpx_free(cpi_->gf_active_flags); + vpx_free(cpi_); + } +} + void VP8RateControlRTC::InitRateControl(const VP8RateControlRtcConfig &rc_cfg) { VP8_COMMON *cm = &cpi_->common; VP8_CONFIG *oxcf = &cpi_->oxcf; @@ -203,7 +212,7 @@ void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) { vp8_restore_layer_context(cpi_, layer); vp8_new_framerate(cpi_, cpi_->layer_context[layer].framerate); } - cm->frame_type = frame_params.frame_type; + cm->frame_type = static_cast(frame_params.frame_type); cm->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0; cm->refresh_alt_ref_frame = (cm->frame_type == KEY_FRAME) ? 
1 : 0; if (cm->frame_type == KEY_FRAME && cpi_->common.current_video_frame > 0) { diff --git a/vp8/vp8_ratectrl_rtc.h b/vp8/vp8_ratectrl_rtc.h index def7dd8f9e..0e81592eca 100644 --- a/vp8/vp8_ratectrl_rtc.h +++ b/vp8/vp8_ratectrl_rtc.h @@ -12,23 +12,24 @@ #define VPX_VP8_RATECTRL_RTC_H_ #include +#include #include -#include "vp8/encoder/onyx_int.h" -#include "vp8/common/common.h" #include "vpx/internal/vpx_ratectrl_rtc.h" +struct VP8_COMP; + namespace libvpx { struct VP8RateControlRtcConfig : public VpxRateControlRtcConfig { public: VP8RateControlRtcConfig() { - vp8_zero(layer_target_bitrate); - vp8_zero(ts_rate_decimator); + memset(&layer_target_bitrate, 0, sizeof(layer_target_bitrate)); + memset(&ts_rate_decimator, 0, sizeof(ts_rate_decimator)); } }; struct VP8FrameParamsQpRTC { - FRAME_TYPE frame_type; + RcFrameType frame_type; int temporal_layer_id; }; @@ -36,12 +37,7 @@ class VP8RateControlRTC { public: static std::unique_ptr Create( const VP8RateControlRtcConfig &cfg); - ~VP8RateControlRTC() { - if (cpi_) { - vpx_free(cpi_->gf_active_flags); - vpx_free(cpi_); - } - } + ~VP8RateControlRTC(); void UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg); // GetQP() needs to be called after ComputeQP() to get the latest QP @@ -54,7 +50,7 @@ class VP8RateControlRTC { private: VP8RateControlRTC() {} void InitRateControl(const VP8RateControlRtcConfig &cfg); - VP8_COMP *cpi_; + struct VP8_COMP *cpi_; int q_; }; diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index 02e50a857c..944c526ac1 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -48,6 +48,29 @@ std::unique_ptr VP9RateControlRTC::Create( return rc_api; } +VP9RateControlRTC::~VP9RateControlRTC() { + if (cpi_) { + if (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) { + for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) { + for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) { + int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->oxcf.ts_number_layers); + LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer]; + vpx_free(lc->map); + vpx_free(lc->last_coded_q_map); + vpx_free(lc->consec_zero_mv); + } + } + } + if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + vpx_free(cpi_->segmentation_map); + cpi_->segmentation_map = NULL; + vp9_cyclic_refresh_free(cpi_->cyclic_refresh); + } + vpx_free(cpi_); + } +} + void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) { VP9_COMMON *cm = &cpi_->common; VP9EncoderConfig *oxcf = &cpi_->oxcf; @@ -157,7 +180,7 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { cm->height = height; } vp9_set_mb_mi(cm, cm->width, cm->height); - cm->frame_type = frame_params.frame_type; + cm->frame_type = static_cast(frame_params.frame_type); // This is needed to ensure key frame does not get unset in rc_get_svc_params. cpi_->frame_flags = (cm->frame_type == KEY_FRAME) ? FRAMEFLAGS_KEY : 0; cpi_->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 
1 : 0; diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index b209e4db66..162a04883e 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -19,14 +19,14 @@ #include "vp9/common/vp9_onyxc_int.h" #include "vp9/vp9_iface_common.h" #include "vp9/encoder/vp9_aq_cyclicrefresh.h" -#include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/vp9_cx_iface.h" #include "vpx/internal/vpx_ratectrl_rtc.h" #include "vpx_mem/vpx_mem.h" -namespace libvpx { +struct VP9_COMP; +namespace libvpx { struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig { public: VP9RateControlRtcConfig() { @@ -53,7 +53,7 @@ struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig { }; struct VP9FrameParamsQpRTC { - FRAME_TYPE frame_type; + RcFrameType frame_type; int spatial_layer_id; int temporal_layer_id; }; @@ -90,28 +90,7 @@ class VP9RateControlRTC { public: static std::unique_ptr Create( const VP9RateControlRtcConfig &cfg); - ~VP9RateControlRTC() { - if (cpi_) { - if (cpi_->svc.number_spatial_layers > 1 || - cpi_->svc.number_temporal_layers > 1) { - for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) { - for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) { - int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->oxcf.ts_number_layers); - LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer]; - vpx_free(lc->map); - vpx_free(lc->last_coded_q_map); - vpx_free(lc->consec_zero_mv); - } - } - } - if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { - vpx_free(cpi_->segmentation_map); - cpi_->segmentation_map = NULL; - vp9_cyclic_refresh_free(cpi_->cyclic_refresh); - } - vpx_free(cpi_); - } - } + ~VP9RateControlRTC(); void UpdateRateControl(const VP9RateControlRtcConfig &rc_cfg); // GetQP() needs to be called after ComputeQP() to get the latest QP @@ -125,7 +104,7 @@ class VP9RateControlRTC { private: VP9RateControlRTC() {} void InitRateControl(const VP9RateControlRtcConfig &cfg); - VP9_COMP *cpi_; + struct VP9_COMP *cpi_; }; } // namespace libvpx diff --git a/vpx/internal/vpx_ratectrl_rtc.h b/vpx/internal/vpx_ratectrl_rtc.h index 65398c654d..33c57e219b 100644 --- a/vpx/internal/vpx_ratectrl_rtc.h +++ b/vpx/internal/vpx_ratectrl_rtc.h @@ -14,6 +14,9 @@ #include "vpx/vpx_encoder.h" namespace libvpx { + +enum class RcFrameType { kKeyFrame = 0, kInterFrame = 1 }; + struct VpxRateControlRtcConfig { public: VpxRateControlRtcConfig() { From 086f0e653893bf1fa15f5d78592ac96372c9ccd4 Mon Sep 17 00:00:00 2001 From: chiyotsai Date: Wed, 8 Feb 2023 13:54:46 -0800 Subject: [PATCH 528/926] Remove CONFIG_CONSISTENT_RECODE flag Currently, libvpx does not properly clear and re-initialize the memories when it re-encodes a frame. As a result, out-of-date values are used in the encoding process, and re-encoding a frame with the same parameter will give different outputs. This commit enables the code under CONFIG_CONSISTENT_RECODE to correct this behavior. This change has minor effect on the coding performance, but it ensures valid values are used in the encoding process. Furthermore, the flag is removed as it is now always turned on. 
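
The mechanics behind the consistency guarantee, as a conceptual sketch (only
save_encode_params() and restore_encode_params() are real names from the
diff): encode_frame_to_data_rate() takes one unconditional snapshot before the
recode loop, and vp9_encode_frame() now restores it at the start of every
attempt, so each re-encode of a frame begins from identical rd thresholds and
interpolation-filter state:

    save_encode_params(cpi);       /* snapshot rd thresholds, filter state */
    do {
      restore_encode_params(cpi);  /* every attempt starts from the snapshot */
      recode_needed = encode_one_attempt(cpi);  /* hypothetical stand-in */
    } while (recode_needed);

Previously the save/restore pair ran only under CONFIG_CONSISTENT_RECODE or
the simple-encode API, so default builds could recode from stale state.
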
Performance: | SPD_SET | TESTSET | AVG_PSNR | OVR_PSNR | SSIM | ENC_T | |---------|---------|----------|----------|---------|-------| | 0 | hdres2 | -0.012% | -0.021% | -0.030% | +0.1% | | 0 | lowres2 | +0.029% | +0.019% | +0.047% | +0.1% | | 0 | midres2 | -0.004% | +0.009% | +0.026% | +0.1% | |---------|---------|----------|----------|---------|-------| | 1 | hdres2 | +0.032% | +0.032% | -0.000% | -0.0% | | 1 | lowres2 | -0.005% | -0.011% | -0.014% | +0.0% | | 1 | midres2 | +0.004% | +0.020% | +0.027% | +0.2% | |---------|---------|----------|----------|---------|-------| | 2 | hdres2 | +0.048% | +0.056% | +0.057% | +0.1% | | 2 | lowres2 | +0.007% | +0.002% | -0.016% | -0.0% | | 2 | midres2 | -0.015% | -0.008% | -0.002% | +0.1% | |---------|---------|----------|----------|---------|-------| | 3 | hdres2 | +0.010% | +0.014% | +0.004% | -0.0% | | 3 | lowres2 | +0.000% | -0.021% | -0.001% | +0.0% | | 3 | midres2 | +0.007% | -0.038% | +0.012% | -0.2% | |---------|---------|----------|----------|---------|-------| | 4 | hdres2 | +0.107% | +0.136% | +0.124% | -0.0% | | 4 | lowres2 | -0.012% | -0.024% | -0.020% | -0.0% | | 4 | midres2 | +0.055% | -0.004% | +0.048% | -0.1% | |---------|---------|----------|----------|---------|-------| | 5 | hdres2 | +0.026% | +0.027% | +0.020% | -0.0% | | 5 | lowres2 | +0.009% | -0.008% | +0.028% | +0.1% | | 5 | midres2 | -0.025% | +0.021% | -0.020% | -0.1% | STATS_CHANGED Change-Id: I3967aee8c8e4d0608a492e07f99ab8de9744ba57 --- configure | 2 -- vp9/encoder/vp9_encodeframe.c | 39 ++++++----------------------------- vp9/encoder/vp9_encoder.c | 30 ++++++--------------------- vp9/encoder/vp9_encoder.h | 2 -- vp9/encoder/vp9_rd.h | 2 -- 5 files changed, 12 insertions(+), 63 deletions(-) diff --git a/configure b/configure index 18f0ea798b..890ad3968a 100755 --- a/configure +++ b/configure @@ -343,7 +343,6 @@ CONFIG_LIST=" multi_res_encoding temporal_denoising vp9_temporal_denoising - consistent_recode coefficient_range_checking vp9_highbitdepth better_hw_compatibility @@ -407,7 +406,6 @@ CMDLINE_SELECT=" multi_res_encoding temporal_denoising vp9_temporal_denoising - consistent_recode coefficient_range_checking better_hw_compatibility vp9_highbitdepth diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index a22c00bd8f..a522097e61 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -5841,14 +5841,7 @@ void vp9_init_tile_data(VP9_COMP *cpi) { for (i = 0; i < BLOCK_SIZES; ++i) { for (j = 0; j < MAX_MODES; ++j) { tile_data->thresh_freq_fact[i][j] = RD_THRESH_INIT_FACT; -#if CONFIG_RATE_CTRL - if (cpi->oxcf.use_simple_encode_api) { - tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT; - } -#endif // CONFIG_RATE_CTRL -#if CONFIG_CONSISTENT_RECODE tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT; -#endif // CONFIG_CONSISTENT_RECODE tile_data->mode_map[i][j] = j; } } @@ -6068,9 +6061,7 @@ static void encode_frame_internal(VP9_COMP *cpi) { x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; #endif // CONFIG_VP9_HIGHBITDEPTH x->inv_txfm_add = xd->lossless ? 
vp9_iwht4x4_add : vp9_idct4x4_add; -#if CONFIG_CONSISTENT_RECODE x->optimize = sf->optimize_coefficients == 1 && cpi->oxcf.pass != 1; -#endif if (xd->lossless) x->optimize = 0; x->sharpness = cpi->oxcf.sharpness; x->adjust_rdmult_by_segment = (cpi->oxcf.aq_mode == VARIANCE_AQ); @@ -6215,13 +6206,11 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) { return sum_delta / (cm->mi_rows * cm->mi_cols); } -#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL static void restore_encode_params(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; - const int tile_cols = 1 << cm->log2_tile_cols; - const int tile_rows = 1 << cm->log2_tile_rows; - int tile_col, tile_row; + int tile_idx; int i, j; + TileDataEnc *tile_data; RD_OPT *rd_opt = &cpi->rd; for (i = 0; i < MAX_REF_FRAMES; i++) { for (j = 0; j < REFERENCE_MODES; j++) @@ -6232,35 +6221,19 @@ static void restore_encode_params(VP9_COMP *cpi) { rd_opt->filter_threshes[i][j] = rd_opt->filter_threshes_prev[i][j]; } - if (cpi->tile_data != NULL) { - for (tile_row = 0; tile_row < tile_rows; ++tile_row) - for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - TileDataEnc *tile_data = - &cpi->tile_data[tile_row * tile_cols + tile_col]; - for (i = 0; i < BLOCK_SIZES; ++i) { - for (j = 0; j < MAX_MODES; ++j) { - tile_data->thresh_freq_fact[i][j] = - tile_data->thresh_freq_fact_prev[i][j]; - } - } - } + for (tile_idx = 0; tile_idx < cpi->allocated_tiles; tile_idx++) { + assert(cpi->tile_data); + tile_data = &cpi->tile_data[tile_idx]; + vp9_copy(tile_data->thresh_freq_fact, tile_data->thresh_freq_fact_prev); } cm->interp_filter = cpi->sf.default_interp_filter; } -#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL void vp9_encode_frame(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; -#if CONFIG_RATE_CTRL - if (cpi->oxcf.use_simple_encode_api) { - restore_encode_params(cpi); - } -#endif // CONFIG_RATE_CTRL -#if CONFIG_CONSISTENT_RECODE restore_encode_params(cpi); -#endif #if CONFIG_MISMATCH_DEBUG mismatch_reset_frame(MAX_MB_PLANE); diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 5b895c2814..22fbb899fd 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -3774,13 +3774,10 @@ static void set_frame_size(VP9_COMP *cpi) { set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME); } -#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL static void save_encode_params(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - const int tile_cols = 1 << cm->log2_tile_cols; - const int tile_rows = 1 << cm->log2_tile_rows; - int tile_col, tile_row; + int tile_idx; int i, j; + TileDataEnc *tile_data; RD_OPT *rd_opt = &cpi->rd; for (i = 0; i < MAX_REF_FRAMES; i++) { for (j = 0; j < REFERENCE_MODES; j++) @@ -3791,21 +3788,12 @@ static void save_encode_params(VP9_COMP *cpi) { rd_opt->filter_threshes_prev[i][j] = rd_opt->filter_threshes[i][j]; } - if (cpi->tile_data != NULL) { - for (tile_row = 0; tile_row < tile_rows; ++tile_row) - for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - TileDataEnc *tile_data = - &cpi->tile_data[tile_row * tile_cols + tile_col]; - for (i = 0; i < BLOCK_SIZES; ++i) { - for (j = 0; j < MAX_MODES; ++j) { - tile_data->thresh_freq_fact_prev[i][j] = - tile_data->thresh_freq_fact[i][j]; - } - } - } + for (tile_idx = 0; tile_idx < cpi->allocated_tiles; tile_idx++) { + assert(cpi->tile_data); + tile_data = &cpi->tile_data[tile_idx]; + vp9_copy(tile_data->thresh_freq_fact_prev, tile_data->thresh_freq_fact); } } -#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL static INLINE void 
set_raw_source_frame(VP9_COMP *cpi) { #ifdef ENABLE_KF_DENOISE @@ -5484,14 +5472,8 @@ static void encode_frame_to_data_rate( memset(cpi->mode_chosen_counts, 0, MAX_MODES * sizeof(*cpi->mode_chosen_counts)); #endif -#if CONFIG_CONSISTENT_RECODE // Backup to ensure consistency between recodes save_encode_params(cpi); -#elif CONFIG_RATE_CTRL - if (cpi->oxcf.use_simple_encode_api) { - save_encode_params(cpi); - } -#endif if (cpi->ext_ratectrl.ready && (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0) { vpx_codec_err_t codec_status; diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index e17845d065..79c0b36a17 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -329,9 +329,7 @@ typedef struct TplDepFrame { typedef struct TileDataEnc { TileInfo tile_info; int thresh_freq_fact[BLOCK_SIZES][MAX_MODES]; -#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL int thresh_freq_fact_prev[BLOCK_SIZES][MAX_MODES]; -#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL int8_t mode_map[BLOCK_SIZES][MAX_MODES]; FIRSTPASS_DATA fp_data; VP9RowMTSync row_mt_sync; diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h index d2bc5e60ed..efd854edf4 100644 --- a/vp9/encoder/vp9_rd.h +++ b/vp9/encoder/vp9_rd.h @@ -121,11 +121,9 @@ typedef struct RD_OPT { int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES]; int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS]; -#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL int64_t prediction_type_threshes_prev[MAX_REF_FRAMES][REFERENCE_MODES]; int64_t filter_threshes_prev[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS]; -#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL int RDMULT; int RDDIV; double r0; From 184a886917529e8a9d23ab564b05b0cc13e29f2b Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 10 Feb 2023 19:04:41 -0800 Subject: [PATCH 529/926] README: update release version to 1.13.0 this was missed in the v1.13.0 tag Bug: webm:1780 Change-Id: I3044534123bf67861174970e6241f6586055358e --- README | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README b/README index 477a145ba3..e360df05f6 100644 --- a/README +++ b/README @@ -1,4 +1,4 @@ -v1.12.0 Torrent Duck +v1.13.0 Ugly Duckling Welcome to the WebM VP8/VP9 Codec SDK! From b5e1945af0e3f33ab4ab5fc5175da5505d6a67cd Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 10 Feb 2023 19:04:41 -0800 Subject: [PATCH 530/926] README: update release version to 1.13.0 this was missed in the v1.13.0 tag Bug: webm:1780 Change-Id: I3044534123bf67861174970e6241f6586055358e (cherry picked from commit 184a886917529e8a9d23ab564b05b0cc13e29f2b) --- README | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README b/README index 477a145ba3..e360df05f6 100644 --- a/README +++ b/README @@ -1,4 +1,4 @@ -v1.12.0 Torrent Duck +v1.13.0 Ugly Duckling Welcome to the WebM VP8/VP9 Codec SDK! From 42cb3dbf94706dd8477f9313ceba1fc3a9a14e92 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Tue, 7 Feb 2023 14:08:33 +0000 Subject: [PATCH 531/926] Optimize Neon high bitdepth subpel variance functions Use the same general code style as in the standard bitdepth Neon implementation. Additionally, do not unnecessarily widen to 32-bit data types when doing bilinear filtering - allowing us to process twice as many elements per instruction. 
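In scalar terms, the factored filter described above reduces to the
following minimal sketch (illustrative only; bilin_scalar is a
hypothetical name, not part of this change):

  #include <stdint.h>

  /* Weights sum to 8 instead of 128, so a 12-bit sample scaled by 8 still
   * fits in 16 bits (12-bit * 8 = 15-bit); the rounding shift by 3 matches
   * vrshrq_n_u16(blend, 3) in the Neon code below. */
  static uint16_t bilin_scalar(uint16_t s0, uint16_t s1, int filter_offset) {
    const int f0 = 8 - filter_offset; /* filter_offset is in [0, 8) */
    const int f1 = filter_offset;
    return (uint16_t)((f0 * s0 + f1 * s1 + 4) >> 3);
  }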
Change-Id: I1e178991d2aa71f5f77a376e145d19257481e90f --- vpx_dsp/arm/highbd_subpel_variance_neon.c | 307 ++++++++++++++++++++++ vpx_dsp/arm/highbd_variance_neon.c | 288 +------------------- vpx_dsp/arm/mem_neon.h | 14 + vpx_dsp/vpx_dsp.mk | 1 + 4 files changed, 336 insertions(+), 274 deletions(-) create mode 100644 vpx_dsp/arm/highbd_subpel_variance_neon.c diff --git a/vpx_dsp/arm/highbd_subpel_variance_neon.c b/vpx_dsp/arm/highbd_subpel_variance_neon.c new file mode 100644 index 0000000000..81943ee4a6 --- /dev/null +++ b/vpx_dsp/arm/highbd_subpel_variance_neon.c @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" + +// The bilinear filters look like this: +// +// {{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, +// { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }} +// +// We can factor out the highest common multiple, such that the sum of both +// weights will be 8 instead of 128. The benefits of this are two-fold: +// +// 1) We can infer the filter values from the filter_offset parameter in the +// bilinear filter functions below - we don't have to actually load the values +// from memory: +// f0 = 8 - filter_offset +// f1 = filter_offset +// +// 2) Scaling the pixel values by 8, instead of 128 enables us to operate on +// 16-bit data types at all times, rather than widening out to 32-bit and +// requiring double the number of data processing instructions. (12-bit * 8 = +// 15-bit.) + +// Process a block exactly 4 wide and a multiple of 2 high. +static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset); + const uint16x8_t f1 = vdupq_n_u16(filter_offset); + + int i = dst_height; + do { + uint16x8_t s0 = load_unaligned_u16q(src_ptr, src_stride); + uint16x8_t s1 = load_unaligned_u16q(src_ptr + pixel_step, src_stride); + + uint16x8_t blend = vmulq_u16(s0, f0); + blend = vmlaq_u16(blend, s1, f1); + blend = vrshrq_n_u16(blend, 3); + + vst1q_u16(dst_ptr, blend); + + src_ptr += 2 * src_stride; + dst_ptr += 8; + i -= 2; + } while (i != 0); +} + +// Process a block which is a multiple of 8 and any height. 
+static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int src_stride, int pixel_step, + int dst_width, int dst_height, + int filter_offset) { + const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset); + const uint16x8_t f1 = vdupq_n_u16(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + + uint16x8_t blend = vmulq_u16(s0, f0); + blend = vmlaq_u16(blend, s1, f1); + blend = vrshrq_n_u16(blend, 3); + + vst1q_u16(dst_ptr + j, blend); + + j += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 8, dst_height, filter_offset); +} +static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 16, dst_height, filter_offset); +} +static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 32, dst_height, filter_offset); +} +static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 64, dst_height, filter_offset); +} + +#define HBD_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_highbd_8_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse) { \ + uint16_t tmp0[w * (h + padding)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + padding), xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + \ + return vpx_highbd_8_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), w, ref, \ + ref_stride, sse); \ + } \ + \ + unsigned int vpx_highbd_10_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse) { \ + uint16_t tmp0[w * (h + padding)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + padding), xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + \ + return vpx_highbd_10_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), w, ref, \ + ref_stride, sse); \ + } \ + unsigned int vpx_highbd_12_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse) { \ + uint16_t tmp0[w * (h + padding)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + padding), xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + \ + return 
vpx_highbd_12_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), w, ref, \ + ref_stride, sse); \ + } + +// 4x blocks are processed two rows at a time, so require an extra row of +// padding. +HBD_SUBPEL_VARIANCE_WXH_NEON(4, 4, 2) +HBD_SUBPEL_VARIANCE_WXH_NEON(4, 8, 2) + +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 16, 1) + +HBD_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1) + +HBD_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1) + +HBD_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1) + +void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, + int width, int height, const uint16_t *ref, + int ref_stride) { + int i, j; + uint32x4_t one_u32 = vdupq_n_u32(1); + if (width >= 8) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 8) { + const uint16x8_t pred_u16 = vld1q_u16(&pred[j]); + const uint16x8_t ref_u16 = vld1q_u16(&ref[j]); + const uint32x4_t sum1_u32 = + vaddl_u16(vget_low_u16(pred_u16), vget_low_u16(ref_u16)); + const uint32x4_t sum2_u32 = + vaddl_u16(vget_high_u16(pred_u16), vget_high_u16(ref_u16)); + const uint16x4_t sum1_u16 = + vshrn_n_u32(vaddq_u32(sum1_u32, one_u32), 1); + const uint16x4_t sum2_u16 = + vshrn_n_u32(vaddq_u32(sum2_u32, one_u32), 1); + const uint16x8_t vcomp_pred = vcombine_u16(sum1_u16, sum2_u16); + vst1q_u16(&comp_pred[j], vcomp_pred); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } else { + assert(width >= 4); + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 4) { + const uint16x4_t pred_u16 = vld1_u16(&pred[j]); + const uint16x4_t ref_u16 = vld1_u16(&ref[j]); + const uint32x4_t sum_u32 = vaddl_u16(pred_u16, ref_u16); + const uint16x4_t vcomp_pred = + vshrn_n_u32(vaddq_u32(sum_u32, one_u32), 1); + vst1_u16(&comp_pred[j], vcomp_pred); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } +} + +#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \ + uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t tmp0[w * (h + padding)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + padding), xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + \ + vpx_highbd_comp_avg_pred_neon(tmp0, CONVERT_TO_SHORTPTR(second_pred), w, \ + h, tmp1, w); \ + \ + return vpx_highbd_8_variance##w##x##h##_neon(CONVERT_TO_BYTEPTR(tmp0), w, \ + ref, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t tmp0[w * (h + padding)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + padding), xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + \ + vpx_highbd_comp_avg_pred_neon(tmp0, CONVERT_TO_SHORTPTR(second_pred), w, \ + h, tmp1, w); \ + \ + return vpx_highbd_10_variance##w##x##h##_neon(CONVERT_TO_BYTEPTR(tmp0), w, \ + ref, ref_stride, 
sse); \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t tmp0[w * (h + padding)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + padding), xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + \ + vpx_highbd_comp_avg_pred_neon(tmp0, CONVERT_TO_SHORTPTR(second_pred), w, \ + h, tmp1, w); \ + \ + return vpx_highbd_12_variance##w##x##h##_neon(CONVERT_TO_BYTEPTR(tmp0), w, \ + ref, ref_stride, sse); \ + } + +// 4x blocks are processed two rows at a time, so require an extra row of +// padding. +SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2) +SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2) + +SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1) + +SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1) + +SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1) + +SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1) diff --git a/vpx_dsp/arm/highbd_variance_neon.c b/vpx_dsp/arm/highbd_variance_neon.c index 96a35af01c..985cc35682 100644 --- a/vpx_dsp/arm/highbd_variance_neon.c +++ b/vpx_dsp/arm/highbd_variance_neon.c @@ -18,11 +18,6 @@ #include "vpx_dsp/arm/sum_neon.h" #include "vpx_ports/mem.h" -static const uint8_t bilinear_filters[8][2] = { - { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, - { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, -}; - static INLINE void highbd_variance16(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, int ref_stride, int w, int h, uint64_t *sse, @@ -136,7 +131,7 @@ static INLINE void highbd_12_variance(const uint8_t *src8_ptr, int src_stride, *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); } -#define HIGHBD_VAR(W, H) \ +#define HBD_VARIANCE_WXH_NEON(W, H) \ uint32_t vpx_highbd_8_variance##W##x##H##_neon( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, uint32_t *sse) { \ @@ -218,274 +213,19 @@ static INLINE void highbd_12_variance(const uint8_t *src8_ptr, int src_stride, return *sse; \ } -static INLINE void highbd_var_filter_block2d_bil_first_pass( - const uint8_t *src_ptr8, uint16_t *output_ptr, - unsigned int src_pixels_per_line, int pixel_step, - unsigned int output_height, unsigned int output_width, - const uint8_t *filter) { - uint32_t i, j; - uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8); - - uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1); - uint16x4_t filter1_u16 = vdup_n_u16(filter[0]); - uint16x4_t filter2_u16 = vdup_n_u16(filter[1]); - - if (output_width >= 8) { - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 8) { - const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]); - const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]); - uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16)); - uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16)); - uint16x4_t out1_u16; - uint16x4_t out2_u16; - sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16)); - sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16)); - out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), 
FILTER_BITS); - out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS); - vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16)); - } - // Next row... - src_ptr += src_pixels_per_line; - output_ptr += output_width; - } - } else { - assert(output_width >= 4); - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 4) { - const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]); - const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]); - uint32x4_t sum_u32 = vmull_u16(filter1_u16, src1_u16); - uint16x4_t out_u16; - sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16); - out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS); - vst1_u16(&output_ptr[j], out_u16); - } - // Next row... - src_ptr += src_pixels_per_line; - output_ptr += output_width; - } - } -} - -static INLINE void highbd_var_filter_block2d_bil_second_pass( - const uint16_t *src_ptr, uint16_t *output_ptr, - unsigned int src_pixels_per_line, unsigned int pixel_step, - unsigned int output_height, unsigned int output_width, - const uint8_t *filter) { - uint32_t i, j; - - uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1); - uint16x4_t filter1_u16 = vdup_n_u16(filter[0]); - uint16x4_t filter2_u16 = vdup_n_u16(filter[1]); - - if (output_width >= 8) { - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 8) { - const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]); - const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]); - uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16)); - uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16)); - uint16x4_t out1_u16; - uint16x4_t out2_u16; - sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16)); - sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16)); - out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS); - out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS); - vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16)); - } - // Next row... - src_ptr += src_pixels_per_line; - output_ptr += output_width; - } - } else { - assert(output_width >= 4); - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 4) { - const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]); - const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]); - uint32x4_t sum_u32 = vmull_u16(filter1_u16, src1_u16); - uint16x4_t out_u16; - sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16); - out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS); - vst1_u16(&output_ptr[j], out_u16); - } - // Next row... 
- src_ptr += src_pixels_per_line; - output_ptr += output_width; - } - } -} - -#define HIGHBD_SUBPIX_VAR(W, H) \ - uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - return vpx_highbd_8_variance##W##x##H##_neon(CONVERT_TO_BYTEPTR(temp2), W, \ - ref_ptr, ref_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - return vpx_highbd_10_variance##W##x##H##_neon( \ - CONVERT_TO_BYTEPTR(temp2), W, ref_ptr, ref_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - return vpx_highbd_12_variance##W##x##H##_neon( \ - CONVERT_TO_BYTEPTR(temp2), W, ref_ptr, ref_stride, sse); \ - } - -#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ - uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \ - H, temp2, W); \ - \ - return vpx_highbd_8_variance##W##x##H##_neon(CONVERT_TO_BYTEPTR(temp3), W, \ - ref_ptr, ref_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \ - H, temp2, W); \ - \ - return vpx_highbd_10_variance##W##x##H##_neon( \ - CONVERT_TO_BYTEPTR(temp3), W, ref_ptr, 
ref_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \ - H, temp2, W); \ - \ - return vpx_highbd_12_variance##W##x##H##_neon( \ - CONVERT_TO_BYTEPTR(temp3), W, ref_ptr, ref_stride, sse); \ - } - -void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, - int width, int height, const uint16_t *ref, - int ref_stride) { - int i, j; - uint32x4_t one_u32 = vdupq_n_u32(1); - if (width >= 8) { - for (i = 0; i < height; ++i) { - for (j = 0; j < width; j += 8) { - const uint16x8_t pred_u16 = vld1q_u16(&pred[j]); - const uint16x8_t ref_u16 = vld1q_u16(&ref[j]); - const uint32x4_t sum1_u32 = - vaddl_u16(vget_low_u16(pred_u16), vget_low_u16(ref_u16)); - const uint32x4_t sum2_u32 = - vaddl_u16(vget_high_u16(pred_u16), vget_high_u16(ref_u16)); - const uint16x4_t sum1_u16 = - vshrn_n_u32(vaddq_u32(sum1_u32, one_u32), 1); - const uint16x4_t sum2_u16 = - vshrn_n_u32(vaddq_u32(sum2_u32, one_u32), 1); - const uint16x8_t vcomp_pred = vcombine_u16(sum1_u16, sum2_u16); - vst1q_u16(&comp_pred[j], vcomp_pred); - } - comp_pred += width; - pred += width; - ref += ref_stride; - } - } else { - assert(width >= 4); - for (i = 0; i < height; ++i) { - for (j = 0; j < width; j += 4) { - const uint16x4_t pred_u16 = vld1_u16(&pred[j]); - const uint16x4_t ref_u16 = vld1_u16(&ref[j]); - const uint32x4_t sum_u32 = vaddl_u16(pred_u16, ref_u16); - const uint16x4_t vcomp_pred = - vshrn_n_u32(vaddq_u32(sum_u32, one_u32), 1); - vst1_u16(&comp_pred[j], vcomp_pred); - } - comp_pred += width; - pred += width; - ref += ref_stride; - } - } -} - -/* All three forms of the variance are available in the same sizes. */ -#define HIGHBD_VARIANCES(W, H) \ - HIGHBD_VAR(W, H) \ - HIGHBD_SUBPIX_VAR(W, H) \ - HIGHBD_SUBPIX_AVG_VAR(W, H) - -HIGHBD_VARIANCES(64, 64) -HIGHBD_VARIANCES(64, 32) -HIGHBD_VARIANCES(32, 64) -HIGHBD_VARIANCES(32, 32) -HIGHBD_VARIANCES(32, 16) -HIGHBD_VARIANCES(16, 32) -HIGHBD_VARIANCES(16, 16) -HIGHBD_VARIANCES(16, 8) -HIGHBD_VARIANCES(8, 16) -HIGHBD_VARIANCES(8, 8) -HIGHBD_VARIANCES(8, 4) -HIGHBD_VARIANCES(4, 8) -HIGHBD_VARIANCES(4, 4) +HBD_VARIANCE_WXH_NEON(64, 64) +HBD_VARIANCE_WXH_NEON(64, 32) +HBD_VARIANCE_WXH_NEON(32, 64) +HBD_VARIANCE_WXH_NEON(32, 32) +HBD_VARIANCE_WXH_NEON(32, 16) +HBD_VARIANCE_WXH_NEON(16, 32) +HBD_VARIANCE_WXH_NEON(16, 16) +HBD_VARIANCE_WXH_NEON(16, 8) +HBD_VARIANCE_WXH_NEON(8, 16) +HBD_VARIANCE_WXH_NEON(8, 8) +HBD_VARIANCE_WXH_NEON(8, 4) +HBD_VARIANCE_WXH_NEON(4, 8) +HBD_VARIANCE_WXH_NEON(4, 4) HIGHBD_GET_VAR(8) HIGHBD_GET_VAR(16) diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index 19cfc7c7f2..866be7439e 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -126,6 +126,20 @@ static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, return vreinterpret_u8_u32(a_u32); } +// Load 2 sets of 8 bytes when alignment is not guaranteed. 
+static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf, + ptrdiff_t stride) { + uint64_t a; + uint64x2_t a_u64; + if (stride == 4) return vld1q_u16(buf); + memcpy(&a, buf, 8); + buf += stride; + a_u64 = vdupq_n_u64(a); + memcpy(&a, buf, 8); + a_u64 = vsetq_lane_u64(a, a_u64, 1); + return vreinterpretq_u16_u64(a_u64); +} + // Store 2 sets of 4 bytes when alignment is not guaranteed. static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) { diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 3b04e97651..f10b7cc208 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -434,6 +434,7 @@ DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_subpel_variance_neon.c endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC From 7343d56c1bf2f32b3fe2127cfcec1006f2fd95c6 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Wed, 8 Feb 2023 16:50:59 +0000 Subject: [PATCH 532/926] Refactor Neon high bitdepth avg subpel variance functions Use the same general code style as in the standard bitdepth Neon implementation - merging the computation of vpx_highbd_comp_avg_pred with the second pass of the bilinear filter to avoid storing and loading the block again. Also move vpx_highbd_comp_avg_pred_neon to its own file (like the standard bitdepth implementation) since we're no longer using it for averaging sub-pixel variance. Change-Id: I2f5916d5b397db44b3247b478ef57046797dae6c --- vpx_dsp/arm/highbd_avg_pred_neon.c | 57 ++++++++ vpx_dsp/arm/highbd_subpel_variance_neon.c | 150 ++++++++++++++-------- vpx_dsp/vpx_dsp.mk | 1 + 3 files changed, 152 insertions(+), 56 deletions(-) create mode 100644 vpx_dsp/arm/highbd_avg_pred_neon.c diff --git a/vpx_dsp/arm/highbd_avg_pred_neon.c b/vpx_dsp/arm/highbd_avg_pred_neon.c new file mode 100644 index 0000000000..04dbaad7a3 --- /dev/null +++ b/vpx_dsp/arm/highbd_avg_pred_neon.c @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, + int width, int height, const uint16_t *ref, + int ref_stride) { + int i, j; + uint32x4_t one_u32 = vdupq_n_u32(1); + if (width >= 8) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 8) { + const uint16x8_t pred_u16 = vld1q_u16(&pred[j]); + const uint16x8_t ref_u16 = vld1q_u16(&ref[j]); + const uint32x4_t sum1_u32 = + vaddl_u16(vget_low_u16(pred_u16), vget_low_u16(ref_u16)); + const uint32x4_t sum2_u32 = + vaddl_u16(vget_high_u16(pred_u16), vget_high_u16(ref_u16)); + const uint16x4_t sum1_u16 = + vshrn_n_u32(vaddq_u32(sum1_u32, one_u32), 1); + const uint16x4_t sum2_u16 = + vshrn_n_u32(vaddq_u32(sum2_u32, one_u32), 1); + const uint16x8_t vcomp_pred = vcombine_u16(sum1_u16, sum2_u16); + vst1q_u16(&comp_pred[j], vcomp_pred); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } else { + assert(width >= 4); + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 4) { + const uint16x4_t pred_u16 = vld1_u16(&pred[j]); + const uint16x4_t ref_u16 = vld1_u16(&ref[j]); + const uint32x4_t sum_u32 = vaddl_u16(pred_u16, ref_u16); + const uint16x4_t vcomp_pred = + vshrn_n_u32(vaddq_u32(sum_u32, one_u32), 1); + vst1_u16(&comp_pred[j], vcomp_pred); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } +} diff --git a/vpx_dsp/arm/highbd_subpel_variance_neon.c b/vpx_dsp/arm/highbd_subpel_variance_neon.c index 81943ee4a6..07e8ea5107 100644 --- a/vpx_dsp/arm/highbd_subpel_variance_neon.c +++ b/vpx_dsp/arm/highbd_subpel_variance_neon.c @@ -185,47 +185,91 @@ HBD_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1) HBD_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1) HBD_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1) -void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, - int width, int height, const uint16_t *ref, - int ref_stride) { - int i, j; - uint32x4_t one_u32 = vdupq_n_u32(1); - if (width >= 8) { - for (i = 0; i < height; ++i) { - for (j = 0; j < width; j += 8) { - const uint16x8_t pred_u16 = vld1q_u16(&pred[j]); - const uint16x8_t ref_u16 = vld1q_u16(&ref[j]); - const uint32x4_t sum1_u32 = - vaddl_u16(vget_low_u16(pred_u16), vget_low_u16(ref_u16)); - const uint32x4_t sum2_u32 = - vaddl_u16(vget_high_u16(pred_u16), vget_high_u16(ref_u16)); - const uint16x4_t sum1_u16 = - vshrn_n_u32(vaddq_u32(sum1_u32, one_u32), 1); - const uint16x4_t sum2_u16 = - vshrn_n_u32(vaddq_u32(sum2_u32, one_u32), 1); - const uint16x8_t vcomp_pred = vcombine_u16(sum1_u16, sum2_u16); - vst1q_u16(&comp_pred[j], vcomp_pred); - } - comp_pred += width; - pred += width; - ref += ref_stride; - } - } else { - assert(width >= 4); - for (i = 0; i < height; ++i) { - for (j = 0; j < width; j += 4) { - const uint16x4_t pred_u16 = vld1_u16(&pred[j]); - const uint16x4_t ref_u16 = vld1_u16(&ref[j]); - const uint32x4_t sum_u32 = vaddl_u16(pred_u16, ref_u16); - const uint16x4_t vcomp_pred = - vshrn_n_u32(vaddq_u32(sum_u32, one_u32), 1); - vst1_u16(&comp_pred[j], vcomp_pred); - } - comp_pred += width; - pred += width; - ref += ref_stride; - } - } +// Combine bilinear filter with vpx_highbd_comp_avg_pred for blocks having +// width 4. 
+static void highbd_avg_pred_var_filter_block2d_bil_w4( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset); + const uint16x8_t f1 = vdupq_n_u16(filter_offset); + + int i = dst_height; + do { + uint16x8_t s0 = load_unaligned_u16q(src_ptr, src_stride); + uint16x8_t s1 = load_unaligned_u16q(src_ptr + pixel_step, src_stride); + uint16x8_t p = vld1q_u16(second_pred); + + uint16x8_t blend = vmulq_u16(s0, f0); + blend = vmlaq_u16(blend, s1, f1); + blend = vrshrq_n_u16(blend, 3); + + vst1q_u16(dst_ptr, vrhaddq_u16(blend, p)); + + src_ptr += 2 * src_stride; + dst_ptr += 2 * 4; + second_pred += 2 * 4; + i -= 2; + } while (i != 0); +} + +// Combine bilinear filter with vpx_highbd_comp_avg_pred for large blocks. +static void highbd_avg_pred_var_filter_block2d_bil_large( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_width, int dst_height, int filter_offset, + const uint16_t *second_pred) { + const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset); + const uint16x8_t f1 = vdupq_n_u16(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + uint16x8_t p = vld1q_u16(second_pred); + + uint16x8_t blend = vmulq_u16(s0, f0); + blend = vmlaq_u16(blend, s1, f1); + blend = vrshrq_n_u16(blend, 3); + + vst1q_u16(dst_ptr + j, vrhaddq_u16(blend, p)); + + j += 8; + second_pred += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +static void highbd_avg_pred_var_filter_block2d_bil_w8( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 8, dst_height, + filter_offset, second_pred); +} +static void highbd_avg_pred_var_filter_block2d_bil_w16( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 16, dst_height, + filter_offset, second_pred); +} +static void highbd_avg_pred_var_filter_block2d_bil_w32( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 32, dst_height, + filter_offset, second_pred); +} +static void highbd_avg_pred_var_filter_block2d_bil_w64( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 64, dst_height, + filter_offset, second_pred); } #define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \ @@ -239,12 +283,10 @@ void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ (h + padding), xoffset); \ - highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ \ - vpx_highbd_comp_avg_pred_neon(tmp0, 
CONVERT_TO_SHORTPTR(second_pred), w, \ - h, tmp1, w); \ - \ - return vpx_highbd_8_variance##w##x##h##_neon(CONVERT_TO_BYTEPTR(tmp0), w, \ + return vpx_highbd_8_variance##w##x##h##_neon(CONVERT_TO_BYTEPTR(tmp1), w, \ ref, ref_stride, sse); \ } \ \ @@ -258,12 +300,10 @@ void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ (h + padding), xoffset); \ - highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ - \ - vpx_highbd_comp_avg_pred_neon(tmp0, CONVERT_TO_SHORTPTR(second_pred), w, \ - h, tmp1, w); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ \ - return vpx_highbd_10_variance##w##x##h##_neon(CONVERT_TO_BYTEPTR(tmp0), w, \ + return vpx_highbd_10_variance##w##x##h##_neon(CONVERT_TO_BYTEPTR(tmp1), w, \ ref, ref_stride, sse); \ } \ \ @@ -277,12 +317,10 @@ void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ (h + padding), xoffset); \ - highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ - \ - vpx_highbd_comp_avg_pred_neon(tmp0, CONVERT_TO_SHORTPTR(second_pred), w, \ - h, tmp1, w); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ \ - return vpx_highbd_12_variance##w##x##h##_neon(CONVERT_TO_BYTEPTR(tmp0), w, \ + return vpx_highbd_12_variance##w##x##h##_neon(CONVERT_TO_BYTEPTR(tmp1), w, \ ref, ref_stride, sse); \ } diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index f10b7cc208..5535f82c07 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -433,6 +433,7 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_pred_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_subpel_variance_neon.c endif # CONFIG_VP9_HIGHBITDEPTH From c113d6b027b0299abe31f05240e2895112517c91 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Thu, 9 Feb 2023 14:16:30 +0000 Subject: [PATCH 533/926] Specialize Neon high bitdepth subpel variance by filter value Use the same specialization as for standard bitdepth. The rationale for the specialization is as follows: The optimal implementation of the bilinear interpolation depends on the filter values being used. For both horizontal and vertical interpolation this can simplify to just taking the source values, or averaging the source and reference values - which can be computed more easily than a bilinear interpolation with arbitrary filter values. This patch introduces tests to find the most optimal bilinear interpolation implementation based on the filter values being used. This new specialization is only used for larger block sizes. 
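For one output sample, the per-offset dispatch introduced below reduces
to the following scalar model (an illustrative sketch assuming offsets
in [0, 8); hfilter_scalar and its parameters are hypothetical names, not
part of this change):

  #include <stdint.h>

  /* offset 0 selects taps {8, 0} (a plain copy); offset 4 selects taps
   * {4, 4}, i.e. a rounding average, which maps to a single vrhaddq_u16;
   * any other offset needs the general bilinear filter. */
  static uint16_t hfilter_scalar(const uint16_t *src, int pixel_step,
                                 int offset) {
    if (offset == 0) return src[0];
    if (offset == 4) return (uint16_t)((src[0] + src[pixel_step] + 1) >> 1);
    return (uint16_t)(((8 - offset) * src[0] +
                       offset * src[pixel_step] + 4) >> 3);
  }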
Change-Id: I73182c979255f0332a274f2e5907df7f38c9eeb3 --- vpx_dsp/arm/highbd_subpel_variance_neon.c | 217 ++++++++++++++++------ 1 file changed, 161 insertions(+), 56 deletions(-) diff --git a/vpx_dsp/arm/highbd_subpel_variance_neon.c b/vpx_dsp/arm/highbd_subpel_variance_neon.c index 07e8ea5107..0682eb3a29 100644 --- a/vpx_dsp/arm/highbd_subpel_variance_neon.c +++ b/vpx_dsp/arm/highbd_subpel_variance_neon.c @@ -120,70 +120,175 @@ static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr, 64, dst_height, filter_offset); } -#define HBD_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ - unsigned int vpx_highbd_8_sub_pixel_variance##w##x##h##_neon( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, uint32_t *sse) { \ - uint16_t tmp0[w * (h + padding)]; \ - uint16_t tmp1[w * h]; \ - uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ - \ - highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ - (h + padding), xoffset); \ - highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ - \ - return vpx_highbd_8_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), w, ref, \ - ref_stride, sse); \ - } \ - \ - unsigned int vpx_highbd_10_sub_pixel_variance##w##x##h##_neon( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, uint32_t *sse) { \ - uint16_t tmp0[w * (h + padding)]; \ - uint16_t tmp1[w * h]; \ - uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ - \ - highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ - (h + padding), xoffset); \ - highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ - \ - return vpx_highbd_10_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), w, ref, \ - ref_stride, sse); \ - } \ - unsigned int vpx_highbd_12_sub_pixel_variance##w##x##h##_neon( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, uint32_t *sse) { \ - uint16_t tmp0[w * (h + padding)]; \ - uint16_t tmp1[w * h]; \ - uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ - \ - highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ - (h + padding), xoffset); \ - highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ - \ - return vpx_highbd_12_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), w, ref, \ - ref_stride, sse); \ +static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_width, + int dst_height) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) 
+ assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + uint16x8_t avg = vrhaddq_u16(s0, s1); + vst1q_u16(dst_ptr + j, avg); + + j += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \ + unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse) { \ + uint16_t tmp0[w * (h + padding)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + padding), xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + \ + return vpx_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \ + w, ref, ref_stride, sse); \ + } + +#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \ + unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse) { \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + if (xoffset == 0) { \ + if (yoffset == 0) { \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp[w * h]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \ + h); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride, \ + src_stride, h, yoffset); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint16_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * (h + padding)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ + (h + padding)); \ + highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * (h + padding)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ + (h + padding)); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } else { \ + uint16_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \ + xoffset); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + padding), xoffset); \ + highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + 
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + padding), xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } \ } // 4x blocks are processed two rows at a time, so require an extra row of // padding. -HBD_SUBPEL_VARIANCE_WXH_NEON(4, 4, 2) -HBD_SUBPEL_VARIANCE_WXH_NEON(4, 8, 2) -HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(8, 16, 1) +// 8-bit +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4, 2) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8, 2) + +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16, 1) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32, 1) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64, 1) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64, 1) + +// 10-bit +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4, 2) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8, 2) + +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16, 1) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32, 1) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64, 1) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64, 1) + +// 12-bit +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4, 2) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8, 2) + +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64, 1) // Combine bilinear filter with vpx_highbd_comp_avg_pred for blocks having // width 4. From e03217c9d543d2f1053b45e11f6c323794c80e9a Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Thu, 9 Feb 2023 16:45:01 +0000 Subject: [PATCH 534/926] Specialize Neon high bitdepth avg subpel variance by filter value Use the same specialization as for standard bitdepth. 
The rationale for the specialization is as follows: The optimal implementation of the bilinear interpolation depends on the filter values being used. For both horizontal and vertical interpolation this can simplify to just taking the source values, or averaging the source and reference values - which can be computed more easily than a bilinear interpolation with arbitrary filter values. This patch introduces tests to find the most optimal bilinear interpolation implementation based on the filter values being used. This new specialization is only used for larger block sizes. Change-Id: Id5a2b2d9fac6f878795a6ed9de2bc27d9e62d661 --- vpx_dsp/arm/highbd_subpel_variance_neon.c | 264 +++++++++++++++++----- 1 file changed, 204 insertions(+), 60 deletions(-) diff --git a/vpx_dsp/arm/highbd_subpel_variance_neon.c b/vpx_dsp/arm/highbd_subpel_variance_neon.c index 0682eb3a29..aa64697458 100644 --- a/vpx_dsp/arm/highbd_subpel_variance_neon.c +++ b/vpx_dsp/arm/highbd_subpel_variance_neon.c @@ -377,74 +377,218 @@ static void highbd_avg_pred_var_filter_block2d_bil_w64( filter_offset, second_pred); } -#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \ - uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_neon( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t tmp0[w * (h + padding)]; \ - uint16_t tmp1[w * h]; \ - uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ - \ - highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ - (h + padding), xoffset); \ - highbd_avg_pred_var_filter_block2d_bil_w##w( \ - tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ - \ - return vpx_highbd_8_variance##w##x##h##_neon(CONVERT_TO_BYTEPTR(tmp1), w, \ - ref, ref_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_neon( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t tmp0[w * (h + padding)]; \ - uint16_t tmp1[w * h]; \ - uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ - \ - highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ - (h + padding), xoffset); \ - highbd_avg_pred_var_filter_block2d_bil_w##w( \ - tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ - \ - return vpx_highbd_10_variance##w##x##h##_neon(CONVERT_TO_BYTEPTR(tmp1), w, \ - ref, ref_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_neon( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, uint32_t *sse, \ +// Combine averaging subpel filter with vpx_highbd_comp_avg_pred. +static void highbd_avg_pred_var_filter_block2d_avg( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_width, int dst_height, const uint16_t *second_pred) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) 
+ assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + uint16x8_t avg = vrhaddq_u16(s0, s1); + + uint16x8_t p = vld1q_u16(second_pred); + avg = vrhaddq_u16(avg, p); + + vst1q_u16(dst_ptr + j, avg); + + j += 8; + second_pred += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +// Implementation of vpx_highbd_comp_avg_pred for blocks having width >= 16. +static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, + int src_stride, int dst_width, int dst_height, + const uint16_t *second_pred) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint16x8_t s = vld1q_u16(src_ptr + j); + uint16x8_t p = vld1q_u16(second_pred); + + uint16x8_t avg = vrhaddq_u16(s, p); + + vst1q_u16(dst_ptr + j, avg); + + j += 8; + second_pred += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \ + uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t tmp0[w * (h + padding)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + padding), xoffset); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } + +#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \ + unsigned int vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int source_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse, \ const uint8_t *second_pred) { \ - uint16_t tmp0[w * (h + padding)]; \ - uint16_t tmp1[w * h]; \ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ \ - highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ - (h + padding), xoffset); \ - highbd_avg_pred_var_filter_block2d_bil_w##w( \ - tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ - \ - return vpx_highbd_12_variance##w##x##h##_neon(CONVERT_TO_BYTEPTR(tmp1), w, \ - ref, ref_stride, sse); \ + if (xoffset == 0) { \ + uint16_t tmp[w * h]; \ + if (yoffset == 0) { \ + highbd_avg_pred(src_ptr, tmp, source_stride, w, h, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + highbd_avg_pred_var_filter_block2d_avg( \ + src_ptr, tmp, source_stride, source_stride, w, h, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } else { \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + src_ptr, tmp, source_stride, source_stride, h, yoffset, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } \ + } else if 
(xoffset == 4) { \ + uint16_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + highbd_avg_pred_var_filter_block2d_avg( \ + src_ptr, tmp0, source_stride, 1, w, h, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * (h + padding)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \ + (h + padding)); \ + highbd_avg_pred_var_filter_block2d_avg( \ + tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * (h + padding)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \ + (h + padding)); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } else { \ + uint16_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + src_ptr, tmp0, source_stride, 1, h, xoffset, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \ + (h + padding), xoffset); \ + highbd_avg_pred_var_filter_block2d_avg( \ + tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \ + (h + padding), xoffset); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } \ } // 4x blocks are processed two rows at a time, so require an extra row of // padding. 
-SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2) -SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2) -SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1) +// 8-bit +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4, 2) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8, 2) + +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4, 1) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8, 1) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16, 1) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32, 1) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64, 1) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64, 1) + +// 10-bit +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4, 2) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8, 2) + +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4, 1) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8, 1) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16, 1) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32, 1) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64, 1) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64, 1) + +// 12-bit +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4, 2) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8, 2) + +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4, 1) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8, 1) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64, 1) From b17993ca673f284c99b789b207c80df50f8c4429 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Fri, 10 Feb 2023 10:29:24 +0000 Subject: [PATCH 535/926] Add Neon AvgPredTestHBD test suite Add test suite for vpx_highbd_comp_avg_pred_neon. 
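For reference, the operation under test is a rounding average of the prediction and reference blocks. A minimal scalar sketch of that behaviour (illustrative only; the function name is hypothetical and this is not the actual C reference in vpx_dsp):

  // comp_pred[] = rounded average of pred[] and ref[]: (p + r + 1) >> 1.
  // pred and comp_pred are contiguous (stride == width); ref uses ref_stride.
  static void highbd_avg_pred_sketch(uint16_t *comp_pred, const uint16_t *pred,
                                     int width, int height,
                                     const uint16_t *ref, int ref_stride) {
    for (int i = 0; i < height; ++i) {
      for (int j = 0; j < width; ++j) {
        comp_pred[j] = (uint16_t)((pred[j] + ref[j] + 1) >> 1);
      }
      comp_pred += width;
      pred += width;
      ref += ref_stride;
    }
  }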
Change-Id: I5c31e0e990661ee3b8030bb517829c088fceae4d --- test/comp_avg_pred_test.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc index 70aeab8d7e..f747c3524e 100644 --- a/test/comp_avg_pred_test.cc +++ b/test/comp_avg_pred_test.cc @@ -260,5 +260,11 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_sse2>)); #endif // HAVE_SSE2 +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, AvgPredTestHBD, + ::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_neon>)); +#endif // HAVE_NEON + #endif // CONFIG_VP9_HIGHBITDEPTH } // namespace From ed68c267cfafbd5acc4fe9a9d61b27a9e70170de Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Fri, 10 Feb 2023 10:50:47 +0000 Subject: [PATCH 536/926] Optimize vpx_highbd_comp_avg_pred_neon Optimize the implementation of vpx_highbd_comp_avg_pred_neon by making use of the URHADD instruction to compute the average. Change-Id: Id74a6d9c33e89bc548c3c7ecace59af69051b4a7 --- vpx_dsp/arm/highbd_avg_pred_neon.c | 67 +++++++++++++++++------------- 1 file changed, 37 insertions(+), 30 deletions(-) diff --git a/vpx_dsp/arm/highbd_avg_pred_neon.c b/vpx_dsp/arm/highbd_avg_pred_neon.c index 04dbaad7a3..3063acbb3e 100644 --- a/vpx_dsp/arm/highbd_avg_pred_neon.c +++ b/vpx_dsp/arm/highbd_avg_pred_neon.c @@ -9,6 +9,7 @@ */ #include <arm_neon.h> +#include <assert.h> #include "./vpx_dsp_rtcd.h" #include "./vpx_config.h" @@ -16,42 +17,48 @@ void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride) { - int i, j; - uint32x4_t one_u32 = vdupq_n_u32(1); - if (width >= 8) { - for (i = 0; i < height; ++i) { - for (j = 0; j < width; j += 8) { - const uint16x8_t pred_u16 = vld1q_u16(&pred[j]); - const uint16x8_t ref_u16 = vld1q_u16(&ref[j]); - const uint32x4_t sum1_u32 = - vaddl_u16(vget_low_u16(pred_u16), vget_low_u16(ref_u16)); - const uint32x4_t sum2_u32 = - vaddl_u16(vget_high_u16(pred_u16), vget_high_u16(ref_u16)); - const uint16x4_t sum1_u16 = - vshrn_n_u32(vaddq_u32(sum1_u32, one_u32), 1); - const uint16x4_t sum2_u16 = - vshrn_n_u32(vaddq_u32(sum2_u32, one_u32), 1); - const uint16x8_t vcomp_pred = vcombine_u16(sum1_u16, sum2_u16); - vst1q_u16(&comp_pred[j], vcomp_pred); - } + int i = height; + if (width > 8) { + do { + int j = 0; + do { + const uint16x8_t p = vld1q_u16(pred + j); + const uint16x8_t r = vld1q_u16(ref + j); + + uint16x8_t avg = vrhaddq_u16(p, r); + vst1q_u16(comp_pred + j, avg); + + j += 8; + } while (j < width); + + comp_pred += width; + pred += width; + ref += ref_stride; + } while (--i != 0); + } else if (width == 8) { + do { + const uint16x8_t p = vld1q_u16(pred); + const uint16x8_t r = vld1q_u16(ref); + + uint16x8_t avg = vrhaddq_u16(p, r); + vst1q_u16(comp_pred, avg); + comp_pred += width; pred += width; ref += ref_stride; - } + } while (--i != 0); } else { - assert(width >= 4); - for (i = 0; i < height; ++i) { - for (j = 0; j < width; j += 4) { - const uint16x4_t pred_u16 = vld1_u16(&pred[j]); - const uint16x4_t ref_u16 = vld1_u16(&ref[j]); - const uint32x4_t sum_u32 = vaddl_u16(pred_u16, ref_u16); - const uint16x4_t vcomp_pred = - vshrn_n_u32(vaddq_u32(sum_u32, one_u32), 1); - vst1_u16(&comp_pred[j], vcomp_pred); - } + assert(width == 4); + do { + const uint16x4_t p = vld1_u16(pred); + const uint16x4_t r = vld1_u16(ref); + + uint16x4_t avg = vrhadd_u16(p, r); + vst1_u16(comp_pred, avg); + comp_pred += width; pred += width; ref += ref_stride; - } + } while (--i != 0); } } From 660031ccf380e6a37beecc67e78154b7b6ea78d8 Mon Sep 17 00:00:00 2001 From: chiyotsai Date:
Mon, 13 Feb 2023 17:57:26 -0800 Subject: [PATCH 537/926] Enable some more speed features on speed 0 to 2 Performance: | SPD_SET | TESTSET | AVG_PSNR | OVR_PSNR | SSIM | ENC_T | |---------|---------|----------|----------|---------|-------| | 0 | hdres2 | +0.034% | +0.030% | +0.033% | -3.7% | | 0 | lowres2 | +0.012% | +0.017% | +0.044% | -2.1% | | 0 | midres2 | +0.030% | +0.035% | +0.060% | -1.9% | |---------|---------|----------|----------|---------|-------| | 1 | hdres2 | +0.027% | +0.036% | +0.030% | -2.7% | | 1 | lowres2 | -0.006% | -0.002% | +0.006% | -1.0% | | 1 | midres2 | -0.006% | -0.012% | -0.010% | -1.0% | |---------|---------|----------|----------|---------|-------| | 2 | hdres2 | -0.006% | -0.001% | -0.020% | -2.4% | | 2 | lowres2 | -0.010% | -0.015% | -0.001% | -0.9% | | 2 | midres2 | +0.006% | -0.005% | +0.009% | -1.0% | STATS_CHANGED Change-Id: I1431ac07215bb844739a410697387b9aead82792 --- vp9/encoder/vp9_speed_features.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 19e37537b2..58e9e739a2 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -84,6 +84,9 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, } else { sf->use_square_only_thresh_high = BLOCK_32X32; } + if (is_720p_or_larger) { + sf->alt_ref_search_fp = 1; + } if (!is_1080p_or_larger) { sf->rd_ml_partition.search_breakout = 1; @@ -212,6 +215,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, const int boosted = frame_is_boosted(cpi); int i; + sf->adaptive_interp_filter_search = 1; sf->adaptive_pred_interp_filter = 1; sf->adaptive_rd_thresh = 1; sf->adaptive_rd_thresh_row_mt = 0; @@ -223,6 +227,9 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->tx_size_search_breakout = 1; sf->use_square_partition_only = !boosted; + // Reference masking is not supported in dynamic scaling mode. + sf->reference_masking = oxcf->resize_mode != RESIZE_DYNAMIC; + sf->rd_ml_partition.var_pruning = 1; sf->rd_ml_partition.prune_rect_thresh[0] = -1; sf->rd_ml_partition.prune_rect_thresh[1] = 350; @@ -299,9 +306,6 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->tx_size_search_method = frame_is_boosted(cpi) ? USE_FULL_RD : USE_LARGESTALL; - // Reference masking is not supported in dynamic scaling mode. - sf->reference_masking = oxcf->resize_mode != RESIZE_DYNAMIC ? 1 : 0; - sf->mode_search_skip_flags = (cm->frame_type == KEY_FRAME) ? 
0 @@ -347,7 +351,6 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->mode_skip_start = 6; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; - sf->adaptive_interp_filter_search = 1; if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { for (i = 0; i < MAX_MESH_STEP; ++i) { From b737865480d2f1355a972f2f9b3b3a0f34a9ef83 Mon Sep 17 00:00:00 2001 From: chiyotsai Date: Tue, 14 Feb 2023 14:29:29 -0800 Subject: [PATCH 538/926] Relax frame recode tolerance on speed 0 to 1 above 480p Performance: | SPD_SET | TESTSET | AVG_PSNR | OVR_PSNR | SSIM | ENC_T | |---------|---------|----------|----------|---------|-------| | 0 | hdres2 | -0.028% | +0.030% | -0.408% | -2.0% | | 0 | lowres2 | +0.000% | +0.000% | +0.000% | +0.0% | | 0 | midres2 | -0.138% | +0.042% | -0.427% | -2.5% | |---------|---------|----------|----------|---------|-------| | 1 | hdres2 | -0.032% | +0.018% | -0.342% | -1.1% | | 1 | lowres2 | +0.000% | +0.000% | +0.000% | +0.0% | | 1 | midres2 | +0.050% | +0.060% | -0.257% | -1.6% | Rate Error: | | | AVG_RC_ERROR | MAX_RC_ERROR | | | |---------------------|---------------------| | SPD_SET | TESTSET | BASE | TEST | BASE | TEST | |---------|---------|----------|----------|----------|----------| | 0 | hdres2 | 33.044% | 33.065% | 149.903% | 149.903% | | 0 | midres2 | 59.632% | 59.566% | 79.091% | 79.249% | |---------|---------|----------|----------|----------|----------| | 1 | hdres2 | 33.050% | 33.057% | 151.278% | 151.278% | | 1 | midres2 | 59.640% | 59.614% | 78.707% | 78.842% | STATS_CHANGED Change-Id: I5d09601fede3912d5173717ce9dd070df3a97ec8 --- vp9/encoder/vp9_speed_features.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 58e9e739a2..72ac0cebb8 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -81,6 +81,7 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, // Currently, the machine-learning based partition search early termination // is only used while VPXMIN(cm->width, cm->height) >= 480 and speed = 0. 
sf->rd_ml_partition.search_early_termination = 1; + sf->recode_tolerance_high = 45; } else { sf->use_square_only_thresh_high = BLOCK_32X32; } @@ -314,7 +315,6 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->disable_filter_search_var_thresh = 100; sf->comp_inter_joint_search_thresh = BLOCK_SIZES; sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; - sf->recode_tolerance_low = 15; sf->recode_tolerance_high = 45; sf->enhanced_full_pixel_motion_search = 0; sf->prune_ref_frame_for_rect_partitions = 0; From be2fd0c7406bcf7297220d3fa43d4baa2f619c7b Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 16 Feb 2023 17:48:49 -0500 Subject: [PATCH 539/926] vp9 rc: Verify QP for all spatial layers Change-Id: Ic669c96d25d7c039d370e9acd00dc45e09054552 --- test/vp9_ratectrl_rtc_test.cc | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc index cce73fcce2..ad421d3acd 100644 --- a/test/vp9_ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -235,6 +235,7 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { ::libvpx_test::CxDataIterator iter = encoder->GetCxData(); for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) sizes_[sl] = 0; + std::vector<int> rc_qp; while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { ParseSuperframeSizes(static_cast<const uint8_t *>(pkt->data.frame.buf), pkt->data.frame.sz); @@ -252,14 +253,17 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, rc_api_->ComputeQP(frame_params_); frame_params_.frame_type = libvpx::RcFrameType::kInterFrame; rc_api_->PostEncodeUpdate(sizes_[sl]); + rc_qp.push_back(rc_api_->GetQP()); } } } if (!encoder_exit_) { - int loopfilter_level, qp; + int loopfilter_level; + std::vector<int> encoder_qp(VPX_SS_MAX_LAYERS, 0); encoder->Control(VP9E_GET_LOOPFILTER_LEVEL, &loopfilter_level); - encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp); - ASSERT_EQ(rc_api_->GetQP(), qp); + encoder->Control(VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, encoder_qp.data()); + encoder_qp.resize(rc_qp.size()); + ASSERT_EQ(rc_qp, encoder_qp); + ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level); } } From 0f888815c546f7b3bda0795d0a585203bde406bc Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 11 Nov 2022 14:21:27 -0500 Subject: [PATCH 540/926] vp9 rc: Make it work for SVC parallel encoding Added unit test. Keep track of spatial layer id and frame type in the case where spatial layers are encoded in parallel by the hardware encoder. ComputeQP() / PostEncodeUpdate() doesn't need to be called sequentially when there is no inter layer prediction.
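To illustrate the intended calling pattern when layers are encoded in parallel, a sketch mirroring the new unit test (API names follow vp9/ratectrl_rtc.h; the surrounding encoder setup and per-layer size collection are assumed):

  // With inter-layer prediction off, QPs for every spatial layer of a
  // superframe can be computed up front, and the encoded sizes fed back
  // in a second pass, instead of strictly alternating the two calls.
  void RateControlSuperframeParallel(libvpx::VP9RateControlRTC *rc,
                                     libvpx::VP9FrameParamsQpRTC frame_params,
                                     const uint64_t *encoded_size,
                                     int num_spatial_layers) {
    for (int sl = 0; sl < num_spatial_layers; ++sl) {
      frame_params.spatial_layer_id = sl;
      rc->ComputeQP(frame_params);
    }
    for (int sl = 0; sl < num_spatial_layers; ++sl) {
      frame_params.spatial_layer_id = sl;
      rc->PostEncodeUpdate(encoded_size[sl], frame_params);
    }
  }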
Bug: b/257368998 Change-Id: I50beaefcfc205d3f9a9d3dbe11fead5bfdc71489 --- test/vp9_ratectrl_rtc_test.cc | 96 ++++++++++++++++++++++-------- vp9/encoder/vp9_svc_layercontext.c | 5 ++ vp9/encoder/vp9_svc_layercontext.h | 3 + vp9/ratectrl_rtc.cc | 29 ++++++++- vp9/ratectrl_rtc.h | 3 +- 5 files changed, 110 insertions(+), 26 deletions(-) diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc index ad421d3acd..578ad26fcf 100644 --- a/test/vp9_ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -82,7 +82,7 @@ class RcInterfaceTest } virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { - rc_api_->PostEncodeUpdate(pkt->data.frame.sz); + rc_api_->PostEncodeUpdate(pkt->data.frame.sz, frame_params_); } void RunOneLayer() { @@ -162,10 +162,14 @@ class RcInterfaceTest bool encoder_exit_; }; -class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, - public ::libvpx_test::CodecTestWithParam<int> { +class RcInterfaceSvcTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params<int, bool> { public: - RcInterfaceSvcTest() : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)) {} + RcInterfaceSvcTest() + : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000), + dynamic_spatial_layers_(0), inter_layer_pred_off_(GET_PARAM(2)), + parallel_spatial_layers_(false) {} virtual ~RcInterfaceSvcTest() {} protected: @@ -184,6 +188,10 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, encoder->Control(VP9E_SET_RTC_EXTERNAL_RATECTRL, 1); encoder->Control(VP9E_SET_SVC, 1); encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); + if (inter_layer_pred_off_) { + encoder->Control(VP9E_SET_SVC_INTER_LAYER_PRED, + INTER_LAYER_PRED_OFF_NONKEY); + } } frame_params_.frame_type = video->frame() % key_interval_ == 0 ? libvpx::RcFrameType::kKeyFrame @@ -232,6 +240,22 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, } } + virtual void SetFrameParamsSvc(int sl) { + frame_params_.spatial_layer_id = sl; + if (rc_cfg_.ts_number_layers == 3) + frame_params_.temporal_layer_id = + kTemporalId3Layer[current_superframe_ % 4]; + else if (rc_cfg_.ts_number_layers == 2) + frame_params_.temporal_layer_id = + kTemporalId2Layer[current_superframe_ % 2]; + else + frame_params_.temporal_layer_id = 0; + frame_params_.frame_type = + current_superframe_ % key_interval_ == 0 && sl == 0 ?
libvpx::RcFrameType::kKeyFrame + : libvpx::RcFrameType::kInterFrame; + } + virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { ::libvpx_test::CxDataIterator iter = encoder->GetCxData(); for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) sizes_[sl] = 0; @@ -239,21 +263,28 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { ParseSuperframeSizes(static_cast<const uint8_t *>(pkt->data.frame.buf), pkt->data.frame.sz); - for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { - if (sizes_[sl] > 0) { - frame_params_.spatial_layer_id = sl; - if (rc_cfg_.ts_number_layers == 3) - frame_params_.temporal_layer_id = - kTemporalId3Layer[current_superframe_ % 4]; - else if (rc_cfg_.ts_number_layers == 2) - frame_params_.temporal_layer_id = - kTemporalId2Layer[current_superframe_ % 2]; - else - frame_params_.temporal_layer_id = 0; - rc_api_->ComputeQP(frame_params_); - frame_params_.frame_type = libvpx::RcFrameType::kInterFrame; - rc_api_->PostEncodeUpdate(sizes_[sl]); + if (!parallel_spatial_layers_ || current_superframe_ == 0) { + for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { + if (sizes_[sl] > 0) { + SetFrameParamsSvc(sl); + rc_api_->ComputeQP(frame_params_); + rc_api_->PostEncodeUpdate(sizes_[sl], frame_params_); + rc_qp.push_back(rc_api_->GetQP()); + } + } + } else { + for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { + if (sizes_[sl] > 0) { + SetFrameParamsSvc(sl); + rc_api_->ComputeQP(frame_params_); + } + } + for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { + if (sizes_[sl] > 0) { + SetFrameParamsSvc(sl); + rc_api_->PostEncodeUpdate(sizes_[sl], frame_params_); + rc_qp.push_back(rc_api_->GetQP()); + } } } } @@ -274,9 +305,7 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, const vpx_image_t * /*img2*/) {} void RunSvc() { - dynamic_spatial_layers_ = 0; SetRCConfigSvc(3, 3); - key_interval_ = 10000; rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); SetEncoderConfigSvc(3, 3); @@ -287,7 +316,6 @@ } void RunSvcPeriodicKey() { - dynamic_spatial_layers_ = 0; SetRCConfigSvc(3, 3); key_interval_ = 100; rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); @@ -302,7 +330,19 @@ void RunSvcDynamicSpatial() { dynamic_spatial_layers_ = 1; SetRCConfigSvc(3, 3); - key_interval_ = 10000; + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + SetEncoderConfigSvc(3, 3); + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunSvcParallelSpatialLayers() { + if (!inter_layer_pred_off_) return; + parallel_spatial_layers_ = true; + SetRCConfigSvc(3, 3); rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); SetEncoderConfigSvc(3, 3); @@ -505,6 +545,9 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, uint32_t sizes_[8]; int key_interval_; int dynamic_spatial_layers_; + bool inter_layer_pred_off_; + // ComputeQP() and PostEncodeUpdate() don't need to be sequential for KSVC.
+ bool parallel_spatial_layers_; }; TEST_P(RcInterfaceTest, OneLayer) { RunOneLayer(); } @@ -513,11 +556,16 @@ TEST_P(RcInterfaceTest, OneLayerVBRPeriodicKey) { RunOneLayerVBRPeriodicKey(); } TEST_P(RcInterfaceSvcTest, Svc) { RunSvc(); } +TEST_P(RcInterfaceSvcTest, SvcParallelSpatialLayers) { + RunSvcParallelSpatialLayers(); +} + TEST_P(RcInterfaceSvcTest, SvcPeriodicKey) { RunSvcPeriodicKey(); } TEST_P(RcInterfaceSvcTest, SvcDynamicSpatial) { RunSvcDynamicSpatial(); } VP9_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0, 3), ::testing::Values(VPX_CBR, VPX_VBR)); -VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0, 3)); +VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0, 3), + ::testing::Values(true, false)); } // namespace diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 7e9435fb5f..c60445cba5 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -389,6 +389,8 @@ void vp9_save_layer_context(VP9_COMP *const cpi) { lc->twopass = cpi->twopass; lc->target_bandwidth = (int)oxcf->target_bandwidth; lc->alt_ref_source = cpi->alt_ref_source; + lc->frame_qp = cpi->common.base_qindex; + lc->MBs = cpi->common.MBs; // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers, // for the base temporal layer. @@ -408,6 +410,9 @@ lc->actual_num_seg1_blocks = cr->actual_num_seg1_blocks; lc->actual_num_seg2_blocks = cr->actual_num_seg2_blocks; lc->counter_encode_maxq_scene_change = cr->counter_encode_maxq_scene_change; + lc->qindex_delta[0] = cr->qindex_delta[0]; + lc->qindex_delta[1] = cr->qindex_delta[1]; + lc->qindex_delta[2] = cr->qindex_delta[2]; } } diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h index c7328cf571..90dec5e20a 100644 --- a/vp9/encoder/vp9_svc_layercontext.h +++ b/vp9/encoder/vp9_svc_layercontext.h @@ -70,8 +70,11 @@ typedef struct { int actual_num_seg1_blocks; int actual_num_seg2_blocks; int counter_encode_maxq_scene_change; + int qindex_delta[3]; uint8_t speed; int loopfilter_ctrl; + int frame_qp; + int MBs; } LAYER_CONTEXT; typedef struct SVC { diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index 944c526ac1..8592173fb6 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -220,6 +220,9 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { vp9_rc_pick_q_and_bounds(cpi_, &bottom_index, &top_index); if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_setup(cpi_); + if (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) + vp9_save_layer_context(cpi_); } int VP9RateControlRTC::GetQP() const { return cpi_->common.base_qindex; } @@ -242,7 +245,31 @@ bool VP9RateControlRTC::GetSegmentationData( return true; } -void VP9RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) { +void VP9RateControlRTC::PostEncodeUpdate( + uint64_t encoded_frame_size, const VP9FrameParamsQpRTC &frame_params) { + cpi_->common.frame_type = static_cast<FRAME_TYPE>(frame_params.frame_type); + cpi_->svc.spatial_layer_id = frame_params.spatial_layer_id; + cpi_->svc.temporal_layer_id = frame_params.temporal_layer_id; + if (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) { + vp9_restore_layer_context(cpi_); + const int layer = LAYER_IDS_TO_IDX(cpi_->svc.spatial_layer_id, + cpi_->svc.temporal_layer_id, + cpi_->svc.number_temporal_layers); + LAYER_CONTEXT *lc = &cpi_->svc.layer_context[layer];
+ cpi_->common.base_qindex = lc->frame_qp; + cpi_->common.MBs = lc->MBs; + // For spatial-svc, allow cyclic-refresh to be applied on the spatial + // layers, for the base temporal layer. + if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + cpi_->svc.number_spatial_layers > 1 && + cpi_->svc.temporal_layer_id == 0) { + CYCLIC_REFRESH *const cr = cpi_->cyclic_refresh; + cr->qindex_delta[0] = lc->qindex_delta[0]; + cr->qindex_delta[1] = lc->qindex_delta[1]; + cr->qindex_delta[2] = lc->qindex_delta[2]; + } + } vp9_rc_postencode_update(cpi_, encoded_frame_size); if (cpi_->svc.number_spatial_layers > 1 || cpi_->svc.number_temporal_layers > 1) diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index 162a04883e..3131c22231 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -99,7 +99,8 @@ class VP9RateControlRTC { bool GetSegmentationData(VP9SegmentationData *segmentation_data) const; void ComputeQP(const VP9FrameParamsQpRTC &frame_params); // Feedback to rate control with the size of current encoded frame - void PostEncodeUpdate(uint64_t encoded_frame_size); + void PostEncodeUpdate(uint64_t encoded_frame_size, + const VP9FrameParamsQpRTC &frame_params); private: VP9RateControlRTC() {} From c4ee2b2f033d377427017b2b8244e5f29fe80961 Mon Sep 17 00:00:00 2001 From: Deepa K G Date: Thu, 16 Feb 2023 21:47:24 +0530 Subject: [PATCH 541/926] Skip redundant iterations in joint motion search In joint_motion_search, there are four iterations. Even iterations search in the first reference frame and odd iterations search in the second. The last two iterations use the search result of the first two iterations as the start point. If the search result does not change, the last two iterations are not necessary and can be skipped. Instruction Count cpu-used Reduction(%) 0 1.411 Change-Id: Ie583c9f75dd0a22bbdfb432ccdd62eea6ec4fce8 --- vp9/encoder/vp9_rdopt.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 498bc0fbd5..201bf416db 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -1862,6 +1862,19 @@ static int check_best_zero_mv(const VP9_COMP *cpi, return 1; } +static INLINE int skip_iters(const int_mv iter_mvs[][2], int ite, int id) { + if (ite >= 2 && iter_mvs[ite - 2][!id].as_int == iter_mvs[ite][!id].as_int) { + int_mv cur_fullpel_mv, prev_fullpel_mv; + cur_fullpel_mv.as_mv.row = iter_mvs[ite][id].as_mv.row >> 3; + cur_fullpel_mv.as_mv.col = iter_mvs[ite][id].as_mv.col >> 3; + prev_fullpel_mv.as_mv.row = iter_mvs[ite - 2][id].as_mv.row >> 3; + prev_fullpel_mv.as_mv.col = iter_mvs[ite - 2][id].as_mv.col >> 3; + if (cur_fullpel_mv.as_int == prev_fullpel_mv.as_int) return 1; + } + return 0; +} + +#define NUM_ITERS 4 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *frame_mv, int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], @@ -1874,6 +1887,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, const int refs[2] = { mi->ref_frame[0], mi->ref_frame[1] < 0 ?
0 : mi->ref_frame[1] }; int_mv ref_mv[2]; + int_mv iter_mvs[NUM_ITERS][2]; int ite, ref; const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter]; struct scale_factors sf; @@ -1909,6 +1923,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, } frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int; + iter_mvs[0][ref].as_int = single_newmv[refs[ref]].as_int; } // Since we have scaled the reference frames to match the size of the current @@ -1923,7 +1938,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, // Allow joint search multiple times iteratively for each reference frame // and break out of the search loop if it couldn't find a better mv. - for (ite = 0; ite < 4; ite++) { + for (ite = 0; ite < NUM_ITERS; ite++) { struct buf_2d ref_yv12[2]; uint32_t bestsme = UINT_MAX; int sadpb = x->sadperbit16; @@ -1935,6 +1950,11 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, // odd iterations search in the second. The predictor // found for the 'other' reference frame is factored in. + // Skip further iterations of search if in the previous iteration, the + // motion vector of the searched ref frame is unchanged, and the other ref + // frame's full-pixel mv is unchanged. + if (skip_iters(iter_mvs, ite, id)) break; + // Initialized here because of compiler problem in Visual Studio. ref_yv12[0] = xd->plane[0].pre[0]; ref_yv12[1] = xd->plane[0].pre[1]; @@ -2000,6 +2020,10 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, } else { break; } + if (ite < NUM_ITERS - 1) { + iter_mvs[ite + 1][0].as_int = frame_mv[refs[0]].as_int; + iter_mvs[ite + 1][1].as_int = frame_mv[refs[1]].as_int; + } } *rate_mv = 0; From 6ed9639e43983a95272d3a2787aa9aa0d23ffcbb Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Mon, 13 Feb 2023 16:11:31 +0000 Subject: [PATCH 542/926] Optimize Neon implementation of high bitdepth variance functions Specialize implementation of high bitdepth variance functions such that we only widen data processing element types when absolutely necessary.
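The underlying observation, in scalar form (a sketch of the overflow reasoning only, not the Neon code): a difference of two 12-bit samples fits in 16 bits and its square in 32 bits, so the sums can stay in narrow accumulators and only widen to 64 bits periodically.

  // Worst-case squared difference is 4095^2 = 16769025, so a 32-bit
  // accumulator safely holds up to 2^31 / 16769025 ~= 128 of them; a row of
  // w <= 64 squared differences (16769025 * 64 < 2^31) is always safe.
  static void variance_narrow_sketch(const uint16_t *src, int src_stride,
                                     const uint16_t *ref, int ref_stride,
                                     int w, int h, uint64_t *sse,
                                     int64_t *sum) {
    int64_t sum64 = 0;
    uint64_t sse64 = 0;
    for (int i = 0; i < h; ++i) {
      int32_t row_sum = 0;
      uint32_t row_sse = 0;
      for (int j = 0; j < w; ++j) {
        const int diff = src[j] - ref[j];  // fits in 16 bits for 12-bit input
        row_sum += diff;
        row_sse += (uint32_t)(diff * diff);
      }
      sum64 += row_sum;  // widen once per row rather than once per element
      sse64 += row_sse;
      src += src_stride;
      ref += ref_stride;
    }
    *sum = sum64;
    *sse = sse64;
  }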
Change-Id: If4cc3fea7b5ab0821e3129ebd79ff63706a512bf --- vpx_dsp/arm/highbd_variance_neon.c | 548 +++++++++++++++++++---------- vpx_dsp/arm/sum_neon.h | 17 + 2 files changed, 373 insertions(+), 192 deletions(-) diff --git a/vpx_dsp/arm/highbd_variance_neon.c b/vpx_dsp/arm/highbd_variance_neon.c index 985cc35682..89bd5c579d 100644 --- a/vpx_dsp/arm/highbd_variance_neon.c +++ b/vpx_dsp/arm/highbd_variance_neon.c @@ -18,214 +18,378 @@ #include "vpx_dsp/arm/sum_neon.h" #include "vpx_ports/mem.h" -static INLINE void highbd_variance16(const uint16_t *src_ptr, int src_stride, - const uint16_t *ref_ptr, int ref_stride, - int w, int h, uint64_t *sse, - int64_t *sum) { - int i, j; - - if (w >= 8) { - int32x4_t sum_s32 = vdupq_n_s32(0); - uint32x4_t sse_u32 = vdupq_n_u32(0); - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - const int16x8_t src_s16 = vreinterpretq_s16_u16(vld1q_u16(&src_ptr[j])); - const int16x8_t ref_s16 = vreinterpretq_s16_u16(vld1q_u16(&ref_ptr[j])); - const int32x4_t diff1_s32 = - vsubl_s16(vget_low_s16(src_s16), vget_low_s16(ref_s16)); - const int32x4_t diff2_s32 = - vsubl_s16(vget_high_s16(src_s16), vget_high_s16(ref_s16)); - const uint32x4_t diff1_u32 = vreinterpretq_u32_s32(diff1_s32); - const uint32x4_t diff2_u32 = vreinterpretq_u32_s32(diff2_s32); - sum_s32 = vaddq_s32(sum_s32, diff1_s32); - sum_s32 = vaddq_s32(sum_s32, diff2_s32); - sse_u32 = vmlaq_u32(sse_u32, diff1_u32, diff1_u32); - sse_u32 = vmlaq_u32(sse_u32, diff2_u32, diff2_u32); - } - src_ptr += src_stride; - ref_ptr += ref_stride; - } - *sum = horizontal_add_int32x4(sum_s32); - *sse = horizontal_add_uint32x4(sse_u32); - } else { - int32x4_t sum_s32 = vdupq_n_s32(0); - uint32x4_t sse_u32 = vdupq_n_u32(0); - assert(w >= 4); - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 4) { - const int16x4_t src_s16 = vreinterpret_s16_u16(vld1_u16(&src_ptr[j])); - const int16x4_t ref_s16 = vreinterpret_s16_u16(vld1_u16(&ref_ptr[j])); - const int32x4_t diff_s32 = vsubl_s16(src_s16, ref_s16); - const uint32x4_t diff_u32 = vreinterpretq_u32_s32(diff_s32); - sum_s32 = vaddq_s32(sum_s32, diff_s32); - sse_u32 = vmlaq_u32(sse_u32, diff_u32, diff_u32); - } - src_ptr += src_stride; - ref_ptr += ref_stride; - } - *sum = horizontal_add_int32x4(sum_s32); - *sse = horizontal_add_uint32x4(sse_u32); - } +// Process a block of width 4 two rows at a time. 
+static INLINE void highbd_variance_4xh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + int16x8_t sum_s16 = vdupq_n_s16(0); + int32x4_t sse_s32 = vdupq_n_s32(0); + + int i = h; + do { + const uint16x8_t s = load_unaligned_u16q(src_ptr, src_stride); + const uint16x8_t r = load_unaligned_u16q(ref_ptr, ref_stride); + + int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r)); + sum_s16 = vaddq_s16(sum_s16, diff); + + sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff)); + sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff)); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + i -= 2; + } while (i != 0); + + *sum = horizontal_add_int16x8(sum_s16); + *sse = horizontal_add_int32x4(sse_s32); } -static INLINE void highbd_variance64(const uint8_t *src8_ptr, int src_stride, - const uint8_t *ref8_ptr, int ref_stride, - int w, int h, uint64_t *sse, - int64_t *sum) { - uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr); - uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref8_ptr); - - if (w < 32 && h < 32) { - highbd_variance16(src_ptr, src_stride, ref_ptr, ref_stride, w, h, sse, sum); - } else { - uint64_t sse_long = 0; - int64_t sum_long = 0; - int k, l; - for (k = 0; k + 16 <= h; k += 16) { - for (l = 0; l + 16 <= w; l += 16) { - uint64_t sse_tmp = 0; - int64_t sum_tmp = 0; - highbd_variance16(src_ptr + l, src_stride, ref_ptr + l, ref_stride, 16, - 16, &sse_tmp, &sum_tmp); - sum_long += sum_tmp; - sse_long += sse_tmp; - } - src_ptr += 16 * src_stride; - ref_ptr += 16 * ref_stride; - } - *sum = sum_long; - *sse = sse_long; - } +// For 8-bit and 10-bit data, since we're using two int32x4 accumulators, all +// block sizes can be processed in 32-bit elements (1023*1023*64*16 = 1071645696 +// for a 64x64 block). 
+static INLINE void highbd_variance_large_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int w, int h, + uint64_t *sse, int64_t *sum) { + int32x4_t sum_s32 = vdupq_n_s32(0); + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + + int i = h; + do { + int j = 0; + do { + const uint16x8_t s = vld1q_u16(src_ptr + j); + const uint16x8_t r = vld1q_u16(ref_ptr + j); + + const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r)); + sum_s32 = vpadalq_s16(sum_s32, diff); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff)); + + j += 8; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int32x4(sum_s32); + *sse = horizontal_long_add_uint32x4(vaddq_u32( + vreinterpretq_u32_s32(sse_s32[0]), vreinterpretq_u32_s32(sse_s32[1]))); +} + +static INLINE void highbd_variance_8xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, + int64_t *sum) { + highbd_variance_large_neon(src, src_stride, ref, ref_stride, 8, h, sse, sum); +} + +static INLINE void highbd_variance_16xh_neon(const uint16_t *src, + int src_stride, + const uint16_t *ref, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + highbd_variance_large_neon(src, src_stride, ref, ref_stride, 16, h, sse, sum); } -static INLINE void highbd_8_variance(const uint8_t *src8_ptr, int src_stride, - const uint8_t *ref8_ptr, int ref_stride, - int w, int h, uint32_t *sse, int *sum) { - uint64_t sse_long = 0; - int64_t sum_long = 0; - highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, - &sum_long); - *sse = (uint32_t)sse_long; - *sum = (int)sum_long; +static INLINE void highbd_variance_32xh_neon(const uint16_t *src, + int src_stride, + const uint16_t *ref, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + highbd_variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum); +} + +static INLINE void highbd_variance_64xh_neon(const uint16_t *src, + int src_stride, + const uint16_t *ref, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + highbd_variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum); +} + +// For 12-bit data, we can only accumulate up to 128 elements in the sum of +// squares (4095*4095*128 = 2146435200), and because we're using two int32x4 +// accumulators, we can only process up to 32 32-element rows (32*32/8 = 128) +// or 16 64-element rows before we have to accumulate into 64-bit elements. +// Therefore blocks of size 32x64, 64x32 and 64x64 are processed in a different +// helper function. + +// Process a block of any size where the width is divisible by 8, with +// accumulation into 64-bit elements. +static INLINE void highbd_variance_xlarge_neon( + const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, + int ref_stride, int w, int h, int h_limit, uint64_t *sse, int64_t *sum) { + int32x4_t sum_s32 = vdupq_n_s32(0); + int64x2_t sse_s64 = vdupq_n_s64(0); + + // 'h_limit' is the number of 'w'-width rows we can process before our 32-bit + // accumulator overflows. After hitting this limit we accumulate into 64-bit + // elements. + int h_tmp = h > h_limit ? 
h_limit : h; + + int i = 0; + do { + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + do { + int j = 0; + do { + const uint16x8_t s0 = vld1q_u16(src_ptr + j); + const uint16x8_t r0 = vld1q_u16(ref_ptr + j); + + const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s0, r0)); + sum_s32 = vpadalq_s16(sum_s32, diff); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff)); + + j += 8; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + i++; + } while (i < h_tmp); + + sse_s64 = vpadalq_s32(sse_s64, sse_s32[0]); + sse_s64 = vpadalq_s32(sse_s64, sse_s32[1]); + h_tmp += h_limit; + } while (i < h); + + *sum = horizontal_add_int32x4(sum_s32); + *sse = horizontal_add_int64x2(sse_s64); } -static INLINE void highbd_10_variance(const uint8_t *src8_ptr, int src_stride, - const uint8_t *ref8_ptr, int ref_stride, - int w, int h, uint32_t *sse, int *sum) { - uint64_t sse_long = 0; - int64_t sum_long = 0; - highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, - &sum_long); - *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); - *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); +static INLINE void highbd_variance_32xh_xlarge_neon( + const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, int64_t *sum) { + highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 32, h, 32, sse, + sum); } -static INLINE void highbd_12_variance(const uint8_t *src8_ptr, int src_stride, - const uint8_t *ref8_ptr, int ref_stride, - int w, int h, uint32_t *sse, int *sum) { - uint64_t sse_long = 0; - int64_t sum_long = 0; - highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, - &sum_long); - *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); - *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); +static INLINE void highbd_variance_64xh_xlarge_neon( + const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, int64_t *sum) { + highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 64, h, 16, sse, + sum); } -#define HBD_VARIANCE_WXH_NEON(W, H) \ - uint32_t vpx_highbd_8_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ - &sum); \ - return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ - } \ - \ - uint32_t vpx_highbd_10_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - int64_t var; \ - highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ - &sum); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? (uint32_t)var : 0; \ - } \ - \ - uint32_t vpx_highbd_12_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - int64_t var; \ - highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ - &sum); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? 
(uint32_t)var : 0; \ +#define HBD_VARIANCE_WXH_8_NEON(w, h) \ + uint32_t vpx_highbd_8_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)sse_long; \ + sum = (int)sum_long; \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h)); \ + } + +#define HBD_VARIANCE_WXH_10_NEON(w, h) \ + uint32_t vpx_highbd_10_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? (uint32_t)var : 0; \ } -#define HIGHBD_GET_VAR(S) \ - void vpx_highbd_8_get##S##x##S##var_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse, int *sum) { \ - highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ - sum); \ - } \ - \ - void vpx_highbd_10_get##S##x##S##var_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse, int *sum) { \ - highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ - sum); \ - } \ - \ - void vpx_highbd_12_get##S##x##S##var_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse, int *sum) { \ - highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ - sum); \ +#define HBD_VARIANCE_WXH_12_NEON(w, h) \ + uint32_t vpx_highbd_12_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ } -#define HIGHBD_MSE(W, H) \ - uint32_t vpx_highbd_8_mse##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ - &sum); \ - return *sse; \ - } \ - \ - uint32_t vpx_highbd_10_mse##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ - &sum); \ - return *sse; \ - } \ - \ - uint32_t vpx_highbd_12_mse##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ - &sum); \ - return *sse; \ +#define HBD_VARIANCE_WXH_12_XLARGE_NEON(w, h) \ + uint32_t vpx_highbd_12_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_xlarge_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +// 8-bit +HBD_VARIANCE_WXH_8_NEON(4, 4) +HBD_VARIANCE_WXH_8_NEON(4, 8) + +HBD_VARIANCE_WXH_8_NEON(8, 4) +HBD_VARIANCE_WXH_8_NEON(8, 8) +HBD_VARIANCE_WXH_8_NEON(8, 16) + +HBD_VARIANCE_WXH_8_NEON(16, 8) +HBD_VARIANCE_WXH_8_NEON(16, 16) +HBD_VARIANCE_WXH_8_NEON(16, 32) + +HBD_VARIANCE_WXH_8_NEON(32, 16) +HBD_VARIANCE_WXH_8_NEON(32, 32) +HBD_VARIANCE_WXH_8_NEON(32, 64) + +HBD_VARIANCE_WXH_8_NEON(64, 32) +HBD_VARIANCE_WXH_8_NEON(64, 64) + +// 10-bit +HBD_VARIANCE_WXH_10_NEON(4, 4) +HBD_VARIANCE_WXH_10_NEON(4, 8) + +HBD_VARIANCE_WXH_10_NEON(8, 4) +HBD_VARIANCE_WXH_10_NEON(8, 8) +HBD_VARIANCE_WXH_10_NEON(8, 16) + +HBD_VARIANCE_WXH_10_NEON(16, 8) +HBD_VARIANCE_WXH_10_NEON(16, 16) +HBD_VARIANCE_WXH_10_NEON(16, 32) + +HBD_VARIANCE_WXH_10_NEON(32, 16) +HBD_VARIANCE_WXH_10_NEON(32, 32) +HBD_VARIANCE_WXH_10_NEON(32, 64) + +HBD_VARIANCE_WXH_10_NEON(64, 32) +HBD_VARIANCE_WXH_10_NEON(64, 64) + +// 12-bit +HBD_VARIANCE_WXH_12_NEON(4, 4) +HBD_VARIANCE_WXH_12_NEON(4, 8) + +HBD_VARIANCE_WXH_12_NEON(8, 4) +HBD_VARIANCE_WXH_12_NEON(8, 8) +HBD_VARIANCE_WXH_12_NEON(8, 16) + +HBD_VARIANCE_WXH_12_NEON(16, 8) +HBD_VARIANCE_WXH_12_NEON(16, 16) +HBD_VARIANCE_WXH_12_NEON(16, 32) + +HBD_VARIANCE_WXH_12_NEON(32, 16) +HBD_VARIANCE_WXH_12_NEON(32, 32) +HBD_VARIANCE_WXH_12_XLARGE_NEON(32, 64) + +HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 32) +HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 64) + +#define HIGHBD_GET_VAR(S) \ + void vpx_highbd_8_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)sse_long; \ + *sum = (int)sum_long; \ + } \ + \ + void vpx_highbd_10_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const 
uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \ + } \ + \ + void vpx_highbd_12_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ } -HBD_VARIANCE_WXH_NEON(64, 64) -HBD_VARIANCE_WXH_NEON(64, 32) -HBD_VARIANCE_WXH_NEON(32, 64) -HBD_VARIANCE_WXH_NEON(32, 32) -HBD_VARIANCE_WXH_NEON(32, 16) -HBD_VARIANCE_WXH_NEON(16, 32) -HBD_VARIANCE_WXH_NEON(16, 16) -HBD_VARIANCE_WXH_NEON(16, 8) -HBD_VARIANCE_WXH_NEON(8, 16) -HBD_VARIANCE_WXH_NEON(8, 8) -HBD_VARIANCE_WXH_NEON(8, 4) -HBD_VARIANCE_WXH_NEON(4, 8) -HBD_VARIANCE_WXH_NEON(4, 4) +#define HIGHBD_MSE(w, h) \ + uint32_t vpx_highbd_8_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)sse_long; \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_10_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_12_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + return *sse; \ + } HIGHBD_GET_VAR(8) HIGHBD_GET_VAR(16) diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index 21560837ae..47748a8061 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -110,4 +110,21 @@ static INLINE uint32x4_t horizontal_add_4d_uint32x4(const uint32x4_t sum[4]) { #endif } +static INLINE uint64_t horizontal_long_add_uint32x4(const uint32x4_t a) { +#if defined(__aarch64__) + return vaddlvq_u32(a); +#else + const uint64x2_t b = vpaddlq_u32(a); + return vgetq_lane_u64(b, 0) + vgetq_lane_u64(b, 1); +#endif +} + +static INLINE uint64_t horizontal_add_int64x2(const int64x2_t a) { +#if defined(__aarch64__) + return vaddvq_s64(a); +#else + return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1); +#endif +} + #endif // 
VPX_VPX_DSP_ARM_SUM_NEON_H_ From 46add73f7e60799fab383c5dcbad0b953eee0c7a Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 22 Feb 2023 11:34:30 -0800 Subject: [PATCH 543/926] vp9_block.h: rename diff struct to Diff This matches the style guide and fixes some -Wshadow warnings related to variables with the same name. Something similar was done in libaom in: 863b04994b Fix warnings reported by -Wshadow: Part2: av1 directory Bug: webm:1793 Change-Id: I4df1bbc8d079a3174d75f0d35d54c200ffdbb677 --- vp9/encoder/vp9_block.h | 2 +- vp9/encoder/vp9_encodeframe.c | 14 +++++++------- vp9/encoder/vp9_encoder.c | 3 ++- vp9/encoder/vp9_encoder.h | 2 +- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 20294b4b94..1786952911 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -24,7 +24,7 @@ typedef struct { unsigned int sse; int sum; unsigned int var; -} diff; +} Diff; struct macroblock_plane { DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]); diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index a522097e61..5b811016de 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -2432,16 +2432,16 @@ static void set_source_var_based_partition(VP9_COMP *cpi, (row8x8_remaining >= MI_BLOCK_SIZE)) { int i, j; int index; - diff d32[4]; + Diff d32[4]; const int offset = (mi_row >> 1) * cm->mb_cols + (mi_col >> 1); int is_larger_better = 0; int use32x32 = 0; unsigned int thr = cpi->source_var_thresh; - memset(d32, 0, 4 * sizeof(diff)); + memset(d32, 0, sizeof(d32)); for (i = 0; i < 4; i++) { - diff *d16[4]; + Diff *d16[4]; for (j = 0; j < 4; j++) { int b_mi_row = coord_lookup[i * 4 + j].row; @@ -5681,12 +5681,12 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, } // end RTC play code -static INLINE uint32_t variance(const diff *const d) { +static INLINE uint32_t variance(const Diff *const d) { return d->sse - (uint32_t)(((int64_t)d->sum * d->sum) >> 8); } #if CONFIG_VP9_HIGHBITDEPTH -static INLINE uint32_t variance_highbd(diff *const d) { +static INLINE uint32_t variance_highbd(Diff *const d) { const int64_t var = (int64_t)d->sse - (((int64_t)d->sum * d->sum) >> 8); return (var >= 0) ? (uint32_t)var : 0; } @@ -5706,7 +5706,7 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) { ? (cm->MBs * VAR_HIST_LARGE_CUT_OFF / 100) : (cm->MBs * VAR_HIST_SMALL_CUT_OFF / 100); DECLARE_ALIGNED(16, int, hist[VAR_HIST_BINS]); - diff *var16 = cpi->source_diff_var; + Diff *var16 = cpi->source_diff_var; int sum = 0; int i, j; @@ -5790,7 +5790,7 @@ static void source_var_based_partition_search_method(VP9_COMP *cpi) { if (cpi->source_diff_var) vpx_free(cpi->source_diff_var); CHECK_MEM_ERROR(cm, cpi->source_diff_var, - vpx_calloc(cm->MBs, sizeof(diff))); + vpx_calloc(cm->MBs, sizeof(*cpi->source_diff_var))); } if (!cpi->frames_till_next_var_check) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 22fbb899fd..4cec02eb93 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2546,7 +2546,8 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, for (i = 0; i < MAX_ARF_GOP_SIZE; ++i) cpi->tpl_stats[i].tpl_stats_ptr = NULL; // Allocate memory to store variances for a frame.
- CHECK_MEM_ERROR(cm, cpi->source_diff_var, vpx_calloc(cm->MBs, sizeof(diff))); + CHECK_MEM_ERROR(cm, cpi->source_diff_var, + vpx_calloc(cm->MBs, sizeof(*cpi->source_diff_var))); cpi->source_var_thresh = 0; cpi->frames_till_next_var_check = 0; #define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \ diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 79c0b36a17..77de5c8754 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -920,7 +920,7 @@ typedef struct VP9_COMP { SVC svc; // Store frame variance info in SOURCE_VAR_BASED_PARTITION search type. - diff *source_diff_var; + Diff *source_diff_var; // The threshold used in SOURCE_VAR_BASED_PARTITION search type. unsigned int source_var_thresh; int frames_till_next_var_check; From 3712a5869caa05f2ff8e9087cf7583f349d4e23e Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 22 Feb 2023 12:54:21 -0800 Subject: [PATCH 544/926] vpx_subpixel_8t_intrin_avx2: clear -Wshadow warnings no changes to assembly Bug: webm:1793 Change-Id: I6a82290cafee7f4a7909d497ccfdefd5a78fb8ed --- vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 31 ++++++++++------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index c7d880860e..841db7cd71 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -534,9 +534,6 @@ static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr, const ptrdiff_t unrolled_dst_stride = dst_stride << 1; int h; - __m256i src_reg, src_reg_shift_0, src_reg_shift_2; - __m256i dst_reg; - __m256i tmp_0, tmp_1; __m256i idx_shift_0 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); @@ -557,9 +554,11 @@ static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr, for (h = height; h >= 2; h -= 2) { // Load the source - src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); - src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); - src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + const __m256i src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); + __m256i dst_reg; + __m256i tmp_0, tmp_1; + const __m256i src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + const __m256i src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); // Get the output tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23); @@ -580,9 +579,9 @@ static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr, // Repeat for the last row if needed if (h > 0) { - __m128i src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + const __m128i src_reg = _mm_loadu_si128((const __m128i *)src_ptr); __m128i dst_reg; - const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + const __m128i reg_32_128 = _mm_set1_epi16(32); // Used for rounding __m128i tmp_0, tmp_1; __m128i src_reg_shift_0 = @@ -596,7 +595,7 @@ static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr, _mm256_castsi256_si128(kernel_reg_45)); dst_reg = _mm_adds_epi16(tmp_0, tmp_1); - dst_reg = mm_round_epi16_sse2(&dst_reg, &reg_32, 6); + dst_reg = mm_round_epi16_sse2(&dst_reg, &reg_32_128, 6); dst_reg = _mm_packus_epi16(dst_reg, _mm_setzero_si128()); @@ -715,8 +714,6 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, const ptrdiff_t unrolled_src_stride = src_stride << 1; const ptrdiff_t unrolled_dst_stride = dst_stride << 1; - __m256i src_reg, src_reg_shuf; - __m256i dst; __m256i shuf_idx =
_mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6); @@ -733,12 +730,12 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, for (h = height; h > 1; h -= 2) { // Load the source - src_reg = mm256_loadu2_epi64((const __m128i *)src_ptr, - (const __m128i *)(src_ptr + src_stride)); - src_reg_shuf = _mm256_shuffle_epi8(src_reg, shuf_idx); + const __m256i src_reg = mm256_loadu2_epi64( + (const __m128i *)src_ptr, (const __m128i *)(src_ptr + src_stride)); + const __m256i src_reg_shuf = _mm256_shuffle_epi8(src_reg, shuf_idx); // Get the result - dst = _mm256_maddubs_epi16(src_reg_shuf, kernel_reg); + __m256i dst = _mm256_maddubs_epi16(src_reg_shuf, kernel_reg); dst = _mm256_hadds_epi16(dst, _mm256_setzero_si256()); // Round result @@ -757,7 +754,7 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, if (h > 0) { // Load the source - const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + const __m128i reg_32_128 = _mm_set1_epi16(32); // Used for rounding __m128i src_reg = _mm_loadl_epi64((const __m128i *)src_ptr); __m128i src_reg_shuf = _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(shuf_idx)); @@ -768,7 +765,7 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, dst = _mm_hadds_epi16(dst, _mm_setzero_si128()); // Round result - dst = mm_round_epi16_sse2(&dst, &reg_32, 6); + dst = mm_round_epi16_sse2(&dst, &reg_32_128, 6); // Pack to 8-bits dst = _mm_packus_epi16(dst, _mm_setzero_si128()); From 4ba3be9324497d33d51d3b8a8ea900505e6f1450 Mon Sep 17 00:00:00 2001 From: chiyotsai Date: Wed, 22 Feb 2023 12:44:47 -0800 Subject: [PATCH 545/926] Disable some intra modes for TX_32X32 Performance: | SPD_SET | TESTSET | AVG_PSNR | OVR_PSNR | SSIM | ENC_T | |---------|---------|----------|----------|---------|-------| | 0 | hdres2 | +0.036% | +0.032% | +0.014% | -3.9% | | 0 | lowres2 | -0.002% | -0.011% | +0.020% | -3.6% | | 0 | midres2 | +0.045% | +0.025% | -0.007% | -4.0% | STATS_CHANGED Change-Id: I75a927333d26f2a37f0dda57a641b455b845f5b9 --- vp9/encoder/vp9_speed_features.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 72ac0cebb8..ce83a97626 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -228,6 +228,8 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->tx_size_search_breakout = 1; sf->use_square_partition_only = !boosted; + sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; + // Reference masking is not supported in dynamic scaling mode.
sf->reference_masking = oxcf->resize_mode != RESIZE_DYNAMIC; @@ -281,7 +283,6 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, if (cpi->oxcf.content != VP9E_CONTENT_FILM) sf->mode_skip_start = 10; sf->allow_acl = 0; - sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; if (cpi->oxcf.content != VP9E_CONTENT_FILM) { sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; From f569a4d68c703c0ded5ec71ef20e12aeeb58de1f Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 22 Feb 2023 13:21:27 -0800 Subject: [PATCH 546/926] vp9_adapt_mode_probs: clear -Wshadow warning Bug: webm:1793 Change-Id: Ie4ea8f0a3295e6f58dc6f7d5c61d46700c539d40 --- vp9/common/vp9_entropymode.c | 1 - 1 file changed, 1 deletion(-) diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c index bda824de3c..9289fc9e1f 100644 --- a/vp9/common/vp9_entropymode.c +++ b/vp9/common/vp9_entropymode.c @@ -381,7 +381,6 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { } if (cm->tx_mode == TX_MODE_SELECT) { - int j; unsigned int branch_ct_8x8p[TX_SIZES - 3][2]; unsigned int branch_ct_16x16p[TX_SIZES - 2][2]; unsigned int branch_ct_32x32p[TX_SIZES - 1][2]; From 76389886ee6bf90e8fe464547eb26bb2311ae698 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 22 Feb 2023 13:25:29 -0800 Subject: [PATCH 547/926] vp9_loop_filter_alloc: clear -Wshadow warnings Bug: webm:1793 Change-Id: Ia64d175aa69dc2ecde2babf64bde04f02b32795b --- vp9/common/vp9_thread_common.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/vp9/common/vp9_thread_common.c b/vp9/common/vp9_thread_common.c index b3d50162b2..ad4478179e 100644 --- a/vp9/common/vp9_thread_common.c +++ b/vp9/common/vp9_thread_common.c @@ -306,7 +306,6 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, CHECK_MEM_ERROR(cm, lf_sync->recon_done_mutex, vpx_malloc(sizeof(*lf_sync->recon_done_mutex) * rows)); if (lf_sync->recon_done_mutex) { - int i; for (i = 0; i < rows; ++i) { pthread_mutex_init(&lf_sync->recon_done_mutex[i], NULL); } @@ -315,7 +314,6 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, CHECK_MEM_ERROR(cm, lf_sync->recon_done_cond, vpx_malloc(sizeof(*lf_sync->recon_done_cond) * rows)); if (lf_sync->recon_done_cond) { - int i; for (i = 0; i < rows; ++i) { pthread_cond_init(&lf_sync->recon_done_cond[i], NULL); } From aab93ee6b62fd3ab489784062ea6f825f2b871da Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Tue, 21 Feb 2023 17:40:20 +0000 Subject: [PATCH 548/926] Add Neon implementation of high bitdepth 8x8 hadamard transform Add Neon implementation of vpx_highbd_hadamard_8x8 as well as the corresponding tests. 
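As a reading aid (not part of the change): the add/sub tree in
hadamard_highbd_col8_first_pass() below is the standard 8-point Hadamard
butterfly. A scalar sketch with the same output ordering, where the
in/out names are illustrative:

  /* One 8-point Hadamard column. With 12-bit residuals the first pass
   * peaks at 4095 * 8 = 32760, so int16_t lanes are sufficient; the
   * second pass must widen to 32 bits. */
  static void hadamard_col8(const int16_t *in, int stride, int16_t *out) {
    int16_t b0 = in[0 * stride] + in[1 * stride];
    int16_t b1 = in[0 * stride] - in[1 * stride];
    int16_t b2 = in[2 * stride] + in[3 * stride];
    int16_t b3 = in[2 * stride] - in[3 * stride];
    int16_t b4 = in[4 * stride] + in[5 * stride];
    int16_t b5 = in[4 * stride] - in[5 * stride];
    int16_t b6 = in[6 * stride] + in[7 * stride];
    int16_t b7 = in[6 * stride] - in[7 * stride];

    int16_t c0 = b0 + b2, c2 = b0 - b2, c1 = b1 + b3, c3 = b1 - b3;
    int16_t c4 = b4 + b6, c6 = b4 - b6, c5 = b5 + b7, c7 = b5 - b7;

    out[0] = c0 + c4;
    out[7] = c1 + c5;
    out[3] = c2 + c6;
    out[4] = c3 + c7;
    out[2] = c0 - c4;
    out[6] = c1 - c5;
    out[1] = c2 - c6;
    out[5] = c3 - c7;
  }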
Change-Id: I3ef1ff199d76b6b010591ef15a81b0f36c9ded03
---
 test/hadamard_test.cc              |   6 ++
 vpx_dsp/arm/highbd_hadamard_neon.c | 137 +++++++++++++++++++++++++++++
 vpx_dsp/arm/mem_neon.h             |   6 ++
 vpx_dsp/vpx_dsp.mk                 |   3 +
 vpx_dsp/vpx_dsp_rtcd_defs.pl       |   2 +-
 5 files changed, 153 insertions(+), 1 deletion(-)
 create mode 100644 vpx_dsp/arm/highbd_hadamard_neon.c

diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
index f904e814ad..2062cbe340 100644
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -324,5 +324,11 @@ INSTANTIATE_TEST_SUITE_P(
                                          32)));
 #endif  // HAVE_AVX2

+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, HadamardHighbdTest,
+    ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_neon, 8)));
+#endif
+
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace
diff --git a/vpx_dsp/arm/highbd_hadamard_neon.c b/vpx_dsp/arm/highbd_hadamard_neon.c
new file mode 100644
index 0000000000..615de4b0ce
--- /dev/null
+++ b/vpx_dsp/arm/highbd_hadamard_neon.c
@@ -0,0 +1,137 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+
+static INLINE void hadamard_highbd_col8_first_pass(int16x8_t *a0, int16x8_t *a1,
+                                                   int16x8_t *a2, int16x8_t *a3,
+                                                   int16x8_t *a4, int16x8_t *a5,
+                                                   int16x8_t *a6,
+                                                   int16x8_t *a7) {
+  int16x8_t b0 = vaddq_s16(*a0, *a1);
+  int16x8_t b1 = vsubq_s16(*a0, *a1);
+  int16x8_t b2 = vaddq_s16(*a2, *a3);
+  int16x8_t b3 = vsubq_s16(*a2, *a3);
+  int16x8_t b4 = vaddq_s16(*a4, *a5);
+  int16x8_t b5 = vsubq_s16(*a4, *a5);
+  int16x8_t b6 = vaddq_s16(*a6, *a7);
+  int16x8_t b7 = vsubq_s16(*a6, *a7);
+
+  int16x8_t c0 = vaddq_s16(b0, b2);
+  int16x8_t c2 = vsubq_s16(b0, b2);
+  int16x8_t c1 = vaddq_s16(b1, b3);
+  int16x8_t c3 = vsubq_s16(b1, b3);
+  int16x8_t c4 = vaddq_s16(b4, b6);
+  int16x8_t c6 = vsubq_s16(b4, b6);
+  int16x8_t c5 = vaddq_s16(b5, b7);
+  int16x8_t c7 = vsubq_s16(b5, b7);
+
+  *a0 = vaddq_s16(c0, c4);
+  *a2 = vsubq_s16(c0, c4);
+  *a7 = vaddq_s16(c1, c5);
+  *a6 = vsubq_s16(c1, c5);
+  *a3 = vaddq_s16(c2, c6);
+  *a1 = vsubq_s16(c2, c6);
+  *a4 = vaddq_s16(c3, c7);
+  *a5 = vsubq_s16(c3, c7);
+}
+
+static INLINE void hadamard_highbd_col4_second_pass(int16x4_t a0, int16x4_t a1,
+                                                    int16x4_t a2, int16x4_t a3,
+                                                    int16x4_t a4, int16x4_t a5,
+                                                    int16x4_t a6, int16x4_t a7,
+                                                    tran_low_t *coeff) {
+  int32x4_t b0 = vaddl_s16(a0, a1);
+  int32x4_t b1 = vsubl_s16(a0, a1);
+  int32x4_t b2 = vaddl_s16(a2, a3);
+  int32x4_t b3 = vsubl_s16(a2, a3);
+  int32x4_t b4 = vaddl_s16(a4, a5);
+  int32x4_t b5 = vsubl_s16(a4, a5);
+  int32x4_t b6 = vaddl_s16(a6, a7);
+  int32x4_t b7 = vsubl_s16(a6, a7);
+
+  int32x4_t c0 = vaddq_s32(b0, b2);
+  int32x4_t c2 = vsubq_s32(b0, b2);
+  int32x4_t c1 = vaddq_s32(b1, b3);
+  int32x4_t c3 = vsubq_s32(b1, b3);
+  int32x4_t c4 = vaddq_s32(b4, b6);
+  int32x4_t c6 = vsubq_s32(b4, b6);
+  int32x4_t c5 = vaddq_s32(b5, b7);
+  int32x4_t c7 = vsubq_s32(b5, b7);
+
+  int32x4_t d0 = vaddq_s32(c0, c4);
+  int32x4_t d2 = vsubq_s32(c0, c4);
+  int32x4_t d7 = vaddq_s32(c1, c5);
+  int32x4_t d6 = vsubq_s32(c1, c5);
+  int32x4_t d3 = vaddq_s32(c2, c6);
+  int32x4_t d1 = vsubq_s32(c2, c6);
+
int32x4_t d4 = vaddq_s32(c3, c7); + int32x4_t d5 = vsubq_s32(c3, c7); + + store_s32q_to_tran_low(coeff + 0, d0); + store_s32q_to_tran_low(coeff + 4, d1); + store_s32q_to_tran_low(coeff + 8, d2); + store_s32q_to_tran_low(coeff + 12, d3); + store_s32q_to_tran_low(coeff + 16, d4); + store_s32q_to_tran_low(coeff + 20, d5); + store_s32q_to_tran_low(coeff + 24, d6); + store_s32q_to_tran_low(coeff + 28, d7); +} + +void vpx_highbd_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int16x4_t b0, b1, b2, b3, b4, b5, b6, b7; + + int16x8_t s0 = vld1q_s16(src_diff + 0 * src_stride); + int16x8_t s1 = vld1q_s16(src_diff + 1 * src_stride); + int16x8_t s2 = vld1q_s16(src_diff + 2 * src_stride); + int16x8_t s3 = vld1q_s16(src_diff + 3 * src_stride); + int16x8_t s4 = vld1q_s16(src_diff + 4 * src_stride); + int16x8_t s5 = vld1q_s16(src_diff + 5 * src_stride); + int16x8_t s6 = vld1q_s16(src_diff + 6 * src_stride); + int16x8_t s7 = vld1q_s16(src_diff + 7 * src_stride); + + // For the first pass we can stay in 16-bit elements (4095*8 = 32760). + hadamard_highbd_col8_first_pass(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + // For the second pass we need to widen to 32-bit elements, so we're + // processing 4 columns at a time. + // Skip the second transpose because it is not required. + + b0 = vget_low_s16(s0); + b1 = vget_low_s16(s1); + b2 = vget_low_s16(s2); + b3 = vget_low_s16(s3); + b4 = vget_low_s16(s4); + b5 = vget_low_s16(s5); + b6 = vget_low_s16(s6); + b7 = vget_low_s16(s7); + + hadamard_highbd_col4_second_pass(b0, b1, b2, b3, b4, b5, b6, b7, coeff); + + b0 = vget_high_s16(s0); + b1 = vget_high_s16(s1); + b2 = vget_high_s16(s2); + b3 = vget_high_s16(s3); + b4 = vget_high_s16(s4); + b5 = vget_high_s16(s5); + b6 = vget_high_s16(s6); + b7 = vget_high_s16(s7); + + hadamard_highbd_col4_second_pass(b0, b1, b2, b3, b4, b5, b6, b7, coeff + 32); +} diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index 866be7439e..b7a363891e 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -102,6 +102,12 @@ static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) { #endif } +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void store_s32q_to_tran_low(tran_low_t *buf, const int32x4_t a) { + vst1q_s32(buf, a); +} +#endif + // Propagate type information to the compiler. Without this the compiler may // assume the required alignment of uint32_t (4 bytes) and add alignment hints // to the memory access. 
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 5535f82c07..ab8e5bd817 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -342,6 +342,9 @@ DSP_SRCS-$(HAVE_SSE2) += x86/avg_intrin_sse2.c
 DSP_SRCS-$(HAVE_AVX2) += x86/avg_intrin_avx2.c
 DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_hadamard_neon.c
+endif
 DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c
 DSP_SRCS-$(HAVE_LSX) += loongarch/avg_lsx.c
 ifeq ($(VPX_ARCH_X86_64),yes)
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 8725821b67..dc3cdc4145 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -802,7 +802,7 @@ ()
   specialize qw/vpx_hadamard_32x32 sse2 avx2 neon/;

   add_proto qw/void vpx_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
-  specialize qw/vpx_highbd_hadamard_8x8 avx2/;
+  specialize qw/vpx_highbd_hadamard_8x8 avx2 neon/;

   add_proto qw/void vpx_highbd_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
   specialize qw/vpx_highbd_hadamard_16x16 avx2/;

From 221d76ab9ca3a8e139a92386c814f71fd172d197 Mon Sep 17 00:00:00 2001
From: Jerome Jiang
Date: Thu, 23 Feb 2023 14:28:30 -0500
Subject: [PATCH 549/926] vp9 rc test: change param type to bool

Change-Id: Ib45522e32d9137678da9062830044e9dd87537e5
---
 test/vp9_ratectrl_rtc_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc
index 578ad26fcf..c6ab5b034f 100644
--- a/test/vp9_ratectrl_rtc_test.cc
+++ b/test/vp9_ratectrl_rtc_test.cc
@@ -164,7 +164,7 @@ class RcInterfaceTest

 class RcInterfaceSvcTest
     : public ::libvpx_test::EncoderTest,
-      public ::libvpx_test::CodecTestWith2Params<int, int> {
+      public ::libvpx_test::CodecTestWith2Params<int, bool> {
 public:
  RcInterfaceSvcTest()
      : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000),

From 6ec45f933c6c4de3fcd9344852bde25d30613321 Mon Sep 17 00:00:00 2001
From: Salome Thirot
Date: Wed, 22 Feb 2023 17:27:56 +0000
Subject: [PATCH 550/926] Add Neon implementation of high bitdepth 16x16
 hadamard transform

Add Neon implementation of vpx_highbd_hadamard_16x16 as well as the
corresponding tests.
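For context: vhaddq_s32/vhsubq_s32 compute (a +/- b) >> 1, so the
normalizing shift of the combining butterfly is folded into the
add/subtract. A scalar sketch of that combining pass, assuming the four
8x8 quadrant results sit at coeff + 0/64/128/192 as in the code below:

  /* Combine the four 8x8 Hadamard results into the 16x16 transform. */
  static void combine_16x16(tran_low_t *coeff) {
    int i;
    for (i = 0; i < 64; ++i) {
      int32_t a0 = coeff[0], a1 = coeff[64], a2 = coeff[128], a3 = coeff[192];

      int32_t b0 = (a0 + a1) >> 1;  /* vhaddq_s32 */
      int32_t b1 = (a0 - a1) >> 1;  /* vhsubq_s32 */
      int32_t b2 = (a2 + a3) >> 1;
      int32_t b3 = (a2 - a3) >> 1;

      coeff[0] = b0 + b2;
      coeff[64] = b1 + b3;
      coeff[128] = b0 - b2;
      coeff[192] = b1 - b3;
      ++coeff;
    }
  }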
Change-Id: If3299fe556351dfe3db994ac171d83a95ea1504b --- test/hadamard_test.cc | 4 ++- vpx_dsp/arm/highbd_hadamard_neon.c | 39 ++++++++++++++++++++++++++++++ vpx_dsp/arm/mem_neon.h | 4 +++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- 4 files changed, 47 insertions(+), 2 deletions(-) diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc index 2062cbe340..2482e87cb6 100644 --- a/test/hadamard_test.cc +++ b/test/hadamard_test.cc @@ -327,7 +327,9 @@ INSTANTIATE_TEST_SUITE_P( #if HAVE_NEON INSTANTIATE_TEST_SUITE_P( NEON, HadamardHighbdTest, - ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_neon, 8))); + ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_neon, 8), + HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_neon, + 16))); #endif #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/arm/highbd_hadamard_neon.c b/vpx_dsp/arm/highbd_hadamard_neon.c index 615de4b0ce..013f7148f4 100644 --- a/vpx_dsp/arm/highbd_hadamard_neon.c +++ b/vpx_dsp/arm/highbd_hadamard_neon.c @@ -135,3 +135,42 @@ void vpx_highbd_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, hadamard_highbd_col4_second_pass(b0, b1, b2, b3, b4, b5, b6, b7, coeff + 32); } + +void vpx_highbd_hadamard_16x16_neon(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + int i = 0; + + // Rearrange 16x16 to 8x32 and remove stride. + // Top left first. + vpx_highbd_hadamard_8x8_neon(src_diff, src_stride, coeff); + // Top right. + vpx_highbd_hadamard_8x8_neon(src_diff + 8, src_stride, coeff + 64); + // Bottom left. + vpx_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride, src_stride, + coeff + 128); + // Bottom right. + vpx_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride + 8, src_stride, + coeff + 192); + + do { + int32x4_t a0 = load_tran_low_to_s32q(coeff + 4 * i); + int32x4_t a1 = load_tran_low_to_s32q(coeff + 4 * i + 64); + int32x4_t a2 = load_tran_low_to_s32q(coeff + 4 * i + 128); + int32x4_t a3 = load_tran_low_to_s32q(coeff + 4 * i + 192); + + int32x4_t b0 = vhaddq_s32(a0, a1); + int32x4_t b1 = vhsubq_s32(a0, a1); + int32x4_t b2 = vhaddq_s32(a2, a3); + int32x4_t b3 = vhsubq_s32(a2, a3); + + int32x4_t c0 = vaddq_s32(b0, b2); + int32x4_t c1 = vaddq_s32(b1, b3); + int32x4_t c2 = vsubq_s32(b0, b2); + int32x4_t c3 = vsubq_s32(b1, b3); + + store_s32q_to_tran_low(coeff + 4 * i, c0); + store_s32q_to_tran_low(coeff + 4 * i + 64, c1); + store_s32q_to_tran_low(coeff + 4 * i + 128, c2); + store_s32q_to_tran_low(coeff + 4 * i + 192, c3); + } while (++i < 16); +} diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index b7a363891e..2122956dc6 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -106,6 +106,10 @@ static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) { static INLINE void store_s32q_to_tran_low(tran_low_t *buf, const int32x4_t a) { vst1q_s32(buf, a); } + +static INLINE int32x4_t load_tran_low_to_s32q(const tran_low_t *buf) { + return vld1q_s32(buf); +} #endif // Propagate type information to the compiler. 
Without this the compiler may diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index dc3cdc4145..276d55baff 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -805,7 +805,7 @@ () specialize qw/vpx_highbd_hadamard_8x8 avx2 neon/; add_proto qw/void vpx_highbd_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; - specialize qw/vpx_highbd_hadamard_16x16 avx2/; + specialize qw/vpx_highbd_hadamard_16x16 avx2 neon/; add_proto qw/void vpx_highbd_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; specialize qw/vpx_highbd_hadamard_32x32 avx2/; From 111068923b4ca778a680330d00161d7ee93f61e1 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Thu, 23 Feb 2023 12:05:30 +0000 Subject: [PATCH 551/926] Add Neon implementation of high bitdepth 32x32 hadamard transform Add Neon implementation of vpx_highbd_hadamard_32x32 as well as the corresponding tests. Change-Id: I65d8603896649de1996b353aa79eee54824b4708 --- test/hadamard_test.cc | 5 ++-- vpx_dsp/arm/highbd_hadamard_neon.c | 39 ++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc index 2482e87cb6..9f6c99f3c4 100644 --- a/test/hadamard_test.cc +++ b/test/hadamard_test.cc @@ -328,8 +328,9 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P( NEON, HadamardHighbdTest, ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_neon, 8), - HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_neon, - 16))); + HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_neon, 16), + HadamardFuncWithSize(&vpx_highbd_hadamard_32x32_neon, + 32))); #endif #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/arm/highbd_hadamard_neon.c b/vpx_dsp/arm/highbd_hadamard_neon.c index 013f7148f4..499eb65462 100644 --- a/vpx_dsp/arm/highbd_hadamard_neon.c +++ b/vpx_dsp/arm/highbd_hadamard_neon.c @@ -174,3 +174,42 @@ void vpx_highbd_hadamard_16x16_neon(const int16_t *src_diff, store_s32q_to_tran_low(coeff + 4 * i + 192, c3); } while (++i < 16); } + +void vpx_highbd_hadamard_32x32_neon(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + int i = 0; + + // Rearrange 32x32 to 16x64 and remove stride. + // Top left first. + vpx_highbd_hadamard_16x16_neon(src_diff, src_stride, coeff); + // Top right. + vpx_highbd_hadamard_16x16_neon(src_diff + 16, src_stride, coeff + 256); + // Bottom left. + vpx_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride, src_stride, + coeff + 512); + // Bottom right. 
+ vpx_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride + 16, src_stride, + coeff + 768); + + do { + int32x4_t a0 = load_tran_low_to_s32q(coeff + 4 * i); + int32x4_t a1 = load_tran_low_to_s32q(coeff + 4 * i + 256); + int32x4_t a2 = load_tran_low_to_s32q(coeff + 4 * i + 512); + int32x4_t a3 = load_tran_low_to_s32q(coeff + 4 * i + 768); + + int32x4_t b0 = vhaddq_s32(a0, a1); + int32x4_t b1 = vhsubq_s32(a0, a1); + int32x4_t b2 = vhaddq_s32(a2, a3); + int32x4_t b3 = vhsubq_s32(a2, a3); + + int32x4_t c0 = vhaddq_s32(b0, b2); + int32x4_t c1 = vhaddq_s32(b1, b3); + int32x4_t c2 = vhsubq_s32(b0, b2); + int32x4_t c3 = vhsubq_s32(b1, b3); + + store_s32q_to_tran_low(coeff + 4 * i, c0); + store_s32q_to_tran_low(coeff + 4 * i + 256, c1); + store_s32q_to_tran_low(coeff + 4 * i + 512, c2); + store_s32q_to_tran_low(coeff + 4 * i + 768, c3); + } while (++i < 64); +} diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 276d55baff..eef72249e0 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -808,7 +808,7 @@ () specialize qw/vpx_highbd_hadamard_16x16 avx2 neon/; add_proto qw/void vpx_highbd_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; - specialize qw/vpx_highbd_hadamard_32x32 avx2/; + specialize qw/vpx_highbd_hadamard_32x32 avx2 neon/; add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length"; specialize qw/vpx_satd avx2 sse2 neon/; From 5b2d3d5e4242f63e0f3cb673dac245b739c4423d Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 24 Feb 2023 19:25:39 -0800 Subject: [PATCH 552/926] tools_common,VpxInterface: fix interface fn ptr proto Use (void) to indicate an empty parameter list and match the declaration of vpx_codec_vp[89]_[cd]x. This fixes a cfi sanitizer error. Change-Id: I190f432eea4d1765afffd84c7458ec44d863f90c --- tools_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools_common.h b/tools_common.h index b9cfb9cc85..3a266416e3 100644 --- a/tools_common.h +++ b/tools_common.h @@ -147,7 +147,7 @@ int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame); typedef struct VpxInterface { const char *const name; const uint32_t fourcc; - vpx_codec_iface_t *(*const codec_interface)(); + vpx_codec_iface_t *(*const codec_interface)(void); } VpxInterface; int get_vpx_encoder_count(void); From b25cca8c2edba5fbc18448007da2624a25113f4d Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Sat, 25 Feb 2023 00:43:46 +0000 Subject: [PATCH 553/926] Optimize transpose_neon.h helper functions 1) Use vtrn[12]q_[su]64 in vpx_vtrnq_[su]64* helpers on AArch64 targets. This produces half as many TRN1/2 instructions compared to the number of MOVs that result from vcombine. 2) Use vpx_vtrnq_[su]64* helpers wherever applicable. 3) Refactor transpose_4x8_s16 to operate on 128-bit vectors. 
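For 64-bit lanes, TRN1/TRN2 simply select the even/odd lanes of the two
inputs (lanes listed low to high):

  vtrn1q_s64({a0, a1}, {b0, b1}) = {a0, b0}
  vtrn2q_s64({a0, a1}, {b0, b1}) = {a1, b1}

so each output vector costs one TRN instead of the pair of MOVs that
each vget_low/vget_high + vcombine sequence tends to compile to.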
Change-Id: I9a8b1c1fe2a98a429e0c5f39def5eb2f65759127 --- vpx_dsp/arm/transpose_neon.h | 108 +++++++++++++++++------------------ 1 file changed, 53 insertions(+), 55 deletions(-) diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h index 48292c6936..518278f303 100644 --- a/vpx_dsp/arm/transpose_neon.h +++ b/vpx_dsp/arm/transpose_neon.h @@ -39,26 +39,45 @@ static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) { int32x4x2_t b0; +#if defined(__aarch64__) + b0.val[0] = vreinterpretq_s32_s64( + vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); + b0.val[1] = vreinterpretq_s32_s64( + vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); +#else b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1)); b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1)); +#endif return b0; } static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) { int64x2x2_t b0; +#if defined(__aarch64__) + b0.val[0] = vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)); + b0.val[1] = vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)); +#else b0.val[0] = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(a0)), vreinterpret_s64_s32(vget_low_s32(a1))); b0.val[1] = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(a0)), vreinterpret_s64_s32(vget_high_s32(a1))); +#endif return b0; } static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) { uint8x16x2_t b0; +#if defined(__aarch64__) + b0.val[0] = vreinterpretq_u8_u64( + vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); + b0.val[1] = vreinterpretq_u8_u64( + vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); +#else b0.val[0] = vcombine_u8(vreinterpret_u8_u32(vget_low_u32(a0)), vreinterpret_u8_u32(vget_low_u32(a1))); b0.val[1] = vcombine_u8(vreinterpret_u8_u32(vget_high_u32(a0)), vreinterpret_u8_u32(vget_high_u32(a1))); +#endif return b0; } @@ -155,17 +174,13 @@ static INLINE void transpose_s16_4x4q(int16x8_t *a0, int16x8_t *a1) { // c0: 00 01 20 21 02 03 22 23 // c1: 10 11 30 31 12 13 32 33 - const int32x4_t c0 = - vcombine_s32(vget_low_s32(b0.val[0]), vget_low_s32(b0.val[1])); - const int32x4_t c1 = - vcombine_s32(vget_high_s32(b0.val[0]), vget_high_s32(b0.val[1])); + const int16x8x2_t c0 = vpx_vtrnq_s64_to_s16(b0.val[0], b0.val[1]); // Swap 16 bit elements resulting in: // d0.val[0]: 00 10 20 30 02 12 22 32 // d0.val[1]: 01 11 21 31 03 13 23 33 - const int16x8x2_t d0 = - vtrnq_s16(vreinterpretq_s16_s32(c0), vreinterpretq_s16_s32(c1)); + const int16x8x2_t d0 = vtrnq_s16(c0.val[0], c0.val[1]); *a0 = d0.val[0]; *a1 = d0.val[1]; @@ -186,17 +201,13 @@ static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) { // c0: 00 01 20 21 02 03 22 23 // c1: 10 11 30 31 12 13 32 33 - const uint32x4_t c0 = - vcombine_u32(vget_low_u32(b0.val[0]), vget_low_u32(b0.val[1])); - const uint32x4_t c1 = - vcombine_u32(vget_high_u32(b0.val[0]), vget_high_u32(b0.val[1])); + const uint16x8x2_t c0 = vpx_vtrnq_u64_to_u16(b0.val[0], b0.val[1]); // Swap 16 bit elements resulting in: // d0.val[0]: 00 10 20 30 02 12 22 32 // d0.val[1]: 01 11 21 31 03 13 23 33 - const uint16x8x2_t d0 = - vtrnq_u16(vreinterpretq_u16_u32(c0), vreinterpretq_u16_u32(c1)); + const uint16x8x2_t d0 = vtrnq_u16(c0.val[0], c0.val[1]); *a0 = d0.val[0]; *a1 = d0.val[1]; @@ -295,7 +306,7 @@ static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1, const int16x4_t a6, const 
int16x4_t a7, int16x8_t *const o0, int16x8_t *const o1, int16x8_t *const o2, int16x8_t *const o3) { - // Swap 16 bit elements. Goes from: + // Combine rows. Goes from: // a0: 00 01 02 03 // a1: 10 11 12 13 // a2: 20 21 22 23 @@ -305,53 +316,40 @@ static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1, // a6: 60 61 62 63 // a7: 70 71 72 73 // to: - // b0.val[0]: 00 10 02 12 - // b0.val[1]: 01 11 03 13 - // b1.val[0]: 20 30 22 32 - // b1.val[1]: 21 31 23 33 - // b2.val[0]: 40 50 42 52 - // b2.val[1]: 41 51 43 53 - // b3.val[0]: 60 70 62 72 - // b3.val[1]: 61 71 63 73 + // b0: 00 01 02 03 40 41 42 43 + // b1: 10 11 12 13 50 51 52 53 + // b2: 20 21 22 23 60 61 62 63 + // b3: 30 31 32 33 70 71 72 73 + + const int16x8_t b0 = vcombine_s16(a0, a4); + const int16x8_t b1 = vcombine_s16(a1, a5); + const int16x8_t b2 = vcombine_s16(a2, a6); + const int16x8_t b3 = vcombine_s16(a3, a7); - const int16x4x2_t b0 = vtrn_s16(a0, a1); - const int16x4x2_t b1 = vtrn_s16(a2, a3); - const int16x4x2_t b2 = vtrn_s16(a4, a5); - const int16x4x2_t b3 = vtrn_s16(a6, a7); + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 10 02 12 40 50 42 52 + // c0.val[1]: 01 11 03 13 41 51 43 53 + // c1.val[0]: 20 30 22 32 60 70 62 72 + // c1.val[1]: 21 31 23 33 61 71 63 73 + + const int16x8x2_t c0 = vtrnq_s16(b0, b1); + const int16x8x2_t c1 = vtrnq_s16(b2, b3); // Swap 32 bit elements resulting in: - // c0.val[0]: 00 10 20 30 - // c0.val[1]: 02 12 22 32 - // c1.val[0]: 01 11 21 31 - // c1.val[1]: 03 13 23 33 - // c2.val[0]: 40 50 60 70 - // c2.val[1]: 42 52 62 72 - // c3.val[0]: 41 51 61 71 - // c3.val[1]: 43 53 63 73 + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 02 12 22 32 42 52 62 72 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 03 13 23 33 43 53 63 73 - const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]), - vreinterpret_s32_s16(b1.val[0])); - const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]), - vreinterpret_s32_s16(b1.val[1])); - const int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]), - vreinterpret_s32_s16(b3.val[0])); - const int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]), - vreinterpret_s32_s16(b3.val[1])); + const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), + vreinterpretq_s32_s16(c1.val[0])); + const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), + vreinterpretq_s32_s16(c1.val[1])); - // Swap 64 bit elements resulting in: - // o0: 00 10 20 30 40 50 60 70 - // o1: 01 11 21 31 41 51 61 71 - // o2: 02 12 22 32 42 52 62 72 - // o3: 03 13 23 33 43 53 63 73 - - *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]), - vreinterpret_s16_s32(c2.val[0])); - *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]), - vreinterpret_s16_s32(c3.val[0])); - *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]), - vreinterpret_s16_s32(c2.val[1])); - *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]), - vreinterpret_s16_s32(c3.val[1])); + *o0 = vreinterpretq_s16_s32(d0.val[0]); + *o1 = vreinterpretq_s16_s32(d1.val[0]); + *o2 = vreinterpretq_s16_s32(d0.val[1]); + *o3 = vreinterpretq_s16_s32(d1.val[1]); } static INLINE void transpose_s32_4x8(int32x4_t *const a0, int32x4_t *const a1, From ccc101e6bb63c2af340b993c57fad0f3810aee27 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Fri, 24 Feb 2023 18:05:43 +0000 Subject: [PATCH 554/926] Add Neon implementations of standard bitdepth MSE functions Currently only vpx_mse16x16 has a Neon implementation. 
This patch adds optimized Armv8.0 and Armv8.4 dot-product paths for all block sizes: 8x8, 8x16, 16x8 and 16x16. Add the corresponding tests as well. Change-Id: Ib0357fdcdeb05860385fec89633386e34395e260 --- test/variance_test.cc | 7 +- vpx_dsp/arm/variance_neon.c | 182 +++++++++++++++++++++++------------ vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +- 3 files changed, 127 insertions(+), 68 deletions(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index 33f09209f4..a68cfad516 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -773,6 +773,7 @@ TEST_P(VpxSseTest, RefSse) { RefTestSse(); } TEST_P(VpxSseTest, MaxSse) { MaxTestSse(); } TEST_P(VpxMseTest, RefMse) { RefTestMse(); } TEST_P(VpxMseTest, MaxMse) { MaxTestMse(); } +TEST_P(VpxMseTest, DISABLED_Speed) { SpeedTest(); } TEST_P(VpxVarianceTest, Zero) { ZeroTest(); } TEST_P(VpxVarianceTest, Ref) { RefTest(); } TEST_P(VpxVarianceTest, RefStride) { RefStrideTest(); } @@ -1450,8 +1451,10 @@ INSTANTIATE_TEST_SUITE_P(NEON, VpxSseTest, &vpx_get4x4sse_cs_neon))); INSTANTIATE_TEST_SUITE_P(NEON, VpxMseTest, - ::testing::Values(MseParams(4, 4, - &vpx_mse16x16_neon))); + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_neon), + MseParams(4, 3, &vpx_mse16x8_neon), + MseParams(3, 4, &vpx_mse8x16_neon), + MseParams(3, 3, &vpx_mse8x8_neon))); INSTANTIATE_TEST_SUITE_P( NEON, VpxVarianceTest, diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index 3ccc4e807b..feff980c93 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -371,32 +371,66 @@ VARIANCE_WXH_NEON(64, 64, 12) #if defined(__ARM_FEATURE_DOTPROD) -unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride, - const unsigned char *ref_ptr, int ref_stride, - unsigned int *sse) { - int i; - uint8x16_t a[2], b[2], abs_diff[2]; - uint32x4_t sse_vec[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; - - for (i = 0; i < 8; i++) { - a[0] = vld1q_u8(src_ptr); +static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, int h, + unsigned int *sse) { + uint32x2_t sse_u32[2] = { vdup_n_u32(0), vdup_n_u32(0) }; + + int i = h / 2; + do { + uint8x8_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1_u8(src_ptr); src_ptr += src_stride; - a[1] = vld1q_u8(src_ptr); + s1 = vld1_u8(src_ptr); src_ptr += src_stride; - b[0] = vld1q_u8(ref_ptr); + r0 = vld1_u8(ref_ptr); ref_ptr += ref_stride; - b[1] = vld1q_u8(ref_ptr); + r1 = vld1_u8(ref_ptr); ref_ptr += ref_stride; - abs_diff[0] = vabdq_u8(a[0], b[0]); - abs_diff[1] = vabdq_u8(a[1], b[1]); + diff0 = vabd_u8(s0, r0); + diff1 = vabd_u8(s1, r1); - sse_vec[0] = vdotq_u32(sse_vec[0], abs_diff[0], abs_diff[0]); - sse_vec[1] = vdotq_u32(sse_vec[1], abs_diff[1], abs_diff[1]); - } + sse_u32[0] = vdot_u32(sse_u32[0], diff0, diff0); + sse_u32[1] = vdot_u32(sse_u32[1], diff1, diff1); + } while (--i != 0); - *sse = horizontal_add_uint32x4(vaddq_u32(sse_vec[0], sse_vec[1])); - return horizontal_add_uint32x4(vaddq_u32(sse_vec[0], sse_vec[1])); + *sse = horizontal_add_uint32x2(vadd_u32(sse_u32[0], sse_u32[1])); + return *sse; +} + +static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, int h, + unsigned int *sse) { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h / 2; + do { + uint8x16_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + s1 = vld1q_u8(src_ptr); + src_ptr += src_stride; + r0 = 
vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + r1 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + + diff0 = vabdq_u8(s0, r0); + diff1 = vabdq_u8(s1, r1); + + sse_u32[0] = vdotq_u32(sse_u32[0], diff0, diff0); + sse_u32[1] = vdotq_u32(sse_u32[1], diff1, diff1); + } while (--i != 0); + + *sse = horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); + return *sse; } unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, @@ -435,58 +469,67 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, #else // !defined(__ARM_FEATURE_DOTPROD) -unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride, - const unsigned char *ref_ptr, int ref_stride, - unsigned int *sse) { - int i; - uint8x16_t a[2], b[2]; - int16x4_t diff_lo[4], diff_hi[4]; - uint16x8_t diff[4]; - int32x4_t sse_vec[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), - vdupq_n_s32(0) }; +static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, int h, + unsigned int *sse) { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; - for (i = 0; i < 8; i++) { - a[0] = vld1q_u8(src_ptr); + int i = h / 2; + do { + uint8x8_t s0, s1, r0, r1, diff0, diff1; + uint16x8_t sse0, sse1; + + s0 = vld1_u8(src_ptr); src_ptr += src_stride; - a[1] = vld1q_u8(src_ptr); + s1 = vld1_u8(src_ptr); src_ptr += src_stride; - b[0] = vld1q_u8(ref_ptr); + r0 = vld1_u8(ref_ptr); ref_ptr += ref_stride; - b[1] = vld1q_u8(ref_ptr); + r1 = vld1_u8(ref_ptr); ref_ptr += ref_stride; - diff[0] = vsubl_u8(vget_low_u8(a[0]), vget_low_u8(b[0])); - diff[1] = vsubl_u8(vget_high_u8(a[0]), vget_high_u8(b[0])); - diff[2] = vsubl_u8(vget_low_u8(a[1]), vget_low_u8(b[1])); - diff[3] = vsubl_u8(vget_high_u8(a[1]), vget_high_u8(b[1])); - - diff_lo[0] = vreinterpret_s16_u16(vget_low_u16(diff[0])); - diff_lo[1] = vreinterpret_s16_u16(vget_low_u16(diff[1])); - sse_vec[0] = vmlal_s16(sse_vec[0], diff_lo[0], diff_lo[0]); - sse_vec[1] = vmlal_s16(sse_vec[1], diff_lo[1], diff_lo[1]); - - diff_lo[2] = vreinterpret_s16_u16(vget_low_u16(diff[2])); - diff_lo[3] = vreinterpret_s16_u16(vget_low_u16(diff[3])); - sse_vec[2] = vmlal_s16(sse_vec[2], diff_lo[2], diff_lo[2]); - sse_vec[3] = vmlal_s16(sse_vec[3], diff_lo[3], diff_lo[3]); - - diff_hi[0] = vreinterpret_s16_u16(vget_high_u16(diff[0])); - diff_hi[1] = vreinterpret_s16_u16(vget_high_u16(diff[1])); - sse_vec[0] = vmlal_s16(sse_vec[0], diff_hi[0], diff_hi[0]); - sse_vec[1] = vmlal_s16(sse_vec[1], diff_hi[1], diff_hi[1]); - - diff_hi[2] = vreinterpret_s16_u16(vget_high_u16(diff[2])); - diff_hi[3] = vreinterpret_s16_u16(vget_high_u16(diff[3])); - sse_vec[2] = vmlal_s16(sse_vec[2], diff_hi[2], diff_hi[2]); - sse_vec[3] = vmlal_s16(sse_vec[3], diff_hi[3], diff_hi[3]); - } + diff0 = vabd_u8(s0, r0); + diff1 = vabd_u8(s1, r1); + + sse0 = vmull_u8(diff0, diff0); + sse_u32[0] = vpadalq_u16(sse_u32[0], sse0); + sse1 = vmull_u8(diff1, diff1); + sse_u32[1] = vpadalq_u16(sse_u32[1], sse1); + } while (--i != 0); + + *sse = horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); + return *sse; +} + +static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, int h, + unsigned int *sse) { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + uint8x16_t s, r, diff; + uint16x8_t sse0, sse1; - sse_vec[0] = vaddq_s32(sse_vec[0], sse_vec[1]); - sse_vec[2] = vaddq_s32(sse_vec[2], 
sse_vec[3]); - sse_vec[0] = vaddq_s32(sse_vec[0], sse_vec[2]); + s = vld1q_u8(src_ptr); + src_ptr += src_stride; + r = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; - *sse = horizontal_add_uint32x4(vreinterpretq_u32_s32(sse_vec[0])); - return horizontal_add_uint32x4(vreinterpretq_u32_s32(sse_vec[0])); + diff = vabdq_u8(s, r); + + sse0 = vmull_u8(vget_low_u8(diff), vget_low_u8(diff)); + sse_u32[0] = vpadalq_u16(sse_u32[0], sse0); + sse1 = vmull_u8(vget_high_u8(diff), vget_high_u8(diff)); + sse_u32[1] = vpadalq_u16(sse_u32[1], sse1); + } while (--i != 0); + + *sse = horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); + return *sse; } unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, @@ -531,3 +574,16 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, } #endif // defined(__ARM_FEATURE_DOTPROD) + +#define VPX_MSE_WXH_NEON(w, h) \ + unsigned int vpx_mse##w##x##h##_neon( \ + const unsigned char *src_ptr, int src_stride, \ + const unsigned char *ref_ptr, int ref_stride, unsigned int *sse) { \ + return vpx_mse##w##xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, h, \ + sse); \ + } + +VPX_MSE_WXH_NEON(8, 8) +VPX_MSE_WXH_NEON(8, 16) +VPX_MSE_WXH_NEON(16, 8) +VPX_MSE_WXH_NEON(16, 16) diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index eef72249e0..0ad3cbe6b2 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1141,13 +1141,13 @@ () specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi vsx lsx/; add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_mse16x8 sse2 avx2 msa mmi vsx/; + specialize qw/vpx_mse16x8 sse2 avx2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_mse8x16 sse2 msa mmi vsx/; + specialize qw/vpx_mse8x16 sse2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_mse8x8 sse2 msa mmi vsx/; + specialize qw/vpx_mse8x8 sse2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *"; specialize qw/vpx_get_mb_ss sse2 msa vsx/; From 112945ac7b5784c05912d1955afd2c245ce5c51d Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 27 Feb 2023 13:48:47 -0800 Subject: [PATCH 555/926] tools_common,VpxInterface: remove unneeded const Change-Id: Ic309aab2ff1750bdbcc36e8aafe05d52930ba694 --- tools_common.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools_common.h b/tools_common.h index 3a266416e3..9850907c15 100644 --- a/tools_common.h +++ b/tools_common.h @@ -145,9 +145,9 @@ VPX_NO_RETURN void usage_exit(void); int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame); typedef struct VpxInterface { - const char *const name; - const uint32_t fourcc; - vpx_codec_iface_t *(*const codec_interface)(void); + const char *name; + uint32_t fourcc; + vpx_codec_iface_t *(*codec_interface)(void); } VpxInterface; int get_vpx_encoder_count(void); From 848f6e733789c627b6606baf1c85e32be997e36f Mon Sep 17 00:00:00 2001 From: Johann Date: Sat, 5 Nov 2022 09:53:07 +0900 Subject: [PATCH 556/926] quantize: simplify 32x32_b args Now that all the implementations of the 32x32 quantize are in intrinsics we can reference struct members directly. Saves pushing them to the stack. 
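Concretely, the separate zbin/round/quant/quant_shift pointers collapse
into one struct macroblock_plane pointer, so the 32x32 entry point
becomes (per the vpx_dsp_rtcd_defs.pl change below):

  void vpx_quantize_b_32x32(const tran_low_t *coeff_ptr,
                            const struct macroblock_plane *const mb_plane,
                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                            const int16_t *dequant_ptr, uint16_t *eob_ptr,
                            const int16_t *scan, const int16_t *iscan);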
n_coeffs is not used at all for this function. Change-Id: I2104fea3fa20c455087e21b347d6abd7ea1f3e1e --- test/vp9_quantize_test.cc | 285 +++++++++++++++++++++-------------- vp9/encoder/vp9_block.h | 1 + vp9/encoder/vp9_encodemb.c | 6 +- vpx_dsp/arm/quantize_neon.c | 17 +-- vpx_dsp/quantize.c | 17 ++- vpx_dsp/vpx_dsp_rtcd_defs.pl | 5 +- vpx_dsp/x86/quantize_avx.c | 30 +--- vpx_dsp/x86/quantize_avx2.c | 15 +- vpx_dsp/x86/quantize_sse2.h | 28 ++++ vpx_dsp/x86/quantize_ssse3.c | 35 +---- 10 files changed, 238 insertions(+), 201 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 587cec6923..ecb6116f0c 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -26,6 +26,7 @@ #include "test/util.h" #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" #include "vpx_ports/msvc.h" @@ -38,8 +39,7 @@ namespace { const int number_of_iterations = 100; typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count, - const int16_t *zbin, const int16_t *round, - const int16_t *quant, const int16_t *quant_shift, + const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, const int16_t *scan, const int16_t *iscan); @@ -47,6 +47,41 @@ typedef std::tuple QuantizeParam; +// Wrapper which takes a macroblock_plane. +typedef void (*QuantizeBaseFunc)(const tran_low_t *coeff, intptr_t count, + const int16_t *zbin, const int16_t *round, + const int16_t *quant, + const int16_t *quant_shift, tran_low_t *qcoeff, + tran_low_t *dqcoeff, const int16_t *dequant, + uint16_t *eob, const int16_t *scan, + const int16_t *iscan); + +template +void QuantWrapper(const tran_low_t *coeff, intptr_t count, + const macroblock_plane *const mb_plane, tran_low_t *qcoeff, + tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, + const int16_t *scan, const int16_t *iscan) { + fn(coeff, count, mb_plane->zbin, mb_plane->round, mb_plane->quant, + mb_plane->quant_shift, qcoeff, dqcoeff, dequant, eob, scan, iscan); +} + +// Wrapper for 32x32 version which does not use count +typedef void (*Quantize32x32Func)(const tran_low_t *coeff, + const macroblock_plane *const mb_plane, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, + const int16_t *scan, const int16_t *iscan); + +template +void Quant32x32Wrapper(const tran_low_t *coeff, intptr_t count, + const macroblock_plane *const mb_plane, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, + const int16_t *scan, const int16_t *iscan) { + (void)count; + fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan, iscan); +} + // Wrapper for FP version which does not use zbin or quant_shift. 
typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count, const int16_t *round, const int16_t *quant, @@ -56,15 +91,11 @@ typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count, template void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, - const int16_t *zbin, const int16_t *round, - const int16_t *quant, const int16_t *quant_shift, - tran_low_t *qcoeff, tran_low_t *dqcoeff, - const int16_t *dequant, uint16_t *eob, const int16_t *scan, - const int16_t *iscan) { - (void)zbin; - (void)quant_shift; - - fn(coeff, count, round, quant, qcoeff, dqcoeff, dequant, eob, scan, iscan); + const macroblock_plane *const mb_plane, tran_low_t *qcoeff, + tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, + const int16_t *scan, const int16_t *iscan) { + fn(coeff, count, mb_plane->round_fp, mb_plane->quant_fp, qcoeff, dqcoeff, + dequant, eob, scan, iscan); } void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, @@ -119,17 +150,16 @@ class VP9QuantizeBase : public AbstractBench { #else max_value_ = (1 << bit_depth_) - 1; #endif - zbin_ptr_ = + zbin_ptr_ = mb_plane_.zbin = reinterpret_cast(vpx_memalign(16, 8 * sizeof(*zbin_ptr_))); - round_fp_ptr_ = reinterpret_cast( - vpx_memalign(16, 8 * sizeof(*round_fp_ptr_))); - quant_fp_ptr_ = reinterpret_cast( + round_fp_ptr_ = mb_plane_.round_fp; + quant_fp_ptr_ = mb_plane_.quant_fp = reinterpret_cast( vpx_memalign(16, 8 * sizeof(*quant_fp_ptr_))); - round_ptr_ = + round_ptr_ = mb_plane_.round = reinterpret_cast(vpx_memalign(16, 8 * sizeof(*round_ptr_))); - quant_ptr_ = + quant_ptr_ = mb_plane_.quant = reinterpret_cast(vpx_memalign(16, 8 * sizeof(*quant_ptr_))); - quant_shift_ptr_ = reinterpret_cast( + quant_shift_ptr_ = mb_plane_.quant_shift = reinterpret_cast( vpx_memalign(16, 8 * sizeof(*quant_shift_ptr_))); dequant_ptr_ = reinterpret_cast( vpx_memalign(16, 8 * sizeof(*dequant_ptr_))); @@ -140,7 +170,6 @@ class VP9QuantizeBase : public AbstractBench { ~VP9QuantizeBase() { vpx_free(zbin_ptr_); - vpx_free(round_fp_ptr_); vpx_free(quant_fp_ptr_); vpx_free(round_ptr_); vpx_free(quant_ptr_); @@ -157,6 +186,7 @@ class VP9QuantizeBase : public AbstractBench { } protected: + macroblock_plane mb_plane_; int16_t *zbin_ptr_; int16_t *round_fp_ptr_; int16_t *quant_fp_ptr_; @@ -193,10 +223,9 @@ class VP9QuantizeTest : public VP9QuantizeBase, }; void VP9QuantizeTest::Run() { - quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_->scan, - scan_->iscan); + quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, + qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), dequant_ptr_, + &eob_, scan_->scan, scan_->iscan); } void VP9QuantizeTest::Speed(bool is_median) { @@ -266,8 +295,8 @@ void VP9QuantizeTest::Speed(bool is_median) { vpx_usec_timer_start(&timer); for (int n = 0; n < kNumTests; ++n) { - ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, - q_ptr_, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), + ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, + ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); } @@ -275,10 +304,9 @@ void VP9QuantizeTest::Speed(bool is_median) { vpx_usec_timer_start(&simd_timer); for (int n = 0; n < kNumTests; ++n) { - quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, - scan_->scan, 
scan_->iscan); + quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, + qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), + dequant_ptr_, &eob_, scan_->scan, scan_->iscan); } vpx_usec_timer_mark(&simd_timer); @@ -417,15 +445,14 @@ TEST_P(VP9QuantizeTest, OperationCheck) { GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), - ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_->scan, scan_->iscan); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, + ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), + dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), - dequant_ptr_, &eob_, scan_->scan, scan_->iscan)); + ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_, + &mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, + &eob_, scan_->scan, scan_->iscan)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); @@ -475,15 +502,14 @@ TEST_P(VP9QuantizeTest, EOBCheck) { GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), - ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_->scan, scan_->iscan); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, + ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), + dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), - dequant_ptr_, &eob_, scan_->scan, scan_->iscan)); + ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_, + &mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, + &eob_, scan_->scan, scan_->iscan)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); @@ -510,28 +536,35 @@ using std::make_tuple; INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, ::testing::Values( - make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16, - false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 16, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_10, 16, false), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_12, 16, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 16, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, 16, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, 16, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, 
VPX_BITS_8, 32, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, + 32, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, + 32, false))); #else INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, - VPX_BITS_8, 16, false), + ::testing::Values(make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, + 16, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true))); @@ -541,11 +574,12 @@ INSTANTIATE_TEST_SUITE_P( #if HAVE_SSSE3 INSTANTIATE_TEST_SUITE_P( SSSE3, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_ssse3, - &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, - false), + ::testing::Values(make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, + 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), @@ -555,13 +589,14 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_SSSE3 #if HAVE_AVX -INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_avx, - &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_avx, - &vpx_quantize_b_32x32_c, - VPX_BITS_8, 32, false))); +INSTANTIATE_TEST_SUITE_P( + AVX, VP9QuantizeTest, + ::testing::Values(make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, + 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false))); #endif // HAVE_AVX #if VPX_ARCH_X86_64 && HAVE_AVX2 @@ -577,22 +612,29 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_12, 32, true), - make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, VPX_BITS_8, 16, + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 16, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, - VPX_BITS_10, 16, false), - make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, - VPX_BITS_12, 16, false), - make_tuple(&vpx_quantize_b_32x32_avx2, &vpx_quantize_b_32x32_c, - VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_avx2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_avx2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_avx2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, 16, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, 16, + false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, VPX_BITS_8, 32, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 32, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, + 32, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, + 32, false))); #else INSTANTIATE_TEST_SUITE_P( AVX2, VP9QuantizeTest, @@ -602,11 +644,12 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 32, true), - make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_avx2, - &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, - false))); + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, + 16, false), + make_tuple(&Quant32x32Wrapper, + 
&Quant32x32Wrapper, + VPX_BITS_8, 32, false))); #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_AVX2 @@ -615,22 +658,29 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, ::testing::Values( - make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16, + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 16, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 16, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, 16, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, 16, false), - make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, - VPX_BITS_10, 16, false), - make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, - VPX_BITS_12, 16, false), - make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c, - VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_neon, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_neon, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_neon, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, VPX_BITS_8, 32, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 32, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, + 32, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, + 32, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), make_tuple(&QuantFPWrapper, @@ -639,11 +689,12 @@ INSTANTIATE_TEST_SUITE_P( #else INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_neon, - &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, - false), + ::testing::Values(make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, + 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), @@ -683,9 +734,11 @@ INSTANTIATE_TEST_SUITE_P(LSX, VP9QuantizeTest, INSTANTIATE_TEST_SUITE_P( DISABLED_C, VP9QuantizeTest, ::testing::Values( - make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_c, &vpx_quantize_b_32x32_c, VPX_BITS_8, - 32, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, VPX_BITS_8, 32, + false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), make_tuple(&QuantFPWrapper, diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 1786952911..fc27a0fbda 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -13,6 +13,7 @@ #include "vpx_util/vpx_thread.h" +#include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_entropy.h" diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index fa222f9dcf..4910dc20f5 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -542,8 +542,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + 
vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_16X16: @@ -948,8 +947,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, dst_stride); fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index 9c227d560f..e81738a7bb 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -14,6 +14,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vp9/encoder/vp9_block.h" static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, const int16x8_t dequant, @@ -213,11 +214,8 @@ quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, // Main difference is that zbin values are halved before comparison and dqcoeff // values are divided by 2. zbin is rounded but dqcoeff is not. -void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { @@ -226,10 +224,10 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int i; // Only the first element of each vector is DC. - int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1); - int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); - int16x8_t quant = vld1q_s16(quant_ptr); - int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + int16x8_t zbin = vrshrq_n_s16(vld1q_s16(mb_plane->zbin), 1); + int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round), 1); + int16x8_t quant = vld1q_s16(mb_plane->quant); + int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift); int16x8_t dequant = vld1q_s16(dequant_ptr); // Process first 8 values which include a dc component. 
@@ -289,6 +287,5 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #endif // __aarch64__ // Need these here, else the compiler complains about mixing declarations and // code in C90 - (void)n_coeffs; (void)scan; } diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index 5d6ba64a8a..212db45c88 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -14,6 +14,7 @@ #include "vpx_dsp/quantize.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" +#include "vp9/encoder/vp9_block.h" void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, const int16_t *round_ptr, const int16_t quant, @@ -208,19 +209,21 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } #endif -void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), - ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; + const int n_coeffs = 32 * 32; + const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), + ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; int idx = 0; - int idx_arr[1024]; + int idx_arr[32 * 32 /* n_coeffs */]; int i, eob = -1; (void)iscan; diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index eef72249e0..639c18bc98 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -17,6 +17,9 @@ () #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; +#endif EOF } @@ -717,7 +720,7 @@ () add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/; - add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index 7d83527216..d52f6c6644 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -140,15 +140,12 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob(eob); } 
-void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); const __m256i big_zero = _mm256_setzero_si256(); int index; @@ -160,26 +157,9 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i eob = zero, eob0; (void)scan; - (void)n_coeffs; - - // Setup global values. - // The 32x32 halves zbin and round. - zbin = _mm_load_si128((const __m128i *)zbin_ptr); - // Shift with rounding. - zbin = _mm_add_epi16(zbin, one); - zbin = _mm_srli_epi16(zbin, 1); - // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so - // it is a strict "greater" comparison. - zbin = _mm_sub_epi16(zbin, one); - - round = _mm_load_si128((const __m128i *)round_ptr); - round = _mm_add_epi16(round, one); - round = _mm_srli_epi16(round, 1); - - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - shift = _mm_load_si128((const __m128i *)quant_shift_ptr); - shift = _mm_slli_epi16(shift, 1); + + load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, + &shift); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c index 28f7c9c7da..a8412c5b8e 100644 --- a/vpx_dsp/x86/quantize_avx2.c +++ b/vpx_dsp/x86/quantize_avx2.c @@ -13,6 +13,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void load_b_values_avx2( const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr, @@ -250,23 +251,19 @@ static VPX_FORCE_INLINE __m256i quantize_b_32x32_16( } } -void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; __m256i v_eobmax = _mm256_setzero_si256(); intptr_t count; - (void)n_coeffs; (void)scan; - load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr, - &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr, - &v_quant_shift, 1); + load_b_values_avx2(mb_plane->zbin, &v_zbin, mb_plane->round, &v_round, + mb_plane->quant, &v_quant, dequant_ptr, &v_dequant, + mb_plane->quant_shift, &v_quant_shift, 1); // Do DC and first 15 AC. 
v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan, diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h index 27bfb4e41b..fe42fee018 100644 --- a/vpx_dsp/x86/quantize_sse2.h +++ b/vpx_dsp/x86/quantize_sse2.h @@ -15,6 +15,7 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_block.h" static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, const int16_t *round_ptr, __m128i *round, @@ -29,6 +30,33 @@ static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, *shift = _mm_load_si128((const __m128i *)shift_ptr); } +static INLINE void load_b_values32x32( + const struct macroblock_plane *const mb_plane, __m128i *zbin, + __m128i *round, __m128i *quant, const int16_t *dequant_ptr, + __m128i *dequant, __m128i *shift) { + const __m128i one = _mm_set1_epi16(1); + // The 32x32 halves zbin and round. + *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin); + // Shift with rounding. + *zbin = _mm_add_epi16(*zbin, one); + *zbin = _mm_srli_epi16(*zbin, 1); + // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so + // it is a strict "greater" comparison. + *zbin = _mm_sub_epi16(*zbin, one); + + *round = _mm_load_si128((const __m128i *)mb_plane->round); + *round = _mm_add_epi16(*round, one); + *round = _mm_srli_epi16(*round, 1); + + *quant = _mm_load_si128((const __m128i *)mb_plane->quant); + *dequant = _mm_load_si128((const __m128i *)dequant_ptr); + *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift); + // I suspect this is not technically OK because quant_shift can be up + // to 1 << 16 and shifting up again will outrange that, but the test is not + // comprehensive enough to catch that and "it's been that way forever" + *shift = _mm_slli_epi16(*shift, 1); +} + static INLINE void load_fp_values(const int16_t *round_ptr, __m128i *round, const int16_t *quant_ptr, __m128i *quant, const int16_t *dequant_ptr, diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 476230286d..6fe54d7d98 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -16,6 +16,7 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" #include "vpx_dsp/x86/quantize_ssse3.h" +#include "vp9/encoder/vp9_block.h" void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, @@ -107,16 +108,12 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob(eob); } -void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); int index; __m128i zbin, round, quant, dequant, shift; @@ -127,29 +124,9 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i eob = zero, eob0; (void)scan; - (void)n_coeffs; - - // Setup global values. - // The 32x32 halves zbin and round. - zbin = _mm_load_si128((const __m128i *)zbin_ptr); - // Shift with rounding. 
- zbin = _mm_add_epi16(zbin, one); - zbin = _mm_srli_epi16(zbin, 1); - // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so - // it is a strict "greater" comparison. - zbin = _mm_sub_epi16(zbin, one); - - round = _mm_load_si128((const __m128i *)round_ptr); - round = _mm_add_epi16(round, one); - round = _mm_srli_epi16(round, 1); - - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - shift = _mm_load_si128((const __m128i *)quant_shift_ptr); - // I suspect this is not technically OK because quant_shift can be up - // to 1 << 16 and shifting up again will outrange that, but the test is not - // comprehensive enough to catch that and "it's been that way forever" - shift = _mm_slli_epi16(shift, 1); + + load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, + &shift); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr);
From a7ab16aed1d75869c5fd096374a91698b419c1a7 Mon Sep 17 00:00:00 2001
From: George Steed
Date: Fri, 3 Feb 2023 17:12:46 +0000
Subject: [PATCH 557/926] Implement d63_predictor using Neon

Add Neon implementations of the d63 predictor for 4x4, 8x8, 16x16 and
32x32 block sizes. Also update tests to add new corresponding cases.

Speedups over the C code (higher is better):

Microarch. | Compiler | Block | Speedup
Neoverse N1 | LLVM 15 | 4x4 | 2.10
Neoverse N1 | LLVM 15 | 8x8 | 4.45
Neoverse N1 | LLVM 15 | 16x16 | 4.74
Neoverse N1 | LLVM 15 | 32x32 | 2.27
Neoverse N1 | GCC 12 | 4x4 | 2.46
Neoverse N1 | GCC 12 | 8x8 | 10.37
Neoverse N1 | GCC 12 | 16x16 | 11.46
Neoverse N1 | GCC 12 | 32x32 | 6.57
Neoverse V1 | LLVM 15 | 4x4 | 2.24
Neoverse V1 | LLVM 15 | 8x8 | 3.53
Neoverse V1 | LLVM 15 | 16x16 | 4.44
Neoverse V1 | LLVM 15 | 32x32 | 2.17
Neoverse V1 | GCC 12 | 4x4 | 2.25
Neoverse V1 | GCC 12 | 8x8 | 7.67
Neoverse V1 | GCC 12 | 16x16 | 8.97
Neoverse V1 | GCC 12 | 32x32 | 4.77

Change-Id: Ib4a1a2cb5a5c4495ae329529f8847664cbd0dfe0
---
 test/test_intra_pred_speed.cc | 12 +--
 test/vp9_intrapred_test.cc | 8 ++
 vpx_dsp/arm/intrapred_neon.c | 162 ++++++++++++++++++++++++++++
 vpx_dsp/arm/mem_neon.h | 15 ++++
 vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 +-
 5 files changed, 195 insertions(+), 10 deletions(-)
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index 28b3484a03..df01ccac22 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -269,28 +269,28 @@ INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon, vpx_dc_left_predictor_4x4_neon, vpx_dc_top_predictor_4x4_neon, vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon, vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon, - vpx_d135_predictor_4x4_neon, nullptr, nullptr, nullptr, nullptr, - vpx_tm_predictor_4x4_neon) + vpx_d135_predictor_4x4_neon, nullptr, nullptr, nullptr, + vpx_d63_predictor_4x4_neon, vpx_tm_predictor_4x4_neon) INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon, vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon, vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon, vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon, - vpx_d135_predictor_8x8_neon, nullptr, nullptr, nullptr, nullptr, - vpx_tm_predictor_8x8_neon) + vpx_d135_predictor_8x8_neon, nullptr, nullptr, nullptr, + vpx_d63_predictor_8x8_neon, vpx_tm_predictor_8x8_neon) INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon, vpx_dc_left_predictor_16x16_neon, vpx_dc_top_predictor_16x16_neon, vpx_dc_128_predictor_16x16_neon, vpx_v_predictor_16x16_neon,
vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon, vpx_d135_predictor_16x16_neon, nullptr, nullptr, nullptr, - nullptr, vpx_tm_predictor_16x16_neon) + vpx_d63_predictor_16x16_neon, vpx_tm_predictor_16x16_neon) INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon, vpx_dc_left_predictor_32x32_neon, vpx_dc_top_predictor_32x32_neon, vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon, vpx_h_predictor_32x32_neon, vpx_d45_predictor_32x32_neon, vpx_d135_predictor_32x32_neon, nullptr, nullptr, nullptr, - nullptr, vpx_tm_predictor_32x32_neon) + vpx_d63_predictor_32x32_neon, vpx_tm_predictor_32x32_neon) #endif // HAVE_NEON #if HAVE_MSA diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index ccace719ea..12a227b12c 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -243,6 +243,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_d45_predictor_16x16_c, 16, 8), IntraPredParam(&vpx_d45_predictor_32x32_neon, &vpx_d45_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d63_predictor_4x4_neon, &vpx_d63_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_d63_predictor_8x8_neon, &vpx_d63_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_d63_predictor_16x16_neon, + &vpx_d63_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d63_predictor_32x32_neon, + &vpx_d63_predictor_32x32_c, 32, 8), IntraPredParam(&vpx_d135_predictor_4x4_neon, &vpx_d135_predictor_4x4_c, 4, 8), IntraPredParam(&vpx_d135_predictor_8x8_neon, &vpx_d135_predictor_8x8_c, diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index 38e275834b..02a05aae53 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -12,6 +12,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "mem_neon.h" #include "vpx/vpx_integer.h" //------------------------------------------------------------------------------ @@ -383,6 +384,167 @@ void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, // ----------------------------------------------------------------------------- +void vpx_d63_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t a0, a1, a2, a3, d0, d1, d2, d3; + (void)left; + + a0 = load_unaligned_u8_4x1(above + 0); + a1 = load_unaligned_u8_4x1(above + 1); + a2 = load_unaligned_u8_4x1(above + 2); + a3 = load_unaligned_u8_4x1(above + 3); + + d0 = vrhadd_u8(a0, a1); + d1 = vrhadd_u8(vhadd_u8(a0, a2), a1); + d2 = vrhadd_u8(a1, a2); + d3 = vrhadd_u8(vhadd_u8(a1, a3), a2); + + store_u8_4x1(dst + 0 * stride, d0); + store_u8_4x1(dst + 1 * stride, d1); + store_u8_4x1(dst + 2 * stride, d2); + store_u8_4x1(dst + 3 * stride, d3); +} + +void vpx_d63_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t a0, a1, a2, a7, d0, d1; + (void)left; + + a0 = vld1_u8(above + 0); + a1 = vld1_u8(above + 1); + a2 = vld1_u8(above + 2); + a7 = vld1_dup_u8(above + 7); + + d0 = vrhadd_u8(a0, a1); + d1 = vrhadd_u8(vhadd_u8(a0, a2), a1); + + vst1_u8(dst + 0 * stride, d0); + vst1_u8(dst + 1 * stride, d1); + vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 1)); + vst1_u8(dst + 3 * stride, vext_u8(d1, a7, 1)); + vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 2)); + vst1_u8(dst + 5 * stride, vext_u8(d1, a7, 2)); + vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 3)); + vst1_u8(dst + 7 * stride, vext_u8(d1, a7, 3)); +} + +void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x16_t a0, a1, a2, a15, d0, d1; + (void)left; + + a0 = 
vld1q_u8(above + 0); + a1 = vld1q_u8(above + 1); + a2 = vld1q_u8(above + 2); + a15 = vld1q_dup_u8(above + 15); + + d0 = vrhaddq_u8(a0, a1); + d1 = vrhaddq_u8(vhaddq_u8(a0, a2), a1); + + vst1q_u8(dst + 0 * stride, d0); + vst1q_u8(dst + 1 * stride, d1); + vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 1)); + vst1q_u8(dst + 3 * stride, vextq_u8(d1, a15, 1)); + vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 2)); + vst1q_u8(dst + 5 * stride, vextq_u8(d1, a15, 2)); + vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 3)); + vst1q_u8(dst + 7 * stride, vextq_u8(d1, a15, 3)); + vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 4)); + vst1q_u8(dst + 9 * stride, vextq_u8(d1, a15, 4)); + vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 5)); + vst1q_u8(dst + 11 * stride, vextq_u8(d1, a15, 5)); + vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 6)); + vst1q_u8(dst + 13 * stride, vextq_u8(d1, a15, 6)); + vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 7)); + vst1q_u8(dst + 15 * stride, vextq_u8(d1, a15, 7)); +} + +void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x16_t a0, a1, a2, a16, a17, a18, a31, d0_lo, d0_hi, d1_lo, d1_hi; + (void)left; + + a0 = vld1q_u8(above + 0); + a1 = vld1q_u8(above + 1); + a2 = vld1q_u8(above + 2); + a16 = vld1q_u8(above + 16); + a17 = vld1q_u8(above + 17); + a18 = vld1q_u8(above + 18); + a31 = vld1q_dup_u8(above + 31); + + d0_lo = vrhaddq_u8(a0, a1); + d0_hi = vrhaddq_u8(a16, a17); + d1_lo = vrhaddq_u8(vhaddq_u8(a0, a2), a1); + d1_hi = vrhaddq_u8(vhaddq_u8(a16, a18), a17); + + vst1q_u8(dst + 0 * stride + 0, d0_lo); + vst1q_u8(dst + 0 * stride + 16, d0_hi); + vst1q_u8(dst + 1 * stride + 0, d1_lo); + vst1q_u8(dst + 1 * stride + 16, d1_hi); + vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0_lo, d0_hi, 1)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_hi, a31, 1)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(d1_lo, d1_hi, 1)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_hi, a31, 1)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0_lo, d0_hi, 2)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_hi, a31, 2)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(d1_lo, d1_hi, 2)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_hi, a31, 2)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0_lo, d0_hi, 3)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_hi, a31, 3)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(d1_lo, d1_hi, 3)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_hi, a31, 3)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0_lo, d0_hi, 4)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_hi, a31, 4)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(d1_lo, d1_hi, 4)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_hi, a31, 4)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0_lo, d0_hi, 5)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_hi, a31, 5)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(d1_lo, d1_hi, 5)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_hi, a31, 5)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0_lo, d0_hi, 6)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_hi, a31, 6)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(d1_lo, d1_hi, 6)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_hi, a31, 6)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0_lo, d0_hi, 7)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_hi, a31, 7)); + vst1q_u8(dst + 15 * stride + 0, vextq_u8(d1_lo, d1_hi, 7)); + vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_hi, a31, 7)); + vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0_lo, d0_hi, 8)); + vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_hi, a31, 
8)); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(d1_lo, d1_hi, 8)); + vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_hi, a31, 8)); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0_lo, d0_hi, 9)); + vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_hi, a31, 9)); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(d1_lo, d1_hi, 9)); + vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_hi, a31, 9)); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0_lo, d0_hi, 10)); + vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_hi, a31, 10)); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(d1_lo, d1_hi, 10)); + vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_hi, a31, 10)); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0_lo, d0_hi, 11)); + vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_hi, a31, 11)); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(d1_lo, d1_hi, 11)); + vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_hi, a31, 11)); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0_lo, d0_hi, 12)); + vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_hi, a31, 12)); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(d1_lo, d1_hi, 12)); + vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_hi, a31, 12)); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0_lo, d0_hi, 13)); + vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_hi, a31, 13)); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(d1_lo, d1_hi, 13)); + vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_hi, a31, 13)); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0_lo, d0_hi, 14)); + vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_hi, a31, 14)); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(d1_lo, d1_hi, 14)); + vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_hi, a31, 14)); + vst1q_u8(dst + 30 * stride + 0, vextq_u8(d0_lo, d0_hi, 15)); + vst1q_u8(dst + 30 * stride + 16, vextq_u8(d0_hi, a31, 15)); + vst1q_u8(dst + 31 * stride + 0, vextq_u8(d1_lo, d1_hi, 15)); + vst1q_u8(dst + 31 * stride + 16, vextq_u8(d1_hi, a31, 15)); +} + +// ----------------------------------------------------------------------------- + void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x8_t XA0123 = vld1_u8(above - 1); diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index 866be7439e..d1ea361896 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -112,6 +112,21 @@ static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) { memcpy(buf, &a, 4); } +// Load 4 contiguous bytes when alignment is not guaranteed. +static INLINE uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) { + uint32_t a; + uint32x2_t a_u32; + memcpy(&a, buf, 4); + a_u32 = vdup_n_u32(0); + a_u32 = vset_lane_u32(a, a_u32, 0); + return vreinterpret_u8_u32(a_u32); +} + +// Store 4 contiguous bytes from the low half of an 8x8 vector. +static INLINE void store_u8_4x1(uint8_t *buf, uint8x8_t a) { + vst1_lane_u32((uint32_t *)buf, vreinterpret_u32_u8(a), 0); +} + // Load 2 sets of 4 bytes when alignment is not guaranteed. 
static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, ptrdiff_t stride) { diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 8725821b67..9ee9fc1c4f 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -46,7 +46,7 @@ () add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; add_proto qw/void vpx_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d63_predictor_4x4 ssse3/; +specialize qw/vpx_d63_predictor_4x4 neon ssse3/; add_proto qw/void vpx_d63e_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; @@ -94,7 +94,7 @@ () add_proto qw/void vpx_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; # TODO(crbug.com/webm/1522): Re-enable vsx implementation. -specialize qw/vpx_d63_predictor_8x8 ssse3/; +specialize qw/vpx_d63_predictor_8x8 neon ssse3/; add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; # TODO(crbug.com/webm/1522): Re-enable vsx implementation. @@ -135,7 +135,7 @@ () specialize qw/vpx_d45_predictor_16x16 neon ssse3 vsx/; add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d63_predictor_16x16 ssse3 vsx/; +specialize qw/vpx_d63_predictor_16x16 neon ssse3 vsx/; add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2 vsx/; @@ -173,7 +173,7 @@ () specialize qw/vpx_d45_predictor_32x32 neon ssse3 vsx/; add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d63_predictor_32x32 ssse3 vsx/; +specialize qw/vpx_d63_predictor_32x32 neon ssse3 vsx/; add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_h_predictor_32x32 neon msa sse2 vsx/;
From 360e9069b6cc1dd3a004728b876fb923413f4b11 Mon Sep 17 00:00:00 2001
From: George Steed
Date: Tue, 7 Feb 2023 12:16:00 +0000
Subject: [PATCH 558/926] Implement d117_predictor using Neon

Add Neon implementations of the d117 predictor for 4x4, 8x8, 16x16 and
32x32 block sizes. Also update tests to add new corresponding cases.

An explanation of the general implementation strategy is given in the
8x8 implementation body.

Speedups over the C code (higher is better):

Microarch. | Compiler | Block | Speedup
Neoverse N1 | LLVM 15 | 4x4 | 1.73
Neoverse N1 | LLVM 15 | 8x8 | 5.24
Neoverse N1 | LLVM 15 | 16x16 | 9.77
Neoverse N1 | LLVM 15 | 32x32 | 14.13
Neoverse N1 | GCC 12 | 4x4 | 2.04
Neoverse N1 | GCC 12 | 8x8 | 4.70
Neoverse N1 | GCC 12 | 16x16 | 8.64
Neoverse N1 | GCC 12 | 32x32 | 4.57
Neoverse V1 | LLVM 15 | 4x4 | 1.75
Neoverse V1 | LLVM 15 | 8x8 | 6.79
Neoverse V1 | LLVM 15 | 16x16 | 9.16
Neoverse V1 | LLVM 15 | 32x32 | 14.47
Neoverse V1 | GCC 12 | 4x4 | 1.75
Neoverse V1 | GCC 12 | 8x8 | 6.00
Neoverse V1 | GCC 12 | 16x16 | 7.63
Neoverse V1 | GCC 12 | 32x32 | 4.32

Change-Id: I7228327b5be27ee7a68deecafa05be0bd2a40ff4
---
 test/test_intra_pred_speed.cc | 20 +--
 test/vp9_intrapred_test.cc | 8 ++
 vpx_dsp/arm/intrapred_neon.c | 232 ++++++++++++++++++++++++++++
 vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +
 4 files changed, 256 insertions(+), 8 deletions(-)
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index df01ccac22..5861a17770 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -269,28 +269,32 @@ INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon, vpx_dc_left_predictor_4x4_neon, vpx_dc_top_predictor_4x4_neon, vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon, vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon, - vpx_d135_predictor_4x4_neon, nullptr, nullptr, nullptr, - vpx_d63_predictor_4x4_neon, vpx_tm_predictor_4x4_neon) + vpx_d135_predictor_4x4_neon, vpx_d117_predictor_4x4_neon, + nullptr, nullptr, vpx_d63_predictor_4x4_neon, + vpx_tm_predictor_4x4_neon) INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon, vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon, vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon, vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon, - vpx_d135_predictor_8x8_neon, nullptr, nullptr, nullptr, - vpx_d63_predictor_8x8_neon, vpx_tm_predictor_8x8_neon) + vpx_d135_predictor_8x8_neon, vpx_d117_predictor_8x8_neon, + nullptr, nullptr, vpx_d63_predictor_8x8_neon, + vpx_tm_predictor_8x8_neon) INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon, vpx_dc_left_predictor_16x16_neon, vpx_dc_top_predictor_16x16_neon, vpx_dc_128_predictor_16x16_neon, vpx_v_predictor_16x16_neon, vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon, - vpx_d135_predictor_16x16_neon, nullptr, nullptr, nullptr, - vpx_d63_predictor_16x16_neon, vpx_tm_predictor_16x16_neon) + vpx_d135_predictor_16x16_neon, vpx_d117_predictor_16x16_neon, + nullptr, nullptr, vpx_d63_predictor_16x16_neon, + vpx_tm_predictor_16x16_neon) INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon, vpx_dc_left_predictor_32x32_neon, vpx_dc_top_predictor_32x32_neon, vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon, vpx_h_predictor_32x32_neon, vpx_d45_predictor_32x32_neon, - vpx_d135_predictor_32x32_neon, nullptr, nullptr, nullptr, - vpx_d63_predictor_32x32_neon, vpx_tm_predictor_32x32_neon) + vpx_d135_predictor_32x32_neon, vpx_d117_predictor_32x32_neon, + nullptr, nullptr, vpx_d63_predictor_32x32_neon, + vpx_tm_predictor_32x32_neon) #endif // HAVE_NEON #if HAVE_MSA diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index 12a227b12c..d04be429d1 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -251,6 +251,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_d63_predictor_16x16_c, 16, 8), IntraPredParam(&vpx_d63_predictor_32x32_neon, &vpx_d63_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d117_predictor_4x4_neon, &vpx_d117_predictor_4x4_c,
+ 4, 8), + IntraPredParam(&vpx_d117_predictor_8x8_neon, &vpx_d117_predictor_8x8_c, + 8, 8), + IntraPredParam(&vpx_d117_predictor_16x16_neon, + &vpx_d117_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d117_predictor_32x32_neon, + &vpx_d117_predictor_32x32_c, 32, 8), IntraPredParam(&vpx_d135_predictor_4x4_neon, &vpx_d135_predictor_4x4_c, 4, 8), IntraPredParam(&vpx_d135_predictor_8x8_neon, &vpx_d135_predictor_8x8_c, diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index 02a05aae53..4760a295b9 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -545,6 +545,238 @@ void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, // ----------------------------------------------------------------------------- +void vpx_d117_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d117_predictor_8x8_neon for more details on the implementation. + uint8x8_t az, a0, l0az, d0, d1, d2, d3, col0, col1; + + az = load_unaligned_u8_4x1(above - 1); + a0 = load_unaligned_u8_4x1(above + 0); + // [ left[0], above[-1], above[0], above[1], x, x, x, x ] + l0az = vext_u8(vld1_dup_u8(left), az, 7); + + col0 = vdup_n_u8((above[-1] + 2 * left[0] + left[1] + 2) >> 2); + col1 = vdup_n_u8((left[0] + 2 * left[1] + left[2] + 2) >> 2); + + d0 = vrhadd_u8(az, a0); + d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); + d2 = vext_u8(col0, d0, 7); + d3 = vext_u8(col1, d1, 7); + + store_u8_4x1(dst + 0 * stride, d0); + store_u8_4x1(dst + 1 * stride, d1); + store_u8_4x1(dst + 2 * stride, d2); + store_u8_4x1(dst + 3 * stride, d3); +} + +void vpx_d117_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd; + + az = vld1_u8(above - 1); + a0 = vld1_u8(above + 0); + // [ left[0], above[-1], ... , above[5] ] + l0az = vext_u8(vld1_dup_u8(left), az, 7); + + l0 = vld1_u8(left + 0); + l1 = vld1_u8(left + 1); + // [ above[-1], left[0], ... , left[6] ] + azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7); + + // d0[0] = AVG2(above[-1], above[0]) + // d0[1] = AVG2(above[0], above[1]) + // ... + // d0[7] = AVG2(above[6], above[7]) + d0 = vrhadd_u8(az, a0); + + // d1[0] = AVG3(left[0], above[-1], above[0]) + // d1[1] = AVG3(above[-1], above[0], above[1]) + // ... + // d1[7] = AVG3(above[5], above[6], above[7]) + d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); + + // The ext instruction shifts elements in from the end of the vector rather + // than the start, so reverse the vector to put the elements to be shifted in + // at the end: + // col0[7] = AVG3(above[-1], left[0], left[1]) + // col0[6] = AVG3(left[0], left[1], left[2]) + // ... 
+ // col0[0] = AVG3(left[6], left[7], left[8]) + col0 = vrev64_u8(vrhadd_u8(vhadd_u8(azl0, l1), l0)); + + // We don't care about the first parameter to this uzp since we only ever use + // the high three elements, we just use col0 again since it is already + // available: + // col0_even = [ x, x, x, x, x, col0[3], col0[5], col0[7] ] + // col0_odd = [ x, x, x, x, x, col0[2], col0[4], col0[6] ] + col0_even = vuzp_u8(col0, col0).val[1]; + col0_odd = vuzp_u8(col0, col0).val[0]; + + // Incrementally shift more elements from col0 into d0/1: + // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ] + // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ] + // stride=2 [ col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6] ] + // stride=3 [ col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ] + // stride=4 [ col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5] ] + // stride=5 [ col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5] ] + // stride=6 [ col0[3], col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4] ] + // stride=7 [ col0[2], col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4] ] + vst1_u8(dst + 0 * stride, d0); + vst1_u8(dst + 1 * stride, d1); + vst1_u8(dst + 2 * stride, vext_u8(col0_even, d0, 7)); + vst1_u8(dst + 3 * stride, vext_u8(col0_odd, d1, 7)); + vst1_u8(dst + 4 * stride, vext_u8(col0_even, d0, 6)); + vst1_u8(dst + 5 * stride, vext_u8(col0_odd, d1, 6)); + vst1_u8(dst + 6 * stride, vext_u8(col0_even, d0, 5)); + vst1_u8(dst + 7 * stride, vext_u8(col0_odd, d1, 5)); +} + +void vpx_d117_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d117_predictor_8x8_neon for more details on the implementation. + uint8x16_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd; + + az = vld1q_u8(above - 1); + a0 = vld1q_u8(above + 0); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u8(vld1q_dup_u8(left), az, 15); + + l0 = vld1q_u8(left + 0); + l1 = vld1q_u8(left + 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); + + d0 = vrhaddq_u8(az, a0); + d1 = vrhaddq_u8(vhaddq_u8(l0az, a0), az); + + col0 = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); + col0 = vrev64q_u8(vextq_u8(col0, col0, 8)); + + col0_even = vuzpq_u8(col0, col0).val[1]; + col0_odd = vuzpq_u8(col0, col0).val[0]; + + vst1q_u8(dst + 0 * stride, d0); + vst1q_u8(dst + 1 * stride, d1); + vst1q_u8(dst + 2 * stride, vextq_u8(col0_even, d0, 15)); + vst1q_u8(dst + 3 * stride, vextq_u8(col0_odd, d1, 15)); + vst1q_u8(dst + 4 * stride, vextq_u8(col0_even, d0, 14)); + vst1q_u8(dst + 5 * stride, vextq_u8(col0_odd, d1, 14)); + vst1q_u8(dst + 6 * stride, vextq_u8(col0_even, d0, 13)); + vst1q_u8(dst + 7 * stride, vextq_u8(col0_odd, d1, 13)); + vst1q_u8(dst + 8 * stride, vextq_u8(col0_even, d0, 12)); + vst1q_u8(dst + 9 * stride, vextq_u8(col0_odd, d1, 12)); + vst1q_u8(dst + 10 * stride, vextq_u8(col0_even, d0, 11)); + vst1q_u8(dst + 11 * stride, vextq_u8(col0_odd, d1, 11)); + vst1q_u8(dst + 12 * stride, vextq_u8(col0_even, d0, 10)); + vst1q_u8(dst + 13 * stride, vextq_u8(col0_odd, d1, 10)); + vst1q_u8(dst + 14 * stride, vextq_u8(col0_even, d0, 9)); + vst1q_u8(dst + 15 * stride, vextq_u8(col0_odd, d1, 9)); +} + +void vpx_d117_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d117_predictor_8x8_neon for more details on the implementation. 
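// The AVG2/AVG3 referenced in the comments above are the usual scalar
// averaging primitives, matching the explicit (x + 2 * y + z + 2) >> 2 form
// used for col0/col1 in the 4x4 version; a sketch:
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
// On Neon, vrhadd_u8(a, b) is AVG2 directly, and since
// AVG3(a, b, c) == AVG2((a + c) >> 1, b) with a truncating halve,
// vrhadd_u8(vhadd_u8(a, c), b) computes AVG3 without widening past 8 bits.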
+ uint8x16_t az, a0, a14, a15, a16, l0az, d0_lo, d0_hi, d1_lo, d1_hi, l0, l1, + l15, l16, l17, azl0, col0_lo, col0_hi, col0_even, col0_odd; + + az = vld1q_u8(above - 1); + a0 = vld1q_u8(above + 0); + a14 = vld1q_u8(above + 14); + a15 = vld1q_u8(above + 15); + a16 = vld1q_u8(above + 16); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u8(vld1q_dup_u8(left), az, 15); + + l0 = vld1q_u8(left + 0); + l1 = vld1q_u8(left + 1); + l15 = vld1q_u8(left + 15); + l16 = vld1q_u8(left + 16); + l17 = vld1q_u8(left + 17); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); + + d0_lo = vrhaddq_u8(az, a0); + d0_hi = vrhaddq_u8(a15, a16); + d1_lo = vrhaddq_u8(vhaddq_u8(l0az, a0), az); + d1_hi = vrhaddq_u8(vhaddq_u8(a14, a16), a15); + + col0_lo = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); + col0_hi = vrhaddq_u8(vhaddq_u8(l15, l17), l16); + + col0_lo = vrev64q_u8(vextq_u8(col0_lo, col0_lo, 8)); + col0_hi = vrev64q_u8(vextq_u8(col0_hi, col0_hi, 8)); + + col0_even = vuzpq_u8(col0_hi, col0_lo).val[1]; + col0_odd = vuzpq_u8(col0_hi, col0_lo).val[0]; + + vst1q_u8(dst + 0 * stride + 0, d0_lo); + vst1q_u8(dst + 0 * stride + 16, d0_hi); + vst1q_u8(dst + 1 * stride + 0, d1_lo); + vst1q_u8(dst + 1 * stride + 16, d1_hi); + vst1q_u8(dst + 2 * stride + 0, vextq_u8(col0_even, d0_lo, 15)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_lo, d0_hi, 15)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(col0_odd, d1_lo, 15)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_lo, d1_hi, 15)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(col0_even, d0_lo, 14)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_lo, d0_hi, 14)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(col0_odd, d1_lo, 14)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_lo, d1_hi, 14)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(col0_even, d0_lo, 13)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_lo, d0_hi, 13)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(col0_odd, d1_lo, 13)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_lo, d1_hi, 13)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(col0_even, d0_lo, 12)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_lo, d0_hi, 12)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(col0_odd, d1_lo, 12)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_lo, d1_hi, 12)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(col0_even, d0_lo, 11)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_lo, d0_hi, 11)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(col0_odd, d1_lo, 11)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_lo, d1_hi, 11)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(col0_even, d0_lo, 10)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_lo, d0_hi, 10)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(col0_odd, d1_lo, 10)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_lo, d1_hi, 10)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(col0_even, d0_lo, 9)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_lo, d0_hi, 9)); + vst1q_u8(dst + 15 * stride + 0, vextq_u8(col0_odd, d1_lo, 9)); + vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_lo, d1_hi, 9)); + vst1q_u8(dst + 16 * stride + 0, vextq_u8(col0_even, d0_lo, 8)); + vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_lo, d0_hi, 8)); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(col0_odd, d1_lo, 8)); + vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_lo, d1_hi, 8)); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(col0_even, d0_lo, 7)); + vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_lo, d0_hi, 7)); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(col0_odd, d1_lo, 7)); + vst1q_u8(dst + 19 * stride + 16, 
vextq_u8(d1_lo, d1_hi, 7)); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(col0_even, d0_lo, 6)); + vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_lo, d0_hi, 6)); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(col0_odd, d1_lo, 6)); + vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_lo, d1_hi, 6)); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(col0_even, d0_lo, 5)); + vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_lo, d0_hi, 5)); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(col0_odd, d1_lo, 5)); + vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_lo, d1_hi, 5)); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(col0_even, d0_lo, 4)); + vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_lo, d0_hi, 4)); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(col0_odd, d1_lo, 4)); + vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_lo, d1_hi, 4)); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(col0_even, d0_lo, 3)); + vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_lo, d0_hi, 3)); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(col0_odd, d1_lo, 3)); + vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_lo, d1_hi, 3)); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(col0_even, d0_lo, 2)); + vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_lo, d0_hi, 2)); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(col0_odd, d1_lo, 2)); + vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_lo, d1_hi, 2)); + vst1q_u8(dst + 30 * stride + 0, vextq_u8(col0_even, d0_lo, 1)); + vst1q_u8(dst + 30 * stride + 16, vextq_u8(d0_lo, d0_hi, 1)); + vst1q_u8(dst + 31 * stride + 0, vextq_u8(col0_odd, d1_lo, 1)); + vst1q_u8(dst + 31 * stride + 16, vextq_u8(d1_lo, d1_hi, 1)); +} + +// ----------------------------------------------------------------------------- + void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x8_t XA0123 = vld1_u8(above - 1); diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 9ee9fc1c4f..980380325a 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -57,6 +57,7 @@ () add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vpx_d117_predictor_4x4 neon/; add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_4x4 neon/; @@ -101,6 +102,7 @@ () specialize qw/vpx_h_predictor_8x8 neon dspr2 msa sse2/; add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vpx_d117_predictor_8x8 neon/; add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_8x8 neon/; @@ -141,6 +143,7 @@ () specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2 vsx/; add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vpx_d117_predictor_16x16 neon/; add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_16x16 neon/; @@ -179,6 +182,7 @@ () specialize qw/vpx_h_predictor_32x32 neon msa sse2 vsx/; add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vpx_d117_predictor_32x32 neon/; add_proto 
qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_32x32 neon/;
From 7cdf139e3d6237386e0f93bdb0bdc1b459c663bf Mon Sep 17 00:00:00 2001
From: George Steed
Date: Mon, 20 Feb 2023 11:41:40 +0000
Subject: [PATCH 559/926] Implement highbd_d63_predictor using Neon

Add Neon implementations of the highbd d63 predictor for 4x4, 8x8,
16x16 and 32x32 block sizes. Also update tests to add new corresponding
cases.

Speedups over the C code (higher is better):

Microarch. | Compiler | Block | Speedup
Neoverse N1 | LLVM 15 | 4x4 | 2.43
Neoverse N1 | LLVM 15 | 8x8 | 4.03
Neoverse N1 | LLVM 15 | 16x16 | 3.07
Neoverse N1 | LLVM 15 | 32x32 | 4.11
Neoverse N1 | GCC 12 | 4x4 | 2.92
Neoverse N1 | GCC 12 | 8x8 | 7.20
Neoverse N1 | GCC 12 | 16x16 | 4.43
Neoverse N1 | GCC 12 | 32x32 | 3.18
Neoverse V1 | LLVM 15 | 4x4 | 1.99
Neoverse V1 | LLVM 15 | 8x8 | 3.66
Neoverse V1 | LLVM 15 | 16x16 | 3.60
Neoverse V1 | LLVM 15 | 32x32 | 3.29
Neoverse V1 | GCC 12 | 4x4 | 2.39
Neoverse V1 | GCC 12 | 8x8 | 4.76
Neoverse V1 | GCC 12 | 16x16 | 3.29
Neoverse V1 | GCC 12 | 32x32 | 2.43

Change-Id: Ic59df16ceeb468003754b4374be2f4d9af6589e4
---
 test/test_intra_pred_speed.cc | 44 ++---
 test/vp9_intrapred_test.cc | 24 +++
 vpx_dsp/arm/highbd_intrapred_neon.c | 278 ++++++++++++++++++++++++++++
 vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 +-
 4 files changed, 326 insertions(+), 28 deletions(-)
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index 5861a17770..19dabf88a7 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -565,35 +565,31 @@ HIGHBD_INTRA_PRED_TEST( vpx_highbd_dc_left_predictor_4x4_neon, vpx_highbd_dc_top_predictor_4x4_neon, vpx_highbd_dc_128_predictor_4x4_neon, vpx_highbd_v_predictor_4x4_neon, vpx_highbd_h_predictor_4x4_neon, vpx_highbd_d45_predictor_4x4_neon, - vpx_highbd_d135_predictor_4x4_neon, nullptr, nullptr, nullptr, nullptr, - vpx_highbd_tm_predictor_4x4_neon) + vpx_highbd_d135_predictor_4x4_neon, nullptr, nullptr, nullptr, + vpx_highbd_d63_predictor_4x4_neon, vpx_highbd_tm_predictor_4x4_neon) HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_neon, vpx_highbd_dc_left_predictor_8x8_neon, vpx_highbd_dc_top_predictor_8x8_neon, vpx_highbd_dc_128_predictor_8x8_neon, vpx_highbd_v_predictor_8x8_neon, vpx_highbd_h_predictor_8x8_neon, vpx_highbd_d45_predictor_8x8_neon, - vpx_highbd_d135_predictor_8x8_neon, nullptr, nullptr, nullptr, nullptr, - vpx_highbd_tm_predictor_8x8_neon) -HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred16, - vpx_highbd_dc_predictor_16x16_neon, - vpx_highbd_dc_left_predictor_16x16_neon, - vpx_highbd_dc_top_predictor_16x16_neon, - vpx_highbd_dc_128_predictor_16x16_neon, - vpx_highbd_v_predictor_16x16_neon, - vpx_highbd_h_predictor_16x16_neon, - vpx_highbd_d45_predictor_16x16_neon, - vpx_highbd_d135_predictor_16x16_neon, nullptr, nullptr, - nullptr, nullptr, vpx_highbd_tm_predictor_16x16_neon) -HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred32, - vpx_highbd_dc_predictor_32x32_neon, - vpx_highbd_dc_left_predictor_32x32_neon, - vpx_highbd_dc_top_predictor_32x32_neon, - vpx_highbd_dc_128_predictor_32x32_neon, - vpx_highbd_v_predictor_32x32_neon, - vpx_highbd_h_predictor_32x32_neon, - vpx_highbd_d45_predictor_32x32_neon, - vpx_highbd_d135_predictor_32x32_neon, nullptr, nullptr, - nullptr, nullptr, vpx_highbd_tm_predictor_32x32_neon) + vpx_highbd_d135_predictor_8x8_neon, nullptr, nullptr, nullptr, + vpx_highbd_d63_predictor_8x8_neon,
vpx_highbd_tm_predictor_8x8_neon) +HIGHBD_INTRA_PRED_TEST( + NEON, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_neon, + vpx_highbd_dc_left_predictor_16x16_neon, + vpx_highbd_dc_top_predictor_16x16_neon, + vpx_highbd_dc_128_predictor_16x16_neon, vpx_highbd_v_predictor_16x16_neon, + vpx_highbd_h_predictor_16x16_neon, vpx_highbd_d45_predictor_16x16_neon, + vpx_highbd_d135_predictor_16x16_neon, nullptr, nullptr, nullptr, + vpx_highbd_d63_predictor_16x16_neon, vpx_highbd_tm_predictor_16x16_neon) +HIGHBD_INTRA_PRED_TEST( + NEON, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_neon, + vpx_highbd_dc_left_predictor_32x32_neon, + vpx_highbd_dc_top_predictor_32x32_neon, + vpx_highbd_dc_128_predictor_32x32_neon, vpx_highbd_v_predictor_32x32_neon, + vpx_highbd_h_predictor_32x32_neon, vpx_highbd_d45_predictor_32x32_neon, + vpx_highbd_d135_predictor_32x32_neon, nullptr, nullptr, nullptr, + vpx_highbd_d63_predictor_32x32_neon, vpx_highbd_tm_predictor_32x32_neon) #endif // HAVE_NEON #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index d04be429d1..139358c307 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -848,6 +848,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d45_predictor_16x16_c, 16, 8), HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, &vpx_highbd_d45_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, + &vpx_highbd_d63_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, + &vpx_highbd_d63_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, + &vpx_highbd_d63_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, + &vpx_highbd_d63_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, &vpx_highbd_d135_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, @@ -924,6 +932,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d45_predictor_16x16_c, 16, 10), HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, &vpx_highbd_d45_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, + &vpx_highbd_d63_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, + &vpx_highbd_d63_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, + &vpx_highbd_d63_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, + &vpx_highbd_d63_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, &vpx_highbd_d135_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, @@ -1000,6 +1016,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d45_predictor_16x16_c, 16, 12), HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, &vpx_highbd_d45_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, + &vpx_highbd_d63_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, + &vpx_highbd_d63_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, + &vpx_highbd_d63_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, + &vpx_highbd_d63_predictor_32x32_c, 32, 12), HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, &vpx_highbd_d135_predictor_4x4_c, 4, 12), HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c 
b/vpx_dsp/arm/highbd_intrapred_neon.c index 6f7e5da762..18dca81100 100644 --- a/vpx_dsp/arm/highbd_intrapred_neon.c +++ b/vpx_dsp/arm/highbd_intrapred_neon.c @@ -453,6 +453,284 @@ void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, // ----------------------------------------------------------------------------- +void vpx_highbd_d63_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x4_t a0, a1, a2, a3, d0, d1; + (void)left; + (void)bd; + + a0 = vld1_u16(above + 0); + a1 = vld1_u16(above + 1); + a2 = vld1_u16(above + 2); + a3 = vld1_dup_u16(above + 3); + + d0 = vrhadd_u16(a0, a1); + d1 = vrhadd_u16(vhadd_u16(a0, a2), a1); + + vst1_u16(dst + 0 * stride, d0); + vst1_u16(dst + 1 * stride, d1); + vst1_u16(dst + 2 * stride, vext_u16(d0, a3, 1)); + vst1_u16(dst + 3 * stride, vext_u16(d1, a3, 1)); +} + +void vpx_highbd_d63_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t a0, a1, a2, a7, d0, d1; + (void)left; + (void)bd; + + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a2 = vld1q_u16(above + 2); + a7 = vld1q_dup_u16(above + 7); + + d0 = vrhaddq_u16(a0, a1); + d1 = vrhaddq_u16(vhaddq_u16(a0, a2), a1); + + vst1q_u16(dst + 0 * stride, d0); + vst1q_u16(dst + 1 * stride, d1); + vst1q_u16(dst + 2 * stride, vextq_u16(d0, a7, 1)); + vst1q_u16(dst + 3 * stride, vextq_u16(d1, a7, 1)); + vst1q_u16(dst + 4 * stride, vextq_u16(d0, a7, 2)); + vst1q_u16(dst + 5 * stride, vextq_u16(d1, a7, 2)); + vst1q_u16(dst + 6 * stride, vextq_u16(d0, a7, 3)); + vst1q_u16(dst + 7 * stride, vextq_u16(d1, a7, 3)); +} + +void vpx_highbd_d63_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t a0, a1, a2, a8, a9, a10, a15, d0_lo, d0_hi, d1_lo, d1_hi; + (void)left; + (void)bd; + + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a2 = vld1q_u16(above + 2); + a8 = vld1q_u16(above + 8); + a9 = vld1q_u16(above + 9); + a10 = vld1q_u16(above + 10); + a15 = vld1q_dup_u16(above + 15); + + d0_lo = vrhaddq_u16(a0, a1); + d0_hi = vrhaddq_u16(a8, a9); + d1_lo = vrhaddq_u16(vhaddq_u16(a0, a2), a1); + d1_hi = vrhaddq_u16(vhaddq_u16(a8, a10), a9); + + vst1q_u16(dst + 0 * stride + 0, d0_lo); + vst1q_u16(dst + 0 * stride + 8, d0_hi); + vst1q_u16(dst + 1 * stride + 0, d1_lo); + vst1q_u16(dst + 1 * stride + 8, d1_hi); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0_lo, d0_hi, 1)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0_hi, a15, 1)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1_lo, d1_hi, 1)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1_hi, a15, 1)); + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0_lo, d0_hi, 2)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0_hi, a15, 2)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1_lo, d1_hi, 2)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1_hi, a15, 2)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0_lo, d0_hi, 3)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0_hi, a15, 3)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1_lo, d1_hi, 3)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1_hi, a15, 3)); + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0_lo, d0_hi, 4)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0_hi, a15, 4)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1_lo, d1_hi, 4)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1_hi, a15, 4)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0_lo, d0_hi, 5)); + vst1q_u16(dst + 10 * stride + 8, 
vextq_u16(d0_hi, a15, 5)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1_lo, d1_hi, 5)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1_hi, a15, 5)); + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0_lo, d0_hi, 6)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0_hi, a15, 6)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1_lo, d1_hi, 6)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1_hi, a15, 6)); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0_lo, d0_hi, 7)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0_hi, a15, 7)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1_lo, d1_hi, 7)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1_hi, a15, 7)); +} + +void vpx_highbd_d63_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t a0, a1, a2, a8, a9, a10, a16, a17, a18, a24, a25, a26, a31, d0[4], + d1[4]; + (void)left; + (void)bd; + + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a2 = vld1q_u16(above + 2); + a8 = vld1q_u16(above + 8); + a9 = vld1q_u16(above + 9); + a10 = vld1q_u16(above + 10); + a16 = vld1q_u16(above + 16); + a17 = vld1q_u16(above + 17); + a18 = vld1q_u16(above + 18); + a24 = vld1q_u16(above + 24); + a25 = vld1q_u16(above + 25); + a26 = vld1q_u16(above + 26); + a31 = vld1q_dup_u16(above + 31); + + d0[0] = vrhaddq_u16(a0, a1); + d0[1] = vrhaddq_u16(a8, a9); + d0[2] = vrhaddq_u16(a16, a17); + d0[3] = vrhaddq_u16(a24, a25); + d1[0] = vrhaddq_u16(vhaddq_u16(a0, a2), a1); + d1[1] = vrhaddq_u16(vhaddq_u16(a8, a10), a9); + d1[2] = vrhaddq_u16(vhaddq_u16(a16, a18), a17); + d1[3] = vrhaddq_u16(vhaddq_u16(a24, a26), a25); + + vst1q_u16(dst + 0 * stride + 0, d0[0]); + vst1q_u16(dst + 0 * stride + 8, d0[1]); + vst1q_u16(dst + 0 * stride + 16, d0[2]); + vst1q_u16(dst + 0 * stride + 24, d0[3]); + vst1q_u16(dst + 1 * stride + 0, d1[0]); + vst1q_u16(dst + 1 * stride + 8, d1[1]); + vst1q_u16(dst + 1 * stride + 16, d1[2]); + vst1q_u16(dst + 1 * stride + 24, d1[3]); + + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], d0[2], 1)); + vst1q_u16(dst + 2 * stride + 16, vextq_u16(d0[2], d0[3], 1)); + vst1q_u16(dst + 2 * stride + 24, vextq_u16(d0[3], a31, 1)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[1], d1[2], 1)); + vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[2], d1[3], 1)); + vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1[3], a31, 1)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], d0[2], 2)); + vst1q_u16(dst + 4 * stride + 16, vextq_u16(d0[2], d0[3], 2)); + vst1q_u16(dst + 4 * stride + 24, vextq_u16(d0[3], a31, 2)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1[0], d1[1], 2)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1[1], d1[2], 2)); + vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[2], d1[3], 2)); + vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1[3], a31, 2)); + + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], d0[2], 3)); + vst1q_u16(dst + 6 * stride + 16, vextq_u16(d0[2], d0[3], 3)); + vst1q_u16(dst + 6 * stride + 24, vextq_u16(d0[3], a31, 3)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[2], d1[3], 3)); + vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1[3], a31, 3)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[0], d0[1], 
4)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0[1], d0[2], 4)); + vst1q_u16(dst + 8 * stride + 16, vextq_u16(d0[2], d0[3], 4)); + vst1q_u16(dst + 8 * stride + 24, vextq_u16(d0[3], a31, 4)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1[0], d1[1], 4)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1[1], d1[2], 4)); + vst1q_u16(dst + 9 * stride + 16, vextq_u16(d1[2], d1[3], 4)); + vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1[3], a31, 4)); + + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0[1], d0[2], 5)); + vst1q_u16(dst + 10 * stride + 16, vextq_u16(d0[2], d0[3], 5)); + vst1q_u16(dst + 10 * stride + 24, vextq_u16(d0[3], a31, 5)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 11 * stride + 16, vextq_u16(d1[2], d1[3], 5)); + vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1[3], a31, 5)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0[1], d0[2], 6)); + vst1q_u16(dst + 12 * stride + 16, vextq_u16(d0[2], d0[3], 6)); + vst1q_u16(dst + 12 * stride + 24, vextq_u16(d0[3], a31, 6)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1[0], d1[1], 6)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1[1], d1[2], 6)); + vst1q_u16(dst + 13 * stride + 16, vextq_u16(d1[2], d1[3], 6)); + vst1q_u16(dst + 13 * stride + 24, vextq_u16(d1[3], a31, 6)); + + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0[1], d0[2], 7)); + vst1q_u16(dst + 14 * stride + 16, vextq_u16(d0[2], d0[3], 7)); + vst1q_u16(dst + 14 * stride + 24, vextq_u16(d0[3], a31, 7)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 15 * stride + 16, vextq_u16(d1[2], d1[3], 7)); + vst1q_u16(dst + 15 * stride + 24, vextq_u16(d1[3], a31, 7)); + + vst1q_u16(dst + 16 * stride + 0, d0[1]); + vst1q_u16(dst + 16 * stride + 8, d0[2]); + vst1q_u16(dst + 16 * stride + 16, d0[3]); + vst1q_u16(dst + 16 * stride + 24, a31); + vst1q_u16(dst + 17 * stride + 0, d1[1]); + vst1q_u16(dst + 17 * stride + 8, d1[2]); + vst1q_u16(dst + 17 * stride + 16, d1[3]); + vst1q_u16(dst + 17 * stride + 24, a31); + + vst1q_u16(dst + 18 * stride + 0, vextq_u16(d0[1], d0[2], 1)); + vst1q_u16(dst + 18 * stride + 8, vextq_u16(d0[2], d0[3], 1)); + vst1q_u16(dst + 18 * stride + 16, vextq_u16(d0[3], a31, 1)); + vst1q_u16(dst + 18 * stride + 24, a31); + vst1q_u16(dst + 19 * stride + 0, vextq_u16(d1[1], d1[2], 1)); + vst1q_u16(dst + 19 * stride + 8, vextq_u16(d1[2], d1[3], 1)); + vst1q_u16(dst + 19 * stride + 16, vextq_u16(d1[3], a31, 1)); + vst1q_u16(dst + 19 * stride + 24, a31); + + vst1q_u16(dst + 20 * stride + 0, vextq_u16(d0[1], d0[2], 2)); + vst1q_u16(dst + 20 * stride + 8, vextq_u16(d0[2], d0[3], 2)); + vst1q_u16(dst + 20 * stride + 16, vextq_u16(d0[3], a31, 2)); + vst1q_u16(dst + 20 * stride + 24, a31); + vst1q_u16(dst + 21 * stride + 0, vextq_u16(d1[1], d1[2], 2)); + vst1q_u16(dst + 21 * stride + 8, vextq_u16(d1[2], d1[3], 2)); + vst1q_u16(dst + 21 * stride + 16, vextq_u16(d1[3], a31, 2)); + vst1q_u16(dst + 21 * stride + 24, a31); + + vst1q_u16(dst + 22 * stride + 0, vextq_u16(d0[1], d0[2], 3)); + vst1q_u16(dst + 22 * stride + 8, vextq_u16(d0[2], d0[3], 3)); + vst1q_u16(dst + 22 * stride + 16, vextq_u16(d0[3], a31, 3)); + vst1q_u16(dst + 22 * stride + 24, a31); + vst1q_u16(dst + 23 * stride + 0, 
vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 23 * stride + 8, vextq_u16(d1[2], d1[3], 3)); + vst1q_u16(dst + 23 * stride + 16, vextq_u16(d1[3], a31, 3)); + vst1q_u16(dst + 23 * stride + 24, a31); + + vst1q_u16(dst + 24 * stride + 0, vextq_u16(d0[1], d0[2], 4)); + vst1q_u16(dst + 24 * stride + 8, vextq_u16(d0[2], d0[3], 4)); + vst1q_u16(dst + 24 * stride + 16, vextq_u16(d0[3], a31, 4)); + vst1q_u16(dst + 24 * stride + 24, a31); + vst1q_u16(dst + 25 * stride + 0, vextq_u16(d1[1], d1[2], 4)); + vst1q_u16(dst + 25 * stride + 8, vextq_u16(d1[2], d1[3], 4)); + vst1q_u16(dst + 25 * stride + 16, vextq_u16(d1[3], a31, 4)); + vst1q_u16(dst + 25 * stride + 24, a31); + + vst1q_u16(dst + 26 * stride + 0, vextq_u16(d0[1], d0[2], 5)); + vst1q_u16(dst + 26 * stride + 8, vextq_u16(d0[2], d0[3], 5)); + vst1q_u16(dst + 26 * stride + 16, vextq_u16(d0[3], a31, 5)); + vst1q_u16(dst + 26 * stride + 24, a31); + vst1q_u16(dst + 27 * stride + 0, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 27 * stride + 8, vextq_u16(d1[2], d1[3], 5)); + vst1q_u16(dst + 27 * stride + 16, vextq_u16(d1[3], a31, 5)); + vst1q_u16(dst + 27 * stride + 24, a31); + + vst1q_u16(dst + 28 * stride + 0, vextq_u16(d0[1], d0[2], 6)); + vst1q_u16(dst + 28 * stride + 8, vextq_u16(d0[2], d0[3], 6)); + vst1q_u16(dst + 28 * stride + 16, vextq_u16(d0[3], a31, 6)); + vst1q_u16(dst + 28 * stride + 24, a31); + vst1q_u16(dst + 29 * stride + 0, vextq_u16(d1[1], d1[2], 6)); + vst1q_u16(dst + 29 * stride + 8, vextq_u16(d1[2], d1[3], 6)); + vst1q_u16(dst + 29 * stride + 16, vextq_u16(d1[3], a31, 6)); + vst1q_u16(dst + 29 * stride + 24, a31); + + vst1q_u16(dst + 30 * stride + 0, vextq_u16(d0[1], d0[2], 7)); + vst1q_u16(dst + 30 * stride + 8, vextq_u16(d0[2], d0[3], 7)); + vst1q_u16(dst + 30 * stride + 16, vextq_u16(d0[3], a31, 7)); + vst1q_u16(dst + 30 * stride + 24, a31); + vst1q_u16(dst + 31 * stride + 0, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 31 * stride + 8, vextq_u16(d1[2], d1[3], 7)); + vst1q_u16(dst + 31 * stride + 16, vextq_u16(d1[3], a31, 7)); + vst1q_u16(dst + 31 * stride + 24, a31); +} + +// ----------------------------------------------------------------------------- + void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 980380325a..71c3a84638 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -217,7 +217,7 @@ () specialize qw/vpx_highbd_d45_predictor_4x4 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d63_predictor_4x4 sse2/; + specialize qw/vpx_highbd_d63_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_4x4 neon sse2/; @@ -256,7 +256,7 @@ () specialize qw/vpx_highbd_d45_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d63_predictor_8x8 ssse3/; + specialize qw/vpx_highbd_d63_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_8x8 neon sse2/; @@ -295,7 +295,7 @@ () specialize 
qw/vpx_highbd_d45_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d63_predictor_16x16 ssse3/; + specialize qw/vpx_highbd_d63_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_16x16 neon sse2/; @@ -334,7 +334,7 @@ () specialize qw/vpx_highbd_d45_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d63_predictor_32x32 ssse3/; + specialize qw/vpx_highbd_d63_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_32x32 neon sse2/;
From 74e4587c89d38ce6922171e9f65b8cff5f2ffcc8 Mon Sep 17 00:00:00 2001
From: George Steed
Date: Tue, 21 Feb 2023 11:17:10 +0000
Subject: [PATCH 560/926] Implement highbd_d117_predictor using Neon

Add Neon implementations of the highbd d117 predictor for 4x4, 8x8,
16x16 and 32x32 block sizes. Also update tests to add new corresponding
cases. An explanation of the general implementation strategy is given
in the 8x8 implementation body, and is mostly identical to the
non-highbd version.

Speedups over the C code (higher is better):

  Microarch.  | Compiler | Block | Speedup
  Neoverse N1 | LLVM 15  | 4x4   | 1.99
  Neoverse N1 | LLVM 15  | 8x8   | 4.37
  Neoverse N1 | LLVM 15  | 16x16 | 6.81
  Neoverse N1 | LLVM 15  | 32x32 | 6.49
  Neoverse N1 | GCC 12   | 4x4   | 2.49
  Neoverse N1 | GCC 12   | 8x8   | 4.10
  Neoverse N1 | GCC 12   | 16x16 | 5.58
  Neoverse N1 | GCC 12   | 32x32 | 2.16
  Neoverse V1 | LLVM 15  | 4x4   | 1.99
  Neoverse V1 | LLVM 15  | 8x8   | 5.03
  Neoverse V1 | LLVM 15  | 16x16 | 6.61
  Neoverse V1 | LLVM 15  | 32x32 | 6.01
  Neoverse V1 | GCC 12   | 4x4   | 2.09
  Neoverse V1 | GCC 12   | 8x8   | 4.52
  Neoverse V1 | GCC 12   | 16x16 | 4.23
  Neoverse V1 | GCC 12   | 32x32 | 2.70

Change-Id: I892fbd2c17ac527ddc22b91acca907ffc84c5cd2
---
 test/test_intra_pred_speed.cc       |  20 +-
 test/vp9_intrapred_test.cc          |  24 ++
 vpx_dsp/arm/highbd_intrapred_neon.c | 382 ++++++++++++++++++++++++++++
 vpx_dsp/vpx_dsp_rtcd_defs.pl        |   8 +-
 4 files changed, 422 insertions(+), 12 deletions(-)
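A scalar view of the pattern helps before reading the Neon code. The sketch below follows the shape of the C reference in vpx_dsp/intrapred.c, adapted here to uint16_t and lightly abbreviated; it is an editorial illustration, not part of the patch (the name d117_sketch is ours, and AVG2/AVG3 are the usual rounding averages named in the comments of the new functions). The property to notice: beyond the first two rows, each row equals the row two above it shifted right by one sample, which is exactly what the vext-based shuffles in the Neon implementation exploit.

#include <stddef.h>
#include <stdint.h>

#define AVG2(a, b) (((a) + (b) + 1) >> 1)
#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)

static void d117_sketch(uint16_t *dst, ptrdiff_t stride, int bs,
                        const uint16_t *above, const uint16_t *left) {
  int r, c;
  // Row 0: AVG2 of adjacent above samples.
  for (c = 0; c < bs; c++) dst[c] = AVG2(above[c - 1], above[c]);
  dst += stride;
  // Row 1: AVG3 across the above row; left[0] enters at column 0.
  dst[0] = AVG3(left[0], above[-1], above[0]);
  for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
  dst += stride;
  // Remaining column-0 samples come from AVG3 down the left edge.
  dst[0] = AVG3(above[-1], left[0], left[1]);
  for (r = 3; r < bs; ++r)
    dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]);
  // Every other sample repeats the row two above, shifted right by one.
  for (r = 2; r < bs; ++r) {
    for (c = 1; c < bs; c++) dst[c] = dst[-2 * stride + c - 1];
    dst += stride;
  }
}

The Neon versions therefore compute only rows 0 and 1 (d0/d1) and the column-0 values (col0) with vrhadd/vhadd, then emit every remaining row as a single vext of those precomputed vectors.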
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index 19dabf88a7..e721a459ad 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -565,31 +565,35 @@ HIGHBD_INTRA_PRED_TEST( vpx_highbd_dc_left_predictor_4x4_neon, vpx_highbd_dc_top_predictor_4x4_neon, vpx_highbd_dc_128_predictor_4x4_neon, vpx_highbd_v_predictor_4x4_neon, vpx_highbd_h_predictor_4x4_neon, vpx_highbd_d45_predictor_4x4_neon, - vpx_highbd_d135_predictor_4x4_neon, nullptr, nullptr, nullptr, - vpx_highbd_d63_predictor_4x4_neon, vpx_highbd_tm_predictor_4x4_neon) + vpx_highbd_d135_predictor_4x4_neon, vpx_highbd_d117_predictor_4x4_neon, + nullptr, nullptr, vpx_highbd_d63_predictor_4x4_neon, + vpx_highbd_tm_predictor_4x4_neon) HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_neon, vpx_highbd_dc_left_predictor_8x8_neon, vpx_highbd_dc_top_predictor_8x8_neon, vpx_highbd_dc_128_predictor_8x8_neon, vpx_highbd_v_predictor_8x8_neon, vpx_highbd_h_predictor_8x8_neon, vpx_highbd_d45_predictor_8x8_neon, - vpx_highbd_d135_predictor_8x8_neon, nullptr, nullptr,
nullptr, - vpx_highbd_d63_predictor_8x8_neon, vpx_highbd_tm_predictor_8x8_neon) + vpx_highbd_d135_predictor_8x8_neon, vpx_highbd_d117_predictor_8x8_neon, + nullptr, nullptr, vpx_highbd_d63_predictor_8x8_neon, + vpx_highbd_tm_predictor_8x8_neon) HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_neon, vpx_highbd_dc_left_predictor_16x16_neon, vpx_highbd_dc_top_predictor_16x16_neon, vpx_highbd_dc_128_predictor_16x16_neon, vpx_highbd_v_predictor_16x16_neon, vpx_highbd_h_predictor_16x16_neon, vpx_highbd_d45_predictor_16x16_neon, - vpx_highbd_d135_predictor_16x16_neon, nullptr, nullptr, nullptr, - vpx_highbd_d63_predictor_16x16_neon, vpx_highbd_tm_predictor_16x16_neon) + vpx_highbd_d135_predictor_16x16_neon, vpx_highbd_d117_predictor_16x16_neon, + nullptr, nullptr, vpx_highbd_d63_predictor_16x16_neon, + vpx_highbd_tm_predictor_16x16_neon) HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_neon, vpx_highbd_dc_left_predictor_32x32_neon, vpx_highbd_dc_top_predictor_32x32_neon, vpx_highbd_dc_128_predictor_32x32_neon, vpx_highbd_v_predictor_32x32_neon, vpx_highbd_h_predictor_32x32_neon, vpx_highbd_d45_predictor_32x32_neon, - vpx_highbd_d135_predictor_32x32_neon, nullptr, nullptr, nullptr, - vpx_highbd_d63_predictor_32x32_neon, vpx_highbd_tm_predictor_32x32_neon) + vpx_highbd_d135_predictor_32x32_neon, vpx_highbd_d117_predictor_32x32_neon, + nullptr, nullptr, vpx_highbd_d63_predictor_32x32_neon, + vpx_highbd_tm_predictor_32x32_neon) #endif // HAVE_NEON #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index 139358c307..c4e0e78ac5 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -856,6 +856,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d63_predictor_16x16_c, 16, 8), HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, &vpx_highbd_d63_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, + &vpx_highbd_d117_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, + &vpx_highbd_d117_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_neon, + &vpx_highbd_d117_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_neon, + &vpx_highbd_d117_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, &vpx_highbd_d135_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, @@ -940,6 +948,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d63_predictor_16x16_c, 16, 10), HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, &vpx_highbd_d63_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, + &vpx_highbd_d117_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, + &vpx_highbd_d117_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_neon, + &vpx_highbd_d117_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_neon, + &vpx_highbd_d117_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, &vpx_highbd_d135_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, @@ -1024,6 +1040,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d63_predictor_16x16_c, 16, 12), HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, &vpx_highbd_d63_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, +
&vpx_highbd_d117_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, + &vpx_highbd_d117_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_neon, + &vpx_highbd_d117_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_neon, + &vpx_highbd_d117_predictor_32x32_c, 32, 12), HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, &vpx_highbd_d135_predictor_4x4_c, 4, 12), HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c index 18dca81100..424bf5f4bc 100644 --- a/vpx_dsp/arm/highbd_intrapred_neon.c +++ b/vpx_dsp/arm/highbd_intrapred_neon.c @@ -731,6 +731,388 @@ void vpx_highbd_d63_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, // ----------------------------------------------------------------------------- +void vpx_highbd_d117_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x4_t az, a0, l0az, l0, l1, azl0, col0, col0_even, col0_odd, d0, d1; + (void)bd; + + az = vld1_u16(above - 1); + a0 = vld1_u16(above + 0); + // [ left[0], above[-1], above[0], above[1] ] + l0az = vext_u16(vld1_dup_u16(left), az, 3); + + l0 = vld1_u16(left + 0); + l1 = vld1_u16(left + 1); + // [ above[-1], left[0], left[1], left[2] ] + azl0 = vext_u16(vld1_dup_u16(above - 1), l0, 3); + + d0 = vrhadd_u16(az, a0); + d1 = vrhadd_u16(vhadd_u16(l0az, a0), az); + + col0 = vrhadd_u16(vhadd_u16(azl0, l1), l0); + col0_even = vdup_lane_u16(col0, 0); + col0_odd = vdup_lane_u16(col0, 1); + + vst1_u16(dst + 0 * stride, d0); + vst1_u16(dst + 1 * stride, d1); + vst1_u16(dst + 2 * stride, vext_u16(col0_even, d0, 3)); + vst1_u16(dst + 3 * stride, vext_u16(col0_odd, d1, 3)); +} + +void vpx_highbd_d117_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t az, a0, l0az, l0, l1, azl0, col0, col0_even, col0_odd, d0, d1; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + // [ left[0], above[-1], ..., above[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + // [ above[-1], left[0], ..., left[6] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + // d0[0] = AVG2(above[-1], above[0]) + // ... + // d0[7] = AVG2(above[6], above[7]) + d0 = vrhaddq_u16(az, a0); + + // d1[0] = AVG3(left[0], above[-1], above[0]) + // d1[1] = AVG3(above[-1], above[0], above[1]) + // ... + // d1[7] = AVG3(above[5], above[6], above[7]) + d1 = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + + // The ext instruction shifts elements in from the end of the vector rather + // than the start, so reverse the vector to put the elements to be shifted in + // at the end: + // col0[7] = AVG3(above[-1], left[0], left[1]) + // col0[6] = AVG3(left[0], left[1], left[2]) + // ...
+ // col0[0] = AVG3(left[6], left[7], left[8]) + col0 = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + col0 = vrev64q_u16(vextq_u16(col0, col0, 4)); + + // We don't care about the first parameter to this uzp since we only ever use + // the high three elements, we just use col0 again since it is already + // available: + // col0_even = [ x, x, x, x, x, col0[3], col0[5], col0[7] ] + // col0_odd = [ x, x, x, x, x, col0[2], col0[4], col0[6] ] + col0_even = vuzpq_u16(col0, col0).val[1]; + col0_odd = vuzpq_u16(col0, col0).val[0]; + + // Incrementally shift more elements from col0 into d0/1: + // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ] + // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ] + // stride=2 [ col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6] ] + // stride=3 [ col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ] + // stride=4 [ col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5] ] + // stride=5 [ col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5] ] + // stride=6 [ col0[3], col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4] ] + // stride=7 [ col0[2], col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4] ] + vst1q_u16(dst + 0 * stride, d0); + vst1q_u16(dst + 1 * stride, d1); + vst1q_u16(dst + 2 * stride, vextq_u16(col0_even, d0, 7)); + vst1q_u16(dst + 3 * stride, vextq_u16(col0_odd, d1, 7)); + vst1q_u16(dst + 4 * stride, vextq_u16(col0_even, d0, 6)); + vst1q_u16(dst + 5 * stride, vextq_u16(col0_odd, d1, 6)); + vst1q_u16(dst + 6 * stride, vextq_u16(col0_even, d0, 5)); + vst1q_u16(dst + 7 * stride, vextq_u16(col0_odd, d1, 5)); +} + +void vpx_highbd_d117_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t az, a0, a6, a7, a8, l0az, l0, l1, l7, l8, l9, azl0, col0_lo, + col0_hi, col0_even, col0_odd, d0_lo, d0_hi, d1_lo, d1_hi; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + a6 = vld1q_u16(above + 6); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + // [ left[0], above[-1], ..., above[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l7 = vld1q_u16(left + 7); + l8 = vld1q_u16(left + 8); + l9 = vld1q_u16(left + 9); + // [ above[-1], left[0], ..., left[6] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + d0_lo = vrhaddq_u16(az, a0); + d0_hi = vrhaddq_u16(a7, a8); + d1_lo = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + d1_hi = vrhaddq_u16(vhaddq_u16(a6, a8), a7); + + col0_lo = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + col0_hi = vrhaddq_u16(vhaddq_u16(l7, l9), l8); + + // Reverse within each vector, then swap the array indices in the uzp to + // complete the reversal across all 16 elements.
+ col0_lo = vrev64q_u16(vextq_u16(col0_lo, col0_lo, 4)); + col0_hi = vrev64q_u16(vextq_u16(col0_hi, col0_hi, 4)); + col0_even = vuzpq_u16(col0_hi, col0_lo).val[1]; + col0_odd = vuzpq_u16(col0_hi, col0_lo).val[0]; + + vst1q_u16(dst + 0 * stride + 0, d0_lo); + vst1q_u16(dst + 0 * stride + 8, d0_hi); + vst1q_u16(dst + 1 * stride + 0, d1_lo); + vst1q_u16(dst + 1 * stride + 8, d1_hi); + + vst1q_u16(dst + 2 * stride + 0, vextq_u16(col0_even, d0_lo, 7)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0_lo, d0_hi, 7)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(col0_odd, d1_lo, 7)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1_lo, d1_hi, 7)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(col0_even, d0_lo, 6)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0_lo, d0_hi, 6)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(col0_odd, d1_lo, 6)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1_lo, d1_hi, 6)); + + vst1q_u16(dst + 6 * stride + 0, vextq_u16(col0_even, d0_lo, 5)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0_lo, d0_hi, 5)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(col0_odd, d1_lo, 5)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1_lo, d1_hi, 5)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(col0_even, d0_lo, 4)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0_lo, d0_hi, 4)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(col0_odd, d1_lo, 4)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1_lo, d1_hi, 4)); + + vst1q_u16(dst + 10 * stride + 0, vextq_u16(col0_even, d0_lo, 3)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0_lo, d0_hi, 3)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(col0_odd, d1_lo, 3)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1_lo, d1_hi, 3)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(col0_even, d0_lo, 2)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0_lo, d0_hi, 2)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(col0_odd, d1_lo, 2)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1_lo, d1_hi, 2)); + + vst1q_u16(dst + 14 * stride + 0, vextq_u16(col0_even, d0_lo, 1)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0_lo, d0_hi, 1)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(col0_odd, d1_lo, 1)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1_lo, d1_hi, 1)); +} + +void vpx_highbd_d117_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t az, a0, a6, a7, a8, a14, a15, a16, a22, a23, a24, l0az, l0, l1, l7, + l8, l9, l15, l16, l17, l23, l24, l25, azl0, d0[4], d1[4], col0[4], + col0_even[2], col0_odd[2]; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + a6 = vld1q_u16(above + 6); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + a14 = vld1q_u16(above + 14); + a15 = vld1q_u16(above + 15); + a16 = vld1q_u16(above + 16); + a22 = vld1q_u16(above + 22); + a23 = vld1q_u16(above + 23); + a24 = vld1q_u16(above + 24); + // [ left[0], above[-1], ..., above[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l7 = vld1q_u16(left + 7); + l8 = vld1q_u16(left + 8); + l9 = vld1q_u16(left + 9); + l15 = vld1q_u16(left + 15); + l16 = vld1q_u16(left + 16); + l17 = vld1q_u16(left + 17); + l23 = vld1q_u16(left + 23); + l24 = vld1q_u16(left + 24); + l25 = vld1q_u16(left + 25); + // [ above[-1], left[0], ..., left[6] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + d0[0] = vrhaddq_u16(az, a0); + d0[1] = vrhaddq_u16(a7, a8); + d0[2] = vrhaddq_u16(a15, a16); + d0[3] = vrhaddq_u16(a23, a24); + d1[0] =
vrhaddq_u16(vhaddq_u16(l0az, a0), az); + d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7); + d1[2] = vrhaddq_u16(vhaddq_u16(a14, a16), a15); + d1[3] = vrhaddq_u16(vhaddq_u16(a22, a24), a23); + + col0[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + col0[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8); + col0[2] = vrhaddq_u16(vhaddq_u16(l15, l17), l16); + col0[3] = vrhaddq_u16(vhaddq_u16(l23, l25), l24); + + // Reverse within each vector, then swap the array indices in both the uzp + // and the col0_{even,odd} assignment to complete the reversal across all + // 32-elements. + col0[0] = vrev64q_u16(vextq_u16(col0[0], col0[0], 4)); + col0[1] = vrev64q_u16(vextq_u16(col0[1], col0[1], 4)); + col0[2] = vrev64q_u16(vextq_u16(col0[2], col0[2], 4)); + col0[3] = vrev64q_u16(vextq_u16(col0[3], col0[3], 4)); + + col0_even[1] = vuzpq_u16(col0[1], col0[0]).val[1]; + col0_even[0] = vuzpq_u16(col0[3], col0[2]).val[1]; + col0_odd[1] = vuzpq_u16(col0[1], col0[0]).val[0]; + col0_odd[0] = vuzpq_u16(col0[3], col0[2]).val[0]; + + vst1q_u16(dst + 0 * stride + 0, d0[0]); + vst1q_u16(dst + 0 * stride + 8, d0[1]); + vst1q_u16(dst + 0 * stride + 16, d0[2]); + vst1q_u16(dst + 0 * stride + 24, d0[3]); + vst1q_u16(dst + 1 * stride + 0, d1[0]); + vst1q_u16(dst + 1 * stride + 8, d1[1]); + vst1q_u16(dst + 1 * stride + 16, d1[2]); + vst1q_u16(dst + 1 * stride + 24, d1[3]); + + vst1q_u16(dst + 2 * stride + 0, vextq_u16(col0_even[1], d0[0], 7)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 2 * stride + 16, vextq_u16(d0[1], d0[2], 7)); + vst1q_u16(dst + 2 * stride + 24, vextq_u16(d0[2], d0[3], 7)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(col0_odd[1], d1[0], 7)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1[2], d1[3], 7)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(col0_even[1], d0[0], 6)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 4 * stride + 16, vextq_u16(d0[1], d0[2], 6)); + vst1q_u16(dst + 4 * stride + 24, vextq_u16(d0[2], d0[3], 6)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(col0_odd[1], d1[0], 6)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1[0], d1[1], 6)); + vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[1], d1[2], 6)); + vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1[2], d1[3], 6)); + + vst1q_u16(dst + 6 * stride + 0, vextq_u16(col0_even[1], d0[0], 5)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 6 * stride + 16, vextq_u16(d0[1], d0[2], 5)); + vst1q_u16(dst + 6 * stride + 24, vextq_u16(d0[2], d0[3], 5)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(col0_odd[1], d1[0], 5)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1[2], d1[3], 5)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(col0_even[1], d0[0], 4)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 8 * stride + 16, vextq_u16(d0[1], d0[2], 4)); + vst1q_u16(dst + 8 * stride + 24, vextq_u16(d0[2], d0[3], 4)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(col0_odd[1], d1[0], 4)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1[0], d1[1], 4)); + vst1q_u16(dst + 9 * stride + 16, vextq_u16(d1[1], d1[2], 4)); + vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1[2], d1[3], 4)); + + vst1q_u16(dst + 10 * stride + 0, vextq_u16(col0_even[1], d0[0], 3)); + vst1q_u16(dst + 10 * stride + 8, 
vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 10 * stride + 16, vextq_u16(d0[1], d0[2], 3)); + vst1q_u16(dst + 10 * stride + 24, vextq_u16(d0[2], d0[3], 3)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(col0_odd[1], d1[0], 3)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 11 * stride + 16, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1[2], d1[3], 3)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(col0_even[1], d0[0], 2)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 12 * stride + 16, vextq_u16(d0[1], d0[2], 2)); + vst1q_u16(dst + 12 * stride + 24, vextq_u16(d0[2], d0[3], 2)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(col0_odd[1], d1[0], 2)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1[0], d1[1], 2)); + vst1q_u16(dst + 13 * stride + 16, vextq_u16(d1[1], d1[2], 2)); + vst1q_u16(dst + 13 * stride + 24, vextq_u16(d1[2], d1[3], 2)); + + vst1q_u16(dst + 14 * stride + 0, vextq_u16(col0_even[1], d0[0], 1)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 14 * stride + 16, vextq_u16(d0[1], d0[2], 1)); + vst1q_u16(dst + 14 * stride + 24, vextq_u16(d0[2], d0[3], 1)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(col0_odd[1], d1[0], 1)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 15 * stride + 16, vextq_u16(d1[1], d1[2], 1)); + vst1q_u16(dst + 15 * stride + 24, vextq_u16(d1[2], d1[3], 1)); + + vst1q_u16(dst + 16 * stride + 0, col0_even[1]); + vst1q_u16(dst + 16 * stride + 8, d0[0]); + vst1q_u16(dst + 16 * stride + 16, d0[1]); + vst1q_u16(dst + 16 * stride + 24, d0[2]); + vst1q_u16(dst + 17 * stride + 0, col0_odd[1]); + vst1q_u16(dst + 17 * stride + 8, d1[0]); + vst1q_u16(dst + 17 * stride + 16, d1[1]); + vst1q_u16(dst + 17 * stride + 24, d1[2]); + + vst1q_u16(dst + 18 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 7)); + vst1q_u16(dst + 18 * stride + 8, vextq_u16(col0_even[1], d0[0], 7)); + vst1q_u16(dst + 18 * stride + 16, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 18 * stride + 24, vextq_u16(d0[1], d0[2], 7)); + vst1q_u16(dst + 19 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 7)); + vst1q_u16(dst + 19 * stride + 8, vextq_u16(col0_odd[1], d1[0], 7)); + vst1q_u16(dst + 19 * stride + 16, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 19 * stride + 24, vextq_u16(d1[1], d1[2], 7)); + + vst1q_u16(dst + 20 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 6)); + vst1q_u16(dst + 20 * stride + 8, vextq_u16(col0_even[1], d0[0], 6)); + vst1q_u16(dst + 20 * stride + 16, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 20 * stride + 24, vextq_u16(d0[1], d0[2], 6)); + vst1q_u16(dst + 21 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 6)); + vst1q_u16(dst + 21 * stride + 8, vextq_u16(col0_odd[1], d1[0], 6)); + vst1q_u16(dst + 21 * stride + 16, vextq_u16(d1[0], d1[1], 6)); + vst1q_u16(dst + 21 * stride + 24, vextq_u16(d1[1], d1[2], 6)); + + vst1q_u16(dst + 22 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 5)); + vst1q_u16(dst + 22 * stride + 8, vextq_u16(col0_even[1], d0[0], 5)); + vst1q_u16(dst + 22 * stride + 16, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 22 * stride + 24, vextq_u16(d0[1], d0[2], 5)); + vst1q_u16(dst + 23 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 5)); + vst1q_u16(dst + 23 * stride + 8, vextq_u16(col0_odd[1], d1[0], 5)); + vst1q_u16(dst + 23 * stride + 16, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 23 * stride + 24, vextq_u16(d1[1], d1[2], 5)); + + vst1q_u16(dst + 24 * stride + 
0, vextq_u16(col0_even[0], col0_even[1], 4)); + vst1q_u16(dst + 24 * stride + 8, vextq_u16(col0_even[1], d0[0], 4)); + vst1q_u16(dst + 24 * stride + 16, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 24 * stride + 24, vextq_u16(d0[1], d0[2], 4)); + vst1q_u16(dst + 25 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 4)); + vst1q_u16(dst + 25 * stride + 8, vextq_u16(col0_odd[1], d1[0], 4)); + vst1q_u16(dst + 25 * stride + 16, vextq_u16(d1[0], d1[1], 4)); + vst1q_u16(dst + 25 * stride + 24, vextq_u16(d1[1], d1[2], 4)); + + vst1q_u16(dst + 26 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 3)); + vst1q_u16(dst + 26 * stride + 8, vextq_u16(col0_even[1], d0[0], 3)); + vst1q_u16(dst + 26 * stride + 16, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 26 * stride + 24, vextq_u16(d0[1], d0[2], 3)); + vst1q_u16(dst + 27 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 3)); + vst1q_u16(dst + 27 * stride + 8, vextq_u16(col0_odd[1], d1[0], 3)); + vst1q_u16(dst + 27 * stride + 16, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 27 * stride + 24, vextq_u16(d1[1], d1[2], 3)); + + vst1q_u16(dst + 28 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 2)); + vst1q_u16(dst + 28 * stride + 8, vextq_u16(col0_even[1], d0[0], 2)); + vst1q_u16(dst + 28 * stride + 16, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 28 * stride + 24, vextq_u16(d0[1], d0[2], 2)); + vst1q_u16(dst + 29 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 2)); + vst1q_u16(dst + 29 * stride + 8, vextq_u16(col0_odd[1], d1[0], 2)); + vst1q_u16(dst + 29 * stride + 16, vextq_u16(d1[0], d1[1], 2)); + vst1q_u16(dst + 29 * stride + 24, vextq_u16(d1[1], d1[2], 2)); + + vst1q_u16(dst + 30 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 1)); + vst1q_u16(dst + 30 * stride + 8, vextq_u16(col0_even[1], d0[0], 1)); + vst1q_u16(dst + 30 * stride + 16, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 30 * stride + 24, vextq_u16(d0[1], d0[2], 1)); + vst1q_u16(dst + 31 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 1)); + vst1q_u16(dst + 31 * stride + 8, vextq_u16(col0_odd[1], d1[0], 1)); + vst1q_u16(dst + 31 * stride + 16, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 31 * stride + 24, vextq_u16(d1[1], d1[2], 1)); +} + +// ----------------------------------------------------------------------------- + void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 71c3a84638..26b723f055 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -223,7 +223,7 @@ () specialize qw/vpx_highbd_h_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d117_predictor_4x4 sse2/; + specialize qw/vpx_highbd_d117_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d135_predictor_4x4 neon sse2/; @@ -262,7 +262,7 @@ () specialize qw/vpx_highbd_h_predictor_8x8 neon sse2/; add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d117_predictor_8x8 ssse3/; + specialize qw/vpx_highbd_d117_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const 
uint16_t *left, int bd"; specialize qw/vpx_highbd_d135_predictor_8x8 neon ssse3/; @@ -301,7 +301,7 @@ () specialize qw/vpx_highbd_h_predictor_16x16 neon sse2/; add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d117_predictor_16x16 ssse3/; + specialize qw/vpx_highbd_d117_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d135_predictor_16x16 neon ssse3/; @@ -340,7 +340,7 @@ () specialize qw/vpx_highbd_h_predictor_32x32 neon sse2/; add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d117_predictor_32x32 ssse3/; + specialize qw/vpx_highbd_d117_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d135_predictor_32x32 neon ssse3/; From 573f5e662b544dbc553d73fa2b61055c30dfe8cc Mon Sep 17 00:00:00 2001 From: Johann Date: Sat, 12 Nov 2022 08:23:17 +0900 Subject: [PATCH 561/926] quantize: simplifly highbd 32x32_b args Change-Id: I431a41279c4c4193bc70cfe819da6ea7e1d2fba1 --- test/vp9_quantize_test.cc | 54 +++++++++++------------ vp9/encoder/vp9_encodemb.c | 10 ++--- vpx_dsp/arm/highbd_quantize_neon.c | 21 +++++---- vpx_dsp/quantize.c | 16 ++++--- vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 13 +++--- vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 21 ++++----- 7 files changed, 69 insertions(+), 68 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index ecb6116f0c..e533b2509c 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -549,15 +549,15 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantWrapper, &QuantWrapper, VPX_BITS_12, 16, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 32, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, - 32, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, - 32, false))); + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_10, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_12, 32, false))); #else INSTANTIATE_TEST_SUITE_P( @@ -626,15 +626,15 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&Quant32x32Wrapper, &Quant32x32Wrapper, VPX_BITS_8, 32, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 32, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, - 32, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, - 32, false))); + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_10, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_12, 32, false))); #else INSTANTIATE_TEST_SUITE_P( AVX2, VP9QuantizeTest, @@ -672,15 +672,15 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&Quant32x32Wrapper, &Quant32x32Wrapper, VPX_BITS_8, 32, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 32, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, - 32, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, - 32, false), + 
make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_10, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_12, 32, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), make_tuple(&QuantFPWrapper, diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 4910dc20f5..6a5f628808 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -511,9 +511,8 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_highbd_quantize_b_32x32( - coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); @@ -856,9 +855,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_highbd_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, dst_stride, xd->bd); highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_highbd_quantize_b_32x32( - coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, + eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c index b9f72a94c5..3b1fec3321 100644 --- a/vpx_dsp/arm/highbd_quantize_neon.c +++ b/vpx_dsp/arm/highbd_quantize_neon.c @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store( const int32x4_t dqcoeff_0, const int32x4_t dqcoeff_1, @@ -224,11 +225,9 @@ static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon( } void vpx_highbd_quantize_b_32x32_neon( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; @@ -237,12 +236,13 @@ void vpx_highbd_quantize_b_32x32_neon( // High half has identical elements, but we can reconstruct it from the low // half by duplicating the 2nd element. 
So we only need to pass a 4x32-bit // vector - int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(zbin_ptr)), 1); - int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(round_ptr)), 1); + int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->zbin)), 1); + int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->round)), 1); // Extend the quant, quant_shift vectors to ones of 32-bit elements // scale to high-half, so we can use vqdmulhq_s32 - int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15); - int32x4_t quant_shift = vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 16); + int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant)), 15); + int32x4_t quant_shift = + vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant_shift)), 16); int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr)); // Process first 8 values which include a dc component. @@ -300,8 +300,7 @@ void vpx_highbd_quantize_b_32x32_neon( vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // __aarch64__ - // Need these here, else the compiler complains about mixing declarations and + // Need this here, else the compiler complains about mixing declarations and // code in C90 - (void)n_coeffs; (void)scan; } diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index 212db45c88..c4642812ad 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -272,14 +272,16 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_32x32_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), - ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { + const intptr_t n_coeffs = 32 * 32; + const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), + ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; int idx = 0; int idx_arr[1024]; diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 6ac8c982ad..a326e2b34c 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -731,7 +731,7 @@ () add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b neon sse2 avx2/; - add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * 
const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c index 8edddd637f..6041d7289a 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -11,6 +11,7 @@ #include #include "./vpx_dsp_rtcd.h" +#include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) { const __m128i sign = _mm_srai_epi16(*p, 15); @@ -222,17 +223,17 @@ static VPX_FORCE_INLINE void quantize_b_32x32( } void vpx_highbd_quantize_b_32x32_avx2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const unsigned int step = 8; + intptr_t n_coeffs = 32 * 32; __m256i eob = _mm256_setzero_si256(); __m256i qp[5]; (void)scan; - init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 1); + init_qp(mb_plane->zbin, mb_plane->round, mb_plane->quant, dequant_ptr, + mb_plane->quant_shift, qp, 1); quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index ae1981a834..6a8f42b8a4 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -15,6 +15,7 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#include "vp9/encoder/vp9_block.h" #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, @@ -93,18 +94,17 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, } void vpx_highbd_quantize_b_32x32_sse2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { __m128i zbins[2]; __m128i nzbins[2]; int idx = 0; int idx_arr[1024]; int i, eob = 0; - const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); - const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); + const intptr_t n_coeffs = 32 * 32; + const int zbin0_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1); + const int zbin1_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1); (void)scan; zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); @@ -140,10 +140,11 @@ void vpx_highbd_quantize_b_32x32_sse2( const int coeff = coeff_ptr[rc]; const int coeff_sign = (coeff >> 31); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = 
abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(mb_plane->round[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * mb_plane->quant[rc != 0]) >> 16) + tmp1; const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + (uint32_t)((tmp2 * mb_plane->quant_shift[rc != 0]) >> 15); qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
From 14fc40040ff30486c45111056db44ee18590a24a Mon Sep 17 00:00:00 2001
From: Johann
Date: Mon, 14 Nov 2022 16:47:33 +0900
Subject: [PATCH 562/926] quantize: use scan_order instead of passing
 scan/iscan

further reduces the arguments for the 32x32. This will be applied to
the base version as well.

Change-Id: I25a162b5248b14af53d9e20c6a7fa2a77028a6d1
---
 test/vp9_quantize_test.cc                 | 41 +++++++++++------------
 vp9/common/vp9_scan.h                     |  2 +-
 vp9/encoder/vp9_encodemb.c                |  8 ++---
 vpx_dsp/arm/highbd_quantize_neon.c        |  7 ++--
 vpx_dsp/arm/quantize_neon.c               |  7 ++--
 vpx_dsp/quantize.c                        |  9 ++---
 vpx_dsp/vpx_dsp_rtcd_defs.pl              |  5 +--
 vpx_dsp/x86/highbd_quantize_intrin_avx2.c |  5 +--
 vpx_dsp/x86/highbd_quantize_intrin_sse2.c |  5 +--
 vpx_dsp/x86/quantize_avx.c                |  7 ++--
 vpx_dsp/x86/quantize_avx2.c               |  5 +--
 vpx_dsp/x86/quantize_ssse3.c              |  6 ++--
 12 files changed, 55 insertions(+), 52 deletions(-)
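Naming the struct lets vpx_dsp_rtcd_defs.pl forward-declare it, and each quantizer then reads only the table it actually uses. A minimal sketch of the resulting shape, using the names introduced by the hunks below (illustrative only, not part of the patch):

/* vp9/common/vp9_scan.h now tags the previously anonymous struct: */
typedef struct scan_order {
  const int16_t *scan;      /* forward scan, read by the C reference */
  const int16_t *iscan;     /* inverse scan, read for SIMD eob tracking */
  const int16_t *neighbors;
} scan_order;

/* Callers hand the whole table to the 32x32 quantizers:
 *   vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob,
 *                        scan_order);
 * and each implementation picks out its field, e.g. the Neon version:
 *   const int16_t *iscan = scan_order->iscan; */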
diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index e533b2509c..630a0053ab 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -42,7 +42,7 @@ typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const int16_t *scan, const int16_t *iscan); + const struct scan_order *const scan_order); typedef std::tuple QuantizeParam. @@ -60,9 +60,10 @@ template void QuantWrapper(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const int16_t *scan, const int16_t *iscan) { + const struct scan_order *const scan_order) { fn(coeff, count, mb_plane->zbin, mb_plane->round, mb_plane->quant, - mb_plane->quant_shift, qcoeff, dqcoeff, dequant, eob, scan, iscan); + mb_plane->quant_shift, qcoeff, dqcoeff, dequant, eob, scan_order->scan, + scan_order->iscan); } // Wrapper for 32x32 version which does not use count typedef void (*Quantize32x32Func)(const tran_low_t *coeff, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const int16_t *scan, const int16_t *iscan); + const struct scan_order *const scan_order); template void Quant32x32Wrapper(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const int16_t *scan, const int16_t *iscan) { + const struct scan_order *const scan_order) { (void)count; - fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan, iscan); + fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan_order); } // Wrapper for FP version which does not use zbin or quant_shift.
@@ -93,9 +94,9 @@ template void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const int16_t *scan, const int16_t *iscan) { + const struct scan_order *const scan_order) { fn(coeff, count, mb_plane->round_fp, mb_plane->quant_fp, qcoeff, dqcoeff, - dequant, eob, scan, iscan); + dequant, eob, scan_order->scan, scan_order->iscan); } void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, @@ -225,7 +226,7 @@ class VP9QuantizeTest : public VP9QuantizeBase, void VP9QuantizeTest::Run() { quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), dequant_ptr_, - &eob_, scan_->scan, scan_->iscan); + &eob_, scan_); } void VP9QuantizeTest::Speed(bool is_median) { @@ -298,7 +299,7 @@ void VP9QuantizeTest::Speed(bool is_median) { ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_->scan, scan_->iscan); + scan_); } vpx_usec_timer_mark(&timer); @@ -306,7 +307,7 @@ for (int n = 0; n < kNumTests; ++n) { quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), - dequant_ptr_, &eob_, scan_->scan, scan_->iscan); + dequant_ptr_, &eob_, scan_); } vpx_usec_timer_mark(&simd_timer); @@ -447,12 +448,11 @@ TEST_P(VP9QuantizeTest, OperationCheck) { quant_fp_ptr_); ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), - dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); + dequant_ptr_, &ref_eob, scan_); - ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_, - &mb_plane_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, - &eob_, scan_->scan, scan_->iscan)); + ASM_REGISTER_STATE_CHECK(quantize_op_( + coeff_.TopLeftPixel(), count_, &mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); @@ -504,12 +504,11 @@ TEST_P(VP9QuantizeTest, EOBCheck) { quant_fp_ptr_); ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), - dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); + dequant_ptr_, &ref_eob, scan_); - ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_, - &mb_plane_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, - &eob_, scan_->scan, scan_->iscan)); + ASM_REGISTER_STATE_CHECK(quantize_op_( + coeff_.TopLeftPixel(), count_, &mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); diff --git a/vp9/common/vp9_scan.h b/vp9/common/vp9_scan.h index 72a9a5ec47..efa0e23365 100644 --- a/vp9/common/vp9_scan.h +++ b/vp9/common/vp9_scan.h @@ -23,7 +23,7 @@ extern "C" { #define MAX_NEIGHBORS 2 -typedef struct { +typedef struct scan_order { const int16_t *scan; const int16_t *iscan; const int16_t *neighbors; diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 6a5f628808..515c7a9031 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -512,7 +512,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, case TX_32X32:
highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + scan_order); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); @@ -542,7 +542,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + scan_order); break; case TX_16X16: vpx_fdct16x16(src_diff, coeff, diff_stride); @@ -856,7 +856,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, src_stride, dst, dst_stride, xd->bd); highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + eob, scan_order); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -946,7 +946,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, dst_stride); fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + scan_order); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c index 3b1fec3321..5a40f1284e 100644 --- a/vpx_dsp/arm/highbd_quantize_neon.c +++ b/vpx_dsp/arm/highbd_quantize_neon.c @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store( @@ -227,10 +228,11 @@ static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon( void vpx_highbd_quantize_b_32x32_neon( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { + uint16_t *eob_ptr, const struct scan_order *const scan_order) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; + const int16_t *iscan = scan_order->iscan; // Only the first element of each vector is DC. 
// High half has identical elements, but we can reconstruct it from the low @@ -300,7 +302,4 @@ void vpx_highbd_quantize_b_32x32_neon( vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // __aarch64__ - // Need this here, else the compiler complains about mixing declarations and - // code in C90 - (void)scan; } diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index e81738a7bb..84b6d8c79f 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -14,6 +14,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, @@ -218,10 +219,11 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct scan_order *const scan_order) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; + const int16_t *iscan = scan_order->iscan; // Only the first element of each vector is DC. int16x8_t zbin = vrshrq_n_s16(vld1q_s16(mb_plane->zbin), 1); @@ -285,7 +287,4 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // __aarch64__ - // Need these here, else the compiler complains about mixing declarations and - // code in C90 - (void)scan; } diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index c4642812ad..f51bf253e7 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -14,6 +14,7 @@ #include "vpx_dsp/quantize.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" +#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, @@ -213,7 +214,7 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct scan_order *const scan_order) { const int n_coeffs = 32 * 32; const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; @@ -221,11 +222,11 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const int16_t *round_ptr = mb_plane->round; const int16_t *quant_ptr = mb_plane->quant; const int16_t *quant_shift_ptr = mb_plane->quant_shift; + const int16_t *scan = scan_order->scan; int idx = 0; int idx_arr[32 * 32 /* n_coeffs */]; int i, eob = -1; - (void)iscan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -274,7 +275,7 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, void vpx_highbd_quantize_b_32x32_c( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { + uint16_t *eob_ptr, const struct scan_order *const scan_order) { const intptr_t n_coeffs = 32 * 32; const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; @@ -282,11 +283,11 @@ void vpx_highbd_quantize_b_32x32_c( const int16_t *round_ptr = mb_plane->round; const int16_t *quant_ptr = mb_plane->quant; const int16_t *quant_shift_ptr = 
mb_plane->quant_shift; + const int16_t *scan = scan_order->scan; int idx = 0; int idx_arr[1024]; int i, eob = -1; - (void)iscan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index a326e2b34c..b6f5d4a099 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -19,6 +19,7 @@ () #include "vpx_dsp/vpx_filter.h" #if CONFIG_VP9_ENCODER struct macroblock_plane; + struct scan_order; #endif EOF @@ -724,14 +725,14 @@ () add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/; - add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct scan_order *const scan_order"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b neon sse2 avx2/; - add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct scan_order *const scan_order"; specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c index 6041d7289a..bfd7b2e23e 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -11,6 +11,7 @@ #include <immintrin.h> #include "./vpx_dsp_rtcd.h" +#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) { @@ -225,12 +226,12 @@ static VPX_FORCE_INLINE void quantize_b_32x32( void vpx_highbd_quantize_b_32x32_avx2( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { + uint16_t *eob_ptr, const struct scan_order *const scan_order) { const unsigned int step = 8; intptr_t n_coeffs = 32 * 32; + const int16_t
*iscan = scan_order->iscan; __m256i eob = _mm256_setzero_si256(); __m256i qp[5]; - (void)scan; init_qp(mb_plane->zbin, mb_plane->round, mb_plane->quant, dequant_ptr, mb_plane->quant_shift, qp, 1); diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index 6a8f42b8a4..58d5a3a5ff 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -15,6 +15,7 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" #if CONFIG_VP9_HIGHBITDEPTH @@ -96,16 +97,16 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, void vpx_highbd_quantize_b_32x32_sse2( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { + uint16_t *eob_ptr, const struct scan_order *const scan_order) { __m128i zbins[2]; __m128i nzbins[2]; int idx = 0; int idx_arr[1024]; int i, eob = 0; const intptr_t n_coeffs = 32 * 32; + const int16_t *iscan = scan_order->iscan; const int zbin0_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1); const int zbin1_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1); - (void)scan; zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); zbins[1] = _mm_set1_epi32(zbin1_tmp); diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index d52f6c6644..d05a937be1 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -19,6 +19,8 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" #include "vpx_dsp/x86/quantize_ssse3.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, @@ -144,10 +146,11 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct scan_order *const scan_order) { const __m128i zero = _mm_setzero_si128(); const __m256i big_zero = _mm256_setzero_si256(); int index; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1; @@ -156,8 +159,6 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, __m128i all_zero; __m128i eob = zero, eob0; - (void)scan; - load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c index a8412c5b8e..1c82542ae6 100644 --- a/vpx_dsp/x86/quantize_avx2.c +++ b/vpx_dsp/x86/quantize_avx2.c @@ -13,6 +13,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void load_b_values_avx2( @@ -255,11 +256,11 @@ void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct scan_order *const scan_order) { __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; __m256i v_eobmax = _mm256_setzero_si256(); intptr_t count; - 
(void)scan; + const int16_t *iscan = scan_order->iscan; load_b_values_avx2(mb_plane->zbin, &v_zbin, mb_plane->round, &v_round, mb_plane->quant, &v_quant, dequant_ptr, &v_dequant, diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 6fe54d7d98..6401b2865d 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -16,6 +16,7 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" #include "vpx_dsp/x86/quantize_ssse3.h" +#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, @@ -112,9 +113,10 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct scan_order *const scan_order) { const __m128i zero = _mm_setzero_si128(); int index; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1; @@ -123,8 +125,6 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, __m128i all_zero; __m128i eob = zero, eob0; - (void)scan; - load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); From 096cd0ba8ab2126682a9f6f01d6c8c0084d2f8ab Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Mon, 27 Feb 2023 17:58:18 +0000 Subject: [PATCH 563/926] Optimize Neon implementation of high bitdepth MSE functions Currently MSE functions just call the variance helpers but don't actually use the computed sum. This patch adds dedicated helpers to perform the computation of sse. Add the corresponding tests as well. Change-Id: I96a8590e3410e84d77f7187344688e02efe03902 --- test/variance_test.cc | 16 +++ vpx_dsp/arm/highbd_variance_neon.c | 197 ++++++++++++++++++++++------- 2 files changed, 169 insertions(+), 44 deletions(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index a68cfad516..1359bc4baf 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1507,6 +1507,22 @@ INSTANTIATE_TEST_SUITE_P( SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_neon, 0))); #if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + NEON, VpxHBDMseTest, + ::testing::Values( + MseParams(4, 4, &vpx_highbd_12_mse16x16_neon, VPX_BITS_12), + MseParams(4, 3, &vpx_highbd_12_mse16x8_neon, VPX_BITS_12), + MseParams(3, 4, &vpx_highbd_12_mse8x16_neon, VPX_BITS_12), + MseParams(3, 3, &vpx_highbd_12_mse8x8_neon, VPX_BITS_12), + MseParams(4, 4, &vpx_highbd_10_mse16x16_neon, VPX_BITS_10), + MseParams(4, 3, &vpx_highbd_10_mse16x8_neon, VPX_BITS_10), + MseParams(3, 4, &vpx_highbd_10_mse8x16_neon, VPX_BITS_10), + MseParams(3, 3, &vpx_highbd_10_mse8x8_neon, VPX_BITS_10), + MseParams(4, 4, &vpx_highbd_8_mse16x16_neon, VPX_BITS_8), + MseParams(4, 3, &vpx_highbd_8_mse16x8_neon, VPX_BITS_8), + MseParams(3, 4, &vpx_highbd_8_mse8x16_neon, VPX_BITS_8), + MseParams(3, 3, &vpx_highbd_8_mse8x8_neon, VPX_BITS_8))); + INSTANTIATE_TEST_SUITE_P( NEON, VpxHBDVarianceTest, ::testing::Values( diff --git a/vpx_dsp/arm/highbd_variance_neon.c b/vpx_dsp/arm/highbd_variance_neon.c index 89bd5c579d..d0b366c95b 100644 --- a/vpx_dsp/arm/highbd_variance_neon.c +++ b/vpx_dsp/arm/highbd_variance_neon.c @@ -351,50 +351,159 @@ HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 64) *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ } -#define HIGHBD_MSE(w, h) \ - uint32_t vpx_highbd_8_mse##w##x##h##_neon( \ - const 
uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - uint64_t sse_long = 0; \ - int64_t sum_long = 0; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ - &sse_long, &sum_long); \ - *sse = (uint32_t)sse_long; \ - return *sse; \ - } \ - \ - uint32_t vpx_highbd_10_mse##w##x##h##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - uint64_t sse_long = 0; \ - int64_t sum_long = 0; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ - &sse_long, &sum_long); \ - *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ - return *sse; \ - } \ - \ - uint32_t vpx_highbd_12_mse##w##x##h##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - uint64_t sse_long = 0; \ - int64_t sum_long = 0; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ - &sse_long, &sum_long); \ - *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ - return *sse; \ - } - HIGHBD_GET_VAR(8) HIGHBD_GET_VAR(16) -HIGHBD_MSE(16, 16) -HIGHBD_MSE(16, 8) -HIGHBD_MSE(8, 16) -HIGHBD_MSE(8, 8) +static INLINE uint32_t highbd_mse_wxh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int w, int h, + unsigned int *sse) { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint16x8_t s = vld1q_u16(src_ptr + j); + uint16x8_t r = vld1q_u16(ref_ptr + j); + + uint16x8_t diff = vabdq_u16(s, r); + + sse_u32[0] = + vmlal_u16(sse_u32[0], vget_low_u16(diff), vget_low_u16(diff)); + sse_u32[1] = + vmlal_u16(sse_u32[1], vget_high_u16(diff), vget_high_u16(diff)); + + j += 8; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sse = horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); + return *sse; +} + +#if defined(__ARM_FEATURE_DOTPROD) + +static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, + unsigned int *sse) { + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h / 2; + do { + uint16x8_t s0, s1, r0, r1; + uint8x16_t s, r, diff; + + s0 = vld1q_u16(src_ptr); + src_ptr += src_stride; + s1 = vld1q_u16(src_ptr); + src_ptr += src_stride; + r0 = vld1q_u16(ref_ptr); + ref_ptr += ref_stride; + r1 = vld1q_u16(ref_ptr); + ref_ptr += ref_stride; + + s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1)); + r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1)); + + diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, diff, diff); + } while (--i != 0); + + *sse = horizontal_add_uint32x4(sse_u32); + return *sse; +} + +static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, + unsigned int *sse) { + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + uint16x8_t s0, s1, r0, r1; + uint8x16_t s, r, diff; + + s0 = vld1q_u16(src_ptr); + s1 = vld1q_u16(src_ptr + 8); + r0 = vld1q_u16(ref_ptr); + r1 = vld1q_u16(ref_ptr + 8); + + s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1)); + r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1)); + + diff = vabdq_u8(s, r); + 
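+    // Note: these are the 8-bit-depth (vpx_highbd_8_*) paths, so although
+    // the samples are stored in uint16_t they all fit in a byte and the
+    // vmovn_u16 narrowing above is lossless. vdotq_u32 then squares and
+    // accumulates four byte-wide absolute differences into each uint32
+    // lane of sse_u32.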
sse_u32 = vdotq_u32(sse_u32, diff, diff); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sse = horizontal_add_uint32x4(sse_u32); + return *sse; +} + +#else // !defined(__ARM_FEATURE_DOTPROD) + +static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, + unsigned int *sse) { + return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, h, + sse); +} + +static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, + unsigned int *sse) { + return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, h, + sse); +} + +#endif // defined(__ARM_FEATURE_DOTPROD) + +#define HIGHBD_MSE_WXH_NEON(w, h) \ + uint32_t vpx_highbd_8_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_mse8_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_10_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \ + *sse = ROUND_POWER_OF_TWO(*sse, 4); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_12_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \ + *sse = ROUND_POWER_OF_TWO(*sse, 8); \ + return *sse; \ + } + +HIGHBD_MSE_WXH_NEON(16, 16) +HIGHBD_MSE_WXH_NEON(16, 8) +HIGHBD_MSE_WXH_NEON(8, 16) +HIGHBD_MSE_WXH_NEON(8, 8) From 7478b7e4e481562a4a13f233acb66a60462e1934 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 1 Mar 2023 12:14:51 -0800 Subject: [PATCH 564/926] Revert "Implement highbd_d63_predictor using Neon" This reverts commit 7cdf139e3d6237386e0f93bdb0bdc1b459c663bf. This causes failures in the VP9/ExternalFrameBufferMD5Test and VP9/TestVectorTest.MD5Match tests in both armv7 and aarch64 builds. 
Change-Id: I7ac4ba0ddc70e7e7860df9f962e6658defe1cdd5 --- test/test_intra_pred_speed.cc | 12 +- test/vp9_intrapred_test.cc | 24 --- vpx_dsp/arm/highbd_intrapred_neon.c | 278 ---------------------------- vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 +- 4 files changed, 8 insertions(+), 314 deletions(-) diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index e721a459ad..24af471eaa 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -566,16 +566,14 @@ HIGHBD_INTRA_PRED_TEST( vpx_highbd_dc_128_predictor_4x4_neon, vpx_highbd_v_predictor_4x4_neon, vpx_highbd_h_predictor_4x4_neon, vpx_highbd_d45_predictor_4x4_neon, vpx_highbd_d135_predictor_4x4_neon, vpx_highbd_d117_predictor_4x4_neon, - nullptr, nullptr, vpx_highbd_d63_predictor_4x4_neon, - vpx_highbd_tm_predictor_4x4_neon) + nullptr, nullptr, nullptr, vpx_highbd_tm_predictor_4x4_neon) HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_neon, vpx_highbd_dc_left_predictor_8x8_neon, vpx_highbd_dc_top_predictor_8x8_neon, vpx_highbd_dc_128_predictor_8x8_neon, vpx_highbd_v_predictor_8x8_neon, vpx_highbd_h_predictor_8x8_neon, vpx_highbd_d45_predictor_8x8_neon, vpx_highbd_d135_predictor_8x8_neon, vpx_highbd_d117_predictor_8x8_neon, - nullptr, nullptr, vpx_highbd_d63_predictor_8x8_neon, - vpx_highbd_tm_predictor_8x8_neon) + nullptr, nullptr, nullptr, vpx_highbd_tm_predictor_8x8_neon) HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_neon, vpx_highbd_dc_left_predictor_16x16_neon, @@ -583,8 +581,7 @@ HIGHBD_INTRA_PRED_TEST( vpx_highbd_dc_128_predictor_16x16_neon, vpx_highbd_v_predictor_16x16_neon, vpx_highbd_h_predictor_16x16_neon, vpx_highbd_d45_predictor_16x16_neon, vpx_highbd_d135_predictor_16x16_neon, vpx_highbd_d117_predictor_16x16_neon, - nullptr, nullptr, vpx_highbd_d63_predictor_16x16_neon, - vpx_highbd_tm_predictor_16x16_neon) + nullptr, nullptr, nullptr, vpx_highbd_tm_predictor_16x16_neon) HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_neon, vpx_highbd_dc_left_predictor_32x32_neon, @@ -592,8 +589,7 @@ HIGHBD_INTRA_PRED_TEST( vpx_highbd_dc_128_predictor_32x32_neon, vpx_highbd_v_predictor_32x32_neon, vpx_highbd_h_predictor_32x32_neon, vpx_highbd_d45_predictor_32x32_neon, vpx_highbd_d135_predictor_32x32_neon, vpx_highbd_d117_predictor_32x32_neon, - nullptr, nullptr, vpx_highbd_d63_predictor_32x32_neon, - vpx_highbd_tm_predictor_32x32_neon) + nullptr, nullptr, nullptr, vpx_highbd_tm_predictor_32x32_neon) #endif // HAVE_NEON #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index c4e0e78ac5..83e371df6e 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -848,14 +848,6 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d45_predictor_16x16_c, 16, 8), HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, &vpx_highbd_d45_predictor_32x32_c, 32, 8), - HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, - &vpx_highbd_d63_predictor_4x4_c, 4, 8), - HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, - &vpx_highbd_d63_predictor_8x8_c, 8, 8), - HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, - &vpx_highbd_d63_predictor_16x16_c, 16, 8), - HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, - &vpx_highbd_d63_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, &vpx_highbd_d117_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, @@ -940,14 +932,6 @@ 
INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d45_predictor_16x16_c, 16, 10), HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, &vpx_highbd_d45_predictor_32x32_c, 32, 10), - HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, - &vpx_highbd_d63_predictor_4x4_c, 4, 10), - HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, - &vpx_highbd_d63_predictor_8x8_c, 8, 10), - HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, - &vpx_highbd_d63_predictor_16x16_c, 16, 10), - HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, - &vpx_highbd_d63_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, &vpx_highbd_d117_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, @@ -1032,14 +1016,6 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d45_predictor_16x16_c, 16, 12), HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, &vpx_highbd_d45_predictor_32x32_c, 32, 12), - HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, - &vpx_highbd_d63_predictor_4x4_c, 4, 12), - HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, - &vpx_highbd_d63_predictor_8x8_c, 8, 12), - HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, - &vpx_highbd_d63_predictor_16x16_c, 16, 12), - HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, - &vpx_highbd_d63_predictor_32x32_c, 32, 12), HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, &vpx_highbd_d117_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c index 424bf5f4bc..d1e335c263 100644 --- a/vpx_dsp/arm/highbd_intrapred_neon.c +++ b/vpx_dsp/arm/highbd_intrapred_neon.c @@ -453,284 +453,6 @@ void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, // ----------------------------------------------------------------------------- -void vpx_highbd_d63_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - uint16x4_t a0, a1, a2, a3, d0, d1; - (void)left; - (void)bd; - - a0 = vld1_u16(above + 0); - a1 = vld1_u16(above + 1); - a2 = vld1_u16(above + 2); - a3 = vld1_dup_u16(above + 3); - - d0 = vrhadd_u16(a0, a1); - d1 = vrhadd_u16(vhadd_u16(a0, a2), a1); - - vst1_u16(dst + 0 * stride, d0); - vst1_u16(dst + 1 * stride, d1); - vst1_u16(dst + 2 * stride, vext_u16(d0, a3, 1)); - vst1_u16(dst + 3 * stride, vext_u16(d1, a3, 1)); -} - -void vpx_highbd_d63_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - uint16x8_t a0, a1, a2, a7, d0, d1; - (void)left; - (void)bd; - - a0 = vld1q_u16(above + 0); - a1 = vld1q_u16(above + 1); - a2 = vld1q_u16(above + 2); - a7 = vld1q_dup_u16(above + 7); - - d0 = vrhaddq_u16(a0, a1); - d1 = vrhaddq_u16(vhaddq_u16(a0, a2), a1); - - vst1q_u16(dst + 0 * stride, d0); - vst1q_u16(dst + 1 * stride, d1); - vst1q_u16(dst + 2 * stride, vextq_u16(d0, a7, 1)); - vst1q_u16(dst + 3 * stride, vextq_u16(d1, a7, 1)); - vst1q_u16(dst + 4 * stride, vextq_u16(d0, a7, 2)); - vst1q_u16(dst + 5 * stride, vextq_u16(d1, a7, 2)); - vst1q_u16(dst + 6 * stride, vextq_u16(d0, a7, 3)); - vst1q_u16(dst + 7 * stride, vextq_u16(d1, a7, 3)); -} - -void vpx_highbd_d63_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - uint16x8_t a0, a1, a2, a8, a9, a10, a15, d0_lo, d0_hi, d1_lo, d1_hi; - (void)left; - (void)bd; - - a0 = vld1q_u16(above + 0); - a1 = vld1q_u16(above + 1); - a2 = 
vld1q_u16(above + 2); - a8 = vld1q_u16(above + 8); - a9 = vld1q_u16(above + 9); - a10 = vld1q_u16(above + 10); - a15 = vld1q_dup_u16(above + 15); - - d0_lo = vrhaddq_u16(a0, a1); - d0_hi = vrhaddq_u16(a8, a9); - d1_lo = vrhaddq_u16(vhaddq_u16(a0, a2), a1); - d1_hi = vrhaddq_u16(vhaddq_u16(a8, a10), a9); - - vst1q_u16(dst + 0 * stride + 0, d0_lo); - vst1q_u16(dst + 0 * stride + 8, d0_hi); - vst1q_u16(dst + 1 * stride + 0, d1_lo); - vst1q_u16(dst + 1 * stride + 8, d1_hi); - vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0_lo, d0_hi, 1)); - vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0_hi, a15, 1)); - vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1_lo, d1_hi, 1)); - vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1_hi, a15, 1)); - vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0_lo, d0_hi, 2)); - vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0_hi, a15, 2)); - vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1_lo, d1_hi, 2)); - vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1_hi, a15, 2)); - vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0_lo, d0_hi, 3)); - vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0_hi, a15, 3)); - vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1_lo, d1_hi, 3)); - vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1_hi, a15, 3)); - vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0_lo, d0_hi, 4)); - vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0_hi, a15, 4)); - vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1_lo, d1_hi, 4)); - vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1_hi, a15, 4)); - vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0_lo, d0_hi, 5)); - vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0_hi, a15, 5)); - vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1_lo, d1_hi, 5)); - vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1_hi, a15, 5)); - vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0_lo, d0_hi, 6)); - vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0_hi, a15, 6)); - vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1_lo, d1_hi, 6)); - vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1_hi, a15, 6)); - vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0_lo, d0_hi, 7)); - vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0_hi, a15, 7)); - vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1_lo, d1_hi, 7)); - vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1_hi, a15, 7)); -} - -void vpx_highbd_d63_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - uint16x8_t a0, a1, a2, a8, a9, a10, a16, a17, a18, a24, a25, a26, a31, d0[4], - d1[4]; - (void)left; - (void)bd; - - a0 = vld1q_u16(above + 0); - a1 = vld1q_u16(above + 1); - a2 = vld1q_u16(above + 2); - a8 = vld1q_u16(above + 8); - a9 = vld1q_u16(above + 9); - a10 = vld1q_u16(above + 10); - a16 = vld1q_u16(above + 16); - a17 = vld1q_u16(above + 17); - a18 = vld1q_u16(above + 18); - a24 = vld1q_u16(above + 24); - a25 = vld1q_u16(above + 25); - a26 = vld1q_u16(above + 26); - a31 = vld1q_dup_u16(above + 31); - - d0[0] = vrhaddq_u16(a0, a1); - d0[1] = vrhaddq_u16(a8, a9); - d0[2] = vrhaddq_u16(a16, a17); - d0[3] = vrhaddq_u16(a24, a25); - d1[0] = vrhaddq_u16(vhaddq_u16(a0, a2), a1); - d1[1] = vrhaddq_u16(vhaddq_u16(a8, a10), a9); - d1[2] = vrhaddq_u16(vhaddq_u16(a16, a18), a17); - d1[3] = vrhaddq_u16(vhaddq_u16(a24, a26), a25); - - vst1q_u16(dst + 0 * stride + 0, d0[0]); - vst1q_u16(dst + 0 * stride + 8, d0[1]); - vst1q_u16(dst + 0 * stride + 16, d0[2]); - vst1q_u16(dst + 0 * stride + 24, d0[3]); - vst1q_u16(dst + 1 * stride + 0, d1[0]); - vst1q_u16(dst + 1 * stride + 8, d1[1]); - vst1q_u16(dst + 1 * stride + 16, d1[2]); - vst1q_u16(dst + 1 * stride + 24, d1[3]); - - 
vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 1)); - vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], d0[2], 1)); - vst1q_u16(dst + 2 * stride + 16, vextq_u16(d0[2], d0[3], 1)); - vst1q_u16(dst + 2 * stride + 24, vextq_u16(d0[3], a31, 1)); - vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1[0], d1[1], 1)); - vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[1], d1[2], 1)); - vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[2], d1[3], 1)); - vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1[3], a31, 1)); - - vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 2)); - vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], d0[2], 2)); - vst1q_u16(dst + 4 * stride + 16, vextq_u16(d0[2], d0[3], 2)); - vst1q_u16(dst + 4 * stride + 24, vextq_u16(d0[3], a31, 2)); - vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1[0], d1[1], 2)); - vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1[1], d1[2], 2)); - vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[2], d1[3], 2)); - vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1[3], a31, 2)); - - vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 3)); - vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], d0[2], 3)); - vst1q_u16(dst + 6 * stride + 16, vextq_u16(d0[2], d0[3], 3)); - vst1q_u16(dst + 6 * stride + 24, vextq_u16(d0[3], a31, 3)); - vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1[0], d1[1], 3)); - vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1[1], d1[2], 3)); - vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[2], d1[3], 3)); - vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1[3], a31, 3)); - - vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[0], d0[1], 4)); - vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0[1], d0[2], 4)); - vst1q_u16(dst + 8 * stride + 16, vextq_u16(d0[2], d0[3], 4)); - vst1q_u16(dst + 8 * stride + 24, vextq_u16(d0[3], a31, 4)); - vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1[0], d1[1], 4)); - vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1[1], d1[2], 4)); - vst1q_u16(dst + 9 * stride + 16, vextq_u16(d1[2], d1[3], 4)); - vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1[3], a31, 4)); - - vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[0], d0[1], 5)); - vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0[1], d0[2], 5)); - vst1q_u16(dst + 10 * stride + 16, vextq_u16(d0[2], d0[3], 5)); - vst1q_u16(dst + 10 * stride + 24, vextq_u16(d0[3], a31, 5)); - vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1[0], d1[1], 5)); - vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1[1], d1[2], 5)); - vst1q_u16(dst + 11 * stride + 16, vextq_u16(d1[2], d1[3], 5)); - vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1[3], a31, 5)); - - vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[0], d0[1], 6)); - vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0[1], d0[2], 6)); - vst1q_u16(dst + 12 * stride + 16, vextq_u16(d0[2], d0[3], 6)); - vst1q_u16(dst + 12 * stride + 24, vextq_u16(d0[3], a31, 6)); - vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1[0], d1[1], 6)); - vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1[1], d1[2], 6)); - vst1q_u16(dst + 13 * stride + 16, vextq_u16(d1[2], d1[3], 6)); - vst1q_u16(dst + 13 * stride + 24, vextq_u16(d1[3], a31, 6)); - - vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[0], d0[1], 7)); - vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0[1], d0[2], 7)); - vst1q_u16(dst + 14 * stride + 16, vextq_u16(d0[2], d0[3], 7)); - vst1q_u16(dst + 14 * stride + 24, vextq_u16(d0[3], a31, 7)); - vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1[0], d1[1], 7)); - vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1[1], d1[2], 7)); - vst1q_u16(dst + 15 * stride + 16, vextq_u16(d1[2], d1[3], 7)); - vst1q_u16(dst + 15 * stride + 24, vextq_u16(d1[3], 
a31, 7)); - - vst1q_u16(dst + 16 * stride + 0, d0[1]); - vst1q_u16(dst + 16 * stride + 8, d0[2]); - vst1q_u16(dst + 16 * stride + 16, d0[3]); - vst1q_u16(dst + 16 * stride + 24, a31); - vst1q_u16(dst + 17 * stride + 0, d1[1]); - vst1q_u16(dst + 17 * stride + 8, d1[2]); - vst1q_u16(dst + 17 * stride + 16, d1[3]); - vst1q_u16(dst + 17 * stride + 24, a31); - - vst1q_u16(dst + 18 * stride + 0, vextq_u16(d0[1], d0[2], 1)); - vst1q_u16(dst + 18 * stride + 8, vextq_u16(d0[2], d0[3], 1)); - vst1q_u16(dst + 18 * stride + 16, vextq_u16(d0[3], a31, 1)); - vst1q_u16(dst + 18 * stride + 24, a31); - vst1q_u16(dst + 19 * stride + 0, vextq_u16(d1[1], d1[2], 1)); - vst1q_u16(dst + 19 * stride + 8, vextq_u16(d1[2], d1[3], 1)); - vst1q_u16(dst + 19 * stride + 16, vextq_u16(d1[3], a31, 1)); - vst1q_u16(dst + 19 * stride + 24, a31); - - vst1q_u16(dst + 20 * stride + 0, vextq_u16(d0[1], d0[2], 2)); - vst1q_u16(dst + 20 * stride + 8, vextq_u16(d0[2], d0[3], 2)); - vst1q_u16(dst + 20 * stride + 16, vextq_u16(d0[3], a31, 2)); - vst1q_u16(dst + 20 * stride + 24, a31); - vst1q_u16(dst + 21 * stride + 0, vextq_u16(d1[1], d1[2], 2)); - vst1q_u16(dst + 21 * stride + 8, vextq_u16(d1[2], d1[3], 2)); - vst1q_u16(dst + 21 * stride + 16, vextq_u16(d1[3], a31, 2)); - vst1q_u16(dst + 21 * stride + 24, a31); - - vst1q_u16(dst + 22 * stride + 0, vextq_u16(d0[1], d0[2], 3)); - vst1q_u16(dst + 22 * stride + 8, vextq_u16(d0[2], d0[3], 3)); - vst1q_u16(dst + 22 * stride + 16, vextq_u16(d0[3], a31, 3)); - vst1q_u16(dst + 22 * stride + 24, a31); - vst1q_u16(dst + 23 * stride + 0, vextq_u16(d1[1], d1[2], 3)); - vst1q_u16(dst + 23 * stride + 8, vextq_u16(d1[2], d1[3], 3)); - vst1q_u16(dst + 23 * stride + 16, vextq_u16(d1[3], a31, 3)); - vst1q_u16(dst + 23 * stride + 24, a31); - - vst1q_u16(dst + 24 * stride + 0, vextq_u16(d0[1], d0[2], 4)); - vst1q_u16(dst + 24 * stride + 8, vextq_u16(d0[2], d0[3], 4)); - vst1q_u16(dst + 24 * stride + 16, vextq_u16(d0[3], a31, 4)); - vst1q_u16(dst + 24 * stride + 24, a31); - vst1q_u16(dst + 25 * stride + 0, vextq_u16(d1[1], d1[2], 4)); - vst1q_u16(dst + 25 * stride + 8, vextq_u16(d1[2], d1[3], 4)); - vst1q_u16(dst + 25 * stride + 16, vextq_u16(d1[3], a31, 4)); - vst1q_u16(dst + 25 * stride + 24, a31); - - vst1q_u16(dst + 26 * stride + 0, vextq_u16(d0[1], d0[2], 5)); - vst1q_u16(dst + 26 * stride + 8, vextq_u16(d0[2], d0[3], 5)); - vst1q_u16(dst + 26 * stride + 16, vextq_u16(d0[3], a31, 5)); - vst1q_u16(dst + 26 * stride + 24, a31); - vst1q_u16(dst + 27 * stride + 0, vextq_u16(d1[1], d1[2], 5)); - vst1q_u16(dst + 27 * stride + 8, vextq_u16(d1[2], d1[3], 5)); - vst1q_u16(dst + 27 * stride + 16, vextq_u16(d1[3], a31, 5)); - vst1q_u16(dst + 27 * stride + 24, a31); - - vst1q_u16(dst + 28 * stride + 0, vextq_u16(d0[1], d0[2], 6)); - vst1q_u16(dst + 28 * stride + 8, vextq_u16(d0[2], d0[3], 6)); - vst1q_u16(dst + 28 * stride + 16, vextq_u16(d0[3], a31, 6)); - vst1q_u16(dst + 28 * stride + 24, a31); - vst1q_u16(dst + 29 * stride + 0, vextq_u16(d1[1], d1[2], 6)); - vst1q_u16(dst + 29 * stride + 8, vextq_u16(d1[2], d1[3], 6)); - vst1q_u16(dst + 29 * stride + 16, vextq_u16(d1[3], a31, 6)); - vst1q_u16(dst + 29 * stride + 24, a31); - - vst1q_u16(dst + 30 * stride + 0, vextq_u16(d0[1], d0[2], 7)); - vst1q_u16(dst + 30 * stride + 8, vextq_u16(d0[2], d0[3], 7)); - vst1q_u16(dst + 30 * stride + 16, vextq_u16(d0[3], a31, 7)); - vst1q_u16(dst + 30 * stride + 24, a31); - vst1q_u16(dst + 31 * stride + 0, vextq_u16(d1[1], d1[2], 7)); - vst1q_u16(dst + 31 * stride + 8, vextq_u16(d1[2], d1[3], 7)); - vst1q_u16(dst + 31 * 
stride + 16, vextq_u16(d1[3], a31, 7)); - vst1q_u16(dst + 31 * stride + 24, a31); -} - -// ----------------------------------------------------------------------------- - void vpx_highbd_d117_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index b6f5d4a099..072b10d3d1 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -221,7 +221,7 @@ () specialize qw/vpx_highbd_d45_predictor_4x4 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d63_predictor_4x4 neon sse2/; + specialize qw/vpx_highbd_d63_predictor_4x4 sse2/; add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_4x4 neon sse2/; @@ -260,7 +260,7 @@ () specialize qw/vpx_highbd_d45_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d63_predictor_8x8 neon ssse3/; + specialize qw/vpx_highbd_d63_predictor_8x8 ssse3/; add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_8x8 neon sse2/; @@ -299,7 +299,7 @@ () specialize qw/vpx_highbd_d45_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d63_predictor_16x16 neon ssse3/; + specialize qw/vpx_highbd_d63_predictor_16x16 ssse3/; add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_16x16 neon sse2/; @@ -338,7 +338,7 @@ () specialize qw/vpx_highbd_d45_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d63_predictor_32x32 neon ssse3/; + specialize qw/vpx_highbd_d63_predictor_32x32 ssse3/; add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_32x32 neon sse2/; From d98a7b8bd937f4b846beb3df76271a5f91d86d5f Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 1 Mar 2023 15:52:20 -0800 Subject: [PATCH 565/926] Revert "quantize: use scan_order instead of passing scan/iscan" This reverts commit 14fc40040ff30486c45111056db44ee18590a24a. 
This has alignment issues, causing crashes in the tests: SSSE3/VP9QuantizeTest.EOBCheck/* Change-Id: I934f9a4c3ce3db33058a65180fa645c8649c3670 --- test/vp9_quantize_test.cc | 41 ++++++++++++----------- vp9/common/vp9_scan.h | 2 +- vp9/encoder/vp9_encodemb.c | 8 ++--- vpx_dsp/arm/highbd_quantize_neon.c | 7 ++-- vpx_dsp/arm/quantize_neon.c | 7 ++-- vpx_dsp/quantize.c | 9 +++-- vpx_dsp/vpx_dsp_rtcd_defs.pl | 5 ++- vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 5 ++- vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 5 ++- vpx_dsp/x86/quantize_avx.c | 7 ++-- vpx_dsp/x86/quantize_avx2.c | 5 ++- vpx_dsp/x86/quantize_ssse3.c | 6 ++-- 12 files changed, 52 insertions(+), 55 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 630a0053ab..e533b2509c 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -42,7 +42,7 @@ typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const struct scan_order *const scan_order); + const int16_t *scan, const int16_t *iscan); typedef std::tuple QuantizeParam; @@ -60,10 +60,9 @@ template void QuantWrapper(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const struct scan_order *const scan_order) { + const int16_t *scan, const int16_t *iscan) { fn(coeff, count, mb_plane->zbin, mb_plane->round, mb_plane->quant, - mb_plane->quant_shift, qcoeff, dqcoeff, dequant, eob, scan_order->scan, - scan_order->iscan); + mb_plane->quant_shift, qcoeff, dqcoeff, dequant, eob, scan, iscan); } // Wrapper for 32x32 version which does not use count @@ -71,16 +70,16 @@ typedef void (*Quantize32x32Func)(const tran_low_t *coeff, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const struct scan_order *const scan_order); + const int16_t *scan, const int16_t *iscan); template void Quant32x32Wrapper(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const struct scan_order *const scan_order) { + const int16_t *scan, const int16_t *iscan) { (void)count; - fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan_order); + fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan, iscan); } // Wrapper for FP version which does not use zbin or quant_shift. 
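For reference, a minimal sketch of the two C prototypes this revert toggles between, assembled verbatim from the hunks in this patch (a reader aid, not part of the change being reverted):

    /* Form removed by the revert: scan tables bundled in a scan_order struct. */
    void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
                                const struct macroblock_plane *const mb_plane,
                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                const struct scan_order *const scan_order);

    /* Form restored by the revert: scan and iscan passed as bare pointers. */
    void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
                                const struct macroblock_plane *const mb_plane,
                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                const int16_t *scan, const int16_t *iscan);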
@@ -94,9 +93,9 @@ template void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const struct scan_order *const scan_order) { + const int16_t *scan, const int16_t *iscan) { fn(coeff, count, mb_plane->round_fp, mb_plane->quant_fp, qcoeff, dqcoeff, - dequant, eob, scan_order->scan, scan_order->iscan); + dequant, eob, scan, iscan); } void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, @@ -226,7 +225,7 @@ class VP9QuantizeTest : public VP9QuantizeBase, void VP9QuantizeTest::Run() { quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), dequant_ptr_, - &eob_, scan_); + &eob_, scan_->scan, scan_->iscan); } void VP9QuantizeTest::Speed(bool is_median) { @@ -299,7 +298,7 @@ void VP9QuantizeTest::Speed(bool is_median) { ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_); + scan_->scan, scan_->iscan); } vpx_usec_timer_mark(&timer); @@ -307,7 +306,7 @@ void VP9QuantizeTest::Speed(bool is_median) { for (int n = 0; n < kNumTests; ++n) { quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), - dequant_ptr_, &eob_, scan_); + dequant_ptr_, &eob_, scan_->scan, scan_->iscan); } vpx_usec_timer_mark(&simd_timer); @@ -448,11 +447,12 @@ TEST_P(VP9QuantizeTest, OperationCheck) { quant_fp_ptr_); ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), - dequant_ptr_, &ref_eob, scan_); + dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_.TopLeftPixel(), count_, &mb_plane_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_)); + ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_, + &mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, + &eob_, scan_->scan, scan_->iscan)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); @@ -504,11 +504,12 @@ TEST_P(VP9QuantizeTest, EOBCheck) { quant_fp_ptr_); ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), - dequant_ptr_, &ref_eob, scan_); + dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_.TopLeftPixel(), count_, &mb_plane_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_)); + ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_, + &mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, + &eob_, scan_->scan, scan_->iscan)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); diff --git a/vp9/common/vp9_scan.h b/vp9/common/vp9_scan.h index efa0e23365..72a9a5ec47 100644 --- a/vp9/common/vp9_scan.h +++ b/vp9/common/vp9_scan.h @@ -23,7 +23,7 @@ extern "C" { #define MAX_NEIGHBORS 2 -typedef struct scan_order { +typedef struct { const int16_t *scan; const int16_t *iscan; const int16_t *neighbors; diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 515c7a9031..6a5f628808 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -512,7 +512,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, case TX_32X32: 
highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, - scan_order); + scan_order->scan, scan_order->iscan); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); @@ -542,7 +542,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, - scan_order); + scan_order->scan, scan_order->iscan); break; case TX_16X16: vpx_fdct16x16(src_diff, coeff, diff_stride); @@ -856,7 +856,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, src_stride, dst, dst_stride, xd->bd); highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, - eob, scan_order); + eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -946,7 +946,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, dst_stride); fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, - scan_order); + scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c index 5a40f1284e..3b1fec3321 100644 --- a/vpx_dsp/arm/highbd_quantize_neon.c +++ b/vpx_dsp/arm/highbd_quantize_neon.c @@ -13,7 +13,6 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" -#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store( @@ -228,11 +227,10 @@ static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon( void vpx_highbd_quantize_b_32x32_neon( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const struct scan_order *const scan_order) { + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; - const int16_t *iscan = scan_order->iscan; // Only the first element of each vector is DC. 
// High half has identical elements, but we can reconstruct it from the low @@ -302,4 +300,7 @@ void vpx_highbd_quantize_b_32x32_neon( vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // __aarch64__ + // Need this here, else the compiler complains about mixing declarations and + // code in C90 + (void)scan; } diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index 84b6d8c79f..e81738a7bb 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -14,7 +14,6 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" -#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, @@ -219,11 +218,10 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct scan_order *const scan_order) { + const int16_t *scan, const int16_t *iscan) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; - const int16_t *iscan = scan_order->iscan; // Only the first element of each vector is DC. int16x8_t zbin = vrshrq_n_s16(vld1q_s16(mb_plane->zbin), 1); @@ -287,4 +285,7 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // __aarch64__ + // Need these here, else the compiler complains about mixing declarations and + // code in C90 + (void)scan; } diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index f51bf253e7..c4642812ad 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -14,7 +14,6 @@ #include "vpx_dsp/quantize.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" -#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, @@ -214,7 +213,7 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct scan_order *const scan_order) { + const int16_t *scan, const int16_t *iscan) { const int n_coeffs = 32 * 32; const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; @@ -222,11 +221,11 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const int16_t *round_ptr = mb_plane->round; const int16_t *quant_ptr = mb_plane->quant; const int16_t *quant_shift_ptr = mb_plane->quant_shift; - const int16_t *scan = scan_order->scan; int idx = 0; int idx_arr[32 * 32 /* n_coeffs */]; int i, eob = -1; + (void)iscan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -275,7 +274,7 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, void vpx_highbd_quantize_b_32x32_c( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const struct scan_order *const scan_order) { + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const intptr_t n_coeffs = 32 * 32; const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; @@ -283,11 +282,11 @@ void vpx_highbd_quantize_b_32x32_c( const int16_t *round_ptr = mb_plane->round; const int16_t *quant_ptr = mb_plane->quant; const int16_t *quant_shift_ptr = 
mb_plane->quant_shift; - const int16_t *scan = scan_order->scan; int idx = 0; int idx_arr[1024]; int i, eob = -1; + (void)iscan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 072b10d3d1..c899c467bb 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -19,7 +19,6 @@ () #include "vpx_dsp/vpx_filter.h" #if CONFIG_VP9_ENCODER struct macroblock_plane; - struct scan_order; #endif EOF @@ -725,14 +724,14 @@ () add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/; - add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct scan_order *const scan_order"; + add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b neon sse2 avx2/; - add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct scan_order *const scan_order"; + add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c index bfd7b2e23e..6041d7289a 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -11,7 +11,6 @@ #include <immintrin.h> #include "./vpx_dsp_rtcd.h" -#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) { @@ -226,12 +225,12 @@ static VPX_FORCE_INLINE void quantize_b_32x32( void vpx_highbd_quantize_b_32x32_avx2( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const struct scan_order *const scan_order) { + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const unsigned int step = 8; intptr_t n_coeffs = 32 * 32; - const int16_t
*iscan = scan_order->iscan; __m256i eob = _mm256_setzero_si256(); __m256i qp[5]; + (void)scan; init_qp(mb_plane->zbin, mb_plane->round, mb_plane->quant, dequant_ptr, mb_plane->quant_shift, qp, 1); diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index 58d5a3a5ff..6a8f42b8a4 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -15,7 +15,6 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" -#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" #if CONFIG_VP9_HIGHBITDEPTH @@ -97,16 +96,16 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, void vpx_highbd_quantize_b_32x32_sse2( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const struct scan_order *const scan_order) { + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { __m128i zbins[2]; __m128i nzbins[2]; int idx = 0; int idx_arr[1024]; int i, eob = 0; const intptr_t n_coeffs = 32 * 32; - const int16_t *iscan = scan_order->iscan; const int zbin0_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1); const int zbin1_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1); + (void)scan; zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); zbins[1] = _mm_set1_epi32(zbin1_tmp); diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index d05a937be1..d52f6c6644 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -19,8 +19,6 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" #include "vpx_dsp/x86/quantize_ssse3.h" -#include "vp9/common/vp9_scan.h" -#include "vp9/encoder/vp9_block.h" void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, @@ -146,11 +144,10 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct scan_order *const scan_order) { + const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); const __m256i big_zero = _mm256_setzero_si256(); int index; - const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1; @@ -159,6 +156,8 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, __m128i all_zero; __m128i eob = zero, eob0; + (void)scan; + load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c index 1c82542ae6..a8412c5b8e 100644 --- a/vpx_dsp/x86/quantize_avx2.c +++ b/vpx_dsp/x86/quantize_avx2.c @@ -13,7 +13,6 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void load_b_values_avx2( @@ -256,11 +255,11 @@ void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct scan_order *const scan_order) { + const int16_t *scan, const int16_t *iscan) { __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; __m256i v_eobmax = _mm256_setzero_si256(); intptr_t count; - const 
int16_t *iscan = scan_order->iscan; + (void)scan; load_b_values_avx2(mb_plane->zbin, &v_zbin, mb_plane->round, &v_round, mb_plane->quant, &v_quant, dequant_ptr, &v_dequant, diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 6401b2865d..6fe54d7d98 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -16,7 +16,6 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" #include "vpx_dsp/x86/quantize_ssse3.h" -#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, @@ -113,10 +112,9 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct scan_order *const scan_order) { + const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); int index; - const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1; @@ -125,6 +123,8 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, __m128i all_zero; __m128i eob = zero, eob0; + (void)scan; + load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); From e4b423e1400436f07b9e3945718a8696e5eca9ec Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 1 Mar 2023 15:53:14 -0800 Subject: [PATCH 566/926] Revert "quantize: simplifly highbd 32x32_b args" This reverts commit 573f5e662b544dbc553d73fa2b61055c30dfe8cc. This has alignment issues, causing crashes in the tests: SSSE3/VP9QuantizeTest.EOBCheck/* Change-Id: Ibf05e6b116c46f6e2c11187b3e3578bbd2d2c227 --- test/vp9_quantize_test.cc | 54 +++++++++++------------ vp9/encoder/vp9_encodemb.c | 10 +++-- vpx_dsp/arm/highbd_quantize_neon.c | 21 ++++----- vpx_dsp/quantize.c | 16 +++---- vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 13 +++--- vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 21 +++++---- 7 files changed, 68 insertions(+), 69 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index e533b2509c..ecb6116f0c 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -549,15 +549,15 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantWrapper, &QuantWrapper, VPX_BITS_12, 16, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, - VPX_BITS_8, 32, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, - VPX_BITS_10, 32, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, - VPX_BITS_12, 32, false))); + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 32, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, + 32, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, + 32, false))); #else INSTANTIATE_TEST_SUITE_P( @@ -626,15 +626,15 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&Quant32x32Wrapper, &Quant32x32Wrapper, VPX_BITS_8, 32, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, - VPX_BITS_8, 32, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, - VPX_BITS_10, 32, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, - VPX_BITS_12, 32, false))); + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 32, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, + 32, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, + 32, false))); #else INSTANTIATE_TEST_SUITE_P( AVX2, VP9QuantizeTest, @@ 
-672,15 +672,15 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&Quant32x32Wrapper, &Quant32x32Wrapper, VPX_BITS_8, 32, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, - VPX_BITS_8, 32, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, - VPX_BITS_10, 32, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, - VPX_BITS_12, 32, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 32, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, + 32, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, + 32, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), make_tuple(&QuantFPWrapper, diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 6a5f628808..4910dc20f5 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -511,8 +511,9 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b_32x32( + coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, + dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); @@ -855,8 +856,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_highbd_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, dst_stride, xd->bd); highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b_32x32( + coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, + dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c index 3b1fec3321..b9f72a94c5 100644 --- a/vpx_dsp/arm/highbd_quantize_neon.c +++ b/vpx_dsp/arm/highbd_quantize_neon.c @@ -13,7 +13,6 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" -#include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store( const int32x4_t dqcoeff_0, const int32x4_t dqcoeff_1, @@ -225,9 +224,11 @@ static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon( } void vpx_highbd_quantize_b_32x32_neon( - const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; @@ -236,13 +237,12 @@ void vpx_highbd_quantize_b_32x32_neon( // High half has identical elements, but we can reconstruct it from the low // half by duplicating the 2nd element. 
So we only need to pass a 4x32-bit // vector - int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->zbin)), 1); - int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->round)), 1); + int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(zbin_ptr)), 1); + int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(round_ptr)), 1); // Extend the quant, quant_shift vectors to ones of 32-bit elements // scale to high-half, so we can use vqdmulhq_s32 - int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant)), 15); - int32x4_t quant_shift = - vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant_shift)), 16); + int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15); + int32x4_t quant_shift = vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 16); int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr)); // Process first 8 values which include a dc component. @@ -300,7 +300,8 @@ void vpx_highbd_quantize_b_32x32_neon( vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // __aarch64__ - // Need this here, else the compiler complains about mixing declarations and + // Need these here, else the compiler complains about mixing declarations and // code in C90 + (void)n_coeffs; (void)scan; } diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index c4642812ad..212db45c88 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -272,16 +272,14 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_32x32_c( - const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const intptr_t n_coeffs = 32 * 32; - const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), - ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), + ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - const int16_t *round_ptr = mb_plane->round; - const int16_t *quant_ptr = mb_plane->quant; - const int16_t *quant_shift_ptr = mb_plane->quant_shift; int idx = 0; int idx_arr[1024]; diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index c899c467bb..2752eea5d9 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -731,7 +731,7 @@ () add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b neon sse2 avx2/; - add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t 
*quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c index 6041d7289a..8edddd637f 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -11,7 +11,6 @@ #include <immintrin.h> #include "./vpx_dsp_rtcd.h" -#include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) { const __m128i sign = _mm_srai_epi16(*p, 15); @@ -223,17 +222,17 @@ static VPX_FORCE_INLINE void quantize_b_32x32( } void vpx_highbd_quantize_b_32x32_avx2( - const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { const unsigned int step = 8; - intptr_t n_coeffs = 32 * 32; __m256i eob = _mm256_setzero_si256(); __m256i qp[5]; (void)scan; - init_qp(mb_plane->zbin, mb_plane->round, mb_plane->quant, dequant_ptr, - mb_plane->quant_shift, qp, 1); + init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 1); quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index 6a8f42b8a4..ae1981a834 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -15,7 +15,6 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" -#include "vp9/encoder/vp9_block.h" #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, @@ -94,17 +93,18 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, } void vpx_highbd_quantize_b_32x32_sse2( - const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { __m128i zbins[2]; __m128i nzbins[2]; int idx = 0; int idx_arr[1024]; int i, eob = 0; - const intptr_t n_coeffs = 32 * 32; - const int zbin0_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1); - const int zbin1_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1); + const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); + const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); (void)scan; zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); @@ -140,11 +140,10 @@ void vpx_highbd_quantize_b_32x32_sse2( const int coeff = coeff_ptr[rc]; const int coeff_sign = (coeff >> 31); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = -
abs_coeff + ROUND_POWER_OF_TWO(mb_plane->round[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * mb_plane->quant[rc != 0]) >> 16) + tmp1; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * mb_plane->quant_shift[rc != 0]) >> 15); + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; From 508bfc1ff4c2d56353dcb845b59158256c434200 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 1 Mar 2023 15:53:18 -0800 Subject: [PATCH 567/926] Revert "quantize: simplify 32x32_b args" This reverts commit 848f6e733789c627b6606baf1c85e32be997e36f. This has alignment issues, causing crashes in the tests: SSSE3/VP9QuantizeTest.EOBCheck/* Change-Id: Ic12014ab0a78ed3cde02d642509061552cdc8fc9 --- test/vp9_quantize_test.cc | 285 ++++++++++++++--------------------- vp9/encoder/vp9_block.h | 1 - vp9/encoder/vp9_encodemb.c | 6 +- vpx_dsp/arm/quantize_neon.c | 17 ++- vpx_dsp/quantize.c | 17 +-- vpx_dsp/vpx_dsp_rtcd_defs.pl | 5 +- vpx_dsp/x86/quantize_avx.c | 30 +++- vpx_dsp/x86/quantize_avx2.c | 15 +- vpx_dsp/x86/quantize_sse2.h | 28 ---- vpx_dsp/x86/quantize_ssse3.c | 35 ++++- 10 files changed, 201 insertions(+), 238 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index ecb6116f0c..587cec6923 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -26,7 +26,6 @@ #include "test/util.h" #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_scan.h" -#include "vp9/encoder/vp9_block.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" #include "vpx_ports/msvc.h" @@ -39,7 +38,8 @@ namespace { const int number_of_iterations = 100; typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count, - const macroblock_plane *const mb_plane, + const int16_t *zbin, const int16_t *round, + const int16_t *quant, const int16_t *quant_shift, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, const int16_t *scan, const int16_t *iscan); @@ -47,41 +47,6 @@ typedef std::tuple QuantizeParam; -// Wrapper which takes a macroblock_plane. 
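A plausible reading of the "alignment issues" behind this revert and the previous one, with the caveat that the revert messages do not name the exact load: the x86 quantize paths fetch the quantizer tables with aligned loads such as _mm_load_si128, which require a 16-byte-aligned address, and the test's standalone buffers come from vpx_memalign(16, ...), a guarantee the macroblock_plane route does not obviously preserve. A minimal sketch of that failure mode, with an illustrative function name and buffer rather than anything taken from the patch:

    #include <emmintrin.h>
    #include <stdint.h>

    void alignment_demo(void) {
      int16_t buf[16] = { 0 };
      int16_t *p = buf + 1;  // 2-byte aligned at best
      // Unaligned load: always legal, possibly slower.
      __m128i ok = _mm_loadu_si128((const __m128i *)p);
      // Aligned load: undefined on a misaligned address; on SSE hardware it
      // typically raises #GP, which shows up as a crash in the tests.
      __m128i bad = _mm_load_si128((const __m128i *)p);
      (void)ok;
      (void)bad;
    }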
-typedef void (*QuantizeBaseFunc)(const tran_low_t *coeff, intptr_t count, - const int16_t *zbin, const int16_t *round, - const int16_t *quant, - const int16_t *quant_shift, tran_low_t *qcoeff, - tran_low_t *dqcoeff, const int16_t *dequant, - uint16_t *eob, const int16_t *scan, - const int16_t *iscan); - -template -void QuantWrapper(const tran_low_t *coeff, intptr_t count, - const macroblock_plane *const mb_plane, tran_low_t *qcoeff, - tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const int16_t *scan, const int16_t *iscan) { - fn(coeff, count, mb_plane->zbin, mb_plane->round, mb_plane->quant, - mb_plane->quant_shift, qcoeff, dqcoeff, dequant, eob, scan, iscan); -} - -// Wrapper for 32x32 version which does not use count -typedef void (*Quantize32x32Func)(const tran_low_t *coeff, - const macroblock_plane *const mb_plane, - tran_low_t *qcoeff, tran_low_t *dqcoeff, - const int16_t *dequant, uint16_t *eob, - const int16_t *scan, const int16_t *iscan); - -template -void Quant32x32Wrapper(const tran_low_t *coeff, intptr_t count, - const macroblock_plane *const mb_plane, - tran_low_t *qcoeff, tran_low_t *dqcoeff, - const int16_t *dequant, uint16_t *eob, - const int16_t *scan, const int16_t *iscan) { - (void)count; - fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan, iscan); -} - // Wrapper for FP version which does not use zbin or quant_shift. typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count, const int16_t *round, const int16_t *quant, @@ -91,11 +56,15 @@ typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count, template void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, - const macroblock_plane *const mb_plane, tran_low_t *qcoeff, - tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const int16_t *scan, const int16_t *iscan) { - fn(coeff, count, mb_plane->round_fp, mb_plane->quant_fp, qcoeff, dqcoeff, - dequant, eob, scan, iscan); + const int16_t *zbin, const int16_t *round, + const int16_t *quant, const int16_t *quant_shift, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, const int16_t *scan, + const int16_t *iscan) { + (void)zbin; + (void)quant_shift; + + fn(coeff, count, round, quant, qcoeff, dqcoeff, dequant, eob, scan, iscan); } void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, @@ -150,16 +119,17 @@ class VP9QuantizeBase : public AbstractBench { #else max_value_ = (1 << bit_depth_) - 1; #endif - zbin_ptr_ = mb_plane_.zbin = + zbin_ptr_ = reinterpret_cast(vpx_memalign(16, 8 * sizeof(*zbin_ptr_))); - round_fp_ptr_ = mb_plane_.round_fp; - quant_fp_ptr_ = mb_plane_.quant_fp = reinterpret_cast( + round_fp_ptr_ = reinterpret_cast( + vpx_memalign(16, 8 * sizeof(*round_fp_ptr_))); + quant_fp_ptr_ = reinterpret_cast( vpx_memalign(16, 8 * sizeof(*quant_fp_ptr_))); - round_ptr_ = mb_plane_.round = + round_ptr_ = reinterpret_cast(vpx_memalign(16, 8 * sizeof(*round_ptr_))); - quant_ptr_ = mb_plane_.quant = + quant_ptr_ = reinterpret_cast(vpx_memalign(16, 8 * sizeof(*quant_ptr_))); - quant_shift_ptr_ = mb_plane_.quant_shift = reinterpret_cast( + quant_shift_ptr_ = reinterpret_cast( vpx_memalign(16, 8 * sizeof(*quant_shift_ptr_))); dequant_ptr_ = reinterpret_cast( vpx_memalign(16, 8 * sizeof(*dequant_ptr_))); @@ -170,6 +140,7 @@ class VP9QuantizeBase : public AbstractBench { ~VP9QuantizeBase() { vpx_free(zbin_ptr_); + vpx_free(round_fp_ptr_); vpx_free(quant_fp_ptr_); vpx_free(round_ptr_); vpx_free(quant_ptr_); @@ -186,7 +157,6 @@ class VP9QuantizeBase : public 
AbstractBench { } protected: - macroblock_plane mb_plane_; int16_t *zbin_ptr_; int16_t *round_fp_ptr_; int16_t *quant_fp_ptr_; @@ -223,9 +193,10 @@ class VP9QuantizeTest : public VP9QuantizeBase, }; void VP9QuantizeTest::Run() { - quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, - qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), dequant_ptr_, - &eob_, scan_->scan, scan_->iscan); + quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, + quant_shift_ptr_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_->scan, + scan_->iscan); } void VP9QuantizeTest::Speed(bool is_median) { @@ -295,8 +266,8 @@ void VP9QuantizeTest::Speed(bool is_median) { vpx_usec_timer_start(&timer); for (int n = 0; n < kNumTests; ++n) { - ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, - ref_qcoeff.TopLeftPixel(), + ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, + q_ptr_, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); } @@ -304,9 +275,10 @@ void VP9QuantizeTest::Speed(bool is_median) { vpx_usec_timer_start(&simd_timer); for (int n = 0; n < kNumTests; ++n) { - quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, - qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), - dequant_ptr_, &eob_, scan_->scan, scan_->iscan); + quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, + quant_shift_ptr_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, + scan_->scan, scan_->iscan); } vpx_usec_timer_mark(&simd_timer); @@ -445,14 +417,15 @@ TEST_P(VP9QuantizeTest, OperationCheck) { GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, - ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), - dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, + quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), + ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, + scan_->scan, scan_->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_, - &mb_plane_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, - &eob_, scan_->scan, scan_->iscan)); + ASM_REGISTER_STATE_CHECK(quantize_op_( + coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, + quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), + dequant_ptr_, &eob_, scan_->scan, scan_->iscan)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); @@ -502,14 +475,15 @@ TEST_P(VP9QuantizeTest, EOBCheck) { GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, - ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), - dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, + quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), + ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, + scan_->scan, scan_->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_, - &mb_plane_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, - &eob_, scan_->scan, scan_->iscan)); + ASM_REGISTER_STATE_CHECK(quantize_op_( + coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, + quant_shift_ptr_, 
qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), + dequant_ptr_, &eob_, scan_->scan, scan_->iscan)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); @@ -536,35 +510,28 @@ using std::make_tuple; INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, ::testing::Values( - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16, + false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 16, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, 16, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, 16, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 32, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, - 32, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, - 32, false))); + make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), + make_tuple(&vpx_highbd_quantize_b_32x32_sse2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_sse2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_sse2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); #else INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, - ::testing::Values(make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, - 16, false), + ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true))); @@ -574,12 +541,11 @@ INSTANTIATE_TEST_SUITE_P( #if HAVE_SSSE3 INSTANTIATE_TEST_SUITE_P( SSSE3, VP9QuantizeTest, - ::testing::Values(make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, - 16, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, - VPX_BITS_8, 32, false), + ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_ssse3, + &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, + false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), @@ -589,14 +555,13 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_SSSE3 #if HAVE_AVX -INSTANTIATE_TEST_SUITE_P( - AVX, VP9QuantizeTest, - ::testing::Values(make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, - 16, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, - VPX_BITS_8, 32, false))); +INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_avx, + &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_avx, + &vpx_quantize_b_32x32_c, + VPX_BITS_8, 32, false))); #endif // HAVE_AVX #if VPX_ARCH_X86_64 && HAVE_AVX2 @@ -612,29 +577,22 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_12, 32, true), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 16, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 16, + make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, 16, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, 16, - false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, VPX_BITS_8, 
32, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 32, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, - 32, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, - 32, false))); + make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), + make_tuple(&vpx_quantize_b_32x32_avx2, &vpx_quantize_b_32x32_c, + VPX_BITS_8, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_avx2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_avx2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_avx2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); #else INSTANTIATE_TEST_SUITE_P( AVX2, VP9QuantizeTest, @@ -644,12 +602,11 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 32, true), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, - 16, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, - VPX_BITS_8, 32, false))); + make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_avx2, + &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, + false))); #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_AVX2 @@ -658,29 +615,22 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, ::testing::Values( - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 16, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 16, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, 16, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, 16, + make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, VPX_BITS_8, 32, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 32, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, - 32, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, - 32, false), + make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), + make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c, + VPX_BITS_8, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_neon, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_neon, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_neon, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), make_tuple(&QuantFPWrapper, @@ -689,12 +639,11 @@ INSTANTIATE_TEST_SUITE_P( #else INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, - ::testing::Values(make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, - 16, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, - VPX_BITS_8, 32, false), + ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_neon, + &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, + false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, 
VPX_BITS_8, 16, true), @@ -734,11 +683,9 @@ INSTANTIATE_TEST_SUITE_P(LSX, VP9QuantizeTest, INSTANTIATE_TEST_SUITE_P( DISABLED_C, VP9QuantizeTest, ::testing::Values( - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 16, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, VPX_BITS_8, 32, - false), + make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_c, &vpx_quantize_b_32x32_c, VPX_BITS_8, + 32, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), make_tuple(&QuantFPWrapper, diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index fc27a0fbda..1786952911 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -13,7 +13,6 @@ #include "vpx_util/vpx_thread.h" -#include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_entropy.h" diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 4910dc20f5..fa222f9dcf 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -542,7 +542,8 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, + vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_16X16: @@ -947,7 +948,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, dst_stride); fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, + vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index e81738a7bb..9c227d560f 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -14,7 +14,6 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" -#include "vp9/encoder/vp9_block.h" static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, const int16x8_t dequant, @@ -214,8 +213,11 @@ quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, // Main difference is that zbin values are halved before comparison and dqcoeff // values are divided by 2. zbin is rounded but dqcoeff is not. -void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, - const struct macroblock_plane *const mb_plane, +void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { @@ -224,10 +226,10 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, int i; // Only the first element of each vector is DC. 
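Both sides of the hunk below keep the 32x32 convention visible in the C reference: zbin and round are halved with rounding before use, and vrshrq_n_s16(v, 1) is the per-lane NEON form of ROUND_POWER_OF_TWO(x, 1). A scalar sketch of that identity; the helper name is illustrative only:

    #include <stdint.h>

    // Halve with rounding: (x + 1) >> 1, matching ROUND_POWER_OF_TWO(x, 1)
    // and the per-lane behavior of vrshrq_n_s16(v, 1).
    int16_t halve_round(int16_t x) { return (int16_t)((x + 1) >> 1); }
    // e.g. halve_round(21) == 11 and halve_round(20) == 10.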
- int16x8_t zbin = vrshrq_n_s16(vld1q_s16(mb_plane->zbin), 1); - int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round), 1); - int16x8_t quant = vld1q_s16(mb_plane->quant); - int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift); + int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1); + int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); + int16x8_t quant = vld1q_s16(quant_ptr); + int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); int16x8_t dequant = vld1q_s16(dequant_ptr); // Process first 8 values which include a dc component. @@ -287,5 +289,6 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, #endif // __aarch64__ // Need these here, else the compiler complains about mixing declarations and // code in C90 + (void)n_coeffs; (void)scan; } diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index 212db45c88..5d6ba64a8a 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -14,7 +14,6 @@ #include "vpx_dsp/quantize.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" -#include "vp9/encoder/vp9_block.h" void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, const int16_t *round_ptr, const int16_t quant, @@ -209,21 +208,19 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } #endif -void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, - const struct macroblock_plane *const mb_plane, +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int n_coeffs = 32 * 32; - const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), - ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), + ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - const int16_t *round_ptr = mb_plane->round; - const int16_t *quant_ptr = mb_plane->quant; - const int16_t *quant_shift_ptr = mb_plane->quant_shift; int idx = 0; - int idx_arr[32 * 32 /* n_coeffs */]; + int idx_arr[1024]; int i, eob = -1; (void)iscan; diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 2752eea5d9..3bcfa7a541 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -17,9 +17,6 @@ () #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/vpx_filter.h" -#if CONFIG_VP9_ENCODER - struct macroblock_plane; -#endif EOF } @@ -724,7 +721,7 @@ () add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/; - add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, 
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index d52f6c6644..7d83527216 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -140,12 +140,15 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob(eob); } -void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, - const struct macroblock_plane *const mb_plane, +void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); const __m256i big_zero = _mm256_setzero_si256(); int index; @@ -157,9 +160,26 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, __m128i eob = zero, eob0; (void)scan; - - load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, - &shift); + (void)n_coeffs; + + // Setup global values. + // The 32x32 halves zbin and round. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + // Shift with rounding. + zbin = _mm_add_epi16(zbin, one); + zbin = _mm_srli_epi16(zbin, 1); + // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so + // it is a strict "greater" comparison. + zbin = _mm_sub_epi16(zbin, one); + + round = _mm_load_si128((const __m128i *)round_ptr); + round = _mm_add_epi16(round, one); + round = _mm_srli_epi16(round, 1); + + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + shift = _mm_slli_epi16(shift, 1); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c index a8412c5b8e..28f7c9c7da 100644 --- a/vpx_dsp/x86/quantize_avx2.c +++ b/vpx_dsp/x86/quantize_avx2.c @@ -13,7 +13,6 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -#include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void load_b_values_avx2( const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr, @@ -251,19 +250,23 @@ static VPX_FORCE_INLINE __m256i quantize_b_32x32_16( } } -void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, - const struct macroblock_plane *const mb_plane, +void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; __m256i v_eobmax = _mm256_setzero_si256(); intptr_t count; + (void)n_coeffs; (void)scan; - load_b_values_avx2(mb_plane->zbin, &v_zbin, mb_plane->round, &v_round, - mb_plane->quant, &v_quant, dequant_ptr, &v_dequant, - mb_plane->quant_shift, &v_quant_shift, 1); + load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr, + &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr, + &v_quant_shift, 1); // Do DC and first 15 AC. 
v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan, diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h index fe42fee018..27bfb4e41b 100644 --- a/vpx_dsp/x86/quantize_sse2.h +++ b/vpx_dsp/x86/quantize_sse2.h @@ -15,7 +15,6 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" -#include "vp9/encoder/vp9_block.h" static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, const int16_t *round_ptr, __m128i *round, @@ -30,33 +29,6 @@ static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, *shift = _mm_load_si128((const __m128i *)shift_ptr); } -static INLINE void load_b_values32x32( - const struct macroblock_plane *const mb_plane, __m128i *zbin, - __m128i *round, __m128i *quant, const int16_t *dequant_ptr, - __m128i *dequant, __m128i *shift) { - const __m128i one = _mm_set1_epi16(1); - // The 32x32 halves zbin and round. - *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin); - // Shift with rounding. - *zbin = _mm_add_epi16(*zbin, one); - *zbin = _mm_srli_epi16(*zbin, 1); - // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so - // it is a strict "greater" comparison. - *zbin = _mm_sub_epi16(*zbin, one); - - *round = _mm_load_si128((const __m128i *)mb_plane->round); - *round = _mm_add_epi16(*round, one); - *round = _mm_srli_epi16(*round, 1); - - *quant = _mm_load_si128((const __m128i *)mb_plane->quant); - *dequant = _mm_load_si128((const __m128i *)dequant_ptr); - *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift); - // I suspect this is not technically OK because quant_shift can be up - // to 1 << 16 and shifting up again will outrange that, but the test is not - // comprehensive enough to catch that and "it's been that way forever" - *shift = _mm_slli_epi16(*shift, 1); -} - static INLINE void load_fp_values(const int16_t *round_ptr, __m128i *round, const int16_t *quant_ptr, __m128i *quant, const int16_t *dequant_ptr, diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 6fe54d7d98..476230286d 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -16,7 +16,6 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" #include "vpx_dsp/x86/quantize_ssse3.h" -#include "vp9/encoder/vp9_block.h" void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, @@ -108,12 +107,16 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob(eob); } -void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, - const struct macroblock_plane *const mb_plane, +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); int index; __m128i zbin, round, quant, dequant, shift; @@ -124,9 +127,29 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, __m128i eob = zero, eob0; (void)scan; - - load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, - &shift); + (void)n_coeffs; + + // Setup global values. + // The 32x32 halves zbin and round. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + // Shift with rounding. 
+ zbin = _mm_add_epi16(zbin, one); + zbin = _mm_srli_epi16(zbin, 1); + // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so + // it is a strict "greater" comparison. + zbin = _mm_sub_epi16(zbin, one); + + round = _mm_load_si128((const __m128i *)round_ptr); + round = _mm_add_epi16(round, one); + round = _mm_srli_epi16(round, 1); + + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + // I suspect this is not technically OK because quant_shift can be up + // to 1 << 16 and shifting up again will outrange that, but the test is not + // comprehensive enough to catch that and "it's been that way forever" + shift = _mm_slli_epi16(shift, 1); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); From 817248e1be1548af10f3d4f0922e01e372d10cea Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 1 Mar 2023 23:54:51 +0000 Subject: [PATCH 568/926] [SSE4_1] Fix overflow in highbd temporal_filter While porting this function to NEON, using the SSE4_1 implementation as a base, I noticed that both were producing files with checksums that differed from the C reference implementation's. After investigating further I found that the saturating pack was the culprit: the packed 16-bit sums can saturate for high bit depths, so the 16-bit multiply operated on clamped values. Doing the multiplication on the unsaturated 32-bit values produces results that match the C implementation. Change-Id: I40c2a36551b2db363a58ea9aa19ef327f2676de3 --- vp9/encoder/x86/highbd_temporal_filter_sse4.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/x86/highbd_temporal_filter_sse4.c b/vp9/encoder/x86/highbd_temporal_filter_sse4.c index a7f5117cff..bcbf6d77e6 100644 --- a/vp9/encoder/x86/highbd_temporal_filter_sse4.c +++ b/vp9/encoder/x86/highbd_temporal_filter_sse4.c @@ -141,11 +141,12 @@ static INLINE void highbd_accumulate_and_store_8(const __m128i sum_first_u32, count_u16 = _mm_adds_epu16(count_u16, sum_u16); _mm_storeu_si128((__m128i *)count, count_u16); - pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16); - pred_0_u32 = _mm_cvtepu16_epi32(pred_u16); pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero); + pred_0_u32 = _mm_mullo_epi32(sum_first_u32, pred_0_u32); + pred_1_u32 = _mm_mullo_epi32(sum_second_u32, pred_1_u32); + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); From ca0c51f05f6238a68fbc33efedef4e6ec7f0b56d Mon Sep 17 00:00:00 2001 From: Johann Date: Fri, 3 Mar 2023 12:46:01 +0900 Subject: [PATCH 569/926] Revert "Allow macroblock_plane to have its own rounding buffer" This reverts commit 5359ae810cdbb974060297ecf935183baf7b009b. Reason for revert: Blocks quantize cleanups Original change's description: > Allow macroblock_plane to have its own rounding buffer > > Add 8 bytes buffer to macroblock_plane to support rounding factor.
> > Change-Id: I3751689e4449c0caea28d3acf6cd17d7f39508ed Change-Id: Ia2424d2114207370f0b45350313a5ff8521d25a8 --- vp9/encoder/vp9_block.h | 2 +- vp9/encoder/vp9_quantize.c | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 1786952911..3e2c9a3c35 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -34,7 +34,7 @@ struct macroblock_plane { struct buf_2d src; // Quantizer setings - DECLARE_ALIGNED(16, int16_t, round_fp[8]); + int16_t *round_fp; int16_t *quant_fp; int16_t *quant; int16_t *quant_shift; diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index dcc44449fd..115c66723d 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -249,8 +249,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) { // Y x->plane[0].quant = quants->y_quant[qindex]; x->plane[0].quant_fp = quants->y_quant_fp[qindex]; - memcpy(x->plane[0].round_fp, quants->y_round_fp[qindex], - 8 * sizeof(*(x->plane[0].round_fp))); + x->plane[0].round_fp = quants->y_round_fp[qindex]; x->plane[0].quant_shift = quants->y_quant_shift[qindex]; x->plane[0].zbin = quants->y_zbin[qindex]; x->plane[0].round = quants->y_round[qindex]; @@ -262,8 +261,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) { for (i = 1; i < 3; i++) { x->plane[i].quant = quants->uv_quant[qindex]; x->plane[i].quant_fp = quants->uv_quant_fp[qindex]; - memcpy(x->plane[i].round_fp, quants->uv_round_fp[qindex], - 8 * sizeof(*(x->plane[i].round_fp))); + x->plane[i].round_fp = quants->uv_round_fp[qindex]; x->plane[i].quant_shift = quants->uv_quant_shift[qindex]; x->plane[i].zbin = quants->uv_zbin[qindex]; x->plane[i].round = quants->uv_round[qindex]; From 394de691a0ef570fc49943f565ad53ee0d22a7f3 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 3 Mar 2023 12:34:36 -0800 Subject: [PATCH 570/926] Revert "Implement d117_predictor using Neon" This reverts commit 360e9069b6cc1dd3a004728b876fb923413f4b11. This causes ASan errors: [ RUN ] VP9/TestVectorTest.MD5Match/1 ================================================================= ==837858==ERROR: AddressSanitizer: stack-buffer-overflow on address 0xffff82ecad40 at pc 0x000000c494d4 bp 0xffffe1695800 sp 0xffffe16957f8 READ of size 16 at 0xffff82ecad40 thread T0 #0 0xc494d0 in vpx_d117_predictor_32x32_neon (test_libvpx+0xc494d0) #1 0x1040b34 in vp9_predict_intra_block (test_libvpx+0x1040b34) #2 0xf8feec in decode_block (test_libvpx+0xf8feec) #3 0xf8f588 in decode_partition (test_libvpx+0xf8f588) #4 0xf7be5c in vp9_decode_frame (test_libvpx+0xf7be5c) ... 
Address 0xffff82ecad40 is located in stack of thread T0 at offset 64 in frame #0 0x103fd3c in vp9_predict_intra_block (test_libvpx+0x103fd3c) This frame has 2 object(s): [32, 64) 'left_col.i' <== Memory access at offset 64 overflows this variable [96, 176) 'above_data.i' Change-Id: I058213364617dfe1036126c33a3307f8288d9ae0 --- test/test_intra_pred_speed.cc | 20 ++- test/vp9_intrapred_test.cc | 8 -- vpx_dsp/arm/intrapred_neon.c | 232 ---------------------------------- vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 - 4 files changed, 8 insertions(+), 256 deletions(-) diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index 24af471eaa..30817553ff 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -269,32 +269,28 @@ INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon, vpx_dc_left_predictor_4x4_neon, vpx_dc_top_predictor_4x4_neon, vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon, vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon, - vpx_d135_predictor_4x4_neon, vpx_d117_predictor_4x4_neon, - nullptr, nullptr, vpx_d63_predictor_4x4_neon, - vpx_tm_predictor_4x4_neon) + vpx_d135_predictor_4x4_neon, nullptr, nullptr, nullptr, + vpx_d63_predictor_4x4_neon, vpx_tm_predictor_4x4_neon) INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon, vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon, vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon, vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon, - vpx_d135_predictor_8x8_neon, vpx_d117_predictor_8x8_neon, - nullptr, nullptr, vpx_d63_predictor_8x8_neon, - vpx_tm_predictor_8x8_neon) + vpx_d135_predictor_8x8_neon, nullptr, nullptr, nullptr, + vpx_d63_predictor_8x8_neon, vpx_tm_predictor_8x8_neon) INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon, vpx_dc_left_predictor_16x16_neon, vpx_dc_top_predictor_16x16_neon, vpx_dc_128_predictor_16x16_neon, vpx_v_predictor_16x16_neon, vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon, - vpx_d135_predictor_16x16_neon, vpx_d117_predictor_16x16_neon, - nullptr, nullptr, vpx_d63_predictor_16x16_neon, - vpx_tm_predictor_16x16_neon) + vpx_d135_predictor_16x16_neon, nullptr, nullptr, nullptr, + vpx_d63_predictor_16x16_neon, vpx_tm_predictor_16x16_neon) INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon, vpx_dc_left_predictor_32x32_neon, vpx_dc_top_predictor_32x32_neon, vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon, vpx_h_predictor_32x32_neon, vpx_d45_predictor_32x32_neon, - vpx_d135_predictor_32x32_neon, vpx_d117_predictor_32x32_neon, - nullptr, nullptr, vpx_d63_predictor_32x32_neon, - vpx_tm_predictor_32x32_neon) + vpx_d135_predictor_32x32_neon, nullptr, nullptr, nullptr, + vpx_d63_predictor_32x32_neon, vpx_tm_predictor_32x32_neon) #endif // HAVE_NEON #if HAVE_MSA diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index 83e371df6e..7f8e1c5b51 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -251,14 +251,6 @@ INSTANTIATE_TEST_SUITE_P( &vpx_d63_predictor_16x16_c, 16, 8), IntraPredParam(&vpx_d63_predictor_32x32_neon, &vpx_d63_predictor_32x32_c, 32, 8), - IntraPredParam(&vpx_d117_predictor_4x4_neon, &vpx_d117_predictor_4x4_c, - 4, 8), - IntraPredParam(&vpx_d117_predictor_8x8_neon, &vpx_d117_predictor_8x8_c, - 8, 8), - IntraPredParam(&vpx_d117_predictor_16x16_neon, - &vpx_d117_predictor_16x16_c, 16, 8), - IntraPredParam(&vpx_d117_predictor_32x32_neon, - &vpx_d117_predictor_32x32_c, 32, 8), IntraPredParam(&vpx_d135_predictor_4x4_neon, 
&vpx_d135_predictor_4x4_c, 4, 8), IntraPredParam(&vpx_d135_predictor_8x8_neon, &vpx_d135_predictor_8x8_c, diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index 4760a295b9..02a05aae53 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -545,238 +545,6 @@ void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, // ----------------------------------------------------------------------------- -void vpx_d117_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - // See vpx_d117_predictor_8x8_neon for more details on the implementation. - uint8x8_t az, a0, l0az, d0, d1, d2, d3, col0, col1; - - az = load_unaligned_u8_4x1(above - 1); - a0 = load_unaligned_u8_4x1(above + 0); - // [ left[0], above[-1], above[0], above[1], x, x, x, x ] - l0az = vext_u8(vld1_dup_u8(left), az, 7); - - col0 = vdup_n_u8((above[-1] + 2 * left[0] + left[1] + 2) >> 2); - col1 = vdup_n_u8((left[0] + 2 * left[1] + left[2] + 2) >> 2); - - d0 = vrhadd_u8(az, a0); - d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); - d2 = vext_u8(col0, d0, 7); - d3 = vext_u8(col1, d1, 7); - - store_u8_4x1(dst + 0 * stride, d0); - store_u8_4x1(dst + 1 * stride, d1); - store_u8_4x1(dst + 2 * stride, d2); - store_u8_4x1(dst + 3 * stride, d3); -} - -void vpx_d117_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - uint8x8_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd; - - az = vld1_u8(above - 1); - a0 = vld1_u8(above + 0); - // [ left[0], above[-1], ... , above[5] ] - l0az = vext_u8(vld1_dup_u8(left), az, 7); - - l0 = vld1_u8(left + 0); - l1 = vld1_u8(left + 1); - // [ above[-1], left[0], ... , left[6] ] - azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7); - - // d0[0] = AVG2(above[-1], above[0]) - // d0[1] = AVG2(above[0], above[1]) - // ... - // d0[7] = AVG2(above[6], above[7]) - d0 = vrhadd_u8(az, a0); - - // d1[0] = AVG3(left[0], above[-1], above[0]) - // d1[1] = AVG3(above[-1], above[0], above[1]) - // ... - // d1[7] = AVG3(above[5], above[6], above[7]) - d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); - - // The ext instruction shifts elements in from the end of the vector rather - // than the start, so reverse the vector to put the elements to be shifted in - // at the end: - // col0[7] = AVG3(above[-1], left[0], left[1]) - // col0[6] = AVG3(left[0], left[1], left[2]) - // ... 
- // col0[0] = AVG3(left[6], left[7], left[8]) - col0 = vrev64_u8(vrhadd_u8(vhadd_u8(azl0, l1), l0)); - - // We don't care about the first parameter to this uzp since we only ever use - // the high three elements, we just use col0 again since it is already - // available: - // col0_even = [ x, x, x, x, x, col0[3], col0[5], col0[7] ] - // col0_odd = [ x, x, x, x, x, col0[2], col0[4], col0[6] ] - col0_even = vuzp_u8(col0, col0).val[1]; - col0_odd = vuzp_u8(col0, col0).val[0]; - - // Incrementally shift more elements from col0 into d0/1: - // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ] - // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ] - // stride=2 [ col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6] ] - // stride=3 [ col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ] - // stride=4 [ col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5] ] - // stride=5 [ col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5] ] - // stride=6 [ col0[3], col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4] ] - // stride=7 [ col0[2], col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4] ] - vst1_u8(dst + 0 * stride, d0); - vst1_u8(dst + 1 * stride, d1); - vst1_u8(dst + 2 * stride, vext_u8(col0_even, d0, 7)); - vst1_u8(dst + 3 * stride, vext_u8(col0_odd, d1, 7)); - vst1_u8(dst + 4 * stride, vext_u8(col0_even, d0, 6)); - vst1_u8(dst + 5 * stride, vext_u8(col0_odd, d1, 6)); - vst1_u8(dst + 6 * stride, vext_u8(col0_even, d0, 5)); - vst1_u8(dst + 7 * stride, vext_u8(col0_odd, d1, 5)); -} - -void vpx_d117_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - // See vpx_d117_predictor_8x8_neon for more details on the implementation. - uint8x16_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd; - - az = vld1q_u8(above - 1); - a0 = vld1q_u8(above + 0); - // [ left[0], above[-1], ... , above[13] ] - l0az = vextq_u8(vld1q_dup_u8(left), az, 15); - - l0 = vld1q_u8(left + 0); - l1 = vld1q_u8(left + 1); - // [ above[-1], left[0], ... , left[14] ] - azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); - - d0 = vrhaddq_u8(az, a0); - d1 = vrhaddq_u8(vhaddq_u8(l0az, a0), az); - - col0 = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); - col0 = vrev64q_u8(vextq_u8(col0, col0, 8)); - - col0_even = vuzpq_u8(col0, col0).val[1]; - col0_odd = vuzpq_u8(col0, col0).val[0]; - - vst1q_u8(dst + 0 * stride, d0); - vst1q_u8(dst + 1 * stride, d1); - vst1q_u8(dst + 2 * stride, vextq_u8(col0_even, d0, 15)); - vst1q_u8(dst + 3 * stride, vextq_u8(col0_odd, d1, 15)); - vst1q_u8(dst + 4 * stride, vextq_u8(col0_even, d0, 14)); - vst1q_u8(dst + 5 * stride, vextq_u8(col0_odd, d1, 14)); - vst1q_u8(dst + 6 * stride, vextq_u8(col0_even, d0, 13)); - vst1q_u8(dst + 7 * stride, vextq_u8(col0_odd, d1, 13)); - vst1q_u8(dst + 8 * stride, vextq_u8(col0_even, d0, 12)); - vst1q_u8(dst + 9 * stride, vextq_u8(col0_odd, d1, 12)); - vst1q_u8(dst + 10 * stride, vextq_u8(col0_even, d0, 11)); - vst1q_u8(dst + 11 * stride, vextq_u8(col0_odd, d1, 11)); - vst1q_u8(dst + 12 * stride, vextq_u8(col0_even, d0, 10)); - vst1q_u8(dst + 13 * stride, vextq_u8(col0_odd, d1, 10)); - vst1q_u8(dst + 14 * stride, vextq_u8(col0_even, d0, 9)); - vst1q_u8(dst + 15 * stride, vextq_u8(col0_odd, d1, 9)); -} - -void vpx_d117_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - // See vpx_d117_predictor_8x8_neon for more details on the implementation. 
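The ASan report above lines up with the neighbor loads in the 32x32 body below: vp9_predict_intra_block keeps a 32-byte left_col array on the stack (the [32, 64) 'left_col.i' object in the report), and a 16-byte vld1q_u8 from left + 17 reads left[17] through left[32], one byte past the end. A minimal sketch of that overread, with an illustrative local array standing in for the decoder's stack buffer:

    #include <arm_neon.h>
    #include <stdint.h>

    void d117_overread_demo(void) {
      uint8_t left_col[32] = { 0 };
      // vld1q_u8 always reads 16 bytes; starting at offset 17 touches
      // left_col[17]..left_col[32], one byte beyond the 32-byte array.
      uint8x16_t l17 = vld1q_u8(left_col + 17);
      (void)l17;
    }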
- uint8x16_t az, a0, a14, a15, a16, l0az, d0_lo, d0_hi, d1_lo, d1_hi, l0, l1, - l15, l16, l17, azl0, col0_lo, col0_hi, col0_even, col0_odd; - - az = vld1q_u8(above - 1); - a0 = vld1q_u8(above + 0); - a14 = vld1q_u8(above + 14); - a15 = vld1q_u8(above + 15); - a16 = vld1q_u8(above + 16); - // [ left[0], above[-1], ... , above[13] ] - l0az = vextq_u8(vld1q_dup_u8(left), az, 15); - - l0 = vld1q_u8(left + 0); - l1 = vld1q_u8(left + 1); - l15 = vld1q_u8(left + 15); - l16 = vld1q_u8(left + 16); - l17 = vld1q_u8(left + 17); - // [ above[-1], left[0], ... , left[14] ] - azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); - - d0_lo = vrhaddq_u8(az, a0); - d0_hi = vrhaddq_u8(a15, a16); - d1_lo = vrhaddq_u8(vhaddq_u8(l0az, a0), az); - d1_hi = vrhaddq_u8(vhaddq_u8(a14, a16), a15); - - col0_lo = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); - col0_hi = vrhaddq_u8(vhaddq_u8(l15, l17), l16); - - col0_lo = vrev64q_u8(vextq_u8(col0_lo, col0_lo, 8)); - col0_hi = vrev64q_u8(vextq_u8(col0_hi, col0_hi, 8)); - - col0_even = vuzpq_u8(col0_hi, col0_lo).val[1]; - col0_odd = vuzpq_u8(col0_hi, col0_lo).val[0]; - - vst1q_u8(dst + 0 * stride + 0, d0_lo); - vst1q_u8(dst + 0 * stride + 16, d0_hi); - vst1q_u8(dst + 1 * stride + 0, d1_lo); - vst1q_u8(dst + 1 * stride + 16, d1_hi); - vst1q_u8(dst + 2 * stride + 0, vextq_u8(col0_even, d0_lo, 15)); - vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_lo, d0_hi, 15)); - vst1q_u8(dst + 3 * stride + 0, vextq_u8(col0_odd, d1_lo, 15)); - vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_lo, d1_hi, 15)); - vst1q_u8(dst + 4 * stride + 0, vextq_u8(col0_even, d0_lo, 14)); - vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_lo, d0_hi, 14)); - vst1q_u8(dst + 5 * stride + 0, vextq_u8(col0_odd, d1_lo, 14)); - vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_lo, d1_hi, 14)); - vst1q_u8(dst + 6 * stride + 0, vextq_u8(col0_even, d0_lo, 13)); - vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_lo, d0_hi, 13)); - vst1q_u8(dst + 7 * stride + 0, vextq_u8(col0_odd, d1_lo, 13)); - vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_lo, d1_hi, 13)); - vst1q_u8(dst + 8 * stride + 0, vextq_u8(col0_even, d0_lo, 12)); - vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_lo, d0_hi, 12)); - vst1q_u8(dst + 9 * stride + 0, vextq_u8(col0_odd, d1_lo, 12)); - vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_lo, d1_hi, 12)); - vst1q_u8(dst + 10 * stride + 0, vextq_u8(col0_even, d0_lo, 11)); - vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_lo, d0_hi, 11)); - vst1q_u8(dst + 11 * stride + 0, vextq_u8(col0_odd, d1_lo, 11)); - vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_lo, d1_hi, 11)); - vst1q_u8(dst + 12 * stride + 0, vextq_u8(col0_even, d0_lo, 10)); - vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_lo, d0_hi, 10)); - vst1q_u8(dst + 13 * stride + 0, vextq_u8(col0_odd, d1_lo, 10)); - vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_lo, d1_hi, 10)); - vst1q_u8(dst + 14 * stride + 0, vextq_u8(col0_even, d0_lo, 9)); - vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_lo, d0_hi, 9)); - vst1q_u8(dst + 15 * stride + 0, vextq_u8(col0_odd, d1_lo, 9)); - vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_lo, d1_hi, 9)); - vst1q_u8(dst + 16 * stride + 0, vextq_u8(col0_even, d0_lo, 8)); - vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_lo, d0_hi, 8)); - vst1q_u8(dst + 17 * stride + 0, vextq_u8(col0_odd, d1_lo, 8)); - vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_lo, d1_hi, 8)); - vst1q_u8(dst + 18 * stride + 0, vextq_u8(col0_even, d0_lo, 7)); - vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_lo, d0_hi, 7)); - vst1q_u8(dst + 19 * stride + 0, vextq_u8(col0_odd, d1_lo, 7)); - vst1q_u8(dst + 19 * stride + 16, 
vextq_u8(d1_lo, d1_hi, 7)); - vst1q_u8(dst + 20 * stride + 0, vextq_u8(col0_even, d0_lo, 6)); - vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_lo, d0_hi, 6)); - vst1q_u8(dst + 21 * stride + 0, vextq_u8(col0_odd, d1_lo, 6)); - vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_lo, d1_hi, 6)); - vst1q_u8(dst + 22 * stride + 0, vextq_u8(col0_even, d0_lo, 5)); - vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_lo, d0_hi, 5)); - vst1q_u8(dst + 23 * stride + 0, vextq_u8(col0_odd, d1_lo, 5)); - vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_lo, d1_hi, 5)); - vst1q_u8(dst + 24 * stride + 0, vextq_u8(col0_even, d0_lo, 4)); - vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_lo, d0_hi, 4)); - vst1q_u8(dst + 25 * stride + 0, vextq_u8(col0_odd, d1_lo, 4)); - vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_lo, d1_hi, 4)); - vst1q_u8(dst + 26 * stride + 0, vextq_u8(col0_even, d0_lo, 3)); - vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_lo, d0_hi, 3)); - vst1q_u8(dst + 27 * stride + 0, vextq_u8(col0_odd, d1_lo, 3)); - vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_lo, d1_hi, 3)); - vst1q_u8(dst + 28 * stride + 0, vextq_u8(col0_even, d0_lo, 2)); - vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_lo, d0_hi, 2)); - vst1q_u8(dst + 29 * stride + 0, vextq_u8(col0_odd, d1_lo, 2)); - vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_lo, d1_hi, 2)); - vst1q_u8(dst + 30 * stride + 0, vextq_u8(col0_even, d0_lo, 1)); - vst1q_u8(dst + 30 * stride + 16, vextq_u8(d0_lo, d0_hi, 1)); - vst1q_u8(dst + 31 * stride + 0, vextq_u8(col0_odd, d1_lo, 1)); - vst1q_u8(dst + 31 * stride + 16, vextq_u8(d1_lo, d1_hi, 1)); -} - -// ----------------------------------------------------------------------------- - void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x8_t XA0123 = vld1_u8(above - 1); diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 3bcfa7a541..c50ab93c5a 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -57,7 +57,6 @@ () add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d117_predictor_4x4 neon/; add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_4x4 neon/; @@ -102,7 +101,6 @@ () specialize qw/vpx_h_predictor_8x8 neon dspr2 msa sse2/; add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d117_predictor_8x8 neon/; add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_8x8 neon/; @@ -143,7 +141,6 @@ () specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2 vsx/; add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d117_predictor_16x16 neon/; add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_16x16 neon/; @@ -182,7 +179,6 @@ () specialize qw/vpx_h_predictor_32x32 neon msa sse2 vsx/; add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d117_predictor_32x32 neon/; add_proto 
qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_32x32 neon/; From f5dfa780ce087af40b39a05b45c4798ad70b48c8 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 3 Mar 2023 20:56:29 +0000 Subject: [PATCH 571/926] disable vpx_get4x4sse_cs_neon This function causes a heap overflow in the tests: [ RUN ] NEON/VpxSseTest.RefSse/0 ================================================================= ==876922==ERROR: AddressSanitizer: heap-buffer-overflow on address 0xffff8949d903 at pc 0x000000dd95d4 bp 0xfffffdd7f260 sp 0xfffffdd7f258 READ of size 8 at 0xffff8949d903 thread T0 #0 0xdd95d0 in vpx_get4x4sse_cs_neon vpx_dsp/arm/variance_neon.c:556:10 #1 0x9d4894 in (anonymous namespace)::MainTestClass::RefTestSse() test/variance_test.cc:531:5 #2 0x9d4894 in (anonymous namespace)::VpxSseTest_RefSse_Test::TestBody() test/variance_test.cc:772:30 ... 0xffff8949d903 is located 3 bytes to the right of 16-byte region [0xffff8949d8f0,0xffff8949d900) allocated by thread T0 here: #0 0x5fd050 in operator new[](unsigned long) (test_libvpx+0x5fd050) #1 0x9d3e04 in (anonymous namespace)::MainTestClass::SetUp() test/variance_test.cc:299:12 Bug: webm:1794 Change-Id: I4bc681eb9a436743ef8bfe2a2abae59ce754309c --- test/variance_test.cc | 3 +++ vpx_dsp/arm/variance_neon.c | 6 ++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index 1359bc4baf..237d595bb7 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1446,9 +1446,12 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_AVX2 #if HAVE_NEON +// TODO(https://crbug.com/webm/1794): enable this after heap overflow is fixed. +#if 0 INSTANTIATE_TEST_SUITE_P(NEON, VpxSseTest, ::testing::Values(SseParams(2, 2, &vpx_get4x4sse_cs_neon))); +#endif INSTANTIATE_TEST_SUITE_P(NEON, VpxMseTest, ::testing::Values(MseParams(4, 4, &vpx_mse16x16_neon), diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index feff980c93..76c2a15863 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -433,6 +433,8 @@ static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr, return *sse; } +// TODO(https://crbug.com/webm/1794): enable this after heap overflow is fixed. +#if 0 unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride) { @@ -466,6 +468,7 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, return vget_lane_u32(sse, 0); } +#endif // 0 #else // !defined(__ARM_FEATURE_DOTPROD) @@ -532,6 +535,8 @@ static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr, return *sse; } +// TODO(https://crbug.com/webm/1794): enable this after heap overflow is fixed. 
+#if 0 unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride) { @@ -572,6 +577,7 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, return horizontal_add_uint32x4(vreinterpretq_u32_s32(sse)); } +#endif // 0 #endif // defined(__ARM_FEATURE_DOTPROD) diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index c50ab93c5a..2301fbe328 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1152,8 +1152,10 @@ () add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *"; specialize qw/vpx_get_mb_ss sse2 msa vsx/; + # TODO(https://crbug.com/webm/1794): enable neon after heap overflow is + # fixed. add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride"; - specialize qw/vpx_get4x4sse_cs neon msa vsx/; + specialize qw/vpx_get4x4sse_cs msa vsx/; add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; specialize qw/vpx_comp_avg_pred neon sse2 vsx lsx/; From 5fae248f2a8af49bc82590ec1d397c8535859b0e Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 3 Mar 2023 15:33:16 -0800 Subject: [PATCH 572/926] disable vp8_sixtap_predict16x16_neon This causes various buffer overflows in the tests: [ RUN ] NEON/SixtapPredictTest.TestWithPresetData/0 ================================================================= ==22346==ERROR: AddressSanitizer: global-buffer-overflow on address 0x0000012b4a5b at pc 0x000000df0f60 bp 0xffffcf6e64b0 sp 0xffffcf6e64a8 READ of size 8 at 0x0000012b4a5b thread T0 #0 0xdf0f5c in vp8_sixtap_predict16x16_neon vp8/common/arm/neon/sixtappredict_neon.c:1507:13 #1 0x8819e4 in (anonymous namespace)::SixtapPredictTest_TestWithPresetData_Test::TestBody() test/predict_test.cc:293:3 ... 0x0000012b4a5b is located 2 bytes to the right of global variable 'kTestData' defined in '../test/predict_test.cc:237:24' (0x12b48a0) of size 441 [ RUN ] NEON/SixtapPredictTest.TestWithRandomData/0 ================================================================= ==22338==ERROR: AddressSanitizer: heap-buffer-overflow on address 0xffff8b5321fb at pc 0x000000df0f60 bp 0xfffff7e0cf30 sp 0xfffff7e0cf28 READ of size 8 at 0xffff8b5321fb thread T0 #0 0xdf0f5c in vp8_sixtap_predict16x16_neon vp8/common/arm/neon/sixtappredict_neon.c:1507:13 #1 0x87d4c0 in (anonymous namespace)::PredictTestBase::TestWithRandomData(void (*)(unsigned char*, int, int, int, unsigned char*, int)) test/predict_test.cc:170:9 ... 0xffff8b5321fb is located 2 bytes to the right of 441-byte region [0xffff8b532040,0xffff8b5321f9) allocated by thread T0 here: #0 0x5fd4f0 in operator new[](unsigned long) (test_libvpx+0x5fd4f0) #1 0x87c2e0 in (anonymous namespace)::PredictTestBase::SetUp() test/predict_test.cc:47:12 #2 0x87d074 in non-virtual thunk to (anonymous namespace)::PredictTestBase::SetUp() test/predict_test.cc ... 
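For scale: a 6-tap filter over a 16x16 block needs a (16+5)x(16+5) input
footprint, i.e. 441 bytes, which is exactly the size of the regions in both
reports above. A minimal sketch of that arithmetic (assuming, as the READ of
size 8 suggests, that the kernel walks each 21-byte row with 8-lane vector
loads; the exact pointer walk inside vp8_sixtap_predict16x16_neon may differ):

  /* Sketch: why 8-lane loads can overrun a tightly sized 6-tap 16x16 source.
   * The 441-byte figure matches the allocations in the ASan traces above. */
  #include <stdio.h>
  int main(void) {
    const int block = 16, taps = 6;
    const int row = block + taps - 1;        /* 21 valid bytes per row */
    const int footprint = row * row;         /* 441 bytes total */
    const int touched = ((row + 7) / 8) * 8; /* 24 bytes touched per row */
    printf("footprint=%d bytes, last row load overruns by %d\n", footprint,
           touched - row);
    return 0;
  }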
Bug: webm:1795 Change-Id: I32213a381eef91547d00f88acf90f1cf2ec2ea75 --- test/predict_test.cc | 4 +++- vp8/common/arm/neon/sixtappredict_neon.c | 4 ++++ vp8/common/rtcd_defs.pl | 4 +++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/test/predict_test.cc b/test/predict_test.cc index 7472970576..e49d98272e 100644 --- a/test/predict_test.cc +++ b/test/predict_test.cc @@ -307,7 +307,9 @@ INSTANTIATE_TEST_SUITE_P( #if HAVE_NEON INSTANTIATE_TEST_SUITE_P( NEON, SixtapPredictTest, - ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_neon), + ::testing::Values(/*TODO(https://crbug.com/webm/1795): enable this after + buffer overflows are fixed. + make_tuple(16, 16, &vp8_sixtap_predict16x16_neon),*/ make_tuple(8, 8, &vp8_sixtap_predict8x8_neon), make_tuple(8, 4, &vp8_sixtap_predict8x4_neon), make_tuple(4, 4, &vp8_sixtap_predict4x4_neon))); diff --git a/vp8/common/arm/neon/sixtappredict_neon.c b/vp8/common/arm/neon/sixtappredict_neon.c index 48e86d3278..4960d16516 100644 --- a/vp8/common/arm/neon/sixtappredict_neon.c +++ b/vp8/common/arm/neon/sixtappredict_neon.c @@ -1253,6 +1253,9 @@ void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, return; } +// TODO(https://crbug.com/webm/1795): enable this after buffer overflows are +// fixed. +#if 0 void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, @@ -1728,3 +1731,4 @@ void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, } return; } +#endif // 0 diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 739a612847..05e67ce11b 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -146,7 +146,9 @@ () # Subpixel # add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; -specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 neon dspr2 msa mmi lsx/; +# TODO(https://crbug.com/webm/1795): enable neon after buffer overflows are +# fixed. +specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 dspr2 msa mmi lsx/; add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; specialize qw/vp8_sixtap_predict8x8 sse2 ssse3 neon dspr2 msa mmi lsx/; From 0384a2aab79a8b004257ba32d7824e8defb12c55 Mon Sep 17 00:00:00 2001 From: Johann Date: Sat, 5 Nov 2022 09:53:07 +0900 Subject: [PATCH 573/926] reland: quantize: simplify 32x32_b args Allocate mb_plane_ on the heap to ensure src is aligned. Now that all the implementations of the 32x32 quantize are in intrinsics we can reference struct members directly. Saves pushing them to the stack. n_coeffs is not used at all for this function. 
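As a concrete before/after (taken from the vp9_encodemb.c hunks below), the
two 32x32 call sites shrink from passing each quantizer table plus the fixed
count to passing the plane itself:

  // Before:
  vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant,
                       p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
                       scan_order->scan, scan_order->iscan);
  // After:
  vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob,
                       scan_order->scan, scan_order->iscan);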
Change-Id: Ib551f7f583977602504d962b72063bc6eda9dda9
---
 test/vp9_quantize_test.cc    | 287 +++++++++++++++++++++--------------
 vp9/encoder/vp9_block.h      |   1 +
 vp9/encoder/vp9_encodemb.c   |   6 +-
 vpx_dsp/arm/quantize_neon.c  |  17 +--
 vpx_dsp/quantize.c           |  17 ++-
 vpx_dsp/vpx_dsp_rtcd_defs.pl |   5 +-
 vpx_dsp/x86/quantize_avx.c   |  30 +---
 vpx_dsp/x86/quantize_avx2.c  |  15 +-
 vpx_dsp/x86/quantize_sse2.h  |  28 ++++
 vpx_dsp/x86/quantize_ssse3.c |  35 +----
 10 files changed, 243 insertions(+), 198 deletions(-)

diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc
index 587cec6923..6a8f1dafb1 100644
--- a/test/vp9_quantize_test.cc
+++ b/test/vp9_quantize_test.cc
@@ -26,6 +26,7 @@
 #include "test/util.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/msvc.h"
@@ -38,8 +39,7 @@ namespace {
 const int number_of_iterations = 100;

 typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
-                             const int16_t *zbin, const int16_t *round,
-                             const int16_t *quant, const int16_t *quant_shift,
+                             const macroblock_plane *const mb_plane,
                              tran_low_t *qcoeff, tran_low_t *dqcoeff,
                              const int16_t *dequant, uint16_t *eob,
                              const int16_t *scan, const int16_t *iscan);
@@ -47,6 +47,41 @@ typedef std::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t, int, bool>
     QuantizeParam;

+// Wrapper which takes a macroblock_plane.
+typedef void (*QuantizeBaseFunc)(const tran_low_t *coeff, intptr_t count,
+                                 const int16_t *zbin, const int16_t *round,
+                                 const int16_t *quant,
+                                 const int16_t *quant_shift, tran_low_t *qcoeff,
+                                 tran_low_t *dqcoeff, const int16_t *dequant,
+                                 uint16_t *eob, const int16_t *scan,
+                                 const int16_t *iscan);
+
+template <QuantizeBaseFunc fn>
+void QuantWrapper(const tran_low_t *coeff, intptr_t count,
+                  const macroblock_plane *const mb_plane, tran_low_t *qcoeff,
+                  tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob,
+                  const int16_t *scan, const int16_t *iscan) {
+  fn(coeff, count, mb_plane->zbin, mb_plane->round, mb_plane->quant,
+     mb_plane->quant_shift, qcoeff, dqcoeff, dequant, eob, scan, iscan);
+}
+
+// Wrapper for 32x32 version which does not use count
+typedef void (*Quantize32x32Func)(const tran_low_t *coeff,
+                                  const macroblock_plane *const mb_plane,
+                                  tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                                  const int16_t *dequant, uint16_t *eob,
+                                  const int16_t *scan, const int16_t *iscan);
+
+template <Quantize32x32Func fn>
+void Quant32x32Wrapper(const tran_low_t *coeff, intptr_t count,
+                       const macroblock_plane *const mb_plane,
+                       tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                       const int16_t *dequant, uint16_t *eob,
+                       const int16_t *scan, const int16_t *iscan) {
+  (void)count;
+  fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan, iscan);
+}
+
 // Wrapper for FP version which does not use zbin or quant_shift.
 typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count,
                                const int16_t *round, const int16_t *quant,
@@ -56,15 +91,11 @@ typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count,

 template <QuantizeFPFunc fn>
 void QuantFPWrapper(const tran_low_t *coeff, intptr_t count,
-                    const int16_t *zbin, const int16_t *round,
-                    const int16_t *quant, const int16_t *quant_shift,
-                    tran_low_t *qcoeff, tran_low_t *dqcoeff,
-                    const int16_t *dequant, uint16_t *eob, const int16_t *scan,
-                    const int16_t *iscan) {
-  (void)zbin;
-  (void)quant_shift;
-
-  fn(coeff, count, round, quant, qcoeff, dqcoeff, dequant, eob, scan, iscan);
+                    const macroblock_plane *const mb_plane, tran_low_t *qcoeff,
+                    tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob,
+                    const int16_t *scan, const int16_t *iscan) {
+  fn(coeff, count, mb_plane->round_fp, mb_plane->quant_fp, qcoeff, dqcoeff,
+     dequant, eob, scan, iscan);
 }

 void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round,
@@ -119,17 +150,21 @@ class VP9QuantizeBase : public AbstractBench {
 #else
     max_value_ = (1 << bit_depth_) - 1;
 #endif
-    zbin_ptr_ =
+
+    mb_plane_ = reinterpret_cast<macroblock_plane *>(
+        vpx_memalign(16, sizeof(macroblock_plane)));
+
+    zbin_ptr_ = mb_plane_->zbin =
         reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*zbin_ptr_)));
-    round_fp_ptr_ = reinterpret_cast<int16_t *>(
+    round_fp_ptr_ = mb_plane_->round_fp = reinterpret_cast<int16_t *>(
        vpx_memalign(16, 8 * sizeof(*round_fp_ptr_)));
-    quant_fp_ptr_ = reinterpret_cast<int16_t *>(
+    quant_fp_ptr_ = mb_plane_->quant_fp = reinterpret_cast<int16_t *>(
        vpx_memalign(16, 8 * sizeof(*quant_fp_ptr_)));
-    round_ptr_ =
+    round_ptr_ = mb_plane_->round =
         reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*round_ptr_)));
-    quant_ptr_ =
+    quant_ptr_ = mb_plane_->quant =
         reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*quant_ptr_)));
-    quant_shift_ptr_ = reinterpret_cast<int16_t *>(
+    quant_shift_ptr_ = mb_plane_->quant_shift = reinterpret_cast<int16_t *>(
        vpx_memalign(16, 8 * sizeof(*quant_shift_ptr_)));
     dequant_ptr_ = reinterpret_cast<int16_t *>(
        vpx_memalign(16, 8 * sizeof(*dequant_ptr_)));
@@ -139,6 +174,7 @@ class VP9QuantizeBase : public AbstractBench {
   }

   ~VP9QuantizeBase() {
+    vpx_free(mb_plane_);
     vpx_free(zbin_ptr_);
     vpx_free(round_fp_ptr_);
     vpx_free(quant_fp_ptr_);
@@ -146,6 +182,7 @@ class VP9QuantizeBase : public AbstractBench {
     vpx_free(quant_ptr_);
     vpx_free(quant_shift_ptr_);
     vpx_free(dequant_ptr_);
+    mb_plane_ = nullptr;
     zbin_ptr_ = nullptr;
     round_fp_ptr_ = nullptr;
     quant_fp_ptr_ = nullptr;
@@ -157,9 +194,10 @@ class VP9QuantizeBase : public AbstractBench {
   }

  protected:
+  macroblock_plane *mb_plane_;
   int16_t *zbin_ptr_;
-  int16_t *round_fp_ptr_;
   int16_t *quant_fp_ptr_;
+  int16_t *round_fp_ptr_;
   int16_t *round_ptr_;
   int16_t *quant_ptr_;
   int16_t *quant_shift_ptr_;
@@ -193,8 +231,7 @@ class VP9QuantizeTest : public VP9QuantizeBase,
 };

 void VP9QuantizeTest::Run() {
-  quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_,
-               quant_shift_ptr_, qcoeff_.TopLeftPixel(),
+  quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(),
               dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_->scan,
               scan_->iscan);
 }
@@ -266,8 +303,8 @@ void VP9QuantizeTest::Speed(bool is_median) {
     vpx_usec_timer_start(&timer);
     for (int n = 0; n < kNumTests; ++n) {
-      ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_,
-                       q_ptr_, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(),
+      ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_,
+                       ref_qcoeff.TopLeftPixel(),
                       ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
                       scan_->scan, scan_->iscan);
     }
@@ -275,10 +312,9 @@ void
VP9QuantizeTest::Speed(bool is_median) { vpx_usec_timer_start(&simd_timer); for (int n = 0; n < kNumTests; ++n) { - quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, - scan_->scan, scan_->iscan); + quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, + qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), + dequant_ptr_, &eob_, scan_->scan, scan_->iscan); } vpx_usec_timer_mark(&simd_timer); @@ -417,15 +453,14 @@ TEST_P(VP9QuantizeTest, OperationCheck) { GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), - ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_->scan, scan_->iscan); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, + ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), + dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), - dequant_ptr_, &eob_, scan_->scan, scan_->iscan)); + ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_, + mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, + &eob_, scan_->scan, scan_->iscan)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); @@ -475,15 +510,14 @@ TEST_P(VP9QuantizeTest, EOBCheck) { GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), - ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_->scan, scan_->iscan); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, + ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), + dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), - dequant_ptr_, &eob_, scan_->scan, scan_->iscan)); + ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_, + mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, + &eob_, scan_->scan, scan_->iscan)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); @@ -510,28 +544,35 @@ using std::make_tuple; INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, ::testing::Values( - make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16, - false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 16, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_10, 16, false), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_12, 16, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, 
VPX_BITS_12, 32, false)));
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_sse2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_8, 16,
+                   false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_sse2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_10, 16,
+                   false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_sse2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_12, 16,
+                   false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_sse2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_8, 32,
+                   false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_sse2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_10,
+                   32, false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_sse2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_12,
+                   32, false)));
 #else
 INSTANTIATE_TEST_SUITE_P(
     SSE2, VP9QuantizeTest,
-    ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c,
-                                 VPX_BITS_8, 16, false),
+    ::testing::Values(make_tuple(&QuantWrapper<vpx_quantize_b_sse2>,
+                                 &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8,
+                                 16, false),
                       make_tuple(&QuantFPWrapper,
                                  &QuantFPWrapper, VPX_BITS_8, 16, true)));
@@ -541,11 +582,12 @@ INSTANTIATE_TEST_SUITE_P(
 #if HAVE_SSSE3
 INSTANTIATE_TEST_SUITE_P(
     SSSE3, VP9QuantizeTest,
-    ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c,
-                                 VPX_BITS_8, 16, false),
-                      make_tuple(&vpx_quantize_b_32x32_ssse3,
-                                 &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
-                                 false),
+    ::testing::Values(make_tuple(&QuantWrapper<vpx_quantize_b_ssse3>,
+                                 &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8,
+                                 16, false),
+                      make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_ssse3>,
+                                 &Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+                                 VPX_BITS_8, 32, false),
                       make_tuple(&QuantFPWrapper,
                                  &QuantFPWrapper, VPX_BITS_8, 16, true),
@@ -555,13 +597,14 @@ INSTANTIATE_TEST_SUITE_P(
 #endif  // HAVE_SSSE3

 #if HAVE_AVX
-INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest,
-                         ::testing::Values(make_tuple(&vpx_quantize_b_avx,
-                                                      &vpx_quantize_b_c,
-                                                      VPX_BITS_8, 16, false),
-                                           make_tuple(&vpx_quantize_b_32x32_avx,
-                                                      &vpx_quantize_b_32x32_c,
-                                                      VPX_BITS_8, 32, false)));
+INSTANTIATE_TEST_SUITE_P(
+    AVX, VP9QuantizeTest,
+    ::testing::Values(make_tuple(&QuantWrapper<vpx_quantize_b_avx>,
+                                 &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8,
+                                 16, false),
+                      make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_avx>,
+                                 &Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+                                 VPX_BITS_8, 32, false)));
 #endif  // HAVE_AVX

 #if VPX_ARCH_X86_64 && HAVE_AVX2
@@ -577,22 +620,29 @@ INSTANTIATE_TEST_SUITE_P(
       make_tuple(&QuantFPWrapper,
                  &QuantFPWrapper, VPX_BITS_12, 32, true),
-      make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, VPX_BITS_8, 16,
+      make_tuple(&QuantWrapper<vpx_quantize_b_avx2>,
+                 &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8, 16, false),
+      make_tuple(&QuantWrapper<vpx_highbd_quantize_b_avx2>,
+                 &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_8, 16,
+                 false),
+      make_tuple(&QuantWrapper<vpx_highbd_quantize_b_avx2>,
+                 &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_10, 16,
+                 false),
+      make_tuple(&QuantWrapper<vpx_highbd_quantize_b_avx2>,
+                 &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_12, 16,
+                 false),
+      make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_avx2>,
+                 &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, VPX_BITS_8, 32,
                  false),
-      make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c,
-                 VPX_BITS_8, 16, false),
-      make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c,
-                 VPX_BITS_10, 16, false),
-      make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c,
-                 VPX_BITS_12, 16, false),
-      make_tuple(&vpx_quantize_b_32x32_avx2, &vpx_quantize_b_32x32_c,
-                 VPX_BITS_8, 32, false),
-      make_tuple(&vpx_highbd_quantize_b_32x32_avx2,
-                 &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false),
-      make_tuple(&vpx_highbd_quantize_b_32x32_avx2,
-                 &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false),
-      make_tuple(&vpx_highbd_quantize_b_32x32_avx2,
-                 &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false)));
+      make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_avx2>,
+                 &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_8, 32,
+                 false),
+      make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_avx2>,
+                 &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_10,
+                 32, false),
+      make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_avx2>,
+                 &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_12,
+                 32, false)));
 #else
 INSTANTIATE_TEST_SUITE_P(
     AVX2, VP9QuantizeTest,
     ::testing::Values(make_tuple(&QuantFPWrapper,
                                  &QuantFPWrapper, VPX_BITS_8, 16, true),
                       make_tuple(&QuantFPWrapper,
                                  &QuantFPWrapper, VPX_BITS_8, 32, true),
-                      make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c,
-                                 VPX_BITS_8, 16, false),
-                      make_tuple(&vpx_quantize_b_32x32_avx2,
-                                 &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
-                                 false)));
+                      make_tuple(&QuantWrapper<vpx_quantize_b_avx2>,
+                                 &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8,
+                                 16, false),
+                      make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_avx2>,
+                                 &Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+                                 VPX_BITS_8, 32, false)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_AVX2
@@ -615,22 +666,29 @@ INSTANTIATE_TEST_SUITE_P(
 INSTANTIATE_TEST_SUITE_P(
     NEON, VP9QuantizeTest,
     ::testing::Values(
-        make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16,
+        make_tuple(&QuantWrapper<vpx_quantize_b_neon>,
+                   &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8, 16, false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_neon>,
+                   &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_8, 16,
+                   false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_neon>,
+                   &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_10, 16,
                    false),
-        make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c,
-                   VPX_BITS_8, 16, false),
-        make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c,
-                   VPX_BITS_10, 16, false),
-        make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c,
-                   VPX_BITS_12, 16, false),
-        make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c,
-                   VPX_BITS_8, 32, false),
-        make_tuple(&vpx_highbd_quantize_b_32x32_neon,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false),
-        make_tuple(&vpx_highbd_quantize_b_32x32_neon,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false),
-        make_tuple(&vpx_highbd_quantize_b_32x32_neon,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_neon>,
+                   &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_12, 16,
+                   false),
+        make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_neon>,
+                   &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, VPX_BITS_8, 32,
+                   false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_neon>,
+                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_8, 32,
+                   false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_neon>,
+                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_10,
+                   32, false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_neon>,
+                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_12,
+                   32, false),
         make_tuple(&QuantFPWrapper,
                    &QuantFPWrapper, VPX_BITS_8, 16, true),
         make_tuple(&QuantFPWrapper,
@@ -639,11 +697,12 @@ INSTANTIATE_TEST_SUITE_P(
 #else
 INSTANTIATE_TEST_SUITE_P(
     NEON, VP9QuantizeTest,
-    ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c,
-                                 VPX_BITS_8, 16, false),
-                      make_tuple(&vpx_quantize_b_32x32_neon,
-                                 &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
-                                 false),
+    ::testing::Values(make_tuple(&QuantWrapper<vpx_quantize_b_neon>,
+                                 &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8,
+                                 16, false),
+                      make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_neon>,
+                                 &Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+                                 VPX_BITS_8, 32, false),
                       make_tuple(&QuantFPWrapper,
                                  &QuantFPWrapper, VPX_BITS_8, 16, true),
@@ -683,9 +742,11 @@ INSTANTIATE_TEST_SUITE_P(LSX, VP9QuantizeTest,
 INSTANTIATE_TEST_SUITE_P(
     DISABLED_C, VP9QuantizeTest,
     ::testing::Values(
-        make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16, false),
-        make_tuple(&vpx_quantize_b_32x32_c, &vpx_quantize_b_32x32_c, VPX_BITS_8,
-                   32, false),
+        make_tuple(&QuantWrapper<vpx_quantize_b_c>,
+                   &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8, 16, false),
+        make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+                   &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, VPX_BITS_8, 32,
+                   false),
        make_tuple(&QuantFPWrapper,
                   &QuantFPWrapper, VPX_BITS_8, 16, true),
        make_tuple(&QuantFPWrapper,
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 3e2c9a3c35..da01c346d9 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -13,6 +13,7 @@

 #include "vpx_util/vpx_thread.h"

+#include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_entropy.h"

diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index fa222f9dcf..4910dc20f5 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -542,8 +542,7 @@ void
vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_16X16: @@ -948,8 +947,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, dst_stride); fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index 9c227d560f..e81738a7bb 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -14,6 +14,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vp9/encoder/vp9_block.h" static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, const int16x8_t dequant, @@ -213,11 +214,8 @@ quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, // Main difference is that zbin values are halved before comparison and dqcoeff // values are divided by 2. zbin is rounded but dqcoeff is not. -void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { @@ -226,10 +224,10 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int i; // Only the first element of each vector is DC. - int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1); - int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); - int16x8_t quant = vld1q_s16(quant_ptr); - int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + int16x8_t zbin = vrshrq_n_s16(vld1q_s16(mb_plane->zbin), 1); + int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round), 1); + int16x8_t quant = vld1q_s16(mb_plane->quant); + int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift); int16x8_t dequant = vld1q_s16(dequant_ptr); // Process first 8 values which include a dc component. 
@@ -289,6 +287,5 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #endif // __aarch64__ // Need these here, else the compiler complains about mixing declarations and // code in C90 - (void)n_coeffs; (void)scan; } diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index 5d6ba64a8a..212db45c88 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -14,6 +14,7 @@ #include "vpx_dsp/quantize.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" +#include "vp9/encoder/vp9_block.h" void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, const int16_t *round_ptr, const int16_t quant, @@ -208,19 +209,21 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } #endif -void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), - ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; + const int n_coeffs = 32 * 32; + const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), + ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; int idx = 0; - int idx_arr[1024]; + int idx_arr[32 * 32 /* n_coeffs */]; int i, eob = -1; (void)iscan; diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 2301fbe328..deaf4afe8c 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -17,6 +17,9 @@ () #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; +#endif EOF } @@ -717,7 +720,7 @@ () add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/; - add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index 7d83527216..d52f6c6644 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -140,15 +140,12 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob(eob); } 
-void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); const __m256i big_zero = _mm256_setzero_si256(); int index; @@ -160,26 +157,9 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i eob = zero, eob0; (void)scan; - (void)n_coeffs; - - // Setup global values. - // The 32x32 halves zbin and round. - zbin = _mm_load_si128((const __m128i *)zbin_ptr); - // Shift with rounding. - zbin = _mm_add_epi16(zbin, one); - zbin = _mm_srli_epi16(zbin, 1); - // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so - // it is a strict "greater" comparison. - zbin = _mm_sub_epi16(zbin, one); - - round = _mm_load_si128((const __m128i *)round_ptr); - round = _mm_add_epi16(round, one); - round = _mm_srli_epi16(round, 1); - - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - shift = _mm_load_si128((const __m128i *)quant_shift_ptr); - shift = _mm_slli_epi16(shift, 1); + + load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, + &shift); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c index 28f7c9c7da..a8412c5b8e 100644 --- a/vpx_dsp/x86/quantize_avx2.c +++ b/vpx_dsp/x86/quantize_avx2.c @@ -13,6 +13,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void load_b_values_avx2( const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr, @@ -250,23 +251,19 @@ static VPX_FORCE_INLINE __m256i quantize_b_32x32_16( } } -void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; __m256i v_eobmax = _mm256_setzero_si256(); intptr_t count; - (void)n_coeffs; (void)scan; - load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr, - &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr, - &v_quant_shift, 1); + load_b_values_avx2(mb_plane->zbin, &v_zbin, mb_plane->round, &v_round, + mb_plane->quant, &v_quant, dequant_ptr, &v_dequant, + mb_plane->quant_shift, &v_quant_shift, 1); // Do DC and first 15 AC. 
v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan, diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h index 27bfb4e41b..fe42fee018 100644 --- a/vpx_dsp/x86/quantize_sse2.h +++ b/vpx_dsp/x86/quantize_sse2.h @@ -15,6 +15,7 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_block.h" static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, const int16_t *round_ptr, __m128i *round, @@ -29,6 +30,33 @@ static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, *shift = _mm_load_si128((const __m128i *)shift_ptr); } +static INLINE void load_b_values32x32( + const struct macroblock_plane *const mb_plane, __m128i *zbin, + __m128i *round, __m128i *quant, const int16_t *dequant_ptr, + __m128i *dequant, __m128i *shift) { + const __m128i one = _mm_set1_epi16(1); + // The 32x32 halves zbin and round. + *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin); + // Shift with rounding. + *zbin = _mm_add_epi16(*zbin, one); + *zbin = _mm_srli_epi16(*zbin, 1); + // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so + // it is a strict "greater" comparison. + *zbin = _mm_sub_epi16(*zbin, one); + + *round = _mm_load_si128((const __m128i *)mb_plane->round); + *round = _mm_add_epi16(*round, one); + *round = _mm_srli_epi16(*round, 1); + + *quant = _mm_load_si128((const __m128i *)mb_plane->quant); + *dequant = _mm_load_si128((const __m128i *)dequant_ptr); + *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift); + // I suspect this is not technically OK because quant_shift can be up + // to 1 << 16 and shifting up again will outrange that, but the test is not + // comprehensive enough to catch that and "it's been that way forever" + *shift = _mm_slli_epi16(*shift, 1); +} + static INLINE void load_fp_values(const int16_t *round_ptr, __m128i *round, const int16_t *quant_ptr, __m128i *quant, const int16_t *dequant_ptr, diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 476230286d..6fe54d7d98 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -16,6 +16,7 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" #include "vpx_dsp/x86/quantize_ssse3.h" +#include "vp9/encoder/vp9_block.h" void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, @@ -107,16 +108,12 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob(eob); } -void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); int index; __m128i zbin, round, quant, dequant, shift; @@ -127,29 +124,9 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i eob = zero, eob0; (void)scan; - (void)n_coeffs; - - // Setup global values. - // The 32x32 halves zbin and round. - zbin = _mm_load_si128((const __m128i *)zbin_ptr); - // Shift with rounding. 
- zbin = _mm_add_epi16(zbin, one); - zbin = _mm_srli_epi16(zbin, 1); - // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so - // it is a strict "greater" comparison. - zbin = _mm_sub_epi16(zbin, one); - - round = _mm_load_si128((const __m128i *)round_ptr); - round = _mm_add_epi16(round, one); - round = _mm_srli_epi16(round, 1); - - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - shift = _mm_load_si128((const __m128i *)quant_shift_ptr); - // I suspect this is not technically OK because quant_shift can be up - // to 1 << 16 and shifting up again will outrange that, but the test is not - // comprehensive enough to catch that and "it's been that way forever" - shift = _mm_slli_epi16(shift, 1); + + load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, + &shift); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); From 62827575462ecdb7790b60f5da302b6395cef798 Mon Sep 17 00:00:00 2001 From: George Steed Date: Wed, 1 Mar 2023 22:44:38 +0000 Subject: [PATCH 574/926] Implement highbd_d63_predictor using Neon Add Neon implementations of the highbd d63 predictor for 4x4, 8x8, 16x16 and 32x32 block sizes. Also update tests to add new corresponding cases. This re-lands commit 7cdf139e3d6237386e0f93bdb0bdc1b459c663bf, previously reverted in 7478b7e4e481562a4a13f233acb66a60462e1934. Compared to the previous implementation attempt we now correctly match the behaviour of the C code when handling the final element loaded from the 'above' input array. In particular: - The C code for a 4x4 block performs a full average of the last element rather than duplicating the final element from the input 'above' array. - The C code for other block sizes performs a full average for the stride=0 and stride=1, and otherwise shifts in duplicates of the final element from the input 'above' array. Notably this shifting for later strides _replaces_ the final element which we previously performed an average on (see {d0,d1}_ext in the code). It is worth noting that this difference is not caught by the existing VP9HighbdIntraPredTest test cases since the test vector initialisation contains this loop: for (int x = block_size; x < 2 * block_size; x++) { above_row_[x] = above_row_[block_size - 1]; } Since AVG2(a, a) and AVG3(a, a, a) are simply 'a', such differences in behaviour for the final element are not observed. Tested on AArch64 with: - ./test_libvpx --gtest_filter="*VP9HighbdIntraPredTest*" - ./test_libvpx --gtest_filter="*VP9/TestVectorTest.MD5Match*" - ./test_libvpx --gtest_filter="*VP9/ExternalFrameBufferMD5Test*" Speedups over the C code (higher is better): Microarch. 
            | Compiler | Block | Speedup
Neoverse N1 |  LLVM 15 |   4x4 |    2.43
Neoverse N1 |  LLVM 15 |   8x8 |    3.92
Neoverse N1 |  LLVM 15 | 16x16 |    3.19
Neoverse N1 |  LLVM 15 | 32x32 |    4.13
Neoverse N1 |   GCC 12 |   4x4 |    2.92
Neoverse N1 |   GCC 12 |   8x8 |    6.51
Neoverse N1 |   GCC 12 | 16x16 |    4.55
Neoverse N1 |   GCC 12 | 32x32 |    3.18
Neoverse V1 |  LLVM 15 |   4x4 |    1.99
Neoverse V1 |  LLVM 15 |   8x8 |    3.65
Neoverse V1 |  LLVM 15 | 16x16 |    3.72
Neoverse V1 |  LLVM 15 | 32x32 |    3.26
Neoverse V1 |   GCC 12 |   4x4 |    2.39
Neoverse V1 |   GCC 12 |   8x8 |    4.76
Neoverse V1 |   GCC 12 | 16x16 |    3.24
Neoverse V1 |   GCC 12 | 32x32 |    2.44

Change-Id: Iefaa774d6a20388b523eaa7f5df6bc5f5cf249e4
---
 test/test_intra_pred_speed.cc       |  12 +-
 test/vp9_intrapred_test.cc          |  24 ++
 vpx_dsp/arm/highbd_intrapred_neon.c | 326 ++++++++++++++++++++++++++++
 vpx_dsp/vpx_dsp_rtcd_defs.pl        |   8 +-
 4 files changed, 362 insertions(+), 8 deletions(-)

diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 24af471eaa..e721a459ad 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -566,14 +566,16 @@ HIGHBD_INTRA_PRED_TEST(
     vpx_highbd_dc_128_predictor_4x4_neon, vpx_highbd_v_predictor_4x4_neon,
     vpx_highbd_h_predictor_4x4_neon, vpx_highbd_d45_predictor_4x4_neon,
     vpx_highbd_d135_predictor_4x4_neon, vpx_highbd_d117_predictor_4x4_neon,
-    nullptr, nullptr, nullptr, vpx_highbd_tm_predictor_4x4_neon)
+    nullptr, nullptr, vpx_highbd_d63_predictor_4x4_neon,
+    vpx_highbd_tm_predictor_4x4_neon)
 HIGHBD_INTRA_PRED_TEST(
     NEON, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_neon,
     vpx_highbd_dc_left_predictor_8x8_neon, vpx_highbd_dc_top_predictor_8x8_neon,
     vpx_highbd_dc_128_predictor_8x8_neon, vpx_highbd_v_predictor_8x8_neon,
     vpx_highbd_h_predictor_8x8_neon, vpx_highbd_d45_predictor_8x8_neon,
     vpx_highbd_d135_predictor_8x8_neon, vpx_highbd_d117_predictor_8x8_neon,
-    nullptr, nullptr, nullptr, vpx_highbd_tm_predictor_8x8_neon)
+    nullptr, nullptr, vpx_highbd_d63_predictor_8x8_neon,
+    vpx_highbd_tm_predictor_8x8_neon)
 HIGHBD_INTRA_PRED_TEST(
     NEON, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_neon,
     vpx_highbd_dc_left_predictor_16x16_neon,
@@ -581,7 +583,8 @@ HIGHBD_INTRA_PRED_TEST(
     vpx_highbd_dc_top_predictor_16x16_neon,
     vpx_highbd_dc_128_predictor_16x16_neon, vpx_highbd_v_predictor_16x16_neon,
     vpx_highbd_h_predictor_16x16_neon, vpx_highbd_d45_predictor_16x16_neon,
     vpx_highbd_d135_predictor_16x16_neon, vpx_highbd_d117_predictor_16x16_neon,
-    nullptr, nullptr, nullptr, vpx_highbd_tm_predictor_16x16_neon)
+    nullptr, nullptr, vpx_highbd_d63_predictor_16x16_neon,
+    vpx_highbd_tm_predictor_16x16_neon)
 HIGHBD_INTRA_PRED_TEST(
     NEON, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_neon,
     vpx_highbd_dc_left_predictor_32x32_neon,
@@ -589,7 +592,8 @@ HIGHBD_INTRA_PRED_TEST(
     vpx_highbd_dc_top_predictor_32x32_neon,
     vpx_highbd_dc_128_predictor_32x32_neon, vpx_highbd_v_predictor_32x32_neon,
     vpx_highbd_h_predictor_32x32_neon, vpx_highbd_d45_predictor_32x32_neon,
     vpx_highbd_d135_predictor_32x32_neon, vpx_highbd_d117_predictor_32x32_neon,
-    nullptr, nullptr, nullptr, vpx_highbd_tm_predictor_32x32_neon)
+    nullptr, nullptr, vpx_highbd_d63_predictor_32x32_neon,
+    vpx_highbd_tm_predictor_32x32_neon)
 #endif  // HAVE_NEON
 #endif  // CONFIG_VP9_HIGHBITDEPTH

diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc
index 83e371df6e..c4e0e78ac5 100644
--- a/test/vp9_intrapred_test.cc
+++ b/test/vp9_intrapred_test.cc
@@ -848,6 +848,14 @@ INSTANTIATE_TEST_SUITE_P(
                          &vpx_highbd_d45_predictor_16x16_c, 16, 8),
     HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon,
                          &vpx_highbd_d45_predictor_32x32_c, 32, 8),
+
HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, + &vpx_highbd_d63_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, + &vpx_highbd_d63_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, + &vpx_highbd_d63_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, + &vpx_highbd_d63_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, &vpx_highbd_d117_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, @@ -932,6 +940,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d45_predictor_16x16_c, 16, 10), HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, &vpx_highbd_d45_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, + &vpx_highbd_d63_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, + &vpx_highbd_d63_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, + &vpx_highbd_d63_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, + &vpx_highbd_d63_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, &vpx_highbd_d117_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, @@ -1016,6 +1032,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d45_predictor_16x16_c, 16, 12), HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, &vpx_highbd_d45_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, + &vpx_highbd_d63_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, + &vpx_highbd_d63_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, + &vpx_highbd_d63_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, + &vpx_highbd_d63_predictor_32x32_c, 32, 12), HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, &vpx_highbd_d117_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c index dc1b27dc10..6b6ad95c12 100644 --- a/vpx_dsp/arm/highbd_intrapred_neon.c +++ b/vpx_dsp/arm/highbd_intrapred_neon.c @@ -453,6 +453,332 @@ void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, // ----------------------------------------------------------------------------- +void vpx_highbd_d63_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x4_t a0, a1, a2, a3, d0, d1, d2, d3; + (void)left; + (void)bd; + + a0 = vld1_u16(above + 0); + a1 = vld1_u16(above + 1); + a2 = vld1_u16(above + 2); + a3 = vld1_u16(above + 3); + + d0 = vrhadd_u16(a0, a1); + d1 = vrhadd_u16(vhadd_u16(a0, a2), a1); + d2 = vrhadd_u16(a1, a2); + d3 = vrhadd_u16(vhadd_u16(a1, a3), a2); + + // Note that here we are performing a full avg calculation for the final + // elements rather than storing a duplicate of above[3], which differs + // (correctly) from the general scheme employed by the bs={8,16,32} + // implementations in order to match the original C implementation. 
+  vst1_u16(dst + 0 * stride, d0);
+  vst1_u16(dst + 1 * stride, d1);
+  vst1_u16(dst + 2 * stride, d2);
+  vst1_u16(dst + 3 * stride, d3);
+}
+
+void vpx_highbd_d63_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  uint16x8_t a0, a1, a2, a7, d0, d1, d0_ext, d1_ext;
+  (void)left;
+  (void)bd;
+
+  a0 = vld1q_u16(above + 0);
+  a1 = vld1q_u16(above + 1);
+  a2 = vld1q_u16(above + 2);
+  a7 = vld1q_dup_u16(above + 7);
+
+  d0 = vrhaddq_u16(a0, a1);
+  d1 = vrhaddq_u16(vhaddq_u16(a0, a2), a1);
+
+  // We want to store:
+  // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ]
+  // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ]
+  // stride=2 [ d0[1], d0[2], d0[3], d0[4], d0[5], d0[6],  a[7],  a[7] ]
+  // stride=3 [ d1[1], d1[2], d1[3], d1[4], d1[5], d1[6],  a[7],  a[7] ]
+  // stride=4 [ d0[2], d0[3], d0[4], d0[5], d0[6],  a[7],  a[7],  a[7] ]
+  // stride=5 [ d1[2], d1[3], d1[4], d1[5], d1[6],  a[7],  a[7],  a[7] ]
+  // stride=6 [ d0[3], d0[4], d0[5], d0[6],  a[7],  a[7],  a[7],  a[7] ]
+  // stride=7 [ d1[3], d1[4], d1[5], d1[6],  a[7],  a[7],  a[7],  a[7] ]
+  // Note in particular that d0[7] and d1[7] are only ever referenced in the
+  // stride=0 and stride=1 cases respectively, and in later strides are
+  // replaced by a copy of above[7]. These are equivalent if for i>7,
+  // above[i]==above[7]; however, that is not always the case.
+
+  // Strip out d0[7] and d1[7] so that we can replace them with an additional
+  // copy of above[7], the first vector here doesn't matter so just reuse
+  // d0/d1.
+  d0_ext = vextq_u16(d0, d0, 7);
+  d1_ext = vextq_u16(d1, d1, 7);
+
+  // Shuffle in duplicates of above[7] and store.
+  vst1q_u16(dst + 0 * stride, d0);
+  vst1q_u16(dst + 1 * stride, d1);
+  vst1q_u16(dst + 2 * stride, vextq_u16(d0_ext, a7, 2));
+  vst1q_u16(dst + 3 * stride, vextq_u16(d1_ext, a7, 2));
+  vst1q_u16(dst + 4 * stride, vextq_u16(d0_ext, a7, 3));
+  vst1q_u16(dst + 5 * stride, vextq_u16(d1_ext, a7, 3));
+  vst1q_u16(dst + 6 * stride, vextq_u16(d0_ext, a7, 4));
+  vst1q_u16(dst + 7 * stride, vextq_u16(d1_ext, a7, 4));
+}
+
+void vpx_highbd_d63_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                         const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  // See vpx_highbd_d63_predictor_8x8_neon for details on the implementation.
+  uint16x8_t a0, a1, a2, a8, a9, a10, a15, d0[2], d1[2], d0_ext, d1_ext;
+  (void)left;
+  (void)bd;
+
+  a0 = vld1q_u16(above + 0);
+  a1 = vld1q_u16(above + 1);
+  a2 = vld1q_u16(above + 2);
+  a8 = vld1q_u16(above + 8);
+  a9 = vld1q_u16(above + 9);
+  a10 = vld1q_u16(above + 10);
+  a15 = vld1q_dup_u16(above + 15);
+
+  d0[0] = vrhaddq_u16(a0, a1);
+  d0[1] = vrhaddq_u16(a8, a9);
+  d1[0] = vrhaddq_u16(vhaddq_u16(a0, a2), a1);
+  d1[1] = vrhaddq_u16(vhaddq_u16(a8, a10), a9);
+
+  // Strip out the final element of d0/d1 so that we can replace it with an
+  // additional copy of above[15], the first vector here doesn't matter so
+  // just reuse the same vector.
+  d0_ext = vextq_u16(d0[1], d0[1], 7);
+  d1_ext = vextq_u16(d1[1], d1[1], 7);
+
+  // Shuffle in duplicates of above[15] and store. Note that cases involving
+  // {d0,d1}_ext require an extra shift to undo the shifting out of the final
+  // element from above.
+  vst1q_u16(dst + 0 * stride + 0, d0[0]);
+  vst1q_u16(dst + 0 * stride + 8, d0[1]);
+  vst1q_u16(dst + 1 * stride + 0, d1[0]);
+  vst1q_u16(dst + 1 * stride + 8, d1[1]);
+  vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 1));
+  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0_ext, a15, 2));
+  vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1[0], d1[1], 1));
+  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1_ext, a15, 2));
+  vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 2));
+  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0_ext, a15, 3));
+  vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1[0], d1[1], 2));
+  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1_ext, a15, 3));
+  vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 3));
+  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0_ext, a15, 4));
+  vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1[0], d1[1], 3));
+  vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1_ext, a15, 4));
+  vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[0], d0[1], 4));
+  vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0_ext, a15, 5));
+  vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1[0], d1[1], 4));
+  vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1_ext, a15, 5));
+  vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[0], d0[1], 5));
+  vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0_ext, a15, 6));
+  vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1[0], d1[1], 5));
+  vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1_ext, a15, 6));
+  vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[0], d0[1], 6));
+  vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0_ext, a15, 7));
+  vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1[0], d1[1], 6));
+  vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1_ext, a15, 7));
+  vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[0], d0[1], 7));
+  vst1q_u16(dst + 14 * stride + 8, a15);
+  vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1[0], d1[1], 7));
+  vst1q_u16(dst + 15 * stride + 8, a15);
+}
+
+void vpx_highbd_d63_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                         const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  // See vpx_highbd_d63_predictor_8x8_neon for details on the implementation.
+  uint16x8_t a0, a1, a2, a8, a9, a10, a16, a17, a18, a24, a25, a26, a31, d0[4],
+      d1[4], d0_ext, d1_ext;
+  (void)left;
+  (void)bd;
+
+  a0 = vld1q_u16(above + 0);
+  a1 = vld1q_u16(above + 1);
+  a2 = vld1q_u16(above + 2);
+  a8 = vld1q_u16(above + 8);
+  a9 = vld1q_u16(above + 9);
+  a10 = vld1q_u16(above + 10);
+  a16 = vld1q_u16(above + 16);
+  a17 = vld1q_u16(above + 17);
+  a18 = vld1q_u16(above + 18);
+  a24 = vld1q_u16(above + 24);
+  a25 = vld1q_u16(above + 25);
+  a26 = vld1q_u16(above + 26);
+  a31 = vld1q_dup_u16(above + 31);
+
+  d0[0] = vrhaddq_u16(a0, a1);
+  d0[1] = vrhaddq_u16(a8, a9);
+  d0[2] = vrhaddq_u16(a16, a17);
+  d0[3] = vrhaddq_u16(a24, a25);
+  d1[0] = vrhaddq_u16(vhaddq_u16(a0, a2), a1);
+  d1[1] = vrhaddq_u16(vhaddq_u16(a8, a10), a9);
+  d1[2] = vrhaddq_u16(vhaddq_u16(a16, a18), a17);
+  d1[3] = vrhaddq_u16(vhaddq_u16(a24, a26), a25);
+
+  // Strip out the final element of d0/d1 so that we can replace it with an
+  // additional copy of above[31], the first vector here doesn't matter so
+  // just reuse the same vector.
+  d0_ext = vextq_u16(d0[3], d0[3], 7);
+  d1_ext = vextq_u16(d1[3], d1[3], 7);
+
+  // Shuffle in duplicates of above[31] and store. Note that cases involving
+  // {d0,d1}_ext require an extra shift to undo the shifting out of the final
+  // element from above.
+ + vst1q_u16(dst + 0 * stride + 0, d0[0]); + vst1q_u16(dst + 0 * stride + 8, d0[1]); + vst1q_u16(dst + 0 * stride + 16, d0[2]); + vst1q_u16(dst + 0 * stride + 24, d0[3]); + vst1q_u16(dst + 1 * stride + 0, d1[0]); + vst1q_u16(dst + 1 * stride + 8, d1[1]); + vst1q_u16(dst + 1 * stride + 16, d1[2]); + vst1q_u16(dst + 1 * stride + 24, d1[3]); + + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], d0[2], 1)); + vst1q_u16(dst + 2 * stride + 16, vextq_u16(d0[2], d0[3], 1)); + vst1q_u16(dst + 2 * stride + 24, vextq_u16(d0_ext, a31, 2)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[1], d1[2], 1)); + vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[2], d1[3], 1)); + vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1_ext, a31, 2)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], d0[2], 2)); + vst1q_u16(dst + 4 * stride + 16, vextq_u16(d0[2], d0[3], 2)); + vst1q_u16(dst + 4 * stride + 24, vextq_u16(d0_ext, a31, 3)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1[0], d1[1], 2)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1[1], d1[2], 2)); + vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[2], d1[3], 2)); + vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1_ext, a31, 3)); + + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], d0[2], 3)); + vst1q_u16(dst + 6 * stride + 16, vextq_u16(d0[2], d0[3], 3)); + vst1q_u16(dst + 6 * stride + 24, vextq_u16(d0_ext, a31, 4)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[2], d1[3], 3)); + vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1_ext, a31, 4)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0[1], d0[2], 4)); + vst1q_u16(dst + 8 * stride + 16, vextq_u16(d0[2], d0[3], 4)); + vst1q_u16(dst + 8 * stride + 24, vextq_u16(d0_ext, a31, 5)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1[0], d1[1], 4)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1[1], d1[2], 4)); + vst1q_u16(dst + 9 * stride + 16, vextq_u16(d1[2], d1[3], 4)); + vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1_ext, a31, 5)); + + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0[1], d0[2], 5)); + vst1q_u16(dst + 10 * stride + 16, vextq_u16(d0[2], d0[3], 5)); + vst1q_u16(dst + 10 * stride + 24, vextq_u16(d0_ext, a31, 6)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 11 * stride + 16, vextq_u16(d1[2], d1[3], 5)); + vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1_ext, a31, 6)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0[1], d0[2], 6)); + vst1q_u16(dst + 12 * stride + 16, vextq_u16(d0[2], d0[3], 6)); + vst1q_u16(dst + 12 * stride + 24, vextq_u16(d0_ext, a31, 7)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1[0], d1[1], 6)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1[1], d1[2], 6)); + vst1q_u16(dst + 13 * stride + 16, vextq_u16(d1[2], d1[3], 6)); + vst1q_u16(dst + 13 * stride + 24, vextq_u16(d1_ext, a31, 7)); + + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0[1], d0[2], 7)); + vst1q_u16(dst + 
14 * stride + 16, vextq_u16(d0[2], d0[3], 7)); + vst1q_u16(dst + 14 * stride + 24, a31); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 15 * stride + 16, vextq_u16(d1[2], d1[3], 7)); + vst1q_u16(dst + 15 * stride + 24, a31); + + vst1q_u16(dst + 16 * stride + 0, d0[1]); + vst1q_u16(dst + 16 * stride + 8, d0[2]); + vst1q_u16(dst + 16 * stride + 16, vextq_u16(d0_ext, a31, 1)); + vst1q_u16(dst + 16 * stride + 24, a31); + vst1q_u16(dst + 17 * stride + 0, d1[1]); + vst1q_u16(dst + 17 * stride + 8, d1[2]); + vst1q_u16(dst + 17 * stride + 16, vextq_u16(d1_ext, a31, 1)); + vst1q_u16(dst + 17 * stride + 24, a31); + + vst1q_u16(dst + 18 * stride + 0, vextq_u16(d0[1], d0[2], 1)); + vst1q_u16(dst + 18 * stride + 8, vextq_u16(d0[2], d0[3], 1)); + vst1q_u16(dst + 18 * stride + 16, vextq_u16(d0_ext, a31, 2)); + vst1q_u16(dst + 18 * stride + 24, a31); + vst1q_u16(dst + 19 * stride + 0, vextq_u16(d1[1], d1[2], 1)); + vst1q_u16(dst + 19 * stride + 8, vextq_u16(d1[2], d1[3], 1)); + vst1q_u16(dst + 19 * stride + 16, vextq_u16(d1_ext, a31, 2)); + vst1q_u16(dst + 19 * stride + 24, a31); + + vst1q_u16(dst + 20 * stride + 0, vextq_u16(d0[1], d0[2], 2)); + vst1q_u16(dst + 20 * stride + 8, vextq_u16(d0[2], d0[3], 2)); + vst1q_u16(dst + 20 * stride + 16, vextq_u16(d0_ext, a31, 3)); + vst1q_u16(dst + 20 * stride + 24, a31); + vst1q_u16(dst + 21 * stride + 0, vextq_u16(d1[1], d1[2], 2)); + vst1q_u16(dst + 21 * stride + 8, vextq_u16(d1[2], d1[3], 2)); + vst1q_u16(dst + 21 * stride + 16, vextq_u16(d1_ext, a31, 3)); + vst1q_u16(dst + 21 * stride + 24, a31); + + vst1q_u16(dst + 22 * stride + 0, vextq_u16(d0[1], d0[2], 3)); + vst1q_u16(dst + 22 * stride + 8, vextq_u16(d0[2], d0[3], 3)); + vst1q_u16(dst + 22 * stride + 16, vextq_u16(d0_ext, a31, 4)); + vst1q_u16(dst + 22 * stride + 24, a31); + vst1q_u16(dst + 23 * stride + 0, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 23 * stride + 8, vextq_u16(d1[2], d1[3], 3)); + vst1q_u16(dst + 23 * stride + 16, vextq_u16(d1_ext, a31, 4)); + vst1q_u16(dst + 23 * stride + 24, a31); + + vst1q_u16(dst + 24 * stride + 0, vextq_u16(d0[1], d0[2], 4)); + vst1q_u16(dst + 24 * stride + 8, vextq_u16(d0[2], d0[3], 4)); + vst1q_u16(dst + 24 * stride + 16, vextq_u16(d0_ext, a31, 5)); + vst1q_u16(dst + 24 * stride + 24, a31); + vst1q_u16(dst + 25 * stride + 0, vextq_u16(d1[1], d1[2], 4)); + vst1q_u16(dst + 25 * stride + 8, vextq_u16(d1[2], d1[3], 4)); + vst1q_u16(dst + 25 * stride + 16, vextq_u16(d1_ext, a31, 5)); + vst1q_u16(dst + 25 * stride + 24, a31); + + vst1q_u16(dst + 26 * stride + 0, vextq_u16(d0[1], d0[2], 5)); + vst1q_u16(dst + 26 * stride + 8, vextq_u16(d0[2], d0[3], 5)); + vst1q_u16(dst + 26 * stride + 16, vextq_u16(d0_ext, a31, 6)); + vst1q_u16(dst + 26 * stride + 24, a31); + vst1q_u16(dst + 27 * stride + 0, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 27 * stride + 8, vextq_u16(d1[2], d1[3], 5)); + vst1q_u16(dst + 27 * stride + 16, vextq_u16(d1_ext, a31, 6)); + vst1q_u16(dst + 27 * stride + 24, a31); + + vst1q_u16(dst + 28 * stride + 0, vextq_u16(d0[1], d0[2], 6)); + vst1q_u16(dst + 28 * stride + 8, vextq_u16(d0[2], d0[3], 6)); + vst1q_u16(dst + 28 * stride + 16, vextq_u16(d0_ext, a31, 7)); + vst1q_u16(dst + 28 * stride + 24, a31); + vst1q_u16(dst + 29 * stride + 0, vextq_u16(d1[1], d1[2], 6)); + vst1q_u16(dst + 29 * stride + 8, vextq_u16(d1[2], d1[3], 6)); + vst1q_u16(dst + 29 * stride + 16, vextq_u16(d1_ext, a31, 7)); + vst1q_u16(dst + 29 * stride + 24, a31); + + vst1q_u16(dst + 30 * 
stride + 0, vextq_u16(d0[1], d0[2], 7));
+  vst1q_u16(dst + 30 * stride + 8, vextq_u16(d0[2], d0[3], 7));
+  vst1q_u16(dst + 30 * stride + 16, a31);
+  vst1q_u16(dst + 30 * stride + 24, a31);
+  vst1q_u16(dst + 31 * stride + 0, vextq_u16(d1[1], d1[2], 7));
+  vst1q_u16(dst + 31 * stride + 8, vextq_u16(d1[2], d1[3], 7));
+  vst1q_u16(dst + 31 * stride + 16, a31);
+  vst1q_u16(dst + 31 * stride + 24, a31);
+}
+
+// -----------------------------------------------------------------------------
+
 void vpx_highbd_d117_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 652c553f97..48552a6f8d 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -217,7 +217,7 @@ ()
 specialize qw/vpx_highbd_d45_predictor_4x4 neon ssse3/;
 
 add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
-specialize qw/vpx_highbd_d63_predictor_4x4 sse2/;
+specialize qw/vpx_highbd_d63_predictor_4x4 neon sse2/;
 
 add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
 specialize qw/vpx_highbd_h_predictor_4x4 neon sse2/;
@@ -256,7 +256,7 @@ ()
 specialize qw/vpx_highbd_d45_predictor_8x8 neon ssse3/;
 
 add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
-specialize qw/vpx_highbd_d63_predictor_8x8 ssse3/;
+specialize qw/vpx_highbd_d63_predictor_8x8 neon ssse3/;
 
 add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
 specialize qw/vpx_highbd_h_predictor_8x8 neon sse2/;
@@ -295,7 +295,7 @@ ()
 specialize qw/vpx_highbd_d45_predictor_16x16 neon ssse3/;
 
 add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
-specialize qw/vpx_highbd_d63_predictor_16x16 ssse3/;
+specialize qw/vpx_highbd_d63_predictor_16x16 neon ssse3/;
 
 add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
 specialize qw/vpx_highbd_h_predictor_16x16 neon sse2/;
@@ -334,7 +334,7 @@ ()
 specialize qw/vpx_highbd_d45_predictor_32x32 neon ssse3/;
 
 add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
-specialize qw/vpx_highbd_d63_predictor_32x32 ssse3/;
+specialize qw/vpx_highbd_d63_predictor_32x32 neon ssse3/;
 
 add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
 specialize qw/vpx_highbd_h_predictor_32x32 neon sse2/;

From 8b0a60f91c2002d2ff319f755622b60dd70e213e Mon Sep 17 00:00:00 2001
From: George Steed
Date: Thu, 9 Feb 2023 16:12:59 +0000
Subject: [PATCH 575/926] Implement d153_predictor using Neon

Add Neon implementations of the d153 predictor for 4x4, 8x8, 16x16 and
32x32 block sizes. Also update tests to add new corresponding cases.

Speedups over the C code (higher is better):

Microarch.  | Compiler | Block | Speedup
Neoverse N1 | LLVM 15  |   4x4 |    1.59
Neoverse N1 | LLVM 15  |   8x8 |    4.46
Neoverse N1 | LLVM 15  | 16x16 |    8.77
Neoverse N1 | LLVM 15  | 32x32 |   15.21
Neoverse N1 | GCC 12   |   4x4 |    1.90
Neoverse N1 | GCC 12   |   8x8 |    4.70
Neoverse N1 | GCC 12   | 16x16 |    9.55
Neoverse N1 | GCC 12   | 32x32 |    5.95
Neoverse V1 | LLVM 15  |   4x4 |    2.89
Neoverse V1 | LLVM 15  |   8x8 |    6.94
Neoverse V1 | LLVM 15  | 16x16 |   10.20
Neoverse V1 | LLVM 15  | 32x32 |   15.63
Neoverse V1 | GCC 12   |   4x4 |    4.45
Neoverse V1 | GCC 12   |   8x8 |    7.71
Neoverse V1 | GCC 12   | 16x16 |    9.08
Neoverse V1 | GCC 12   | 32x32 |    7.93

Change-Id: I910692b14917cde8a8952fab5b9c78bed7f7c6ad
---
 test/test_intra_pred_speed.cc |  16 +--
 test/vp9_intrapred_test.cc    |   8 ++
 vpx_dsp/arm/intrapred_neon.c  | 254 ++++++++++++++++++++++++++++++++++
 vpx_dsp/vpx_dsp_rtcd_defs.pl  |   8 +-
 4 files changed, 274 insertions(+), 12 deletions(-)

diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index e721a459ad..871f778116 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -270,31 +270,31 @@ INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon,
                 vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon,
                 vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon,
                 vpx_d135_predictor_4x4_neon, vpx_d117_predictor_4x4_neon,
-                nullptr, nullptr, vpx_d63_predictor_4x4_neon,
-                vpx_tm_predictor_4x4_neon)
+                vpx_d153_predictor_4x4_neon, nullptr,
+                vpx_d63_predictor_4x4_neon, vpx_tm_predictor_4x4_neon)
 INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon,
                 vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon,
                 vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon,
                 vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon,
                 vpx_d135_predictor_8x8_neon, vpx_d117_predictor_8x8_neon,
-                nullptr, nullptr, vpx_d63_predictor_8x8_neon,
-                vpx_tm_predictor_8x8_neon)
+                vpx_d153_predictor_8x8_neon, nullptr,
+                vpx_d63_predictor_8x8_neon, vpx_tm_predictor_8x8_neon)
 INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon,
                 vpx_dc_left_predictor_16x16_neon,
                 vpx_dc_top_predictor_16x16_neon,
                 vpx_dc_128_predictor_16x16_neon, vpx_v_predictor_16x16_neon,
                 vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon,
                 vpx_d135_predictor_16x16_neon, vpx_d117_predictor_16x16_neon,
-                nullptr, nullptr, vpx_d63_predictor_16x16_neon,
-                vpx_tm_predictor_16x16_neon)
+                vpx_d153_predictor_16x16_neon, nullptr,
+                vpx_d63_predictor_16x16_neon, vpx_tm_predictor_16x16_neon)
 INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon,
                 vpx_dc_left_predictor_32x32_neon,
                 vpx_dc_top_predictor_32x32_neon,
                 vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon,
                 vpx_h_predictor_32x32_neon, vpx_d45_predictor_32x32_neon,
                 vpx_d135_predictor_32x32_neon, vpx_d117_predictor_32x32_neon,
-                nullptr, nullptr, vpx_d63_predictor_32x32_neon,
-                vpx_tm_predictor_32x32_neon)
+                vpx_d153_predictor_32x32_neon, nullptr,
+                vpx_d63_predictor_32x32_neon, vpx_tm_predictor_32x32_neon)
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc
index c4e0e78ac5..a2ea1334d8 100644
--- a/test/vp9_intrapred_test.cc
+++ b/test/vp9_intrapred_test.cc
@@ -267,6 +267,14 @@ INSTANTIATE_TEST_SUITE_P(
                        &vpx_d135_predictor_16x16_c, 16, 8),
         IntraPredParam(&vpx_d135_predictor_32x32_neon,
                        &vpx_d135_predictor_32x32_c, 32, 8),
+        IntraPredParam(&vpx_d153_predictor_4x4_neon, &vpx_d153_predictor_4x4_c,
+                       4, 8),
+        IntraPredParam(&vpx_d153_predictor_8x8_neon, &vpx_d153_predictor_8x8_c,
+                       8, 8),
+
IntraPredParam(&vpx_d153_predictor_16x16_neon, + &vpx_d153_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d153_predictor_32x32_neon, + &vpx_d153_predictor_32x32_c, 32, 8), IntraPredParam(&vpx_dc_128_predictor_4x4_neon, &vpx_dc_128_predictor_4x4_c, 4, 8), IntraPredParam(&vpx_dc_128_predictor_8x8_neon, diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index 629c7170c6..1ff4bf2955 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -1081,6 +1081,260 @@ void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, d135_store_32x2(&dst, stride, row_0, row_1, row_2); } +// ----------------------------------------------------------------------------- + +void vpx_d153_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d153_predictor_8x8_neon for more details on the implementation. + uint8x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02; + + az = load_unaligned_u8_4x1(above - 1); + a0 = load_unaligned_u8_4x1(above + 0); + // [ left[0], above[-1], above[0], above[1], x, x, x, x ] + l0az = vext_u8(vld1_dup_u8(left), az, 7); + + l0 = load_unaligned_u8_4x1(left + 0); + l1 = load_unaligned_u8_4x1(left + 1); + // [ above[-1], left[0], left[1], left[2], x, x, x, x ] + azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7); + + d0 = vrhadd_u8(azl0, l0); + d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); + d2 = vrhadd_u8(vhadd_u8(azl0, l1), l0); + + d02 = vrev64_u8(vzip_u8(d0, d2).val[0]); + + store_u8_4x1(dst + 0 * stride, vext_u8(d02, d1, 7)); + store_u8_4x1(dst + 1 * stride, vext_u8(d02, d1, 5)); + store_u8_4x1(dst + 2 * stride, vext_u8(d02, d1, 3)); + store_u8_4x1(dst + 3 * stride, vext_u8(d02, d1, 1)); +} + +void vpx_d153_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02_lo, d02_hi; + + az = vld1_u8(above - 1); + a0 = vld1_u8(above + 0); + // [ left[0], above[-1], ... , above[5] ] + l0az = vext_u8(vld1_dup_u8(left), az, 7); + + l0 = vld1_u8(left); + // The last lane here is unused, reading left[8] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[7], x ] + l1 = vext_u8(l0, l0, 1); + // [ above[-1], left[0], ... , left[6] ] + azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7); + + // d0[0] = AVG2(above[-1], left[0]) + // d0[1] = AVG2(left[0], left[1]) + // ... + // d0[7] = AVG2(left[6], left[7]) + d0 = vrhadd_u8(azl0, l0); + + // d1[0] = AVG3(left[0], above[-1], above[0]) + // d1[1] = AVG3(above[-1], above[0], above[1]) + // ... + // d1[7] = AVG3(above[5], above[6], above[7]) + d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); + + // d2[0] = AVG3(above[-1], left[0], left[1]) + // d2[1] = AVG3(left[0], left[1], left[2]) + // ... + // d2[6] = AVG3(left[5], left[6], left[7]) + // d2[7] = x (don't care) + d2 = vrhadd_u8(vhadd_u8(azl0, l1), l0); + + // The ext instruction shifts elements in from the end of the vector rather + // than the start, so reverse the vectors to put the elements to be shifted + // in at the end. The lowest lane of d02_lo is unused. 
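+  // Resulting layout (the lowest lane of d02_lo is never stored):
+  // d02_lo = [ d2[7], d0[7], d2[6], d0[6], d2[5], d0[5], d2[4], d0[4] ]
+  // d02_hi = [ d2[3], d0[3], d2[2], d0[2], d2[1], d0[1], d2[0], d0[0] ]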
+ d02_lo = vzip_u8(vrev64_u8(d2), vrev64_u8(d0)).val[0]; + d02_hi = vzip_u8(vrev64_u8(d2), vrev64_u8(d0)).val[1]; + + // Incrementally shift more elements from d0/d2 reversed into d1: + // stride=0 [ d0[0], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ] + // stride=1 [ d0[1], d2[0], d0[0], d1[0], d1[1], d1[2], d1[3], d1[4] ] + // stride=2 [ d0[2], d2[1], d0[1], d2[0], d0[0], d1[0], d1[1], d1[2] ] + // stride=3 [ d0[3], d2[2], d0[2], d2[1], d0[1], d2[0], d0[0], d1[0] ] + // stride=4 [ d0[4], d2[3], d0[3], d2[2], d0[2], d2[1], d0[1], d2[0] ] + // stride=5 [ d0[5], d2[4], d0[4], d2[3], d0[3], d2[2], d0[2], d2[1] ] + // stride=6 [ d0[6], d2[5], d0[5], d2[4], d0[4], d2[3], d0[3], d2[2] ] + // stride=7 [ d0[7], d2[6], d0[6], d2[5], d0[5], d2[4], d0[4], d2[3] ] + vst1_u8(dst + 0 * stride, vext_u8(d02_hi, d1, 7)); + vst1_u8(dst + 1 * stride, vext_u8(d02_hi, d1, 5)); + vst1_u8(dst + 2 * stride, vext_u8(d02_hi, d1, 3)); + vst1_u8(dst + 3 * stride, vext_u8(d02_hi, d1, 1)); + vst1_u8(dst + 4 * stride, vext_u8(d02_lo, d02_hi, 7)); + vst1_u8(dst + 5 * stride, vext_u8(d02_lo, d02_hi, 5)); + vst1_u8(dst + 6 * stride, vext_u8(d02_lo, d02_hi, 3)); + vst1_u8(dst + 7 * stride, vext_u8(d02_lo, d02_hi, 1)); +} + +void vpx_d153_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d153_predictor_8x8_neon for more details on the implementation. + uint8x16_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02_lo, d02_hi; + + az = vld1q_u8(above - 1); + a0 = vld1q_u8(above + 0); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u8(vld1q_dup_u8(left), az, 15); + + l0 = vld1q_u8(left + 0); + // The last lane here is unused, reading left[16] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[15], x ] + l1 = vextq_u8(l0, l0, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); + + d0 = vrhaddq_u8(azl0, l0); + d1 = vrhaddq_u8(vhaddq_u8(l0az, a0), az); + d2 = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); + + d0 = vrev64q_u8(vextq_u8(d0, d0, 8)); + d2 = vrev64q_u8(vextq_u8(d2, d2, 8)); + + // The lowest lane of d02_lo is unused. + d02_lo = vzipq_u8(d2, d0).val[0]; + d02_hi = vzipq_u8(d2, d0).val[1]; + + vst1q_u8(dst + 0 * stride, vextq_u8(d02_hi, d1, 15)); + vst1q_u8(dst + 1 * stride, vextq_u8(d02_hi, d1, 13)); + vst1q_u8(dst + 2 * stride, vextq_u8(d02_hi, d1, 11)); + vst1q_u8(dst + 3 * stride, vextq_u8(d02_hi, d1, 9)); + vst1q_u8(dst + 4 * stride, vextq_u8(d02_hi, d1, 7)); + vst1q_u8(dst + 5 * stride, vextq_u8(d02_hi, d1, 5)); + vst1q_u8(dst + 6 * stride, vextq_u8(d02_hi, d1, 3)); + vst1q_u8(dst + 7 * stride, vextq_u8(d02_hi, d1, 1)); + vst1q_u8(dst + 8 * stride, vextq_u8(d02_lo, d02_hi, 15)); + vst1q_u8(dst + 9 * stride, vextq_u8(d02_lo, d02_hi, 13)); + vst1q_u8(dst + 10 * stride, vextq_u8(d02_lo, d02_hi, 11)); + vst1q_u8(dst + 11 * stride, vextq_u8(d02_lo, d02_hi, 9)); + vst1q_u8(dst + 12 * stride, vextq_u8(d02_lo, d02_hi, 7)); + vst1q_u8(dst + 13 * stride, vextq_u8(d02_lo, d02_hi, 5)); + vst1q_u8(dst + 14 * stride, vextq_u8(d02_lo, d02_hi, 3)); + vst1q_u8(dst + 15 * stride, vextq_u8(d02_lo, d02_hi, 1)); +} + +void vpx_d153_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d153_predictor_8x8_neon for more details on the implementation. 
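+  // The 32x32 block is processed as lo/hi pairs of 16-lane vectors, with the
+  // hi halves of d0/d1/d2 built from the upper halves of the left and above
+  // arrays (l15/l16/l17 and a14/a15/a16).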
+ uint8x16_t az, a0, a14, a15, a16, l0az, l0, l1, l15, l16, l17, azl0, d0_lo, + d0_hi, d1_lo, d1_hi, d2_lo, d2_hi; + uint8x16x2_t d02_hi, d02_lo; + + az = vld1q_u8(above - 1); + a0 = vld1q_u8(above + 0); + a14 = vld1q_u8(above + 14); + a15 = vld1q_u8(above + 15); + a16 = vld1q_u8(above + 16); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u8(vld1q_dup_u8(left), az, 15); + + l0 = vld1q_u8(left); + l1 = vld1q_u8(left + 1); + l15 = vld1q_u8(left + 15); + l16 = vld1q_u8(left + 16); + // The last lane here is unused, reading left[32] would cause a buffer + // over-read (observed as an address-sanitizer failure), so just fill with a + // duplicate of left[16] to avoid needing to materialize a zero: + // [ left[17], ... , left[31], x ] + l17 = vextq_u8(l16, l16, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); + + d0_lo = vrhaddq_u8(azl0, l0); + d0_hi = vrhaddq_u8(l15, l16); + + d1_lo = vrhaddq_u8(vhaddq_u8(l0az, a0), az); + d1_hi = vrhaddq_u8(vhaddq_u8(a14, a16), a15); + + // The highest lane of d2_hi is unused. + d2_lo = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); + d2_hi = vrhaddq_u8(vhaddq_u8(l15, l17), l16); + + d0_lo = vrev64q_u8(vextq_u8(d0_lo, d0_lo, 8)); + d0_hi = vrev64q_u8(vextq_u8(d0_hi, d0_hi, 8)); + + d2_lo = vrev64q_u8(vextq_u8(d2_lo, d2_lo, 8)); + d2_hi = vrev64q_u8(vextq_u8(d2_hi, d2_hi, 8)); + + // d02_hi.val[0][0] is unused here. + d02_hi = vzipq_u8(d2_hi, d0_hi); + d02_lo = vzipq_u8(d2_lo, d0_lo); + + vst1q_u8(dst + 0 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 15)); + vst1q_u8(dst + 0 * stride + 16, vextq_u8(d1_lo, d1_hi, 15)); + vst1q_u8(dst + 1 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 13)); + vst1q_u8(dst + 1 * stride + 16, vextq_u8(d1_lo, d1_hi, 13)); + vst1q_u8(dst + 2 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 11)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d1_lo, d1_hi, 11)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 9)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_lo, d1_hi, 9)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 7)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d1_lo, d1_hi, 7)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 5)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_lo, d1_hi, 5)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 3)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d1_lo, d1_hi, 3)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 1)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_lo, d1_hi, 1)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 15)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 15)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 13)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 13)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 11)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 11)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 9)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 9)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 7)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 7)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 5)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 5)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 3)); + vst1q_u8(dst + 14 * stride + 
16, vextq_u8(d02_lo.val[1], d1_lo, 3)); + vst1q_u8(dst + 15 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 1)); + vst1q_u8(dst + 15 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 1)); + vst1q_u8(dst + 16 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 15)); + vst1q_u8(dst + 16 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 15)); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 13)); + vst1q_u8(dst + 17 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 13)); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 11)); + vst1q_u8(dst + 18 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 11)); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 9)); + vst1q_u8(dst + 19 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 9)); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 7)); + vst1q_u8(dst + 20 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 7)); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 5)); + vst1q_u8(dst + 21 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 5)); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 3)); + vst1q_u8(dst + 22 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 3)); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 1)); + vst1q_u8(dst + 23 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 1)); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 15)); + vst1q_u8(dst + 24 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 15)); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 13)); + vst1q_u8(dst + 25 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 13)); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 11)); + vst1q_u8(dst + 26 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 11)); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 9)); + vst1q_u8(dst + 27 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 9)); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 7)); + vst1q_u8(dst + 28 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 7)); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 5)); + vst1q_u8(dst + 29 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 5)); + vst1q_u8(dst + 30 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 3)); + vst1q_u8(dst + 30 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 3)); + vst1q_u8(dst + 31 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 1)); + vst1q_u8(dst + 31 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 1)); +} + +// ----------------------------------------------------------------------------- + #if !HAVE_NEON_ASM void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 48552a6f8d..1423de2689 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -63,7 +63,7 @@ () specialize qw/vpx_d135_predictor_4x4 neon/; add_proto qw/void vpx_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d153_predictor_4x4 ssse3/; +specialize qw/vpx_d153_predictor_4x4 neon ssse3/; add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_v_predictor_4x4 neon msa sse2/; @@ -108,7 +108,7 @@ () specialize 
qw/vpx_d135_predictor_8x8 neon/;
 
 add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d153_predictor_8x8 ssse3/;
+specialize qw/vpx_d153_predictor_8x8 neon ssse3/;
 
 add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_v_predictor_8x8 neon msa sse2/;
@@ -149,7 +149,7 @@ ()
 specialize qw/vpx_d135_predictor_16x16 neon/;
 
 add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d153_predictor_16x16 ssse3/;
+specialize qw/vpx_d153_predictor_16x16 neon ssse3/;
 
 add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_v_predictor_16x16 neon msa sse2 vsx/;
@@ -188,7 +188,7 @@ ()
 specialize qw/vpx_d135_predictor_32x32 neon/;
 
 add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d153_predictor_32x32 ssse3/;
+specialize qw/vpx_d153_predictor_32x32 neon ssse3/;
 
 add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_v_predictor_32x32 neon msa sse2 vsx/;

From 7e88600bf90787765c3e98d5f0bd7cf72b74d6ba Mon Sep 17 00:00:00 2001
From: George Steed
Date: Mon, 6 Mar 2023 09:27:41 +0000
Subject: [PATCH 576/926] Implement d117_predictor using Neon

Add Neon implementations of the d117 predictor for 4x4, 8x8, 16x16 and
32x32 block sizes. Also update tests to add new corresponding cases.

This re-lands commit 360e9069b6cc1dd3a004728b876fb923413f4b11,
previously reverted in commit 394de691a0ef570fc49943f565ad53ee0d22a7f3.

The implementation is mostly identical to the original but with an
adjustment to how data is loaded from the `left` array. In particular,
the left array cannot be guaranteed to be larger than the block size,
so the read of e.g. `left[32]` in the `bs=32` case is not valid. This
turns out not to be a problem since the last lane loaded in this case
is unused. I have added comments in the code to explain why this is
the case.

Since we cannot load the last element directly, we instead construct it
from the previous aligned read. This seems to have an inconsistent
effect on performance, improving by up to 10% in some cases and
regressing by up to 10% in others. Either way, it is still
significantly faster than the original C code.

Speedups over the C code (higher is better):

Microarch.  | Compiler | Block | Speedup
Neoverse N1 | LLVM 15  |   4x4 |    1.88
Neoverse N1 | LLVM 15  |   8x8 |    5.19
Neoverse N1 | LLVM 15  | 16x16 |    9.63
Neoverse N1 | LLVM 15  | 32x32 |   13.85
Neoverse N1 | GCC 12   |   4x4 |    2.04
Neoverse N1 | GCC 12   |   8x8 |    4.62
Neoverse N1 | GCC 12   | 16x16 |    9.79
Neoverse N1 | GCC 12   | 32x32 |    4.69
Neoverse V1 | LLVM 15  |   4x4 |    1.75
Neoverse V1 | LLVM 15  |   8x8 |    6.71
Neoverse V1 | LLVM 15  | 16x16 |    9.62
Neoverse V1 | LLVM 15  | 32x32 |   13.81
Neoverse V1 | GCC 12   |   4x4 |    1.75
Neoverse V1 | GCC 12   |   8x8 |    6.01
Neoverse V1 | GCC 12   | 16x16 |    6.91
Neoverse V1 | GCC 12   | 32x32 |    4.39

Change-Id: Ia0977ff0b0eba2c41c7884b64e7c22ff9bc9549d
---
 test/test_intra_pred_speed.cc |  20 +--
 test/vp9_intrapred_test.cc    |   8 ++
 vpx_dsp/arm/intrapred_neon.c  | 252 ++++++++++++++++++++++++++++++++++
 vpx_dsp/vpx_dsp_rtcd_defs.pl  |   4 +
 4 files changed, 276 insertions(+), 8 deletions(-)

diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 30817553ff..24af471eaa 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -269,28 +269,32 @@ INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon,
                 vpx_dc_left_predictor_4x4_neon, vpx_dc_top_predictor_4x4_neon,
                 vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon,
                 vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon,
-                vpx_d135_predictor_4x4_neon, nullptr, nullptr, nullptr,
-                vpx_d63_predictor_4x4_neon, vpx_tm_predictor_4x4_neon)
+                vpx_d135_predictor_4x4_neon, vpx_d117_predictor_4x4_neon,
+                nullptr, nullptr, vpx_d63_predictor_4x4_neon,
+                vpx_tm_predictor_4x4_neon)
 INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon,
                 vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon,
                 vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon,
                 vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon,
-                vpx_d135_predictor_8x8_neon, nullptr, nullptr, nullptr,
-                vpx_d63_predictor_8x8_neon, vpx_tm_predictor_8x8_neon)
+                vpx_d135_predictor_8x8_neon, vpx_d117_predictor_8x8_neon,
+                nullptr, nullptr, vpx_d63_predictor_8x8_neon,
+                vpx_tm_predictor_8x8_neon)
 INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon,
                 vpx_dc_left_predictor_16x16_neon,
                 vpx_dc_top_predictor_16x16_neon,
                 vpx_dc_128_predictor_16x16_neon, vpx_v_predictor_16x16_neon,
                 vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon,
-                vpx_d135_predictor_16x16_neon, nullptr, nullptr, nullptr,
-                vpx_d63_predictor_16x16_neon, vpx_tm_predictor_16x16_neon)
+                vpx_d135_predictor_16x16_neon, vpx_d117_predictor_16x16_neon,
+                nullptr, nullptr, vpx_d63_predictor_16x16_neon,
+                vpx_tm_predictor_16x16_neon)
 INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon,
                 vpx_dc_left_predictor_32x32_neon,
                 vpx_dc_top_predictor_32x32_neon,
                 vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon,
                 vpx_h_predictor_32x32_neon, vpx_d45_predictor_32x32_neon,
-                vpx_d135_predictor_32x32_neon, nullptr, nullptr, nullptr,
-                vpx_d63_predictor_32x32_neon, vpx_tm_predictor_32x32_neon)
+                vpx_d135_predictor_32x32_neon, vpx_d117_predictor_32x32_neon,
+                nullptr, nullptr, vpx_d63_predictor_32x32_neon,
+                vpx_tm_predictor_32x32_neon)
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc
index 7f8e1c5b51..83e371df6e 100644
--- a/test/vp9_intrapred_test.cc
+++ b/test/vp9_intrapred_test.cc
@@ -251,6 +251,14 @@ INSTANTIATE_TEST_SUITE_P(
                        &vpx_d63_predictor_16x16_c, 16, 8),
         IntraPredParam(&vpx_d63_predictor_32x32_neon,
                        &vpx_d63_predictor_32x32_c, 32, 8),
+        IntraPredParam(&vpx_d117_predictor_4x4_neon, &vpx_d117_predictor_4x4_c,
+ 4, 8), + IntraPredParam(&vpx_d117_predictor_8x8_neon, &vpx_d117_predictor_8x8_c, + 8, 8), + IntraPredParam(&vpx_d117_predictor_16x16_neon, + &vpx_d117_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d117_predictor_32x32_neon, + &vpx_d117_predictor_32x32_c, 32, 8), IntraPredParam(&vpx_d135_predictor_4x4_neon, &vpx_d135_predictor_4x4_c, 4, 8), IntraPredParam(&vpx_d135_predictor_8x8_neon, &vpx_d135_predictor_8x8_c, diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index 02a05aae53..629c7170c6 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -545,6 +545,258 @@ void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, // ----------------------------------------------------------------------------- +void vpx_d117_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d117_predictor_8x8_neon for more details on the implementation. + uint8x8_t az, a0, l0az, d0, d1, d2, d3, col0, col1; + + az = load_unaligned_u8_4x1(above - 1); + a0 = load_unaligned_u8_4x1(above + 0); + // [ left[0], above[-1], above[0], above[1], x, x, x, x ] + l0az = vext_u8(vld1_dup_u8(left), az, 7); + + col0 = vdup_n_u8((above[-1] + 2 * left[0] + left[1] + 2) >> 2); + col1 = vdup_n_u8((left[0] + 2 * left[1] + left[2] + 2) >> 2); + + d0 = vrhadd_u8(az, a0); + d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); + d2 = vext_u8(col0, d0, 7); + d3 = vext_u8(col1, d1, 7); + + store_u8_4x1(dst + 0 * stride, d0); + store_u8_4x1(dst + 1 * stride, d1); + store_u8_4x1(dst + 2 * stride, d2); + store_u8_4x1(dst + 3 * stride, d3); +} + +void vpx_d117_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd; + + az = vld1_u8(above - 1); + a0 = vld1_u8(above + 0); + // [ left[0], above[-1], ... , above[5] ] + l0az = vext_u8(vld1_dup_u8(left), az, 7); + + l0 = vld1_u8(left + 0); + // The last lane here is unused, reading left[8] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[7], x ] + l1 = vext_u8(l0, l0, 1); + // [ above[-1], left[0], ... , left[6] ] + azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7); + + // d0[0] = AVG2(above[-1], above[0]) + // d0[1] = AVG2(above[0], above[1]) + // ... + // d0[7] = AVG2(above[6], above[7]) + d0 = vrhadd_u8(az, a0); + + // d1[0] = AVG3(left[0], above[-1], above[0]) + // d1[1] = AVG3(above[-1], above[0], above[1]) + // ... + // d1[7] = AVG3(above[5], above[6], above[7]) + d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); + + // The ext instruction shifts elements in from the end of the vector rather + // than the start, so reverse the vector to put the elements to be shifted in + // at the end. The lowest two lanes here are unused: + // col0[7] = AVG3(above[-1], left[0], left[1]) + // col0[6] = AVG3(left[0], left[1], left[2]) + // ... 
+ // col0[2] = AVG3(left[4], left[5], left[6]) + // col0[1] = x (don't care) + // col0[0] = x (don't care) + col0 = vrev64_u8(vrhadd_u8(vhadd_u8(azl0, l1), l0)); + + // We don't care about the first parameter to this uzp since we only ever use + // the high three elements, we just use col0 again since it is already + // available: + // col0_even = [ x, x, x, x, x, col0[3], col0[5], col0[7] ] + // col0_odd = [ x, x, x, x, x, col0[2], col0[4], col0[6] ] + col0_even = vuzp_u8(col0, col0).val[1]; + col0_odd = vuzp_u8(col0, col0).val[0]; + + // Incrementally shift more elements from col0 into d0/1: + // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ] + // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ] + // stride=2 [ col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6] ] + // stride=3 [ col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ] + // stride=4 [ col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5] ] + // stride=5 [ col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5] ] + // stride=6 [ col0[3], col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4] ] + // stride=7 [ col0[2], col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4] ] + vst1_u8(dst + 0 * stride, d0); + vst1_u8(dst + 1 * stride, d1); + vst1_u8(dst + 2 * stride, vext_u8(col0_even, d0, 7)); + vst1_u8(dst + 3 * stride, vext_u8(col0_odd, d1, 7)); + vst1_u8(dst + 4 * stride, vext_u8(col0_even, d0, 6)); + vst1_u8(dst + 5 * stride, vext_u8(col0_odd, d1, 6)); + vst1_u8(dst + 6 * stride, vext_u8(col0_even, d0, 5)); + vst1_u8(dst + 7 * stride, vext_u8(col0_odd, d1, 5)); +} + +void vpx_d117_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d117_predictor_8x8_neon for more details on the implementation. + uint8x16_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd; + + az = vld1q_u8(above - 1); + a0 = vld1q_u8(above + 0); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u8(vld1q_dup_u8(left), az, 15); + + l0 = vld1q_u8(left + 0); + // The last lane here is unused, reading left[16] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[15], x ] + l1 = vextq_u8(l0, l0, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); + + d0 = vrhaddq_u8(az, a0); + d1 = vrhaddq_u8(vhaddq_u8(l0az, a0), az); + + col0 = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); + col0 = vrev64q_u8(vextq_u8(col0, col0, 8)); + + // The low nine lanes here are unused so the first input to the uzp is + // unused, so just use a duplicate of col0 since we have it already. This + // also means that the lowest lane of col0 here is unused. 
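+  // As in the 8x8 case, col0_even/col0_odd pick out alternating column
+  // values in reverse order, so each successive pair of rows can shift one
+  // further column value into d0/d1.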
+ col0_even = vuzpq_u8(col0, col0).val[1]; + col0_odd = vuzpq_u8(col0, col0).val[0]; + + vst1q_u8(dst + 0 * stride, d0); + vst1q_u8(dst + 1 * stride, d1); + vst1q_u8(dst + 2 * stride, vextq_u8(col0_even, d0, 15)); + vst1q_u8(dst + 3 * stride, vextq_u8(col0_odd, d1, 15)); + vst1q_u8(dst + 4 * stride, vextq_u8(col0_even, d0, 14)); + vst1q_u8(dst + 5 * stride, vextq_u8(col0_odd, d1, 14)); + vst1q_u8(dst + 6 * stride, vextq_u8(col0_even, d0, 13)); + vst1q_u8(dst + 7 * stride, vextq_u8(col0_odd, d1, 13)); + vst1q_u8(dst + 8 * stride, vextq_u8(col0_even, d0, 12)); + vst1q_u8(dst + 9 * stride, vextq_u8(col0_odd, d1, 12)); + vst1q_u8(dst + 10 * stride, vextq_u8(col0_even, d0, 11)); + vst1q_u8(dst + 11 * stride, vextq_u8(col0_odd, d1, 11)); + vst1q_u8(dst + 12 * stride, vextq_u8(col0_even, d0, 10)); + vst1q_u8(dst + 13 * stride, vextq_u8(col0_odd, d1, 10)); + vst1q_u8(dst + 14 * stride, vextq_u8(col0_even, d0, 9)); + vst1q_u8(dst + 15 * stride, vextq_u8(col0_odd, d1, 9)); +} + +void vpx_d117_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d117_predictor_8x8_neon for more details on the implementation. + uint8x16_t az, a0, a14, a15, a16, l0az, d0_lo, d0_hi, d1_lo, d1_hi, l0, l1, + l15, l16, l17, azl0, col0_lo, col0_hi, col0_even, col0_odd; + + az = vld1q_u8(above - 1); + a0 = vld1q_u8(above + 0); + a14 = vld1q_u8(above + 14); + a15 = vld1q_u8(above + 15); + a16 = vld1q_u8(above + 16); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u8(vld1q_dup_u8(left), az, 15); + + l0 = vld1q_u8(left + 0); + l1 = vld1q_u8(left + 1); + l15 = vld1q_u8(left + 15); + l16 = vld1q_u8(left + 16); + // The last lane here is unused, reading left[32] would cause a buffer + // over-read (observed as an address-sanitizer failure), so just fill with a + // duplicate of left[16] to avoid needing to materialize a zero: + // [ left[17], ... , left[31], x ] + l17 = vextq_u8(l16, l16, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); + + d0_lo = vrhaddq_u8(az, a0); + d0_hi = vrhaddq_u8(a15, a16); + d1_lo = vrhaddq_u8(vhaddq_u8(l0az, a0), az); + d1_hi = vrhaddq_u8(vhaddq_u8(a14, a16), a15); + + // The last lane of col0_hi is unused here. + col0_lo = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); + col0_hi = vrhaddq_u8(vhaddq_u8(l15, l17), l16); + + col0_lo = vrev64q_u8(vextq_u8(col0_lo, col0_lo, 8)); + col0_hi = vrev64q_u8(vextq_u8(col0_hi, col0_hi, 8)); + + // The first lane of these are unused since they are only ever called as + // ext(col0, _, i) where i >= 1. 
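+  // Passing (col0_hi, col0_lo) to the uzp places the odd/even lanes of the
+  // reversed upper half before those of the lower half, keeping the column
+  // values in descending order for the ext shifts below.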
+ col0_even = vuzpq_u8(col0_hi, col0_lo).val[1]; + col0_odd = vuzpq_u8(col0_hi, col0_lo).val[0]; + + vst1q_u8(dst + 0 * stride + 0, d0_lo); + vst1q_u8(dst + 0 * stride + 16, d0_hi); + vst1q_u8(dst + 1 * stride + 0, d1_lo); + vst1q_u8(dst + 1 * stride + 16, d1_hi); + vst1q_u8(dst + 2 * stride + 0, vextq_u8(col0_even, d0_lo, 15)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_lo, d0_hi, 15)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(col0_odd, d1_lo, 15)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_lo, d1_hi, 15)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(col0_even, d0_lo, 14)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_lo, d0_hi, 14)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(col0_odd, d1_lo, 14)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_lo, d1_hi, 14)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(col0_even, d0_lo, 13)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_lo, d0_hi, 13)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(col0_odd, d1_lo, 13)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_lo, d1_hi, 13)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(col0_even, d0_lo, 12)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_lo, d0_hi, 12)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(col0_odd, d1_lo, 12)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_lo, d1_hi, 12)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(col0_even, d0_lo, 11)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_lo, d0_hi, 11)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(col0_odd, d1_lo, 11)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_lo, d1_hi, 11)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(col0_even, d0_lo, 10)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_lo, d0_hi, 10)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(col0_odd, d1_lo, 10)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_lo, d1_hi, 10)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(col0_even, d0_lo, 9)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_lo, d0_hi, 9)); + vst1q_u8(dst + 15 * stride + 0, vextq_u8(col0_odd, d1_lo, 9)); + vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_lo, d1_hi, 9)); + vst1q_u8(dst + 16 * stride + 0, vextq_u8(col0_even, d0_lo, 8)); + vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_lo, d0_hi, 8)); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(col0_odd, d1_lo, 8)); + vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_lo, d1_hi, 8)); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(col0_even, d0_lo, 7)); + vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_lo, d0_hi, 7)); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(col0_odd, d1_lo, 7)); + vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_lo, d1_hi, 7)); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(col0_even, d0_lo, 6)); + vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_lo, d0_hi, 6)); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(col0_odd, d1_lo, 6)); + vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_lo, d1_hi, 6)); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(col0_even, d0_lo, 5)); + vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_lo, d0_hi, 5)); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(col0_odd, d1_lo, 5)); + vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_lo, d1_hi, 5)); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(col0_even, d0_lo, 4)); + vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_lo, d0_hi, 4)); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(col0_odd, d1_lo, 4)); + vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_lo, d1_hi, 4)); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(col0_even, d0_lo, 3)); + vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_lo, d0_hi, 3)); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(col0_odd, d1_lo, 3)); + 
vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_lo, d1_hi, 3));
+  vst1q_u8(dst + 28 * stride + 0, vextq_u8(col0_even, d0_lo, 2));
+  vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_lo, d0_hi, 2));
+  vst1q_u8(dst + 29 * stride + 0, vextq_u8(col0_odd, d1_lo, 2));
+  vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_lo, d1_hi, 2));
+  vst1q_u8(dst + 30 * stride + 0, vextq_u8(col0_even, d0_lo, 1));
+  vst1q_u8(dst + 30 * stride + 16, vextq_u8(d0_lo, d0_hi, 1));
+  vst1q_u8(dst + 31 * stride + 0, vextq_u8(col0_odd, d1_lo, 1));
+  vst1q_u8(dst + 31 * stride + 16, vextq_u8(d1_lo, d1_hi, 1));
+}
+
+// -----------------------------------------------------------------------------
+
 void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
   const uint8x8_t XA0123 = vld1_u8(above - 1);
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 2301fbe328..652c553f97 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -57,6 +57,7 @@ ()
 add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 
 add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d117_predictor_4x4 neon/;
 
 add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d135_predictor_4x4 neon/;
@@ -101,6 +102,7 @@ ()
 specialize qw/vpx_h_predictor_8x8 neon dspr2 msa sse2/;
 
 add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d117_predictor_8x8 neon/;
 
 add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d135_predictor_8x8 neon/;
@@ -141,6 +143,7 @@ ()
 specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2 vsx/;
 
 add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d117_predictor_16x16 neon/;
 
 add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d135_predictor_16x16 neon/;
@@ -179,6 +182,7 @@ ()
 specialize qw/vpx_h_predictor_32x32 neon msa sse2 vsx/;
 
 add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d117_predictor_32x32 neon/;
 
 add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d135_predictor_32x32 neon/;

From 872476c66b31a56c5e54262a5e38481c8430f71c Mon Sep 17 00:00:00 2001
From: George Steed
Date: Tue, 14 Feb 2023 14:56:25 +0000
Subject: [PATCH 577/926] Implement d207_predictor using Neon

Add Neon implementations of the d207 predictor for 4x4, 8x8, 16x16 and
32x32 block sizes. Also update tests to add new corresponding cases.

Speedups over the C code (higher is better):

Microarch.  | Compiler | Block | Speedup
Neoverse N1 | LLVM 15  |   4x4 |    1.72
Neoverse N1 | LLVM 15  |   8x8 |    5.68
Neoverse N1 | LLVM 15  | 16x16 |   12.30
Neoverse N1 | LLVM 15  | 32x32 |   16.70
Neoverse N1 | GCC 12   |   4x4 |    1.71
Neoverse N1 | GCC 12   |   8x8 |    6.01
Neoverse N1 | GCC 12   | 16x16 |   12.40
Neoverse N1 | GCC 12   | 32x32 |    6.71
Neoverse V1 | LLVM 15  |   4x4 |    1.99
Neoverse V1 | LLVM 15  |   8x8 |    8.28
Neoverse V1 | LLVM 15  | 16x16 |   14.36
Neoverse V1 | LLVM 15  | 32x32 |   17.55
Neoverse V1 | GCC 12   |   4x4 |    1.99
Neoverse V1 | GCC 12   |   8x8 |    8.43
Neoverse V1 | GCC 12   | 16x16 |   14.41
Neoverse V1 | GCC 12   | 32x32 |    7.82

Change-Id: I250ab56edab3390b0bac9dc96995a4bf9a4da641
---
 test/test_intra_pred_speed.cc |   8 +-
 test/vp9_intrapred_test.cc    |   8 ++
 vpx_dsp/arm/intrapred_neon.c  | 194 ++++++++++++++++++++++++++++++++++
 vpx_dsp/arm/mem_neon.h        |  13 +++
 vpx_dsp/vpx_dsp_rtcd_defs.pl  |   8 +-
 5 files changed, 223 insertions(+), 8 deletions(-)

diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 871f778116..5792161b34 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -270,14 +270,14 @@ INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon,
                 vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon,
                 vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon,
                 vpx_d135_predictor_4x4_neon, vpx_d117_predictor_4x4_neon,
-                vpx_d153_predictor_4x4_neon, nullptr,
+                vpx_d153_predictor_4x4_neon, vpx_d207_predictor_4x4_neon,
                 vpx_d63_predictor_4x4_neon, vpx_tm_predictor_4x4_neon)
 INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon,
                 vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon,
                 vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon,
                 vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon,
                 vpx_d135_predictor_8x8_neon, vpx_d117_predictor_8x8_neon,
-                vpx_d153_predictor_8x8_neon, nullptr,
+                vpx_d153_predictor_8x8_neon, vpx_d207_predictor_8x8_neon,
                 vpx_d63_predictor_8x8_neon, vpx_tm_predictor_8x8_neon)
 INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon,
                 vpx_dc_left_predictor_16x16_neon,
@@ -285,7 +285,7 @@ INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon,
                 vpx_dc_128_predictor_16x16_neon, vpx_v_predictor_16x16_neon,
                 vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon,
                 vpx_d135_predictor_16x16_neon, vpx_d117_predictor_16x16_neon,
-                vpx_d153_predictor_16x16_neon, nullptr,
+                vpx_d153_predictor_16x16_neon, vpx_d207_predictor_16x16_neon,
                 vpx_d63_predictor_16x16_neon, vpx_tm_predictor_16x16_neon)
 INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon,
                 vpx_dc_left_predictor_32x32_neon,
@@ -293,7 +293,7 @@ INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon,
                 vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon,
                 vpx_h_predictor_32x32_neon, vpx_d45_predictor_32x32_neon,
                 vpx_d135_predictor_32x32_neon, vpx_d117_predictor_32x32_neon,
-                vpx_d153_predictor_32x32_neon, nullptr,
+                vpx_d153_predictor_32x32_neon, vpx_d207_predictor_32x32_neon,
                 vpx_d63_predictor_32x32_neon, vpx_tm_predictor_32x32_neon)
 #endif  // HAVE_NEON
 
diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc
index a2ea1334d8..8696c0a787 100644
--- a/test/vp9_intrapred_test.cc
+++ b/test/vp9_intrapred_test.cc
@@ -275,6 +275,14 @@ INSTANTIATE_TEST_SUITE_P(
                        &vpx_d153_predictor_16x16_c, 16, 8),
         IntraPredParam(&vpx_d153_predictor_32x32_neon,
                        &vpx_d153_predictor_32x32_c, 32, 8),
+        IntraPredParam(&vpx_d207_predictor_4x4_neon, &vpx_d207_predictor_4x4_c,
+                       4, 8),
+        IntraPredParam(&vpx_d207_predictor_8x8_neon, &vpx_d207_predictor_8x8_c,
+ 8, 8), + IntraPredParam(&vpx_d207_predictor_16x16_neon, + &vpx_d207_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d207_predictor_32x32_neon, + &vpx_d207_predictor_32x32_c, 32, 8), IntraPredParam(&vpx_dc_128_predictor_4x4_neon, &vpx_dc_128_predictor_4x4_c, 4, 8), IntraPredParam(&vpx_dc_128_predictor_8x8_neon, diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index 1ff4bf2955..892310f151 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -1335,6 +1335,200 @@ void vpx_d153_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, // ----------------------------------------------------------------------------- +void vpx_d207_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t l0, l3, l1, l2, c0, c1, c01, d0, d1; + (void)above; + + // We need the low half lanes here for the c0/c1 arithmetic but the high half + // lanes for the ext: + // [ left[0], left[1], left[2], left[3], left[0], left[1], left[2], left[3] ] + l0 = load_replicate_u8_4x1(left + 0); + l3 = vld1_dup_u8(left + 3); + + // [ left[1], left[2], left[3], left[3], x, x, x, x ] + l1 = vext_u8(l0, l3, 5); + // [ left[2], left[3], left[3], left[3], x, x, x, x ] + l2 = vext_u8(l0, l3, 6); + + c0 = vrhadd_u8(l0, l1); + c1 = vrhadd_u8(vhadd_u8(l0, l2), l1); + + // [ c0[0], c1[0], c0[1], c1[1], c0[2], c1[2], c0[3], c1[3] ] + c01 = vzip_u8(c0, c1).val[0]; + + d0 = c01; + d1 = vext_u8(c01, l3, 2); + + // Store the high half of the vector for stride={2,3} to avoid needing + // additional ext instructions: + // stride=0 [ c0[0], c1[0], c0[1], c1[1] ] + // stride=1 [ c0[1], c1[1], c0[2], c1[2] ] + // stride=2 [ c0[2], c1[2], c0[3], c1[3] ] + // stride=3 [ c0[3], c1[3], left[3], left[3] ] + store_u8_4x1(dst + 0 * stride, d0); + store_u8_4x1(dst + 1 * stride, d1); + store_u8_4x1_high(dst + 2 * stride, d0); + store_u8_4x1_high(dst + 3 * stride, d1); +} + +void vpx_d207_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t l7, l0, l1, l2, c0, c1, c01_lo, c01_hi; + (void)above; + + l0 = vld1_u8(left + 0); + l7 = vld1_dup_u8(left + 7); + + // [ left[1], left[2], left[3], left[4], left[5], left[6], left[7], left[7] ] + l1 = vext_u8(l0, l7, 1); + // [ left[2], left[3], left[4], left[5], left[6], left[7], left[7], left[7] ] + l2 = vext_u8(l0, l7, 2); + + c0 = vrhadd_u8(l0, l1); + c1 = vrhadd_u8(vhadd_u8(l0, l2), l1); + + c01_lo = vzip_u8(c0, c1).val[0]; + c01_hi = vzip_u8(c0, c1).val[1]; + + vst1_u8(dst + 0 * stride, c01_lo); + vst1_u8(dst + 1 * stride, vext_u8(c01_lo, c01_hi, 2)); + vst1_u8(dst + 2 * stride, vext_u8(c01_lo, c01_hi, 4)); + vst1_u8(dst + 3 * stride, vext_u8(c01_lo, c01_hi, 6)); + vst1_u8(dst + 4 * stride, c01_hi); + vst1_u8(dst + 5 * stride, vext_u8(c01_hi, l7, 2)); + vst1_u8(dst + 6 * stride, vext_u8(c01_hi, l7, 4)); + vst1_u8(dst + 7 * stride, vext_u8(c01_hi, l7, 6)); +} + +void vpx_d207_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x16_t l15, l0, l1, l2, c0, c1, c01_lo, c01_hi; + (void)above; + + l0 = vld1q_u8(left + 0); + l15 = vld1q_dup_u8(left + 15); + + l1 = vextq_u8(l0, l15, 1); + l2 = vextq_u8(l0, l15, 2); + + c0 = vrhaddq_u8(l0, l1); + c1 = vrhaddq_u8(vhaddq_u8(l0, l2), l1); + + c01_lo = vzipq_u8(c0, c1).val[0]; + c01_hi = vzipq_u8(c0, c1).val[1]; + + vst1q_u8(dst + 0 * stride, c01_lo); + vst1q_u8(dst + 1 * stride, vextq_u8(c01_lo, c01_hi, 2)); + vst1q_u8(dst + 2 * stride, vextq_u8(c01_lo, c01_hi, 4)); + 
vst1q_u8(dst + 3 * stride, vextq_u8(c01_lo, c01_hi, 6)); + vst1q_u8(dst + 4 * stride, vextq_u8(c01_lo, c01_hi, 8)); + vst1q_u8(dst + 5 * stride, vextq_u8(c01_lo, c01_hi, 10)); + vst1q_u8(dst + 6 * stride, vextq_u8(c01_lo, c01_hi, 12)); + vst1q_u8(dst + 7 * stride, vextq_u8(c01_lo, c01_hi, 14)); + vst1q_u8(dst + 8 * stride, c01_hi); + vst1q_u8(dst + 9 * stride, vextq_u8(c01_hi, l15, 2)); + vst1q_u8(dst + 10 * stride, vextq_u8(c01_hi, l15, 4)); + vst1q_u8(dst + 11 * stride, vextq_u8(c01_hi, l15, 6)); + vst1q_u8(dst + 12 * stride, vextq_u8(c01_hi, l15, 8)); + vst1q_u8(dst + 13 * stride, vextq_u8(c01_hi, l15, 10)); + vst1q_u8(dst + 14 * stride, vextq_u8(c01_hi, l15, 12)); + vst1q_u8(dst + 15 * stride, vextq_u8(c01_hi, l15, 14)); +} + +void vpx_d207_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x16_t l0_lo, l0_hi, l1_lo, l1_hi, l2_lo, l2_hi, l31, c0_lo, c0_hi, c1_lo, + c1_hi, c01[4]; + (void)above; + + l0_lo = vld1q_u8(left + 0); + l0_hi = vld1q_u8(left + 16); + l31 = vld1q_dup_u8(left + 31); + + l1_lo = vextq_u8(l0_lo, l0_hi, 1); + l1_hi = vextq_u8(l0_hi, l31, 1); + l2_lo = vextq_u8(l0_lo, l0_hi, 2); + l2_hi = vextq_u8(l0_hi, l31, 2); + + c0_lo = vrhaddq_u8(l0_lo, l1_lo); + c0_hi = vrhaddq_u8(l0_hi, l1_hi); + c1_lo = vrhaddq_u8(vhaddq_u8(l0_lo, l2_lo), l1_lo); + c1_hi = vrhaddq_u8(vhaddq_u8(l0_hi, l2_hi), l1_hi); + + c01[0] = vzipq_u8(c0_lo, c1_lo).val[0]; + c01[1] = vzipq_u8(c0_lo, c1_lo).val[1]; + c01[2] = vzipq_u8(c0_hi, c1_hi).val[0]; + c01[3] = vzipq_u8(c0_hi, c1_hi).val[1]; + + vst1q_u8(dst + 0 * stride + 0, c01[0]); + vst1q_u8(dst + 0 * stride + 16, c01[1]); + vst1q_u8(dst + 1 * stride + 0, vextq_u8(c01[0], c01[1], 2)); + vst1q_u8(dst + 1 * stride + 16, vextq_u8(c01[1], c01[2], 2)); + vst1q_u8(dst + 2 * stride + 0, vextq_u8(c01[0], c01[1], 4)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(c01[1], c01[2], 4)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(c01[0], c01[1], 6)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(c01[1], c01[2], 6)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(c01[0], c01[1], 8)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(c01[1], c01[2], 8)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(c01[0], c01[1], 10)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(c01[1], c01[2], 10)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(c01[0], c01[1], 12)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(c01[1], c01[2], 12)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(c01[0], c01[1], 14)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(c01[1], c01[2], 14)); + vst1q_u8(dst + 8 * stride + 0, c01[1]); + vst1q_u8(dst + 8 * stride + 16, c01[2]); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(c01[1], c01[2], 2)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(c01[2], c01[3], 2)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(c01[1], c01[2], 4)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(c01[2], c01[3], 4)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(c01[1], c01[2], 6)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(c01[2], c01[3], 6)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(c01[1], c01[2], 8)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(c01[2], c01[3], 8)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(c01[1], c01[2], 10)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(c01[2], c01[3], 10)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(c01[1], c01[2], 12)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(c01[2], c01[3], 12)); + vst1q_u8(dst + 15 * stride + 0, vextq_u8(c01[1], c01[2], 14)); + vst1q_u8(dst + 15 * stride + 16, vextq_u8(c01[2], c01[3], 14)); + vst1q_u8(dst + 
16 * stride + 0, c01[2]); + vst1q_u8(dst + 16 * stride + 16, c01[3]); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(c01[2], c01[3], 2)); + vst1q_u8(dst + 17 * stride + 16, vextq_u8(c01[3], l31, 2)); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(c01[2], c01[3], 4)); + vst1q_u8(dst + 18 * stride + 16, vextq_u8(c01[3], l31, 4)); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(c01[2], c01[3], 6)); + vst1q_u8(dst + 19 * stride + 16, vextq_u8(c01[3], l31, 6)); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(c01[2], c01[3], 8)); + vst1q_u8(dst + 20 * stride + 16, vextq_u8(c01[3], l31, 8)); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(c01[2], c01[3], 10)); + vst1q_u8(dst + 21 * stride + 16, vextq_u8(c01[3], l31, 10)); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(c01[2], c01[3], 12)); + vst1q_u8(dst + 22 * stride + 16, vextq_u8(c01[3], l31, 12)); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(c01[2], c01[3], 14)); + vst1q_u8(dst + 23 * stride + 16, vextq_u8(c01[3], l31, 14)); + vst1q_u8(dst + 24 * stride + 0, c01[3]); + vst1q_u8(dst + 24 * stride + 16, l31); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(c01[3], l31, 2)); + vst1q_u8(dst + 25 * stride + 16, l31); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(c01[3], l31, 4)); + vst1q_u8(dst + 26 * stride + 16, l31); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(c01[3], l31, 6)); + vst1q_u8(dst + 27 * stride + 16, l31); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(c01[3], l31, 8)); + vst1q_u8(dst + 28 * stride + 16, l31); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(c01[3], l31, 10)); + vst1q_u8(dst + 29 * stride + 16, l31); + vst1q_u8(dst + 30 * stride + 0, vextq_u8(c01[3], l31, 12)); + vst1q_u8(dst + 30 * stride + 16, l31); + vst1q_u8(dst + 31 * stride + 0, vextq_u8(c01[3], l31, 14)); + vst1q_u8(dst + 31 * stride + 16, l31); +} + +// ----------------------------------------------------------------------------- + #if !HAVE_NEON_ASM void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index fb8ab17780..400846b707 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -132,11 +132,24 @@ static INLINE uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) { return vreinterpret_u8_u32(a_u32); } +// Load 4 contiguous bytes and replicate across a vector when alignment is not +// guaranteed. +static INLINE uint8x8_t load_replicate_u8_4x1(const uint8_t *buf) { + uint32_t a; + memcpy(&a, buf, 4); + return vreinterpret_u8_u32(vdup_n_u32(a)); +} + // Store 4 contiguous bytes from the low half of an 8x8 vector. static INLINE void store_u8_4x1(uint8_t *buf, uint8x8_t a) { vst1_lane_u32((uint32_t *)buf, vreinterpret_u32_u8(a), 0); } +// Store 4 contiguous bytes from the high half of an 8x8 vector. +static INLINE void store_u8_4x1_high(uint8_t *buf, uint8x8_t a) { + vst1_lane_u32((uint32_t *)buf, vreinterpret_u32_u8(a), 1); +} + // Load 2 sets of 4 bytes when alignment is not guaranteed. 
static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, ptrdiff_t stride) { diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 1423de2689..9c15912ca2 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -38,7 +38,7 @@ () # add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d207_predictor_4x4 sse2/; +specialize qw/vpx_d207_predictor_4x4 neon sse2/; add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d45_predictor_4x4 neon sse2/; @@ -87,7 +87,7 @@ () specialize qw/vpx_dc_128_predictor_4x4 msa neon sse2/; add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d207_predictor_8x8 ssse3/; +specialize qw/vpx_d207_predictor_8x8 neon ssse3/; add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; # TODO(crbug.com/webm/1522): Re-enable vsx implementation. @@ -131,7 +131,7 @@ () specialize qw/vpx_dc_128_predictor_8x8 neon msa sse2/; add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d207_predictor_16x16 ssse3/; +specialize qw/vpx_d207_predictor_16x16 neon ssse3/; add_proto qw/void vpx_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d45_predictor_16x16 neon ssse3 vsx/; @@ -170,7 +170,7 @@ () specialize qw/vpx_dc_128_predictor_16x16 neon msa sse2 vsx/; add_proto qw/void vpx_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d207_predictor_32x32 ssse3/; +specialize qw/vpx_d207_predictor_32x32 neon ssse3/; add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d45_predictor_32x32 neon ssse3 vsx/; From 33f3ae34144ea42bbf97d812ef23dccfc4bb8662 Mon Sep 17 00:00:00 2001 From: George Steed Date: Mon, 6 Mar 2023 13:24:47 +0000 Subject: [PATCH 578/926] Fix potential buffer over-read in highbd d117 predictor Neon The load of `left[bs]` in the standard bitdepth d117 Neon implementation triggered an address-sanitizer failure. The highbd equivalent does not appear to trigger any asan failures when running the VP9/ExternalFrameBufferMD5Test or VP9/TestVectorTest.MD5Match tests, but for consistency with the standard bitdepth implementation we adjust it to avoid the over-read. Performance is roughly identical, with a 0.8% performance improvement on average over the previous optimised code. 
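For reference, the transformation applied at each block size follows this
pattern (a minimal standalone sketch with an illustrative helper name, not
the exact library code; it assumes an 8-entry left column):

  #include <arm_neon.h>

  // vld1q_u16(left + 1) would read left[8], one element past the column.
  // Instead, load left[0..7] (fully in bounds) and rotate the lanes down
  // by one with EXT; the final lane, which the predictor never uses, ends
  // up holding a duplicate of left[0].
  static inline uint16x8_t load_left_shifted_by_one(const uint16_t *left) {
    const uint16x8_t l0 = vld1q_u16(left);  // [ left[0], ..., left[7] ]
    return vextq_u16(l0, l0, 1);            // [ left[1], ..., left[7], left[0] ]
  }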
Change-Id: I05dc4d43f244f4915c0ccc52cc0af999bbacb018
---
 vpx_dsp/arm/highbd_intrapred_neon.c | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c
index d1e335c263..dc1b27dc10 100644
--- a/vpx_dsp/arm/highbd_intrapred_neon.c
+++ b/vpx_dsp/arm/highbd_intrapred_neon.c
@@ -465,7 +465,11 @@ void vpx_highbd_d117_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
   l0az = vext_u16(vld1_dup_u16(left), az, 3);

   l0 = vld1_u16(left + 0);
-  l1 = vld1_u16(left + 1);
+  // The last lane here is unused, reading left[4] could cause a buffer
+  // over-read, so just fill with a duplicate of left[0] to avoid needing to
+  // materialize a zero:
+  // [ left[1], left[2], left[3], x ]
+  l1 = vext_u16(l0, l0, 1);

   // [ above[-1], left[0], left[1], left[2] ]
   azl0 = vext_u16(vld1_dup_u16(above - 1), l0, 3);
@@ -494,7 +498,11 @@ void vpx_highbd_d117_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
   l0az = vextq_u16(vld1q_dup_u16(left), az, 7);

   l0 = vld1q_u16(left + 0);
-  l1 = vld1q_u16(left + 1);
+  // The last lane here is unused, reading left[8] could cause a buffer
+  // over-read, so just fill with a duplicate of left[0] to avoid needing to
+  // materialize a zero:
+  // [ left[1], ... , left[7], x ]
+  l1 = vextq_u16(l0, l0, 1);

   // [ above[-1], left[0], ..., left[6] ]
   azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);
@@ -565,7 +573,11 @@ void vpx_highbd_d117_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
   l1 = vld1q_u16(left + 1);
   l7 = vld1q_u16(left + 7);
   l8 = vld1q_u16(left + 8);
-  l9 = vld1q_u16(left + 9);
+  // The last lane here is unused, reading left[16] could cause a buffer
+  // over-read, so just fill with a duplicate of left[8] to avoid needing to
+  // materialize a zero:
+  // [ left[9], ... , left[15], x ]
+  l9 = vextq_u16(l8, l8, 1);

   // [ above[-1], left[0], ..., left[6] ]
   azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);
@@ -658,6 +670,10 @@
   l23 = vld1q_u16(left + 23);
   l24 = vld1q_u16(left + 24);
-  l25 = vld1q_u16(left + 25);
+  // The last lane here is unused, reading left[32] could cause a buffer
+  // over-read, so just fill with a duplicate of left[24] to avoid needing to
+  // materialize a zero:
+  // [ left[25], ... , left[31], x ]
+  l25 = vextq_u16(l24, l24, 1);

   // [ above[-1], left[0], ..., left[6] ]
   azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);

From cf85ae9a49e5c0dfa71322275e99ac7accf0acaf Mon Sep 17 00:00:00 2001
From: George Steed
Date: Wed, 22 Feb 2023 15:33:37 +0000
Subject: [PATCH 579/926] Implement highbd_d153_predictor using Neon

Add Neon implementations of the highbd d153 predictor for 4x4, 8x8,
16x16 and 32x32 block sizes. Also update tests to add new corresponding
cases.

Speedups over the C code (higher is better):

Microarch.  | Compiler | Block | Speedup
Neoverse N1 | LLVM 15  | 4x4   | 1.71
Neoverse N1 | LLVM 15  | 8x8   | 4.05
Neoverse N1 | LLVM 15  | 16x16 | 7.04
Neoverse N1 | LLVM 15  | 32x32 | 7.71
Neoverse N1 | GCC 12   | 4x4   | 1.84
Neoverse N1 | GCC 12   | 8x8   | 4.19
Neoverse N1 | GCC 12   | 16x16 | 6.07
Neoverse N1 | GCC 12   | 32x32 | 3.14
Neoverse V1 | LLVM 15  | 4x4   | 3.19
Neoverse V1 | LLVM 15  | 8x8   | 5.51
Neoverse V1 | LLVM 15  | 16x16 | 7.73
Neoverse V1 | LLVM 15  | 32x32 | 7.72
Neoverse V1 | GCC 12   | 4x4   | 3.97
Neoverse V1 | GCC 12   | 8x8   | 5.52
Neoverse V1 | GCC 12   | 16x16 | 6.31
Neoverse V1 | GCC 12   | 32x32 | 5.36

Change-Id: I2bce6f1921d76d1c10d163e0cd4f395b40799184
---
 test/test_intra_pred_speed.cc | 16 +-
 test/vp9_intrapred_test.cc | 24 ++
 vpx_dsp/arm/highbd_intrapred_neon.c | 400 ++++++++++++++++++++++++++++
 vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 +-
 4 files changed, 436 insertions(+), 12 deletions(-)

diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 5792161b34..e334027ddc 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -566,16 +566,16 @@ HIGHBD_INTRA_PRED_TEST(
     vpx_highbd_dc_128_predictor_4x4_neon, vpx_highbd_v_predictor_4x4_neon,
     vpx_highbd_h_predictor_4x4_neon, vpx_highbd_d45_predictor_4x4_neon,
     vpx_highbd_d135_predictor_4x4_neon, vpx_highbd_d117_predictor_4x4_neon,
-    nullptr, nullptr, vpx_highbd_d63_predictor_4x4_neon,
-    vpx_highbd_tm_predictor_4x4_neon)
+    vpx_highbd_d153_predictor_4x4_neon, nullptr,
+    vpx_highbd_d63_predictor_4x4_neon, vpx_highbd_tm_predictor_4x4_neon)
 HIGHBD_INTRA_PRED_TEST(
     NEON, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_neon,
     vpx_highbd_dc_left_predictor_8x8_neon, vpx_highbd_dc_top_predictor_8x8_neon,
     vpx_highbd_dc_128_predictor_8x8_neon, vpx_highbd_v_predictor_8x8_neon,
     vpx_highbd_h_predictor_8x8_neon, vpx_highbd_d45_predictor_8x8_neon,
     vpx_highbd_d135_predictor_8x8_neon, vpx_highbd_d117_predictor_8x8_neon,
-    nullptr, nullptr, vpx_highbd_d63_predictor_8x8_neon,
-    vpx_highbd_tm_predictor_8x8_neon)
+    vpx_highbd_d153_predictor_8x8_neon, nullptr,
+    vpx_highbd_d63_predictor_8x8_neon, vpx_highbd_tm_predictor_8x8_neon)
 HIGHBD_INTRA_PRED_TEST(
     NEON, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_neon,
     vpx_highbd_dc_left_predictor_16x16_neon,
@@ -583,8 +583,8 @@ HIGHBD_INTRA_PRED_TEST(
     vpx_highbd_dc_128_predictor_16x16_neon, vpx_highbd_v_predictor_16x16_neon,
     vpx_highbd_h_predictor_16x16_neon, vpx_highbd_d45_predictor_16x16_neon,
     vpx_highbd_d135_predictor_16x16_neon, vpx_highbd_d117_predictor_16x16_neon,
-    nullptr, nullptr, vpx_highbd_d63_predictor_16x16_neon,
-    vpx_highbd_tm_predictor_16x16_neon)
+    vpx_highbd_d153_predictor_16x16_neon, nullptr,
+    vpx_highbd_d63_predictor_16x16_neon, vpx_highbd_tm_predictor_16x16_neon)
 HIGHBD_INTRA_PRED_TEST(
     NEON, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_neon,
     vpx_highbd_dc_left_predictor_32x32_neon,
@@ -592,8 +592,8 @@ HIGHBD_INTRA_PRED_TEST(
     vpx_highbd_dc_128_predictor_32x32_neon, vpx_highbd_v_predictor_32x32_neon,
     vpx_highbd_h_predictor_32x32_neon, vpx_highbd_d45_predictor_32x32_neon,
     vpx_highbd_d135_predictor_32x32_neon, vpx_highbd_d117_predictor_32x32_neon,
-    nullptr, nullptr, vpx_highbd_d63_predictor_32x32_neon,
-    vpx_highbd_tm_predictor_32x32_neon)
+    vpx_highbd_d153_predictor_32x32_neon, nullptr,
+    vpx_highbd_d63_predictor_32x32_neon, vpx_highbd_tm_predictor_32x32_neon)
 #endif  // HAVE_NEON

 #endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc
index 8696c0a787..d8ccd2db69 100644
--- a/test/vp9_intrapred_test.cc
+++ 
b/test/vp9_intrapred_test.cc @@ -888,6 +888,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d135_predictor_16x16_c, 16, 8), HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon, &vpx_highbd_d135_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_neon, + &vpx_highbd_d153_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_neon, + &vpx_highbd_d153_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_neon, + &vpx_highbd_d153_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon, + &vpx_highbd_d153_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, &vpx_highbd_dc_128_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, @@ -980,6 +988,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d135_predictor_16x16_c, 16, 10), HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon, &vpx_highbd_d135_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_neon, + &vpx_highbd_d153_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_neon, + &vpx_highbd_d153_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_neon, + &vpx_highbd_d153_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon, + &vpx_highbd_d153_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, &vpx_highbd_dc_128_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, @@ -1072,6 +1088,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d135_predictor_16x16_c, 16, 12), HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon, &vpx_highbd_d135_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_neon, + &vpx_highbd_d153_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_neon, + &vpx_highbd_d153_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_neon, + &vpx_highbd_d153_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon, + &vpx_highbd_d153_predictor_32x32_c, 32, 12), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, &vpx_highbd_dc_128_predictor_4x4_c, 4, 12), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c index 6b6ad95c12..4faecb575c 100644 --- a/vpx_dsp/arm/highbd_intrapred_neon.c +++ b/vpx_dsp/arm/highbd_intrapred_neon.c @@ -1178,6 +1178,406 @@ void vpx_highbd_d117_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, // ----------------------------------------------------------------------------- +void vpx_highbd_d153_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation. 
+ uint16x4_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d20_lo, d20_hi; + (void)bd; + + az = vld1_u16(above - 1); + a0 = vld1_u16(above + 0); + // [ left[0], above[-1], above[0], above[1] ] + l0az = vext_u16(vld1_dup_u16(left), az, 3); + + l0 = vld1_u16(left); + // The last lane here is unused, reading left[4] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], left[2], left[3], x ] + l1 = vext_u16(l0, l0, 1); + // [ above[-1], left[0], left[1], left[2] ] + azl0 = vext_u16(vld1_dup_u16(above - 1), l0, 3); + + d0 = vrhadd_u16(azl0, l0); + d1 = vrhadd_u16(vhadd_u16(l0az, a0), az); + d2 = vrhadd_u16(vhadd_u16(azl0, l1), l0); + + d20_lo = vzip_u16(vrev64_u16(d2), vrev64_u16(d0)).val[0]; + d20_hi = vzip_u16(vrev64_u16(d2), vrev64_u16(d0)).val[1]; + + // Incrementally shift more elements from d0/d2 reversed into d1: + // stride=0 [ d0[0], d1[0], d1[1], d1[2] ] + // stride=1 [ d0[1], d2[0], d0[0], d1[0] ] + // stride=2 [ d0[2], d2[1], d0[1], d2[0] ] + // stride=3 [ d0[3], d2[2], d0[2], d2[1] ] + vst1_u16(dst + 0 * stride, vext_u16(d20_hi, d1, 3)); + vst1_u16(dst + 1 * stride, vext_u16(d20_hi, d1, 1)); + vst1_u16(dst + 2 * stride, vext_u16(d20_lo, d20_hi, 3)); + vst1_u16(dst + 3 * stride, vext_u16(d20_lo, d20_hi, 1)); +} + +void vpx_highbd_d153_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d0_rev, d2_rev, d20_lo, + d20_hi; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + // [ left[0], above[-1], ... , above[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left); + // The last lane here is unused, reading left[8] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[7], x ] + l1 = vextq_u16(l0, l0, 1); + // [ above[-1], left[0], ... , left[6] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + // d0[0] = AVG2(above[-1], left[0]) + // d0[1] = AVG2(left[0], left[1]) + // ... + // d0[7] = AVG2(left[6], left[7]) + d0 = vrhaddq_u16(azl0, l0); + + // d1[0] = AVG3(left[0], above[-1], above[0]) + // d1[1] = AVG3(above[-1], above[0], above[1]) + // ... + // d1[7] = AVG3(above[5], above[6], above[7]) + d1 = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + + // d2[0] = AVG3(above[-1], left[0], left[1]) + // d2[1] = AVG3(left[0], left[1], left[2]) + // ... 
+ // d2[7] = AVG3(left[6], left[7], left[8]) + d2 = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + + // The ext instruction shifts elements in from the end of the vector rather + // than the start, so reverse the vectors to put the elements to be shifted + // in at the end: + d0_rev = vrev64q_u16(vextq_u16(d0, d0, 4)); + d2_rev = vrev64q_u16(vextq_u16(d2, d2, 4)); + + d20_lo = vzipq_u16(d2_rev, d0_rev).val[0]; + d20_hi = vzipq_u16(d2_rev, d0_rev).val[1]; + + // Incrementally shift more elements from d0/d2 reversed into d1: + // stride=0 [ d0[0], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ] + // stride=1 [ d0[1], d2[0], d0[0], d1[0], d1[1], d1[2], d1[3], d1[4] ] + // stride=2 [ d0[2], d2[1], d0[1], d2[0], d0[0], d1[0], d1[1], d1[2] ] + // stride=3 [ d0[3], d2[2], d0[2], d2[1], d0[1], d2[0], d0[0], d1[0] ] + // stride=4 [ d0[4], d2[3], d0[3], d2[2], d0[2], d2[1], d0[1], d2[0] ] + // stride=5 [ d0[5], d2[4], d0[4], d2[3], d0[3], d2[2], d0[2], d2[1] ] + // stride=6 [ d0[6], d2[5], d0[5], d2[4], d0[4], d2[3], d0[3], d2[2] ] + // stride=7 [ d0[7], d2[6], d0[6], d2[5], d0[5], d2[4], d0[4], d2[3] ] + vst1q_u16(dst + 0 * stride, vextq_u16(d20_hi, d1, 7)); + vst1q_u16(dst + 1 * stride, vextq_u16(d20_hi, d1, 5)); + vst1q_u16(dst + 2 * stride, vextq_u16(d20_hi, d1, 3)); + vst1q_u16(dst + 3 * stride, vextq_u16(d20_hi, d1, 1)); + vst1q_u16(dst + 4 * stride, vextq_u16(d20_lo, d20_hi, 7)); + vst1q_u16(dst + 5 * stride, vextq_u16(d20_lo, d20_hi, 5)); + vst1q_u16(dst + 6 * stride, vextq_u16(d20_lo, d20_hi, 3)); + vst1q_u16(dst + 7 * stride, vextq_u16(d20_lo, d20_hi, 1)); +} + +void vpx_highbd_d153_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation. + uint16x8_t az, a0, a6, a7, a8, l0az, l0, l1, l7, l8, l9, azl0, d0[2], d1[2], + d2[2], d20[4]; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + a6 = vld1q_u16(above + 6); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l7 = vld1q_u16(left + 7); + l8 = vld1q_u16(left + 8); + // The last lane here is unused, reading left[16] could cause a buffer + // over-read, so just fill with a duplicate of left[8] to avoid needing to + // materialize a zero: + // [ left[9], ... , left[15], x ] + l9 = vextq_u16(l8, l8, 1); + // [ above[-1], left[0], ... 
, left[14] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + d0[0] = vrhaddq_u16(azl0, l0); + d0[1] = vrhaddq_u16(l7, l8); + d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7); + d2[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + d2[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8); + + d0[0] = vrev64q_u16(vextq_u16(d0[0], d0[0], 4)); + d0[1] = vrev64q_u16(vextq_u16(d0[1], d0[1], 4)); + d2[0] = vrev64q_u16(vextq_u16(d2[0], d2[0], 4)); + d2[1] = vrev64q_u16(vextq_u16(d2[1], d2[1], 4)); + + d20[0] = vzipq_u16(d2[1], d0[1]).val[0]; + d20[1] = vzipq_u16(d2[1], d0[1]).val[1]; + d20[2] = vzipq_u16(d2[0], d0[0]).val[0]; + d20[3] = vzipq_u16(d2[0], d0[0]).val[1]; + + vst1q_u16(dst + 0 * stride + 0, vextq_u16(d20[3], d1[0], 7)); + vst1q_u16(dst + 0 * stride + 8, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(d20[3], d1[0], 5)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d20[3], d1[0], 3)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d20[3], d1[0], 1)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 1)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d20[3], d1[0], 7)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d20[3], d1[0], 5)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d20[3], d1[0], 3)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(d20[2], d20[3], 1)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d20[3], d1[0], 1)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d20[1], d20[2], 7)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d20[1], d20[2], 5)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d20[1], d20[2], 3)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d20[1], d20[2], 1)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d20[2], d20[3], 1)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d20[0], d20[1], 7)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d20[1], d20[2], 7)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d20[0], d20[1], 5)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d20[1], d20[2], 5)); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d20[0], d20[1], 3)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d20[1], d20[2], 3)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(d20[0], d20[1], 1)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d20[1], d20[2], 1)); +} + +void vpx_highbd_d153_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation. + uint16x8_t az, a0, a6, a7, a8, a14, a15, a16, a22, a23, a24, l0az, l0, l1, l7, + l8, l9, l15, l16, l17, l23, l24, l25, azl0, d0[4], d1[4], d2[4], d20[8]; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + a6 = vld1q_u16(above + 6); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + a14 = vld1q_u16(above + 14); + a15 = vld1q_u16(above + 15); + a16 = vld1q_u16(above + 16); + a22 = vld1q_u16(above + 22); + a23 = vld1q_u16(above + 23); + a24 = vld1q_u16(above + 24); + // [ left[0], above[-1], ... 
, above[13] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l7 = vld1q_u16(left + 7); + l8 = vld1q_u16(left + 8); + l9 = vld1q_u16(left + 9); + l15 = vld1q_u16(left + 15); + l16 = vld1q_u16(left + 16); + l17 = vld1q_u16(left + 17); + l23 = vld1q_u16(left + 23); + l24 = vld1q_u16(left + 24); + // The last lane here is unused, reading left[32] could cause a buffer + // over-read, so just fill with a duplicate of left[24] to avoid needing to + // materialize a zero: + // [ left[25], ... , left[31], x ] + l25 = vextq_u16(l24, l24, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + d0[0] = vrhaddq_u16(azl0, l0); + d0[1] = vrhaddq_u16(l7, l8); + d0[2] = vrhaddq_u16(l15, l16); + d0[3] = vrhaddq_u16(l23, l24); + + d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7); + d1[2] = vrhaddq_u16(vhaddq_u16(a14, a16), a15); + d1[3] = vrhaddq_u16(vhaddq_u16(a22, a24), a23); + + d2[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + d2[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8); + d2[2] = vrhaddq_u16(vhaddq_u16(l15, l17), l16); + d2[3] = vrhaddq_u16(vhaddq_u16(l23, l25), l24); + + d0[0] = vrev64q_u16(vextq_u16(d0[0], d0[0], 4)); + d0[1] = vrev64q_u16(vextq_u16(d0[1], d0[1], 4)); + d0[2] = vrev64q_u16(vextq_u16(d0[2], d0[2], 4)); + d0[3] = vrev64q_u16(vextq_u16(d0[3], d0[3], 4)); + d2[0] = vrev64q_u16(vextq_u16(d2[0], d2[0], 4)); + d2[1] = vrev64q_u16(vextq_u16(d2[1], d2[1], 4)); + d2[2] = vrev64q_u16(vextq_u16(d2[2], d2[2], 4)); + d2[3] = vrev64q_u16(vextq_u16(d2[3], d2[3], 4)); + + d20[0] = vzipq_u16(d2[3], d0[3]).val[0]; + d20[1] = vzipq_u16(d2[3], d0[3]).val[1]; + d20[2] = vzipq_u16(d2[2], d0[2]).val[0]; + d20[3] = vzipq_u16(d2[2], d0[2]).val[1]; + d20[4] = vzipq_u16(d2[1], d0[1]).val[0]; + d20[5] = vzipq_u16(d2[1], d0[1]).val[1]; + d20[6] = vzipq_u16(d2[0], d0[0]).val[0]; + d20[7] = vzipq_u16(d2[0], d0[0]).val[1]; + + vst1q_u16(dst + 0 * stride + 0, vextq_u16(d20[7], d1[0], 7)); + vst1q_u16(dst + 0 * stride + 8, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 0 * stride + 16, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 0 * stride + 24, vextq_u16(d1[2], d1[3], 7)); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(d20[7], d1[0], 5)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 1 * stride + 16, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 1 * stride + 24, vextq_u16(d1[2], d1[3], 5)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d20[7], d1[0], 3)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 2 * stride + 16, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 2 * stride + 24, vextq_u16(d1[2], d1[3], 3)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d20[7], d1[0], 1)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[1], d1[2], 1)); + vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1[2], d1[3], 1)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d20[6], d20[7], 7)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d20[7], d1[0], 7)); + vst1q_u16(dst + 4 * stride + 16, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 4 * stride + 24, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d20[6], d20[7], 5)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d20[7], d1[0], 5)); + vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 6 * stride + 0, 
vextq_u16(d20[6], d20[7], 3)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d20[7], d1[0], 3)); + vst1q_u16(dst + 6 * stride + 16, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 6 * stride + 24, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(d20[6], d20[7], 1)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d20[7], d1[0], 1)); + vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1[1], d1[2], 1)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d20[5], d20[6], 7)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d20[6], d20[7], 7)); + vst1q_u16(dst + 8 * stride + 16, vextq_u16(d20[7], d1[0], 7)); + vst1q_u16(dst + 8 * stride + 24, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d20[5], d20[6], 5)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d20[6], d20[7], 5)); + vst1q_u16(dst + 9 * stride + 16, vextq_u16(d20[7], d1[0], 5)); + vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d20[5], d20[6], 3)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d20[6], d20[7], 3)); + vst1q_u16(dst + 10 * stride + 16, vextq_u16(d20[7], d1[0], 3)); + vst1q_u16(dst + 10 * stride + 24, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d20[5], d20[6], 1)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d20[6], d20[7], 1)); + vst1q_u16(dst + 11 * stride + 16, vextq_u16(d20[7], d1[0], 1)); + vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1[0], d1[1], 1)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d20[4], d20[5], 7)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d20[5], d20[6], 7)); + vst1q_u16(dst + 12 * stride + 16, vextq_u16(d20[6], d20[7], 7)); + vst1q_u16(dst + 12 * stride + 24, vextq_u16(d20[7], d1[0], 7)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d20[4], d20[5], 5)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d20[5], d20[6], 5)); + vst1q_u16(dst + 13 * stride + 16, vextq_u16(d20[6], d20[7], 5)); + vst1q_u16(dst + 13 * stride + 24, vextq_u16(d20[7], d1[0], 5)); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d20[4], d20[5], 3)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d20[5], d20[6], 3)); + vst1q_u16(dst + 14 * stride + 16, vextq_u16(d20[6], d20[7], 3)); + vst1q_u16(dst + 14 * stride + 24, vextq_u16(d20[7], d1[0], 3)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(d20[4], d20[5], 1)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d20[5], d20[6], 1)); + vst1q_u16(dst + 15 * stride + 16, vextq_u16(d20[6], d20[7], 1)); + vst1q_u16(dst + 15 * stride + 24, vextq_u16(d20[7], d1[0], 1)); + + vst1q_u16(dst + 16 * stride + 0, vextq_u16(d20[3], d20[4], 7)); + vst1q_u16(dst + 16 * stride + 8, vextq_u16(d20[4], d20[5], 7)); + vst1q_u16(dst + 16 * stride + 16, vextq_u16(d20[5], d20[6], 7)); + vst1q_u16(dst + 16 * stride + 24, vextq_u16(d20[6], d20[7], 7)); + vst1q_u16(dst + 17 * stride + 0, vextq_u16(d20[3], d20[4], 5)); + vst1q_u16(dst + 17 * stride + 8, vextq_u16(d20[4], d20[5], 5)); + vst1q_u16(dst + 17 * stride + 16, vextq_u16(d20[5], d20[6], 5)); + vst1q_u16(dst + 17 * stride + 24, vextq_u16(d20[6], d20[7], 5)); + vst1q_u16(dst + 18 * stride + 0, vextq_u16(d20[3], d20[4], 3)); + vst1q_u16(dst + 18 * stride + 8, vextq_u16(d20[4], d20[5], 3)); + vst1q_u16(dst + 18 * stride + 16, vextq_u16(d20[5], d20[6], 3)); + vst1q_u16(dst + 18 * stride + 24, vextq_u16(d20[6], d20[7], 3)); + vst1q_u16(dst + 19 * stride + 0, vextq_u16(d20[3], d20[4], 1)); + vst1q_u16(dst + 19 * stride + 8, vextq_u16(d20[4], d20[5], 1)); + vst1q_u16(dst + 19 * stride + 
16, vextq_u16(d20[5], d20[6], 1)); + vst1q_u16(dst + 19 * stride + 24, vextq_u16(d20[6], d20[7], 1)); + + vst1q_u16(dst + 20 * stride + 0, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 20 * stride + 8, vextq_u16(d20[3], d20[4], 7)); + vst1q_u16(dst + 20 * stride + 16, vextq_u16(d20[4], d20[5], 7)); + vst1q_u16(dst + 20 * stride + 24, vextq_u16(d20[5], d20[6], 7)); + vst1q_u16(dst + 21 * stride + 0, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 21 * stride + 8, vextq_u16(d20[3], d20[4], 5)); + vst1q_u16(dst + 21 * stride + 16, vextq_u16(d20[4], d20[5], 5)); + vst1q_u16(dst + 21 * stride + 24, vextq_u16(d20[5], d20[6], 5)); + vst1q_u16(dst + 22 * stride + 0, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 22 * stride + 8, vextq_u16(d20[3], d20[4], 3)); + vst1q_u16(dst + 22 * stride + 16, vextq_u16(d20[4], d20[5], 3)); + vst1q_u16(dst + 22 * stride + 24, vextq_u16(d20[5], d20[6], 3)); + vst1q_u16(dst + 23 * stride + 0, vextq_u16(d20[2], d20[3], 1)); + vst1q_u16(dst + 23 * stride + 8, vextq_u16(d20[3], d20[4], 1)); + vst1q_u16(dst + 23 * stride + 16, vextq_u16(d20[4], d20[5], 1)); + vst1q_u16(dst + 23 * stride + 24, vextq_u16(d20[5], d20[6], 1)); + + vst1q_u16(dst + 24 * stride + 0, vextq_u16(d20[1], d20[2], 7)); + vst1q_u16(dst + 24 * stride + 8, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 24 * stride + 16, vextq_u16(d20[3], d20[4], 7)); + vst1q_u16(dst + 24 * stride + 24, vextq_u16(d20[4], d20[5], 7)); + vst1q_u16(dst + 25 * stride + 0, vextq_u16(d20[1], d20[2], 5)); + vst1q_u16(dst + 25 * stride + 8, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 25 * stride + 16, vextq_u16(d20[3], d20[4], 5)); + vst1q_u16(dst + 25 * stride + 24, vextq_u16(d20[4], d20[5], 5)); + vst1q_u16(dst + 26 * stride + 0, vextq_u16(d20[1], d20[2], 3)); + vst1q_u16(dst + 26 * stride + 8, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 26 * stride + 16, vextq_u16(d20[3], d20[4], 3)); + vst1q_u16(dst + 26 * stride + 24, vextq_u16(d20[4], d20[5], 3)); + vst1q_u16(dst + 27 * stride + 0, vextq_u16(d20[1], d20[2], 1)); + vst1q_u16(dst + 27 * stride + 8, vextq_u16(d20[2], d20[3], 1)); + vst1q_u16(dst + 27 * stride + 16, vextq_u16(d20[3], d20[4], 1)); + vst1q_u16(dst + 27 * stride + 24, vextq_u16(d20[4], d20[5], 1)); + + vst1q_u16(dst + 28 * stride + 0, vextq_u16(d20[0], d20[1], 7)); + vst1q_u16(dst + 28 * stride + 8, vextq_u16(d20[1], d20[2], 7)); + vst1q_u16(dst + 28 * stride + 16, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 28 * stride + 24, vextq_u16(d20[3], d20[4], 7)); + vst1q_u16(dst + 29 * stride + 0, vextq_u16(d20[0], d20[1], 5)); + vst1q_u16(dst + 29 * stride + 8, vextq_u16(d20[1], d20[2], 5)); + vst1q_u16(dst + 29 * stride + 16, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 29 * stride + 24, vextq_u16(d20[3], d20[4], 5)); + vst1q_u16(dst + 30 * stride + 0, vextq_u16(d20[0], d20[1], 3)); + vst1q_u16(dst + 30 * stride + 8, vextq_u16(d20[1], d20[2], 3)); + vst1q_u16(dst + 30 * stride + 16, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 30 * stride + 24, vextq_u16(d20[3], d20[4], 3)); + vst1q_u16(dst + 31 * stride + 0, vextq_u16(d20[0], d20[1], 1)); + vst1q_u16(dst + 31 * stride + 8, vextq_u16(d20[1], d20[2], 1)); + vst1q_u16(dst + 31 * stride + 16, vextq_u16(d20[2], d20[3], 1)); + vst1q_u16(dst + 31 * stride + 24, vextq_u16(d20[3], d20[4], 1)); +} + +// ----------------------------------------------------------------------------- + void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { diff --git 
a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 9c15912ca2..b3f655c2bd 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -229,7 +229,7 @@ ()
   specialize qw/vpx_highbd_d135_predictor_4x4 neon sse2/;

   add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d153_predictor_4x4 sse2/;
+  specialize qw/vpx_highbd_d153_predictor_4x4 neon sse2/;

   add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_v_predictor_4x4 neon sse2/;
@@ -268,7 +268,7 @@ ()
   specialize qw/vpx_highbd_d135_predictor_8x8 neon ssse3/;

   add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d153_predictor_8x8 ssse3/;
+  specialize qw/vpx_highbd_d153_predictor_8x8 neon ssse3/;

   add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_v_predictor_8x8 neon sse2/;
@@ -307,7 +307,7 @@ ()
   specialize qw/vpx_highbd_d135_predictor_16x16 neon ssse3/;

   add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d153_predictor_16x16 ssse3/;
+  specialize qw/vpx_highbd_d153_predictor_16x16 neon ssse3/;

   add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_v_predictor_16x16 neon sse2/;
@@ -346,7 +346,7 @@ ()
   specialize qw/vpx_highbd_d135_predictor_32x32 neon ssse3/;

   add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d153_predictor_32x32 ssse3/;
+  specialize qw/vpx_highbd_d153_predictor_32x32 neon ssse3/;

   add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_v_predictor_32x32 neon sse2/;

From 9e35c35945ff95d325d1617c8ebcbe80245ae651 Mon Sep 17 00:00:00 2001
From: George Steed
Date: Thu, 23 Feb 2023 16:25:38 +0000
Subject: [PATCH 580/926] Implement highbd_d207_predictor using Neon

Add Neon implementations of the highbd d207 predictor for 4x4, 8x8,
16x16 and 32x32 block sizes. Also update tests to add new corresponding
cases.

Speedups over the C code (higher is better):

Microarch.  | Compiler | Block | Speedup
Neoverse N1 | LLVM 15  | 4x4   | 1.61
Neoverse N1 | LLVM 15  | 8x8   | 5.30
Neoverse N1 | LLVM 15  | 16x16 | 8.93
Neoverse N1 | LLVM 15  | 32x32 | 8.35
Neoverse N1 | GCC 12   | 4x4   | 2.16
Neoverse N1 | GCC 12   | 8x8   | 5.75
Neoverse N1 | GCC 12   | 16x16 | 7.28
Neoverse N1 | GCC 12   | 32x32 | 3.31
Neoverse V1 | LLVM 15  | 4x4   | 1.71
Neoverse V1 | LLVM 15  | 8x8   | 7.46
Neoverse V1 | LLVM 15  | 16x16 | 10.09
Neoverse V1 | LLVM 15  | 32x32 | 8.10
Neoverse V1 | GCC 12   | 4x4   | 1.99
Neoverse V1 | GCC 12   | 8x8   | 7.81
Neoverse V1 | GCC 12   | 16x16 | 8.34
Neoverse V1 | GCC 12   | 32x32 | 5.74

Change-Id: Ic021e82eed0c7bc8263eb68606411354eb5e4870
---
 test/test_intra_pred_speed.cc | 8 +-
 test/vp9_intrapred_test.cc | 24 +++
 vpx_dsp/arm/highbd_intrapred_neon.c | 305 ++++++++++++++++++++++
 vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 +-
 4 files changed, 337 insertions(+), 8 deletions(-)

diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index e334027ddc..15303816b9 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -566,7 +566,7 @@ HIGHBD_INTRA_PRED_TEST(
     vpx_highbd_dc_128_predictor_4x4_neon, vpx_highbd_v_predictor_4x4_neon,
     vpx_highbd_h_predictor_4x4_neon, vpx_highbd_d45_predictor_4x4_neon,
     vpx_highbd_d135_predictor_4x4_neon, vpx_highbd_d117_predictor_4x4_neon,
-    vpx_highbd_d153_predictor_4x4_neon, nullptr,
+    vpx_highbd_d153_predictor_4x4_neon, vpx_highbd_d207_predictor_4x4_neon,
     vpx_highbd_d63_predictor_4x4_neon, vpx_highbd_tm_predictor_4x4_neon)
 HIGHBD_INTRA_PRED_TEST(
     NEON, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_neon,
@@ -574,7 +574,7 @@ HIGHBD_INTRA_PRED_TEST(
     vpx_highbd_dc_128_predictor_8x8_neon, vpx_highbd_v_predictor_8x8_neon,
     vpx_highbd_h_predictor_8x8_neon, vpx_highbd_d45_predictor_8x8_neon,
     vpx_highbd_d135_predictor_8x8_neon, vpx_highbd_d117_predictor_8x8_neon,
-    vpx_highbd_d153_predictor_8x8_neon, nullptr,
+    vpx_highbd_d153_predictor_8x8_neon, vpx_highbd_d207_predictor_8x8_neon,
     vpx_highbd_d63_predictor_8x8_neon, vpx_highbd_tm_predictor_8x8_neon)
 HIGHBD_INTRA_PRED_TEST(
     NEON, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_neon,
@@ -583,7 +583,7 @@ HIGHBD_INTRA_PRED_TEST(
     vpx_highbd_dc_128_predictor_16x16_neon, vpx_highbd_v_predictor_16x16_neon,
     vpx_highbd_h_predictor_16x16_neon, vpx_highbd_d45_predictor_16x16_neon,
     vpx_highbd_d135_predictor_16x16_neon, vpx_highbd_d117_predictor_16x16_neon,
-    vpx_highbd_d153_predictor_16x16_neon, nullptr,
+    vpx_highbd_d153_predictor_16x16_neon, vpx_highbd_d207_predictor_16x16_neon,
     vpx_highbd_d63_predictor_16x16_neon, vpx_highbd_tm_predictor_16x16_neon)
 HIGHBD_INTRA_PRED_TEST(
     NEON, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_neon,
@@ -592,7 +592,7 @@ HIGHBD_INTRA_PRED_TEST(
     vpx_highbd_dc_128_predictor_32x32_neon, vpx_highbd_v_predictor_32x32_neon,
     vpx_highbd_h_predictor_32x32_neon, vpx_highbd_d45_predictor_32x32_neon,
     vpx_highbd_d135_predictor_32x32_neon, vpx_highbd_d117_predictor_32x32_neon,
-    vpx_highbd_d153_predictor_32x32_neon, nullptr,
+    vpx_highbd_d153_predictor_32x32_neon, vpx_highbd_d207_predictor_32x32_neon,
     vpx_highbd_d63_predictor_32x32_neon, vpx_highbd_tm_predictor_32x32_neon)
 #endif  // HAVE_NEON

diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc
index d8ccd2db69..cec9031618 100644
--- a/test/vp9_intrapred_test.cc
+++ b/test/vp9_intrapred_test.cc
@@ -896,6 +896,14 @@ (
                          &vpx_highbd_d153_predictor_16x16_c, 16, 8),
     HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon,
                          &vpx_highbd_d153_predictor_32x32_c, 32, 8),
+    
HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_neon, + &vpx_highbd_d207_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_neon, + &vpx_highbd_d207_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_neon, + &vpx_highbd_d207_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_neon, + &vpx_highbd_d207_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, &vpx_highbd_dc_128_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, @@ -996,6 +1004,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d153_predictor_16x16_c, 16, 10), HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon, &vpx_highbd_d153_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_neon, + &vpx_highbd_d207_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_neon, + &vpx_highbd_d207_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_neon, + &vpx_highbd_d207_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_neon, + &vpx_highbd_d207_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, &vpx_highbd_dc_128_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, @@ -1096,6 +1112,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d153_predictor_16x16_c, 16, 12), HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon, &vpx_highbd_d153_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_neon, + &vpx_highbd_d207_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_neon, + &vpx_highbd_d207_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_neon, + &vpx_highbd_d207_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_neon, + &vpx_highbd_d207_predictor_32x32_c, 32, 12), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, &vpx_highbd_dc_128_predictor_4x4_c, 4, 12), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c index 4faecb575c..503900915d 100644 --- a/vpx_dsp/arm/highbd_intrapred_neon.c +++ b/vpx_dsp/arm/highbd_intrapred_neon.c @@ -1821,6 +1821,311 @@ void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ +void vpx_highbd_d207_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x4_t l0, l1, l2, l3, c0, c1, c01_lo, c01_hi; + (void)above; + (void)bd; + + l0 = vld1_u16(left + 0); + l3 = vld1_dup_u16(left + 3); + + // [ left[1], left[2], left[3], left[3] ] + l1 = vext_u16(l0, l3, 1); + // [ left[2], left[3], left[3], left[3] ] + l2 = vext_u16(l0, l3, 2); + + c0 = vrhadd_u16(l0, l1); + c1 = vrhadd_u16(vhadd_u16(l0, l2), l1); + + c01_lo = vzip_u16(c0, c1).val[0]; + c01_hi = vzip_u16(c0, c1).val[1]; + + // stride=0 [ c0[0], c1[0], c0[1], c1[1] ] + // stride=1 [ c0[1], c1[1], c0[2], c1[2] ] + // stride=2 [ c0[2], c1[2], c0[3], c1[3] ] + // stride=3 [ c0[3], c1[3], left[3], left[3] ] + vst1_u16(dst + 0 * stride, c01_lo); + vst1_u16(dst + 1 * stride, vext_u16(c01_lo, c01_hi, 2)); + vst1_u16(dst + 2 * stride, c01_hi); + vst1_u16(dst + 3 * stride, vext_u16(c01_hi, l3, 2)); +} + +void vpx_highbd_d207_predictor_8x8_neon(uint16_t *dst, 
ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t l0, l1, l2, l7, c0, c1, c01_lo, c01_hi; + (void)above; + (void)bd; + + l0 = vld1q_u16(left + 0); + l7 = vld1q_dup_u16(left + 7); + + // [ left[1], left[2], left[3], left[4], left[5], left[6], left[7], left[7] ] + l1 = vextq_u16(l0, l7, 1); + // [ left[2], left[3], left[4], left[5], left[6], left[7], left[7], left[7] ] + l2 = vextq_u16(l0, l7, 2); + + c0 = vrhaddq_u16(l0, l1); + c1 = vrhaddq_u16(vhaddq_u16(l0, l2), l1); + + c01_lo = vzipq_u16(c0, c1).val[0]; + c01_hi = vzipq_u16(c0, c1).val[1]; + + vst1q_u16(dst + 0 * stride, c01_lo); + vst1q_u16(dst + 1 * stride, vextq_u16(c01_lo, c01_hi, 2)); + vst1q_u16(dst + 2 * stride, vextq_u16(c01_lo, c01_hi, 4)); + vst1q_u16(dst + 3 * stride, vextq_u16(c01_lo, c01_hi, 6)); + vst1q_u16(dst + 4 * stride, c01_hi); + vst1q_u16(dst + 5 * stride, vextq_u16(c01_hi, l7, 2)); + vst1q_u16(dst + 6 * stride, vextq_u16(c01_hi, l7, 4)); + vst1q_u16(dst + 7 * stride, vextq_u16(c01_hi, l7, 6)); +} + +void vpx_highbd_d207_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t l0, l1, l2, l8, l9, l10, l15, c0[2], c1[2], c01[4]; + (void)above; + (void)bd; + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l2 = vld1q_u16(left + 2); + l8 = vld1q_u16(left + 8); + l15 = vld1q_dup_u16(left + 15); + + l9 = vextq_u16(l8, l15, 1); + l10 = vextq_u16(l8, l15, 2); + + c0[0] = vrhaddq_u16(l0, l1); + c0[1] = vrhaddq_u16(l8, l9); + c1[0] = vrhaddq_u16(vhaddq_u16(l0, l2), l1); + c1[1] = vrhaddq_u16(vhaddq_u16(l8, l10), l9); + + c01[0] = vzipq_u16(c0[0], c1[0]).val[0]; + c01[1] = vzipq_u16(c0[0], c1[0]).val[1]; + c01[2] = vzipq_u16(c0[1], c1[1]).val[0]; + c01[3] = vzipq_u16(c0[1], c1[1]).val[1]; + + vst1q_u16(dst + 0 * stride + 0, c01[0]); + vst1q_u16(dst + 0 * stride + 8, c01[1]); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(c01[0], c01[1], 2)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(c01[1], c01[2], 2)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(c01[0], c01[1], 4)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(c01[1], c01[2], 4)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(c01[0], c01[1], 6)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(c01[1], c01[2], 6)); + + vst1q_u16(dst + 4 * stride + 0, c01[1]); + vst1q_u16(dst + 4 * stride + 8, c01[2]); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(c01[1], c01[2], 2)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(c01[2], c01[3], 2)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(c01[1], c01[2], 4)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(c01[1], c01[2], 6)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(c01[2], c01[3], 6)); + + vst1q_u16(dst + 8 * stride + 0, c01[2]); + vst1q_u16(dst + 8 * stride + 8, c01[3]); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(c01[2], c01[3], 2)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(c01[3], l15, 2)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(c01[3], l15, 4)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(c01[2], c01[3], 6)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(c01[3], l15, 6)); + + vst1q_u16(dst + 12 * stride + 0, c01[3]); + vst1q_u16(dst + 12 * stride + 8, l15); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(c01[3], l15, 2)); + vst1q_u16(dst + 13 * stride + 8, l15); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(c01[3], l15, 4)); + vst1q_u16(dst + 14 * stride + 8, l15); + vst1q_u16(dst + 15 * stride + 
0, vextq_u16(c01[3], l15, 6)); + vst1q_u16(dst + 15 * stride + 8, l15); +} + +void vpx_highbd_d207_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t l0, l1, l2, l8, l9, l10, l16, l17, l18, l24, l25, l26, l31, c0[4], + c1[4], c01[8]; + (void)above; + (void)bd; + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l2 = vld1q_u16(left + 2); + l8 = vld1q_u16(left + 8); + l9 = vld1q_u16(left + 9); + l10 = vld1q_u16(left + 10); + l16 = vld1q_u16(left + 16); + l17 = vld1q_u16(left + 17); + l18 = vld1q_u16(left + 18); + l24 = vld1q_u16(left + 24); + l31 = vld1q_dup_u16(left + 31); + + l25 = vextq_u16(l24, l31, 1); + l26 = vextq_u16(l24, l31, 2); + + c0[0] = vrhaddq_u16(l0, l1); + c0[1] = vrhaddq_u16(l8, l9); + c0[2] = vrhaddq_u16(l16, l17); + c0[3] = vrhaddq_u16(l24, l25); + c1[0] = vrhaddq_u16(vhaddq_u16(l0, l2), l1); + c1[1] = vrhaddq_u16(vhaddq_u16(l8, l10), l9); + c1[2] = vrhaddq_u16(vhaddq_u16(l16, l18), l17); + c1[3] = vrhaddq_u16(vhaddq_u16(l24, l26), l25); + + c01[0] = vzipq_u16(c0[0], c1[0]).val[0]; + c01[1] = vzipq_u16(c0[0], c1[0]).val[1]; + c01[2] = vzipq_u16(c0[1], c1[1]).val[0]; + c01[3] = vzipq_u16(c0[1], c1[1]).val[1]; + c01[4] = vzipq_u16(c0[2], c1[2]).val[0]; + c01[5] = vzipq_u16(c0[2], c1[2]).val[1]; + c01[6] = vzipq_u16(c0[3], c1[3]).val[0]; + c01[7] = vzipq_u16(c0[3], c1[3]).val[1]; + + vst1q_u16(dst + 0 * stride + 0, c01[0]); + vst1q_u16(dst + 0 * stride + 8, c01[1]); + vst1q_u16(dst + 0 * stride + 16, c01[2]); + vst1q_u16(dst + 0 * stride + 24, c01[3]); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(c01[0], c01[1], 2)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(c01[1], c01[2], 2)); + vst1q_u16(dst + 1 * stride + 16, vextq_u16(c01[2], c01[3], 2)); + vst1q_u16(dst + 1 * stride + 24, vextq_u16(c01[3], c01[4], 2)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(c01[0], c01[1], 4)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(c01[1], c01[2], 4)); + vst1q_u16(dst + 2 * stride + 16, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 2 * stride + 24, vextq_u16(c01[3], c01[4], 4)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(c01[0], c01[1], 6)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(c01[1], c01[2], 6)); + vst1q_u16(dst + 3 * stride + 16, vextq_u16(c01[2], c01[3], 6)); + vst1q_u16(dst + 3 * stride + 24, vextq_u16(c01[3], c01[4], 6)); + + vst1q_u16(dst + 4 * stride + 0, c01[1]); + vst1q_u16(dst + 4 * stride + 8, c01[2]); + vst1q_u16(dst + 4 * stride + 16, c01[3]); + vst1q_u16(dst + 4 * stride + 24, c01[4]); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(c01[1], c01[2], 2)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(c01[2], c01[3], 2)); + vst1q_u16(dst + 5 * stride + 16, vextq_u16(c01[3], c01[4], 2)); + vst1q_u16(dst + 5 * stride + 24, vextq_u16(c01[4], c01[5], 2)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(c01[1], c01[2], 4)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 6 * stride + 16, vextq_u16(c01[3], c01[4], 4)); + vst1q_u16(dst + 6 * stride + 24, vextq_u16(c01[4], c01[5], 4)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(c01[1], c01[2], 6)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(c01[2], c01[3], 6)); + vst1q_u16(dst + 7 * stride + 16, vextq_u16(c01[3], c01[4], 6)); + vst1q_u16(dst + 7 * stride + 24, vextq_u16(c01[4], c01[5], 6)); + + vst1q_u16(dst + 8 * stride + 0, c01[2]); + vst1q_u16(dst + 8 * stride + 8, c01[3]); + vst1q_u16(dst + 8 * stride + 16, c01[4]); + vst1q_u16(dst + 8 * stride + 24, c01[5]); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(c01[2], 
c01[3], 2)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(c01[3], c01[4], 2)); + vst1q_u16(dst + 9 * stride + 16, vextq_u16(c01[4], c01[5], 2)); + vst1q_u16(dst + 9 * stride + 24, vextq_u16(c01[5], c01[6], 2)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(c01[3], c01[4], 4)); + vst1q_u16(dst + 10 * stride + 16, vextq_u16(c01[4], c01[5], 4)); + vst1q_u16(dst + 10 * stride + 24, vextq_u16(c01[5], c01[6], 4)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(c01[2], c01[3], 6)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(c01[3], c01[4], 6)); + vst1q_u16(dst + 11 * stride + 16, vextq_u16(c01[4], c01[5], 6)); + vst1q_u16(dst + 11 * stride + 24, vextq_u16(c01[5], c01[6], 6)); + + vst1q_u16(dst + 12 * stride + 0, c01[3]); + vst1q_u16(dst + 12 * stride + 8, c01[4]); + vst1q_u16(dst + 12 * stride + 16, c01[5]); + vst1q_u16(dst + 12 * stride + 24, c01[6]); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(c01[3], c01[4], 2)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(c01[4], c01[5], 2)); + vst1q_u16(dst + 13 * stride + 16, vextq_u16(c01[5], c01[6], 2)); + vst1q_u16(dst + 13 * stride + 24, vextq_u16(c01[6], c01[7], 2)); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(c01[3], c01[4], 4)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(c01[4], c01[5], 4)); + vst1q_u16(dst + 14 * stride + 16, vextq_u16(c01[5], c01[6], 4)); + vst1q_u16(dst + 14 * stride + 24, vextq_u16(c01[6], c01[7], 4)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(c01[3], c01[4], 6)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(c01[4], c01[5], 6)); + vst1q_u16(dst + 15 * stride + 16, vextq_u16(c01[5], c01[6], 6)); + vst1q_u16(dst + 15 * stride + 24, vextq_u16(c01[6], c01[7], 6)); + + vst1q_u16(dst + 16 * stride + 0, c01[4]); + vst1q_u16(dst + 16 * stride + 8, c01[5]); + vst1q_u16(dst + 16 * stride + 16, c01[6]); + vst1q_u16(dst + 16 * stride + 24, c01[7]); + vst1q_u16(dst + 17 * stride + 0, vextq_u16(c01[4], c01[5], 2)); + vst1q_u16(dst + 17 * stride + 8, vextq_u16(c01[5], c01[6], 2)); + vst1q_u16(dst + 17 * stride + 16, vextq_u16(c01[6], c01[7], 2)); + vst1q_u16(dst + 17 * stride + 24, vextq_u16(c01[7], l31, 2)); + vst1q_u16(dst + 18 * stride + 0, vextq_u16(c01[4], c01[5], 4)); + vst1q_u16(dst + 18 * stride + 8, vextq_u16(c01[5], c01[6], 4)); + vst1q_u16(dst + 18 * stride + 16, vextq_u16(c01[6], c01[7], 4)); + vst1q_u16(dst + 18 * stride + 24, vextq_u16(c01[7], l31, 4)); + vst1q_u16(dst + 19 * stride + 0, vextq_u16(c01[4], c01[5], 6)); + vst1q_u16(dst + 19 * stride + 8, vextq_u16(c01[5], c01[6], 6)); + vst1q_u16(dst + 19 * stride + 16, vextq_u16(c01[6], c01[7], 6)); + vst1q_u16(dst + 19 * stride + 24, vextq_u16(c01[7], l31, 6)); + + vst1q_u16(dst + 20 * stride + 0, c01[5]); + vst1q_u16(dst + 20 * stride + 8, c01[6]); + vst1q_u16(dst + 20 * stride + 16, c01[7]); + vst1q_u16(dst + 20 * stride + 24, l31); + vst1q_u16(dst + 21 * stride + 0, vextq_u16(c01[5], c01[6], 2)); + vst1q_u16(dst + 21 * stride + 8, vextq_u16(c01[6], c01[7], 2)); + vst1q_u16(dst + 21 * stride + 16, vextq_u16(c01[7], l31, 2)); + vst1q_u16(dst + 21 * stride + 24, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 22 * stride + 0, vextq_u16(c01[5], c01[6], 4)); + vst1q_u16(dst + 22 * stride + 8, vextq_u16(c01[6], c01[7], 4)); + vst1q_u16(dst + 22 * stride + 16, vextq_u16(c01[7], l31, 4)); + vst1q_u16(dst + 22 * stride + 24, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 23 * stride + 0, vextq_u16(c01[5], c01[6], 6)); + vst1q_u16(dst + 23 * stride + 8, vextq_u16(c01[6], c01[7], 6)); + vst1q_u16(dst + 23 * stride + 16, 
vextq_u16(c01[7], l31, 6)); + vst1q_u16(dst + 23 * stride + 24, vextq_u16(l31, l31, 6)); + + vst1q_u16(dst + 24 * stride + 0, c01[6]); + vst1q_u16(dst + 24 * stride + 8, c01[7]); + vst1q_u16(dst + 24 * stride + 16, l31); + vst1q_u16(dst + 24 * stride + 24, l31); + vst1q_u16(dst + 25 * stride + 0, vextq_u16(c01[6], c01[7], 2)); + vst1q_u16(dst + 25 * stride + 8, vextq_u16(c01[7], l31, 2)); + vst1q_u16(dst + 25 * stride + 16, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 25 * stride + 24, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 26 * stride + 0, vextq_u16(c01[6], c01[7], 4)); + vst1q_u16(dst + 26 * stride + 8, vextq_u16(c01[7], l31, 4)); + vst1q_u16(dst + 26 * stride + 16, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 26 * stride + 24, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 27 * stride + 0, vextq_u16(c01[6], c01[7], 6)); + vst1q_u16(dst + 27 * stride + 8, vextq_u16(c01[7], l31, 6)); + vst1q_u16(dst + 27 * stride + 16, vextq_u16(l31, l31, 6)); + vst1q_u16(dst + 27 * stride + 24, vextq_u16(l31, l31, 6)); + + vst1q_u16(dst + 28 * stride + 0, c01[7]); + vst1q_u16(dst + 28 * stride + 8, l31); + vst1q_u16(dst + 28 * stride + 16, l31); + vst1q_u16(dst + 28 * stride + 24, l31); + vst1q_u16(dst + 29 * stride + 0, vextq_u16(c01[7], l31, 2)); + vst1q_u16(dst + 29 * stride + 8, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 29 * stride + 16, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 29 * stride + 24, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 30 * stride + 0, vextq_u16(c01[7], l31, 4)); + vst1q_u16(dst + 30 * stride + 8, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 30 * stride + 16, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 30 * stride + 24, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 31 * stride + 0, vextq_u16(c01[7], l31, 6)); + vst1q_u16(dst + 31 * stride + 8, vextq_u16(l31, l31, 6)); + vst1q_u16(dst + 31 * stride + 16, vextq_u16(l31, l31, 6)); + vst1q_u16(dst + 31 * stride + 24, vextq_u16(l31, l31, 6)); +} + +//------------------------------------------------------------------------------ + void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index b3f655c2bd..80dc6d95c9 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -211,7 +211,7 @@ () # High bitdepth functions if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d207_predictor_4x4 sse2/; + specialize qw/vpx_highbd_d207_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d45_predictor_4x4 neon ssse3/; @@ -250,7 +250,7 @@ () specialize qw/vpx_highbd_dc_128_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d207_predictor_8x8 ssse3/; + specialize qw/vpx_highbd_d207_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d45_predictor_8x8 neon ssse3/; @@ -289,7 +289,7 @@ () specialize qw/vpx_highbd_dc_128_predictor_8x8 neon sse2/; add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, 
const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d207_predictor_16x16 ssse3/; + specialize qw/vpx_highbd_d207_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d45_predictor_16x16 neon ssse3/; @@ -328,7 +328,7 @@ () specialize qw/vpx_highbd_dc_128_predictor_16x16 neon sse2/; add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d207_predictor_32x32 ssse3/; + specialize qw/vpx_highbd_d207_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d45_predictor_32x32 neon ssse3/; From 6b783c6975a5fc2ee21579cc3c48e59184bf3295 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Mon, 6 Mar 2023 17:52:13 +0000 Subject: [PATCH 581/926] Optimize vpx_sum_squares_2d_i16_neon Add an additional 32-bit vector accumulator to allow parallel processing on CPUs that have more than one Neon multiply-accumulate pipeline. Also use sum_neon.h horizontal-add helpers for reduction. Change-Id: Ibcb48a738f5dee1430c3ebcd305b5ea8ea344c40 --- vpx_dsp/arm/sum_neon.h | 8 +++ vpx_dsp/arm/sum_squares_neon.c | 117 +++++++++++++++++++-------------- 2 files changed, 74 insertions(+), 51 deletions(-) diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index 47748a8061..6f513ca7a8 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -127,4 +127,12 @@ static INLINE uint64_t horizontal_add_int64x2(const int64x2_t a) { #endif } +static INLINE uint64_t horizontal_add_uint64x2(const uint64x2_t a) { +#if defined(__aarch64__) + return vaddvq_u64(a); +#else + return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); +#endif +} + #endif // VPX_VPX_DSP_ARM_SUM_NEON_H_ diff --git a/vpx_dsp/arm/sum_squares_neon.c b/vpx_dsp/arm/sum_squares_neon.c index cfefad9938..074afe3258 100644 --- a/vpx_dsp/arm/sum_squares_neon.c +++ b/vpx_dsp/arm/sum_squares_neon.c @@ -9,77 +9,92 @@ */ #include - #include + #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/sum_neon.h" uint64_t vpx_sum_squares_2d_i16_neon(const int16_t *src, int stride, int size) { - uint64x1_t s2; - if (size == 4) { int16x4_t s[4]; - int32x4_t s0; - uint32x2_t s1; + int32x4_t sum_s32; s[0] = vld1_s16(src + 0 * stride); s[1] = vld1_s16(src + 1 * stride); s[2] = vld1_s16(src + 2 * stride); s[3] = vld1_s16(src + 3 * stride); - s0 = vmull_s16(s[0], s[0]); - s0 = vmlal_s16(s0, s[1], s[1]); - s0 = vmlal_s16(s0, s[2], s[2]); - s0 = vmlal_s16(s0, s[3], s[3]); - s1 = vpadd_u32(vget_low_u32(vreinterpretq_u32_s32(s0)), - vget_high_u32(vreinterpretq_u32_s32(s0))); - s2 = vpaddl_u32(s1); + + sum_s32 = vmull_s16(s[0], s[0]); + sum_s32 = vmlal_s16(sum_s32, s[1], s[1]); + sum_s32 = vmlal_s16(sum_s32, s[2], s[2]); + sum_s32 = vmlal_s16(sum_s32, s[3], s[3]); + + return horizontal_long_add_uint32x4(vreinterpretq_u32_s32(sum_s32)); } else { - int r = size; - uint64x2_t s1 = vdupq_n_u64(0); + uint64x2_t sum_u64 = vdupq_n_u64(0); + int rows = size; do { - int c = size; - int32x4_t s0 = vdupq_n_s32(0); - const int16_t *src_t = src; + const int16_t *src_ptr = src; + int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int cols = size; do { int16x8_t s[8]; - s[0] = vld1q_s16(src_t + 0 * stride); - s[1] = vld1q_s16(src_t + 1 * stride); - s[2] = vld1q_s16(src_t + 2 * 
stride); - s[3] = vld1q_s16(src_t + 3 * stride); - s[4] = vld1q_s16(src_t + 4 * stride); - s[5] = vld1q_s16(src_t + 5 * stride); - s[6] = vld1q_s16(src_t + 6 * stride); - s[7] = vld1q_s16(src_t + 7 * stride); - s0 = vmlal_s16(s0, vget_low_s16(s[0]), vget_low_s16(s[0])); - s0 = vmlal_s16(s0, vget_low_s16(s[1]), vget_low_s16(s[1])); - s0 = vmlal_s16(s0, vget_low_s16(s[2]), vget_low_s16(s[2])); - s0 = vmlal_s16(s0, vget_low_s16(s[3]), vget_low_s16(s[3])); - s0 = vmlal_s16(s0, vget_low_s16(s[4]), vget_low_s16(s[4])); - s0 = vmlal_s16(s0, vget_low_s16(s[5]), vget_low_s16(s[5])); - s0 = vmlal_s16(s0, vget_low_s16(s[6]), vget_low_s16(s[6])); - s0 = vmlal_s16(s0, vget_low_s16(s[7]), vget_low_s16(s[7])); - s0 = vmlal_s16(s0, vget_high_s16(s[0]), vget_high_s16(s[0])); - s0 = vmlal_s16(s0, vget_high_s16(s[1]), vget_high_s16(s[1])); - s0 = vmlal_s16(s0, vget_high_s16(s[2]), vget_high_s16(s[2])); - s0 = vmlal_s16(s0, vget_high_s16(s[3]), vget_high_s16(s[3])); - s0 = vmlal_s16(s0, vget_high_s16(s[4]), vget_high_s16(s[4])); - s0 = vmlal_s16(s0, vget_high_s16(s[5]), vget_high_s16(s[5])); - s0 = vmlal_s16(s0, vget_high_s16(s[6]), vget_high_s16(s[6])); - s0 = vmlal_s16(s0, vget_high_s16(s[7]), vget_high_s16(s[7])); - src_t += 8; - c -= 8; - } while (c); + s[0] = vld1q_s16(src_ptr + 0 * stride); + s[1] = vld1q_s16(src_ptr + 1 * stride); + s[2] = vld1q_s16(src_ptr + 2 * stride); + s[3] = vld1q_s16(src_ptr + 3 * stride); + s[4] = vld1q_s16(src_ptr + 4 * stride); + s[5] = vld1q_s16(src_ptr + 5 * stride); + s[6] = vld1q_s16(src_ptr + 6 * stride); + s[7] = vld1q_s16(src_ptr + 7 * stride); - s1 = vaddw_u32(s1, vget_low_u32(vreinterpretq_u32_s32(s0))); - s1 = vaddw_u32(s1, vget_high_u32(vreinterpretq_u32_s32(s0))); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[0]), vget_low_s16(s[0])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[1]), vget_low_s16(s[1])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[2]), vget_low_s16(s[2])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[3]), vget_low_s16(s[3])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[4]), vget_low_s16(s[4])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[5]), vget_low_s16(s[5])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[6]), vget_low_s16(s[6])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[7]), vget_low_s16(s[7])); + + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[0]), vget_high_s16(s[0])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[1]), vget_high_s16(s[1])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[2]), vget_high_s16(s[2])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[3]), vget_high_s16(s[3])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[4]), vget_high_s16(s[4])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[5]), vget_high_s16(s[5])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[6]), vget_high_s16(s[6])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[7]), vget_high_s16(s[7])); + + src_ptr += 8; + cols -= 8; + } while (cols); + + sum_u64 = vpadalq_u32(sum_u64, vreinterpretq_u32_s32(sum_s32[0])); + sum_u64 = vpadalq_u32(sum_u64, vreinterpretq_u32_s32(sum_s32[1])); src += 8 * stride; - r -= 8; - } while (r); + rows -= 8; + } while (rows); - s2 = vadd_u64(vget_low_u64(s1), vget_high_u64(s1)); + return horizontal_add_uint64x2(sum_u64); } - - return vget_lane_u64(s2, 0); } From d94e16404a08f3a67aa570d6b8c107ae47e158b5 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 6 Mar 2023 13:56:17 -0800 Subject: 
[PATCH 582/926] vpx_convolve_copy_neon: fix unaligned loads w/w==4 Fixes a -fsanitize=undefined warning: vpx_dsp/arm/vpx_convolve_copy_neon.c:29:26: runtime error: load of misaligned address 0xffffa8242bea for type 'const uint32_t' (aka 'const unsigned int'), which requires 4 byte alignment 0xffffa8242bea: note: pointer points here 88 81 7d 7d 7d 7d 7d 81 81 7d 81 80 87 97 a8 ab a0 91 ... ^ #0 0xb0447c in vpx_convolve_copy_neon vpx_dsp/arm/vpx_convolve_copy_neon.c:29:26 #1 0x12285c8 in inter_predictor vp9/common/vp9_reconinter.h:29:3 #2 0x1228430 in dec_build_inter_predictors vp9/decoder/vp9_decodeframe.c ... Change-Id: Iaec4ac2a400b6e6db72d12e5a7acb316262b12a7 --- vpx_dsp/arm/vpx_convolve_copy_neon.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve_copy_neon.c b/vpx_dsp/arm/vpx_convolve_copy_neon.c index 361ec8a806..bea7c98437 100644 --- a/vpx_dsp/arm/vpx_convolve_copy_neon.c +++ b/vpx_dsp/arm/vpx_convolve_copy_neon.c @@ -9,6 +9,7 @@ */ #include +#include #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" @@ -26,10 +27,10 @@ void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, if (w < 8) { // copy4 do { - *(uint32_t *)dst = *(const uint32_t *)src; + memcpy(dst, src, 4); src += src_stride; dst += dst_stride; - *(uint32_t *)dst = *(const uint32_t *)src; + memcpy(dst, src, 4); src += src_stride; dst += dst_stride; h -= 2; From 5a2bb12c52171ee8ef86f9a1129ac413204ea3cf Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Fri, 3 Mar 2023 23:42:50 +0000 Subject: [PATCH 583/926] Fix heap buffer overrun in vpx_get4x4sse_cs_neon Use a mem_neon.h helper to do strided 4-byte loads instead of Neon 8-byte loads - where the last 4 bytes are out of bounds. Re-enable the Neon code path and the tests. Bug: webm:1794 Change-Id: I69ccff730f4a5cbf585dd6a9aa0f3eb13e150074 --- test/variance_test.cc | 3 -- vpx_dsp/arm/variance_neon.c | 99 ++++++++++-------------------------- vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 3 files changed, 29 insertions(+), 77 deletions(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index 237d595bb7..1359bc4baf 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1446,12 +1446,9 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_AVX2 #if HAVE_NEON -// TODO(https://crbug.com/webm/1794): enable this after heap overflow is fixed. -#if 0 INSTANTIATE_TEST_SUITE_P(NEON, VpxSseTest, ::testing::Values(SseParams(2, 2, &vpx_get4x4sse_cs_neon))); -#endif INSTANTIATE_TEST_SUITE_P(NEON, VpxMseTest, ::testing::Values(MseParams(4, 4, &vpx_mse16x16_neon), diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index 76c2a15863..69ff1cf153 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -433,42 +433,18 @@ static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr, return *sse; } -// TODO(https://crbug.com/webm/1794): enable this after heap overflow is fixed. 
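// For reference, the mem_neon.h helper this fix switches to,
// load_unaligned_u8q(), gathers four 4-byte rows into a single uint8x16_t
// using memcpy-based 32-bit loads - roughly the following (a sketch, not the
// exact source):
//   uint32_t a;
//   uint32x4_t v = vdupq_n_u32(0);
//   memcpy(&a, buf + 0 * stride, 4); v = vsetq_lane_u32(a, v, 0);
//   memcpy(&a, buf + 1 * stride, 4); v = vsetq_lane_u32(a, v, 1);
//   memcpy(&a, buf + 2 * stride, 4); v = vsetq_lane_u32(a, v, 2);
//   memcpy(&a, buf + 3 * stride, 4); v = vsetq_lane_u32(a, v, 3);
//   return vreinterpretq_u8_u32(v);
// Only 4 bytes per row are read, unlike vld1_u8(), which reads 8 and ran
// past the end of each 4-wide row.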
-#if 0 unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride) { - uint8x8_t a[4], b[4], abs_diff[4]; - uint32x2_t sse = vdup_n_u32(0); - - a[0] = vld1_u8(src_ptr); - src_ptr += src_stride; - b[0] = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - a[1] = vld1_u8(src_ptr); - src_ptr += src_stride; - b[1] = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - a[2] = vld1_u8(src_ptr); - src_ptr += src_stride; - b[2] = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - a[3] = vld1_u8(src_ptr); - b[3] = vld1_u8(ref_ptr); - - abs_diff[0] = vabd_u8(a[0], b[0]); - abs_diff[1] = vabd_u8(a[1], b[1]); - abs_diff[2] = vabd_u8(a[2], b[2]); - abs_diff[3] = vabd_u8(a[3], b[3]); - - sse = vdot_u32(sse, abs_diff[0], abs_diff[0]); - sse = vdot_u32(sse, abs_diff[1], abs_diff[1]); - sse = vdot_u32(sse, abs_diff[2], abs_diff[2]); - sse = vdot_u32(sse, abs_diff[3], abs_diff[3]); - - return vget_lane_u32(sse, 0); + uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride); + uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride); + + uint8x16_t abs_diff = vabdq_u8(s, r); + + uint32x4_t sse = vdotq_u32(vdupq_n_u32(0), abs_diff, abs_diff); + + return horizontal_add_uint32x4(sse); } -#endif // 0 #else // !defined(__ARM_FEATURE_DOTPROD) @@ -535,49 +511,30 @@ static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr, return *sse; } -// TODO(https://crbug.com/webm/1794): enable this after heap overflow is fixed. -#if 0 unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride) { - uint8x8_t a[4], b[4]; - int16x4_t diff_lo[4]; - uint16x8_t diff[4]; - int32x4_t sse; - - a[0] = vld1_u8(src_ptr); - src_ptr += src_stride; - b[0] = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - a[1] = vld1_u8(src_ptr); - src_ptr += src_stride; - b[1] = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - a[2] = vld1_u8(src_ptr); - src_ptr += src_stride; - b[2] = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - a[3] = vld1_u8(src_ptr); - b[3] = vld1_u8(ref_ptr); - - diff[0] = vsubl_u8(a[0], b[0]); - diff[1] = vsubl_u8(a[1], b[1]); - diff[2] = vsubl_u8(a[2], b[2]); - diff[3] = vsubl_u8(a[3], b[3]); - - diff_lo[0] = vget_low_s16(vreinterpretq_s16_u16(diff[0])); - diff_lo[1] = vget_low_s16(vreinterpretq_s16_u16(diff[1])); - diff_lo[2] = vget_low_s16(vreinterpretq_s16_u16(diff[2])); - diff_lo[3] = vget_low_s16(vreinterpretq_s16_u16(diff[3])); - - sse = vmull_s16(diff_lo[0], diff_lo[0]); - sse = vmlal_s16(sse, diff_lo[1], diff_lo[1]); - sse = vmlal_s16(sse, diff_lo[2], diff_lo[2]); - sse = vmlal_s16(sse, diff_lo[3], diff_lo[3]); - - return horizontal_add_uint32x4(vreinterpretq_u32_s32(sse)); + uint8x8_t s[2], r[2]; + uint16x8_t abs_diff[2]; + uint32x4_t sse; + + s[0] = load_u8(src_ptr, src_stride); + r[0] = load_u8(ref_ptr, ref_stride); + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + s[1] = load_u8(src_ptr, src_stride); + r[1] = load_u8(ref_ptr, ref_stride); + + abs_diff[0] = vabdl_u8(s[0], r[0]); + abs_diff[1] = vabdl_u8(s[1], r[1]); + + sse = vmull_u16(vget_low_u16(abs_diff[0]), vget_low_u16(abs_diff[0])); + sse = vmlal_u16(sse, vget_high_u16(abs_diff[0]), vget_high_u16(abs_diff[0])); + sse = vmlal_u16(sse, vget_low_u16(abs_diff[1]), vget_low_u16(abs_diff[1])); + sse = vmlal_u16(sse, vget_high_u16(abs_diff[1]), vget_high_u16(abs_diff[1])); + + return horizontal_add_uint32x4(sse); } -#endif // 0 #endif // defined(__ARM_FEATURE_DOTPROD) diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 
2301fbe328..c50ab93c5a 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1152,10 +1152,8 @@ () add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *"; specialize qw/vpx_get_mb_ss sse2 msa vsx/; - # TODO(https://crbug.com/webm/1794): enable neon after heap overflow is - # fixed. add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride"; - specialize qw/vpx_get4x4sse_cs msa vsx/; + specialize qw/vpx_get4x4sse_cs neon msa vsx/; add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; specialize qw/vpx_comp_avg_pred neon sse2 vsx lsx/; From f2210fd29047de236bd8a3f349db10836ef283ef Mon Sep 17 00:00:00 2001 From: Neeraj Gadgil Date: Wed, 1 Mar 2023 15:35:57 +0530 Subject: [PATCH 584/926] Early terminate interp filt search based on best RD cost This CL prunes the interpolation filter search based on the RD cost of individual planes.

                    Instruction Count        BD-Rate Loss(%)
 cpu  Resolution     Reduction(%)      avg.psnr  ovr.psnr    ssim
  0   LOWRES2           1.613           0.0143    0.0208   0.0146
  0   MIDRES2           1.637           0.0214   -0.0316   0.0036
  0   HDRES2            1.369           0.0171    0.0178   0.1222
  0   Average           1.539           0.0176    0.0023   0.0468

STATS_CHANGED Change-Id: I4be30bd1c7bbbc93c6bbc840565893a97d2598a4 --- vp9/encoder/vp9_rd.c | 40 -------------- vp9/encoder/vp9_rd.h | 5 -- vp9/encoder/vp9_rdopt.c | 91 +++++++++++++++++--------------- vp9/encoder/vp9_speed_features.c | 2 + vp9/encoder/vp9_speed_features.h | 4 ++ 5 files changed, 54 insertions(+), 88 deletions(-) diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 58dd75b441..95c95971c5 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -513,22 +513,6 @@ static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) { *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10; } -static void model_rd_norm_vec(int xsq_q10[MAX_MB_PLANE], - int r_q10[MAX_MB_PLANE], - int d_q10[MAX_MB_PLANE]) { - int i; - const int one_q10 = 1 << 10; - for (i = 0; i < MAX_MB_PLANE; ++i) { - const int tmp = (xsq_q10[i] >> 2) + 8; - const int k = get_msb(tmp) - 3; - const int xq = (k << 3) + ((tmp >> k) & 0x7); - const int a_q10 = ((xsq_q10[i] - xsq_iq_q10[xq]) << 10) >> (2 + k); - const int b_q10 = one_q10 - a_q10; - r_q10[i] = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10; - d_q10[i] = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10; - } -} - static const uint32_t MAX_XSQ_Q10 = 245727; void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, @@ -554,30 +538,6 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, } } -// Implements a fixed length vector form of vp9_model_rd_from_var_lapndz where -// vectors are of length MAX_MB_PLANE and all elements of var are non-zero.
-void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE], - unsigned int n_log2[MAX_MB_PLANE], - unsigned int qstep[MAX_MB_PLANE], - int64_t *rate_sum, int64_t *dist_sum) { - int i; - int xsq_q10[MAX_MB_PLANE], d_q10[MAX_MB_PLANE], r_q10[MAX_MB_PLANE]; - for (i = 0; i < MAX_MB_PLANE; ++i) { - const uint64_t xsq_q10_64 = - (((uint64_t)qstep[i] * qstep[i] << (n_log2[i] + 10)) + (var[i] >> 1)) / - var[i]; - xsq_q10[i] = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10); - } - model_rd_norm_vec(xsq_q10, r_q10, d_q10); - for (i = 0; i < MAX_MB_PLANE; ++i) { - int rate = - ROUND_POWER_OF_TWO(r_q10[i] << n_log2[i], 10 - VP9_PROB_COST_SHIFT); - int64_t dist = (var[i] * (int64_t)d_q10[i] + 512) >> 10; - *rate_sum += rate; - *dist_sum += dist; - } -} - // Disable gcc 12.2 false positive warning. // warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=] #if defined(__GNUC__) && !defined(__clang__) diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h index efd854edf4..6c61ae514a 100644 --- a/vp9/encoder/vp9_rd.h +++ b/vp9/encoder/vp9_rd.h @@ -164,11 +164,6 @@ void vp9_initialize_me_consts(struct VP9_COMP *cpi, MACROBLOCK *x, int qindex); void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, unsigned int qstep, int *rate, int64_t *dist); -void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE], - unsigned int n_log2[MAX_MB_PLANE], - unsigned int qstep[MAX_MB_PLANE], - int64_t *rate_sum, int64_t *dist_sum); - int vp9_get_switchable_rate(const struct VP9_COMP *cpi, const MACROBLOCKD *const xd); diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 201bf416db..309341682b 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -160,10 +160,12 @@ static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int m, int n, } #if !CONFIG_REALTIME_ONLY -static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, - MACROBLOCKD *xd, int *out_rate_sum, - int64_t *out_dist_sum, int *skip_txfm_sb, - int64_t *skip_sse_sb) { +static int model_rd_for_sb_earlyterm(VP9_COMP *cpi, int mi_row, int mi_col, + BLOCK_SIZE bsize, MACROBLOCK *x, + MACROBLOCKD *xd, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, + int64_t *skip_sse_sb, int do_earlyterm, + int64_t best_rd) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. @@ -176,19 +178,15 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, int64_t total_sse = 0; int skip_flag = 1; const int shift = 6; - int64_t dist; const int dequant_shift = #if CONFIG_VP9_HIGHBITDEPTH (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
xd->bd - 5 : #endif // CONFIG_VP9_HIGHBITDEPTH 3; - unsigned int qstep_vec[MAX_MB_PLANE]; - unsigned int nlog2_vec[MAX_MB_PLANE]; - unsigned int sum_sse_vec[MAX_MB_PLANE]; - int any_zero_sum_sse = 0; x->pred_sse[ref] = 0; + // Build prediction signal, compute stats and RD cost on per-plane basis for (i = 0; i < MAX_MB_PLANE; ++i) { struct macroblock_plane *const p = &x->plane[i]; struct macroblockd_plane *const pd = &xd->plane[i]; @@ -207,7 +205,14 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, int idx, idy; int lw = b_width_log2_lookup[unit_size] + 2; int lh = b_height_log2_lookup[unit_size] + 2; + unsigned int qstep; + unsigned int nlog2; + int64_t dist = 0; + // Build inter predictor + vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i); + + // Compute useful stats for (idy = 0; idy < bh; ++idy) { for (idx = 0; idx < bw; ++idx) { uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw); @@ -243,46 +248,36 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, } total_sse += sum_sse; - sum_sse_vec[i] = sum_sse; - any_zero_sum_sse = any_zero_sum_sse || (sum_sse == 0); - qstep_vec[i] = pd->dequant[1] >> dequant_shift; - nlog2_vec[i] = num_pels_log2_lookup[bs]; - } + qstep = pd->dequant[1] >> dequant_shift; + nlog2 = num_pels_log2_lookup[bs]; - // Fast approximate the modelling function. - if (cpi->sf.simple_model_rd_from_var) { - for (i = 0; i < MAX_MB_PLANE; ++i) { + // Fast approximate the modelling function. + if (cpi->sf.simple_model_rd_from_var) { int64_t rate; - const int64_t square_error = sum_sse_vec[i]; - int quantizer = qstep_vec[i]; - - if (quantizer < 120) - rate = (square_error * (280 - quantizer)) >> (16 - VP9_PROB_COST_SHIFT); + if (qstep < 120) + rate = ((int64_t)sum_sse * (280 - qstep)) >> (16 - VP9_PROB_COST_SHIFT); else rate = 0; - dist = (square_error * quantizer) >> 8; + dist = ((int64_t)sum_sse * qstep) >> 8; rate_sum += rate; - dist_sum += dist; - } - } else { - if (any_zero_sum_sse) { - for (i = 0; i < MAX_MB_PLANE; ++i) { - int rate; - vp9_model_rd_from_var_lapndz(sum_sse_vec[i], nlog2_vec[i], qstep_vec[i], - &rate, &dist); - rate_sum += rate; - dist_sum += dist; - } } else { - vp9_model_rd_from_var_lapndz_vec(sum_sse_vec, nlog2_vec, qstep_vec, - &rate_sum, &dist_sum); + int rate; + vp9_model_rd_from_var_lapndz(sum_sse, nlog2, qstep, &rate, &dist); + rate_sum += rate; + } + dist_sum += dist; + if (do_earlyterm) { + if (RDCOST(x->rdmult, x->rddiv, rate_sum, + dist_sum << VP9_DIST_SCALE_LOG2) >= best_rd) + return 1; } } - *skip_txfm_sb = skip_flag; *skip_sse_sb = total_sse << VP9_DIST_SCALE_LOG2; *out_rate_sum = (int)rate_sum; *out_dist_sum = dist_sum << VP9_DIST_SCALE_LOG2; + + return 0; } #endif // !CONFIG_REALTIME_ONLY @@ -2964,6 +2959,9 @@ static int64_t handle_inter_mode( int64_t rs_rd; int tmp_skip_sb = 0; int64_t tmp_skip_sse = INT64_MAX; + const int enable_earlyterm = + cpi->sf.early_term_interp_search_plane_rd && cm->interp_filter != i; + int64_t filt_best_rd; mi->interp_filter = i; rs = vp9_get_switchable_rate(cpi, xd); @@ -2997,9 +2995,16 @@ static int64_t handle_inter_mode( xd->plane[j].dst.stride = 64; } } - vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); - model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum, &tmp_skip_sb, - &tmp_skip_sse); + // Compute RD cost with early termination option + filt_best_rd = + cm->interp_filter == SWITCHABLE ? 
(best_rd - rs_rd) : best_rd; + if (model_rd_for_sb_earlyterm(cpi, mi_row, mi_col, bsize, x, xd, + &rate_sum, &dist_sum, &tmp_skip_sb, + &tmp_skip_sse, enable_earlyterm, + filt_best_rd)) { + filter_cache[i] = INT64_MAX; + continue; + } rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum); filter_cache[i] = rd; @@ -3067,9 +3072,9 @@ static int64_t handle_inter_mode( // Handles the special case when a filter that is not in the // switchable list (ex. bilinear) is indicated at the frame level, or // skip condition holds. - vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); - model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist, &skip_txfm_sb, - &skip_sse_sb); + model_rd_for_sb_earlyterm(cpi, mi_row, mi_col, bsize, x, xd, &tmp_rate, + &tmp_dist, &skip_txfm_sb, &skip_sse_sb, + 0 /*do_earlyterm*/, INT64_MAX); rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist); memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm)); memcpy(bsse, x->bsse, sizeof(bsse)); diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index ce83a97626..f19385b6a8 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -227,6 +227,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->temporal_filter_search_method = NSTEP; sf->tx_size_search_breakout = 1; sf->use_square_partition_only = !boosted; + sf->early_term_interp_search_plane_rd = 1; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; @@ -919,6 +920,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) { sf->adaptive_pred_interp_filter = 0; sf->adaptive_mode_search = 0; sf->cb_pred_filter_search = 0; + sf->early_term_interp_search_plane_rd = 0; sf->cb_partition_search = 0; sf->motion_field_mode_search = 0; sf->alt_ref_search_fp = 0; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index c2ae970b77..bd8e658cfd 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -402,6 +402,10 @@ typedef struct SPEED_FEATURES { // Chessboard pattern prediction filter type search int cb_pred_filter_search; + // This variable enables an early termination of interpolation filter eval + // based on the current rd cost after processing each plane + int early_term_interp_search_plane_rd; + int cb_partition_search; int motion_field_mode_search; From b9933679bf0088527f2f762bad50a4bb7820bdf1 Mon Sep 17 00:00:00 2001 From: Neeraj Gadgil Date: Wed, 1 Mar 2023 20:41:37 +0530 Subject: [PATCH 585/926] Use cb pattern for interp eval when filter is not switchable This CL uses a checkerboard pattern for interp filter eval when the filter is not switchable. 
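For context, a condensed sketch of how the two cb_pred_filter_search levels
gate the search after this change (paraphrased from the diff below;
blk_parity alternates per block position and per frame):

  const int blk_parity = (((mi_row + mi_col) >> bsl) +
                          get_chessboard_index(cm->current_video_frame)) &
                         0x1;
  // Level >= 2: predict the filter type itself on alternating blocks.
  const int pred_filter_search =
      (cpi->sf.cb_pred_filter_search >= 2) && blk_parity;
  // Level >= 1, non-switchable frame filter only: run the full filter
  // evaluation on alternating blocks and keep the default filter elsewhere.
  const int enable_interp_search =
      (cpi->sf.cb_pred_filter_search && cm->interp_filter != SWITCHABLE)
          ? blk_parity
          : 1;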
                    Instruction Count        BD-Rate Loss(%)
 cpu  Resolution     Reduction(%)      avg.psnr  ovr.psnr    ssim
  0   LOWRES2           0.725           0.0017   -0.0000   0.0192
  0   MIDRES2           0.968           0.0004    0.0504   0.0810
  0   HDRES2            1.135           0.0089    0.0130   0.0113
  0   Average           0.943           0.0037    0.0211   0.0372

STATS_CHANGED Change-Id: Ia713e5170101302f264ffaa2350bc0ab15c27090 --- vp9/encoder/vp9_rdopt.c | 20 ++++++++++++-------- vp9/encoder/vp9_speed_features.c | 13 +++++++------ vp9/encoder/vp9_speed_features.h | 6 +++++- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 309341682b..f87ab3e0bc 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -2801,13 +2801,12 @@ static int64_t handle_inter_mode( uint8_t skip_txfm[MAX_MB_PLANE << 2] = { 0 }; int64_t bsse[MAX_MB_PLANE << 2] = { 0 }; - int bsl = mi_width_log2_lookup[bsize]; - int pred_filter_search = - cpi->sf.cb_pred_filter_search - ? (((mi_row + mi_col) >> bsl) + - get_chessboard_index(cm->current_video_frame)) & - 0x1 - : 0; + const int bsl = mi_width_log2_lookup[bsize]; + const int blk_parity = (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_video_frame)) & + 0x1; + const int pred_filter_search = + (cpi->sf.cb_pred_filter_search >= 2) && blk_parity; int skip_txfm_sb = 0; int64_t skip_sse_sb = INT64_MAX; @@ -2947,9 +2946,14 @@ static int64_t handle_inter_mode( for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) filter_cache[i] = INT64_MAX; if (cm->interp_filter != BILINEAR) { + // Use cb pattern for filter eval when filter is not switchable + const int enable_interp_search = + (cpi->sf.cb_pred_filter_search && cm->interp_filter != SWITCHABLE) + ? blk_parity + : 1; if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) { best_filter = EIGHTTAP; - } else if (best_filter == SWITCHABLE) { + } else if (best_filter == SWITCHABLE && enable_interp_search) { int newbest; int tmp_rate_sum = 0; int64_t tmp_dist_sum = 0; diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index f19385b6a8..f47e3d71c9 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -159,7 +159,7 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; sf->alt_ref_search_fp = 1; - sf->cb_pred_filter_search = 1; + sf->cb_pred_filter_search = 2; sf->adaptive_interp_filter_search = 1; sf->disable_split_mask = DISABLE_ALL_SPLIT; } @@ -228,6 +228,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->tx_size_search_breakout = 1; sf->use_square_partition_only = !boosted; sf->early_term_interp_search_plane_rd = 1; + sf->cb_pred_filter_search = 1; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; @@ -346,7 +347,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->adaptive_pred_interp_filter = 0; sf->adaptive_mode_search = 1; sf->cb_partition_search = !boosted; - sf->cb_pred_filter_search = 1; + sf->cb_pred_filter_search = 2; sf->alt_ref_search_fp = 1; sf->recode_loop = ALLOW_RECODE_KFMAXBW; sf->adaptive_rd_thresh = 3; @@ -639,7 +640,7 @@ static void set_rt_speed_feature_framesize_independent( sf->use_altref_onepass = 1; sf->use_compound_nonrd_pickmode = 1; } - if (cm->width * cm->height > 1280 * 720) sf->cb_pred_filter_search = 1; + if (cm->width * cm->height > 1280 * 720) sf->cb_pred_filter_search = 2; if (!cpi->external_resize) sf->use_source_sad = 1; } @@ -729,7 +730,7 @@ static void set_rt_speed_feature_framesize_independent( if 
(cpi->use_svc && svc->use_gf_temporal_ref_current_layer && svc->temporal_layer_id > 0) cpi->ref_frame_flags &= (~VP9_GOLD_FLAG); - if (cm->width * cm->height > 640 * 480) sf->cb_pred_filter_search = 1; + if (cm->width * cm->height > 640 * 480) sf->cb_pred_filter_search = 2; } if (speed >= 8) { @@ -773,7 +774,7 @@ static void set_rt_speed_feature_framesize_independent( } sf->limit_newmv_early_exit = 0; sf->use_simple_block_yrd = 1; - if (cm->width * cm->height > 352 * 288) sf->cb_pred_filter_search = 1; + if (cm->width * cm->height > 352 * 288) sf->cb_pred_filter_search = 2; } if (speed >= 9) { @@ -783,7 +784,7 @@ static void set_rt_speed_feature_framesize_independent( for (i = 0; i < BLOCK_SIZES; ++i) sf->intra_y_mode_bsize_mask[i] = INTRA_DC; } - sf->cb_pred_filter_search = 1; + sf->cb_pred_filter_search = 2; sf->mv.enable_adaptive_subpel_force_stop = 1; sf->mv.adapt_subpel_force_stop.mv_thresh = 1; sf->mv.adapt_subpel_force_stop.force_stop_below = QUARTER_PEL; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index bd8e658cfd..e30a26084a 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -399,7 +399,11 @@ typedef struct SPEED_FEATURES { // Adaptive prediction mode search int adaptive_mode_search; - // Chessboard pattern prediction filter type search + // Chessboard pattern prediction for interp filter. Aggressiveness increases + // with levels. + // 0: disable + // 1: cb pattern in eval when filter is not switchable + // 2: cb pattern prediction for filter search int cb_pred_filter_search; // This variable enables an early termination of interpolation filter eval From 5ae84ea5ae548314cfef982c95a4c9dbdfa79f6c Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Wed, 1 Mar 2023 10:06:01 +0000 Subject: [PATCH 586/926] Optimize vp9_block_error_fp_neon Currently vp9_block_error_fp_neon is only used when CONFIG_VP9_HIGHBITDEPTH is set to false. This patch optimizes the implementation and uses tran_low_t instead of int16_t so that the function can also be used in builds where vp9_highbitdepth is enabled. Change-Id: Ibab7ec5f74b7652fa2ae5edf328f9ec587088fd3 --- test/avg_test.cc | 4 -- vp9/common/vp9_rtcd_defs.pl | 5 +-- vp9/encoder/arm/neon/vp9_error_neon.c | 54 +++++++++++++++++---------- vp9/vp9cx.mk | 2 - vpx_dsp/arm/sum_neon.h | 8 ++++ 5 files changed, 43 insertions(+), 30 deletions(-) diff --git a/test/avg_test.cc b/test/avg_test.cc index 196522ce58..bcf8d0d993 100644 --- a/test/avg_test.cc +++ b/test/avg_test.cc @@ -694,16 +694,12 @@ INSTANTIATE_TEST_SUITE_P(NEON, SatdLowbdTest, make_tuple(256, &vpx_satd_neon), make_tuple(1024, &vpx_satd_neon))); -// TODO(jianj): Remove the highbitdepth flag once the SIMD functions are -// in place. 
-#if !CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_SUITE_P( NEON, BlockErrorTestFP, ::testing::Values(make_tuple(16, &vp9_block_error_fp_neon), make_tuple(64, &vp9_block_error_fp_neon), make_tuple(256, &vp9_block_error_fp_neon), make_tuple(1024, &vp9_block_error_fp_neon))); -#endif // !CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_NEON #if HAVE_MSA diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 20a482c85f..c939411a3c 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -127,6 +127,7 @@ () add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz"; add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size"; +specialize qw/vp9_block_error_fp neon avx2 sse2/; add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_quantize_fp neon sse2 ssse3 avx2 vsx/; @@ -137,14 +138,10 @@ () if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_block_error avx2 sse2/; - specialize qw/vp9_block_error_fp avx2 sse2/; - add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd"; specialize qw/vp9_highbd_block_error sse2/; } else { specialize qw/vp9_block_error avx2 msa sse2/; - - specialize qw/vp9_block_error_fp neon avx2 sse2/; } # fdct functions diff --git a/vp9/encoder/arm/neon/vp9_error_neon.c b/vp9/encoder/arm/neon/vp9_error_neon.c index 1c7503139e..eb1e2e03d0 100644 --- a/vp9/encoder/arm/neon/vp9_error_neon.c +++ b/vp9/encoder/arm/neon/vp9_error_neon.c @@ -12,30 +12,44 @@ #include #include "./vp9_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" -int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, - int block_size) { - int64x2_t error = vdupq_n_s64(0); +int64_t vp9_block_error_fp_neon(const tran_low_t *coeff, + const tran_low_t *dqcoeff, int block_size) { + uint64x2_t err_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; - assert(block_size >= 8); - assert((block_size % 8) == 0); + assert(block_size >= 16); + assert((block_size % 16) == 0); do { - const int16x8_t c = vld1q_s16(coeff); - const int16x8_t d = vld1q_s16(dqcoeff); - const int16x8_t diff = vsubq_s16(c, d); - const int16x4_t diff_lo = vget_low_s16(diff); - const int16x4_t diff_hi = vget_high_s16(diff); - // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before - // accumulating them in 64-bits. - const int32x4_t err0 = vmull_s16(diff_lo, diff_lo); - const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi); - const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1)); - error = vaddq_s64(error, err2); - coeff += 8; - dqcoeff += 8; - block_size -= 8; + uint32x4_t err0, err1; + + const int16x8_t c0 = load_tran_low_to_s16q(coeff); + const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8); + const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff); + const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8); + + const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0)); + const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1)); + + // diff is 15-bits, the squares 30, so in theory we can store 4 in 32-bits + // before accumulating them in 64-bits. 
However splitting into 2 mull, mlal + // pairs is beneficial since it allows us to use both Neon + // multiply-accumulate pipes - on CPUs that have them - rather than having + // a single chain of 4 instructions executing serially. + err0 = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0)); + err0 = vmlal_u16(err0, vget_high_u16(diff0), vget_high_u16(diff0)); + err_u64[0] = vpadalq_u32(err_u64[0], err0); + + err1 = vmull_u16(vget_low_u16(diff1), vget_low_u16(diff1)); + err1 = vmlal_u16(err1, vget_high_u16(diff1), vget_high_u16(diff1)); + err_u64[1] = vpadalq_u32(err_u64[1], err1); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; } while (block_size != 0); - return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1); + return horizontal_add_uint64x2(vaddq_u64(err_u64[0], err_u64[1])); } diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index ae8fb85d87..cccaea712e 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -136,9 +136,7 @@ endif VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_avx2.c -ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c -endif VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_frame_scale_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index 47748a8061..6f513ca7a8 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -127,4 +127,12 @@ static INLINE uint64_t horizontal_add_int64x2(const int64x2_t a) { #endif } +static INLINE uint64_t horizontal_add_uint64x2(const uint64x2_t a) { +#if defined(__aarch64__) + return vaddvq_u64(a); +#else + return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); +#endif +} + #endif // VPX_VPX_DSP_ARM_SUM_NEON_H_ From 57c6ea97522146e9471a3537304ce8a0a7a22ea0 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Fri, 3 Mar 2023 10:53:27 +0000 Subject: [PATCH 587/926] Fix return type of horizontal_add_int64x2 helper horizontal_add_int64x2 was incorrectly returning a uint64_t instead of an int64_t. This patch fixes that. Change-Id: Ic6016cf87aebfc6a14f540b784d6648757e12b49 --- vpx_dsp/arm/highbd_variance_neon.c | 2 +- vpx_dsp/arm/sum_neon.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vpx_dsp/arm/highbd_variance_neon.c b/vpx_dsp/arm/highbd_variance_neon.c index d0b366c95b..75fde676a0 100644 --- a/vpx_dsp/arm/highbd_variance_neon.c +++ b/vpx_dsp/arm/highbd_variance_neon.c @@ -167,7 +167,7 @@ static INLINE void highbd_variance_xlarge_neon( } while (i < h); *sum = horizontal_add_int32x4(sum_s32); - *sse = horizontal_add_int64x2(sse_s64); + *sse = (uint64_t)horizontal_add_int64x2(sse_s64); } static INLINE void highbd_variance_32xh_xlarge_neon( diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index 6f513ca7a8..8291f07296 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -119,7 +119,7 @@ static INLINE uint64_t horizontal_long_add_uint32x4(const uint32x4_t a) { #endif } -static INLINE uint64_t horizontal_add_int64x2(const int64x2_t a) { +static INLINE int64_t horizontal_add_int64x2(const int64x2_t a) { #if defined(__aarch64__) return vaddvq_s64(a); #else From eec48083936b52bc0ec9adfc452d29b177366d75 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Mon, 6 Mar 2023 11:37:26 +0000 Subject: [PATCH 588/926] Add Neon implementation of vp9_block_error_c Add Neon implementation of vp9_block_error_c as well as the corresponding tests. 
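For reference, the scalar routine being vectorized returns the sum of squared
differences between the original and dequantized coefficients and also writes
out the coefficient energy; it is essentially the following (paraphrased -
see vp9/encoder/vp9_rd.c for the canonical version):

  int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
                            intptr_t block_size, int64_t *ssz) {
    int64_t error = 0, sqcoeff = 0;
    int i;
    for (i = 0; i < block_size; i++) {
      const int diff = coeff[i] - dqcoeff[i];
      error += diff * diff;            // distortion between coeff and dqcoeff
      sqcoeff += coeff[i] * coeff[i];  // energy of the original coefficients
    }
    *ssz = sqcoeff;
    return error;
  }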
Change-Id: I79247b5ae24f51b7b55fc5e517d5e403dc86367a --- test/test.mk | 2 +- test/vp9_block_error_test.cc | 8 +++++ vp9/common/vp9_rtcd_defs.pl | 4 +-- vp9/encoder/arm/neon/vp9_error_neon.c | 47 +++++++++++++++++++++++++++ 4 files changed, 58 insertions(+), 3 deletions(-) diff --git a/test/test.mk b/test/test.mk index f60d8f823f..3c225bc750 100644 --- a/test/test.mk +++ b/test/test.mk @@ -179,7 +179,7 @@ ifneq ($(CONFIG_REALTIME_ONLY),yes) LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += yuv_temporal_filter_test.cc endif LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc -ifneq (, $(filter yes, $(HAVE_SSE2) $(HAVE_AVX2))) +ifneq (, $(filter yes, $(HAVE_SSE2) $(HAVE_AVX2) $(HAVE_NEON))) LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_block_error_test.cc endif LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc diff --git a/test/vp9_block_error_test.cc b/test/vp9_block_error_test.cc index b93b014e65..bde84cd619 100644 --- a/test/vp9_block_error_test.cc +++ b/test/vp9_block_error_test.cc @@ -197,4 +197,12 @@ INSTANTIATE_TEST_SUITE_P( &BlockError8BitWrapper, VPX_BITS_8))); #endif // HAVE_AVX2 + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, BlockErrorTest, + ::testing::Values(make_tuple(&BlockError8BitWrapper, + &BlockError8BitWrapper, + VPX_BITS_8))); +#endif } // namespace diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index c939411a3c..2f9870dd48 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -136,12 +136,12 @@ () specialize qw/vp9_quantize_fp_32x32 neon ssse3 avx2 vsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - specialize qw/vp9_block_error avx2 sse2/; + specialize qw/vp9_block_error neon avx2 sse2/; add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd"; specialize qw/vp9_highbd_block_error sse2/; } else { - specialize qw/vp9_block_error avx2 msa sse2/; + specialize qw/vp9_block_error neon avx2 msa sse2/; } # fdct functions diff --git a/vp9/encoder/arm/neon/vp9_error_neon.c b/vp9/encoder/arm/neon/vp9_error_neon.c index eb1e2e03d0..0cf0bf250e 100644 --- a/vp9/encoder/arm/neon/vp9_error_neon.c +++ b/vp9/encoder/arm/neon/vp9_error_neon.c @@ -15,6 +15,53 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" +int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + uint64x2_t err_u64 = vdupq_n_u64(0); + int64x2_t ssz_s64 = vdupq_n_s64(0); + + assert(block_size >= 16); + assert((block_size % 16) == 0); + + do { + uint32x4_t err; + int32x4_t ssz0, ssz1; + + const int16x8_t c0 = load_tran_low_to_s16q(coeff); + const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8); + const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff); + const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8); + + const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0)); + const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1)); + + // diff is 15-bits, the squares 30, so we can store 4 in 32-bits before + // accumulating them in 64-bits. 
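// (The bound: |diff| <= 2^15 - 1, so each square is at most 2^30 - 2^16 + 1
// and four of them sum to at most 2^32 - 2^18 + 4, which still fits in an
// unsigned 32-bit lane ahead of the widening vpadalq_u32() below.)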
+ err = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0)); + err = vmlal_u16(err, vget_high_u16(diff0), vget_high_u16(diff0)); + err = vmlal_u16(err, vget_low_u16(diff1), vget_low_u16(diff1)); + err = vmlal_u16(err, vget_high_u16(diff1), vget_high_u16(diff1)); + err_u64 = vpadalq_u32(err_u64, err); + + // We can't do the same here as we're operating on signed integers, so only + // 2 squares of the 15-bit coefficients fit in 32 bits before widening to 64. + ssz0 = vmull_s16(vget_low_s16(c0), vget_low_s16(c0)); + ssz0 = vmlal_s16(ssz0, vget_high_s16(c0), vget_high_s16(c0)); + ssz_s64 = vpadalq_s32(ssz_s64, ssz0); + + ssz1 = vmull_s16(vget_low_s16(c1), vget_low_s16(c1)); + ssz1 = vmlal_s16(ssz1, vget_high_s16(c1), vget_high_s16(c1)); + ssz_s64 = vpadalq_s32(ssz_s64, ssz1); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; + } while (block_size != 0); + + *ssz = horizontal_add_int64x2(ssz_s64); + return (int64_t)horizontal_add_uint64x2(err_u64); +} + int64_t vp9_block_error_fp_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size) { uint64x2_t err_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; From b7fabadc5d230dd78922b048a39a15c038429782 Mon Sep 17 00:00:00 2001 From: Anupam Pandey Date: Thu, 2 Mar 2023 10:58:27 +0530 Subject: [PATCH 589/926] Add AVX2 for vpx_filter_block1d8_h8() function Introduced an AVX2 intrinsic to compute horizontal convolution for the w = 8 case. This is a bit-exact change.

                  Instruction Count
 cpu  Resolution    Reduction(%)
  0   LOWRES2          1.509
  0   MIDRES2          1.165
  0   HDRES2           0.898
  0   Average          1.191

Change-Id: I699c94aa3d7ea74c58f901df906eed0b81b4ee79 --- vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 81 ++++++++++++++++++++++- 1 file changed, 79 insertions(+), 2 deletions(-) diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 841db7cd71..26e82f9b73 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -15,6 +15,7 @@ #include "vpx_dsp/x86/convolve.h" #include "vpx_dsp/x86/convolve_avx2.h" #include "vpx_dsp/x86/convolve_sse2.h" +#include "vpx_dsp/x86/convolve_ssse3.h" #include "vpx_ports/mem.h" // filters for 16_h8 @@ -38,6 +39,31 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 }; +#define CALC_CONVOLVE8_HORZ_ROW \ + srcReg = xx_loadu2_mi128(src_ptr - 3 + src_pitch, src_ptr - 3); \ + s1[0] = _mm256_shuffle_epi8(srcReg, filt[0]); \ + s1[1] = _mm256_shuffle_epi8(srcReg, filt[1]); \ + s1[2] = _mm256_shuffle_epi8(srcReg, filt[2]); \ + s1[3] = _mm256_shuffle_epi8(srcReg, filt[3]); \ + s1[0] = convolve8_16_avx2(s1, f1); \ + s1[0] = _mm256_packus_epi16(s1[0], s1[0]); \ + src_ptr += src_stride; \ + _mm_storel_epi64((__m128i *)&output_ptr[0], _mm256_castsi256_si128(s1[0])); \ + output_ptr += output_pitch; \ + _mm_storel_epi64((__m128i *)&output_ptr[0], \ + _mm256_extractf128_si256(s1[0], 1)); \ + output_ptr += output_pitch; + +// 0 0 0 0 hi3 hi2 hi1 hi0 | 0 0 0 0 lo3 lo2 lo1 lo0 +static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) { + // 0 0 0 0 0 0 0 0 | 0 0 0 0 lo3 lo2 lo1 lo0 + __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo))); + + // 0 0 0 0 hi3 hi2 hi1 hi0 | 0 0 0 0 lo3 lo2 lo1 lo0 + a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1); + return a; +} + static INLINE void vpx_filter_block1d16_h8_x_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter, @@ -177,6 
+203,59 @@ static void vpx_filter_block1d16_h8_avg_avx2( output_height, filter, 1); } +static void vpx_filter_block1d8_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m256i filt[4], f1[4], s1[4], srcReg; + __m128i f[4], s[4]; + int y = output_height; + + // Multiply the size of the source stride by two + const ptrdiff_t src_stride = src_pitch << 1; + + shuffle_filter_avx2(filter, f1); + filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + // Process next 4 rows + while (y > 3) { + CALC_CONVOLVE8_HORZ_ROW + CALC_CONVOLVE8_HORZ_ROW + y -= 4; + } + + // If remaining, then process 2 rows at a time + while (y > 1) { + CALC_CONVOLVE8_HORZ_ROW + y -= 2; + } + + // For the remaining height. + if (y > 0) { + const __m128i srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + + f[0] = _mm256_castsi256_si128(f1[0]); + f[1] = _mm256_castsi256_si128(f1[1]); + f[2] = _mm256_castsi256_si128(f1[2]); + f[3] = _mm256_castsi256_si128(f1[3]); + + // filter the source buffer + s[0] = _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0])); + s[1] = _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1])); + s[2] = _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2])); + s[3] = _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3])); + s[0] = convolve8_8_ssse3(s, f); + + // Saturate 16bit value to 8bit. + s[0] = _mm_packus_epi16(s[0], s[0]); + + // Save only 8 bytes + _mm_storel_epi64((__m128i *)&output_ptr[0], s[0]); + } +} + static INLINE void vpx_filter_block1d16_v8_x_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter, @@ -870,14 +949,12 @@ filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; #define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3 -#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_intrin_ssse3 #define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3 #else // VPX_ARCH_X86 filter8_1dfunction vpx_filter_block1d8_v8_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_ssse3; filter8_1dfunction vpx_filter_block1d4_h8_ssse3; #define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3 -#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3 #define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3 #endif // VPX_ARCH_X86_64 filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; From e33d4c276d71d982987d564cd9b962227d1631b6 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 7 Mar 2023 22:09:37 -0800 Subject: [PATCH 590/926] disable vpx_highbd_*_sub_pixel_variance4x{4,8}_neon vpx_highbd_8_sub_pixel_variance4x4_neon vpx_highbd_8_sub_pixel_variance4x8_neon vpx_highbd_10_sub_pixel_variance4x4_neon vpx_highbd_10_sub_pixel_variance4x8_neon vpx_highbd_12_sub_pixel_variance4x4_neon vpx_highbd_12_sub_pixel_variance4x8_neon all cause heap overflows of the form: [ RUN ] NEON/VpxHBDSubpelVarianceTest.Ref/24 ================================================================= ==450528==ERROR: AddressSanitizer: heap-buffer-overflow on address 0xffff8311a571 at pc 0x0000010ca52c bp 0xffffc63e96b0 sp 0xffffc63e96a8 READ of size 8 
at 0xffff8311a571 thread T0 #0 0x10ca528 in load_unaligned_u16q vpx_dsp/arm/mem_neon.h:176:3 #1 0x10ca528 in highbd_var_filter_block2d_bil_w4 vpx_dsp/arm/highbd_subpel_variance_neon.c:49:21 #2 0x10ca528 in vpx_highbd_10_sub_pixel_variance4x8_neon vpx_dsp/arm/highbd_subpel_variance_neon.c:257:1 ... 0xffff8311a571 is located 0 bytes to the right of 113-byte region [0xffff8311a500,0xffff8311a571) allocated by thread T0 here: #0 0x5f18b0 in malloc (test_libvpx+0x5f18b0) #1 0xce4f90 in vpx_memalign vpx_mem/vpx_mem.c:62:10 #2 0xce4f90 in vpx_malloc vpx_mem/vpx_mem.c:70:40 #3 0xa4ad44 in (anonymous namespace)::SubpelVarianceTest::SetUp() test/variance_test.cc:586:14 Bug: webm:1796 Change-Id: I39f7f936bae2bcbbe1f803fb10375ec02d1c1277 --- test/variance_test.cc | 14 +++++++++---- vpx_dsp/arm/highbd_subpel_variance_neon.c | 15 ++++++++------ vpx_dsp/vpx_dsp_rtcd_defs.pl | 24 +++++++++++++++++------ 3 files changed, 37 insertions(+), 16 deletions(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index 1359bc4baf..144c2d2901 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1591,10 +1591,12 @@ INSTANTIATE_TEST_SUITE_P( 12), SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_neon, 12), + /*TODO(https://crbug.com/webm/1796): enable after heap overflow is + fixed. SubpelVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_neon, 12), SubpelVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_neon, - 12), + 12),*/ SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_neon, 10), SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_neon, @@ -1617,10 +1619,12 @@ INSTANTIATE_TEST_SUITE_P( 10), SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_neon, 10), + /*TODO(https://crbug.com/webm/1796): enable after heap overflow is + fixed. SubpelVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_neon, 10), SubpelVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_neon, - 10), + 10),*/ SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_neon, 8), SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_neon, @@ -1640,10 +1644,12 @@ INSTANTIATE_TEST_SUITE_P( SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_neon, 8), SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_neon, 8), - SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon, 8), + SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon, 8) + /*TODO(https://crbug.com/webm/1796): enable after heap overflow is + fixed. SubpelVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_neon, 8), SubpelVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_neon, - 8))); + 8)*/)); INSTANTIATE_TEST_SUITE_P( NEON, VpxHBDSubpelAvgVarianceTest, diff --git a/vpx_dsp/arm/highbd_subpel_variance_neon.c b/vpx_dsp/arm/highbd_subpel_variance_neon.c index aa64697458..c081d520ab 100644 --- a/vpx_dsp/arm/highbd_subpel_variance_neon.c +++ b/vpx_dsp/arm/highbd_subpel_variance_neon.c @@ -234,8 +234,9 @@ static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr, // padding. // 8-bit -HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4, 2) -HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8, 2) +// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. 
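// The out-of-bounds read reported above appears to come from the paired row
// loads in highbd_var_filter_block2d_bil_w4(): load_unaligned_u16q() fetches
// two 4-pixel rows (two 8-byte reads) per call, so the load feeding the last
// filtered row reaches one row past the end of the caller's buffer.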
+// HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4, 2) +// HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8, 2) HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4, 1) HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8, 1) @@ -253,8 +254,9 @@ HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32, 1) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64, 1) // 10-bit -HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4, 2) -HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8, 2) +// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. +// HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4, 2) +// HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8, 2) HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4, 1) HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8, 1) @@ -272,8 +274,9 @@ HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32, 1) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64, 1) // 12-bit -HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4, 2) -HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8, 2) +// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. +// HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4, 2) +// HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8, 2) HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4, 1) HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8, 1) diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index ad8ff6e184..ad46499748 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1443,9 +1443,13 @@ () specialize qw/vpx_highbd_12_sub_pixel_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance4x8 neon/; + # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is + # fixed. + # specialize qw/vpx_highbd_12_sub_pixel_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance4x4 neon/; + # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is + # fixed. + # specialize qw/vpx_highbd_12_sub_pixel_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance64x64 sse2 neon/; @@ -1481,9 +1485,13 @@ () specialize qw/vpx_highbd_10_sub_pixel_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance4x8 neon/; + # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is + # fixed. + # specialize qw/vpx_highbd_10_sub_pixel_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance4x4 neon/; + # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is + # fixed. 
+ # specialize qw/vpx_highbd_10_sub_pixel_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance64x64 sse2 neon/; @@ -1519,9 +1527,13 @@ () specialize qw/vpx_highbd_8_sub_pixel_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance4x8 neon/; + # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is + # fixed. + # specialize qw/vpx_highbd_8_sub_pixel_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance4x4 neon/; + # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is + # fixed. + # specialize qw/vpx_highbd_8_sub_pixel_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64 sse2 neon/; From a47967700d5d6fe4172c7fa48ad95b3e4ba39982 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 8 Mar 2023 13:17:17 -0800 Subject: [PATCH 591/926] disable vpx_highbd_*_sub_pixel_avg_variance4x{4,8}_neon vpx_highbd_8_sub_pixel_avg_variance4x4_neon vpx_highbd_8_sub_pixel_avg_variance4x8_neon vpx_highbd_10_sub_pixel_avg_variance4x4_neon vpx_highbd_10_sub_pixel_avg_variance4x8_neon vpx_highbd_12_sub_pixel_avg_variance4x4_neon vpx_highbd_12_sub_pixel_avg_variance4x8_neon all cause heap overflows of the form: i[ RUN ] NEON/VpxHBDSubpelAvgVarianceTest.Ref/33 ================================================================= ==535205==ERROR: AddressSanitizer: heap-buffer-overflow on address 0xffff95bb0b89 at pc 0x00000116dabc bp 0xffffd09f6430 sp 0xffffd09f6428 READ of size 8 at 0xffff95bb0b89 thread T0 #0 0x116dab8 in load_unaligned_u16q vpx_dsp/arm/mem_neon.h:176:3 #1 0x116dab8 in highbd_var_filter_block2d_bil_w4 vpx_dsp/arm/highbd_subpel_variance_neon.c:49:21 #2 0x116dab8 in vpx_highbd_8_sub_pixel_avg_variance4x4_neon vpx_dsp/arm/highbd_subpel_variance_neon.c:543:1 ... 0xffff95bb0b89 is located 0 bytes to the right of 73-byte region [0xffff95bb0b40,0xffff95bb0b89) allocated by thread T0 here: #0 0x5f18b0 in malloc (test_libvpx+0x5f18b0) #1 0xce4a40 in vpx_memalign vpx_mem/vpx_mem.c:62:10 #2 0xce4a40 in vpx_malloc vpx_mem/vpx_mem.c:70:40 #3 0xa52238 in (anonymous namespace)::SubpelVarianceTest::SetUp() test/variance_test.cc:586:14 ... This is the same issue as: e33d4c276 disable vpx_highbd_*_sub_pixel_variance4x{4,8}_neon They have highbd_var_filter_block2d_bil_w4 in common. 
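One possible direction for a fix - an untested sketch, not part of this CL,
which only disables the specializations - is to load each 4-pixel row with an
exact-width 8-byte load so that no read extends past the final row:

  static INLINE uint16x4_t load_u16_4x1(const uint16_t *buf) {
    uint64_t a;
    memcpy(&a, buf, sizeof(a));  // exactly one 4-pixel high-bitdepth row
    return vcreate_u16(a);
  }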
Bug: webm:1796 Change-Id: I3ed70d0ba22e127720542612ea9f6665948eedfc --- test/variance_test.cc | 14 +++++++++---- vpx_dsp/arm/highbd_subpel_variance_neon.c | 21 ++++++++++++++------ vpx_dsp/vpx_dsp_rtcd_defs.pl | 24 +++++++++++++++++------ 3 files changed, 43 insertions(+), 16 deletions(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index 144c2d2901..8af26969c6 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1687,12 +1687,14 @@ INSTANTIATE_TEST_SUITE_P( SubpelAvgVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_neon, 12), + /*TODO(https://crbug.com/webm/1796): enable after heap overflow is + fixed. SubpelAvgVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_avg_variance4x8_neon, 12), SubpelAvgVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_avg_variance4x4_neon, - 12), + 12),*/ SubpelAvgVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_neon, 10), @@ -1726,12 +1728,14 @@ INSTANTIATE_TEST_SUITE_P( SubpelAvgVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_neon, 10), + /*TODO(https://crbug.com/webm/1796): enable after heap overflow is + fixed. SubpelAvgVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_avg_variance4x8_neon, 10), SubpelAvgVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_avg_variance4x4_neon, - 10), + 10),*/ SubpelAvgVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_neon, 8), @@ -1764,13 +1768,15 @@ INSTANTIATE_TEST_SUITE_P( 8), SubpelAvgVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_neon, - 8), + 8) + /*TODO(https://crbug.com/webm/1796): enable after heap overflow is + fixed. SubpelAvgVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_avg_variance4x8_neon, 8), SubpelAvgVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_neon, - 8))); + 8)*/)); #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_NEON diff --git a/vpx_dsp/arm/highbd_subpel_variance_neon.c b/vpx_dsp/arm/highbd_subpel_variance_neon.c index c081d520ab..b2fe9921c9 100644 --- a/vpx_dsp/arm/highbd_subpel_variance_neon.c +++ b/vpx_dsp/arm/highbd_subpel_variance_neon.c @@ -37,6 +37,8 @@ // 15-bit.) // Process a block exactly 4 wide and a multiple of 2 high. +// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. +#if 0 static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, int dst_height, @@ -60,6 +62,7 @@ static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr, i -= 2; } while (i != 0); } +#endif // 0 // Process a block which is a multiple of 8 and any height. static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr, @@ -295,6 +298,8 @@ HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64, 1) // Combine bilinear filter with vpx_highbd_comp_avg_pred for blocks having // width 4. +// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. +#if 0 static void highbd_avg_pred_var_filter_block2d_bil_w4( const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset, const uint16_t *second_pred) { @@ -319,6 +324,7 @@ static void highbd_avg_pred_var_filter_block2d_bil_w4( i -= 2; } while (i != 0); } +#endif // 0 // Combine bilinear filter with vpx_highbd_comp_avg_pred for large blocks. static void highbd_avg_pred_var_filter_block2d_bil_large( @@ -540,8 +546,9 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, // padding. 
// 8-bit -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4, 2) -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8, 2) +// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. +// HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4, 2) +// HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8, 2) HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4, 1) HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8, 1) @@ -559,8 +566,9 @@ HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32, 1) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64, 1) // 10-bit -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4, 2) -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8, 2) +// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. +// HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4, 2) +// HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8, 2) HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4, 1) HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8, 1) @@ -578,8 +586,9 @@ HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32, 1) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64, 1) // 12-bit -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4, 2) -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8, 2) +// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. +// HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4, 2) +// HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8, 2) HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4, 1) HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8, 1) diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index ad46499748..62f4789c2c 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1569,9 +1569,13 @@ () specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x8 neon/; + # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is + # fixed. + # specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x4 neon/; + # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is + # fixed. + # specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64 sse2 neon/; @@ -1607,9 +1611,13 @@ () specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x8 neon/; + # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is + # fixed. 
+ # specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x4 neon/; + # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is + # fixed. + # specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64 sse2 neon/; @@ -1645,9 +1653,13 @@ () specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x8 neon/; + # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is + # fixed. + # specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x4 neon/; + # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is + # fixed. + # specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x4 neon/; } # CONFIG_VP9_HIGHBITDEPTH From eab52a4f3c5bece97b8a2656553903aacd8f7ab4 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 8 Mar 2023 16:34:20 +0000 Subject: [PATCH 592/926] Fix buffer overrun in highbd Neon subpel variance filters The high bitdepth Neon code applying the first pass of the bilinear filter for subpixel variance on blocks of width 4 processed two rows at a time. This resulted in a source buffer overread, attempting to produce two rows of padding for the second (vertical) pass of the bilinear filter. This patch modifies highbd_var_filter_block2d_bil_w4 and highbd_avg_pred_var_filter_block2d_bil_w4 such that they only process a single row per iteration, and only require a single row of padding for the second pass. This prevents the buffer overread. Since all block sizes are now processed one row at a time, there is no need for a "padding" macro parameter - the value is always 1, with no special case for 4xh blocks. As well as re-enabling the Neon paths and their associated tests, we remove the now-redundant 'padding' macro parameter. Bug: webm:1796 Change-Id: Icd6076b38eb4476139795bb1734ca800c9edf079 --- test/variance_test.cc | 28 +- vpx_dsp/arm/highbd_subpel_variance_neon.c | 302 ++++++++++------------ vpx_dsp/arm/mem_neon.h | 9 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 48 +--- 4 files changed, 170 insertions(+), 217 deletions(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index 8af26969c6..1359bc4baf 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1591,12 +1591,10 @@ INSTANTIATE_TEST_SUITE_P( 12), SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_neon, 12), - /*TODO(https://crbug.com/webm/1796): enable after heap overflow is - fixed. 
SubpelVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_neon, 12), SubpelVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_neon, - 12),*/ + 12), SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_neon, 10), SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_neon, @@ -1619,12 +1617,10 @@ INSTANTIATE_TEST_SUITE_P( 10), SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_neon, 10), - /*TODO(https://crbug.com/webm/1796): enable after heap overflow is - fixed. SubpelVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_neon, 10), SubpelVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_neon, - 10),*/ + 10), SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_neon, 8), SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_neon, @@ -1644,12 +1640,10 @@ INSTANTIATE_TEST_SUITE_P( SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_neon, 8), SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_neon, 8), - SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon, 8) - /*TODO(https://crbug.com/webm/1796): enable after heap overflow is - fixed. + SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon, 8), SubpelVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_neon, 8), SubpelVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_neon, - 8)*/)); + 8))); INSTANTIATE_TEST_SUITE_P( NEON, VpxHBDSubpelAvgVarianceTest, @@ -1687,14 +1681,12 @@ INSTANTIATE_TEST_SUITE_P( SubpelAvgVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_neon, 12), - /*TODO(https://crbug.com/webm/1796): enable after heap overflow is - fixed. SubpelAvgVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_avg_variance4x8_neon, 12), SubpelAvgVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_avg_variance4x4_neon, - 12),*/ + 12), SubpelAvgVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_neon, 10), @@ -1728,14 +1720,12 @@ INSTANTIATE_TEST_SUITE_P( SubpelAvgVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_neon, 10), - /*TODO(https://crbug.com/webm/1796): enable after heap overflow is - fixed. SubpelAvgVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_avg_variance4x8_neon, 10), SubpelAvgVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_avg_variance4x4_neon, - 10),*/ + 10), SubpelAvgVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_neon, 8), @@ -1768,15 +1758,13 @@ INSTANTIATE_TEST_SUITE_P( 8), SubpelAvgVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_neon, - 8) - /*TODO(https://crbug.com/webm/1796): enable after heap overflow is - fixed. + 8), SubpelAvgVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_avg_variance4x8_neon, 8), SubpelAvgVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_neon, - 8)*/)); + 8))); #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_NEON diff --git a/vpx_dsp/arm/highbd_subpel_variance_neon.c b/vpx_dsp/arm/highbd_subpel_variance_neon.c index b2fe9921c9..683df5797a 100644 --- a/vpx_dsp/arm/highbd_subpel_variance_neon.c +++ b/vpx_dsp/arm/highbd_subpel_variance_neon.c @@ -36,33 +36,29 @@ // requiring double the number of data processing instructions. (12-bit * 8 = // 15-bit.) -// Process a block exactly 4 wide and a multiple of 2 high. -// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. -#if 0 +// Process a block exactly 4 wide and any height. 
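+// (One row per iteration: the first pass now reads exactly the h + 1 source
+// rows it needs. Per lane, the blend below computes
+// dst[x] = (s0[x] * (8 - filter_offset) + s1[x] * filter_offset + 4) >> 3.)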
static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset) { - const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset); - const uint16x8_t f1 = vdupq_n_u16(filter_offset); + const uint16x4_t f0 = vdup_n_u16(8 - filter_offset); + const uint16x4_t f1 = vdup_n_u16(filter_offset); int i = dst_height; do { - uint16x8_t s0 = load_unaligned_u16q(src_ptr, src_stride); - uint16x8_t s1 = load_unaligned_u16q(src_ptr + pixel_step, src_stride); + uint16x4_t s0 = load_unaligned_u16(src_ptr); + uint16x4_t s1 = load_unaligned_u16(src_ptr + pixel_step); - uint16x8_t blend = vmulq_u16(s0, f0); - blend = vmlaq_u16(blend, s1, f1); - blend = vrshrq_n_u16(blend, 3); + uint16x4_t blend = vmul_u16(s0, f0); + blend = vmla_u16(blend, s1, f1); + blend = vrshr_n_u16(blend, 3); - vst1q_u16(dst_ptr, blend); + vst1_u16(dst_ptr, blend); - src_ptr += 2 * src_stride; - dst_ptr += 8; - i -= 2; - } while (i != 0); + src_ptr += src_stride; + dst_ptr += 4; + } while (--i != 0); } -#endif // 0 // Process a block which is a multiple of 8 and any height. static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr, @@ -148,23 +144,23 @@ static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr, } while (--i != 0); } -#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \ +#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \ const uint8_t *src, int src_stride, int xoffset, int yoffset, \ const uint8_t *ref, int ref_stride, uint32_t *sse) { \ - uint16_t tmp0[w * (h + padding)]; \ + uint16_t tmp0[w * (h + 1)]; \ uint16_t tmp1[w * h]; \ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ \ - highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ - (h + padding), xoffset); \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \ + xoffset); \ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ \ return vpx_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \ w, ref, ref_stride, sse); \ } -#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \ +#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \ const uint8_t *src, int src_stride, int xoffset, int yoffset, \ const uint8_t *ref, int ref_stride, unsigned int *sse) { \ @@ -188,28 +184,28 @@ static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr, CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ } \ } else if (xoffset == 4) { \ - uint16_t tmp0[w * (h + padding)]; \ + uint16_t tmp0[w * (h + 1)]; \ if (yoffset == 0) { \ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ - uint16_t tmp1[w * (h + padding)]; \ + uint16_t tmp1[w * (h + 1)]; \ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ - (h + padding)); \ + (h + 1)); \ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else { \ - uint16_t tmp1[w * (h + padding)]; \ + uint16_t tmp1[w * (h + 1)]; \ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ - (h + padding)); \ + (h + 1)); \ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ return 
vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } \ } else { \ - uint16_t tmp0[w * (h + padding)]; \ + uint16_t tmp0[w * (h + 1)]; \ if (yoffset == 0) { \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \ xoffset); \ @@ -218,14 +214,14 @@ static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr, } else if (yoffset == 4) { \ uint16_t tmp1[w * h]; \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ - (h + padding), xoffset); \ + (h + 1), xoffset); \ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp1[w * h]; \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ - (h + padding), xoffset); \ + (h + 1), xoffset); \ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ @@ -233,98 +229,88 @@ static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr, } \ } -// 4x blocks are processed two rows at a time, so require an extra row of -// padding. - // 8-bit -// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. -// HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4, 2) -// HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8, 2) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8) -HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64) // 10-bit -// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. 
-// HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4, 2) -// HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8, 2) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8) -HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64) // 12-bit -// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. -// HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4, 2) -// HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8, 2) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8) -HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64) // Combine bilinear filter with vpx_highbd_comp_avg_pred for blocks having // width 4. -// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. 
-#if 0 static void highbd_avg_pred_var_filter_block2d_bil_w4( const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset, const uint16_t *second_pred) { - const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset); - const uint16x8_t f1 = vdupq_n_u16(filter_offset); + const uint16x4_t f0 = vdup_n_u16(8 - filter_offset); + const uint16x4_t f1 = vdup_n_u16(filter_offset); int i = dst_height; do { - uint16x8_t s0 = load_unaligned_u16q(src_ptr, src_stride); - uint16x8_t s1 = load_unaligned_u16q(src_ptr + pixel_step, src_stride); - uint16x8_t p = vld1q_u16(second_pred); + uint16x4_t s0 = load_unaligned_u16(src_ptr); + uint16x4_t s1 = load_unaligned_u16(src_ptr + pixel_step); + uint16x4_t p = vld1_u16(second_pred); - uint16x8_t blend = vmulq_u16(s0, f0); - blend = vmlaq_u16(blend, s1, f1); - blend = vrshrq_n_u16(blend, 3); + uint16x4_t blend = vmul_u16(s0, f0); + blend = vmla_u16(blend, s1, f1); + blend = vrshr_n_u16(blend, 3); - vst1q_u16(dst_ptr, vrhaddq_u16(blend, p)); + vst1_u16(dst_ptr, vrhadd_u16(blend, p)); - src_ptr += 2 * src_stride; - dst_ptr += 2 * 4; - second_pred += 2 * 4; - i -= 2; - } while (i != 0); + src_ptr += src_stride; + dst_ptr += 4; + second_pred += 4; + } while (--i != 0); } -#endif // 0 // Combine bilinear filter with vpx_highbd_comp_avg_pred for large blocks. static void highbd_avg_pred_var_filter_block2d_bil_large( @@ -444,25 +430,25 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, } while (--i != 0); } -#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \ - uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t tmp0[w * (h + padding)]; \ - uint16_t tmp1[w * h]; \ - uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ - \ - highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ - (h + padding), xoffset); \ - highbd_avg_pred_var_filter_block2d_bil_w##w( \ - tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ - \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ - CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ +#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ + uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t tmp0[w * (h + 1)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \ + xoffset); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } -#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \ +#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ unsigned int vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ const uint8_t *src, int source_stride, int xoffset, int yoffset, \ const uint8_t *ref, int ref_stride, unsigned int *sse, \ @@ -490,7 +476,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ } \ } else if (xoffset == 4) { \ - 
uint16_t tmp0[w * (h + padding)]; \ + uint16_t tmp0[w * (h + 1)]; \ if (yoffset == 0) { \ highbd_avg_pred_var_filter_block2d_avg( \ src_ptr, tmp0, source_stride, 1, w, h, \ @@ -498,24 +484,24 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ - uint16_t tmp1[w * (h + padding)]; \ + uint16_t tmp1[w * (h + 1)]; \ highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \ - (h + padding)); \ + (h + 1)); \ highbd_avg_pred_var_filter_block2d_avg( \ tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else { \ - uint16_t tmp1[w * (h + padding)]; \ + uint16_t tmp1[w * (h + 1)]; \ highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \ - (h + padding)); \ + (h + 1)); \ highbd_avg_pred_var_filter_block2d_bil_w##w( \ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } \ } else { \ - uint16_t tmp0[w * (h + padding)]; \ + uint16_t tmp0[w * (h + 1)]; \ if (yoffset == 0) { \ highbd_avg_pred_var_filter_block2d_bil_w##w( \ src_ptr, tmp0, source_stride, 1, h, xoffset, \ @@ -525,7 +511,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, } else if (yoffset == 4) { \ uint16_t tmp1[w * h]; \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \ - (h + padding), xoffset); \ + (h + 1), xoffset); \ highbd_avg_pred_var_filter_block2d_avg( \ tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ @@ -533,7 +519,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, } else { \ uint16_t tmp1[w * h]; \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \ - (h + padding), xoffset); \ + (h + 1), xoffset); \ highbd_avg_pred_var_filter_block2d_bil_w##w( \ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ @@ -542,65 +528,59 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, } \ } -// 4x blocks are processed two rows at a time, so require an extra row of -// padding. - // 8-bit -// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. 
-// HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4, 2) -// HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8, 2) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8) -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4, 1) -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8, 1) -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16, 1) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64) // 10-bit -// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. -// HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4, 2) -// HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8, 2) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8) -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4, 1) -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8, 1) -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16, 1) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64) // 12-bit -// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. 
-// HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4, 2) -// HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8, 2) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8) -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4, 1) -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8, 1) -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16, 1) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64) diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index 400846b707..fa14f80b23 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -164,6 +164,15 @@ static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, return vreinterpret_u8_u32(a_u32); } +// Load 8 bytes when alignment is not guaranteed. +static INLINE uint16x4_t load_unaligned_u16(const uint16_t *buf) { + uint64_t a; + uint64x1_t a_u64 = vdup_n_u64(0); + memcpy(&a, buf, 8); + a_u64 = vset_lane_u64(a, a_u64, 0); + return vreinterpret_u16_u64(a_u64); +} + // Load 2 sets of 8 bytes when alignment is not guaranteed. static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf, ptrdiff_t stride) { diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 62f4789c2c..ad8ff6e184 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1443,13 +1443,9 @@ () specialize qw/vpx_highbd_12_sub_pixel_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is - # fixed. - # specialize qw/vpx_highbd_12_sub_pixel_variance4x8 neon/; + specialize qw/vpx_highbd_12_sub_pixel_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is - # fixed. 
- # specialize qw/vpx_highbd_12_sub_pixel_variance4x4 neon/; + specialize qw/vpx_highbd_12_sub_pixel_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance64x64 sse2 neon/; @@ -1485,13 +1481,9 @@ () specialize qw/vpx_highbd_10_sub_pixel_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is - # fixed. - # specialize qw/vpx_highbd_10_sub_pixel_variance4x8 neon/; + specialize qw/vpx_highbd_10_sub_pixel_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is - # fixed. - # specialize qw/vpx_highbd_10_sub_pixel_variance4x4 neon/; + specialize qw/vpx_highbd_10_sub_pixel_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance64x64 sse2 neon/; @@ -1527,13 +1519,9 @@ () specialize qw/vpx_highbd_8_sub_pixel_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is - # fixed. - # specialize qw/vpx_highbd_8_sub_pixel_variance4x8 neon/; + specialize qw/vpx_highbd_8_sub_pixel_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is - # fixed. - # specialize qw/vpx_highbd_8_sub_pixel_variance4x4 neon/; + specialize qw/vpx_highbd_8_sub_pixel_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64 sse2 neon/; @@ -1569,13 +1557,9 @@ () specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is - # fixed. - # specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x8 neon/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is - # fixed. 
-  # specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x4 neon/;
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x4 neon/;

 add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
 specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64 sse2 neon/;
@@ -1611,13 +1595,9 @@ ()
   specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4 sse2 neon/;

 add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is
-  # fixed.
-  # specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x8 neon/;
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x8 neon/;

 add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is
-  # fixed.
-  # specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x4 neon/;
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x4 neon/;

 add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
 specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64 sse2 neon/;
@@ -1653,13 +1633,9 @@ ()
   specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4 sse2 neon/;

 add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is
-  # fixed.
-  # specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x8 neon/;
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x8 neon/;

 add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is
-  # fixed.
-  # specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x4 neon/;
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x4 neon/;
 } # CONFIG_VP9_HIGHBITDEPTH

From 4959770032bb5646d9106620906822d1260496cb Mon Sep 17 00:00:00 2001
From: Neeraj Gadgil
Date: Thu, 9 Mar 2023 14:51:44 +0530
Subject: [PATCH 593/926] Rename function 'model_rd_for_sb_earlyterm'

The function is renamed to 'build_inter_pred_model_rd_earlyterm' and a
comment is added to explain its behavior.
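In outline, the renamed function builds the inter predictor, accumulates a
model-based rate/distortion estimate plane by plane, and, when do_earlyterm
is set, returns nonzero as soon as the running RD cost exceeds best_rd so
the caller can skip the candidate. A simplified sketch (the helper name
model_rd_for_plane is illustrative, not the actual vp9_rdopt.c code):

  int rate_sum = 0;
  int64_t dist_sum = 0;
  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
    // Build the inter prediction for this plane, then estimate its
    // rate/distortion from the prediction error via the RD model.
    model_rd_for_plane(plane, &rate, &dist);  // illustrative helper
    rate_sum += rate;
    dist_sum += dist;
    if (do_earlyterm &&
        RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum) > best_rd)
      return 1;  // caller skips this filter candidate
  }
  *out_rate_sum = rate_sum;
  *out_dist_sum = dist_sum;
  return 0;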
Change-Id: I804e6273558ba36241232f62cf18ea754b85e369
---
 vp9/encoder/vp9_rdopt.c | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index f87ab3e0bc..bcadd5777e 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -160,12 +160,13 @@ static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int m, int n,
 }

 #if !CONFIG_REALTIME_ONLY
-static int model_rd_for_sb_earlyterm(VP9_COMP *cpi, int mi_row, int mi_col,
-                                     BLOCK_SIZE bsize, MACROBLOCK *x,
-                                     MACROBLOCKD *xd, int *out_rate_sum,
-                                     int64_t *out_dist_sum, int *skip_txfm_sb,
-                                     int64_t *skip_sse_sb, int do_earlyterm,
-                                     int64_t best_rd) {
+// Planewise build inter prediction and compute rdcost with early termination
+// option
+static int build_inter_pred_model_rd_earlyterm(
+    VP9_COMP *cpi, int mi_row, int mi_col, BLOCK_SIZE bsize, MACROBLOCK *x,
+    MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum,
+    int *skip_txfm_sb, int64_t *skip_sse_sb, int do_earlyterm,
+    int64_t best_rd) {
   // Note our transform coeffs are 8 times an orthogonal transform.
   // Hence quantizer step is also 8 times. To get effective quantizer
   // we need to divide by 8 before sending to modeling function.
@@ -2999,13 +3000,13 @@ static int64_t handle_inter_mode(
           xd->plane[j].dst.stride = 64;
         }
       }
-      // Compute RD cost with early termination option
+
      filt_best_rd =
          cm->interp_filter == SWITCHABLE ? (best_rd - rs_rd) : best_rd;
-      if (model_rd_for_sb_earlyterm(cpi, mi_row, mi_col, bsize, x, xd,
-                                    &rate_sum, &dist_sum, &tmp_skip_sb,
-                                    &tmp_skip_sse, enable_earlyterm,
-                                    filt_best_rd)) {
+      if (build_inter_pred_model_rd_earlyterm(
+              cpi, mi_row, mi_col, bsize, x, xd, &rate_sum, &dist_sum,
+              &tmp_skip_sb, &tmp_skip_sse, enable_earlyterm,
+              filt_best_rd)) {
         filter_cache[i] = INT64_MAX;
         continue;
       }
@@ -3076,9 +3077,9 @@ static int64_t handle_inter_mode(
     // Handles the special case when a filter that is not in the
     // switchable list (ex. bilinear) is indicated at the frame level, or
     // skip condition holds.
-    model_rd_for_sb_earlyterm(cpi, mi_row, mi_col, bsize, x, xd, &tmp_rate,
-                              &tmp_dist, &skip_txfm_sb, &skip_sse_sb,
-                              0 /*do_earlyterm*/, INT64_MAX);
+    build_inter_pred_model_rd_earlyterm(
+        cpi, mi_row, mi_col, bsize, x, xd, &tmp_rate, &tmp_dist, &skip_txfm_sb,
+        &skip_sse_sb, 0 /*do_earlyterm*/, INT64_MAX);
     rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
     memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
     memcpy(bsse, x->bsse, sizeof(bsse));

From 775d594e462252c0e8f8113955122e9c34eeab44 Mon Sep 17 00:00:00 2001
From: Anupam Pandey
Date: Mon, 6 Mar 2023 10:38:20 +0530
Subject: [PATCH 594/926] Add AVX2 for vpx_filter_block1d8_v8() function

Introduced AVX2 intrinsics to compute the vertical convolution for the
w = 8 case. This is a bit-exact change.
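For reference, the operation being vectorized is the standard 8-tap vertical
convolution; a plain-C sketch of what the intrinsics compute (function name
illustrative, assuming vpx's 7-bit filter taps that sum to 128):

  static void filter_block1d8_v8_c(const uint8_t *src, ptrdiff_t src_pitch,
                                   uint8_t *dst, ptrdiff_t dst_pitch,
                                   uint32_t height, const int16_t *filter) {
    uint32_t y;
    int x, k;
    for (y = 0; y < height; ++y) {
      for (x = 0; x < 8; ++x) {
        int sum = 0;
        // 8-tap filter applied down a column of source rows.
        for (k = 0; k < 8; ++k) sum += src[k * src_pitch + x] * filter[k];
        // Round and shift by FILTER_BITS (7), then clamp to 8 bits.
        sum = (sum + 64) >> 7;
        dst[x] = (uint8_t)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
      }
      src += src_pitch;
      dst += dst_pitch;
    }
  }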
Instruction Count cpu Resolution Reduction(%) 0 LOWRES2 1.347 0 MIDRES2 1.046 0 HDRES2 0.805 0 Average 1.066 Change-Id: Idf77fff054beaf2c985b9bf2335591bda47e811f --- vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 96 ++++++++++++++++++++++- 1 file changed, 94 insertions(+), 2 deletions(-) diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 26e82f9b73..141614e7ad 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -942,19 +942,111 @@ static void vpx_filter_block1d4_v4_avx2(const uint8_t *src_ptr, } } +static void vpx_filter_block1d8_v8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m256i f[4], ss[4]; + __m256i r[8]; + __m128i s[9]; + + unsigned int y = output_height; + // Multiply the size of the source stride by two + const ptrdiff_t src_stride = src_pitch << 1; + + // The output_height is always a multiple of two. + assert(!(output_height & 1)); + + shuffle_filter_avx2(filter, f); + s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch)); + s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch)); + s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch)); + s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch)); + s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); + s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch)); + s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch)); + + // merge the result together + // r[0]: 0 0 0 0 0 0 0 0 r17 r16 r15 r14 r13 r12 r11 r10 | 0 0 0 0 0 0 0 0 + // r07 r06 r05 r04 r03 r02 r01 r00 + r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[1], 1); + + // r[1]: 0 0 0 0 0 0 0 0 r27 r26 r25 r24 r23 r22 r21 r20 | 0 0 0 0 0 0 0 0 + // r17 r16 r15 r14 r13 r12 r11 r10 + r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[2], 1); + + // r[2]: 0 0 0 0 0 0 0 0 r37 r36 r35 r34 r33 r32 r31 r30 | 0 0 0 0 0 0 0 0 + // r27 r26 r25 r24 r23 r22 r21 r20 + r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[3], 1); + + // r[3]: 0 0 0 0 0 0 0 0 r47 r46 r45 r44 r43 r42 r41 r40 | 0 0 0 0 0 0 0 0 + // r37 r36 r35 r34 r33 r32 r31 r30 + r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[4], 1); + + // r[4]: 0 0 0 0 0 0 0 0 r57 r56 r55 r54 r53 r52 r51 r50 | 0 0 0 0 0 0 0 0 + // r47 r46 r45 r44 r43 r42 r41 r40 + r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[5], 1); + + // r[5]: 0 0 0 0 0 0 0 0 r67 r66 r65 r64 r63 r62 r61 r60 | 0 0 0 0 0 0 0 0 + // r57 r56 r55 r54 r53 r52 r51 r50 + r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), s[6], 1); + + // Merge together + // ss[0]: |r27 r17|.......|r21 r11|r20 r10 || r17 r07|.....|r12 r02|r11 + // r01|r10 r00| + ss[0] = _mm256_unpacklo_epi8(r[0], r[1]); + + // ss[0]: |r47 r37|.......|r41 r31|r40 r30 || r37 r27|.....|r32 r22|r31 + // r21|r30 r20| + ss[1] = _mm256_unpacklo_epi8(r[2], r[3]); + + // ss[2]: |r67 r57|.......|r61 r51|r60 r50 || r57 r47|.....|r52 r42|r51 + // r41|r50 r40| + ss[2] = _mm256_unpacklo_epi8(r[4], r[5]); + + // Process 2 rows at a time + do { + s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); + s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch)); + + // r[6]: 0 0 0 0 0 0 0 0 r77 r76 r75 r74 r73 r72 r71 r70 | 0 0 0 0 0 0 0 + // 0 r67 r66 r65 r64 r63 r62 r61 r60 + r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[6]), s[7], 
1); + // r[7]: 0 0 0 0 0 0 0 0 r87 r86 r85 r84 r83 r82 r81 r80 | 0 0 0 0 0 0 0 + // 0 r77 r76 r75 r74 r73 r72 r71 r70 + r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[7]), s[8], 1); + + // ss[3] : | r87 r77 | .......| r81 r71 | r80 r70 || r77 r67 | .....| r72 + // r62 | r71 r61|r70 r60| + ss[3] = _mm256_unpacklo_epi8(r[6], r[7]); + ss[0] = convolve8_16_avx2(ss, f); + ss[0] = _mm256_packus_epi16(ss[0], ss[0]); + src_ptr += src_stride; + + /* shift down two rows */ + s[6] = s[8]; + _mm_storel_epi64((__m128i *)&output_ptr[0], _mm256_castsi256_si128(ss[0])); + output_ptr += out_pitch; + _mm_storel_epi64((__m128i *)&output_ptr[0], + _mm256_extractf128_si256(ss[0], 1)); + output_ptr += out_pitch; + ss[0] = ss[1]; + ss[1] = ss[2]; + ss[2] = ss[3]; + y -= 2; + } while (y > 1); +} + #if HAVE_AVX2 && HAVE_SSSE3 filter8_1dfunction vpx_filter_block1d4_v8_ssse3; #if VPX_ARCH_X86_64 filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; -#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3 #define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3 #else // VPX_ARCH_X86 filter8_1dfunction vpx_filter_block1d8_v8_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_ssse3; filter8_1dfunction vpx_filter_block1d4_h8_ssse3; -#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3 #define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3 #endif // VPX_ARCH_X86_64 filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; From 29beea82437213bd705de4b01103ac0d88bcab72 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 1 Mar 2023 23:37:32 +0000 Subject: [PATCH 595/926] [NEON] Add temporal filter functions, 8-bit and highbd Both are around 3x faster than original C version. 8-bit gives a small 0.5% speed increase, whereas highbd gives ~2.5%. 
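For orientation, the per-pixel update both implementations perform is, in
scalar form (a sketch following the C reference vp9_apply_temporal_filter_c;
variable names illustrative):

  // sum_sq_diff: squared src/pred differences summed over the pixel's 3x3
  // neighbourhood, plus the co-located chroma (or luma) distortions;
  // num_used counts how many values went into that sum.
  int modifier = sum_sq_diff * 3 / num_used;
  modifier += (1 << strength) >> 1;  // rounding
  modifier >>= strength;
  if (modifier > 16) modifier = 16;
  modifier = (16 - modifier) * filter_weight;
  count[k] += modifier;
  accumulator[k] += modifier * pred[k];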
Change-Id: I71d75ddd2757b19aa201e879fd9fa8f3a25431ad --- test/yuv_temporal_filter_test.cc | 18 + vp9/common/vp9_rtcd_defs.pl | 4 +- .../neon/vp9_highbd_temporal_filter_neon.c | 872 ++++++++++++++++++ .../arm/neon/vp9_temporal_filter_neon.c | 849 +++++++++++++++++ ...ants.h => vp9_temporal_filter_constants.h} | 8 +- vp9/encoder/x86/highbd_temporal_filter_sse4.c | 2 +- vp9/encoder/x86/temporal_filter_sse4.c | 2 +- vp9/vp9cx.mk | 9 +- 8 files changed, 1754 insertions(+), 10 deletions(-) create mode 100644 vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c create mode 100644 vp9/encoder/arm/neon/vp9_temporal_filter_neon.c rename vp9/encoder/{x86/temporal_filter_constants.h => vp9_temporal_filter_constants.h} (98%) diff --git a/test/yuv_temporal_filter_test.cc b/test/yuv_temporal_filter_test.cc index 2bdcf4d86f..91b4e804b3 100644 --- a/test/yuv_temporal_filter_test.cc +++ b/test/yuv_temporal_filter_test.cc @@ -694,6 +694,18 @@ INSTANTIATE_TEST_SUITE_P( TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_sse4_1_12, 12))); #endif // HAVE_SSE4_1 +#if HAVE_NEON +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_neon, 10) +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_neon, 12) + +INSTANTIATE_TEST_SUITE_P( + NEON, YUVTemporalFilterTest, + ::testing::Values( + TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_neon_10, + 10), + TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_neon_12, + 12))); +#endif // HAVE_NEON #else INSTANTIATE_TEST_SUITE_P( C, YUVTemporalFilterTest, @@ -704,5 +716,11 @@ INSTANTIATE_TEST_SUITE_P(SSE4_1, YUVTemporalFilterTest, ::testing::Values(TemporalFilterWithBd( &vp9_apply_temporal_filter_sse4_1, 8))); #endif // HAVE_SSE4_1 +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, YUVTemporalFilterTest, + ::testing::Values(TemporalFilterWithBd( + &vp9_apply_temporal_filter_neon, 8))); +#endif // HAVE_NEON #endif // CONFIG_VP9_HIGHBITDEPTH + } // namespace diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 2f9870dd48..d16b947110 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -179,11 +179,11 @@ () # if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") { add_proto qw/void vp9_apply_temporal_filter/, "const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count"; -specialize qw/vp9_apply_temporal_filter sse4_1/; +specialize qw/vp9_apply_temporal_filter sse4_1 neon/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_highbd_apply_temporal_filter/, "const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count"; - specialize qw/vp9_highbd_apply_temporal_filter sse4_1/; + specialize qw/vp9_highbd_apply_temporal_filter sse4_1 neon/; } } diff --git a/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c 
b/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c
new file mode 100644
index 0000000000..c3aef3c865
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c
@@ -0,0 +1,872 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+#include "vp9/encoder/vp9_temporal_filter_constants.h"
+
+// Compute (a-b)**2 for 8 pixels with size 16-bit
+static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b,
+                                       uint32_t *dst) {
+  const uint16x8_t a_reg = vld1q_u16(a);
+  const uint16x8_t b_reg = vld1q_u16(b);
+
+  uint16x8_t dist = vabdq_u16(a_reg, b_reg);
+  uint32x4_t dist_first = vmull_u16(vget_low_u16(dist), vget_low_u16(dist));
+  uint32x4_t dist_second = vmull_u16(vget_high_u16(dist), vget_high_u16(dist));
+
+  vst1q_u32(dst, dist_first);
+  vst1q_u32(dst + 4, dist_second);
+}
+
+// Sum up three neighboring distortions for the pixels
+static INLINE void highbd_get_sum_4(const uint32_t *dist, uint32x4_t *sum) {
+  uint32x4_t dist_reg, dist_left, dist_right;
+
+  dist_reg = vld1q_u32(dist);
+  dist_left = vld1q_u32(dist - 1);
+  dist_right = vld1q_u32(dist + 1);
+
+  *sum = vaddq_u32(dist_reg, dist_left);
+  *sum = vaddq_u32(*sum, dist_right);
+}
+
+static INLINE void highbd_get_sum_8(const uint32_t *dist, uint32x4_t *sum_first,
+                                    uint32x4_t *sum_second) {
+  highbd_get_sum_4(dist, sum_first);
+  highbd_get_sum_4(dist + 4, sum_second);
+}
+
+// Average the value based on the number of values summed (9 for pixels away
+// from the border, 4 for pixels in corners, and 6 for other edge values, plus
+// however many values from y/uv plane are).
+//
+// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply
+// by weight.
+static INLINE void highbd_average_4(uint32x4_t *output, const uint32x4_t sum,
+                                    const uint32x4_t *mul_constants,
+                                    const int strength, const int rounding,
+                                    const int weight) {
+  const int64x2_t strength_s64 = vdupq_n_s64(-strength - 32);
+  const uint64x2_t rounding_u64 = vdupq_n_u64((uint64_t)rounding << 32);
+  const uint32x4_t weight_u32 = vdupq_n_u32(weight);
+  const uint32x4_t sixteen = vdupq_n_u32(16);
+  uint32x4_t sum2;
+
+  // modifier * 3 / index;
+  uint64x2_t sum_lo =
+      vmlal_u32(rounding_u64, vget_low_u32(sum), vget_low_u32(*mul_constants));
+  uint64x2_t sum_hi = vmlal_u32(rounding_u64, vget_high_u32(sum),
+                                vget_high_u32(*mul_constants));
+
+  // we cannot use vshrn_n_u64 as strength is not known at compile time.
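+  // (vshlq_u64 with a negative shift amount performs a right shift, so the
+  // lines below evaluate
+  // (sum * mul_constant + ((uint64_t)rounding << 32)) >> (strength + 32),
+  // the extra 32 bits of shift cancelling the fixed-point scale carried by
+  // the multiplier constants; note rounding is pre-shifted by 32 to match.)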
+ sum_lo = vshlq_u64(sum_lo, strength_s64); + sum_hi = vshlq_u64(sum_hi, strength_s64); + + sum2 = vcombine_u32(vmovn_u64(sum_lo), vmovn_u64(sum_hi)); + + // Multiply with the weight + sum2 = vminq_u32(sum2, sixteen); + sum2 = vsubq_u32(sixteen, sum2); + *output = vmulq_u32(sum2, weight_u32); +} + +static INLINE void highbd_average_8(uint32x4_t *output_0, uint32x4_t *output_1, + const uint32x4_t sum_0_u32, + const uint32x4_t sum_1_u32, + const uint32x4_t *mul_constants_0, + const uint32x4_t *mul_constants_1, + const int strength, const int rounding, + const int weight) { + highbd_average_4(output_0, sum_0_u32, mul_constants_0, strength, rounding, + weight); + highbd_average_4(output_1, sum_1_u32, mul_constants_1, strength, rounding, + weight); +} + +// Add 'sum_u32' to 'count'. Multiply by 'pred' and add to 'accumulator.' +static INLINE void highbd_accumulate_and_store_8( + const uint32x4_t sum_first_u32, const uint32x4_t sum_second_u32, + const uint16_t *pred, uint16_t *count, uint32_t *accumulator) { + const uint16x8_t sum_u16 = + vcombine_u16(vqmovn_u32(sum_first_u32), vqmovn_u32(sum_second_u32)); + uint16x8_t pred_u16 = vld1q_u16(pred); + uint16x8_t count_u16 = vld1q_u16(count); + uint32x4_t pred_0_u32, pred_1_u32; + uint32x4_t accum_0_u32, accum_1_u32; + + count_u16 = vqaddq_u16(count_u16, sum_u16); + vst1q_u16(count, count_u16); + + accum_0_u32 = vld1q_u32(accumulator); + accum_1_u32 = vld1q_u32(accumulator + 4); + + pred_0_u32 = vmovl_u16(vget_low_u16(pred_u16)); + pred_1_u32 = vmovl_u16(vget_high_u16(pred_u16)); + + // Don't use sum_u16 as that produces different results to the C version + accum_0_u32 = vmlaq_u32(accum_0_u32, sum_first_u32, pred_0_u32); + accum_1_u32 = vmlaq_u32(accum_1_u32, sum_second_u32, pred_1_u32); + + vst1q_u32(accumulator, accum_0_u32); + vst1q_u32(accumulator + 4, accum_1_u32); +} + +static INLINE void highbd_read_dist_4(const uint32_t *dist, + uint32x4_t *dist_reg) { + *dist_reg = vld1q_u32(dist); +} + +static INLINE void highbd_read_dist_8(const uint32_t *dist, + uint32x4_t *reg_first, + uint32x4_t *reg_second) { + highbd_read_dist_4(dist, reg_first); + highbd_read_dist_4(dist + 4, reg_second); +} + +static INLINE void highbd_read_chroma_dist_row_8( + int ss_x, const uint32_t *u_dist, const uint32_t *v_dist, + uint32x4_t *u_first, uint32x4_t *u_second, uint32x4_t *v_first, + uint32x4_t *v_second) { + if (!ss_x) { + // If there is no chroma subsampling in the horizontal direction, then we + // need to load 8 entries from chroma. 
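+    // Eight 32-bit distortion values per plane, i.e. two q registers each.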
+    highbd_read_dist_8(u_dist, u_first, u_second);
+    highbd_read_dist_8(v_dist, v_first, v_second);
+  } else {  // ss_x == 1
+    // Otherwise, we only need to load 4 entries
+    uint32x4_t u_reg, v_reg;
+    uint32x4x2_t pair;
+
+    highbd_read_dist_4(u_dist, &u_reg);
+
+    pair = vzipq_u32(u_reg, u_reg);
+    *u_first = pair.val[0];
+    *u_second = pair.val[1];
+
+    highbd_read_dist_4(v_dist, &v_reg);
+
+    pair = vzipq_u32(v_reg, v_reg);
+    *v_first = pair.val[0];
+    *v_second = pair.val[1];
+  }
+}
+
+static void highbd_apply_temporal_filter_luma_8(
+    const uint16_t *y_pre, int y_pre_stride, unsigned int block_width,
+    unsigned int block_height, int ss_x, int ss_y, int strength,
+    int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+    const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist,
+    const uint32_t *const *neighbors_first,
+    const uint32_t *const *neighbors_second, int top_weight,
+    int bottom_weight) {
+  const int rounding = (1 << strength) >> 1;
+  int weight = top_weight;
+
+  uint32x4_t mul_first, mul_second;
+
+  uint32x4_t sum_row_1_first, sum_row_1_second;
+  uint32x4_t sum_row_2_first, sum_row_2_second;
+  uint32x4_t sum_row_3_first, sum_row_3_second;
+
+  uint32x4_t u_first, u_second;
+  uint32x4_t v_first, v_second;
+
+  uint32x4_t sum_row_first;
+  uint32x4_t sum_row_second;
+
+  // Loop variables
+  unsigned int h;
+
+  assert(strength >= 4 && strength <= 14 &&
+         "invalid adjusted temporal filter strength");
+  assert(block_width == 8);
+
+  (void)block_width;
+
+  // First row
+  mul_first = vld1q_u32(neighbors_first[0]);
+  mul_second = vld1q_u32(neighbors_second[0]);
+
+  // Add luma values
+  highbd_get_sum_8(y_dist, &sum_row_2_first, &sum_row_2_second);
+  highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+  // We don't need to saturate here because the maximum value is
+  // UINT12_MAX ** 2 * 9 ~= 2**24 * 9 < 2 ** 28 < INT32_MAX
+  sum_row_first = vaddq_u32(sum_row_2_first, sum_row_3_first);
+  sum_row_second = vaddq_u32(sum_row_2_second, sum_row_3_second);
+
+  // Add chroma values
+  highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+                                &v_first, &v_second);
+
+  // Max value here is 2 ** 24 * (9 + 2), so no saturation is needed
+  sum_row_first = vaddq_u32(sum_row_first, u_first);
+  sum_row_second = vaddq_u32(sum_row_second, u_second);
+
+  sum_row_first = vaddq_u32(sum_row_first, v_first);
+  sum_row_second = vaddq_u32(sum_row_second, v_second);
+
+  // Get modifier and store result
+  highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first,
+                   sum_row_second, &mul_first, &mul_second, strength, rounding,
+                   weight);
+
+  highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+                                y_accum);
+
+  y_pre += y_pre_stride;
+  y_count += y_pre_stride;
+  y_accum += y_pre_stride;
+  y_dist += DIST_STRIDE;
+
+  u_dist += DIST_STRIDE;
+  v_dist += DIST_STRIDE;
+
+  // Then all the rows except the last one
+  mul_first = vld1q_u32(neighbors_first[1]);
+  mul_second = vld1q_u32(neighbors_second[1]);
+
+  for (h = 1; h < block_height - 1; ++h) {
+    // Move the weight to bottom half
+    if (!use_whole_blk && h == block_height / 2) {
+      weight = bottom_weight;
+    }
+    // Shift the rows up
+    sum_row_1_first = sum_row_2_first;
+    sum_row_1_second = sum_row_2_second;
+    sum_row_2_first = sum_row_3_first;
+    sum_row_2_second = sum_row_3_second;
+
+    // Add luma values to the modifier
+    sum_row_first = vaddq_u32(sum_row_1_first, sum_row_2_first);
+    sum_row_second = vaddq_u32(sum_row_1_second, sum_row_2_second);
+
+    highbd_get_sum_8(y_dist + DIST_STRIDE,
&sum_row_3_first, &sum_row_3_second); + + sum_row_first = vaddq_u32(sum_row_first, sum_row_3_first); + sum_row_second = vaddq_u32(sum_row_second, sum_row_3_second); + + // Add chroma values to the modifier + if (ss_y == 0 || h % 2 == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + } + + sum_row_first = vaddq_u32(sum_row_first, u_first); + sum_row_second = vaddq_u32(sum_row_second, u_second); + sum_row_first = vaddq_u32(sum_row_first, v_first); + sum_row_second = vaddq_u32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first, + sum_row_second, &mul_first, &mul_second, strength, + rounding, weight); + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + } + + // The last row + mul_first = vld1q_u32(neighbors_first[0]); + mul_second = vld1q_u32(neighbors_second[0]); + + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = vaddq_u32(sum_row_1_first, sum_row_2_first); + sum_row_second = vaddq_u32(sum_row_1_second, sum_row_2_second); + + // Add chroma values to the modifier + if (ss_y == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + } + + sum_row_first = vaddq_u32(sum_row_first, u_first); + sum_row_second = vaddq_u32(sum_row_second, u_second); + sum_row_first = vaddq_u32(sum_row_first, v_first); + sum_row_second = vaddq_u32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first, + sum_row_second, &mul_first, &mul_second, strength, rounding, + weight); + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); +} + +// Perform temporal filter for the luma component. +static void highbd_apply_temporal_filter_luma( + const uint16_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) { + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x; + const unsigned int mid_width = block_width >> 1, + last_width = block_width - blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? 
blk_fw[0] : blk_fw[2]; + const uint32_t *const *neighbors_first; + const uint32_t *const *neighbors_second; + + // Left + neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; + highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + neighbors_first = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; + for (; blk_col < mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; blk_col < last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + } + + // Right + neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS; + highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); +} + +// Add a row of luma distortion that corresponds to 8 chroma mods. If we are +// subsampling in x direction, then we have 16 lumas, else we have 8. 
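+// In the subsampled case the 16 luma sums are pairwise added (vpaddlq) so
+// that two luma columns collapse onto each of the 8 chroma columns.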
+static INLINE void highbd_add_luma_dist_to_8_chroma_mod( + const uint32_t *y_dist, int ss_x, int ss_y, uint32x4_t *u_mod_fst, + uint32x4_t *u_mod_snd, uint32x4_t *v_mod_fst, uint32x4_t *v_mod_snd) { + uint32x4_t y_reg_fst, y_reg_snd; + if (!ss_x) { + highbd_read_dist_8(y_dist, &y_reg_fst, &y_reg_snd); + if (ss_y == 1) { + uint32x4_t y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + y_reg_fst = vaddq_u32(y_reg_fst, y_tmp_fst); + y_reg_snd = vaddq_u32(y_reg_snd, y_tmp_snd); + } + } else { + // Temporary + uint32x4_t y_fst, y_snd; + uint64x2_t y_fst64, y_snd64; + + // First 8 + highbd_read_dist_8(y_dist, &y_fst, &y_snd); + if (ss_y == 1) { + uint32x4_t y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + + y_fst = vaddq_u32(y_fst, y_tmp_fst); + y_snd = vaddq_u32(y_snd, y_tmp_snd); + } + + y_fst64 = vpaddlq_u32(y_fst); + y_snd64 = vpaddlq_u32(y_snd); + y_reg_fst = vcombine_u32(vqmovn_u64(y_fst64), vqmovn_u64(y_snd64)); + + // Second 8 + highbd_read_dist_8(y_dist + 8, &y_fst, &y_snd); + if (ss_y == 1) { + uint32x4_t y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + 8 + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + + y_fst = vaddq_u32(y_fst, y_tmp_fst); + y_snd = vaddq_u32(y_snd, y_tmp_snd); + } + + y_fst64 = vpaddlq_u32(y_fst); + y_snd64 = vpaddlq_u32(y_snd); + y_reg_snd = vcombine_u32(vqmovn_u64(y_fst64), vqmovn_u64(y_snd64)); + } + + *u_mod_fst = vaddq_u32(*u_mod_fst, y_reg_fst); + *u_mod_snd = vaddq_u32(*u_mod_snd, y_reg_snd); + *v_mod_fst = vaddq_u32(*v_mod_fst, y_reg_fst); + *v_mod_snd = vaddq_u32(*v_mod_snd, y_reg_snd); +} + +// Apply temporal filter to the chroma components. This performs temporal +// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use +// blk_fw as an array of size 4 for the weights for each of the 4 subblocks, +// else use top_weight for top half, and bottom weight for bottom half. 
+static void highbd_apply_temporal_filter_chroma_8( + const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, + unsigned int uv_block_width, unsigned int uv_block_height, int ss_x, + int ss_y, int strength, uint32_t *u_accum, uint16_t *u_count, + uint32_t *v_accum, uint16_t *v_count, const uint32_t *y_dist, + const uint32_t *u_dist, const uint32_t *v_dist, + const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd, + int top_weight, int bottom_weight, const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + uint32x4_t mul_fst, mul_snd; + + uint32x4_t u_sum_row_1_fst, u_sum_row_2_fst, u_sum_row_3_fst; + uint32x4_t v_sum_row_1_fst, v_sum_row_2_fst, v_sum_row_3_fst; + uint32x4_t u_sum_row_1_snd, u_sum_row_2_snd, u_sum_row_3_snd; + uint32x4_t v_sum_row_1_snd, v_sum_row_2_snd, v_sum_row_3_snd; + + uint32x4_t u_sum_row_fst, v_sum_row_fst; + uint32x4_t u_sum_row_snd, v_sum_row_snd; + + // Loop variable + unsigned int h; + + (void)uv_block_width; + + // First row + mul_fst = vld1q_u32(neighbors_fst[0]); + mul_snd = vld1q_u32(neighbors_snd[0]); + + // Add chroma values + highbd_get_sum_8(u_dist, &u_sum_row_2_fst, &u_sum_row_2_snd); + highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd); + + u_sum_row_fst = vaddq_u32(u_sum_row_2_fst, u_sum_row_3_fst); + u_sum_row_snd = vaddq_u32(u_sum_row_2_snd, u_sum_row_3_snd); + + highbd_get_sum_8(v_dist, &v_sum_row_2_fst, &v_sum_row_2_snd); + highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd); + + v_sum_row_fst = vaddq_u32(v_sum_row_2_fst, v_sum_row_3_fst); + v_sum_row_snd = vaddq_u32(v_sum_row_2_snd, v_sum_row_3_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst, + u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst, + v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + + // Then all the rows except the last one + mul_fst = vld1q_u32(neighbors_fst[1]); + mul_snd = vld1q_u32(neighbors_snd[1]); + + for (h = 1; h < uv_block_height - 1; ++h) { + // Move the weight pointer to the bottom half of the blocks + if (h == uv_block_height / 2) { + if (blk_fw) { + blk_fw += 2; + } else { + weight = bottom_weight; + } + } + + // Shift the rows up + u_sum_row_1_fst = u_sum_row_2_fst; + u_sum_row_2_fst = u_sum_row_3_fst; + u_sum_row_1_snd = u_sum_row_2_snd; + u_sum_row_2_snd = u_sum_row_3_snd; + + v_sum_row_1_fst = v_sum_row_2_fst; + 
v_sum_row_2_fst = v_sum_row_3_fst; + v_sum_row_1_snd = v_sum_row_2_snd; + v_sum_row_2_snd = v_sum_row_3_snd; + + // Add chroma values + u_sum_row_fst = vaddq_u32(u_sum_row_1_fst, u_sum_row_2_fst); + u_sum_row_snd = vaddq_u32(u_sum_row_1_snd, u_sum_row_2_snd); + highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd); + u_sum_row_fst = vaddq_u32(u_sum_row_fst, u_sum_row_3_fst); + u_sum_row_snd = vaddq_u32(u_sum_row_snd, u_sum_row_3_snd); + + v_sum_row_fst = vaddq_u32(v_sum_row_1_fst, v_sum_row_2_fst); + v_sum_row_snd = vaddq_u32(v_sum_row_1_snd, v_sum_row_2_snd); + highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd); + v_sum_row_fst = vaddq_u32(v_sum_row_fst, v_sum_row_3_fst); + v_sum_row_snd = vaddq_u32(v_sum_row_snd, v_sum_row_3_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst, + u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst, + v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + } + + // The last row + mul_fst = vld1q_u32(neighbors_fst[0]); + mul_snd = vld1q_u32(neighbors_snd[0]); + + // Shift the rows up + u_sum_row_1_fst = u_sum_row_2_fst; + u_sum_row_2_fst = u_sum_row_3_fst; + u_sum_row_1_snd = u_sum_row_2_snd; + u_sum_row_2_snd = u_sum_row_3_snd; + + v_sum_row_1_fst = v_sum_row_2_fst; + v_sum_row_2_fst = v_sum_row_3_fst; + v_sum_row_1_snd = v_sum_row_2_snd; + v_sum_row_2_snd = v_sum_row_3_snd; + + // Add chroma values + u_sum_row_fst = vaddq_u32(u_sum_row_1_fst, u_sum_row_2_fst); + v_sum_row_fst = vaddq_u32(v_sum_row_1_fst, v_sum_row_2_fst); + u_sum_row_snd = vaddq_u32(u_sum_row_1_snd, u_sum_row_2_snd); + v_sum_row_snd = vaddq_u32(v_sum_row_1_snd, v_sum_row_2_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst, + u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + 
highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst, + v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); +} + +// Perform temporal filter for the chroma components. +static void highbd_apply_temporal_filter_chroma( + const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, + unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, + int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) { + const unsigned int uv_width = block_width >> ss_x, + uv_height = block_height >> ss_y; + + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x; + const unsigned int uv_mid_width = uv_width >> 1, + uv_last_width = uv_width - uv_blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const uint32_t *const *neighbors_fst; + const uint32_t *const *neighbors_snd; + + if (uv_width == 8) { + // Special Case: We are subsampling in x direction on a 16x16 block. Since + // we are operating on a row of 8 chroma pixels, we can't use the usual + // left-middle-right pattern. + assert(ss_x); + + if (ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } + + if (use_whole_blk) { + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } else { + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, 0, 0, blk_fw); + } + + return; + } + + // Left + if (ss_x && ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, + neighbors_snd, top_weight, bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + if (ss_x && ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + 
neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + for (; uv_blk_col < uv_mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; uv_blk_col < uv_last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } + + // Right + if (ss_x && ss_y) { + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors_snd = HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS; + } + + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, + neighbors_snd, top_weight, bottom_weight, NULL); +} + +void vp9_highbd_apply_temporal_filter_neon( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) { + const unsigned int chroma_height = block_height >> ss_y, + chroma_width = block_width >> ss_x; + + DECLARE_ALIGNED(16, uint32_t, y_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, u_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, v_dist[BH * DIST_STRIDE]) = { 0 }; + + uint32_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1, + *v_dist_ptr = v_dist + 1; + const uint16_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src; + const uint16_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre; + + // Loop variables + unsigned int row, blk_col; + + assert(block_width <= BW && "block width too large"); + assert(block_height <= BH && "block height too large"); + assert(block_width % 16 == 0 && "block width must be multiple of 16"); + assert(block_height % 2 == 0 && "block height must be even"); + assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) && + "invalid chroma subsampling"); + assert(strength >= 4 && strength <= 14 && + "invalid adjusted temporal filter strength"); + assert(blk_fw[0] >= 0 && "filter weight must be positive"); + assert( + (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) && + "subblock filter weight must 
be positive"); + assert(blk_fw[0] <= 2 && "subblock filter weight must be less than 2"); + assert( + (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) && + "subblock filter weight must be less than 2"); + + // Precompute the difference squared + for (row = 0; row < block_height; row++) { + for (blk_col = 0; blk_col < block_width; blk_col += 8) { + highbd_store_dist_8(y_src_ptr + blk_col, y_pre_ptr + blk_col, + y_dist_ptr + blk_col); + } + y_src_ptr += y_src_stride; + y_pre_ptr += y_pre_stride; + y_dist_ptr += DIST_STRIDE; + } + + for (row = 0; row < chroma_height; row++) { + for (blk_col = 0; blk_col < chroma_width; blk_col += 8) { + highbd_store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col, + u_dist_ptr + blk_col); + highbd_store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col, + v_dist_ptr + blk_col); + } + + u_src_ptr += uv_src_stride; + u_pre_ptr += uv_pre_stride; + u_dist_ptr += DIST_STRIDE; + v_src_ptr += uv_src_stride; + v_pre_ptr += uv_pre_stride; + v_dist_ptr += DIST_STRIDE; + } + + y_dist_ptr = y_dist + 1; + u_dist_ptr = u_dist + 1; + v_dist_ptr = v_dist + 1; + + highbd_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, + block_height, ss_x, ss_y, strength, blk_fw, + use_whole_blk, y_accum, y_count, y_dist_ptr, + u_dist_ptr, v_dist_ptr); + + highbd_apply_temporal_filter_chroma( + u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, + strength, blk_fw, use_whole_blk, u_accum, u_count, v_accum, v_count, + y_dist_ptr, u_dist_ptr, v_dist_ptr); +} diff --git a/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c b/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c new file mode 100644 index 0000000000..a651a15d90 --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c @@ -0,0 +1,849 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_temporal_filter.h" +#include "vp9/encoder/vp9_temporal_filter_constants.h" + +// Read in 8 pixels from a and b as 8-bit unsigned integers, compute the +// difference squared, and store as unsigned 16-bit integer to dst. 
+static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b,
+                                uint16_t *dst) {
+  const uint8x8_t a_reg = vld1_u8(a);
+  const uint8x8_t b_reg = vld1_u8(b);
+
+  uint16x8_t dist_first = vabdl_u8(a_reg, b_reg);
+  dist_first = vmulq_u16(dist_first, dist_first);
+
+  vst1q_u16(dst, dist_first);
+}
+
+static INLINE void store_dist_16(const uint8_t *a, const uint8_t *b,
+                                 uint16_t *dst) {
+  const uint8x16_t a_reg = vld1q_u8(a);
+  const uint8x16_t b_reg = vld1q_u8(b);
+
+  uint16x8_t dist_first = vabdl_u8(vget_low_u8(a_reg), vget_low_u8(b_reg));
+  uint16x8_t dist_second = vabdl_u8(vget_high_u8(a_reg), vget_high_u8(b_reg));
+  dist_first = vmulq_u16(dist_first, dist_first);
+  dist_second = vmulq_u16(dist_second, dist_second);
+
+  vst1q_u16(dst, dist_first);
+  vst1q_u16(dst + 8, dist_second);
+}
+
+static INLINE void read_dist_8(const uint16_t *dist, uint16x8_t *dist_reg) {
+  *dist_reg = vld1q_u16(dist);
+}
+
+static INLINE void read_dist_16(const uint16_t *dist, uint16x8_t *reg_first,
+                                uint16x8_t *reg_second) {
+  read_dist_8(dist, reg_first);
+  read_dist_8(dist + 8, reg_second);
+}
+
+// Average the value based on the number of values summed (9 for pixels away
+// from the border, 4 for pixels in corners, and 6 for other edge values).
+//
+// Add in the rounding factor and shift, clamp to 16, invert and shift.
+// Multiply by weight.
+static INLINE uint16x8_t average_8(uint16x8_t sum,
+                                   const uint16x8_t *mul_constants,
+                                   const int strength, const int rounding,
+                                   const uint16x8_t *weight) {
+  const uint32x4_t rounding_u32 = vdupq_n_u32(rounding << 16);
+  const uint16x8_t weight_u16 = *weight;
+  const uint16x8_t sixteen = vdupq_n_u16(16);
+  const int32x4_t strength_u32 = vdupq_n_s32(-strength - 16);
+
+  // modifier * 3 / index;
+  uint32x4_t sum_hi =
+      vmull_u16(vget_low_u16(sum), vget_low_u16(*mul_constants));
+  uint32x4_t sum_lo =
+      vmull_u16(vget_high_u16(sum), vget_high_u16(*mul_constants));
+
+  sum_lo = vqaddq_u32(sum_lo, rounding_u32);
+  sum_hi = vqaddq_u32(sum_hi, rounding_u32);
+
+  // We cannot use vshrn_n_u32 as strength is not known at compile time.
+  sum_lo = vshlq_u32(sum_lo, strength_u32);
+  sum_hi = vshlq_u32(sum_hi, strength_u32);
+
+  sum = vcombine_u16(vmovn_u32(sum_hi), vmovn_u32(sum_lo));
+
+  // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
+  // >> 16, i.e. NEIGHBOR_CONSTANT_4 - 1 = 49151 (0xbfff), which the unsigned
+  // comparison in vminq_u16 handles directly.
+  sum = vminq_u16(sum, sixteen);
+  sum = vsubq_u16(sixteen, sum);
+  return vmulq_u16(sum, weight_u16);
+}
+
+// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.'
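+// 'count' accumulates the filter weights applied to each pixel and
+// 'accumulator' the corresponding weighted predictions; the caller later
+// divides one by the other to produce the filtered pixel.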
+static void accumulate_and_store_8(const uint16x8_t sum_u16, + const uint8_t *pred, uint16_t *count, + uint32_t *accumulator) { + uint16x8_t pred_u16 = vmovl_u8(vld1_u8(pred)); + uint16x8_t count_u16 = vld1q_u16(count); + uint32x4_t accum_0_u32, accum_1_u32; + + count_u16 = vqaddq_u16(count_u16, sum_u16); + vst1q_u16(count, count_u16); + + accum_0_u32 = vld1q_u32(accumulator); + accum_1_u32 = vld1q_u32(accumulator + 4); + + accum_0_u32 = + vmlal_u16(accum_0_u32, vget_low_u16(sum_u16), vget_low_u16(pred_u16)); + accum_1_u32 = + vmlal_u16(accum_1_u32, vget_high_u16(sum_u16), vget_high_u16(pred_u16)); + + vst1q_u32(accumulator, accum_0_u32); + vst1q_u32(accumulator + 4, accum_1_u32); +} + +static INLINE void accumulate_and_store_16(const uint16x8_t sum_0_u16, + const uint16x8_t sum_1_u16, + const uint8_t *pred, uint16_t *count, + uint32_t *accumulator) { + uint8x16_t pred_u8 = vld1q_u8(pred); + uint16x8_t pred_0_u16 = vmovl_u8(vget_low_u8(pred_u8)); + uint16x8_t pred_1_u16 = vmovl_u8(vget_high_u8(pred_u8)); + uint16x8_t count_0_u16 = vld1q_u16(count); + uint16x8_t count_1_u16 = vld1q_u16(count + 8); + uint32x4_t accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32; + + count_0_u16 = vqaddq_u16(count_0_u16, sum_0_u16); + vst1q_u16(count, count_0_u16); + count_1_u16 = vqaddq_u16(count_1_u16, sum_1_u16); + vst1q_u16(count + 8, count_1_u16); + + accum_0_u32 = vld1q_u32(accumulator); + accum_1_u32 = vld1q_u32(accumulator + 4); + accum_2_u32 = vld1q_u32(accumulator + 8); + accum_3_u32 = vld1q_u32(accumulator + 12); + + accum_0_u32 = + vmlal_u16(accum_0_u32, vget_low_u16(sum_0_u16), vget_low_u16(pred_0_u16)); + accum_1_u32 = vmlal_u16(accum_1_u32, vget_high_u16(sum_0_u16), + vget_high_u16(pred_0_u16)); + accum_2_u32 = + vmlal_u16(accum_2_u32, vget_low_u16(sum_1_u16), vget_low_u16(pred_1_u16)); + accum_3_u32 = vmlal_u16(accum_3_u32, vget_high_u16(sum_1_u16), + vget_high_u16(pred_1_u16)); + + vst1q_u32(accumulator, accum_0_u32); + vst1q_u32(accumulator + 4, accum_1_u32); + vst1q_u32(accumulator + 8, accum_2_u32); + vst1q_u32(accumulator + 12, accum_3_u32); +} + +// Read in 8 pixels from y_dist. For each index i, compute y_dist[i-1] + +// y_dist[i] + y_dist[i+1] and store in sum as 16-bit unsigned int. +static INLINE void get_sum_8(const uint16_t *y_dist, uint16x8_t *sum) { + uint16x8_t dist_reg, dist_left, dist_right; + + dist_reg = vld1q_u16(y_dist); + dist_left = vld1q_u16(y_dist - 1); + dist_right = vld1q_u16(y_dist + 1); + + *sum = vqaddq_u16(dist_reg, dist_left); + *sum = vqaddq_u16(*sum, dist_right); +} + +// Read in 16 pixels from y_dist. For each index i, compute y_dist[i-1] + +// y_dist[i] + y_dist[i+1]. Store the result for first 8 pixels in sum_first and +// the rest in sum_second. +static INLINE void get_sum_16(const uint16_t *y_dist, uint16x8_t *sum_first, + uint16x8_t *sum_second) { + get_sum_8(y_dist, sum_first); + get_sum_8(y_dist + 8, sum_second); +} + +// Read in a row of chroma values corresponds to a row of 16 luma values. +static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist, + const uint16_t *v_dist, + uint16x8_t *u_first, + uint16x8_t *u_second, + uint16x8_t *v_first, + uint16x8_t *v_second) { + if (!ss_x) { + // If there is no chroma subsampling in the horizontal direction, then we + // need to load 16 entries from chroma. 
+ read_dist_16(u_dist, u_first, u_second); + read_dist_16(v_dist, v_first, v_second); + } else { // ss_x == 1 + // Otherwise, we only need to load 8 entries + uint16x8_t u_reg, v_reg; + uint16x8x2_t pair; + + read_dist_8(u_dist, &u_reg); + + pair = vzipq_u16(u_reg, u_reg); + *u_first = pair.val[0]; + *u_second = pair.val[1]; + + read_dist_8(v_dist, &v_reg); + + pair = vzipq_u16(v_reg, v_reg); + *v_first = pair.val[0]; + *v_second = pair.val[1]; + } +} + +// Add a row of luma distortion to 8 corresponding chroma mods. +static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist, + int ss_x, int ss_y, + uint16x8_t *u_mod, + uint16x8_t *v_mod) { + uint16x8_t y_reg; + if (!ss_x) { + read_dist_8(y_dist, &y_reg); + if (ss_y == 1) { + uint16x8_t y_tmp; + read_dist_8(y_dist + DIST_STRIDE, &y_tmp); + + y_reg = vqaddq_u16(y_reg, y_tmp); + } + } else { + uint16x8_t y_first, y_second; + uint32x4_t y_first32, y_second32; + + read_dist_16(y_dist, &y_first, &y_second); + if (ss_y == 1) { + uint16x8_t y_tmp_0, y_tmp_1; + read_dist_16(y_dist + DIST_STRIDE, &y_tmp_0, &y_tmp_1); + + y_first = vqaddq_u16(y_first, y_tmp_0); + y_second = vqaddq_u16(y_second, y_tmp_1); + } + + y_first32 = vpaddlq_u16(y_first); + y_second32 = vpaddlq_u16(y_second); + + y_reg = vcombine_u16(vqmovn_u32(y_first32), vqmovn_u32(y_second32)); + } + + *u_mod = vqaddq_u16(*u_mod, y_reg); + *v_mod = vqaddq_u16(*v_mod, y_reg); +} + +// Apply temporal filter to the luma components. This performs temporal +// filtering on a luma block of 16 X block_height. Use blk_fw as an array of +// size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL, +// else use top_weight for top half, and bottom weight for bottom half. +static void apply_temporal_filter_luma_16( + const uint8_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist, + const int16_t *const *neighbors_first, + const int16_t *const *neighbors_second, int top_weight, int bottom_weight, + const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + uint16x8_t weight_first, weight_second; + + uint16x8_t mul_first, mul_second; + + uint16x8_t sum_row_1_first, sum_row_1_second; + uint16x8_t sum_row_2_first, sum_row_2_second; + uint16x8_t sum_row_3_first, sum_row_3_second; + + uint16x8_t u_first, u_second; + uint16x8_t v_first, v_second; + + uint16x8_t sum_row_first; + uint16x8_t sum_row_second; + + // Loop variables + unsigned int h; + + assert(strength >= 0); + assert(strength <= 6); + + assert(block_width == 16); + (void)block_width; + + // Initialize the weights + if (blk_fw) { + weight_first = vdupq_n_u16(blk_fw[0]); + weight_second = vdupq_n_u16(blk_fw[1]); + } else { + weight_first = vdupq_n_u16(top_weight); + weight_second = weight_first; + } + + // First row + mul_first = vld1q_u16((const uint16_t *)neighbors_first[0]); + mul_second = vld1q_u16((const uint16_t *)neighbors_second[0]); + + // Add luma values + get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second); + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = vqaddq_u16(sum_row_2_first, sum_row_3_first); + sum_row_second = vqaddq_u16(sum_row_2_second, sum_row_3_second); + + // Add chroma values + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + + sum_row_first = vqaddq_u16(sum_row_first, u_first); + sum_row_second = 
vqaddq_u16(sum_row_second, u_second); + + sum_row_first = vqaddq_u16(sum_row_first, v_first); + sum_row_second = vqaddq_u16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + + // Then all the rows except the last one + mul_first = vld1q_u16((const uint16_t *)neighbors_first[1]); + mul_second = vld1q_u16((const uint16_t *)neighbors_second[1]); + + for (h = 1; h < block_height - 1; ++h) { + // Move the weight to bottom half + if (!use_whole_blk && h == block_height / 2) { + if (blk_fw) { + weight_first = vdupq_n_u16(blk_fw[2]); + weight_second = vdupq_n_u16(blk_fw[3]); + } else { + weight_first = vdupq_n_u16(bottom_weight); + weight_second = weight_first; + } + } + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = vqaddq_u16(sum_row_1_first, sum_row_2_first); + sum_row_second = vqaddq_u16(sum_row_1_second, sum_row_2_second); + + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = vqaddq_u16(sum_row_first, sum_row_3_first); + sum_row_second = vqaddq_u16(sum_row_second, sum_row_3_second); + + // Add chroma values to the modifier + if (ss_y == 0 || h % 2 == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + } + + sum_row_first = vqaddq_u16(sum_row_first, u_first); + sum_row_second = vqaddq_u16(sum_row_second, u_second); + sum_row_first = vqaddq_u16(sum_row_first, v_first); + sum_row_second = vqaddq_u16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + } + + // The last row + mul_first = vld1q_u16((const uint16_t *)neighbors_first[0]); + mul_second = vld1q_u16((const uint16_t *)neighbors_second[0]); + + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = vqaddq_u16(sum_row_1_first, sum_row_2_first); + sum_row_second = vqaddq_u16(sum_row_1_second, sum_row_2_second); + + // Add chroma values to the modifier + if (ss_y == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + } + + sum_row_first = vqaddq_u16(sum_row_first, u_first); + sum_row_second = vqaddq_u16(sum_row_second, u_second); + sum_row_first = 
vqaddq_u16(sum_row_first, v_first); + sum_row_second = vqaddq_u16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); +} + +// Perform temporal filter for the luma component. +static void apply_temporal_filter_luma( + const uint8_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) { + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x; + const unsigned int mid_width = block_width >> 1, + last_width = block_width - blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const int16_t *const *neighbors_first; + const int16_t *const *neighbors_second; + + if (block_width == 16) { + // Special Case: The block width is 16 and we are operating on a row of 16 + // chroma pixels. In this case, we can't use the usual left-middle-right + // pattern. We also don't support splitting now. + neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; + if (use_whole_blk) { + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + } else { + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, 0, 0, blk_fw); + } + + return; + } + + // Left + neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS; + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + neighbors_first = LUMA_MIDDLE_COLUMN_NEIGHBORS; + for (; blk_col < mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; blk_col < last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + } + + // Right + 
neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); +} + +// Apply temporal filter to the chroma components. This performs temporal +// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use +// blk_fw as an array of size 4 for the weights for each of the 4 subblocks, +// else use top_weight for top half, and bottom weight for bottom half. +static void apply_temporal_filter_chroma_8( + const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, + unsigned int uv_block_height, int ss_x, int ss_y, int strength, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist, + const int16_t *const *neighbors, int top_weight, int bottom_weight, + const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + + uint16x8_t weight; + + uint16x8_t mul; + + uint16x8_t u_sum_row_1, u_sum_row_2, u_sum_row_3; + uint16x8_t v_sum_row_1, v_sum_row_2, v_sum_row_3; + + uint16x8_t u_sum_row, v_sum_row; + + // Loop variable + unsigned int h; + + // Initialize weight + if (blk_fw) { + weight = vcombine_u16(vdup_n_u16(blk_fw[0]), vdup_n_u16(blk_fw[1])); + } else { + weight = vdupq_n_u16(top_weight); + } + + // First row + mul = vld1q_u16((const uint16_t *)neighbors[0]); + + // Add chroma values + get_sum_8(u_dist, &u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + + u_sum_row = vqaddq_u16(u_sum_row_2, u_sum_row_3); + + get_sum_8(v_dist, &v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + + v_sum_row = vqaddq_u16(v_sum_row_2, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + + // Then all the rows except the last one + mul = vld1q_u16((const uint16_t *)neighbors[1]); + + for (h = 1; h < uv_block_height - 1; ++h) { + // Move the weight pointer to the bottom half of the blocks + if (h == uv_block_height / 2) { + if (blk_fw) { + weight = vcombine_u16(vdup_n_u16(blk_fw[2]), vdup_n_u16(blk_fw[3])); + } else { + weight = vdupq_n_u16(bottom_weight); + } + } + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = vqaddq_u16(u_sum_row_1, u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + u_sum_row = vqaddq_u16(u_sum_row, u_sum_row_3); + + v_sum_row = vqaddq_u16(v_sum_row_1, v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + v_sum_row = vqaddq_u16(v_sum_row, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = 
average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + } + + // The last row + mul = vld1q_u16((const uint16_t *)neighbors[0]); + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = vqaddq_u16(u_sum_row_1, u_sum_row_2); + v_sum_row = vqaddq_u16(v_sum_row_1, v_sum_row_2); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); +} + +// Perform temporal filter for the chroma components. +static void apply_temporal_filter_chroma( + const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, + unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, + int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) { + const unsigned int uv_width = block_width >> ss_x, + uv_height = block_height >> ss_y; + + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x; + const unsigned int uv_mid_width = uv_width >> 1, + uv_last_width = uv_width - uv_blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const int16_t *const *neighbors; + + if (uv_width == 8) { + // Special Case: We are subsampling in x direction on a 16x16 block. Since + // we are operating on a row of 8 chroma pixels, we can't use the usual + // left-middle-right pattern. 
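+    // A single neighbour-constant table therefore covers the whole 8-wide
+    // row, with the left- and right-edge multipliers folded into one vector.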
+ assert(ss_x); + + if (ss_y) { + neighbors = CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS; + } + + if (use_whole_blk) { + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, + ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + } else { + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, + ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, 0, 0, blk_fw); + } + + return; + } + + // Left + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS; + } + + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + for (; uv_blk_col < uv_mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; uv_blk_col < uv_last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + } + + // Right + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS; + } + + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); +} + +void vp9_apply_temporal_filter_neon( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, 
+ int ss_x, int ss_y, int strength, const int *const blk_fw, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) { + const unsigned int chroma_height = block_height >> ss_y, + chroma_width = block_width >> ss_x; + + DECLARE_ALIGNED(16, uint16_t, y_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, u_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, v_dist[BH * DIST_STRIDE]) = { 0 }; + const int *blk_fw_ptr = blk_fw; + + uint16_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1, + *v_dist_ptr = v_dist + 1; + const uint8_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src; + const uint8_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre; + + // Loop variables + unsigned int row, blk_col; + + assert(block_width <= BW && "block width too large"); + assert(block_height <= BH && "block height too large"); + assert(block_width % 16 == 0 && "block width must be multiple of 16"); + assert(block_height % 2 == 0 && "block height must be even"); + assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) && + "invalid chroma subsampling"); + assert(strength >= 0 && strength <= 6 && "invalid temporal filter strength"); + assert(blk_fw[0] >= 0 && "filter weight must be positive"); + assert( + (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) && + "subblock filter weight must be positive"); + assert(blk_fw[0] <= 2 && "subblock filter weight must be less than 2"); + assert( + (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) && + "subblock filter weight must be less than 2"); + + // Precompute the difference squared + for (row = 0; row < block_height; row++) { + for (blk_col = 0; blk_col < block_width; blk_col += 16) { + store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col, + y_dist_ptr + blk_col); + } + y_src_ptr += y_src_stride; + y_pre_ptr += y_pre_stride; + y_dist_ptr += DIST_STRIDE; + } + + for (row = 0; row < chroma_height; row++) { + for (blk_col = 0; blk_col < chroma_width; blk_col += 8) { + store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col, + u_dist_ptr + blk_col); + store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col, + v_dist_ptr + blk_col); + } + + u_src_ptr += uv_src_stride; + u_pre_ptr += uv_pre_stride; + u_dist_ptr += DIST_STRIDE; + v_src_ptr += uv_src_stride; + v_pre_ptr += uv_pre_stride; + v_dist_ptr += DIST_STRIDE; + } + + y_dist_ptr = y_dist + 1; + u_dist_ptr = u_dist + 1; + v_dist_ptr = v_dist + 1; + + apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, block_height, + ss_x, ss_y, strength, blk_fw_ptr, use_whole_blk, + y_accum, y_count, y_dist_ptr, u_dist_ptr, + v_dist_ptr); + + apply_temporal_filter_chroma(u_pre, v_pre, uv_pre_stride, block_width, + block_height, ss_x, ss_y, strength, blk_fw_ptr, + use_whole_blk, u_accum, u_count, v_accum, + v_count, y_dist_ptr, u_dist_ptr, v_dist_ptr); +} diff --git a/vp9/encoder/x86/temporal_filter_constants.h b/vp9/encoder/vp9_temporal_filter_constants.h similarity index 98% rename from vp9/encoder/x86/temporal_filter_constants.h rename to vp9/encoder/vp9_temporal_filter_constants.h index 7dcedda192..8776dfc068 100644 --- a/vp9/encoder/x86/temporal_filter_constants.h +++ b/vp9/encoder/vp9_temporal_filter_constants.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. 
* * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_VP9_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_ -#define VPX_VP9_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_ +#ifndef VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_ +#define VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_ #include "./vpx_config.h" // Division using multiplication and shifting. The C implementation does: @@ -407,4 +407,4 @@ static const uint32_t #define DIST_STRIDE ((BW) + 2) -#endif // VPX_VP9_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_ +#endif // VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_ diff --git a/vp9/encoder/x86/highbd_temporal_filter_sse4.c b/vp9/encoder/x86/highbd_temporal_filter_sse4.c index bcbf6d77e6..97f182c660 100644 --- a/vp9/encoder/x86/highbd_temporal_filter_sse4.c +++ b/vp9/encoder/x86/highbd_temporal_filter_sse4.c @@ -16,7 +16,7 @@ #include "vpx/vpx_integer.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_temporal_filter.h" -#include "vp9/encoder/x86/temporal_filter_constants.h" +#include "vp9/encoder/vp9_temporal_filter_constants.h" // Compute (a-b)**2 for 8 pixels with size 16-bit static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b, diff --git a/vp9/encoder/x86/temporal_filter_sse4.c b/vp9/encoder/x86/temporal_filter_sse4.c index 87e68fb438..7571bfccac 100644 --- a/vp9/encoder/x86/temporal_filter_sse4.c +++ b/vp9/encoder/x86/temporal_filter_sse4.c @@ -16,7 +16,7 @@ #include "vpx/vpx_integer.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_temporal_filter.h" -#include "vp9/encoder/x86/temporal_filter_constants.h" +#include "vp9/encoder/vp9_temporal_filter_constants.h" // Read in 8 pixels from a and b as 8-bit unsigned integers, compute the // difference squared, and store as unsigned 16-bit integer to dst. 
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index cccaea712e..882c12d2e6 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -110,7 +110,9 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c -VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_constants.h +VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/vp9_temporal_filter_constants.h +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_temporal_filter_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/vp9_temporal_filter_constants.h VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.c @@ -120,6 +122,7 @@ VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_diamond_search_sad_neon.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_temporal_filter_sse4.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_highbd_temporal_filter_neon.c endif VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm @@ -156,8 +159,10 @@ VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_firstpass.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_mbgraph.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_temporal_filter.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_sse4.c -VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_constants.h +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_temporal_filter_constants.h VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/highbd_temporal_filter_sse4.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/arm/neon/vp9_temporal_filter_neon.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/arm/neon/vp9_highbd_temporal_filter_neon.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_alt_ref_aq.h VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_alt_ref_aq.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_variance.c From e553e3acff6d2e894ce0400f15247aa9cca58719 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Tue, 7 Mar 2023 15:13:17 +0000 Subject: [PATCH 596/926] Add Neon implementation of vp9_highbd_block_error_c Add Neon implementation of vp9_highbd_block_error_c as well as the corresponding tests. 
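For reference, the scalar behaviour the Neon kernel has to reproduce is
roughly the following sketch (the signature is the one declared in
vp9_rtcd_defs.pl; the body is an illustration of what
vp9_highbd_block_error_c computes, not the verbatim source):

  int64_t highbd_block_error_ref(const tran_low_t *coeff,
                                 const tran_low_t *dqcoeff,
                                 intptr_t block_size, int64_t *ssz, int bd) {
    int64_t error = 0, sqcoeff = 0;
    // Accumulate at full precision, then scale back down so that 10/12-bit
    // input yields distortion on the same scale as 8-bit input.
    const int shift = 2 * (bd - 8);
    const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
    intptr_t i;
    for (i = 0; i < block_size; i++) {
      const int64_t diff = coeff[i] - dqcoeff[i];
      error += diff * diff;
      sqcoeff += (int64_t)coeff[i] * coeff[i];
    }
    *ssz = (sqcoeff + rounding) >> shift;
    return (error + rounding) >> shift;
  }

The vector version below computes the same two sums with widening
multiply-accumulates and performs a single horizontal reduction at the end.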
Change-Id: Ibe0eb077f959ced0dcd7d0d8d9d529d3b5bc1874
---
 test/vp9_block_error_test.cc                 | 22 ++++++---
 vp9/common/vp9_rtcd_defs.pl                  |  2 +-
 vp9/encoder/arm/neon/vp9_highbd_error_neon.c | 49 ++++++++++++++++++++
 vp9/vp9cx.mk                                 |  3 ++
 4 files changed, 69 insertions(+), 7 deletions(-)
 create mode 100644 vp9/encoder/arm/neon/vp9_highbd_error_neon.c

diff --git a/test/vp9_block_error_test.cc b/test/vp9_block_error_test.cc
index bde84cd619..9e9595ebe9 100644
--- a/test/vp9_block_error_test.cc
+++ b/test/vp9_block_error_test.cc
@@ -199,10 +199,20 @@ INSTANTIATE_TEST_SUITE_P(
 #endif  // HAVE_AVX2

 #if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(
-    NEON, BlockErrorTest,
-    ::testing::Values(make_tuple(&BlockError8BitWrapper<vp9_block_error_neon>,
-                                 &BlockError8BitWrapper<vp9_block_error_c>,
-                                 VPX_BITS_8)));
-#endif
+const BlockErrorParam neon_block_error_tests[] = {
+#if CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(&vp9_highbd_block_error_neon, &vp9_highbd_block_error_c,
+             VPX_BITS_10),
+  make_tuple(&vp9_highbd_block_error_neon, &vp9_highbd_block_error_c,
+             VPX_BITS_12),
+  make_tuple(&vp9_highbd_block_error_neon, &vp9_highbd_block_error_c,
+             VPX_BITS_8),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(&BlockError8BitWrapper<vp9_block_error_neon>,
+             &BlockError8BitWrapper<vp9_block_error_c>, VPX_BITS_8)
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, BlockErrorTest,
+                         ::testing::ValuesIn(neon_block_error_tests));
+#endif  // HAVE_NEON

 }  // namespace
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 2f9870dd48..556b361418 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -139,7 +139,7 @@ ()
   specialize qw/vp9_block_error neon avx2 sse2/;

   add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
-  specialize qw/vp9_highbd_block_error sse2/;
+  specialize qw/vp9_highbd_block_error neon sse2/;
 } else {
   specialize qw/vp9_block_error neon avx2 msa sse2/;
 }
diff --git a/vp9/encoder/arm/neon/vp9_highbd_error_neon.c b/vp9/encoder/arm/neon/vp9_highbd_error_neon.c
new file mode 100644
index 0000000000..d9b183472d
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_highbd_error_neon.c
@@ -0,0 +1,49 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+int64_t vp9_highbd_block_error_neon(const tran_low_t *coeff,
+                                    const tran_low_t *dqcoeff,
+                                    intptr_t block_size, int64_t *ssz, int bd) {
+  uint64x2_t err_u64 = vdupq_n_u64(0);
+  int64x2_t ssz_s64 = vdupq_n_s64(0);
+
+  const int shift = 2 * (bd - 8);
+  const int rounding = shift > 0 ?
1 << (shift - 1) : 0; + + assert(block_size >= 16); + assert((block_size % 16) == 0); + + do { + const int32x4_t c = load_tran_low_to_s32q(coeff); + const int32x4_t d = load_tran_low_to_s32q(dqcoeff); + + const uint32x4_t diff = vreinterpretq_u32_s32(vabdq_s32(c, d)); + + err_u64 = vmlal_u32(err_u64, vget_low_u32(diff), vget_low_u32(diff)); + err_u64 = vmlal_u32(err_u64, vget_high_u32(diff), vget_high_u32(diff)); + + ssz_s64 = vmlal_s32(ssz_s64, vget_low_s32(c), vget_low_s32(c)); + ssz_s64 = vmlal_s32(ssz_s64, vget_high_s32(c), vget_high_s32(c)); + + coeff += 4; + dqcoeff += 4; + block_size -= 4; + } while (block_size != 0); + + *ssz = (horizontal_add_int64x2(ssz_s64) + rounding) >> shift; + return ((int64_t)horizontal_add_uint64x2(err_u64) + rounding) >> shift; +} diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index cccaea712e..f21036357e 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -139,6 +139,9 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_avx2.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_frame_scale_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c +ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_highbd_error_neon.c +endif VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c From f7dbd848e43c0161b8680f799d908d7adb4dd188 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Tue, 7 Mar 2023 17:04:31 +0000 Subject: [PATCH 597/926] Optimize vpx_satd_neon Optimize Neon implementation of vpx_satd by using ABD and UADALP instead of ABAL and ABAL2, splitting the accumulator and using a dedicated helper function to perform the final reduction. Change-Id: Idcfa49e001b68b1dcd87c13fd9acc317a208cd2a --- vpx_dsp/arm/avg_neon.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c index 8e57bdaa50..56d97e22ad 100644 --- a/vpx_dsp/arm/avg_neon.c +++ b/vpx_dsp/arm/avg_neon.c @@ -46,29 +46,25 @@ uint32_t vpx_avg_8x8_neon(const uint8_t *a, int a_stride) { // coeff: 16 bits, dynamic range [-32640, 32640]. // length: value range {16, 64, 256, 1024}. +// satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] int vpx_satd_neon(const tran_low_t *coeff, int length) { - const int16x4_t zero = vdup_n_s16(0); - int32x4_t accum = vdupq_n_s32(0); + int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; do { - const int16x8_t src0 = load_tran_low_to_s16q(coeff); - const int16x8_t src8 = load_tran_low_to_s16q(coeff + 8); - accum = vabal_s16(accum, vget_low_s16(src0), zero); - accum = vabal_s16(accum, vget_high_s16(src0), zero); - accum = vabal_s16(accum, vget_low_s16(src8), zero); - accum = vabal_s16(accum, vget_high_s16(src8), zero); + int16x8_t abs0, abs1; + const int16x8_t s0 = load_tran_low_to_s16q(coeff); + const int16x8_t s1 = load_tran_low_to_s16q(coeff + 8); + + abs0 = vabsq_s16(s0); + sum_s32[0] = vpadalq_s16(sum_s32[0], abs0); + abs1 = vabsq_s16(s1); + sum_s32[1] = vpadalq_s16(sum_s32[1], abs1); + length -= 16; coeff += 16; } while (length != 0); - { - // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] - const int64x2_t s0 = vpaddlq_s32(accum); // cascading summation of 'accum'. 
-    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
-                                  vreinterpret_s32_s64(vget_high_s64(s0)));
-    const int satd = vget_lane_s32(s1, 0);
-    return satd;
-  }
+  return horizontal_add_int32x4(vaddq_s32(sum_s32[0], sum_s32[1]));
 }

 void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
From be84aa14dc3d7b1eae3bab9bf060eabadd84196d Mon Sep 17 00:00:00 2001
From: Salome Thirot
Date: Wed, 8 Mar 2023 12:01:04 +0000
Subject: [PATCH 598/926] Add Neon implementation of vpx_highbd_satd_c

Add Neon implementation of vpx_highbd_satd_c as well as the corresponding
tests.

Change-Id: I3d50e6abdf168fb13743e7d8da9364f072308b7f
---
 test/avg_test.cc              |  9 ++++++++
 vpx_dsp/arm/highbd_avg_neon.c | 40 +++++++++++++++++++++++++++++
 vpx_dsp/vpx_dsp.mk            |  1 +
 vpx_dsp/vpx_dsp_rtcd_defs.pl  |  2 +-
 4 files changed, 51 insertions(+), 1 deletion(-)
 create mode 100644 vpx_dsp/arm/highbd_avg_neon.c

diff --git a/test/avg_test.cc b/test/avg_test.cc
index bcf8d0d993..dd84403324 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -694,6 +694,15 @@ INSTANTIATE_TEST_SUITE_P(NEON, SatdLowbdTest,
                                            make_tuple(256, &vpx_satd_neon),
                                            make_tuple(1024, &vpx_satd_neon)));

+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+    NEON, SatdHighbdTest,
+    ::testing::Values(make_tuple(16, &vpx_highbd_satd_neon),
+                      make_tuple(64, &vpx_highbd_satd_neon),
+                      make_tuple(256, &vpx_highbd_satd_neon),
+                      make_tuple(1024, &vpx_highbd_satd_neon)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 INSTANTIATE_TEST_SUITE_P(
     NEON, BlockErrorTestFP,
    ::testing::Values(make_tuple(16, &vp9_block_error_fp_neon),
diff --git a/vpx_dsp/arm/highbd_avg_neon.c b/vpx_dsp/arm/highbd_avg_neon.c
new file mode 100644
index 0000000000..3ba58b8005
--- /dev/null
+++ b/vpx_dsp/arm/highbd_avg_neon.c
@@ -0,0 +1,40 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+// coeff: 32 bits, dynamic range [-2147483648, 2147483647].
+// length: value range {16, 64, 256, 1024}.
+// satd: 42 bits, dynamic range [-2147483648 * 1024, 2147483647 * 1024] +int vpx_highbd_satd_neon(const tran_low_t *coeff, int length) { + int64x2_t sum_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + do { + int32x4_t abs0, abs1; + const int32x4_t s0 = load_tran_low_to_s32q(coeff); + const int32x4_t s1 = load_tran_low_to_s32q(coeff + 4); + + abs0 = vabsq_s32(s0); + sum_s64[0] = vpadalq_s32(sum_s64[0], abs0); + abs1 = vabsq_s32(s1); + sum_s64[1] = vpadalq_s32(sum_s64[1], abs1); + + length -= 8; + coeff += 8; + } while (length != 0); + + return (int)horizontal_add_int64x2(vaddq_s64(sum_s64[0], sum_s64[1])); +} diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index ab8e5bd817..207cda6310 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -344,6 +344,7 @@ DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_NEON) += arm/highbd_hadamard_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_neon.c endif DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c DSP_SRCS-$(HAVE_LSX) += loongarch/avg_lsx.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 3baf16cc8b..2a01ec1b54 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -821,7 +821,7 @@ () specialize qw/vpx_satd avx2 sse2 neon/; add_proto qw/int vpx_highbd_satd/, "const tran_low_t *coeff, int length"; - specialize qw/vpx_highbd_satd avx2/; + specialize qw/vpx_highbd_satd avx2 neon/; } else { add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx lsx/, "$ssse3_x86_64"; From 362c69cfe565e68f240eb37ae05695c50b435656 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Wed, 8 Mar 2023 14:08:23 +0000 Subject: [PATCH 599/926] Optimize vpx_minmax_8x8_neon for aarch64 Optimize vpx_minmax_8x8_neon on AArch64 targets by using the UMAXV and UMINV instructions - computing the maximum and minimum elements in a Neon vector. Change-Id: I54c3a3a087d266f6774e6113e5947253df288a64 --- vpx_dsp/arm/avg_neon.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c index 56d97e22ad..d48115dd01 100644 --- a/vpx_dsp/arm/avg_neon.c +++ b/vpx_dsp/arm/avg_neon.c @@ -210,11 +210,16 @@ void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b, const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max); const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min); - // Split to D and start doing pairwise. +#if defined(__aarch64__) + *min = *max = 0; // Clear high bits + *((uint8_t *)max) = vmaxvq_u8(ab07_max); + *((uint8_t *)min) = vminvq_u8(ab07_min); +#else + // Split into 64-bit vectors and execute pairwise min/max. uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max)); uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min)); - // Enough runs of vpmax/min propogate the max/min values to every position. + // Enough runs of vpmax/min propagate the max/min values to every position. ab_max = vpmax_u8(ab_max, ab_max); ab_min = vpmin_u8(ab_min, ab_min); @@ -228,4 +233,5 @@ void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b, // Store directly to avoid costly neon->gpr transfer. 
  vst1_lane_u8((uint8_t *)max, ab_max, 0);
   vst1_lane_u8((uint8_t *)min, ab_min, 0);
+#endif
 }

From 5c2cd048a05d8d06777cb9af5c0f4a261456023b Mon Sep 17 00:00:00 2001
From: Anupam Pandey
Date: Tue, 14 Mar 2023 16:50:31 +0530
Subject: [PATCH 600/926] Add AVX2 for convolve horizontal filter for block
 width 4

Introduced AVX2 intrinsic to compute convolve horizontal
for w = 4 case. This is a bit-exact change.

      Instruction Count
cpu   Resolution   Reduction(%)
 0     LOWRES2        0.763
 0     MIDRES2        0.466
 0     HDRES2         0.317
 0     Average        0.516

Change-Id: I124f3f8e994c24461812f4963b113819466db44f
---
 vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 152 +++++++++++++++++++++-
 1 file changed, 149 insertions(+), 3 deletions(-)

diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index 141614e7ad..37ef59f36c 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -39,6 +39,12 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
   6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
 };

+DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[64]) = {
+  0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2,
+  3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9,
+  7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+};
+
 #define CALC_CONVOLVE8_HORZ_ROW                                         \
   srcReg = xx_loadu2_mi128(src_ptr - 3 + src_pitch, src_ptr - 3);       \
   s1[0] = _mm256_shuffle_epi8(srcReg, filt[0]);                         \
@@ -1036,18 +1042,158 @@ static void vpx_filter_block1d8_v8_avx2(
   } while (y > 1);
 }

+static void vpx_filter_block1d4_h8_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i filtersReg;
+  __m256i addFilterReg64_256bit;
+  unsigned int y = output_height;
+
+  assert(output_height > 1);
+
+  addFilterReg64_256bit = _mm256_set1_epi16(32);
+
+  // f7 f6 f5 f4 f3 f2 f1 f0 (16 bit)
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+
+  // converting the 16 bit (short) to 8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+ // f7 f6 f5 f4 f3 f2 f1 f0 || f7 f6 f5 f4 f3 f2 f1 f0 (8 bit each) + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + { + ptrdiff_t src_stride; + __m256i filt1Reg, filt2Reg, firstFilters, secondFilters; + // have the same data in both lanes of a 256 bit register + // f7 f6 f5 f4 f3 f2 f1 f0 f7 f6 f5 f4 f3 f2 f1 f0 | f7 f6 f5 f4 f3 f2 f1 f0 + // f7 f6 f5 f4 f3 f2 f1 f0 (8bit each) + const __m256i filtersReg32 = _mm256_broadcastsi128_si256(filtersReg); + + // duplicate only the first 32 bits + // f3 f2 f1 f0|f3 f2 f1 f0|f3 f2 f1 f0|f3 f2 f1 f0 | f3 f2 f1 f0|f3 f2 f1 + // f0|f3 f2 f1 f0|f3 f2 f1 f0 + firstFilters = _mm256_shuffle_epi32(filtersReg32, 0); + // duplicate only the second 32 bits + // f7 f6 f5 f4|f7 f6 f5 f4|f7 f6 f5 f4|f7 f6 f5 f4 | f7 f6 f5 f4|f7 f6 f5 + // f4|f7 f6 f5 f4|f7 f6 f5 f4 + secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55); + + // s6 s5 s4 s3 s5 s4 s3 s2 s4 s3 s2 s1 s3 s2 s1 s0 | s6 s5 s4 s3 s5 s4 s3 + // s2 s4 s3 s2 s1 s3 s2 s1 s0 + filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2); + + // s10 s9 s8 s7 s9 s8 s7 s6 s8 s7 s6 s5 s7 s6 s5 s4 | s10 s9 s8 s7 s9 s8 s7 + // s6 s8 s7 s6 s5 s7 s6 s5 s4 + filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + + do { + __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcReg32b1; + // load the 2 strides of source + // r115 r114 ...... r15 r14 r13 r12 r11 r10 | r015 r014 r013 ...... r07 + // r06 r05 r04 r03 r02 r01 r00 + srcReg32b1 = xx_loadu2_mi128(src_ptr - 3 + src_pitch, src_ptr - 3); + + // filter the source buffer + // r16 r15 r14 r13 r15 r14 r13 r12 r14 r13 r12 r11 r13 r12 r11 r10 | r06 + // r05 r04 r03 r05 r04 r03 r02 r04 r03 r02 r01 r03 r02 r01 r00 + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + + // multiply 4 adjacent elements with the filter and add the result + // .....|f3*r14+f2*r13|f1*r13+f0*r12|f3*r13+f2*r12|f1*r11+f0*r10||......... 
+ // |f1*r03+f0*r02|f3*r04+f2*r03|f1*r02+f0*r01|f3*r03+f2*r02|f1*r01+f0*r00 + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + + // filter the source buffer + // r110 r19 r18 r17|r19 r18 r17 r16|r18 r17 r16 r15|r17 r16 r15 r14||r010 + // r09 r08 r07|r09 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04 + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + + // multiply 4 adjacent elements with the filter and add the result + // r010 r09 r08 r07|r9 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04||r010 + // r09 r08 r07|r9 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04 + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + + srcRegFilt32b1_1 = + _mm256_add_epi16(srcRegFilt32b1_1, addFilterReg64_256bit); + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + srcRegFilt32b1_1 = + _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + // 0 0 0 0 R13 R12 R11 R10 || 0 0 0 0 R03 R02 R01 R00 (16bit) + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); + + // 8zeros 0 0 0 0 R13 R12 R11 R10 || 8zeros 0 0 0 0 R03 R02 R01 R00 (8bit) + srcRegFilt32b1_1 = + _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + src_ptr += src_stride; + // save first row 4 values + *((int *)&output_ptr[0]) = + _mm_cvtsi128_si32(_mm256_castsi256_si128(srcRegFilt32b1_1)); + output_ptr += output_pitch; + + // save second row 4 values + *((int *)&output_ptr[0]) = + _mm_cvtsi128_si32(_mm256_extractf128_si256(srcRegFilt32b1_1, 1)); + output_ptr += output_pitch; + + y = y - 2; + } while (y > 1); + + // For remaining height + if (y > 0) { + __m128i srcReg1, srcRegFilt1_1, addFilterReg64; + __m128i srcRegFilt2; + + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + + // filter the source buffer + srcRegFilt1_1 = + _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, + _mm256_castsi256_si128(firstFilters)); + + // filter the source buffer + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters)); + + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128()); + // shift by 6 bit each 16 bit + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 4 bytes + *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1); + } + } +} + #if HAVE_AVX2 && HAVE_SSSE3 filter8_1dfunction vpx_filter_block1d4_v8_ssse3; #if VPX_ARCH_X86_64 filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; -#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3 -#else // VPX_ARCH_X86 +#else // VPX_ARCH_X86 filter8_1dfunction vpx_filter_block1d8_v8_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_ssse3; filter8_1dfunction vpx_filter_block1d4_h8_ssse3; -#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3 #endif 
// VPX_ARCH_X86_64
filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
From d67a0021e798bdda8917a787832caf7038be56d0 Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang
Date: Thu, 16 Mar 2023 13:37:56 -0700
Subject: [PATCH 601/926] Update the sample code for VP9RateControlRTC

Update the sample code to the current VP9RateControlRTC interface.

Change-Id: I30b0712c897f93fd62ebce51ce39afce3cac1fd7
---
 vp9/ratectrl_rtc.h | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h
index 3131c22231..a846f0742d 100644
--- a/vp9/ratectrl_rtc.h
+++ b/vp9/ratectrl_rtc.h
@@ -69,22 +69,21 @@ struct VP9SegmentationData {
 // the encoder. To use this interface, you need to link with libvpxrc.a.
 //
 // #include "vp9/ratectrl_rtc.h"
-// VP9RateControlRTC rc_api;
 // VP9RateControlRtcConfig cfg;
 // VP9FrameParamsQpRTC frame_params;
 //
 // YourFunctionToInitializeConfig(cfg);
-// rc_api.InitRateControl(cfg);
+// std::unique_ptr<VP9RateControlRTC> rc_api = VP9RateControlRTC::Create(cfg);
 // // start encoding
 // while (frame_to_encode) {
 //   if (config_changed)
-//     rc_api.UpdateRateControl(cfg);
+//     rc_api->UpdateRateControl(cfg);
 //   YourFunctionToFillFrameParams(frame_params);
-//   rc_api.ComputeQP(frame_params);
-//   YourFunctionToUseQP(rc_api.GetQP());
-//   YourFunctionToUseLoopfilter(rc_api.GetLoopfilterLevel());
+//   rc_api->ComputeQP(frame_params);
+//   YourFunctionToUseQP(rc_api->GetQP());
+//   YourFunctionToUseLoopfilter(rc_api->GetLoopfilterLevel());
 //   // After encoding
-//   rc_api.PostEncode(encoded_frame_size);
+//   rc_api->PostEncode(encoded_frame_size, frame_params);
 // }
 class VP9RateControlRTC {
  public:
From d6b6f85063772f25624c6429cfa414b8d48b9c0e Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang
Date: Thu, 16 Mar 2023 15:21:49 -0700
Subject: [PATCH 602/926] Remove repeated field from VP9RateControlRtcConfig

Remove the `ts_number_layers` field from VP9RateControlRtcConfig
because the base class VpxRateControlRtcConfig already has that field.

Note: In commit 65a1751e5b98bf7f1d21bcbfdef352af34fb205d,
`ts_number_layers` was moved to the newly created base class
VpxRateControlRtcConfig but was inadvertently left in
VP9RateControlRtcConfig:
https://chromium-review.googlesource.com/c/webm/libvpx/+/3140048

Change-Id: I98d48e152683ec2e5e62efffb56b7f010c5d0695
---
 vp9/ratectrl_rtc.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h
index a846f0742d..a82c776f94 100644
--- a/vp9/ratectrl_rtc.h
+++ b/vp9/ratectrl_rtc.h
@@ -44,8 +44,6 @@ struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig {
   // Number of spatial layers
   int ss_number_layers;
-  // Number of temporal layers
-  int ts_number_layers;
   int max_quantizers[VPX_MAX_LAYERS];
   int min_quantizers[VPX_MAX_LAYERS];
   int scaling_factor_num[VPX_SS_MAX_LAYERS];
From d92681b06ff7b0a51c51cbc012781851afa0701e Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang
Date: Thu, 16 Mar 2023 18:36:13 -0700
Subject: [PATCH 603/926] Set oxcf->ts_rate_decimator[tl] only once

The code that sets oxcf->ts_rate_decimator[tl] does not need to be
inside a loop that iterates over sl. Move the code out of the sl loop
so that oxcf->ts_rate_decimator[tl] is set only once.
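In sketch form (loop bounds abbreviated; the diff below shows the real
code), this is a standard loop-invariant hoist:

  /* Before: the assignment is repeated for every spatial layer. */
  for (sl = 0; sl < number_spatial_layers; ++sl) {
    for (tl = 0; tl < number_temporal_layers; ++tl) {
      /* ... per-(sl, tl) layer context setup ... */
      oxcf->ts_rate_decimator[tl] = rc_cfg.ts_rate_decimator[tl];
    }
  }

  /* After: the assignment runs exactly once per temporal layer. */
  for (tl = 0; tl < number_temporal_layers; ++tl) {
    oxcf->ts_rate_decimator[tl] = rc_cfg.ts_rate_decimator[tl];
  }

The result is unchanged because the value written depends only on tl,
not on sl.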
Change-Id: I22f6c117d200ec38a757b749a8700660d15436c1 --- vp9/ratectrl_rtc.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index 8592173fb6..cc12ea336a 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -136,6 +136,9 @@ void VP9RateControlRTC::UpdateRateControl( cpi_->svc.number_spatial_layers = rc_cfg.ss_number_layers; cpi_->svc.number_temporal_layers = rc_cfg.ts_number_layers; vp9_set_mb_mi(cm, cm->width, cm->height); + for (int tl = 0; tl < cpi_->svc.number_temporal_layers; ++tl) { + oxcf->ts_rate_decimator[tl] = rc_cfg.ts_rate_decimator[tl]; + } for (int sl = 0; sl < cpi_->svc.number_spatial_layers; ++sl) { for (int tl = 0; tl < cpi_->svc.number_temporal_layers; ++tl) { const int layer = @@ -149,7 +152,6 @@ void VP9RateControlRTC::UpdateRateControl( lrc->best_quality = vp9_quantizer_to_qindex(rc_cfg.min_quantizers[layer]); lc->scaling_factor_num = rc_cfg.scaling_factor_num[sl]; lc->scaling_factor_den = rc_cfg.scaling_factor_den[sl]; - oxcf->ts_rate_decimator[tl] = rc_cfg.ts_rate_decimator[tl]; } } vp9_set_rc_buffer_sizes(cpi_); From 430c6c1553df0abfe2dadc480b21dd691a98140f Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 16 Mar 2023 13:30:01 -0700 Subject: [PATCH 604/926] Change UpdateRateControl() to return bool Change the VP9RateControlRtcConfig constructor to initialize ss_number_layers (to 1). Change UpdateRateControl() to return bool so that it can report failure (due to invalid configuration). Also change InitRateControl() to return bool to propagate the return value of UpdateRateControl(). Note: This is a port of the libaom CL https://aomedia-review.googlesource.com/c/aom/+/172042. Change-Id: I90b60353b5f15692dba5d89e7b1a9c81bb2fdd89 --- test/vp8_ratectrl_rtc_test.cc | 8 ++++---- test/vp9_ratectrl_rtc_test.cc | 6 +++--- vp8/vp8_ratectrl_rtc.cc | 15 +++++++++++---- vp8/vp8_ratectrl_rtc.h | 4 ++-- vp9/ratectrl_rtc.cc | 30 ++++++++++++++++++------------ vp9/ratectrl_rtc.h | 5 +++-- 6 files changed, 41 insertions(+), 27 deletions(-) diff --git a/test/vp8_ratectrl_rtc_test.cc b/test/vp8_ratectrl_rtc_test.cc index 56c26a99f4..b76bcae11d 100644 --- a/test/vp8_ratectrl_rtc_test.cc +++ b/test/vp8_ratectrl_rtc_test.cc @@ -160,7 +160,7 @@ class Vp8RcInterfaceTest if (test_video_.width == 640 && target_bitrate_ == 1000) return; SetConfig(); rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); - rc_api_->UpdateRateControl(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, test_video_.height, 30, 1, 0, @@ -177,7 +177,7 @@ class Vp8RcInterfaceTest key_interval_ = 100; SetConfig(); rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); - rc_api_->UpdateRateControl(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, test_video_.height, 30, 1, 0, @@ -193,7 +193,7 @@ class Vp8RcInterfaceTest if (test_video_.width == 640 && target_bitrate_ == 1000) return; SetConfigTemporalLayers(2); rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); - rc_api_->UpdateRateControl(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, test_video_.height, 30, 1, 0, @@ -209,7 +209,7 @@ class Vp8RcInterfaceTest if (test_video_.width == 640 && target_bitrate_ == 1000) return; SetConfigTemporalLayers(3); rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); - 
rc_api_->UpdateRateControl(rc_cfg_);
+    ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));

     ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width,
                                          test_video_.height, 30, 1, 0,
diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc
index c6ab5b034f..5abda1290a 100644
--- a/test/vp9_ratectrl_rtc_test.cc
+++ b/test/vp9_ratectrl_rtc_test.cc
@@ -212,7 +212,7 @@ class RcInterfaceSvcTest
       rc_cfg_.layer_target_bitrate[6] = 0;
       rc_cfg_.layer_target_bitrate[7] = 0;
       rc_cfg_.layer_target_bitrate[8] = 0;
-      rc_api_->UpdateRateControl(rc_cfg_);
+      ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));
     } else if (video->frame() == 200) {
       // Go down to 1 spatial layer.
       // Update the encoder config.
@@ -226,7 +226,7 @@ class RcInterfaceSvcTest
       rc_cfg_.layer_target_bitrate[3] = 0;
       rc_cfg_.layer_target_bitrate[4] = 0;
       rc_cfg_.layer_target_bitrate[5] = 0;
-      rc_api_->UpdateRateControl(rc_cfg_);
+      ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));
     } else if (0 && video->frame() == 280) {
       // TODO(marpan): Re-enable this going back up when issue is fixed.
       // Go back up to 3 spatial layers.
@@ -235,7 +235,7 @@ class RcInterfaceSvcTest
       encoder->Config(&cfg_);
       // Update the RC config.
       SetRCConfigSvc(3, 3);
-      rc_api_->UpdateRateControl(rc_cfg_);
+      ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));
     }
   }
 }
diff --git a/vp8/vp8_ratectrl_rtc.cc b/vp8/vp8_ratectrl_rtc.cc
index c36cfea485..65c58536aa 100644
--- a/vp8/vp8_ratectrl_rtc.cc
+++ b/vp8/vp8_ratectrl_rtc.cc
@@ -62,7 +62,7 @@ std::unique_ptr<VP8RateControlRTC> VP8RateControlRTC::Create(
   if (!rc_api->cpi_) return nullptr;
   vp8_zero(*rc_api->cpi_);

-  rc_api->InitRateControl(cfg);
+  if (!rc_api->InitRateControl(cfg)) return nullptr;

   return rc_api;
 }
@@ -74,7 +74,7 @@ VP8RateControlRTC::~VP8RateControlRTC() {
   }
 }

-void VP8RateControlRTC::InitRateControl(const VP8RateControlRtcConfig &rc_cfg) {
+bool VP8RateControlRTC::InitRateControl(const VP8RateControlRtcConfig &rc_cfg) {
   VP8_COMMON *cm = &cpi_->common;
   VP8_CONFIG *oxcf = &cpi_->oxcf;
   oxcf->end_usage = USAGE_STREAM_FROM_SERVER;
@@ -92,13 +92,19 @@ void VP8RateControlRTC::InitRateControl(const VP8RateControlRtcConfig &rc_cfg) {
   cpi_->kf_bitrate_adjustment = 0;
   cpi_->gf_overspend_bits = 0;
   cpi_->non_gf_bitrate_adjustment = 0;
-  UpdateRateControl(rc_cfg);
+  if (!UpdateRateControl(rc_cfg)) return false;
   cpi_->buffer_level = oxcf->starting_buffer_level;
   cpi_->bits_off_target = oxcf->starting_buffer_level;
+  return true;
 }

-void VP8RateControlRTC::UpdateRateControl(
+bool VP8RateControlRTC::UpdateRateControl(
     const VP8RateControlRtcConfig &rc_cfg) {
+  if (rc_cfg.ts_number_layers < 1 ||
+      rc_cfg.ts_number_layers > VPX_TS_MAX_LAYERS) {
+    return false;
+  }
+
   VP8_COMMON *cm = &cpi_->common;
   VP8_CONFIG *oxcf = &cpi_->oxcf;
   const unsigned int prev_number_of_layers = oxcf->number_of_layers;
@@ -199,6 +205,7 @@ bool VP8RateControlRTC::UpdateRateControl(
   vp8_new_framerate(cpi_, cpi_->framerate);

   vpx_clear_system_state();
+  return true;
 }

 void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) {
diff --git a/vp8/vp8_ratectrl_rtc.h b/vp8/vp8_ratectrl_rtc.h
index 0e81592eca..a8a886c56e 100644
--- a/vp8/vp8_ratectrl_rtc.h
+++ b/vp8/vp8_ratectrl_rtc.h
@@ -39,7 +39,7 @@ class VP8RateControlRTC {
       const VP8RateControlRtcConfig &cfg);
   ~VP8RateControlRTC();

-  void UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg);
+  bool UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg);
   // GetQP() needs to be called after ComputeQP() to get the latest QP
   int GetQP() const;
   // int GetLoopfilterLevel() const;
@@ -49,7 +49,7 @@ class VP8RateControlRTC {

  private:
   VP8RateControlRTC() {}
-  void InitRateControl(const VP8RateControlRtcConfig &cfg);
+  bool InitRateControl(const VP8RateControlRtcConfig &cfg);
   struct VP8_COMP *cpi_;
   int q_;
 };
diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc
index cc12ea336a..29033d4ba5 100644
--- a/vp9/ratectrl_rtc.cc
+++ b/vp9/ratectrl_rtc.cc
@@ -25,22 +25,16 @@ std::unique_ptr<VP9RateControlRTC> VP9RateControlRTC::Create(
                                                 VP9RateControlRTC());
   if (!rc_api) return nullptr;
   rc_api->cpi_ = static_cast<VP9_COMP *>(vpx_memalign(32, sizeof(*cpi_)));
-  if (!rc_api->cpi_) {
-    rc_api.reset();
-    return nullptr;
-  }
+  if (!rc_api->cpi_) return nullptr;
   vp9_zero(*rc_api->cpi_);

-  rc_api->InitRateControl(cfg);
+  if (!rc_api->InitRateControl(cfg)) return nullptr;
   if (cfg.aq_mode) {
     VP9_COMP *const cpi = rc_api->cpi_;
     cpi->segmentation_map = static_cast<uint8_t *>(
         vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols,
                    sizeof(*cpi->segmentation_map)));
-    if (!cpi->segmentation_map) {
-      rc_api.reset();
-      return nullptr;
-    }
+    if (!cpi->segmentation_map) return nullptr;
     cpi->cyclic_refresh =
         vp9_cyclic_refresh_alloc(cpi->common.mi_rows, cpi->common.mi_cols);
     cpi->cyclic_refresh->content_mode = 0;
@@ -71,7 +65,7 @@ VP9RateControlRTC::~VP9RateControlRTC() {
   }
 }

-void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) {
+bool VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) {
   VP9_COMMON *cm = &cpi_->common;
   VP9EncoderConfig *oxcf = &cpi_->oxcf;
   RATE_CONTROL *const rc = &cpi_->rc;
@@ -88,7 +82,7 @@ void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) {
   cm->current_video_frame = 0;
   rc->kf_boost = DEFAULT_KF_BOOST;

-  UpdateRateControl(rc_cfg);
+  if (!UpdateRateControl(rc_cfg)) return false;
   vp9_set_mb_mi(cm, cm->width, cm->height);

   cpi_->use_svc = (cpi_->svc.number_spatial_layers > 1 ||
@@ -102,10 +96,21 @@ void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) {
   vp9_rc_init(oxcf, 0, rc);
   rc->constrain_gf_key_freq_onepass_vbr = 0;
   cpi_->sf.use_nonrd_pick_mode = 1;
+  return true;
 }

-void VP9RateControlRTC::UpdateRateControl(
+bool VP9RateControlRTC::UpdateRateControl(
     const VP9RateControlRtcConfig &rc_cfg) {
+  // Since VPX_MAX_LAYERS (12) is less than the product of VPX_SS_MAX_LAYERS (5)
+  // and VPX_TS_MAX_LAYERS (5), check all three.
+  if (rc_cfg.ss_number_layers < 1 ||
+      rc_cfg.ss_number_layers > VPX_SS_MAX_LAYERS ||
+      rc_cfg.ts_number_layers < 1 ||
+      rc_cfg.ts_number_layers > VPX_TS_MAX_LAYERS ||
+      rc_cfg.ss_number_layers * rc_cfg.ts_number_layers > VPX_MAX_LAYERS) {
+    return false;
+  }
+
   VP9_COMMON *cm = &cpi_->common;
   VP9EncoderConfig *oxcf = &cpi_->oxcf;
   RATE_CONTROL *const rc = &cpi_->rc;
@@ -163,6 +168,7 @@
                                  (int)cpi_->oxcf.target_bandwidth);
   }
   vp9_check_reset_rc_flag(cpi_);
+  return true;
 }

 void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) {
diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h
index a82c776f94..7f3c900459 100644
--- a/vp9/ratectrl_rtc.h
+++ b/vp9/ratectrl_rtc.h
@@ -30,6 +30,7 @@ namespace libvpx {
 struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig {
  public:
   VP9RateControlRtcConfig() {
+    ss_number_layers = 1;
     vp9_zero(max_quantizers);
     vp9_zero(min_quantizers);
     vp9_zero(scaling_factor_den);
@@ -89,7 +90,7 @@ class VP9RateControlRTC {
       const VP9RateControlRtcConfig &cfg);
   ~VP9RateControlRTC();

-  void UpdateRateControl(const VP9RateControlRtcConfig &rc_cfg);
+  bool UpdateRateControl(const VP9RateControlRtcConfig &rc_cfg);
   // GetQP() needs to be called after ComputeQP() to get the latest QP
   int GetQP() const;
   int GetLoopfilterLevel() const;
@@ -101,7 +102,7 @@ class VP9RateControlRTC {

  private:
   VP9RateControlRTC() {}
-  void InitRateControl(const VP9RateControlRtcConfig &cfg);
+  bool InitRateControl(const VP9RateControlRtcConfig &cfg);
   struct VP9_COMP *cpi_;
 };
From 02fd7d6aeb1ea6d8eeef17315a6a7c4ffa6d7352 Mon Sep 17 00:00:00 2001
From: Johann
Date: Sat, 12 Nov 2022 08:23:17 +0900
Subject: [PATCH 605/926] Reland "quantize: simplify highbd 32x32_b args"

This is a reland of commit 573f5e662b544dbc553d73fa2b61055c30dfe8cc

Alignment issue with tests fixed in crrev.com/c/webm/libvpx/+/4305500

Original change's description:
> quantize: simplify highbd 32x32_b args
>
> Change-Id: I431a41279c4c4193bc70cfe819da6ea7e1d2fba1

Change-Id: Ic868b6f987c99d88672858fedd092fa49c125e19
---
 test/vp9_quantize_test.cc                 | 54 +++++++++++------------
 vp9/encoder/vp9_encodemb.c                | 10 ++---
 vpx_dsp/arm/highbd_quantize_neon.c        | 21 +++++----
 vpx_dsp/quantize.c                        | 16 ++++---
 vpx_dsp/vpx_dsp_rtcd_defs.pl              |  2 +-
 vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 13 +++---
 vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 21 ++++-----
 7 files changed, 69 insertions(+), 68 deletions(-)

diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc
index 6a8f1dafb1..bff2fa59a4 100644
--- a/test/vp9_quantize_test.cc
+++ b/test/vp9_quantize_test.cc
@@ -557,15 +557,15 @@ INSTANTIATE_TEST_SUITE_P(
         make_tuple(&QuantWrapper<vpx_highbd_quantize_b_sse2>,
                    &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_12, 16, false),
-        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_sse2>,
-                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_8, 32,
-                   false),
-        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_sse2>,
-                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_10,
-                   32, false),
-        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_sse2>,
-                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_12,
-                   32, false)));
+        make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_sse2>,
+                   &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>,
+                   VPX_BITS_8, 32, false),
+        make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_sse2>,
+                   &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>,
+                   VPX_BITS_10, 32, false),
+        make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_sse2>,
+                   &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>,
+                   VPX_BITS_12, 32, false)));
 #else
 INSTANTIATE_TEST_SUITE_P(
@@ -634,15 +634,15 @@ INSTANTIATE_TEST_SUITE_P(
         make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_avx2>,
                    &Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
                    VPX_BITS_8, 32, false),
-        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_avx2>,
-                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_8, 32,
-                   false),
-        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_avx2>,
-                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_10,
-                   32, false),
-        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_avx2>,
-                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_12,
-                   32, false)));
+        make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_avx2>,
+                   &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>,
+                   VPX_BITS_8, 32, false),
+        make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_avx2>,
+                   &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>,
+                   VPX_BITS_10, 32, false),
+        make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_avx2>,
+                   &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>,
+                   VPX_BITS_12, 32, false)));
 #else
 INSTANTIATE_TEST_SUITE_P(
     AVX2, VP9QuantizeTest,
@@ -680,15 +680,15 @@ INSTANTIATE_TEST_SUITE_P(
         make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_neon>,
                    &Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
                    VPX_BITS_8, 32, false),
-        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_neon>,
-                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_8, 32,
-                   false),
-        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_neon>,
-                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_10,
-                   32, false),
-        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_neon>,
-                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_12,
-                   32, false),
+        make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_neon>,
+                   &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>,
+                   VPX_BITS_8, 32, false),
+        make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_neon>,
+                   &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>,
+                   VPX_BITS_10, 32, false),
+        make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_neon>,
+                   &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>,
+                   VPX_BITS_12, 32, false),
         make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>,
                    &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true),
         make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>,
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 4910dc20f5..6a5f628808 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -511,9 +511,8 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col,
     switch (tx_size) {
       case TX_32X32:
         highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_b_32x32(
-            coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff,
-            dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan);
+        vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob,
+                                    scan_order->scan, scan_order->iscan);
         break;
       case TX_16X16:
         vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
@@ -856,9 +855,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
         vpx_highbd_subtract_block(32, 32, src_diff, diff_stride, src,
                                   src_stride, dst, dst_stride, xd->bd);
         highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_b_32x32(
-            coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff,
-            dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan);
+        vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant,
+                                    eob, scan_order->scan, scan_order->iscan);
       }
       if (args->enable_coeff_opt && !x->skip_recode) {
         *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c
index b9f72a94c5..3b1fec3321 100644
--- a/vpx_dsp/arm/highbd_quantize_neon.c
+++ b/vpx_dsp/arm/highbd_quantize_neon.c
@@ -13,6 +13,7 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/arm/mem_neon.h"
+#include "vp9/encoder/vp9_block.h"

 static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store(
     const int32x4_t dqcoeff_0, const int32x4_t dqcoeff_1,
@@ -224,11 +225,9 @@ static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon(
 }

 void vpx_highbd_quantize_b_32x32_neon(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
-    const int16_t *round_ptr, const int16_t *quant_ptr,
-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan) {
+    const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+    uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) {
   const int16x8_t neg_one = vdupq_n_s16(-1);
   uint16x8_t
eob_max; int i; @@ -237,12 +236,13 @@ void vpx_highbd_quantize_b_32x32_neon( // High half has identical elements, but we can reconstruct it from the low // half by duplicating the 2nd element. So we only need to pass a 4x32-bit // vector - int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(zbin_ptr)), 1); - int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(round_ptr)), 1); + int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->zbin)), 1); + int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->round)), 1); // Extend the quant, quant_shift vectors to ones of 32-bit elements // scale to high-half, so we can use vqdmulhq_s32 - int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15); - int32x4_t quant_shift = vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 16); + int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant)), 15); + int32x4_t quant_shift = + vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant_shift)), 16); int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr)); // Process first 8 values which include a dc component. @@ -300,8 +300,7 @@ void vpx_highbd_quantize_b_32x32_neon( vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // __aarch64__ - // Need these here, else the compiler complains about mixing declarations and + // Need this here, else the compiler complains about mixing declarations and // code in C90 - (void)n_coeffs; (void)scan; } diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index 212db45c88..c4642812ad 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -272,14 +272,16 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_32x32_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), - ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { + const intptr_t n_coeffs = 32 * 32; + const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), + ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; int idx = 0; int idx_arr[1024]; diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 2a01ec1b54..ab86b9cc7c 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -731,7 +731,7 @@ () add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b neon sse2 avx2/; - add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const 
int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/;
 }  # CONFIG_VP9_HIGHBITDEPTH
 }  # CONFIG_VP9_ENCODER
diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c
index 8edddd637f..6041d7289a 100644
--- a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c
+++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c
@@ -11,6 +11,7 @@
 #include <immintrin.h>

 #include "./vpx_dsp_rtcd.h"
+#include "vp9/encoder/vp9_block.h"

 static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
   const __m128i sign = _mm_srai_epi16(*p, 15);
@@ -222,17 +223,17 @@ static VPX_FORCE_INLINE void quantize_b_32x32(
 }

 void vpx_highbd_quantize_b_32x32_avx2(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
-    const int16_t *round_ptr, const int16_t *quant_ptr,
-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan) {
+    const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+    uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) {
   const unsigned int step = 8;
+  intptr_t n_coeffs = 32 * 32;
   __m256i eob = _mm256_setzero_si256();
   __m256i qp[5];
   (void)scan;

-  init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 1);
+  init_qp(mb_plane->zbin, mb_plane->round, mb_plane->quant, dequant_ptr,
+          mb_plane->quant_shift, qp, 1);

   quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);

diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
index ae1981a834..6a8f42b8a4 100644
--- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -15,6 +15,7 @@
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
+#include "vp9/encoder/vp9_block.h"

 #if CONFIG_VP9_HIGHBITDEPTH
 void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
@@ -93,18 +94,17 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
 }

 void vpx_highbd_quantize_b_32x32_sse2(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
-    const int16_t *round_ptr, const int16_t *quant_ptr,
-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan) {
+    const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+    uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) {
   __m128i zbins[2];
   __m128i nzbins[2];
   int idx = 0;
   int idx_arr[1024];
   int i, eob = 0;
-  const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
-  const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
+  const intptr_t n_coeffs = 32 * 32;
+  const int zbin0_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1);
+  const int zbin1_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1);
   (void)scan;

   zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
@@ -140,10 +140,11 @@ void vpx_highbd_quantize_b_32x32_sse2(
     const int coeff = coeff_ptr[rc];
     const int coeff_sign = (coeff >> 31);
     const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-    const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
-    const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+    const int64_t tmp1 =
+        abs_coeff + ROUND_POWER_OF_TWO(mb_plane->round[rc != 0], 1);
+    const int64_t tmp2 = ((tmp1 * mb_plane->quant[rc != 0]) >> 16) + tmp1;
     const uint32_t abs_qcoeff =
-        (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+        (uint32_t)((tmp2 * mb_plane->quant_shift[rc != 0]) >> 15);
     qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign;
     dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
     if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
From c6da2329b90fc39ea48b99aff0b0468c1fdffa6c Mon Sep 17 00:00:00 2001
From: Salome Thirot
Date: Thu, 9 Mar 2023 21:04:07 +0000
Subject: [PATCH 606/926] Add tests for vpx_highbd_minmax_8x8_c

Write tests for vpx_highbd_minmax_8x8_c, and fix initial value of min in
vpx_highbd_minmax_8x8_c.

Change-Id: I1f127df945bbb8c7d373c5430ff5f94f28575968
---
 test/minmax_test.cc | 109 ++++++++++++++++++++++++++++++++++++++++++++
 vpx_dsp/avg.c       |   2 +-
 2 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/test/minmax_test.cc b/test/minmax_test.cc
index 12327bc188..663b359c5d 100644
--- a/test/minmax_test.cc
+++ b/test/minmax_test.cc
@@ -15,6 +15,7 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"

 #include "test/acm_random.h"
 #include "test/register_state_check.h"
@@ -115,7 +116,115 @@ TEST_P(MinMaxTest, CompareReferenceAndVaryStride) {
   }
 }

+#if CONFIG_VP9_HIGHBITDEPTH
+
+using HBDMinMaxTest = MinMaxTest;
+
+void highbd_reference_minmax(const uint8_t *a, int a_stride, const uint8_t *b,
+                             int b_stride, int *min_ret, int *max_ret) {
+  int min = 65535;
+  int max = 0;
+  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a);
+  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b);
+  for (int i = 0; i < 8; i++) {
+    for (int j = 0; j < 8; j++) {
+      const int diff = abs(a_ptr[i * a_stride + j] - b_ptr[i * b_stride + j]);
+      if (min > diff) min = diff;
+      if (max < diff) max = diff;
+    }
+  }
+
+  *min_ret = min;
+  *max_ret = max;
+}
+
+TEST_P(HBDMinMaxTest, MinValue) {
+  uint8_t *a = CONVERT_TO_BYTEPTR(
+      reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t))));
+  uint8_t *b = CONVERT_TO_BYTEPTR(
+      reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t))));
+  for (int i = 0; i < 64; i++) {
+    vpx_memset16(CONVERT_TO_SHORTPTR(a), 0, 64);
+    vpx_memset16(CONVERT_TO_SHORTPTR(b), 65535, 64);
+    CONVERT_TO_SHORTPTR(b)[i] = i;  // Set a minimum difference of i.
+
+    int min, max;
+    ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
+    EXPECT_EQ(65535, max);
+    EXPECT_EQ(i, min);
+  }
+  vpx_free(CONVERT_TO_SHORTPTR(a));
+  vpx_free(CONVERT_TO_SHORTPTR(b));
+}
+
+TEST_P(HBDMinMaxTest, MaxValue) {
+  uint8_t *a = CONVERT_TO_BYTEPTR(
+      reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t))));
+  uint8_t *b = CONVERT_TO_BYTEPTR(
+      reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t))));
+  for (int i = 0; i < 64; i++) {
+    vpx_memset16(CONVERT_TO_SHORTPTR(a), 0, 64);
+    vpx_memset16(CONVERT_TO_SHORTPTR(b), 0, 64);
+    CONVERT_TO_SHORTPTR(b)[i] = i;  // Set a minimum difference of i.
+
+    int min, max;
+    ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
+    EXPECT_EQ(i, max);
+    EXPECT_EQ(0, min);
+  }
+  vpx_free(CONVERT_TO_SHORTPTR(a));
+  vpx_free(CONVERT_TO_SHORTPTR(b));
+}
+
+TEST_P(HBDMinMaxTest, CompareReference) {
+  uint8_t *a = CONVERT_TO_BYTEPTR(
+      reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t))));
+  uint8_t *b = CONVERT_TO_BYTEPTR(
+      reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t))));
+  for (int j = 0; j < 64; j++) {
+    CONVERT_TO_SHORTPTR(a)[j] = rnd_.Rand16();
+    CONVERT_TO_SHORTPTR(b)[j] = rnd_.Rand16();
+  }
+
+  int min_ref, max_ref, min, max;
+  highbd_reference_minmax(a, 8, b, 8, &min_ref, &max_ref);
+  ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
+  vpx_free(CONVERT_TO_SHORTPTR(a));
+  vpx_free(CONVERT_TO_SHORTPTR(b));
+  EXPECT_EQ(max_ref, max);
+  EXPECT_EQ(min_ref, min);
+}
+
+TEST_P(HBDMinMaxTest, CompareReferenceAndVaryStride) {
+  uint8_t *a = CONVERT_TO_BYTEPTR(
+      reinterpret_cast<uint16_t *>(vpx_malloc((8 * 64) * sizeof(uint16_t))));
+  uint8_t *b = CONVERT_TO_BYTEPTR(
+      reinterpret_cast<uint16_t *>(vpx_malloc((8 * 64) * sizeof(uint16_t))));
+  for (int i = 0; i < 8 * 64; i++) {
+    CONVERT_TO_SHORTPTR(a)[i] = rnd_.Rand16();
+    CONVERT_TO_SHORTPTR(b)[i] = rnd_.Rand16();
+  }
+  for (int a_stride = 8; a_stride <= 64; a_stride += 8) {
+    for (int b_stride = 8; b_stride <= 64; b_stride += 8) {
+      int min_ref, max_ref, min, max;
+      highbd_reference_minmax(a, a_stride, b, b_stride, &min_ref, &max_ref);
+      ASM_REGISTER_STATE_CHECK(mm_func_(a, a_stride, b, b_stride, &min, &max));
+      EXPECT_EQ(max_ref, max)
+          << "when a_stride = " << a_stride << " and b_stride = " << b_stride;
+      EXPECT_EQ(min_ref, min)
+          << "when a_stride = " << a_stride << " and b_stride = " << b_stride;
+    }
+  }
+  vpx_free(CONVERT_TO_SHORTPTR(a));
+  vpx_free(CONVERT_TO_SHORTPTR(b));
+}
+#endif
+
 INSTANTIATE_TEST_SUITE_P(C, MinMaxTest, ::testing::Values(&vpx_minmax_8x8_c));
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(C, HBDMinMaxTest,
+                         ::testing::Values(&vpx_highbd_minmax_8x8_c));
+#endif

 #if HAVE_SSE2
 INSTANTIATE_TEST_SUITE_P(SSE2, MinMaxTest,
diff --git a/vpx_dsp/avg.c b/vpx_dsp/avg.c
index 9540154074..391e9eb144 100644
--- a/vpx_dsp/avg.c
+++ b/vpx_dsp/avg.c
@@ -428,7 +428,7 @@ void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
   int i, j;
   const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
   const uint16_t *d = CONVERT_TO_SHORTPTR(d8);
-  *min = 255;
+  *min = 65535;
   *max = 0;
   for (i = 0; i < 8; ++i, s += p, d += dp) {
     for (j = 0; j < 8; ++j) {
From fff4e76b55900767083434248c67c3e041bab97b Mon Sep 17 00:00:00 2001
From: Salome Thirot
Date: Thu, 9 Mar 2023 13:58:16 +0000
Subject: [PATCH 607/926] Add Neon implementation of vpx_highbd_minmax_8x8_c

Add Neon implementation of vpx_highbd_minmax_8x8_c as well as the
corresponding tests.
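For reference, the scalar loop the Neon version has to match is sketched
below (simplified from vpx_highbd_minmax_8x8_c in vpx_dsp/avg.c; the real
function takes uint8_t pointers and goes through CONVERT_TO_SHORTPTR):

  void highbd_minmax_8x8_ref(const uint16_t *s, int p, const uint16_t *d,
                             int dp, int *min, int *max) {
    int i, j;
    *min = 65535;  // Widest possible absolute difference of two 16-bit
    *max = 0;      // samples; the 8-bit version starts from 255.
    for (i = 0; i < 8; ++i, s += p, d += dp) {
      for (j = 0; j < 8; ++j) {
        const int diff = abs(s[j] - d[j]);
        if (diff < *min) *min = diff;
        if (diff > *max) *max = diff;
      }
    }
  }

The Neon version computes the 64 absolute differences with vabdq_u16 and
then reduces them with vector min/max operations.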
Change-Id: I5d9444a239fb1baa53634c1bdb5292b44067d90c --- test/minmax_test.cc | 4 ++ vpx_dsp/arm/highbd_avg_neon.c | 77 +++++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 1 + 3 files changed, 82 insertions(+) diff --git a/test/minmax_test.cc b/test/minmax_test.cc index 663b359c5d..e710af6991 100644 --- a/test/minmax_test.cc +++ b/test/minmax_test.cc @@ -234,6 +234,10 @@ INSTANTIATE_TEST_SUITE_P(SSE2, MinMaxTest, #if HAVE_NEON INSTANTIATE_TEST_SUITE_P(NEON, MinMaxTest, ::testing::Values(&vpx_minmax_8x8_neon)); +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P(NEON, HBDMinMaxTest, + ::testing::Values(&vpx_highbd_minmax_8x8_neon)); +#endif #endif #if HAVE_MSA diff --git a/vpx_dsp/arm/highbd_avg_neon.c b/vpx_dsp/arm/highbd_avg_neon.c index 3ba58b8005..b84a7875d4 100644 --- a/vpx_dsp/arm/highbd_avg_neon.c +++ b/vpx_dsp/arm/highbd_avg_neon.c @@ -38,3 +38,80 @@ int vpx_highbd_satd_neon(const tran_low_t *coeff, int length) { return (int)horizontal_add_int64x2(vaddq_s64(sum_s64[0], sum_s64[1])); } + +void vpx_highbd_minmax_8x8_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, int *min, + int *max) { + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b); + + const uint16x8_t a0 = vld1q_u16(a_ptr + 0 * a_stride); + const uint16x8_t a1 = vld1q_u16(a_ptr + 1 * a_stride); + const uint16x8_t a2 = vld1q_u16(a_ptr + 2 * a_stride); + const uint16x8_t a3 = vld1q_u16(a_ptr + 3 * a_stride); + const uint16x8_t a4 = vld1q_u16(a_ptr + 4 * a_stride); + const uint16x8_t a5 = vld1q_u16(a_ptr + 5 * a_stride); + const uint16x8_t a6 = vld1q_u16(a_ptr + 6 * a_stride); + const uint16x8_t a7 = vld1q_u16(a_ptr + 7 * a_stride); + + const uint16x8_t b0 = vld1q_u16(b_ptr + 0 * b_stride); + const uint16x8_t b1 = vld1q_u16(b_ptr + 1 * b_stride); + const uint16x8_t b2 = vld1q_u16(b_ptr + 2 * b_stride); + const uint16x8_t b3 = vld1q_u16(b_ptr + 3 * b_stride); + const uint16x8_t b4 = vld1q_u16(b_ptr + 4 * b_stride); + const uint16x8_t b5 = vld1q_u16(b_ptr + 5 * b_stride); + const uint16x8_t b6 = vld1q_u16(b_ptr + 6 * b_stride); + const uint16x8_t b7 = vld1q_u16(b_ptr + 7 * b_stride); + + const uint16x8_t abs_diff0 = vabdq_u16(a0, b0); + const uint16x8_t abs_diff1 = vabdq_u16(a1, b1); + const uint16x8_t abs_diff2 = vabdq_u16(a2, b2); + const uint16x8_t abs_diff3 = vabdq_u16(a3, b3); + const uint16x8_t abs_diff4 = vabdq_u16(a4, b4); + const uint16x8_t abs_diff5 = vabdq_u16(a5, b5); + const uint16x8_t abs_diff6 = vabdq_u16(a6, b6); + const uint16x8_t abs_diff7 = vabdq_u16(a7, b7); + + const uint16x8_t max01 = vmaxq_u16(abs_diff0, abs_diff1); + const uint16x8_t max23 = vmaxq_u16(abs_diff2, abs_diff3); + const uint16x8_t max45 = vmaxq_u16(abs_diff4, abs_diff5); + const uint16x8_t max67 = vmaxq_u16(abs_diff6, abs_diff7); + + const uint16x8_t max0123 = vmaxq_u16(max01, max23); + const uint16x8_t max4567 = vmaxq_u16(max45, max67); + const uint16x8_t max07 = vmaxq_u16(max0123, max4567); + + const uint16x8_t min01 = vminq_u16(abs_diff0, abs_diff1); + const uint16x8_t min23 = vminq_u16(abs_diff2, abs_diff3); + const uint16x8_t min45 = vminq_u16(abs_diff4, abs_diff5); + const uint16x8_t min67 = vminq_u16(abs_diff6, abs_diff7); + + const uint16x8_t min0123 = vminq_u16(min01, min23); + const uint16x8_t min4567 = vminq_u16(min45, min67); + const uint16x8_t min07 = vminq_u16(min0123, min4567); + +#if defined(__aarch64__) + *min = *max = 0; // Clear high bits + *((uint16_t *)max) = vmaxvq_u16(max07); + *((uint16_t *)min) = vminvq_u16(min07); +#else + // Split 
into 64-bit vectors and execute pairwise min/max. + uint16x4_t ab_max = vmax_u16(vget_high_u16(max07), vget_low_u16(max07)); + uint16x4_t ab_min = vmin_u16(vget_high_u16(min07), vget_low_u16(min07)); + + // Enough runs of vpmax/min propagate the max/min values to every position. + ab_max = vpmax_u16(ab_max, ab_max); + ab_min = vpmin_u16(ab_min, ab_min); + + ab_max = vpmax_u16(ab_max, ab_max); + ab_min = vpmin_u16(ab_min, ab_min); + + ab_max = vpmax_u16(ab_max, ab_max); + ab_min = vpmin_u16(ab_min, ab_min); + + *min = *max = 0; // Clear high bits + // Store directly to avoid costly neon->gpr transfer. + vst1_lane_u16((uint16_t *)max, ab_max, 0); + vst1_lane_u16((uint16_t *)min, ab_min, 0); +#endif +} diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 2a01ec1b54..2780333e88 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1000,6 +1000,7 @@ () specialize qw/vpx_highbd_avg_4x4 sse2/; add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max"; + specialize qw/vpx_highbd_minmax_8x8 neon/; add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vpx_highbd_sad64x64_avg sse2 neon avx2/; From f23f27bb807794fe5773d0343e6281527ad5f640 Mon Sep 17 00:00:00 2001 From: Johann Date: Mon, 14 Nov 2022 16:47:33 +0900 Subject: [PATCH 608/926] Reland "quantize: use scan_order instead of passing scan/iscan" This is a reland of commit 14fc40040ff30486c45111056db44ee18590a24a Parent change fixed in crrev.com/c/webm/libvpx/+/4305500 Original change's description: > quantize: use scan_order instead of passing scan/iscan > > further reduces the arguments for the 32x32. This will be applied to the base > version as well. 
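>
> In effect the 32x32 call sites shrink from (sketch taken from the
> vp9_encodemb.c hunks below)
>
>   vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob,
>                        scan_order->scan, scan_order->iscan);
>
> to
>
>   vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob,
>                        scan_order);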
>
> Change-Id: I25a162b5248b14af53d9e20c6a7fa2a77028a6d1

Change-Id: I2a7654558eaddd68bd09336bf317b297f18559d2
---
 test/vp9_quantize_test.cc                 | 42 +++++++++++------------
 vp9/common/vp9_scan.h                     |  2 +-
 vp9/encoder/vp9_encodemb.c                |  8 ++---
 vpx_dsp/arm/highbd_quantize_neon.c        |  7 ++--
 vpx_dsp/arm/quantize_neon.c               |  7 ++--
 vpx_dsp/quantize.c                        |  9 ++---
 vpx_dsp/vpx_dsp_rtcd_defs.pl              |  5 +--
 vpx_dsp/x86/highbd_quantize_intrin_avx2.c |  5 +--
 vpx_dsp/x86/highbd_quantize_intrin_sse2.c |  5 +--
 vpx_dsp/x86/quantize_avx.c                |  7 ++--
 vpx_dsp/x86/quantize_avx2.c               |  5 +--
 vpx_dsp/x86/quantize_ssse3.c              |  6 ++--
 12 files changed, 55 insertions(+), 53 deletions(-)

diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc
index bff2fa59a4..e9b17d5eb8 100644
--- a/test/vp9_quantize_test.cc
+++ b/test/vp9_quantize_test.cc
@@ -42,7 +42,7 @@ typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
                              const macroblock_plane *const mb_plane,
                              tran_low_t *qcoeff, tran_low_t *dqcoeff,
                              const int16_t *dequant, uint16_t *eob,
-                             const int16_t *scan, const int16_t *iscan);
+                             const struct scan_order *const scan_order);
 typedef std::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t, int, bool>
     QuantizeParam;

@@ -60,9 +60,10 @@ template <QuantizeFunc fn>
 void QuantWrapper(const tran_low_t *coeff, intptr_t count,
                   const macroblock_plane *const mb_plane, tran_low_t *qcoeff,
                   tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob,
-                  const int16_t *scan, const int16_t *iscan) {
+                  const struct scan_order *const scan_order) {
   fn(coeff, count, mb_plane->zbin, mb_plane->round, mb_plane->quant,
-     mb_plane->quant_shift, qcoeff, dqcoeff, dequant, eob, scan, iscan);
+     mb_plane->quant_shift, qcoeff, dqcoeff, dequant, eob, scan_order->scan,
+     scan_order->iscan);
 }

 // Wrapper for 32x32 version which does not use count
 typedef void (*Quantize32x32Func)(const tran_low_t *coeff,
                                   const macroblock_plane *const mb_plane,
                                   tran_low_t *qcoeff, tran_low_t *dqcoeff,
                                   const int16_t *dequant, uint16_t *eob,
-                                  const int16_t *scan, const int16_t *iscan);
+                                  const struct scan_order *const scan_order);

 template <Quantize32x32Func fn>
 void Quant32x32Wrapper(const tran_low_t *coeff, intptr_t count,
                        const macroblock_plane *const mb_plane,
                        tran_low_t *qcoeff, tran_low_t *dqcoeff,
                        const int16_t *dequant, uint16_t *eob,
-                       const int16_t *scan, const int16_t *iscan) {
+                       const struct scan_order *const scan_order) {
   (void)count;
-  fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan, iscan);
+  fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan_order);
 }

 // Wrapper for FP version which does not use zbin or quant_shift.
@@ -93,9 +94,9 @@ template void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const int16_t *scan, const int16_t *iscan) { + const struct scan_order *const scan_order) { fn(coeff, count, mb_plane->round_fp, mb_plane->quant_fp, qcoeff, dqcoeff, - dequant, eob, scan, iscan); + dequant, eob, scan_order->scan, scan_order->iscan); } void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, @@ -232,8 +233,7 @@ class VP9QuantizeTest : public VP9QuantizeBase, void VP9QuantizeTest::Run() { quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_->scan, - scan_->iscan); + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_); } void VP9QuantizeTest::Speed(bool is_median) { @@ -306,7 +306,7 @@ void VP9QuantizeTest::Speed(bool is_median) { ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_->scan, scan_->iscan); + scan_); } vpx_usec_timer_mark(&timer); @@ -314,7 +314,7 @@ void VP9QuantizeTest::Speed(bool is_median) { for (int n = 0; n < kNumTests; ++n) { quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), - dequant_ptr_, &eob_, scan_->scan, scan_->iscan); + dequant_ptr_, &eob_, scan_); } vpx_usec_timer_mark(&simd_timer); @@ -455,12 +455,11 @@ TEST_P(VP9QuantizeTest, OperationCheck) { quant_fp_ptr_); ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), - dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); + dequant_ptr_, &ref_eob, scan_); - ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_, - mb_plane_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, - &eob_, scan_->scan, scan_->iscan)); + ASM_REGISTER_STATE_CHECK(quantize_op_( + coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); @@ -512,12 +511,11 @@ TEST_P(VP9QuantizeTest, EOBCheck) { quant_fp_ptr_); ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), - dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); + dequant_ptr_, &ref_eob, scan_); - ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_, - mb_plane_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, - &eob_, scan_->scan, scan_->iscan)); + ASM_REGISTER_STATE_CHECK(quantize_op_( + coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); diff --git a/vp9/common/vp9_scan.h b/vp9/common/vp9_scan.h index 72a9a5ec47..efa0e23365 100644 --- a/vp9/common/vp9_scan.h +++ b/vp9/common/vp9_scan.h @@ -23,7 +23,7 @@ extern "C" { #define MAX_NEIGHBORS 2 -typedef struct { +typedef struct scan_order { const int16_t *scan; const int16_t *iscan; const int16_t *neighbors; diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 6a5f628808..515c7a9031 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -512,7 +512,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, case TX_32X32: 
highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + scan_order); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); @@ -542,7 +542,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + scan_order); break; case TX_16X16: vpx_fdct16x16(src_diff, coeff, diff_stride); @@ -856,7 +856,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, src_stride, dst, dst_stride, xd->bd); highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + eob, scan_order); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -946,7 +946,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, dst_stride); fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + scan_order); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c index 3b1fec3321..5a40f1284e 100644 --- a/vpx_dsp/arm/highbd_quantize_neon.c +++ b/vpx_dsp/arm/highbd_quantize_neon.c @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store( @@ -227,10 +228,11 @@ static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon( void vpx_highbd_quantize_b_32x32_neon( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { + uint16_t *eob_ptr, const struct scan_order *const scan_order) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; + const int16_t *iscan = scan_order->iscan; // Only the first element of each vector is DC. 
// High half has identical elements, but we can reconstruct it from the low @@ -300,7 +302,4 @@ void vpx_highbd_quantize_b_32x32_neon( vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // __aarch64__ - // Need this here, else the compiler complains about mixing declarations and - // code in C90 - (void)scan; } diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index e81738a7bb..84b6d8c79f 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -14,6 +14,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, @@ -218,10 +219,11 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct scan_order *const scan_order) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; + const int16_t *iscan = scan_order->iscan; // Only the first element of each vector is DC. int16x8_t zbin = vrshrq_n_s16(vld1q_s16(mb_plane->zbin), 1); @@ -285,7 +287,4 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // __aarch64__ - // Need these here, else the compiler complains about mixing declarations and - // code in C90 - (void)scan; } diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index c4642812ad..f51bf253e7 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -14,6 +14,7 @@ #include "vpx_dsp/quantize.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" +#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, @@ -213,7 +214,7 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct scan_order *const scan_order) { const int n_coeffs = 32 * 32; const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; @@ -221,11 +222,11 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const int16_t *round_ptr = mb_plane->round; const int16_t *quant_ptr = mb_plane->quant; const int16_t *quant_shift_ptr = mb_plane->quant_shift; + const int16_t *scan = scan_order->scan; int idx = 0; int idx_arr[32 * 32 /* n_coeffs */]; int i, eob = -1; - (void)iscan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -274,7 +275,7 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, void vpx_highbd_quantize_b_32x32_c( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { + uint16_t *eob_ptr, const struct scan_order *const scan_order) { const intptr_t n_coeffs = 32 * 32; const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; @@ -282,11 +283,11 @@ void vpx_highbd_quantize_b_32x32_c( const int16_t *round_ptr = mb_plane->round; const int16_t *quant_ptr = mb_plane->quant; const int16_t *quant_shift_ptr = 
    mb_plane->quant_shift;
+  const int16_t *scan = scan_order->scan;
   int idx = 0;
   int idx_arr[1024];
   int i, eob = -1;
-  (void)iscan;

   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 9853ba54a0..7cd3a0be89 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -19,6 +19,7 @@ ()
 #include "vpx_dsp/vpx_filter.h"
 #if CONFIG_VP9_ENCODER
 struct macroblock_plane;
+struct scan_order;
 #endif
 EOF
@@ -724,14 +725,14 @@ ()
   add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/;

-  add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct scan_order *const scan_order";
   specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/;

   if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
     specialize qw/vpx_highbd_quantize_b neon sse2 avx2/;

-    add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct scan_order *const scan_order";
     specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/;
   } # CONFIG_VP9_HIGHBITDEPTH
 } # CONFIG_VP9_ENCODER
diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c
index 6041d7289a..bfd7b2e23e 100644
--- a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c
+++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c
@@ -11,6 +11,7 @@
 #include <immintrin.h>

 #include "./vpx_dsp_rtcd.h"
+#include "vp9/common/vp9_scan.h"
 #include "vp9/encoder/vp9_block.h"

 static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
@@ -225,12 +226,12 @@ static VPX_FORCE_INLINE void quantize_b_32x32(
 void vpx_highbd_quantize_b_32x32_avx2(
     const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane,
     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-    uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) {
+    uint16_t *eob_ptr, const struct scan_order *const scan_order) {
   const unsigned int step = 8;
   intptr_t n_coeffs = 32 * 32;
+  const int16_t *iscan = scan_order->iscan;
   __m256i eob = _mm256_setzero_si256();
   __m256i qp[5];
-  (void)scan;

   init_qp(mb_plane->zbin, mb_plane->round, mb_plane->quant, dequant_ptr,
           mb_plane->quant_shift, qp, 1);
diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
index 6a8f42b8a4..58d5a3a5ff 100644
--- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -15,6 +15,7 @@
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
+#include "vp9/common/vp9_scan.h"
 #include "vp9/encoder/vp9_block.h"

 #if CONFIG_VP9_HIGHBITDEPTH
@@ -96,16 +97,16 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
 void vpx_highbd_quantize_b_32x32_sse2(
     const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane,
     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-    uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) {
+    uint16_t *eob_ptr, const struct scan_order *const scan_order) {
   __m128i zbins[2];
   __m128i nzbins[2];
   int idx = 0;
   int idx_arr[1024];
   int i, eob = 0;
   const intptr_t n_coeffs = 32 * 32;
+  const int16_t *iscan = scan_order->iscan;
   const int zbin0_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1);
   const int zbin1_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1);
-  (void)scan;

   zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
   zbins[1] = _mm_set1_epi32(zbin1_tmp);
diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c
index d52f6c6644..d05a937be1 100644
--- a/vpx_dsp/x86/quantize_avx.c
+++ b/vpx_dsp/x86/quantize_avx.c
@@ -19,6 +19,8 @@
 #include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
 #include "vpx_dsp/x86/quantize_sse2.h"
 #include "vpx_dsp/x86/quantize_ssse3.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"

 void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         const int16_t *zbin_ptr, const int16_t *round_ptr,
@@ -144,10 +146,11 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr,
                               const struct macroblock_plane *const mb_plane,
                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                              const int16_t *scan, const int16_t *iscan) {
+                              const struct scan_order *const scan_order) {
   const __m128i zero = _mm_setzero_si128();
   const __m256i big_zero = _mm256_setzero_si256();
   int index;
+  const int16_t *iscan = scan_order->iscan;

   __m128i zbin, round, quant, dequant, shift;
   __m128i coeff0, coeff1;
@@ -156,8 +159,6 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr,
   __m128i all_zero;
   __m128i eob = zero, eob0;

-  (void)scan;
-
   load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant,
                      &shift);
diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c
index a8412c5b8e..1c82542ae6 100644
--- a/vpx_dsp/x86/quantize_avx2.c
+++ b/vpx_dsp/x86/quantize_avx2.c
@@ -13,6 +13,7 @@
 #include <immintrin.h>

 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_scan.h"
 #include "vp9/encoder/vp9_block.h"

 static VPX_FORCE_INLINE void load_b_values_avx2(
@@ -255,11 +256,11 @@ void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr,
                                const struct macroblock_plane *const mb_plane,
                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                               const int16_t *scan, const int16_t *iscan) {
+                               const struct scan_order *const scan_order) {
   __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift;
   __m256i v_eobmax = _mm256_setzero_si256();
   intptr_t count;
-
(void)scan; + const int16_t *iscan = scan_order->iscan; load_b_values_avx2(mb_plane->zbin, &v_zbin, mb_plane->round, &v_round, mb_plane->quant, &v_quant, dequant_ptr, &v_dequant, diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 6fe54d7d98..6401b2865d 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -16,6 +16,7 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" #include "vpx_dsp/x86/quantize_ssse3.h" +#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, @@ -112,9 +113,10 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct scan_order *const scan_order) { const __m128i zero = _mm_setzero_si128(); int index; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1; @@ -123,8 +125,6 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, __m128i all_zero; __m128i eob = zero, eob0; - (void)scan; - load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); From cb5b047ad89d8183ca512a4b57eee816b18f76bf Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 22 Feb 2023 13:22:08 -0800 Subject: [PATCH 609/926] vp9_setup_mask: clear -Wshadow warnings Bug: webm:1793 Change-Id: If678fc195ef87cc634d31fb7b24e0c844a5cb7b0 --- vp9/common/vp9_loopfilter.c | 38 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index 765cb11726..1a9d45ae77 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -932,32 +932,32 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, break; default: for (idx_32 = 0; idx_32 < 4; mip += offset_32[idx_32], ++idx_32) { - const int shift_y = shift_32_y[idx_32]; - const int shift_uv = shift_32_uv[idx_32]; + const int shift_y_32 = shift_32_y[idx_32]; + const int shift_uv_32 = shift_32_uv[idx_32]; const int mi_32_col_offset = ((idx_32 & 1) << 2); const int mi_32_row_offset = ((idx_32 >> 1) << 2); if (mi_32_col_offset >= max_cols || mi_32_row_offset >= max_rows) continue; switch (mip[0]->sb_type) { case BLOCK_32X32: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm); break; case BLOCK_32X16: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm); if (mi_32_row_offset + 2 >= max_rows) continue; mip2 = mip + mode_info_stride * 2; - build_masks(lfi_n, mip2[0], shift_y + 16, shift_uv + 4, lfm); + build_masks(lfi_n, mip2[0], shift_y_32 + 16, shift_uv_32 + 4, lfm); break; case BLOCK_16X32: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm); if (mi_32_col_offset + 2 >= max_cols) continue; mip2 = mip + 2; - build_masks(lfi_n, mip2[0], shift_y + 2, shift_uv + 1, lfm); + build_masks(lfi_n, mip2[0], shift_y_32 + 2, shift_uv_32 + 1, lfm); break; default: for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], ++idx_16) { - const int shift_y = shift_32_y[idx_32] + shift_16_y[idx_16]; - const int shift_uv = shift_32_uv[idx_32] + shift_16_uv[idx_16]; + const int shift_y_16 = shift_y_32 + 
shift_16_y[idx_16]; + const int shift_uv_16 = shift_uv_32 + shift_16_uv[idx_16]; const int mi_16_col_offset = mi_32_col_offset + ((idx_16 & 1) << 1); const int mi_16_row_offset = @@ -968,28 +968,26 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, switch (mip[0]->sb_type) { case BLOCK_16X16: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_16, shift_uv_16, lfm); break; case BLOCK_16X8: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_16, shift_uv_16, lfm); if (mi_16_row_offset + 1 >= max_rows) continue; mip2 = mip + mode_info_stride; - build_y_mask(lfi_n, mip2[0], shift_y + 8, lfm); + build_y_mask(lfi_n, mip2[0], shift_y_16 + 8, lfm); break; case BLOCK_8X16: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_16, shift_uv_16, lfm); if (mi_16_col_offset + 1 >= max_cols) continue; mip2 = mip + 1; - build_y_mask(lfi_n, mip2[0], shift_y + 1, lfm); + build_y_mask(lfi_n, mip2[0], shift_y_16 + 1, lfm); break; default: { - const int shift_y = - shift_32_y[idx_32] + shift_16_y[idx_16] + shift_8_y[0]; - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + const int shift_y_8_0 = shift_y_16 + shift_8_y[0]; + build_masks(lfi_n, mip[0], shift_y_8_0, shift_uv_16, lfm); mip += offset[0]; for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) { - const int shift_y = shift_32_y[idx_32] + - shift_16_y[idx_16] + shift_8_y[idx_8]; + const int shift_y_8 = shift_y_16 + shift_8_y[idx_8]; const int mi_8_col_offset = mi_16_col_offset + ((idx_8 & 1)); const int mi_8_row_offset = @@ -998,7 +996,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, if (mi_8_col_offset >= max_cols || mi_8_row_offset >= max_rows) continue; - build_y_mask(lfi_n, mip[0], shift_y, lfm); + build_y_mask(lfi_n, mip[0], shift_y_8, lfm); } break; } From 492f4c5538fd8c9af8e8e661bab29d985ebf2ae0 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 22 Feb 2023 13:29:20 -0800 Subject: [PATCH 610/926] vp9_bitstream.c: clear -Wshadow warnings Bug: webm:1793 Change-Id: I8abac3c901ad24b642b39ea6e6081d8ba626853d --- vp9/encoder/vp9_bitstream.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index a84c8b524f..17c123af6f 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -169,8 +169,8 @@ static void pack_mb_tokens(vpx_writer *w, TOKENEXTRA **tp, vpx_write_bit(w, p->extra & 1); } else { // t >= TWO_TOKEN && t < EOB_TOKEN const struct vp9_token *const a = &vp9_coef_encodings[t]; - const int v = a->value; - const int n = a->len; + int v = a->value; + int n = a->len; const int e = p->extra; vpx_write(w, 1, context_tree[2]); vp9_write_tree(w, vp9_coef_con_tree, @@ -179,8 +179,8 @@ static void pack_mb_tokens(vpx_writer *w, TOKENEXTRA **tp, if (t >= CATEGORY1_TOKEN) { const vp9_extra_bit *const b = &extra_bits[t]; const unsigned char *pb = b->prob; - int v = e >> 1; - int n = b->len; // number of bits in v, assumed nonzero + v = e >> 1; + n = b->len; // number of bits in v, assumed nonzero do { const int bb = (v >> --n) & 1; vpx_write(w, bb, *pb++); @@ -599,7 +599,6 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi, for (t = 0; t < entropy_nodes_update; ++t) { vpx_prob newp = new_coef_probs[i][j][k][l][t]; vpx_prob *oldp = old_coef_probs[i][j][k][l] + t; - const vpx_prob upd = DIFF_UPDATE_PROB; int64_t s; int u = 0; if (t == PIVOT_NODE) From 
405ae856664c1f8390a43d58dbbd96d1b572f095 Mon Sep 17 00:00:00 2001 From: Deepa K G Date: Thu, 2 Mar 2023 13:39:55 +0530 Subject: [PATCH 611/926] Refactor logic of skipping trellis coeff opt The code to enable trellis coefficient optimization is refactored using the sf 'trellis_opt_tx_rd'. This change facilitates adaptive skipping of trellis optimization based on block properties. Change-Id: Ia1ff7cbbe5acf86414410f62655d46c099387847 --- vp9/encoder/vp9_block.h | 2 +- vp9/encoder/vp9_encodeframe.c | 14 ++--- vp9/encoder/vp9_encodemb.c | 105 +++++++++++++++++++------------ vp9/encoder/vp9_encodemb.h | 5 +- vp9/encoder/vp9_encoder.h | 19 ++++++ vp9/encoder/vp9_rdopt.c | 37 +++++++---- vp9/encoder/vp9_speed_features.c | 15 +++-- vp9/encoder/vp9_speed_features.h | 19 +++++- 8 files changed, 147 insertions(+), 69 deletions(-) diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 3e2c9a3c35..116fc2447c 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -78,7 +78,7 @@ struct macroblock { int skip_recode; int skip_optimize; int q_index; - int block_qcoeff_opt; + double log_block_src_var; int block_tx_domain; // The equivalent error at the current rdmult of one whole bit (not one diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 5b811016de..1d593cfc01 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -2021,20 +2021,20 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, // Save rdmult before it might be changed, so it can be restored later. orig_rdmult = x->rdmult; - if ((cpi->sf.tx_domain_thresh > 0.0) || (cpi->sf.quant_opt_thresh > 0.0)) { + if ((cpi->sf.tx_domain_thresh > 0.0) || + (cpi->sf.trellis_opt_tx_rd.thresh > 0.0)) { double logvar = vp9_log_block_var(cpi, x, bsize); - // Check block complexity as part of descision on using pixel or transform + // Check block complexity as part of decision on using pixel or transform // domain distortion in rd tests. x->block_tx_domain = cpi->sf.allow_txfm_domain_distortion && (logvar >= cpi->sf.tx_domain_thresh); - // Check block complexity as part of descision on using quantized - // coefficient optimisation inside the rd loop. - x->block_qcoeff_opt = - cpi->sf.allow_quant_coeff_opt && (logvar <= cpi->sf.quant_opt_thresh); + // Store block complexity to decide on using quantized coefficient + // optimization inside the rd loop. 
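+      // (logvar, computed above via vp9_log_block_var(), is a log-variance
+      // measure of the source block's complexity.)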
+ x->log_block_src_var = logvar; } else { x->block_tx_domain = cpi->sf.allow_txfm_domain_distortion; - x->block_qcoeff_opt = cpi->sf.allow_quant_coeff_opt; + x->log_block_src_var = 0.0; } set_segment_index(cpi, x, mi_row, mi_col, bsize, 0); diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index fa222f9dcf..7c61419f8b 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -26,6 +26,7 @@ #include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_encodemb.h" +#include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_rd.h" #include "vp9/encoder/vp9_tokenize.h" @@ -759,10 +760,19 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, MODE_INFO *mi = xd->mi[0]; int plane; #if CONFIG_MISMATCH_DEBUG - struct encode_b_args arg = { x, 1, NULL, NULL, + struct encode_b_args arg = { x, + 1, // enable_trellis_opt + 0.0, // trellis_opt_thresh + NULL, // above entropy context + NULL, // left entropy context &mi->skip, mi_row, mi_col, output_enabled }; #else - struct encode_b_args arg = { x, 1, NULL, NULL, &mi->skip }; + struct encode_b_args arg = { x, + 1, // enable_trellis_opt + 0.0, // trellis_opt_thresh + NULL, // above entropy context + NULL, // left entropy context + &mi->skip }; (void)mi_row; (void)mi_col; (void)output_enabled; @@ -780,9 +790,9 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size; vp9_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]); - arg.enable_coeff_opt = 1; + arg.enable_trellis_opt = 1; } else { - arg.enable_coeff_opt = 0; + arg.enable_trellis_opt = 0; } arg.ta = ctx.ta[plane]; arg.tl = ctx.tl[plane]; @@ -814,17 +824,13 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, uint16_t *eob = &p->eobs[block]; const int src_stride = p->src.stride; const int dst_stride = pd->dst.stride; + int enable_trellis_opt = !x->skip_recode; ENTROPY_CONTEXT *a = NULL; ENTROPY_CONTEXT *l = NULL; int entropy_ctx = 0; dst = &pd->dst.buf[4 * (row * dst_stride + col)]; src = &p->src.buf[4 * (row * src_stride + col)]; src_diff = &p->src_diff[4 * (row * diff_stride + col)]; - if (args->enable_coeff_opt) { - a = &args->ta[col]; - l = &args->tl[row]; - entropy_ctx = combine_entropy_contexts(*a, *l); - } if (tx_size == TX_4X4) { tx_type = get_tx_type_4x4(get_plane_type(plane), xd, block); @@ -848,20 +854,42 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, // skip block condition should be handled before this is called. 
assert(!x->skip_block); + if (!x->skip_recode) { + const int tx_size_in_pixels = (1 << tx_size) << 2; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff, + diff_stride, src, src_stride, dst, dst_stride, + xd->bd); + } else { + vpx_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff, + diff_stride, src, src_stride, dst, dst_stride); + } +#else + vpx_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff, + diff_stride, src, src_stride, dst, dst_stride); +#endif + enable_trellis_opt = do_trellis_opt(args); + } + + if (enable_trellis_opt) { + a = &args->ta[col]; + l = &args->tl[row]; + entropy_ctx = combine_entropy_contexts(*a, *l); + } + #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); switch (tx_size) { case TX_32X32: if (!x->skip_recode) { - vpx_highbd_subtract_block(32, 32, src_diff, diff_stride, src, - src_stride, dst, dst_stride, xd->bd); highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vpx_highbd_quantize_b_32x32( coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) { @@ -870,8 +898,6 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, break; case TX_16X16: if (!x->skip_recode) { - vpx_highbd_subtract_block(16, 16, src_diff, diff_stride, src, - src_stride, dst, dst_stride, xd->bd); if (tx_type == DCT_DCT) vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); else @@ -880,7 +906,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) { @@ -890,8 +916,6 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, break; case TX_8X8: if (!x->skip_recode) { - vpx_highbd_subtract_block(8, 8, src_diff, diff_stride, src, - src_stride, dst, dst_stride, xd->bd); if (tx_type == DCT_DCT) vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); else @@ -900,7 +924,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) { @@ -911,8 +935,6 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, default: assert(tx_size == TX_4X4); if (!x->skip_recode) { - vpx_highbd_subtract_block(4, 4, src_diff, diff_stride, src, - src_stride, dst, dst_stride, xd->bd); if (tx_type != DCT_DCT) vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type); else @@ -921,7 +943,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) { @@ -945,14 +967,12 @@ void 
vp9_encode_block_intra(int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: if (!x->skip_recode) { - vpx_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, - dst_stride); fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) @@ -960,14 +980,12 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, break; case TX_16X16: if (!x->skip_recode) { - vpx_subtract_block(16, 16, src_diff, diff_stride, src, src_stride, dst, - dst_stride); vp9_fht16x16(src_diff, coeff, diff_stride, tx_type); vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) @@ -975,14 +993,12 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, break; case TX_8X8: if (!x->skip_recode) { - vpx_subtract_block(8, 8, src_diff, diff_stride, src, src_stride, dst, - dst_stride); vp9_fht8x8(src_diff, coeff, diff_stride, tx_type); vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) @@ -991,8 +1007,6 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, default: assert(tx_size == TX_4X4); if (!x->skip_recode) { - vpx_subtract_block(4, 4, src_diff, diff_stride, src, src_stride, dst, - dst_stride); if (tx_type != DCT_DCT) vp9_fht4x4(src_diff, coeff, diff_stride, tx_type); else @@ -1001,7 +1015,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) { @@ -1019,28 +1033,39 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, } void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane, - int enable_optimize_b) { + int enable_trellis_opt) { const MACROBLOCKD *const xd = &x->e_mbd; struct optimize_ctx ctx; #if CONFIG_MISMATCH_DEBUG // TODO(angiebird): make mismatch_debug support intra mode struct encode_b_args arg = { - x, enable_optimize_b, ctx.ta[plane], ctx.tl[plane], &xd->mi[0]->skip, 0, 0, - 0 + x, + enable_trellis_opt, + 0.0, // trellis_opt_thresh + ctx.ta[plane], + ctx.tl[plane], + &xd->mi[0]->skip, + 0, // mi_row + 0, // mi_col + 0 // output_enabled }; #else - struct encode_b_args arg = { x, enable_optimize_b, ctx.ta[plane], - ctx.tl[plane], &xd->mi[0]->skip }; + struct encode_b_args arg = { x, + enable_trellis_opt, + 0.0, // trellis_opt_thresh + ctx.ta[plane], + ctx.tl[plane], + &xd->mi[0]->skip }; #endif - if (enable_optimize_b && x->optimize && + if (enable_trellis_opt && x->optimize && (!x->skip_recode || !x->skip_optimize)) { const struct macroblockd_plane *const pd = 
&xd->plane[plane]; const TX_SIZE tx_size = plane ? get_uv_tx_size(xd->mi[0], pd) : xd->mi[0]->tx_size; vp9_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]); } else { - arg.enable_coeff_opt = 0; + arg.enable_trellis_opt = 0; } vp9_foreach_transformed_block_in_plane(xd, bsize, plane, diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h index 1975ee73ac..4091b02149 100644 --- a/vp9/encoder/vp9_encodemb.h +++ b/vp9/encoder/vp9_encodemb.h @@ -20,7 +20,8 @@ extern "C" { struct encode_b_args { MACROBLOCK *x; - int enable_coeff_opt; + int enable_trellis_opt; + double trellis_opt_thresh; ENTROPY_CONTEXT *ta; ENTROPY_CONTEXT *tl; int8_t *skip; @@ -48,7 +49,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg); void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane, - int enable_optimize_b); + int enable_trellis_opt); #ifdef __cplusplus } // extern "C" diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 77de5c8754..0e95037dc2 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1478,6 +1478,25 @@ static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) { } } +// Check if trellis coefficient optimization of the transform block is enabled. +static INLINE int do_trellis_opt(void *arg) { + const struct encode_b_args *const args = (struct encode_b_args *)arg; + const MACROBLOCK *const x = args->x; + const int enable_trellis_opt = args->enable_trellis_opt; + const double trellis_opt_thresh = args->trellis_opt_thresh; + + switch (enable_trellis_opt) { + case DISABLE_TRELLIS_OPT: return 0; + case ENABLE_TRELLIS_OPT: return 1; + case ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR: { + return (trellis_opt_thresh > 0.0) + ? (x->log_block_src_var <= trellis_opt_thresh) + : 1; + } + default: assert(0 && "Invalid trellis optimization method."); return 1; + } +} + #if CONFIG_COLLECT_COMPONENT_TIMING static INLINE void start_timing(VP9_COMP *cpi, int component) { vpx_usec_timer_start(&cpi->component_timer[component]); diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index f87ab3e0bc..3a68952916 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -562,7 +562,7 @@ static unsigned pixel_sse(const VP9_COMP *const cpi, const MACROBLOCKD *xd, return sse; } -// Compute the squares sum squares on all visible 4x4s in the transform block. +// Compute the sum of squares on all visible 4x4s in the transform block. 
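+// (A 4x4 sub-block is "visible" when it lies inside the picture; transform
+// blocks at the right/bottom frame boundary can extend past the edge.)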
static int64_t sum_squares_visible(const MACROBLOCKD *xd, const struct macroblockd_plane *const pd, const int16_t *diff, const int diff_stride, @@ -749,20 +749,34 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, const struct macroblockd_plane *const pd = &xd->plane[plane]; const int dst_stride = pd->dst.stride; const uint8_t *dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)]; + const int enable_trellis_opt = args->cpi->sf.trellis_opt_tx_rd.method; + const double trellis_opt_thresh = args->cpi->sf.trellis_opt_tx_rd.thresh; +#if CONFIG_MISMATCH_DEBUG + struct encode_b_args encode_b_arg = { + x, + enable_trellis_opt, + trellis_opt_thresh, + args->t_above, + args->t_left, + &mi->skip, + 0, // mi_row + 0, // mi_col + 0 // output_enabled + }; +#else + struct encode_b_args encode_b_arg = { x, + enable_trellis_opt, + trellis_opt_thresh, + args->t_above, + args->t_left, + &mi->skip }; +#endif if (args->exit_early) return; if (!is_inter_block(mi)) { -#if CONFIG_MISMATCH_DEBUG - struct encode_b_args intra_arg = { - x, x->block_qcoeff_opt, args->t_above, args->t_left, &mi->skip, 0, 0, 0 - }; -#else - struct encode_b_args intra_arg = { x, x->block_qcoeff_opt, args->t_above, - args->t_left, &mi->skip }; -#endif vp9_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size, - &intra_arg); + &encode_b_arg); if (recon) { uint8_t *rec_ptr = &recon->buf[4 * (blk_row * recon->stride + blk_col)]; copy_block_visible(xd, pd, dst, dst_stride, rec_ptr, recon->stride, @@ -803,9 +817,10 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, if (skip_txfm_flag == SKIP_TXFM_NONE || (recon && skip_txfm_flag == SKIP_TXFM_AC_ONLY)) { + const int enable_trellis_opt = do_trellis_opt(&encode_b_arg); // full forward transform and quantization vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size); - if (x->block_qcoeff_opt) + if (enable_trellis_opt) vp9_optimize_b(x, plane, block, tx_size, coeff_ctx); dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size, &dist, &sse, recon); diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index f47e3d71c9..d07bb34ae1 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -275,8 +275,10 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->allow_txfm_domain_distortion = 1; sf->tx_domain_thresh = tx_dom_thresholds[(speed < 6) ? speed : 5]; - sf->allow_quant_coeff_opt = sf->optimize_coefficients; - sf->quant_opt_thresh = qopt_thresholds[(speed < 6) ? speed : 5]; + sf->trellis_opt_tx_rd.method = sf->optimize_coefficients + ? ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR + : DISABLE_TRELLIS_OPT; + sf->trellis_opt_tx_rd.thresh = qopt_thresholds[(speed < 6) ? 
speed : 5]; sf->less_rectangular_check = 1; sf->use_rd_breakout = 1; sf->adaptive_motion_search = 1; @@ -470,8 +472,8 @@ static void set_rt_speed_feature_framesize_independent( if (speed >= 1) { sf->allow_txfm_domain_distortion = 1; sf->tx_domain_thresh = 0.0; - sf->allow_quant_coeff_opt = 0; - sf->quant_opt_thresh = 0.0; + sf->trellis_opt_tx_rd.method = DISABLE_TRELLIS_OPT; + sf->trellis_opt_tx_rd.thresh = 0.0; sf->use_square_partition_only = !frame_is_intra_only(cm); sf->less_rectangular_check = 1; sf->tx_size_search_method = @@ -946,8 +948,9 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) { sf->adaptive_interp_filter_search = 0; sf->allow_txfm_domain_distortion = 0; sf->tx_domain_thresh = 99.0; - sf->allow_quant_coeff_opt = sf->optimize_coefficients; - sf->quant_opt_thresh = 99.0; + sf->trellis_opt_tx_rd.method = + sf->optimize_coefficients ? ENABLE_TRELLIS_OPT : DISABLE_TRELLIS_OPT; + sf->trellis_opt_tx_rd.thresh = 99.0; sf->allow_acl = 1; sf->enable_tpl_model = oxcf->enable_tpl_model; sf->prune_ref_frame_for_rect_partitions = 0; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index e30a26084a..fceeb94023 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -246,6 +246,21 @@ typedef enum { USE_8_TAPS_SHARP, } SUBPEL_SEARCH_TYPE; +typedef enum { + // Disable trellis coefficient optimization + DISABLE_TRELLIS_OPT, + // Enable trellis coefficient optimization + ENABLE_TRELLIS_OPT, + // Enable trellis coefficient optimization based on source variance of the + // prediction block during transform RD + ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR, +} ENABLE_TRELLIS_OPT_METHOD; + +typedef struct TRELLIS_OPT_CONTROL { + ENABLE_TRELLIS_OPT_METHOD method; + double thresh; +} TRELLIS_OPT_CONTROL; + typedef struct SPEED_FEATURES { MV_SPEED_FEATURES mv; @@ -292,8 +307,8 @@ typedef struct SPEED_FEATURES { int coeff_prob_appx_step; // Enable uniform quantizer followed by trellis coefficient optimization - int allow_quant_coeff_opt; - double quant_opt_thresh; + // during transform RD + TRELLIS_OPT_CONTROL trellis_opt_tx_rd; // Enable asymptotic closed-loop encoding decision for key frame and // alternate reference frames. From 55e102dc54844b2f749395dbcd53d1e01b0e5030 Mon Sep 17 00:00:00 2001 From: Deepa K G Date: Thu, 2 Mar 2023 13:39:55 +0530 Subject: [PATCH 612/926] Skip trellis coeff opt based on tx block properties The trellis coefficient optimization is skipped for blocks with larger residual mse. 
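Concretely, the new gate in do_trellis_opt() reduces to the following for the
8-bit path (a condensed sketch of the vp9_encoder.h hunk below; thresh arrives
through args->trellis_opt_thresh):

    int visible_width, visible_height;
    const int qstep = pd->dequant[1] >> 3; /* dequant_shift == 3 at 8-bit */
    *sse = sum_squares_visible(xd, pd, src_diff, diff_stride, blk_row, blk_col,
                               plane_bsize, tx_bsize, &visible_width,
                               &visible_height);
    *sse_calc_done = 1;
    /* Run trellis only when the residual energy is at most
     * width * height * qstep^2 * thresh; otherwise skip it. */
    return *sse <= (int64_t)visible_width * visible_height * qstep * qstep *
                       args->trellis_opt_thresh;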
                    Instruction Count      BD-Rate Loss(%)
 cpu  Resolution     Reduction(%)     avg.psnr  ovr.psnr    ssim
  0   LOWRES2            9.467          0.0921    0.1057   0.0362
  0   MIDRES2            4.328         -0.0155    0.0694   0.0178
  0   HDRES2             1.858          0.0231    0.0214  -0.0034
  0   Average            5.218          0.0332    0.0655   0.0169

STATS_CHANGED

Change-Id: I321a9b1a34ebb59b7b6a065b5b2d717c8767a4a5
---
 vp9/encoder/vp9_encodemb.c       | 15 ++++-
 vp9/encoder/vp9_encodemb.h       |  2 +
 vp9/encoder/vp9_encoder.h        | 93 ++++++++++++++++++++++++++++--
 vp9/encoder/vp9_rdopt.c          | 97 +++++++++++--------------------
 vp9/encoder/vp9_speed_features.c |  4 ++
 vp9/encoder/vp9_speed_features.h |  3 +
 6 files changed, 141 insertions(+), 73 deletions(-)

diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 7c61419f8b..c079aa0547 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -763,6 +763,8 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col,
   struct encode_b_args arg = { x,
                                1,    // enable_trellis_opt
                                0.0,  // trellis_opt_thresh
+                               NULL, // &sse_calc_done
+                               NULL, // &sse
                                NULL, // above entropy context
                                NULL, // left entropy context
                                &mi->skip, mi_row, mi_col, output_enabled };
@@ -770,6 +772,8 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col,
   struct encode_b_args arg = { x,
                                1,    // enable_trellis_opt
                                0.0,  // trellis_opt_thresh
+                               NULL, // &sse_calc_done
+                               NULL, // &sse
                                NULL, // above entropy context
                                NULL, // left entropy context
                                &mi->skip };
@@ -869,7 +873,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
     vpx_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff,
                        diff_stride, src, src_stride, dst, dst_stride);
 #endif
-    enable_trellis_opt = do_trellis_opt(args);
+    enable_trellis_opt = do_trellis_opt(pd, src_diff, diff_stride, row, col,
+                                        plane_bsize, tx_size, args);
   }

   if (enable_trellis_opt) {
@@ -1041,7 +1046,9 @@ void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane,
   struct encode_b_args arg = {
     x,
     enable_trellis_opt,
-    0.0, // trellis_opt_thresh
+    0.0,  // trellis_opt_thresh
+    NULL, // &sse_calc_done
+    NULL, // &sse
     ctx.ta[plane],
     ctx.tl[plane],
     &xd->mi[0]->skip,
@@ -1052,7 +1059,9 @@ void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane,
 #else
   struct encode_b_args arg = { x,
                                enable_trellis_opt,
-                               0.0, // trellis_opt_thresh
+                               0.0,  // trellis_opt_thresh
+                               NULL, // &sse_calc_done
+                               NULL, // &sse
                                ctx.ta[plane],
                                ctx.tl[plane],
                                &xd->mi[0]->skip };
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index 4091b02149..1391446bed 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -22,6 +22,8 @@ struct encode_b_args {
   MACROBLOCK *x;
   int enable_trellis_opt;
   double trellis_opt_thresh;
+  int *sse_calc_done;
+  int64_t *sse;
   ENTROPY_CONTEXT *ta;
   ENTROPY_CONTEXT *tl;
   int8_t *skip;
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 0e95037dc2..442ef1899c 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -14,6 +14,7 @@
 #include <stdio.h>

 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx/internal/vpx_codec_internal.h"
 #include "vpx/vpx_ext_ratectrl.h"
 #include "vpx/vp8cx.h"
@@ -1478,21 +1479,101 @@ static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) {
   }
 }

+static INLINE int num_4x4_to_edge(int plane_4x4_dim, int mb_to_edge_dim,
+                                  int subsampling_dim, int blk_dim) {
+  return plane_4x4_dim + (mb_to_edge_dim >> (5 + subsampling_dim)) - blk_dim;
+}
+
+// Compute the sum of squares on all visible 4x4s in the transform block.
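+// (visible_width/visible_height report the clipped extent in pixels so that
+// do_trellis_opt() can scale its residual-mse budget to the visible area.)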
+static int64_t sum_squares_visible(const MACROBLOCKD *xd, + const struct macroblockd_plane *const pd, + const int16_t *diff, const int diff_stride, + int blk_row, int blk_col, + const BLOCK_SIZE plane_bsize, + const BLOCK_SIZE tx_bsize, + int *visible_width, int *visible_height) { + int64_t sse; + const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize]; + const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize]; + const int b4x4s_to_right_edge = num_4x4_to_edge( + plane_4x4_w, xd->mb_to_right_edge, pd->subsampling_x, blk_col); + const int b4x4s_to_bottom_edge = num_4x4_to_edge( + plane_4x4_h, xd->mb_to_bottom_edge, pd->subsampling_y, blk_row); + if (tx_bsize == BLOCK_4X4 || + (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) { + assert(tx_4x4_w == tx_4x4_h); + sse = (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, tx_4x4_w << 2); + *visible_width = tx_4x4_w << 2; + *visible_height = tx_4x4_h << 2; + } else { + int r, c; + const int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h); + const int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w); + sse = 0; + // if we are in the unrestricted motion border. + for (r = 0; r < max_r; ++r) { + // Skip visiting the sub blocks that are wholly within the UMV. + for (c = 0; c < max_c; ++c) { + sse += (int64_t)vpx_sum_squares_2d_i16( + diff + r * diff_stride * 4 + c * 4, diff_stride, 4); + } + } + *visible_width = max_c << 2; + *visible_height = max_r << 2; + } + return sse; +} + // Check if trellis coefficient optimization of the transform block is enabled. -static INLINE int do_trellis_opt(void *arg) { +static INLINE int do_trellis_opt(const struct macroblockd_plane *pd, + const int16_t *src_diff, int diff_stride, + int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg) { const struct encode_b_args *const args = (struct encode_b_args *)arg; const MACROBLOCK *const x = args->x; - const int enable_trellis_opt = args->enable_trellis_opt; - const double trellis_opt_thresh = args->trellis_opt_thresh; - switch (enable_trellis_opt) { + switch (args->enable_trellis_opt) { case DISABLE_TRELLIS_OPT: return 0; case ENABLE_TRELLIS_OPT: return 1; case ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR: { - return (trellis_opt_thresh > 0.0) - ? (x->log_block_src_var <= trellis_opt_thresh) + vpx_clear_system_state(); + + return (args->trellis_opt_thresh > 0.0) + ? (x->log_block_src_var <= args->trellis_opt_thresh) : 1; } + case ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE: { + const MACROBLOCKD *const xd = &x->e_mbd; + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; +#if CONFIG_VP9_HIGHBITDEPTH + const int dequant_shift = + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
xd->bd - 5 : 3; +#else + const int dequant_shift = 3; +#endif // CONFIG_VP9_HIGHBITDEPTH + const int qstep = pd->dequant[1] >> dequant_shift; + int *sse_calc_done = args->sse_calc_done; + int64_t *sse = args->sse; + int visible_width = 0, visible_height = 0; + + // TODO: Enable the sf for high bit-depth case + if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) || !sse || + !sse_calc_done) + return 1; + + *sse = sum_squares_visible(xd, pd, src_diff, diff_stride, blk_row, + blk_col, plane_bsize, tx_bsize, &visible_width, + &visible_height); + *sse_calc_done = 1; + + vpx_clear_system_state(); + + return (*(sse) <= (int64_t)visible_width * visible_height * qstep * + qstep * args->trellis_opt_thresh); + } default: assert(0 && "Invalid trellis optimization method."); return 1; } } diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 3a68952916..88e7b538dc 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -457,11 +457,6 @@ static int cost_coeffs(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, return cost; } -static INLINE int num_4x4_to_edge(int plane_4x4_dim, int mb_to_edge_dim, - int subsampling_dim, int blk_dim) { - return plane_4x4_dim + (mb_to_edge_dim >> (5 + subsampling_dim)) - blk_dim; -} - // Copy all visible 4x4s in the transform block. static void copy_block_visible(const MACROBLOCKD *xd, const struct macroblockd_plane *const pd, @@ -562,47 +557,11 @@ static unsigned pixel_sse(const VP9_COMP *const cpi, const MACROBLOCKD *xd, return sse; } -// Compute the sum of squares on all visible 4x4s in the transform block. -static int64_t sum_squares_visible(const MACROBLOCKD *xd, - const struct macroblockd_plane *const pd, - const int16_t *diff, const int diff_stride, - int blk_row, int blk_col, - const BLOCK_SIZE plane_bsize, - const BLOCK_SIZE tx_bsize) { - int64_t sse; - const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; - const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; - const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize]; - const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize]; - int b4x4s_to_right_edge = num_4x4_to_edge(plane_4x4_w, xd->mb_to_right_edge, - pd->subsampling_x, blk_col); - int b4x4s_to_bottom_edge = num_4x4_to_edge(plane_4x4_h, xd->mb_to_bottom_edge, - pd->subsampling_y, blk_row); - if (tx_bsize == BLOCK_4X4 || - (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) { - assert(tx_4x4_w == tx_4x4_h); - sse = (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, tx_4x4_w << 2); - } else { - int r, c; - int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h); - int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w); - sse = 0; - // if we are in the unrestricted motion border. - for (r = 0; r < max_r; ++r) { - // Skip visiting the sub blocks that are wholly within the UMV. 
- for (c = 0; c < max_c; ++c) { - sse += (int64_t)vpx_sum_squares_2d_i16( - diff + r * diff_stride * 4 + c * 4, diff_stride, 4); - } - } - } - return sse; -} - static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col, TX_SIZE tx_size, int64_t *out_dist, - int64_t *out_sse, struct buf_2d *out_recon) { + int64_t *out_sse, struct buf_2d *out_recon, + int sse_calc_done) { MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; @@ -652,8 +611,12 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); unsigned int tmp; - tmp = pixel_sse(cpi, xd, pd, src, src_stride, dst, dst_stride, blk_row, - blk_col, plane_bsize, tx_bsize); + if (sse_calc_done) { + tmp = (unsigned int)(*out_sse); + } else { + tmp = pixel_sse(cpi, xd, pd, src, src_stride, dst, dst_stride, blk_row, + blk_col, plane_bsize, tx_bsize); + } *out_sse = (int64_t)tmp * 16; if (out_recon) { const int out_recon_idx = 4 * (blk_row * out_recon->stride + blk_col); @@ -751,25 +714,20 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, const uint8_t *dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)]; const int enable_trellis_opt = args->cpi->sf.trellis_opt_tx_rd.method; const double trellis_opt_thresh = args->cpi->sf.trellis_opt_tx_rd.thresh; + int sse_calc_done = 0; #if CONFIG_MISMATCH_DEBUG struct encode_b_args encode_b_arg = { - x, - enable_trellis_opt, - trellis_opt_thresh, - args->t_above, - args->t_left, - &mi->skip, + x, enable_trellis_opt, trellis_opt_thresh, &sse_calc_done, + &sse, args->t_above, args->t_left, &mi->skip, 0, // mi_row 0, // mi_col 0 // output_enabled }; #else - struct encode_b_args encode_b_arg = { x, - enable_trellis_opt, - trellis_opt_thresh, - args->t_above, - args->t_left, - &mi->skip }; + struct encode_b_args encode_b_arg = { + x, enable_trellis_opt, trellis_opt_thresh, &sse_calc_done, + &sse, args->t_above, args->t_left, &mi->skip + }; #endif if (args->exit_early) return; @@ -784,16 +742,21 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, } if (x->block_tx_domain) { dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, - tx_size, &dist, &sse, /*recon =*/0); + tx_size, &dist, &sse, /*recon =*/0, sse_calc_done); } else { const struct macroblock_plane *const p = &x->plane[plane]; const int src_stride = p->src.stride; - const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; const uint8_t *src = &p->src.buf[4 * (blk_row * src_stride + blk_col)]; - const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)]; unsigned int tmp; - sse = sum_squares_visible(xd, pd, diff, diff_stride, blk_row, blk_col, - plane_bsize, tx_bsize); + if (!sse_calc_done) { + const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + const int16_t *diff = + &p->src_diff[4 * (blk_row * diff_stride + blk_col)]; + int visible_width, visible_height; + sse = sum_squares_visible(xd, pd, diff, diff_stride, blk_row, blk_col, + plane_bsize, tx_bsize, &visible_width, + &visible_height); + } #if CONFIG_VP9_HIGHBITDEPTH if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && (xd->bd > 8)) sse = ROUND64_POWER_OF_TWO(sse, (xd->bd - 8) * 2); @@ -817,13 +780,19 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, if (skip_txfm_flag == SKIP_TXFM_NONE || (recon && skip_txfm_flag == 
SKIP_TXFM_AC_ONLY)) { - const int enable_trellis_opt = do_trellis_opt(&encode_b_arg); + const struct macroblock_plane *const p = &x->plane[plane]; + const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + const int16_t *const diff = + &p->src_diff[4 * (blk_row * diff_stride + blk_col)]; + const int enable_trellis_opt = + do_trellis_opt(pd, diff, diff_stride, blk_row, blk_col, plane_bsize, + tx_size, &encode_b_arg); // full forward transform and quantization vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size); if (enable_trellis_opt) vp9_optimize_b(x, plane, block, tx_size, coeff_ctx); dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, - tx_size, &dist, &sse, recon); + tx_size, &dist, &sse, recon, sse_calc_done); } else if (skip_txfm_flag == SKIP_TXFM_AC_ONLY) { // compute DC coefficient tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block); diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index d07bb34ae1..3e121b799f 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -229,6 +229,10 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->use_square_partition_only = !boosted; sf->early_term_interp_search_plane_rd = 1; sf->cb_pred_filter_search = 1; + sf->trellis_opt_tx_rd.method = sf->optimize_coefficients + ? ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE + : DISABLE_TRELLIS_OPT; + sf->trellis_opt_tx_rd.thresh = boosted ? 4.0 : 3.0; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index fceeb94023..d32bf09e4e 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -254,6 +254,9 @@ typedef enum { // Enable trellis coefficient optimization based on source variance of the // prediction block during transform RD ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR, + // Enable trellis coefficient optimization based on residual mse of the + // transform block during transform RD + ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE, } ENABLE_TRELLIS_OPT_METHOD; typedef struct TRELLIS_OPT_CONTROL { From 9c15fb62b3dfe1c698dc28f9efedb022b0ef8eb8 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 17 Mar 2023 14:34:42 -0400 Subject: [PATCH 613/926] Add codec control to get tpl stats Add command line flag to vpxenc to export tpl stats Bug: b/273736974 Change-Id: I6980096531b0c12fbf7a307fdef4c562d0c29e32 --- vp9/vp9_cx_iface.c | 23 +++++++++++++++++++++++ vpx/vp8cx.h | 9 +++++++++ vpxenc.c | 20 ++++++++++++++++++-- vpxenc.h | 1 + 4 files changed, 51 insertions(+), 2 deletions(-) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 4c7eaed725..ec2105b24b 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1788,6 +1788,28 @@ static vpx_codec_err_t ctrl_get_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_get_tpl_stats(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + VP9_COMMON *const cm = &cpi->common; + TplDepFrame **data = va_arg(args, TplDepFrame **); + int i; + *data = vpx_calloc(MAX_ARF_GOP_SIZE, sizeof(TplDepFrame)); + for (i = 0; i < MAX_ARF_GOP_SIZE; i++) { + const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows); + const int copy_size = mi_cols * mi_rows * sizeof(*(*data)[i].tpl_stats_ptr); + (*data)[i] = cpi->tpl_stats[i]; + (*data)[i].tpl_stats_ptr = NULL; + (*data)[i].tpl_stats_ptr = + vpx_calloc(mi_rows * 
mi_cols, sizeof(*(*data)[i].tpl_stats_ptr)); + memcpy((*data)[i].tpl_stats_ptr, cpi->tpl_stats[i].tpl_stats_ptr, + copy_size); + } + + return VPX_CODEC_OK; +} + static vpx_codec_err_t ctrl_set_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, va_list args) { VP9_COMP *const cpi = ctx->cpi; @@ -2035,6 +2057,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP9E_GET_ACTIVEMAP, ctrl_get_active_map }, { VP9E_GET_LEVEL, ctrl_get_level }, { VP9E_GET_SVC_REF_FRAME_CONFIG, ctrl_get_svc_ref_frame_config }, + { VP9E_GET_TPL_STATS, ctrl_get_tpl_stats }, { -1, NULL }, }; diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index e0b679fbb7..01c0558673 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -767,6 +767,13 @@ enum vp8e_enc_control_id { * */ VP9E_SET_QUANTIZER_ONE_PASS, + + /*!\brief Codec control to get TPL stats for the current frame. + * + * Supported in codecs: VP9 + * + */ + VP9E_GET_TPL_STATS, }; /*!\brief vpx 1-D scaling mode @@ -1097,6 +1104,8 @@ VPX_CTRL_USE_TYPE(VP8E_SET_RTC_EXTERNAL_RATECTRL, int) #define VPX_CTRL_VP8E_SET_RTC_EXTERNAL_RATECTRL VPX_CTRL_USE_TYPE(VP9E_SET_QUANTIZER_ONE_PASS, int) #define VPX_CTRL_VP9E_SET_QUANTIZER_ONE_PASS +VPX_CTRL_USE_TYPE(VP9E_GET_TPL_STATS, void *) +#define VPX_CTRL_VP9E_GET_TPL_STATS /*!\endcond */ /*! @} - end defgroup vp8_encoder */ diff --git a/vpxenc.c b/vpxenc.c index 61672acadd..9d57708f37 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -39,6 +39,10 @@ #include "vpx/vp8dx.h" #endif +#if CONFIG_VP9_ENCODER +#include "vp9/encoder/vp9_encoder.h" +#endif + #include "vpx/vpx_integer.h" #include "vpx_ports/mem_ops.h" #include "vpx_ports/vpx_timer.h" @@ -161,6 +165,8 @@ static const arg_def_t disable_warnings = static const arg_def_t disable_warning_prompt = ARG_DEF("y", "disable-warning-prompt", 0, "Display warnings, but do not prompt user to continue."); +static const arg_def_t export_tpl_stats = + ARG_DEF(NULL, "export-tpl-stats", 0, "Export TPL stats of vp9 encoder"); #if CONFIG_VP9_HIGHBITDEPTH static const arg_def_t test16bitinternalarg = ARG_DEF( @@ -191,6 +197,7 @@ static const arg_def_t *main_args[] = { &help, &disable_warnings, &disable_warning_prompt, &recontest, + &export_tpl_stats, NULL }; static const arg_def_t usage = @@ -531,9 +538,7 @@ static const arg_def_t disable_loopfilter = "1: Loopfilter off for non reference frames\n" " " "2: Loopfilter off for all frames"); -#endif -#if CONFIG_VP9_ENCODER static const arg_def_t *vp9_args[] = { &cpu_used_vp9, &auto_altref_vp9, &sharpness, @@ -804,6 +809,8 @@ static void parse_global_config(struct VpxEncoderConfig *global, char **argv) { global->disable_warnings = 1; else if (arg_match(&arg, &disable_warning_prompt, argi)) global->disable_warning_prompt = 1; + else if (arg_match(&arg, &export_tpl_stats, argi)) + global->export_tpl_stats = 1; else argj++; } @@ -1982,6 +1989,15 @@ int main(int argc, const char **argv_) { if (got_data && global.test_decode != TEST_DECODE_OFF) FOREACH_STREAM(test_decode(stream, global.test_decode, global.codec)); + +#if CONFIG_VP9_ENCODER + if (got_data && global.export_tpl_stats) { + TplDepFrame *tpl_stats = NULL; + FOREACH_STREAM(vpx_codec_control(&stream->encoder, VP9E_GET_TPL_STATS, + &tpl_stats)); + vpx_free(tpl_stats); + } +#endif } fflush(stdout); diff --git a/vpxenc.h b/vpxenc.h index be54840f7d..f065f086db 100644 --- a/vpxenc.h +++ b/vpxenc.h @@ -56,6 +56,7 @@ struct VpxEncoderConfig { int disable_warnings; int disable_warning_prompt; int experimental_bitstream; + int export_tpl_stats; }; #ifdef __cplusplus From e4f0df53ece296c4cb7c7d7911025e020bc6e882 Mon Sep 17 
00:00:00 2001 From: James Zern Date: Mon, 20 Mar 2023 16:43:47 -0700 Subject: [PATCH 614/926] vp8_sixtap_predict16x16_neon: fix overread Shift the final read from the source by 3 to avoid breaking the assumption that the 6-tap filter needs only 5 pixels outside of the macroblock; this matches the sse2 and ssse3 implementations. It's possible this restriction could be removed if the source buffers are assumed to be padded. Bug: webm:1795 Change-Id: I4c791e3a214898a503c78f4cedca154c75cdbaef Fixed: webm:1795 --- test/predict_test.cc | 4 +--- vp8/common/arm/neon/sixtappredict_neon.c | 8 +++----- vp8/common/rtcd_defs.pl | 4 +--- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/test/predict_test.cc b/test/predict_test.cc index e49d98272e..7472970576 100644 --- a/test/predict_test.cc +++ b/test/predict_test.cc @@ -307,9 +307,7 @@ INSTANTIATE_TEST_SUITE_P( #if HAVE_NEON INSTANTIATE_TEST_SUITE_P( NEON, SixtapPredictTest, - ::testing::Values(/*TODO(https://crbug.com/webm/1795): enable this after - buffer overflows are fixed. - make_tuple(16, 16, &vp8_sixtap_predict16x16_neon),*/ + ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_neon), make_tuple(8, 8, &vp8_sixtap_predict8x8_neon), make_tuple(8, 4, &vp8_sixtap_predict8x4_neon), make_tuple(4, 4, &vp8_sixtap_predict4x4_neon))); diff --git a/vp8/common/arm/neon/sixtappredict_neon.c b/vp8/common/arm/neon/sixtappredict_neon.c index 4960d16516..b15cfb4112 100644 --- a/vp8/common/arm/neon/sixtappredict_neon.c +++ b/vp8/common/arm/neon/sixtappredict_neon.c @@ -1253,9 +1253,6 @@ void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, return; } -// TODO(https://crbug.com/webm/1795): enable this after buffer overflows are -// fixed. -#if 0 void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, @@ -1507,7 +1504,9 @@ void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, src += src_pixels_per_line; d12u8 = vld1_u8(src); d13u8 = vld1_u8(src + 8); - d14u8 = vld1_u8(src + 16); + // Only 5 pixels are needed, avoid a potential out of bounds read. + d14u8 = vld1_u8(src + 13); + d14u8 = vext_u8(d14u8, d14u8, 3); src += src_pixels_per_line; __builtin_prefetch(src); @@ -1731,4 +1730,3 @@ void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, } return; } -#endif // 0 diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 05e67ce11b..739a612847 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -146,9 +146,7 @@ () # Subpixel # add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; -# TODO(https://crbug.com/webm/1795): enable neon after buffer overflows are -# fixed. 
-specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 dspr2 msa mmi lsx/; +specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 neon dspr2 msa mmi lsx/; add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; specialize qw/vp8_sixtap_predict8x8 sse2 ssse3 neon dspr2 msa mmi lsx/; From faa9142f5d5af0a6c4b998929aac928a38515ae3 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 20 Mar 2023 16:56:58 -0700 Subject: [PATCH 615/926] sixtappredict_neon.c,cosmetics: fix a typo Change-Id: If3e4cf372fc6ed076f0d42c435a72262494aab68 --- vp8/common/arm/neon/sixtappredict_neon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp8/common/arm/neon/sixtappredict_neon.c b/vp8/common/arm/neon/sixtappredict_neon.c index b15cfb4112..a7cf43b5f9 100644 --- a/vp8/common/arm/neon/sixtappredict_neon.c +++ b/vp8/common/arm/neon/sixtappredict_neon.c @@ -16,7 +16,7 @@ #include "vpx_ports/mem.h" static const int8_t vp8_sub_pel_filters[8][8] = { - { 0, 0, 128, 0, 0, 0, 0, 0 }, /* note that 1/8 pel positionyys are */ + { 0, 0, 128, 0, 0, 0, 0, 0 }, /* note that 1/8 pel positions are */ { 0, -6, 123, 12, -1, 0, 0, 0 }, /* just as per alpha -0.5 bicubic */ { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */ { 0, -9, 93, 50, -6, 0, 0, 0 }, From 44250287fb3d0c51118478a447fa032cab6e3700 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 20 Mar 2023 16:58:28 -0700 Subject: [PATCH 616/926] sixtappredict_neon.c: remove redundant returns Change-Id: I650b305c2599fc32353daba030e6241d330796a7 --- vp8/common/arm/neon/sixtappredict_neon.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/vp8/common/arm/neon/sixtappredict_neon.c b/vp8/common/arm/neon/sixtappredict_neon.c index a7cf43b5f9..ee3c281f0f 100644 --- a/vp8/common/arm/neon/sixtappredict_neon.c +++ b/vp8/common/arm/neon/sixtappredict_neon.c @@ -781,7 +781,6 @@ void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line, vst1_u8(dst_ptr, d8u8); dst_ptr += dst_pitch; vst1_u8(dst_ptr, d9u8); - return; } void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, @@ -1250,7 +1249,6 @@ void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, vst1_u8(dst_ptr, d9u8); dst_ptr += dst_pitch; } - return; } void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, @@ -1728,5 +1726,4 @@ void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, dst += dst_pitch; } } - return; } From 1c37aefcbd0aebc1f2043b3e9d2c9fd61e6275ef Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 20 Mar 2023 17:09:42 -0700 Subject: [PATCH 617/926] svc_encodeframe.c: fix -Wstringop-truncation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit use sizeof(buf) - 1 with strncpy. 
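The pattern in isolation (a minimal sketch; the helper name is
illustrative, not from the tree):

  #include <string.h>

  // strncpy() does not write a terminating NUL when it truncates, so copy
  // at most size - 1 bytes and terminate explicitly (size must be > 0).
  static void copy_string_truncate(char *dst, size_t size, const char *src) {
    strncpy(dst, src, size - 1);
    dst[size - 1] = '\0';
  }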
fixes: examples/svc_encodeframe.c:282:3: warning: ‘strncpy’ specified bound 1024 equals destination size [-Wstringop-truncation] 282 | strncpy(si->options, options, sizeof(si->options)); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Change-Id: I46980872f9865ae1dc2b56330c3a65d8bc6cf1f7 --- examples/svc_encodeframe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/svc_encodeframe.c b/examples/svc_encodeframe.c index 003096e701..c2b3ec9798 100644 --- a/examples/svc_encodeframe.c +++ b/examples/svc_encodeframe.c @@ -279,7 +279,7 @@ vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options) { if (svc_ctx == NULL || options == NULL || si == NULL) { return VPX_CODEC_INVALID_PARAM; } - strncpy(si->options, options, sizeof(si->options)); + strncpy(si->options, options, sizeof(si->options) - 1); si->options[sizeof(si->options) - 1] = '\0'; return VPX_CODEC_OK; } From 3b6909977c99580b565b4150823b3f9c17c39bc0 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 20 Mar 2023 17:28:11 -0700 Subject: [PATCH 618/926] test.mk: use CONFIG_VP(8|9)_ENCODER for vp8/vp9-only tests fixes some uninstantiated test failures when configured with --disable-vp8 or --disable-vp9 Change-Id: If9a6705bd070edee02306e89da103ed474688ec8 --- test/test.mk | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test.mk b/test/test.mk index 3c225bc750..bbcdd0c6e4 100644 --- a/test/test.mk +++ b/test/test.mk @@ -22,10 +22,6 @@ LIBVPX_TEST_SRCS-yes += ../md5_utils.h ../md5_utils.c LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ivf_video_source.h LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += ../y4minput.h ../y4minput.c LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += altref_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += aq_segment_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += alt_ref_aq_segment_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += vp8_datarate_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += vp9_datarate_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += encode_api_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += error_resilience_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += i420_video_source.h @@ -37,6 +33,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += yuv_video_source.h LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += config_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += cq_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_datarate_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += byte_alignment_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += decode_svc_test.cc @@ -44,6 +41,8 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += user_priv_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_refresh_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += alt_ref_aq_segment_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += aq_segment_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += frame_size_tests.cc @@ -58,6 +57,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.h LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_end_to_end_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += timestamp_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_datarate_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ext_ratectrl_test.cc 
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += ../vp9/simple_encode.h From 5c7867beacb35f8f937ad03f8ca5e2f1ae9c7a6a Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Fri, 10 Mar 2023 16:30:36 +0000 Subject: [PATCH 619/926] Add Neon implementations of vpx_highbd_avg_x_c Add Neon implementation of vpx_highbd_avg_4x4_c and vpx_highbd_avg_8x8_c as well as the corresponding tests. Change-Id: Ib1b06af5206774347690c9c56e194b76aa409c91 --- test/avg_test.cc | 7 +++++++ vpx_dsp/arm/highbd_avg_neon.c | 24 ++++++++++++++++++++++++ vpx_dsp/arm/mem_neon.h | 21 +++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 ++-- 4 files changed, 54 insertions(+), 2 deletions(-) diff --git a/test/avg_test.cc b/test/avg_test.cc index dd84403324..a0428304a2 100644 --- a/test/avg_test.cc +++ b/test/avg_test.cc @@ -582,6 +582,13 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_sse2))); #endif // HAVE_SSE2 +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, AverageTestHBD, + ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_highbd_avg_8x8_neon), + make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_neon))); +#endif // HAVE_NEON + INSTANTIATE_TEST_SUITE_P(C, SatdHighbdTest, ::testing::Values(make_tuple(16, &vpx_satd_c), make_tuple(64, &vpx_satd_c), diff --git a/vpx_dsp/arm/highbd_avg_neon.c b/vpx_dsp/arm/highbd_avg_neon.c index b84a7875d4..fc10197d71 100644 --- a/vpx_dsp/arm/highbd_avg_neon.c +++ b/vpx_dsp/arm/highbd_avg_neon.c @@ -16,6 +16,30 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" +uint32_t vpx_highbd_avg_4x4_neon(const uint8_t *a, int a_stride) { + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a); + const uint16x8_t a0 = load_unaligned_u16q(a_ptr + 0 * a_stride, a_stride); + const uint16x8_t a1 = load_unaligned_u16q(a_ptr + 2 * a_stride, a_stride); + return (horizontal_add_uint16x8(vaddq_u16(a0, a1)) + (1 << 3)) >> 4; +} + +uint32_t vpx_highbd_avg_8x8_neon(const uint8_t *a, int a_stride) { + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a); + uint16x8_t sum, a0, a1, a2, a3, a4, a5, a6, a7; + + load_u16_8x8(a_ptr, a_stride, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + sum = vaddq_u16(a0, a1); + sum = vaddq_u16(sum, a2); + sum = vaddq_u16(sum, a3); + sum = vaddq_u16(sum, a4); + sum = vaddq_u16(sum, a5); + sum = vaddq_u16(sum, a6); + sum = vaddq_u16(sum, a7); + + return (horizontal_add_uint16x8(sum) + (1 << 5)) >> 6; +} + // coeff: 32 bits, dynamic range [-2147483648, 2147483647]. // length: value range {16, 64, 256, 1024}. 
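// (Worked out: |coeff| <= 2^31 and length <= 1024 = 2^10, so the
// accumulated satd magnitude fits in 31 + 10 = 41 bits plus a sign bit,
// matching the 42-bit dynamic range quoted below.)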
// satd: 42 bits, dynamic range [-2147483648 * 1024, 2147483647 * 1024] diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index fa14f80b23..1a20da70ef 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -419,4 +419,25 @@ static INLINE void store_u8_16x8(uint8_t *s, const ptrdiff_t p, vst1q_u8(s, s7); } +static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p, + uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2, + uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5, + uint16x8_t *s6, uint16x8_t *s7) { + *s0 = vld1q_u16(s); + s += p; + *s1 = vld1q_u16(s); + s += p; + *s2 = vld1q_u16(s); + s += p; + *s3 = vld1q_u16(s); + s += p; + *s4 = vld1q_u16(s); + s += p; + *s5 = vld1q_u16(s); + s += p; + *s6 = vld1q_u16(s); + s += p; + *s7 = vld1q_u16(s); +} + #endif // VPX_VPX_DSP_ARM_MEM_NEON_H_ diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 7cd3a0be89..6637186f81 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -995,10 +995,10 @@ () # Avg # add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *s8, int p"; - specialize qw/vpx_highbd_avg_8x8 sse2/; + specialize qw/vpx_highbd_avg_8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *s8, int p"; - specialize qw/vpx_highbd_avg_4x4 sse2/; + specialize qw/vpx_highbd_avg_4x4 sse2 neon/; add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max"; specialize qw/vpx_highbd_minmax_8x8 neon/; From 78bb8e1c0a9b386b983c4e7cadf2ffb7b3b52bd5 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 22 Mar 2023 20:18:39 +0000 Subject: [PATCH 620/926] Revert "Add codec control to get tpl stats" This reverts commit 9c15fb62b3dfe1c698dc28f9efedb022b0ef8eb8. 
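For reference, the reverted control also made its caller responsible for
nested allocations; a sketch of the cleanup the vpxenc side would have
needed (assuming the original control's behavior, which vpx_calloc()'d one
tpl_stats_ptr per frame in addition to the outer array):

  TplDepFrame *tpl_stats = NULL;
  if (vpx_codec_control(&stream->encoder, VP9E_GET_TPL_STATS, &tpl_stats) ==
          VPX_CODEC_OK &&
      tpl_stats != NULL) {
    int i;
    // Freeing only the outer array, as the reverted vpxenc change did,
    // leaks the per-frame tpl_stats_ptr buffers.
    for (i = 0; i < MAX_ARF_GOP_SIZE; ++i)
      vpx_free(tpl_stats[i].tpl_stats_ptr);
    vpx_free(tpl_stats);
  }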
Reason for revert: vpxenc should only use public interface Original change's description: > Add codec control to get tpl stats > > Add command line flag to vpxenc to export tpl stats > > Bug: b/273736974 > Change-Id: I6980096531b0c12fbf7a307fdef4c562d0c29e32 Bug: b/273736974 Change-Id: Ifa8951bb34e5936bbfc33086b22e9fc36d379bc9 --- vp9/vp9_cx_iface.c | 23 ----------------------- vpx/vp8cx.h | 9 --------- vpxenc.c | 20 ++------------------ vpxenc.h | 1 - 4 files changed, 2 insertions(+), 51 deletions(-) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index ec2105b24b..4c7eaed725 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1788,28 +1788,6 @@ static vpx_codec_err_t ctrl_get_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } -static vpx_codec_err_t ctrl_get_tpl_stats(vpx_codec_alg_priv_t *ctx, - va_list args) { - VP9_COMP *const cpi = ctx->cpi; - VP9_COMMON *const cm = &cpi->common; - TplDepFrame **data = va_arg(args, TplDepFrame **); - int i; - *data = vpx_calloc(MAX_ARF_GOP_SIZE, sizeof(TplDepFrame)); - for (i = 0; i < MAX_ARF_GOP_SIZE; i++) { - const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); - const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows); - const int copy_size = mi_cols * mi_rows * sizeof(*(*data)[i].tpl_stats_ptr); - (*data)[i] = cpi->tpl_stats[i]; - (*data)[i].tpl_stats_ptr = NULL; - (*data)[i].tpl_stats_ptr = - vpx_calloc(mi_rows * mi_cols, sizeof(*(*data)[i].tpl_stats_ptr)); - memcpy((*data)[i].tpl_stats_ptr, cpi->tpl_stats[i].tpl_stats_ptr, - copy_size); - } - - return VPX_CODEC_OK; -} - static vpx_codec_err_t ctrl_set_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, va_list args) { VP9_COMP *const cpi = ctx->cpi; @@ -2057,7 +2035,6 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP9E_GET_ACTIVEMAP, ctrl_get_active_map }, { VP9E_GET_LEVEL, ctrl_get_level }, { VP9E_GET_SVC_REF_FRAME_CONFIG, ctrl_get_svc_ref_frame_config }, - { VP9E_GET_TPL_STATS, ctrl_get_tpl_stats }, { -1, NULL }, }; diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 01c0558673..e0b679fbb7 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -767,13 +767,6 @@ enum vp8e_enc_control_id { * */ VP9E_SET_QUANTIZER_ONE_PASS, - - /*!\brief Codec control to get TPL stats for the current frame. - * - * Supported in codecs: VP9 - * - */ - VP9E_GET_TPL_STATS, }; /*!\brief vpx 1-D scaling mode @@ -1104,8 +1097,6 @@ VPX_CTRL_USE_TYPE(VP8E_SET_RTC_EXTERNAL_RATECTRL, int) #define VPX_CTRL_VP8E_SET_RTC_EXTERNAL_RATECTRL VPX_CTRL_USE_TYPE(VP9E_SET_QUANTIZER_ONE_PASS, int) #define VPX_CTRL_VP9E_SET_QUANTIZER_ONE_PASS -VPX_CTRL_USE_TYPE(VP9E_GET_TPL_STATS, void *) -#define VPX_CTRL_VP9E_GET_TPL_STATS /*!\endcond */ /*! 
@} - end defgroup vp8_encoder */ diff --git a/vpxenc.c b/vpxenc.c index 9d57708f37..61672acadd 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -39,10 +39,6 @@ #include "vpx/vp8dx.h" #endif -#if CONFIG_VP9_ENCODER -#include "vp9/encoder/vp9_encoder.h" -#endif - #include "vpx/vpx_integer.h" #include "vpx_ports/mem_ops.h" #include "vpx_ports/vpx_timer.h" @@ -165,8 +161,6 @@ static const arg_def_t disable_warnings = static const arg_def_t disable_warning_prompt = ARG_DEF("y", "disable-warning-prompt", 0, "Display warnings, but do not prompt user to continue."); -static const arg_def_t export_tpl_stats = - ARG_DEF(NULL, "export-tpl-stats", 0, "Export TPL stats of vp9 encoder"); #if CONFIG_VP9_HIGHBITDEPTH static const arg_def_t test16bitinternalarg = ARG_DEF( @@ -197,7 +191,6 @@ static const arg_def_t *main_args[] = { &help, &disable_warnings, &disable_warning_prompt, &recontest, - &export_tpl_stats, NULL }; static const arg_def_t usage = @@ -538,7 +531,9 @@ static const arg_def_t disable_loopfilter = "1: Loopfilter off for non reference frames\n" " " "2: Loopfilter off for all frames"); +#endif +#if CONFIG_VP9_ENCODER static const arg_def_t *vp9_args[] = { &cpu_used_vp9, &auto_altref_vp9, &sharpness, @@ -809,8 +804,6 @@ static void parse_global_config(struct VpxEncoderConfig *global, char **argv) { global->disable_warnings = 1; else if (arg_match(&arg, &disable_warning_prompt, argi)) global->disable_warning_prompt = 1; - else if (arg_match(&arg, &export_tpl_stats, argi)) - global->export_tpl_stats = 1; else argj++; } @@ -1989,15 +1982,6 @@ int main(int argc, const char **argv_) { if (got_data && global.test_decode != TEST_DECODE_OFF) FOREACH_STREAM(test_decode(stream, global.test_decode, global.codec)); - -#if CONFIG_VP9_ENCODER - if (got_data && global.export_tpl_stats) { - TplDepFrame *tpl_stats = NULL; - FOREACH_STREAM(vpx_codec_control(&stream->encoder, VP9E_GET_TPL_STATS, - &tpl_stats)); - vpx_free(tpl_stats); - } -#endif } fflush(stdout); diff --git a/vpxenc.h b/vpxenc.h index f065f086db..be54840f7d 100644 --- a/vpxenc.h +++ b/vpxenc.h @@ -56,7 +56,6 @@ struct VpxEncoderConfig { int disable_warnings; int disable_warning_prompt; int experimental_bitstream; - int export_tpl_stats; }; #ifdef __cplusplus From 5817bce969f2845493f22f85ca5f19c70adc1c2f Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Mon, 20 Mar 2023 16:05:11 -0700 Subject: [PATCH 621/926] Fix comment typos (likely copy-and-paste errors) Fix comment typos for vpx_codec_destroy() and vpx_codec_enc_init_ver(). Based on the change made in libaom: https://aomedia.googlesource.com/aom/+/365a968684 365a968684 Fix comment typos (likely copy-and-paste errors) Change-Id: I39edae835ed0752b569e8e7328d0709c59724ac2 --- vpx/vpx_codec.h | 8 +++++--- vpx/vpx_decoder.h | 2 +- vpx/vpx_encoder.h | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/vpx/vpx_codec.h b/vpx/vpx_codec.h index b0a931e019..11bf8aaa22 100644 --- a/vpx/vpx_codec.h +++ b/vpx/vpx_codec.h @@ -345,9 +345,11 @@ const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx); * \param[in] ctx Pointer to this instance's context * * \retval #VPX_CODEC_OK - * The codec algorithm initialized. - * \retval #VPX_CODEC_MEM_ERROR - * Memory allocation failed. + * The codec instance has been destroyed. + * \retval #VPX_CODEC_INVALID_PARAM + * ctx is a null pointer. + * \retval #VPX_CODEC_ERROR + * Codec context not initialized. 
*/ vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx); diff --git a/vpx/vpx_decoder.h b/vpx/vpx_decoder.h index 39e5f585f6..99dd8cf694 100644 --- a/vpx/vpx_decoder.h +++ b/vpx/vpx_decoder.h @@ -127,7 +127,7 @@ typedef struct vpx_codec_dec_cfg { * \param[in] ver ABI version number. Must be set to * VPX_DECODER_ABI_VERSION * \retval #VPX_CODEC_OK - * The decoder algorithm initialized. + * The decoder algorithm has been initialized. * \retval #VPX_CODEC_MEM_ERROR * Memory allocation failed. */ diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index efaf5ef366..a0d2c87558 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -906,7 +906,7 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, * \param[in] ver ABI version number. Must be set to * VPX_ENCODER_ABI_VERSION * \retval #VPX_CODEC_OK - * The decoder algorithm initialized. + * The encoder algorithm has been initialized. * \retval #VPX_CODEC_MEM_ERROR * Memory allocation failed. */ From cda56fa0199a77929f72edd78ad8d4e0ca4968e7 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 23 Mar 2023 19:02:12 -0700 Subject: [PATCH 622/926] update libwebm to libwebm-1.0.0.29-9-g1930e3c changelog: https://chromium.googlesource.com/webm/libwebm/+log/ee0bab576..1930e3ca2 Bug: webm:1792 Change-Id: I5c5c30c767d357528f102ff38957655e2ec0c645 --- third_party/libwebm/AUTHORS.TXT | 1 + third_party/libwebm/Android.mk | 3 +++ third_party/libwebm/README.libvpx | 5 +++-- third_party/libwebm/mkvmuxer/mkvmuxer.cc | 4 ++-- third_party/libwebm/mkvmuxer/mkvmuxer.h | 8 ++++---- third_party/libwebm/mkvmuxer/mkvmuxerutil.cc | 2 +- third_party/libwebm/mkvparser/mkvparser.cc | 6 +++--- 7 files changed, 17 insertions(+), 12 deletions(-) diff --git a/third_party/libwebm/AUTHORS.TXT b/third_party/libwebm/AUTHORS.TXT index 9686ac13eb..59b648ca68 100644 --- a/third_party/libwebm/AUTHORS.TXT +++ b/third_party/libwebm/AUTHORS.TXT @@ -2,3 +2,4 @@ # Name or Organization Google Inc. +Elijah Cirioli diff --git a/third_party/libwebm/Android.mk b/third_party/libwebm/Android.mk index 23f935f2db..b02795ccae 100644 --- a/third_party/libwebm/Android.mk +++ b/third_party/libwebm/Android.mk @@ -1,3 +1,5 @@ +# Ignore this file during non-NDK builds. +ifdef NDK_ROOT LOCAL_PATH:= $(call my-dir) include $(CLEAR_VARS) @@ -18,3 +20,4 @@ LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD LOCAL_LICENSE_CONDITIONS := notice LOCAL_NOTICE_FILE := $(LOCAL_PATH)/LICENSE.TXT $(LOCAL_PATH)/PATENTS.TXT include $(BUILD_STATIC_LIBRARY) +endif # NDK_ROOT diff --git a/third_party/libwebm/README.libvpx b/third_party/libwebm/README.libvpx index 325604cc66..a79b982ef4 100644 --- a/third_party/libwebm/README.libvpx +++ b/third_party/libwebm/README.libvpx @@ -1,7 +1,7 @@ URL: https://chromium.googlesource.com/webm/libwebm -Version: ee0bab576c338c9807249b99588e352b7268cb62 +Version: 1930e3ca23b007f3ff11d98a570077be6201957e License: BSD -License File: LICENSE.txt +License File: LICENSE.TXT Description: libwebm is used to handle WebM container I/O. 
@@ -18,3 +18,4 @@ Only keep:
 - mkvmuxer/
 - mkvparser/
 - PATENTS.TXT
+- use -std=gnu++11 in Android.mk (https://crbug.com/webm/1708)
diff --git a/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/third_party/libwebm/mkvmuxer/mkvmuxer.cc
index ae36531439..faaf0165f4 100644
--- a/third_party/libwebm/mkvmuxer/mkvmuxer.cc
+++ b/third_party/libwebm/mkvmuxer/mkvmuxer.cc
@@ -607,10 +607,10 @@ bool ContentEncoding::Write(IMkvWriter* writer) const {
   return true;
 }
 
-uint64_t ContentEncoding::EncodingSize(uint64_t compresion_size,
+uint64_t ContentEncoding::EncodingSize(uint64_t compression_size,
                                        uint64_t encryption_size) const {
   // TODO(fgalligan): Add support for compression settings.
-  if (compresion_size != 0)
+  if (compression_size != 0)
     return 0;
 
   uint64_t encoding_size = 0;
diff --git a/third_party/libwebm/mkvmuxer/mkvmuxer.h b/third_party/libwebm/mkvmuxer/mkvmuxer.h
index f2db377145..8602d82325 100644
--- a/third_party/libwebm/mkvmuxer/mkvmuxer.h
+++ b/third_party/libwebm/mkvmuxer/mkvmuxer.h
@@ -330,7 +330,7 @@ class ContentEncoding {
 
  private:
   // Returns the size in bytes for the encoding elements.
-  uint64_t EncodingSize(uint64_t compresion_size,
+  uint64_t EncodingSize(uint64_t compression_size,
                         uint64_t encryption_size) const;
 
   // Returns the size in bytes for the encryption elements.
@@ -1425,7 +1425,7 @@ class SeekHead {
   bool Write(IMkvWriter* writer);
 
   // We are going to put a cap on the number of Seek Entries.
-  const static int32_t kSeekEntryCount = 5;
+  constexpr static int32_t kSeekEntryCount = 5;
 
  private:
   // Returns the maximum size in bytes of one seek entry.
@@ -1505,8 +1505,8 @@ class Segment {
     kBeforeClusters = 0x1  // Position Cues before Clusters
   };
 
-  static const uint32_t kDefaultDocTypeVersion = 4;
-  static const uint64_t kDefaultMaxClusterDuration = 30000000000ULL;
+  static constexpr uint32_t kDefaultDocTypeVersion = 4;
+  static constexpr uint64_t kDefaultMaxClusterDuration = 30000000000ULL;
 
   Segment();
   ~Segment();
diff --git a/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
index bd2f769138..300b155797 100644
--- a/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
+++ b/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
@@ -607,7 +607,7 @@ uint64 WriteVoidElement(IMkvWriter* writer, uint64 size) {
 void GetVersion(int32* major, int32* minor, int32* build, int32* revision) {
   *major = 0;
   *minor = 3;
-  *build = 0;
+  *build = 1;
   *revision = 0;
 }
 
diff --git a/third_party/libwebm/mkvparser/mkvparser.cc b/third_party/libwebm/mkvparser/mkvparser.cc
index de8884b381..868afcb3ed 100644
--- a/third_party/libwebm/mkvparser/mkvparser.cc
+++ b/third_party/libwebm/mkvparser/mkvparser.cc
@@ -55,7 +55,7 @@ Type* SafeArrayAlloc(unsigned long long num_elements,
 void GetVersion(int& major, int& minor, int& build, int& revision) {
   major = 1;
   minor = 1;
-  build = 0;
+  build = 1;
   revision = 0;
 }
 
@@ -298,7 +298,7 @@ long UnserializeInt(IMkvReader* pReader, long long pos, long long size,
   if (status < 0)
     return status;
 
-  unsigned long long result = first_byte;
+  unsigned long long result = static_cast<unsigned long long>(first_byte);
   ++pos;
 
   for (long i = 1; i < size; ++i) {
@@ -2432,7 +2432,7 @@ bool CuePoint::TrackPosition::Parse(IMkvReader* pReader, long long start_,
     pos += size;  // consume payload
   }
 
-  if ((m_pos < 0) || (m_track <= 0)) {
+  if ((m_pos < 0) || (m_track <= 0) || (m_block < 0) || (m_block > LONG_MAX)) {
     return false;
   }
 
From 1701d55e33fe227cee4442d898c3ef6ae6a8206a Mon Sep 17 00:00:00 2001
From: James Zern
Date: Wed, 22 Feb 2023 13:53:49 -0800
Subject: [PATCH 623/926]
vp9_encodeframe.c: clear -Wshadow warnings Bug: webm:1793 Change-Id: I77c7abae7bbb1e1f4972cd31e3a67d62477b896e --- vp9/encoder/vp9_encodeframe.c | 45 ++++++++++++++++------------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 1d593cfc01..26e419e3d5 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -349,17 +349,17 @@ typedef struct { int32_t sum_error; int log2_count; int variance; -} var; +} Var; typedef struct { - var none; - var horz[2]; - var vert[2]; + Var none; + Var horz[2]; + Var vert[2]; } partition_variance; typedef struct { partition_variance part_variances; - var split[4]; + Var split[4]; } v4x4; typedef struct { @@ -384,7 +384,7 @@ typedef struct { typedef struct { partition_variance *part_variances; - var *split[4]; + Var *split[4]; } variance_node; typedef enum { @@ -436,13 +436,13 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) { } // Set variance values given sum square error, sum error, count. -static void fill_variance(uint32_t s2, int32_t s, int c, var *v) { +static void fill_variance(uint32_t s2, int32_t s, int c, Var *v) { v->sum_square_error = s2; v->sum_error = s; v->log2_count = c; } -static void get_variance(var *v) { +static void get_variance(Var *v) { v->variance = (int)(256 * (v->sum_square_error - (uint32_t)(((int64_t)v->sum_error * v->sum_error) >> @@ -450,7 +450,7 @@ static void get_variance(var *v) { v->log2_count); } -static void sum_2_variances(const var *a, const var *b, var *r) { +static void sum_2_variances(const Var *a, const Var *b, Var *r) { assert(a->log2_count == b->log2_count); fill_variance(a->sum_square_error + b->sum_square_error, a->sum_error + b->sum_error, a->log2_count + 1, r); @@ -1863,8 +1863,8 @@ static void update_state(VP9_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx, vp9_update_mv_count(td); if (cm->interp_filter == SWITCHABLE) { - const int ctx = get_pred_context_switchable_interp(xd); - ++td->counts->switchable_interp[ctx][xdmi->interp_filter]; + const int ctx_interp = get_pred_context_switchable_interp(xd); + ++td->counts->switchable_interp[ctx_interp][xdmi->interp_filter]; } } @@ -2748,10 +2748,10 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && mi_row + (mi_step >> 1) < cm->mi_rows) { RD_COST tmp_rdc; - PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0]; + PICK_MODE_CONTEXT *hctx = &pc_tree->horizontal[0]; vp9_rd_cost_init(&tmp_rdc); - update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0); - encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx); + update_state(cpi, td, hctx, mi_row, mi_col, subsize, 0); + encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, hctx); pc_tree->horizontal[1].skip_ref_frame_mask = 0; rd_pick_sb_modes(cpi, tile_data, x, mi_row + (mi_step >> 1), mi_col, &tmp_rdc, subsize, &pc_tree->horizontal[1], INT_MAX, @@ -2772,10 +2772,10 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && mi_col + (mi_step >> 1) < cm->mi_cols) { RD_COST tmp_rdc; - PICK_MODE_CONTEXT *ctx = &pc_tree->vertical[0]; + PICK_MODE_CONTEXT *vctx = &pc_tree->vertical[0]; vp9_rd_cost_init(&tmp_rdc); - update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0); - encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx); + update_state(cpi, td, vctx, mi_row, mi_col, subsize, 0); + encode_superblock(cpi, td, tp, 0, mi_row, mi_col, 
subsize, vctx); pc_tree->vertical[bsize > BLOCK_8X8].skip_ref_frame_mask = 0; rd_pick_sb_modes( cpi, tile_data, x, mi_row, mi_col + (mi_step >> 1), &tmp_rdc, @@ -2847,8 +2847,6 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, int x_idx = (i & 1) * (mi_step >> 1); int y_idx = (i >> 1) * (mi_step >> 1); RD_COST tmp_rdc; - ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; - PARTITION_CONTEXT sl[8], sa[8]; if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) continue; @@ -3479,8 +3477,8 @@ static void ml_predict_var_rd_paritioning(const VP9_COMP *const cpi, int mi_col, int *none, int *split) { const VP9_COMMON *const cm = &cpi->common; const NN_CONFIG *nn_config = NULL; + const MACROBLOCKD *const xd = &x->e_mbd; #if CONFIG_VP9_HIGHBITDEPTH - MACROBLOCKD *xd = &x->e_mbd; DECLARE_ALIGNED(16, uint8_t, pred_buffer[64 * 64 * 2]); uint8_t *const pred_buf = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? (CONVERT_TO_BYTEPTR(pred_buffer)) @@ -3563,7 +3561,6 @@ static void ml_predict_var_rd_paritioning(const VP9_COMP *const cpi, const unsigned int var = cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse); const float factor = (var == 0) ? 1.0f : (1.0f / (float)var); - const MACROBLOCKD *const xd = &x->e_mbd; const int has_above = !!xd->above_mi; const int has_left = !!xd->left_mi; const BLOCK_SIZE above_bsize = has_above ? xd->above_mi->sb_type : bsize; @@ -4348,9 +4345,9 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + mi_step < cm->mi_rows && bsize > BLOCK_8X8) { - PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0]; - update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0); - encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx); + PICK_MODE_CONTEXT *hctx = &pc_tree->horizontal[0]; + update_state(cpi, td, hctx, mi_row, mi_col, subsize, 0); + encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, hctx); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) pc_tree->horizontal[1].pred_interp_filter = pred_interp_filter; From bad39ce7a3b766ea44fcd1637610986a4a672999 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 22 Feb 2023 15:16:43 -0800 Subject: [PATCH 624/926] vp9_scan.h: rename scan_order struct to ScanOrder This matches the style guide and fixes some -Wshadow warnings related to variables with the same name. 
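An illustrative reduction of the collision (simplified declarations, not
the real signatures):

  #include <stdint.h>

  typedef struct scan_order {
    const int16_t *scan, *iscan, *neighbors;
  } scan_order;

  static int first_scan_pos(const scan_order *scan_order) {
    // The parameter name shadows the file-scope typedef above, which is
    // what -Wshadow flags; with the type renamed to ScanOrder, lower-case
    // variable names no longer collide.
    return scan_order->scan[0];
  }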
Something similar was done in libaom in: 03f6fdcfca Fix warnings reported by -Wshadow: Part1b: scan_order struct and variable Bug: webm:1793 Change-Id: Ide5127886b7fd7778e6d8a983bfba6edda21ff28 --- test/vp9_quantize_test.cc | 12 ++++++------ vp9/common/vp9_scan.c | 4 ++-- vp9/common/vp9_scan.h | 12 ++++++------ vp9/decoder/vp9_decodeframe.c | 16 ++++++++-------- vp9/decoder/vp9_detokenize.c | 5 ++--- vp9/decoder/vp9_detokenize.h | 5 ++--- vp9/encoder/vp9_encodemb.c | 8 ++++---- vp9/encoder/vp9_pickmode.c | 2 +- vp9/encoder/vp9_rdopt.c | 12 ++++++------ vp9/encoder/vp9_tokenize.c | 2 +- vp9/encoder/vp9_tpl_model.c | 2 +- vpx_dsp/arm/highbd_quantize_neon.c | 2 +- vpx_dsp/arm/quantize_neon.c | 2 +- vpx_dsp/quantize.c | 4 ++-- vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +++--- vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 2 +- vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 2 +- vpx_dsp/x86/quantize_avx.c | 2 +- vpx_dsp/x86/quantize_avx2.c | 2 +- vpx_dsp/x86/quantize_ssse3.c | 2 +- 20 files changed, 51 insertions(+), 53 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index e9b17d5eb8..84a5a58e4e 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -42,7 +42,7 @@ typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const struct scan_order *const scan_order); + const struct ScanOrder *const scan_order); typedef std::tuple QuantizeParam; @@ -60,7 +60,7 @@ template void QuantWrapper(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const struct scan_order *const scan_order) { + const struct ScanOrder *const scan_order) { fn(coeff, count, mb_plane->zbin, mb_plane->round, mb_plane->quant, mb_plane->quant_shift, qcoeff, dqcoeff, dequant, eob, scan_order->scan, scan_order->iscan); @@ -71,14 +71,14 @@ typedef void (*Quantize32x32Func)(const tran_low_t *coeff, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const struct scan_order *const scan_order); + const struct ScanOrder *const scan_order); template void Quant32x32Wrapper(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const struct scan_order *const scan_order) { + const struct ScanOrder *const scan_order) { (void)count; fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan_order); } @@ -94,7 +94,7 @@ template void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const struct scan_order *const scan_order) { + const struct ScanOrder *const scan_order) { fn(coeff, count, mb_plane->round_fp, mb_plane->quant_fp, qcoeff, dqcoeff, dequant, eob, scan_order->scan, scan_order->iscan); } @@ -213,7 +213,7 @@ class VP9QuantizeBase : public AbstractBench { int16_t *r_ptr_; int16_t *q_ptr_; int count_; - const scan_order *scan_; + const ScanOrder *scan_; uint16_t eob_; }; diff --git a/vp9/common/vp9_scan.c b/vp9/common/vp9_scan.c index 8bea61dea6..adacb7ef96 100644 --- a/vp9/common/vp9_scan.c +++ b/vp9/common/vp9_scan.c @@ -688,14 +688,14 @@ DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_32x32[1024]) = { 968, 974, 989, 997, 1003, 1007, 1015, 1019, 1022, 
1024, }; -const scan_order vp9_default_scan_orders[TX_SIZES] = { +const ScanOrder vp9_default_scan_orders[TX_SIZES] = { { default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors }, { default_scan_8x8, vp9_default_iscan_8x8, default_scan_8x8_neighbors }, { default_scan_16x16, vp9_default_iscan_16x16, default_scan_16x16_neighbors }, { default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors }, }; -const scan_order vp9_scan_orders[TX_SIZES][TX_TYPES] = { +const ScanOrder vp9_scan_orders[TX_SIZES][TX_TYPES] = { { // TX_4X4 { default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors }, { row_scan_4x4, vp9_row_iscan_4x4, row_scan_4x4_neighbors }, diff --git a/vp9/common/vp9_scan.h b/vp9/common/vp9_scan.h index efa0e23365..3d1dcc66da 100644 --- a/vp9/common/vp9_scan.h +++ b/vp9/common/vp9_scan.h @@ -23,14 +23,14 @@ extern "C" { #define MAX_NEIGHBORS 2 -typedef struct scan_order { +typedef struct ScanOrder { const int16_t *scan; const int16_t *iscan; const int16_t *neighbors; -} scan_order; +} ScanOrder; -extern const scan_order vp9_default_scan_orders[TX_SIZES]; -extern const scan_order vp9_scan_orders[TX_SIZES][TX_TYPES]; +extern const ScanOrder vp9_default_scan_orders[TX_SIZES]; +extern const ScanOrder vp9_scan_orders[TX_SIZES][TX_TYPES]; static INLINE int get_coef_context(const int16_t *neighbors, const uint8_t *token_cache, int c) { @@ -39,8 +39,8 @@ static INLINE int get_coef_context(const int16_t *neighbors, 1; } -static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, - PLANE_TYPE type, int block_idx) { +static INLINE const ScanOrder *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, + PLANE_TYPE type, int block_idx) { const MODE_INFO *const mi = xd->mi[0]; if (is_inter_block(mi) || type != PLANE_TYPE_Y || xd->lossless) { diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 2a27e6fdb3..6eae41fcfb 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -323,9 +323,9 @@ static void predict_and_reconstruct_intra_block(TileWorkerData *twd, if (!mi->skip) { const TX_TYPE tx_type = (plane || xd->lossless) ? DCT_DCT : intra_mode_to_tx_type_lookup[mode]; - const scan_order *sc = (plane || xd->lossless) - ? &vp9_default_scan_orders[tx_size] - : &vp9_scan_orders[tx_size][tx_type]; + const ScanOrder *sc = (plane || xd->lossless) + ? &vp9_default_scan_orders[tx_size] + : &vp9_scan_orders[tx_size][tx_type]; const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, mi->segment_id); if (eob > 0) { @@ -348,9 +348,9 @@ static void parse_intra_block_row_mt(TileWorkerData *twd, MODE_INFO *const mi, struct macroblockd_plane *const pd = &xd->plane[plane]; const TX_TYPE tx_type = (plane || xd->lossless) ? DCT_DCT : intra_mode_to_tx_type_lookup[mode]; - const scan_order *sc = (plane || xd->lossless) - ? &vp9_default_scan_orders[tx_size] - : &vp9_scan_orders[tx_size][tx_type]; + const ScanOrder *sc = (plane || xd->lossless) + ? 
&vp9_default_scan_orders[tx_size] + : &vp9_scan_orders[tx_size][tx_type]; *pd->eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, mi->segment_id); /* Keep the alignment to 16 */ @@ -393,7 +393,7 @@ static int reconstruct_inter_block(TileWorkerData *twd, MODE_INFO *const mi, int mi_row, int mi_col) { MACROBLOCKD *const xd = &twd->xd; struct macroblockd_plane *const pd = &xd->plane[plane]; - const scan_order *sc = &vp9_default_scan_orders[tx_size]; + const ScanOrder *sc = &vp9_default_scan_orders[tx_size]; const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, mi->segment_id); uint8_t *dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col]; @@ -423,7 +423,7 @@ static int parse_inter_block_row_mt(TileWorkerData *twd, MODE_INFO *const mi, TX_SIZE tx_size) { MACROBLOCKD *const xd = &twd->xd; struct macroblockd_plane *const pd = &xd->plane[plane]; - const scan_order *sc = &vp9_default_scan_orders[tx_size]; + const ScanOrder *sc = &vp9_default_scan_orders[tx_size]; const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, mi->segment_id); diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c index 3ed1bd6ffa..d957dc34e3 100644 --- a/vp9/decoder/vp9_detokenize.c +++ b/vp9/decoder/vp9_detokenize.c @@ -272,9 +272,8 @@ static void get_ctx_shift(MACROBLOCKD *xd, int *ctx_shift_a, int *ctx_shift_l, } } -int vp9_decode_block_tokens(TileWorkerData *twd, int plane, - const scan_order *sc, int x, int y, TX_SIZE tx_size, - int seg_id) { +int vp9_decode_block_tokens(TileWorkerData *twd, int plane, const ScanOrder *sc, + int x, int y, TX_SIZE tx_size, int seg_id) { vpx_reader *r = &twd->bit_reader; MACROBLOCKD *xd = &twd->xd; struct macroblockd_plane *const pd = &xd->plane[plane]; diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h index a32052ffff..a8e47021b8 100644 --- a/vp9/decoder/vp9_detokenize.h +++ b/vp9/decoder/vp9_detokenize.h @@ -19,9 +19,8 @@ extern "C" { #endif -int vp9_decode_block_tokens(TileWorkerData *twd, int plane, - const scan_order *sc, int x, int y, TX_SIZE tx_size, - int seg_id); +int vp9_decode_block_tokens(TileWorkerData *twd, int plane, const ScanOrder *sc, + int x, int y, TX_SIZE tx_size, int seg_id); #ifdef __cplusplus } // extern "C" diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index a81a753ae7..946a1c3ee8 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -79,7 +79,7 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, const int shift = (tx_size == TX_32X32); const int16_t *const dequant_ptr = pd->dequant; const uint8_t *const band_translate = get_band_translate(tx_size); - const scan_order *const so = get_scan(xd, tx_size, plane_type, block); + const ScanOrder *const so = get_scan(xd, tx_size, plane_type, block); const int16_t *const scan = so->scan; const int16_t *const nb = so->neighbors; const MODE_INFO *mbmi = xd->mi[0]; @@ -351,7 +351,7 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; - const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; + const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); @@ -496,7 +496,7 @@ void 
vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; - const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; + const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); @@ -816,7 +816,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - const scan_order *scan_order; + const ScanOrder *scan_order; TX_TYPE tx_type = DCT_DCT; PREDICTION_MODE mode; const int bwl = b_width_log2_lookup[plane_bsize]; diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 579b466ca9..c19d57d15d 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -768,7 +768,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, for (r = 0; r < max_blocks_high; r += block_step) { for (c = 0; c < num_4x4_w; c += block_step) { if (c < max_blocks_wide) { - const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; + const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 7557b536ce..76d545cd96 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -77,7 +77,7 @@ struct rdcost_block_args { int64_t best_rd; int exit_early; int use_fast_coef_costing; - const scan_order *so; + const ScanOrder *so; uint8_t skippable; struct buf_2d *this_recon; }; @@ -1129,7 +1129,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, vpx_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride, xd->bd); if (xd->lossless) { - const scan_order *so = &vp9_default_scan_orders[TX_4X4]; + const ScanOrder *so = &vp9_default_scan_orders[TX_4X4]; const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_highbd_fwht4x4(src_diff, coeff, 8); @@ -1146,7 +1146,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, } else { int64_t unused; const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block); - const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type]; + const ScanOrder *so = &vp9_scan_orders[TX_4X4][tx_type]; const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); if (tx_type == DCT_DCT) @@ -1236,7 +1236,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, vpx_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride); if (xd->lossless) { - const scan_order *so = &vp9_default_scan_orders[TX_4X4]; + const ScanOrder *so = &vp9_default_scan_orders[TX_4X4]; const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_fwht4x4(src_diff, coeff, 8); @@ -1253,7 +1253,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, } else { int64_t unused; const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block); - const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type]; + const ScanOrder 
*so = &vp9_scan_orders[TX_4X4][tx_type]; const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_fht4x4(src_diff, coeff, 8, tx_type); @@ -1632,7 +1632,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, i, pd->dst.stride)]; int64_t thisdistortion = 0, thissse = 0; int thisrate = 0, ref; - const scan_order *so = &vp9_default_scan_orders[TX_4X4]; + const ScanOrder *so = &vp9_default_scan_orders[TX_4X4]; const int is_compound = has_second_ref(mi); const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter]; diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c index 814d769be3..6c6c04493f 100644 --- a/vp9/encoder/vp9_tokenize.c +++ b/vp9/encoder/vp9_tokenize.c @@ -364,7 +364,7 @@ static void tokenize_b(int plane, int block, int row, int col, const PLANE_TYPE type = get_plane_type(plane); const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); const int16_t *scan, *nb; - const scan_order *so; + const ScanOrder *so; const int ref = is_inter_block(mi); unsigned int(*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] = td->rd_counts.coef_counts[tx_size][type][ref]; diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index b0c735167e..53ef356981 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -434,7 +434,7 @@ static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; - const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; + const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; uint16_t eob; int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; const int shift = tx_size == TX_32X32 ? 
0 : 2; diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c index 5a40f1284e..526447acf5 100644 --- a/vpx_dsp/arm/highbd_quantize_neon.c +++ b/vpx_dsp/arm/highbd_quantize_neon.c @@ -228,7 +228,7 @@ static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon( void vpx_highbd_quantize_b_32x32_neon( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const struct scan_order *const scan_order) { + uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index 84b6d8c79f..cc8f623744 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -219,7 +219,7 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct scan_order *const scan_order) { + const struct ScanOrder *const scan_order) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index f51bf253e7..d44ced20dc 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -214,7 +214,7 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct scan_order *const scan_order) { + const struct ScanOrder *const scan_order) { const int n_coeffs = 32 * 32; const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; @@ -275,7 +275,7 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, void vpx_highbd_quantize_b_32x32_c( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const struct scan_order *const scan_order) { + uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { const intptr_t n_coeffs = 32 * 32; const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 6637186f81..49bc9a6309 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -19,7 +19,7 @@ () #include "vpx_dsp/vpx_filter.h" #if CONFIG_VP9_ENCODER struct macroblock_plane; - struct scan_order; + struct ScanOrder; #endif EOF @@ -725,14 +725,14 @@ () add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/; - add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct scan_order *const scan_order"; + add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t 
*dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b neon sse2 avx2/; - add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct scan_order *const scan_order"; + add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c index bfd7b2e23e..fbebd7db1c 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -226,7 +226,7 @@ static VPX_FORCE_INLINE void quantize_b_32x32( void vpx_highbd_quantize_b_32x32_avx2( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const struct scan_order *const scan_order) { + uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { const unsigned int step = 8; intptr_t n_coeffs = 32 * 32; const int16_t *iscan = scan_order->iscan; diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index 58d5a3a5ff..a5d874f3bc 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -97,7 +97,7 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, void vpx_highbd_quantize_b_32x32_sse2( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const struct scan_order *const scan_order) { + uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { __m128i zbins[2]; __m128i nzbins[2]; int idx = 0; diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index d05a937be1..d289bf6ebf 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -146,7 +146,7 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct scan_order *const scan_order) { + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); const __m256i big_zero = _mm256_setzero_si256(); int index; diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c index 1c82542ae6..5421dcf0ba 100644 --- a/vpx_dsp/x86/quantize_avx2.c +++ b/vpx_dsp/x86/quantize_avx2.c @@ -256,7 +256,7 @@ void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, const struct macroblock_plane 
*const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct scan_order *const scan_order) { + const struct ScanOrder *const scan_order) { __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; __m256i v_eobmax = _mm256_setzero_si256(); intptr_t count; diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 6401b2865d..556f4ca617 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -113,7 +113,7 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct scan_order *const scan_order) { + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); int index; const int16_t *iscan = scan_order->iscan; From 601904d1f7f97b22efa955cb804ed1f5ba871eed Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 22 Feb 2023 15:36:03 -0800 Subject: [PATCH 625/926] vp9_rc_get_second_pass_params: clear -Wshadow warning Bug: webm:1793 Change-Id: I0d64c9234b4bdcfb49a06566dc41df26f5862c1f --- vp9/encoder/vp9_firstpass.c | 1 - 1 file changed, 1 deletion(-) diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index e9250e25c0..08b68c93ee 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -3495,7 +3495,6 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { const int show_idx = cm->current_video_frame; if (cpi->common.current_frame_coding_index == 0) { - VP9_COMMON *cm = &cpi->common; const vpx_codec_err_t codec_status = vp9_extrc_send_firstpass_stats( &cpi->ext_ratectrl, &cpi->twopass.first_pass_info); if (codec_status != VPX_CODEC_OK) { From 89765feb99dc63826be72eb600b011fc9f75d61e Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 22 Feb 2023 15:47:10 -0800 Subject: [PATCH 626/926] vp9_mcomp.c: clear -Wshadow warnings Bug: webm:1793 Change-Id: I6d7d96ffb3e388eac94d1d41563f7079a8297c85 --- vp9/encoder/vp9_mcomp.c | 55 +++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 1f08aa5de7..207eb43949 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -163,8 +163,8 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { do { \ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ int64_t tmpmse; \ - const MV mv = { r, c }; \ - const MV ref_mv = { rr, rc }; \ + const MV cb_mv = { r, c }; \ + const MV cb_ref_mv = { rr, rc }; \ if (second_pred == NULL) { \ thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ src_stride, &sse); \ @@ -173,7 +173,8 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { src_stride, &sse, second_pred); \ } \ tmpmse = thismse; \ - tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit); \ + tmpmse += \ + mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, error_per_bit); \ if (tmpmse >= INT_MAX) { \ v = INT_MAX; \ } else if ((v = (uint32_t)tmpmse) < besterr) { \ @@ -192,15 +193,16 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { #define CHECK_BETTER(v, r, c) \ do { \ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - const MV mv = { r, c }; \ - const MV ref_mv = { rr, rc }; \ + const MV cb_mv = { r, c }; \ + const MV cb_ref_mv = { rr, rc }; \ if (second_pred == NULL) \ thismse = 
vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ src_stride, &sse); \ else \ thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ src_stride, &sse, second_pred); \ - if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) + \ + if ((v = mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, \ + error_per_bit) + \ thismse) < besterr) { \ besterr = v; \ br = r; \ @@ -686,13 +688,14 @@ static int accurate_sub_pel_search( do { \ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ int64_t tmpmse; \ - const MV mv = { r, c }; \ - const MV ref_mv = { rr, rc }; \ - thismse = accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, \ + const MV cb_mv = { r, c }; \ + const MV cb_ref_mv = { rr, rc }; \ + thismse = accurate_sub_pel_search(xd, &cb_mv, x->me_sf, kernel, vfp, z, \ src_stride, y, y_stride, second_pred, \ w, h, &sse); \ tmpmse = thismse; \ - tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit); \ + tmpmse += \ + mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, error_per_bit); \ if (tmpmse >= INT_MAX) { \ v = INT_MAX; \ } else if ((v = (uint32_t)tmpmse) < besterr) { \ @@ -711,12 +714,13 @@ static int accurate_sub_pel_search( #define CHECK_BETTER1(v, r, c) \ do { \ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - const MV mv = { r, c }; \ - const MV ref_mv = { rr, rc }; \ - thismse = accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, \ + const MV cb_mv = { r, c }; \ + const MV cb_ref_mv = { rr, rc }; \ + thismse = accurate_sub_pel_search(xd, &cb_mv, x->me_sf, kernel, vfp, z, \ src_stride, y, y_stride, second_pred, \ w, h, &sse); \ - if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) + \ + if ((v = mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, \ + error_per_bit) + \ thismse) < besterr) { \ besterr = v; \ br = r; \ @@ -980,16 +984,14 @@ static INLINE void calc_int_cost_list(const MACROBLOCK *x, const MV *ref_mv, const MV fcenter_mv = { ref_mv->row >> 3, ref_mv->col >> 3 }; int br = best_mv->row; int bc = best_mv->col; - MV this_mv; + const MV mv = { br, bc }; int i; unsigned int sse; - this_mv.row = br; - this_mv.col = bc; cost_list[0] = - fn_ptr->vf(what->buf, what->stride, get_buf_from_mv(in_what, &this_mv), + fn_ptr->vf(what->buf, what->stride, get_buf_from_mv(in_what, &mv), in_what->stride, &sse) + - mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb); + mvsad_err_cost(x, &mv, &fcenter_mv, sadpb); if (check_bounds(&x->mv_limits, br, bc, 1)) { for (i = 0; i < 4; i++) { const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col }; @@ -1170,6 +1172,9 @@ static int vp9_pattern_search( } while (s--); } + best_mv->row = br; + best_mv->col = bc; + // Returns the one-away integer pel sad values around the best as follows: // cost_list[0]: cost at the best integer pel // cost_list[1]: cost at delta {0, -1} (left) from the best integer pel @@ -1177,11 +1182,8 @@ static int vp9_pattern_search( // cost_list[3]: cost at delta { 0, 1} (right) from the best integer pel // cost_list[4]: cost at delta {-1, 0} (top) from the best integer pel if (cost_list) { - const MV best_mv = { br, bc }; - calc_int_cost_list(x, &fcenter_mv, sad_per_bit, vfp, &best_mv, cost_list); + calc_int_cost_list(x, &fcenter_mv, sad_per_bit, vfp, best_mv, cost_list); } - best_mv->row = br; - best_mv->col = bc; return bestsad; } @@ -2321,17 +2323,16 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, // TODO(jingning): Implement integral projection functions for high bit-depth // setting and remove this 
part of code. if (xd->bd != 8) { - unsigned int this_sad; + const unsigned int sad = cpi->fn_ptr[bsize].sdf( + x->plane[0].src.buf, src_stride, xd->plane[0].pre[0].buf, ref_stride); tmp_mv->row = 0; tmp_mv->col = 0; - this_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride, - xd->plane[0].pre[0].buf, ref_stride); if (scaled_ref_frame) { int i; for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; } - return this_sad; + return sad; } #endif From 66885a69ffdec30f0142dbff20568ebeb249107e Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 23 Mar 2023 19:28:48 -0700 Subject: [PATCH 627/926] svc_datarate_test: clear -Wshadow warning rename class member from ref_frame_config to the correct style: ref_frame_config_. Bug: webm:1793 Change-Id: Idaf49de6d724014adee75f81efe974b2031241ba --- test/svc_datarate_test.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc index 484252ca43..d571f50860 100644 --- a/test/svc_datarate_test.cc +++ b/test/svc_datarate_test.cc @@ -256,13 +256,13 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { temporal_layer_id_ = layer_id.temporal_layer_id; for (int i = 0; i < number_spatial_layers_; i++) { layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_; - ref_frame_config.duration[i] = 1; + ref_frame_config_.duration[i] = 1; } encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); set_frame_flags_bypass_mode(layer_id.temporal_layer_id, - number_spatial_layers_, 0, &ref_frame_config, + number_spatial_layers_, 0, &ref_frame_config_, 1); - encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config); + encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_); } if (update_pattern_ && video->frame() >= 100) { @@ -277,13 +277,13 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { temporal_layer_id_ = layer_id.temporal_layer_id; for (int i = 0; i < number_spatial_layers_; i++) { layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_; - ref_frame_config.duration[i] = 1; + ref_frame_config_.duration[i] = 1; } encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); set_frame_flags_bypass_mode(layer_id.temporal_layer_id, - number_spatial_layers_, 0, &ref_frame_config, + number_spatial_layers_, 0, &ref_frame_config_, 0); - encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config); + encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_); } if (change_bitrate_ && video->frame() == 200) { @@ -611,7 +611,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { bool single_layer_resize_; unsigned int top_sl_width_; unsigned int top_sl_height_; - vpx_svc_ref_frame_config_t ref_frame_config; + vpx_svc_ref_frame_config_t ref_frame_config_; int update_pattern_; bool change_bitrate_; vpx_codec_pts_t last_pts_ref_; From 8f17482e824e021e16c91bcf2b034830639ad3bb Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 7 Mar 2023 15:29:37 -0800 Subject: [PATCH 628/926] vp9_rdopt,block_rd_txfm: fix clang-tidy warning argument name 'recon' in comment does not match parameter name 'out_recon'. 
https://clang.llvm.org/extra/clang-tidy/checks/bugprone/argument-comment.html + normalize similar calls, using /*var=*/NULL to better match the style guidelines https://google.github.io/styleguide/cppguide.html#Function_Argument_Comments Change-Id: I089591317f7138965735f737c1536a8b16fcd4e4 --- vp9/encoder/vp9_rdopt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 76d545cd96..1e40e5378b 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -743,7 +743,7 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, } if (x->block_tx_domain) { dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, - tx_size, &dist, &sse, /*recon =*/0, sse_calc_done); + tx_size, &dist, &sse, /*out_recon=*/NULL, sse_calc_done); } else { const struct macroblock_plane *const p = &x->plane[plane]; const int src_stride = p->src.stride; @@ -1396,7 +1396,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate, mic->mode = mode; super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL, - bsize, best_rd, /*recon = */ 0); + bsize, best_rd, /*recon=*/NULL); if (this_rate_tokenonly == INT_MAX) continue; @@ -1449,7 +1449,7 @@ static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, for (plane = 1; plane < MAX_MB_PLANE; ++plane) { txfm_rd_in_plane(cpi, x, &pnrate, &pndist, &pnskip, &pnsse, ref_best_rd, plane, bsize, uv_tx_size, cpi->sf.use_fast_coef_costing, - /*recon = */ 0); + /*recon=*/NULL); if (pnrate == INT_MAX) { is_cost_valid = 0; break; From b4d154c9486afdd010b61ba24c72655aa761a9e7 Mon Sep 17 00:00:00 2001 From: Anupam Pandey Date: Tue, 21 Mar 2023 13:00:25 +0530 Subject: [PATCH 629/926] Add AVX2 for convolve vertical filter for block width 4 Introduced an AVX2 intrinsic to compute the vertical convolution for the w = 4 case. This is a bit-exact change.
Instruction Count
cpu  Resolution  Reduction(%)
 0   LOWRES2     0.364
 0   MIDRES2     0.236
 0   HDRES2      0.162
 0   Average     0.254
Change-Id: I413f58aa6333a6f2421d4c10d49dec01e55b2098 --- vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 185 +++++++++++++++++++++- 1 file changed, 183 insertions(+), 2 deletions(-) diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 37ef59f36c..3b5ff04ee9 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -1184,8 +1184,190 @@ static void vpx_filter_block1d4_h8_avx2( } } +static void vpx_filter_block1d4_v8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m256i f[4], ss[4]; + __m256i r[8]; + __m128i r1[10]; + __m128i s[11]; + + unsigned int y = output_height; + // Multiply the size of the source stride by four + const ptrdiff_t src_stride = src_pitch << 2; + const ptrdiff_t out_stride = out_pitch << 2; + + // The output_height is always a multiple of two.
+ assert(!(output_height & 0x01)); + + shuffle_filter_avx2(filter, f); + + s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch)); + s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch)); + s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch)); + s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch)); + s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); + s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch)); + s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch)); + + // R1-0 xxxx .. . . x| r13 r12 r11 r10 r03 r02 r01 r00 + r1[0] = _mm_unpacklo_epi32(s[0], s[1]); + + // R2-1 xxxx .. . . x| r23 r22 r21 r20 r13 r12 r11 r10 + r1[1] = _mm_unpacklo_epi32(s[1], s[2]); + + // R3-2 xxxx .. . . x| r33 r32 r31 r30 r23 r22 r21 r20 + r1[2] = _mm_unpacklo_epi32(s[2], s[3]); + + // R4-3 xxxx .. . . x| r43 r42 r41 r40 r33 r32 r31 r30 + r1[3] = _mm_unpacklo_epi32(s[3], s[4]); + + // R5-4 xxxx .. . . x| r53 r52 r51 r50 r43 r42 r41 r40 + r1[4] = _mm_unpacklo_epi32(s[4], s[5]); + + // R6-5 xxxx .. . . x| r63 r62 r61 r60 r53 r52 r51 r50 + r1[5] = _mm_unpacklo_epi32(s[5], s[6]); + + // 00000000 r33 r32 r31 r30|r23 r22 r21 r20||00000000|r13 r12 r11 r10|r03 r02 + // r01 r00 + r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[0]), r1[2], 1); + + // 00000000 r43 r42 r41 r40|r33 r32 r31 r30||00000000|r23 r22 r21 r20|r13 r12 + // r11 r10 + r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[1]), r1[3], 1); + + // 00000000 r53 r52 r51 r50|r43 r42 r41 r40||00000000|r33 r32 r31 r30|r23 r22 + // r21 r20 + r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[2]), r1[4], 1); + + // 00000000 r63 r62 r61 r60|r53 r52 r51 r50||00000000|r43 r42 r41 r40|r33 r32 + // r31 r30 + r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[3]), r1[5], 1); + + // r43 r33....r40 r30|r33 r23....r30 r20||r23 r13....r20 r10|r13 r03....r10 + // r00| + ss[0] = _mm256_unpacklo_epi8(r[0], r[1]); + + // r63 r53....r60 r50|r53 r43....r50 r40||r43 r33....r40 r30|r33 r23....r30 + // r20| + ss[1] = _mm256_unpacklo_epi8(r[2], r[3]); + + // Process 4 rows at a time + while (y >= 4) { + s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); + s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch)); + s[9] = _mm_loadl_epi64((const __m128i *)(src_ptr + 9 * src_pitch)); + s[10] = _mm_loadl_epi64((const __m128i *)(src_ptr + 10 * src_pitch)); + + // R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60 + r1[6] = _mm_unpacklo_epi32(s[6], s[7]); + + // R8-7 xxxx .. . . x| r83 r82 r81 r80 r73 r72 r71 r70 + r1[7] = _mm_unpacklo_epi32(s[7], s[8]); + + // R9-8 xxxx .. . . x| r93 r92 r91 r90 r83 r82 r81 r80 + r1[8] = _mm_unpacklo_epi32(s[8], s[9]); + + // R10-9 xxxx .. . . 
x| r10-3 r10-2 r10-1 r10-0 r93 r92 r91 r90 + r1[9] = _mm_unpacklo_epi32(s[9], s[10]); + + // 00000000 r73 r72 r71 r70|r63 r62 r61 r60||00000000|r53 r52 r51 r50|r43 + // r42 r41 r40 + r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[4]), r1[6], 1); + + // 00000000 r83 r82 r81 r80|r73 r72 r71 r70||00000000|r63 r62 r61 r60|r53 + // r52 r51 r50 + r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[5]), r1[7], 1); + + // 00000000 r93 r92 r91 r90|r83 r82 r81 r80||00000000|r73 r72 r71 r70|r63 + // r62 r61 r60 + r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[6]), r1[8], 1); + + // 00000000 r10-3 r10-2 r10-1 r10-0|r93 r92 r91 r90||00000000|r83 r82 r81 + // r80|r73 r72 r71 r70 + r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[7]), r1[9], 1); + + // r83 r73....r80 r70|r73 r63....r70 r60||r63 r53....r60 r50|r53 r43....r50 + // r40| + ss[2] = _mm256_unpacklo_epi8(r[4], r[5]); + + // r10-3 r10-3....r10-0 r10-0|r93 r83....r90 r80||r83 r73....r80 r70|r73 + // r63....r70 r60| + ss[3] = _mm256_unpacklo_epi8(r[6], r[7]); + + ss[0] = convolve8_16_avx2(ss, f); + + // r3 r2 r3 r2 r1 r0 r1 r0 + ss[0] = _mm256_packus_epi16(ss[0], ss[0]); + src_ptr += src_stride; + + mm256_storeu2_epi32((__m128i *const)output_ptr, + (__m128i *const)(output_ptr + (2 * out_pitch)), ss); + + ss[0] = _mm256_srli_si256(ss[0], 4); + + mm256_storeu2_epi32((__m128i *const)(output_ptr + (1 * out_pitch)), + (__m128i *const)(output_ptr + (3 * out_pitch)), ss); + + output_ptr += out_stride; + + ss[0] = ss[2]; + ss[1] = ss[3]; + + s[6] = s[10]; + + r1[4] = r1[8]; + r1[5] = r1[9]; + + y -= 4; + } + + // Process 2 rows + if (y == 2) { + __m128i ss1[4], f1[4]; + + s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); + s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch)); + + f1[0] = _mm256_castsi256_si128(f[0]); + f1[1] = _mm256_castsi256_si128(f[1]); + f1[2] = _mm256_castsi256_si128(f[2]); + f1[3] = _mm256_castsi256_si128(f[3]); + + // R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60 + r1[6] = _mm_unpacklo_epi32(s[6], s[7]); + + // R8-7 xxxx .. . . 
x| r83 r82 r81 r80 r73 r72 r71 r70 + r1[7] = _mm_unpacklo_epi32(s[7], s[8]); + + // r23 r13....r20 r10|r13 r03....r10 r00 + ss1[0] = _mm256_castsi256_si128(ss[0]); + + // r43 r33....r40 r30|r33 r23....r30 r20 + ss1[1] = _mm256_castsi256_si128(ss[1]); + + // r63 r53....r60 r50|r53 r43....r50 r40 + ss1[2] = _mm_unpacklo_epi8(r1[4], r1[5]); + + // r83 r73....r80 r70|r73 r63....r70 r60 + ss1[3] = _mm_unpacklo_epi8(r1[6], r1[7]); + + ss1[0] = convolve8_8_ssse3(ss1, f1); + + // r1 r0 r1 r0 + ss1[0] = _mm_packus_epi16(ss1[0], ss1[0]); + + // Save first row 4 values + *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(ss1[0]); + output_ptr += out_pitch; + + ss1[0] = _mm_srli_si128(ss1[0], 4); + // Save second row 4 values + *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(ss1[0]); + } +} + #if HAVE_AVX2 && HAVE_SSSE3 -filter8_1dfunction vpx_filter_block1d4_v8_ssse3; #if VPX_ARCH_X86_64 filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; @@ -1209,7 +1391,6 @@ filter8_1dfunction vpx_filter_block1d8_v2_ssse3; filter8_1dfunction vpx_filter_block1d8_h2_ssse3; filter8_1dfunction vpx_filter_block1d4_v2_ssse3; filter8_1dfunction vpx_filter_block1d4_h2_ssse3; -#define vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3 #define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3 #define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3 #define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3 From 25825f6a78a267f99c4c6ba7988fc4d79c8cb19d Mon Sep 17 00:00:00 2001 From: George Steed Date: Thu, 9 Mar 2023 23:46:31 +0000 Subject: [PATCH 630/926] Allow non-uniform above array in highbd d45 predictor Neon impl The existing implementation doesn't appear to manifest as a failure in any of the predictor or MD5 tests, but it does rely on the predictor tests filling the second `bs` elements of the `above` input array with copies of `above[bs - 1]` in order to match the C implementation. This patch adjusts the Neon implementation to correctly match the C implementation in the case where the elements of the `above` array all differ. Performance of the predictor is mostly unchanged, except for the 16x16 block size where it appears to have gotten marginally faster across most compiler/micro-architecture combinations. 
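For reference, the per-pixel mapping the Neon code has to reproduce can be written as a rough scalar sketch (illustrative only, not the literal C source; AVG3(a, b, c) here is the usual (a + 2 * b + c + 2) >> 2 rounding average, and the arithmetic is the same in this high bitdepth variant, only the element type widens). For the 8x8 case:

  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) {
      dst[r * stride + c] =
          (r + c < 7) ? AVG3(above[r + c], above[r + c + 1], above[r + c + 2])
                      : above[7];  // bottom-right edge replicates above[7]
    }
  }

The taps with r + c == 6 read above[8], which is why a non-uniform tail of the above array changes the result.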
Bug: webm:1797 Change-Id: Iac166d6047316c0382e0f2790ce780fc99674b43 --- vpx_dsp/arm/highbd_intrapred_neon.c | 275 +++++++++++++++------------- 1 file changed, 144 insertions(+), 131 deletions(-) diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c index 503900915d..05c9c7f196 100644 --- a/vpx_dsp/arm/highbd_intrapred_neon.c +++ b/vpx_dsp/arm/highbd_intrapred_neon.c @@ -289,166 +289,179 @@ void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8_t ABCDEFGH = vld1q_u16(above); - const uint16x8_t BCDEFGH0 = vld1q_u16(above + 1); - const uint16x8_t CDEFGH00 = vld1q_u16(above + 2); - const uint16x8_t avg1 = vhaddq_u16(ABCDEFGH, CDEFGH00); - const uint16x8_t avg2 = vrhaddq_u16(avg1, BCDEFGH0); - const uint16x4_t avg2_low = vget_low_u16(avg2); - const uint16x4_t avg2_high = vget_high_u16(avg2); - const uint16x4_t r1 = vext_u16(avg2_low, avg2_high, 1); - const uint16x4_t r2 = vext_u16(avg2_low, avg2_high, 2); - const uint16x4_t r3 = vext_u16(avg2_low, avg2_high, 3); + uint16x8_t a0, a1, a2, d0; + uint16_t a7; (void)left; (void)bd; - vst1_u16(dst, avg2_low); - dst += stride; - vst1_u16(dst, r1); - dst += stride; - vst1_u16(dst, r2); - dst += stride; - vst1_u16(dst, r3); - vst1q_lane_u16(dst + 3, ABCDEFGH, 7); -} -static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride, - const uint16x8_t above_right, uint16x8_t *row) { - *row = vextq_u16(*row, above_right, 1); - vst1q_u16(*dst, *row); - *dst += stride; + a0 = vld1q_u16(above); + a7 = above[7]; + + // [ above[1], ..., above[6], x, x ] + a1 = vextq_u16(a0, a0, 1); + // [ above[2], ..., above[7], x, x ] + a2 = vextq_u16(a0, a0, 2); + + // d0[0] = AVG3(above[0], above[1], above[2]); + // ... + // d0[5] = AVG3(above[5], above[6], above[7]); + // d0[6] = x (don't care) + // d0[7] = x (don't care) + d0 = vrhaddq_u16(vhaddq_u16(a0, a2), a1); + + // We want: + // stride=0 [ d0[0], d0[1], d0[2], d0[3] ] + // stride=1 [ d0[1], d0[2], d0[3], d0[4] ] + // stride=2 [ d0[2], d0[3], d0[4], d0[5] ] + // stride=3 [ d0[3], d0[4], d0[5], above[7] ] + vst1_u16(dst + 0 * stride, vget_low_u16(d0)); + vst1_u16(dst + 1 * stride, vget_low_u16(vextq_u16(d0, d0, 1))); + vst1_u16(dst + 2 * stride, vget_low_u16(vextq_u16(d0, d0, 2))); + vst1_u16(dst + 3 * stride, vget_low_u16(vextq_u16(d0, d0, 3))); + + // We stored d0[6] above, so fixup into above[7].
+ dst[3 * stride + 3] = a7; } void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8_t A0 = vld1q_u16(above); - const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0), 3); - const uint16x8_t A1 = vld1q_u16(above + 1); - const uint16x8_t A2 = vld1q_u16(above + 2); - const uint16x8_t avg1 = vhaddq_u16(A0, A2); - uint16x8_t row = vrhaddq_u16(avg1, A1); + uint16x8_t ax0, a0, a1, a7, d0; (void)left; (void)bd; - vst1q_u16(dst, row); - dst += stride; - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - vst1q_u16(dst, above_right); -} - -static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride, - const uint16x8_t above_right, uint16x8_t *row_0, - uint16x8_t *row_1) { - *row_0 = vextq_u16(*row_0, *row_1, 1); - *row_1 = vextq_u16(*row_1, above_right, 1); - vst1q_u16(*dst, *row_0); - *dst += 8; - vst1q_u16(*dst, *row_1); - *dst += stride - 8; + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a7 = vld1q_dup_u16(above + 7); + + // We want to calculate the AVG3 result in lanes 1-7 inclusive so we can + // shift in above[7] later, so shift a0 across by one to get the right + // inputs: + // [ x, above[0], ... , above[6] ] + ax0 = vextq_u16(a0, a0, 7); + + // d0[0] = x (don't care) + // d0[1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[7] = AVG3(above[6], above[7], above[8]); + d0 = vrhaddq_u16(vhaddq_u16(ax0, a1), a0); + + // Undo the earlier ext, incrementally shift in duplicates of above[7]. + vst1q_u16(dst + 0 * stride, vextq_u16(d0, a7, 1)); + vst1q_u16(dst + 1 * stride, vextq_u16(d0, a7, 2)); + vst1q_u16(dst + 2 * stride, vextq_u16(d0, a7, 3)); + vst1q_u16(dst + 3 * stride, vextq_u16(d0, a7, 4)); + vst1q_u16(dst + 4 * stride, vextq_u16(d0, a7, 5)); + vst1q_u16(dst + 5 * stride, vextq_u16(d0, a7, 6)); + vst1q_u16(dst + 6 * stride, vextq_u16(d0, a7, 7)); + vst1q_u16(dst + 7 * stride, a7); } void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8_t A0_0 = vld1q_u16(above); - const uint16x8_t A0_1 = vld1q_u16(above + 8); - const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_1), 3); - const uint16x8_t A1_0 = vld1q_u16(above + 1); - const uint16x8_t A1_1 = vld1q_u16(above + 9); - const uint16x8_t A2_0 = vld1q_u16(above + 2); - const uint16x8_t A2_1 = vld1q_u16(above + 10); - const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0); - const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1); - uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0); - uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1); + uint16x8_t ax0, a0, a1, a7, a8, a9, a15, d0[2]; (void)left; (void)bd; - vst1q_u16(dst, row_0); - vst1q_u16(dst + 8, row_1); - dst += stride; - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - 
d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - vst1q_u16(dst, above_right); - vst1q_u16(dst + 8, above_right); + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + a9 = vld1q_u16(above + 9); + a15 = vld1q_dup_u16(above + 15); + + // [ x, above[0], ... , above[6] ] + ax0 = vextq_u16(a0, a0, 7); + + // We have one unused lane here to leave room to shift in above[15] in the + // last lane: + // d0[0][0] = x (don't care) + // d0[0][1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[0][7] = AVG3(above[6], above[7], above[8]); + // d0[1][0] = AVG3(above[7], above[8], above[9]); + // ... + // d0[1][7] = AVG3(above[14], above[15], above[16]); + d0[0] = vrhaddq_u16(vhaddq_u16(ax0, a1), a0); + d0[1] = vrhaddq_u16(vhaddq_u16(a7, a9), a8); + + // Incrementally shift in duplicates of above[15]. + vst1q_u16(dst + 0 * stride + 0, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 0 * stride + 8, vextq_u16(d0[1], a15, 1)); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(d0[1], a15, 2)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], a15, 3)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d0[1], a15, 4)); + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], a15, 5)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d0[1], a15, 6)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], a15, 7)); + vst1q_u16(dst + 7 * stride + 0, d0[1]); + vst1q_u16(dst + 7 * stride + 8, a15); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[1], a15, 1)); + vst1q_u16(dst + 8 * stride + 8, a15); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d0[1], a15, 2)); + vst1q_u16(dst + 9 * stride + 8, a15); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[1], a15, 3)); + vst1q_u16(dst + 10 * stride + 8, a15); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d0[1], a15, 4)); + vst1q_u16(dst + 11 * stride + 8, a15); + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[1], a15, 5)); + vst1q_u16(dst + 12 * stride + 8, a15); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d0[1], a15, 6)); + vst1q_u16(dst + 13 * stride + 8, a15); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[1], a15, 7)); + vst1q_u16(dst + 14 * stride + 8, a15); + vst1q_u16(dst + 15 * stride + 0, a15); + vst1q_u16(dst + 15 * stride + 8, a15); } void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8_t A0_0 = vld1q_u16(above); - const uint16x8_t A0_1 = vld1q_u16(above + 8); - const uint16x8_t A0_2 = vld1q_u16(above + 16); - const uint16x8_t A0_3 = vld1q_u16(above + 24); - const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_3), 3); - const uint16x8_t A1_0 = vld1q_u16(above + 1); - const uint16x8_t A1_1 = vld1q_u16(above + 9); - const uint16x8_t A1_2 = vld1q_u16(above + 17); - const uint16x8_t A1_3 = vld1q_u16(above + 25); - const uint16x8_t A2_0 = vld1q_u16(above + 2); - const uint16x8_t A2_1 = vld1q_u16(above + 10); -
const uint16x8_t A2_2 = vld1q_u16(above + 18); - const uint16x8_t A2_3 = vld1q_u16(above + 26); - const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0); - const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1); - const uint16x8_t avg_2 = vhaddq_u16(A0_2, A2_2); - const uint16x8_t avg_3 = vhaddq_u16(A0_3, A2_3); - uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0); - uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1); - uint16x8_t row_2 = vrhaddq_u16(avg_2, A1_2); - uint16x8_t row_3 = vrhaddq_u16(avg_3, A1_3); + uint16x8_t ax0, a0, a1, a7, a8, a9, a15, a16, a17, a23, a24, a25, a31, d0[4]; int i; (void)left; (void)bd; - vst1q_u16(dst, row_0); - dst += 8; - vst1q_u16(dst, row_1); - dst += 8; - vst1q_u16(dst, row_2); - dst += 8; - vst1q_u16(dst, row_3); - dst += stride - 24; - - for (i = 0; i < 30; ++i) { - row_0 = vextq_u16(row_0, row_1, 1); - row_1 = vextq_u16(row_1, row_2, 1); - row_2 = vextq_u16(row_2, row_3, 1); - row_3 = vextq_u16(row_3, above_right, 1); - vst1q_u16(dst, row_0); - dst += 8; - vst1q_u16(dst, row_1); - dst += 8; - vst1q_u16(dst, row_2); - dst += 8; - vst1q_u16(dst, row_3); - dst += stride - 24; - } + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + a9 = vld1q_u16(above + 9); + a15 = vld1q_u16(above + 15); + a16 = vld1q_u16(above + 16); + a17 = vld1q_u16(above + 17); + a23 = vld1q_u16(above + 23); + a24 = vld1q_u16(above + 24); + a25 = vld1q_u16(above + 25); + a31 = vld1q_dup_u16(above + 31); - vst1q_u16(dst, above_right); - dst += 8; - vst1q_u16(dst, above_right); - dst += 8; - vst1q_u16(dst, above_right); - dst += 8; - vst1q_u16(dst, above_right); + // [ x, above[0], ... , above[6] ] + ax0 = vextq_u16(a0, a0, 7); + + d0[0] = vrhaddq_u16(vhaddq_u16(ax0, a1), a0); + d0[1] = vrhaddq_u16(vhaddq_u16(a7, a9), a8); + d0[2] = vrhaddq_u16(vhaddq_u16(a15, a17), a16); + d0[3] = vrhaddq_u16(vhaddq_u16(a23, a25), a24); + + for (i = 0; i < 32; ++i) { + d0[0] = vextq_u16(d0[0], d0[1], 1); + d0[1] = vextq_u16(d0[1], d0[2], 1); + d0[2] = vextq_u16(d0[2], d0[3], 1); + d0[3] = vextq_u16(d0[3], a31, 1); + vst1q_u16(dst + 0, d0[0]); + vst1q_u16(dst + 8, d0[1]); + vst1q_u16(dst + 16, d0[2]); + vst1q_u16(dst + 24, d0[3]); + dst += stride; + } } // ----------------------------------------------------------------------------- From 3eb3781589d30874634cab8952dec4ea883eb82a Mon Sep 17 00:00:00 2001 From: George Steed Date: Fri, 17 Mar 2023 17:59:26 +0000 Subject: [PATCH 631/926] Allow non-uniform above array in d45 predictor Neon impl The existing implementation doesn't appear to manifest as a failure in any of the predictor or MD5 tests, but it does rely on the predictor tests filling the second `bs` elements of the `above` input array with copies of `above[bs - 1]` in order to match the C implementation. This patch adjusts the Neon implementation to correctly match the C implementation in the case where the elements of the `above` array all differ. Performance of the predictor is mostly unchanged, except for the 32x32 block size where it appears to have gotten about 40% faster when compiled with clang-15. 
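As a note on the mechanics (the same trick as the high bitdepth change above): the per-row shift is done with vext, which concatenates two vectors and extracts a sliding window. In the 8x8 case below, for example, the row stored at dst + 1 * stride is

  vext_u8(d0, a7, 2);  // [ d0[2], ..., d0[7], above[7], above[7] ]

so each successive row pulls one more duplicate of above[7] in from the right, and the don't-care lane d0[0] is never stored.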
Bug: webm:1797 Change-Id: Iaad58e77c5467307a3c80d6989b7cf2988e09311 --- vpx_dsp/arm/intrapred_neon.c | 269 ++++++++++++++++++++++------------- 1 file changed, 174 insertions(+), 95 deletions(-) diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index 892310f151..7c225f6b71 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -263,123 +263,202 @@ void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t ABCDEFGH = vld1_u8(above); - const uint64x1_t A1 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 8); - const uint64x1_t A2 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 16); - const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1); - const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2); - const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00); - const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0); - const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); - const uint32x2_t r0 = vreinterpret_u32_u8(avg2); - const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); - const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); - const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); + uint8x8_t a0, a1, a2, d0; + uint8_t a7; (void)left; - vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0); - vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); - vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); - vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); - vst1_lane_u8(dst + 3 * stride + 3, ABCDEFGH, 7); -} -static INLINE void d45_store_8(uint8_t **dst, const ptrdiff_t stride, - const uint8x8_t above_right, uint8x8_t *row) { - *row = vext_u8(*row, above_right, 1); - vst1_u8(*dst, *row); - *dst += stride; + a0 = vld1_u8(above); + a7 = above[7]; + + // [ above[1], ..., above[6], x, x ] + a1 = vext_u8(a0, a0, 1); + // [ above[2], ..., above[7], x, x ] + a2 = vext_u8(a0, a0, 2); + + // d0[0] = AVG3(above[0], above[1], above[2]); + // ... + // d0[5] = AVG3(above[5], above[6], above[7]); + // d0[6] = x (don't care) + // d0[7] = x (don't care) + d0 = vrhadd_u8(vhadd_u8(a0, a2), a1); + + // We want: + // stride=0 [ d0[0], d0[1], d0[2], d0[3] ] + // stride=1 [ d0[1], d0[2], d0[3], d0[4] ] + // stride=2 [ d0[2], d0[3], d0[4], d0[5] ] + // stride=3 [ d0[3], d0[4], d0[5], above[7] ] + store_u8_4x1(dst + 0 * stride, d0); + store_u8_4x1(dst + 1 * stride, vext_u8(d0, d0, 1)); + store_u8_4x1(dst + 2 * stride, vext_u8(d0, d0, 2)); + store_u8_4x1(dst + 3 * stride, vext_u8(d0, d0, 3)); + + // We stored d0[6] above, so fixup into above[7].
+ dst[3 * stride + 3] = a7; } void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t A0 = vld1_u8(above); - const uint8x8_t above_right = vdup_lane_u8(A0, 7); - const uint8x8_t A1 = vext_u8(A0, above_right, 1); - const uint8x8_t A2 = vext_u8(A0, above_right, 2); - const uint8x8_t avg1 = vhadd_u8(A0, A2); - uint8x8_t row = vrhadd_u8(avg1, A1); + uint8x8_t ax0, a0, a1, a7, d0; (void)left; - vst1_u8(dst, row); - dst += stride; - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - vst1_u8(dst, above_right); -} - -static INLINE void d45_store_16(uint8_t **dst, const ptrdiff_t stride, - const uint8x16_t above_right, uint8x16_t *row) { - *row = vextq_u8(*row, above_right, 1); - vst1q_u8(*dst, *row); - *dst += stride; + a0 = vld1_u8(above + 0); + a1 = vld1_u8(above + 1); + a7 = vld1_dup_u8(above + 7); + + // We want to calculate the AVG3 result in lanes 1-7 inclusive so we can + // shift in above[7] later, so shift a0 across by one to get the right + // inputs: + // [ x, above[0], ... , above[6] ] + ax0 = vext_u8(a0, a0, 7); + + // d0[0] = x (don't care) + // d0[1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[7] = AVG3(above[6], above[7], above[8]); + d0 = vrhadd_u8(vhadd_u8(ax0, a1), a0); + + // Undo the earlier ext, incrementally shift in duplicates of above[7]. + vst1_u8(dst + 0 * stride, vext_u8(d0, a7, 1)); + vst1_u8(dst + 1 * stride, vext_u8(d0, a7, 2)); + vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 3)); + vst1_u8(dst + 3 * stride, vext_u8(d0, a7, 4)); + vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 5)); + vst1_u8(dst + 5 * stride, vext_u8(d0, a7, 6)); + vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 7)); + vst1_u8(dst + 7 * stride, a7); } void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x16_t A0 = vld1q_u8(above); - const uint8x16_t above_right = vdupq_lane_u8(vget_high_u8(A0), 7); - const uint8x16_t A1 = vextq_u8(A0, above_right, 1); - const uint8x16_t A2 = vextq_u8(A0, above_right, 2); - const uint8x16_t avg1 = vhaddq_u8(A0, A2); - uint8x16_t row = vrhaddq_u8(avg1, A1); + uint8x16_t ax0, a0, a1, a15, d0; (void)left; - vst1q_u8(dst, row); - dst += stride; - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - vst1q_u8(dst, above_right); + a0 = vld1q_u8(above + 0); + a1 = vld1q_u8(above + 1); + a15 = vld1q_dup_u8(above + 15); + + // We want to calculate the AVG3 result in lanes 1-15 inclusive so we can + // shift in above[15] later, so shift a0 across by one to get the right + // inputs: + // [ x, above[0], ... 
, above[14] ] + ax0 = vextq_u8(a0, a0, 15); + + // d0[0] = x (don't care) + // d0[1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[15] = AVG3(above[14], above[15], above[16]); + d0 = vrhaddq_u8(vhaddq_u8(ax0, a1), a0); + + // Undo the earlier ext, incrementally shift in duplicates of above[15]. + vst1q_u8(dst + 0 * stride, vextq_u8(d0, a15, 1)); + vst1q_u8(dst + 1 * stride, vextq_u8(d0, a15, 2)); + vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 3)); + vst1q_u8(dst + 3 * stride, vextq_u8(d0, a15, 4)); + vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 5)); + vst1q_u8(dst + 5 * stride, vextq_u8(d0, a15, 6)); + vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 7)); + vst1q_u8(dst + 7 * stride, vextq_u8(d0, a15, 8)); + vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 9)); + vst1q_u8(dst + 9 * stride, vextq_u8(d0, a15, 10)); + vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 11)); + vst1q_u8(dst + 11 * stride, vextq_u8(d0, a15, 12)); + vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 13)); + vst1q_u8(dst + 13 * stride, vextq_u8(d0, a15, 14)); + vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 15)); + vst1q_u8(dst + 15 * stride, a15); } void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x16_t A0_0 = vld1q_u8(above); - const uint8x16_t A0_1 = vld1q_u8(above + 16); - const uint8x16_t above_right = vdupq_lane_u8(vget_high_u8(A0_1), 7); - const uint8x16_t A1_0 = vld1q_u8(above + 1); - const uint8x16_t A1_1 = vld1q_u8(above + 17); - const uint8x16_t A2_0 = vld1q_u8(above + 2); - const uint8x16_t A2_1 = vld1q_u8(above + 18); - const uint8x16_t avg_0 = vhaddq_u8(A0_0, A2_0); - const uint8x16_t avg_1 = vhaddq_u8(A0_1, A2_1); - uint8x16_t row_0 = vrhaddq_u8(avg_0, A1_0); - uint8x16_t row_1 = vrhaddq_u8(avg_1, A1_1); - int i; + uint8x16_t ax0, a0, a1, a15, a16, a17, a31, d0[2]; (void)left; - vst1q_u8(dst, row_0); - dst += 16; - vst1q_u8(dst, row_1); - dst += stride - 16; + a0 = vld1q_u8(above + 0); + a1 = vld1q_u8(above + 1); + a15 = vld1q_u8(above + 15); + a16 = vld1q_u8(above + 16); + a17 = vld1q_u8(above + 17); + a31 = vld1q_dup_u8(above + 31); - for (i = 0; i < 30; ++i) { - row_0 = vextq_u8(row_0, row_1, 1); - row_1 = vextq_u8(row_1, above_right, 1); - vst1q_u8(dst, row_0); - dst += 16; - vst1q_u8(dst, row_1); - dst += stride - 16; - } + // We want to calculate the AVG3 result in lanes 1-15 inclusive so we can + // shift in above[31] later, so shift a0 across by one to get the right + // inputs: + // [ x, above[0], ... , above[14] ] + ax0 = vextq_u8(a0, a0, 15); - vst1q_u8(dst, above_right); - dst += 16; - vst1q_u8(dst, row_1); + // d0[0] = x (don't care) + // d0[1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[15] = AVG3(above[14], above[15], above[16]); + d0[0] = vrhaddq_u8(vhaddq_u8(ax0, a1), a0); + d0[1] = vrhaddq_u8(vhaddq_u8(a15, a17), a16); + + // Undo the earlier ext, incrementally shift in duplicates of above[31].
+ vst1q_u8(dst + 0 * stride + 0, vextq_u8(d0[0], d0[1], 1)); + vst1q_u8(dst + 0 * stride + 16, vextq_u8(d0[1], a31, 1)); + vst1q_u8(dst + 1 * stride + 0, vextq_u8(d0[0], d0[1], 2)); + vst1q_u8(dst + 1 * stride + 16, vextq_u8(d0[1], a31, 2)); + vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0[0], d0[1], 3)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0[1], a31, 3)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(d0[0], d0[1], 4)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d0[1], a31, 4)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0[0], d0[1], 5)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0[1], a31, 5)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(d0[0], d0[1], 6)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d0[1], a31, 6)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0[0], d0[1], 7)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0[1], a31, 7)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(d0[0], d0[1], 8)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d0[1], a31, 8)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0[0], d0[1], 9)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0[1], a31, 9)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(d0[0], d0[1], 10)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d0[1], a31, 10)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0[0], d0[1], 11)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0[1], a31, 11)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(d0[0], d0[1], 12)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d0[1], a31, 12)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0[0], d0[1], 13)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0[1], a31, 13)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(d0[0], d0[1], 14)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d0[1], a31, 14)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0[0], d0[1], 15)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0[1], a31, 15)); + vst1q_u8(dst + 15 * stride + 0, d0[1]); + vst1q_u8(dst + 15 * stride + 16, a31); + + vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0[1], a31, 1)); + vst1q_u8(dst + 16 * stride + 16, a31); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(d0[1], a31, 2)); + vst1q_u8(dst + 17 * stride + 16, a31); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0[1], a31, 3)); + vst1q_u8(dst + 18 * stride + 16, a31); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(d0[1], a31, 4)); + vst1q_u8(dst + 19 * stride + 16, a31); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0[1], a31, 5)); + vst1q_u8(dst + 20 * stride + 16, a31); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(d0[1], a31, 6)); + vst1q_u8(dst + 21 * stride + 16, a31); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0[1], a31, 7)); + vst1q_u8(dst + 22 * stride + 16, a31); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(d0[1], a31, 8)); + vst1q_u8(dst + 23 * stride + 16, a31); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0[1], a31, 9)); + vst1q_u8(dst + 24 * stride + 16, a31); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(d0[1], a31, 10)); + vst1q_u8(dst + 25 * stride + 16, a31); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0[1], a31, 11)); + vst1q_u8(dst + 26 * stride + 16, a31); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(d0[1], a31, 12)); + vst1q_u8(dst + 27 * stride + 16, a31); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0[1], a31, 13)); + vst1q_u8(dst + 28 * stride + 16, a31); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(d0[1], a31, 14)); + vst1q_u8(dst + 29 * stride + 16, a31); + vst1q_u8(dst + 30 * stride + 0, vextq_u8(d0[1], a31, 15)); + vst1q_u8(dst + 30 * stride + 16, a31); + vst1q_u8(dst + 31 * stride + 0, a31); + vst1q_u8(dst + 31 * stride + 16, a31); } // 
----------------------------------------------------------------------------- From 911d6e165eb19e03ec1532fa20098b10ad402e39 Mon Sep 17 00:00:00 2001 From: George Steed Date: Fri, 17 Mar 2023 19:55:17 +0000 Subject: [PATCH 632/926] Allow non-uniform above array in d63 predictor Neon impl The existing standard bitdepth implementation doesn't appear to manifest as a failure in any of the predictor or MD5 tests, but it does rely on the predictor tests filling the second `bs` elements of the `above` input array with copies of `above[bs - 1]` in order to match the C implementation. This patch adjusts the Neon implementation to correctly match the C implementation in the case where the elements of the `above` array all differ. The geomean of performance for the predictor is approximately a 2% slowdown compared to the previous vectorized implementation. This is still considerably faster than the unspecialized naive C implementation. Bug: webm:1797 Change-Id: I8fb00a154288d54b24a72a7ff63c816bdcf3aca3 --- vpx_dsp/arm/intrapred_neon.c | 174 +++++++++++++++++++---------------- 1 file changed, 94 insertions(+), 80 deletions(-) diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index 7c225f6b71..3d117fa938 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -499,12 +499,16 @@ void vpx_d63_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, vst1_u8(dst + 0 * stride, d0); vst1_u8(dst + 1 * stride, d1); - vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 1)); - vst1_u8(dst + 3 * stride, vext_u8(d1, a7, 1)); - vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 2)); - vst1_u8(dst + 5 * stride, vext_u8(d1, a7, 2)); - vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 3)); - vst1_u8(dst + 7 * stride, vext_u8(d1, a7, 3)); + + d0 = vext_u8(d0, d0, 7); + d1 = vext_u8(d1, d1, 7); + + vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 2)); + vst1_u8(dst + 3 * stride, vext_u8(d1, a7, 2)); + vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 3)); + vst1_u8(dst + 5 * stride, vext_u8(d1, a7, 3)); + vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 4)); + vst1_u8(dst + 7 * stride, vext_u8(d1, a7, 4)); } void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, @@ -522,20 +526,24 @@ void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, vst1q_u8(dst + 0 * stride, d0); vst1q_u8(dst + 1 * stride, d1); - vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 1)); - vst1q_u8(dst + 3 * stride, vextq_u8(d1, a15, 1)); - vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 2)); - vst1q_u8(dst + 5 * stride, vextq_u8(d1, a15, 2)); - vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 3)); - vst1q_u8(dst + 7 * stride, vextq_u8(d1, a15, 3)); - vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 4)); - vst1q_u8(dst + 9 * stride, vextq_u8(d1, a15, 4)); - vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 5)); - vst1q_u8(dst + 11 * stride, vextq_u8(d1, a15, 5)); - vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 6)); - vst1q_u8(dst + 13 * stride, vextq_u8(d1, a15, 6)); - vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 7)); - vst1q_u8(dst + 15 * stride, vextq_u8(d1, a15, 7)); + + d0 = vextq_u8(d0, d0, 15); + d1 = vextq_u8(d1, d1, 15); + + vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 2)); + vst1q_u8(dst + 3 * stride, vextq_u8(d1, a15, 2)); + vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 3)); + vst1q_u8(dst + 5 * stride, vextq_u8(d1, a15, 3)); + vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 4)); + vst1q_u8(dst + 7 * stride, vextq_u8(d1, a15, 4)); + vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 5)); + vst1q_u8(dst + 9 * stride, vextq_u8(d1, a15, 5)); + vst1q_u8(dst + 
10 * stride, vextq_u8(d0, a15, 6)); + vst1q_u8(dst + 11 * stride, vextq_u8(d1, a15, 6)); + vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 7)); + vst1q_u8(dst + 13 * stride, vextq_u8(d1, a15, 7)); + vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 8)); + vst1q_u8(dst + 15 * stride, vextq_u8(d1, a15, 8)); } void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, @@ -560,66 +568,72 @@ void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, vst1q_u8(dst + 0 * stride + 16, d0_hi); vst1q_u8(dst + 1 * stride + 0, d1_lo); vst1q_u8(dst + 1 * stride + 16, d1_hi); - vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0_lo, d0_hi, 1)); - vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_hi, a31, 1)); - vst1q_u8(dst + 3 * stride + 0, vextq_u8(d1_lo, d1_hi, 1)); - vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_hi, a31, 1)); - vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0_lo, d0_hi, 2)); - vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_hi, a31, 2)); - vst1q_u8(dst + 5 * stride + 0, vextq_u8(d1_lo, d1_hi, 2)); - vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_hi, a31, 2)); - vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0_lo, d0_hi, 3)); - vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_hi, a31, 3)); - vst1q_u8(dst + 7 * stride + 0, vextq_u8(d1_lo, d1_hi, 3)); - vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_hi, a31, 3)); - vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0_lo, d0_hi, 4)); - vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_hi, a31, 4)); - vst1q_u8(dst + 9 * stride + 0, vextq_u8(d1_lo, d1_hi, 4)); - vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_hi, a31, 4)); - vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0_lo, d0_hi, 5)); - vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_hi, a31, 5)); - vst1q_u8(dst + 11 * stride + 0, vextq_u8(d1_lo, d1_hi, 5)); - vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_hi, a31, 5)); - vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0_lo, d0_hi, 6)); - vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_hi, a31, 6)); - vst1q_u8(dst + 13 * stride + 0, vextq_u8(d1_lo, d1_hi, 6)); - vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_hi, a31, 6)); - vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0_lo, d0_hi, 7)); - vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_hi, a31, 7)); - vst1q_u8(dst + 15 * stride + 0, vextq_u8(d1_lo, d1_hi, 7)); - vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_hi, a31, 7)); - vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0_lo, d0_hi, 8)); - vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_hi, a31, 8)); - vst1q_u8(dst + 17 * stride + 0, vextq_u8(d1_lo, d1_hi, 8)); - vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_hi, a31, 8)); - vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0_lo, d0_hi, 9)); - vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_hi, a31, 9)); - vst1q_u8(dst + 19 * stride + 0, vextq_u8(d1_lo, d1_hi, 9)); - vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_hi, a31, 9)); - vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0_lo, d0_hi, 10)); - vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_hi, a31, 10)); - vst1q_u8(dst + 21 * stride + 0, vextq_u8(d1_lo, d1_hi, 10)); - vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_hi, a31, 10)); - vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0_lo, d0_hi, 11)); - vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_hi, a31, 11)); - vst1q_u8(dst + 23 * stride + 0, vextq_u8(d1_lo, d1_hi, 11)); - vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_hi, a31, 11)); - vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0_lo, d0_hi, 12)); - vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_hi, a31, 12)); - vst1q_u8(dst + 25 * stride + 0, vextq_u8(d1_lo, d1_hi, 12)); - vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_hi, a31, 12)); - vst1q_u8(dst + 26 * stride + 
0, vextq_u8(d0_lo, d0_hi, 13)); - vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_hi, a31, 13)); - vst1q_u8(dst + 27 * stride + 0, vextq_u8(d1_lo, d1_hi, 13)); - vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_hi, a31, 13)); - vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0_lo, d0_hi, 14)); - vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_hi, a31, 14)); - vst1q_u8(dst + 29 * stride + 0, vextq_u8(d1_lo, d1_hi, 14)); - vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_hi, a31, 14)); - vst1q_u8(dst + 30 * stride + 0, vextq_u8(d0_lo, d0_hi, 15)); - vst1q_u8(dst + 30 * stride + 16, vextq_u8(d0_hi, a31, 15)); - vst1q_u8(dst + 31 * stride + 0, vextq_u8(d1_lo, d1_hi, 15)); - vst1q_u8(dst + 31 * stride + 16, vextq_u8(d1_hi, a31, 15)); + + d0_hi = vextq_u8(d0_lo, d0_hi, 15); + d0_lo = vextq_u8(d0_lo, d0_lo, 15); + d1_hi = vextq_u8(d1_lo, d1_hi, 15); + d1_lo = vextq_u8(d1_lo, d1_lo, 15); + + vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0_lo, d0_hi, 2)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_hi, a31, 2)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(d1_lo, d1_hi, 2)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_hi, a31, 2)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0_lo, d0_hi, 3)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_hi, a31, 3)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(d1_lo, d1_hi, 3)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_hi, a31, 3)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0_lo, d0_hi, 4)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_hi, a31, 4)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(d1_lo, d1_hi, 4)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_hi, a31, 4)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0_lo, d0_hi, 5)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_hi, a31, 5)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(d1_lo, d1_hi, 5)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_hi, a31, 5)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0_lo, d0_hi, 6)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_hi, a31, 6)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(d1_lo, d1_hi, 6)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_hi, a31, 6)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0_lo, d0_hi, 7)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_hi, a31, 7)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(d1_lo, d1_hi, 7)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_hi, a31, 7)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0_lo, d0_hi, 8)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_hi, a31, 8)); + vst1q_u8(dst + 15 * stride + 0, vextq_u8(d1_lo, d1_hi, 8)); + vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_hi, a31, 8)); + vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0_lo, d0_hi, 9)); + vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_hi, a31, 9)); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(d1_lo, d1_hi, 9)); + vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_hi, a31, 9)); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0_lo, d0_hi, 10)); + vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_hi, a31, 10)); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(d1_lo, d1_hi, 10)); + vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_hi, a31, 10)); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0_lo, d0_hi, 11)); + vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_hi, a31, 11)); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(d1_lo, d1_hi, 11)); + vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_hi, a31, 11)); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0_lo, d0_hi, 12)); + vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_hi, a31, 12)); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(d1_lo, d1_hi, 12)); + vst1q_u8(dst + 23 * stride + 16, 
vextq_u8(d1_hi, a31, 12)); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0_lo, d0_hi, 13)); + vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_hi, a31, 13)); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(d1_lo, d1_hi, 13)); + vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_hi, a31, 13)); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0_lo, d0_hi, 14)); + vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_hi, a31, 14)); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(d1_lo, d1_hi, 14)); + vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_hi, a31, 14)); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0_lo, d0_hi, 15)); + vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_hi, a31, 15)); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(d1_lo, d1_hi, 15)); + vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_hi, a31, 15)); + vst1q_u8(dst + 30 * stride + 0, d0_hi); + vst1q_u8(dst + 30 * stride + 16, a31); + vst1q_u8(dst + 31 * stride + 0, d1_hi); + vst1q_u8(dst + 31 * stride + 16, a31); } // ----------------------------------------------------------------------------- From 100ca0356ddf67e92da35699d92bc180429d0bc1 Mon Sep 17 00:00:00 2001 From: George Steed Date: Fri, 17 Mar 2023 20:00:24 +0000 Subject: [PATCH 633/926] Randomize second half of above_row_ in intrapred tests for Neon The existing tests duplicate `above_row_[block_size - 1]` after the first `block_size` elements, which can lead to tests incorrectly passing due to differing behaviour when calculating the average for the last elements of the output. This change adjusts the above array setup to be fully random instead, allowing us to catch such issues here rather than in other larger tests like the external MD5 tests. It doesn't appear that other architectures are fully clean with this change so restrict it to just Neon for now until they are fixed. Bug: webm:1797 Change-Id: If83ff1adbf1e8d30f2a92474d7186c65840a5d0b --- test/vp9_intrapred_test.cc | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index cec9031618..6de7cf8d0f 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -55,6 +55,21 @@ class IntraPredTest : public ::testing::TestWithParam { ref_dst_ = ref_dst; int error_count = 0; for (int i = 0; i < count_test_block; ++i) { + // TODO(webm:1797): Some of the optimised predictor implementations rely + // on the trailing half of the above_row_ being a copy of the final + // element, however relying on this in some cases can cause the MD5 tests + // to fail. We have fixed all of these cases for Neon, so fill the whole + // of above_row_ randomly. +#if HAVE_NEON + // Fill edges with random data, try first with saturated values. + for (int x = -1; x < 2 * block_size; x++) { + if (i == 0) { + above_row_[x] = mask_; + } else { + above_row_[x] = rnd.Rand16() & mask_; + } + } +#else // Fill edges with random data, try first with saturated values. 
for (int x = -1; x < block_size; x++) { if (i == 0) { @@ -66,6 +81,7 @@ class IntraPredTest : public ::testing::TestWithParam { for (int x = block_size; x < 2 * block_size; x++) { above_row_[x] = above_row_[block_size - 1]; } +#endif for (int y = 0; y < block_size; y++) { if (i == 0) { left_col_[y] = mask_; From 972149cafeb71d6f08df89e91a0130d6a38c4b15 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 28 Mar 2023 10:09:16 -0400 Subject: [PATCH 634/926] svc: Fix a case where target bandwidth is 0 Bug: webrtc:15033 Change-Id: I28636de66842671b03284408186c4c18254109a5 --- vp9/encoder/vp9_svc_layercontext.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index c60445cba5..ac8b09cf31 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -220,7 +220,9 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, RATE_CONTROL *const lrc = &lc->rc; lc->spatial_layer_target_bandwidth = spatial_layer_target; - bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; + if (target_bandwidth != 0) { + bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; + } lrc->starting_buffer_level = (int64_t)(rc->starting_buffer_level * bitrate_alloc); lrc->optimal_buffer_level = From 4cf9819282aa123e8b126731ef5629ee5144cd86 Mon Sep 17 00:00:00 2001 From: George Steed Date: Tue, 21 Mar 2023 14:31:50 +0000 Subject: [PATCH 635/926] Avoid LD2/ST2 instructions in vpx_dc_predictor_32x32_neon The LD2 and ST2 instructions are useful if we are dealing with interleaved data (e.g. real/imag components of complex numbers), but for simply loading or storing larger quantities of data it is preferable to simply use two of the normal load/store instructions. This patch replaces such occurrences in vpx_dc_predictor_32x32_neon and related functions. With Clang-15 this speeds up this function by 10-30% depending on the micro-architecture being benchmarked on. With GCC-12 this speeds up the function by 40-60% depending on the micro-architecture being benchmarked on. 
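To make the trade-off concrete, here is a minimal standalone sketch of the two access patterns (illustrative only, not part of this patch; the copy32_* helper names are hypothetical). LD2 deinterleaves as it loads, splitting even- and odd-indexed bytes into separate registers, which is wasted work when the data is simply contiguous:

  #include <arm_neon.h>

  /* Copy 32 contiguous bytes with a deinterleaving LD2/ST2 pair: the
   * deinterleave on load has to be undone by the re-interleave on store. */
  static void copy32_ld2(uint8_t *dst, const uint8_t *src) {
    const uint8x16x2_t r = vld2q_u8(src); /* r.val[0]: even bytes, r.val[1]: odd bytes */
    vst2q_u8(dst, r);
  }

  /* Copy the same 32 bytes with two plain LD1/ST1 pairs; no shuffling. */
  static void copy32_ld1(uint8_t *dst, const uint8_t *src) {
    const uint8x16_t r0 = vld1q_u8(src + 0);
    const uint8x16_t r1 = vld1q_u8(src + 16);
    vst1q_u8(dst + 0, r0);
    vst1q_u8(dst + 16, r1);
  }

Both versions leave identical bytes in dst; the second simply skips the (de)interleaving permutes that LD2/ST2 imply on most micro-architectures.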
Change-Id: I670dc37908aa238f360104efd74d6c2108ecf945 --- vpx_dsp/arm/intrapred_neon.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index 892310f151..b7f2a11ca2 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -193,9 +193,10 @@ void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, // DC 32x32 static INLINE uint16x4_t dc_sum_32(const uint8_t *ref) { - const uint8x16x2_t r = vld2q_u8(ref); - const uint16x8_t p0 = vpaddlq_u8(r.val[0]); - const uint16x8_t p1 = vpaddlq_u8(r.val[1]); + const uint8x16_t r0 = vld1q_u8(ref + 0); + const uint8x16_t r1 = vld1q_u8(ref + 16); + const uint16x8_t p0 = vpaddlq_u8(r0); + const uint16x8_t p1 = vpaddlq_u8(r1); const uint16x8_t p2 = vaddq_u16(p0, p1); uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); sum = vpadd_u16(sum, sum); @@ -204,23 +205,24 @@ static INLINE uint16x4_t dc_sum_32(const uint8_t *ref) { static INLINE void dc_store_32x32(uint8_t *dst, ptrdiff_t stride, const uint8x8_t dc) { - uint8x16x2_t dc_dup; + uint8x16_t dc_dup = vdupq_lane_u8(dc, 0); int i; - dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u8(dc, 0); - for (i = 0; i < 32; ++i, dst += stride) { - vst2q_u8(dst, dc_dup); + vst1q_u8(dst + 0, dc_dup); + vst1q_u8(dst + 16, dc_dup); } } void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x16x2_t a = vld2q_u8(above); - const uint8x16x2_t l = vld2q_u8(left); - const uint16x8_t pa0 = vpaddlq_u8(a.val[0]); - const uint16x8_t pl0 = vpaddlq_u8(l.val[0]); - const uint16x8_t pa1 = vpaddlq_u8(a.val[1]); - const uint16x8_t pl1 = vpaddlq_u8(l.val[1]); + const uint8x16_t a0 = vld1q_u8(above + 0); + const uint8x16_t a1 = vld1q_u8(above + 16); + const uint8x16_t l0 = vld1q_u8(left + 0); + const uint8x16_t l1 = vld1q_u8(left + 16); + const uint16x8_t pa0 = vpaddlq_u8(a0); + const uint16x8_t pl0 = vpaddlq_u8(l0); + const uint16x8_t pa1 = vpaddlq_u8(a1); + const uint16x8_t pl1 = vpaddlq_u8(l1); const uint16x8_t pa = vaddq_u16(pa0, pa1); const uint16x8_t pl = vaddq_u16(pl0, pl1); const uint16x8_t pal = vaddq_u16(pa, pl); From 83def747ff316d283c949458a4b890b23e5e0b8b Mon Sep 17 00:00:00 2001 From: George Steed Date: Wed, 22 Mar 2023 08:44:26 +0000 Subject: [PATCH 636/926] Avoid interleaving loads/stores in Neon for highbd dc predictor The interleaving load/store instructions (LD2/LD3/LD4 and ST2/ST3/ST4) are useful if we are dealing with interleaved data (e.g. real/imag components of complex numbers), but for simply loading or storing larger quantities of data it is preferable to simply use two or more of the normal load/store instructions. This patch replaces such occurrences in the two larger block sizes: vpx_highbd_dc_predictor_16x16_neon, vpx_highbd_dc_predictor_32x32_neon, and related helper functions. Speedups over the original Neon code (higher is better): Microarch. 
| Compiler | Block | Speedup
Neoverse N1 | LLVM 15 | 16x16 | 1.25
Neoverse N1 | LLVM 15 | 32x32 | 1.13
Neoverse N1 | GCC 12 | 16x16 | 1.56
Neoverse N1 | GCC 12 | 32x32 | 1.52
Neoverse V1 | LLVM 15 | 16x16 | 1.63
Neoverse V1 | LLVM 15 | 32x32 | 1.08
Neoverse V1 | GCC 12 | 16x16 | 1.59
Neoverse V1 | GCC 12 | 32x32 | 1.37
Change-Id: If5ec220aba9dd19785454eabb0f3d6affec0cc8b --- vpx_dsp/arm/highbd_intrapred_neon.c | 61 +++++++++++++++++------------ 1 file changed, 36 insertions(+), 25 deletions(-) diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c index 503900915d..b2aea14f7b 100644 --- a/vpx_dsp/arm/highbd_intrapred_neon.c +++ b/vpx_dsp/arm/highbd_intrapred_neon.c @@ -143,8 +143,9 @@ void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, // DC 16x16 static INLINE uint16x4_t dc_sum_16(const uint16_t *ref) { - const uint16x8x2_t ref_u16 = vld2q_u16(ref); - const uint16x8_t p0 = vaddq_u16(ref_u16.val[0], ref_u16.val[1]); + const uint16x8_t ref_u16_0 = vld1q_u16(ref + 0); + const uint16x8_t ref_u16_1 = vld1q_u16(ref + 8); + const uint16x8_t p0 = vaddq_u16(ref_u16_0, ref_u16_1); uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); sum = vpadd_u16(sum, sum); return vpadd_u16(sum, sum); @@ -152,21 +153,23 @@ static INLINE uint16x4_t dc_sum_16(const uint16_t *ref) { static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride, const uint16x4_t dc) { - uint16x8x2_t dc_dup; + uint16x8_t dc_dup = vdupq_lane_u16(dc, 0); int i; - dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0); for (i = 0; i < 16; ++i, dst += stride) { - vst2q_u16(dst, dc_dup); + vst1q_u16(dst + 0, dc_dup); + vst1q_u16(dst + 8, dc_dup); } } void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8x2_t a = vld2q_u16(above); - const uint16x8x2_t l = vld2q_u16(left); - const uint16x8_t pa = vaddq_u16(a.val[0], a.val[1]); - const uint16x8_t pl = vaddq_u16(l.val[0], l.val[1]); + const uint16x8_t a0 = vld1q_u16(above + 0); + const uint16x8_t a1 = vld1q_u16(above + 8); + const uint16x8_t l0 = vld1q_u16(left + 0); + const uint16x8_t l1 = vld1q_u16(left + 8); + const uint16x8_t pa = vaddq_u16(a0, a1); + const uint16x8_t pl = vaddq_u16(l0, l1); const uint16x8_t pal0 = vaddq_u16(pa, pl); uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0)); uint32x2_t sum; @@ -211,9 +214,12 @@ void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, // DC 32x32 static INLINE uint32x2_t dc_sum_32(const uint16_t *ref) { - const uint16x8x4_t r = vld4q_u16(ref); - const uint16x8_t p0 = vaddq_u16(r.val[0], r.val[1]); - const uint16x8_t p1 = vaddq_u16(r.val[2], r.val[3]); + const uint16x8_t r0 = vld1q_u16(ref + 0); + const uint16x8_t r1 = vld1q_u16(ref + 8); + const uint16x8_t r2 = vld1q_u16(ref + 16); + const uint16x8_t r3 = vld1q_u16(ref + 24); + const uint16x8_t p0 = vaddq_u16(r0, r1); + const uint16x8_t p1 = vaddq_u16(r2, r3); const uint16x8_t p2 = vaddq_u16(p0, p1); uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); sum = vpadd_u16(sum, sum); @@ -222,27 +228,32 @@ static INLINE uint32x2_t dc_sum_32(const uint16_t *ref) { static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride, const uint16x4_t dc) { - uint16x8x2_t dc_dup; + uint16x8_t dc_dup = vdupq_lane_u16(dc, 0); int i; - dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0); - for (i = 0; i < 32; ++i) { - vst2q_u16(dst, dc_dup); - dst += 16; - vst2q_u16(dst, dc_dup); - dst += stride - 16; +
vst1q_u16(dst + 0, dc_dup); + vst1q_u16(dst + 8, dc_dup); + vst1q_u16(dst + 16, dc_dup); + vst1q_u16(dst + 24, dc_dup); + dst += stride; } } void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8x4_t a = vld4q_u16(above); - const uint16x8x4_t l = vld4q_u16(left); - const uint16x8_t pa0 = vaddq_u16(a.val[0], a.val[1]); - const uint16x8_t pa1 = vaddq_u16(a.val[2], a.val[3]); - const uint16x8_t pl0 = vaddq_u16(l.val[0], l.val[1]); - const uint16x8_t pl1 = vaddq_u16(l.val[2], l.val[3]); + const uint16x8_t a0 = vld1q_u16(above + 0); + const uint16x8_t a1 = vld1q_u16(above + 8); + const uint16x8_t a2 = vld1q_u16(above + 16); + const uint16x8_t a3 = vld1q_u16(above + 24); + const uint16x8_t l0 = vld1q_u16(left + 0); + const uint16x8_t l1 = vld1q_u16(left + 8); + const uint16x8_t l2 = vld1q_u16(left + 16); + const uint16x8_t l3 = vld1q_u16(left + 24); + const uint16x8_t pa0 = vaddq_u16(a0, a1); + const uint16x8_t pa1 = vaddq_u16(a2, a3); + const uint16x8_t pl0 = vaddq_u16(l0, l1); + const uint16x8_t pl1 = vaddq_u16(l2, l3); const uint16x8_t pa = vaddq_u16(pa0, pa1); const uint16x8_t pl = vaddq_u16(pl0, pl1); const uint16x8_t pal0 = vaddq_u16(pa, pl); From 9824167ad292ee42c9c97f3e6ce1d9ca90bf679f Mon Sep 17 00:00:00 2001 From: George Steed Date: Wed, 22 Mar 2023 11:49:33 +0000 Subject: [PATCH 637/926] Avoid LD2/ST2 instructions in highbd v predictors in Neon The interleaving load/store instructions (LD2/LD3/LD4 and ST2/ST3/ST4) are useful if we are dealing with interleaved data (e.g. real/imag components of complex numbers), but for simply loading or storing larger quantities of data it is preferable to simply use the normal load/store instructions. This patch replaces such occurrences in the two larger block sizes: vpx_highbd_v_predictor_16x16_neon and vpx_highbd_v_predictor_32x32_neon. 
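For contrast, a small sketch of a case where the interleaving instructions do earn their keep (illustrative only, not part of this patch; split_complex is a hypothetical helper). When the memory layout really is interleaved, LD2 performs the deinterleave as part of the load:

  #include <arm_neon.h>

  /* Split interleaved complex data {re0, im0, re1, im1, ...} into
   * separate re[] and im[] arrays: vld2q_f32 separates the pairs in a
   * single deinterleaving load. */
  static void split_complex(float *re, float *im, const float *interleaved) {
    const float32x4x2_t c = vld2q_f32(interleaved); /* loads 4 re/im pairs */
    vst1q_f32(re, c.val[0]); /* re0..re3 */
    vst1q_f32(im, c.val[1]); /* im0..im3 */
  }

Here the deinterleave is useful work that would otherwise need explicit UZP shuffles; the predictor rows stored in this patch have no such structure to undo, so plain LD1/ST1 wins.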
Change-Id: Ie4ffa298a2466ceaf893566fd0aefe3f66f439e4 --- vpx_dsp/arm/highbd_intrapred_neon.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c index b2aea14f7b..ec97094be6 100644 --- a/vpx_dsp/arm/highbd_intrapred_neon.c +++ b/vpx_dsp/arm/highbd_intrapred_neon.c @@ -2166,30 +2166,36 @@ void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8x2_t row = vld2q_u16(above); + const uint16x8_t row0 = vld1q_u16(above + 0); + const uint16x8_t row1 = vld1q_u16(above + 8); int i; (void)left; (void)bd; - for (i = 0; i < 16; i++, dst += stride) { - vst2q_u16(dst, row); + for (i = 0; i < 16; i++) { + vst1q_u16(dst + 0, row0); + vst1q_u16(dst + 8, row1); + dst += stride; } } void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8x2_t row0 = vld2q_u16(above); - const uint16x8x2_t row1 = vld2q_u16(above + 16); + const uint16x8_t row0 = vld1q_u16(above + 0); + const uint16x8_t row1 = vld1q_u16(above + 8); + const uint16x8_t row2 = vld1q_u16(above + 16); + const uint16x8_t row3 = vld1q_u16(above + 24); int i; (void)left; (void)bd; for (i = 0; i < 32; i++) { - vst2q_u16(dst, row0); - dst += 16; - vst2q_u16(dst, row1); - dst += stride - 16; + vst1q_u16(dst + 0, row0); + vst1q_u16(dst + 8, row1); + vst1q_u16(dst + 16, row2); + vst1q_u16(dst + 24, row3); + dst += stride; } } From cf1efecebf0ed2e01bafea6804c98f80ab7e12e0 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Mon, 27 Mar 2023 14:31:40 +0100 Subject: [PATCH 638/926] Optimize Neon paths of high bitdepth SAD and SAD4d for 8xh blocks For these block sizes there is no need to widen to 32-bits until the final reduction, so use a single vabaq instead of vabd + vpadalq. 
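The headroom argument can be made explicit: each of the eight lanes accumulates at most h absolute differences of 12-bit samples, i.e. at most 16 * 4095 = 65520 for the largest 8x16 block, which still fits in a uint16_t. A rough standalone sketch of the pattern (the sad8xh_sketch helper is hypothetical and takes plain uint16_t pointers, unlike the library code, which operates on CONVERT_TO_SHORTPTR'd buffers):

  #include <arm_neon.h>

  static uint32_t sad8xh_sketch(const uint16_t *src, int src_stride,
                                const uint16_t *ref, int ref_stride, int h) {
    uint16x8_t sum = vdupq_n_u16(0);
    do {
      /* One vabaq_u16 (sum += |src - ref|) replaces the old vabdq_u16 +
       * vpadalq_u16 pair; safe as long as h * 4095 < 65536. */
      sum = vabaq_u16(sum, vld1q_u16(src), vld1q_u16(ref));
      src += src_stride;
      ref += ref_stride;
    } while (--h != 0);
    /* Widen to 32 bits only once, for the final horizontal reduction. */
    {
      const uint32x4_t sum32 = vpaddlq_u16(sum);
      const uint64x2_t sum64 = vpaddlq_u32(sum32);
      return (uint32_t)(vgetq_lane_u64(sum64, 0) + vgetq_lane_u64(sum64, 1));
    }
  }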
Change-Id: I9c19d620f7bb8b3a6b0bedd37789c03bb628b563 --- vpx_dsp/arm/highbd_sad4d_neon.c | 31 ++++++++++++++++++------------- vpx_dsp/arm/highbd_sad_neon.c | 7 +++---- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/vpx_dsp/arm/highbd_sad4d_neon.c b/vpx_dsp/arm/highbd_sad4d_neon.c index f731d38cc1..280d2087f7 100644 --- a/vpx_dsp/arm/highbd_sad4d_neon.c +++ b/vpx_dsp/arm/highbd_sad4d_neon.c @@ -48,12 +48,6 @@ static INLINE void highbd_sad4xhx4d_neon(const uint8_t *src_ptr, int src_stride, vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); } -static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref, - uint32x4_t *const sad_sum) { - uint16x8_t abs_diff = vabdq_u16(src, ref); - *sad_sum = vpadalq_u16(*sad_sum, abs_diff); -} - static INLINE void highbd_sad8xhx4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], @@ -64,21 +58,32 @@ static INLINE void highbd_sad8xhx4d_neon(const uint8_t *src_ptr, int src_stride, const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); - uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0) }; + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint32x4_t sum_u32[4]; int i = 0; do { uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride); - sad8_neon(s, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum[0]); - sad8_neon(s, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum[1]); - sad8_neon(s, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum[2]); - sad8_neon(s, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum[3]); + sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride)); + sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride)); + sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride)); + sum[3] = vabaq_u16(sum[3], s, vld1q_u16(ref16_ptr3 + i * ref_stride)); } while (++i < h); - vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); + sum_u32[0] = vpaddlq_u16(sum[0]); + sum_u32[1] = vpaddlq_u16(sum[1]); + sum_u32[2] = vpaddlq_u16(sum[2]); + sum_u32[3] = vpaddlq_u16(sum[3]); + vst1q_u32(res, horizontal_add_4d_uint32x4(sum_u32)); +} + +static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref, + uint32x4_t *const sad_sum) { + uint16x8_t abs_diff = vabdq_u16(src, ref); + *sad_sum = vpadalq_u16(*sad_sum, abs_diff); } static INLINE void highbd_sad16xhx4d_neon(const uint8_t *src_ptr, diff --git a/vpx_dsp/arm/highbd_sad_neon.c b/vpx_dsp/arm/highbd_sad_neon.c index 90971f6009..813710040b 100644 --- a/vpx_dsp/arm/highbd_sad_neon.c +++ b/vpx_dsp/arm/highbd_sad_neon.c @@ -44,20 +44,19 @@ static INLINE uint32_t highbd_sad8xh_neon(const uint8_t *src_ptr, int ref_stride, int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); - uint32x4_t sum = vdupq_n_u32(0); + uint16x8_t sum = vdupq_n_u16(0); int i = h; do { uint16x8_t s = vld1q_u16(src16_ptr); uint16x8_t r = vld1q_u16(ref16_ptr); - uint16x8_t diff = vabdq_u16(s, r); - sum = vpadalq_u16(sum, diff); + sum = vabaq_u16(sum, s, r); src16_ptr += src_stride; ref16_ptr += ref_stride; } while (--i != 0); - return horizontal_add_uint32x4(sum); + return horizontal_add_uint16x8(sum); } static INLINE uint32_t highbd_sad16xh_neon(const uint8_t *src_ptr, From 0f893ea0b6a8843d825f97b12ecf78443f0e93d2 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 29 Mar 2023 13:06:19 -0400 Subject: [PATCH 639/926] svc: Fix a case where target 
bandwidth is 0 Bug: webrtc:15033 Change-Id: Iea2997c2ce8982f106a1eed3ec4f7dd1c6e83666 --- vp9/encoder/vp9_svc_layercontext.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index ac8b09cf31..83b6e5c99d 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -254,7 +254,9 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, lc->target_bandwidth = oxcf->layer_target_bitrate[layer]; - bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; + if (target_bandwidth != 0) { + bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; + } // Update buffer-related quantities. lrc->starting_buffer_level = (int64_t)(rc->starting_buffer_level * bitrate_alloc); From c1c7dd3138796c512c441fba901d1a48ef7d61d1 Mon Sep 17 00:00:00 2001 From: George Steed Date: Mon, 27 Mar 2023 08:47:58 +0000 Subject: [PATCH 640/926] Use sum_neon.h helpers in Neon DC predictors Use sum_neon.h helpers for horizontal reductions in Neon DC predictors, enabling use of dedicated Neon reduction instructions on AArch64. Some of the surrounding code is also optimized to remove redundant broadcast instructions in the dc_store helpers. Performance is largely unchanged on both the standard as well as the high bit-depth predictors. The main improvement appears to be the 16x16 standard-bitdepth dc predictor, which improves by 10-15% when benchmarked on Neoverse N1. Change-Id: Ibfcc6ecf4b1b2f87ce1e1f63c314d0cc35a0c76f --- vpx_dsp/arm/highbd_intrapred_neon.c | 118 ++++++++++------------- vpx_dsp/arm/intrapred_neon.c | 139 +++++++++++----------------- vpx_dsp/arm/sum_neon.h | 43 +++++++++ 3 files changed, 143 insertions(+), 157 deletions(-) diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c index b4a69017d2..235cb5b996 100644 --- a/vpx_dsp/arm/highbd_intrapred_neon.c +++ b/vpx_dsp/arm/highbd_intrapred_neon.c @@ -12,23 +12,22 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "sum_neon.h" #include "vpx/vpx_integer.h" //------------------------------------------------------------------------------ // DC 4x4 -static INLINE uint16x4_t dc_sum_4(const uint16_t *ref) { +static INLINE uint16_t dc_sum_4(const uint16_t *ref) { const uint16x4_t ref_u16 = vld1_u16(ref); - const uint16x4_t p0 = vpadd_u16(ref_u16, ref_u16); - return vpadd_u16(p0, p0); + return horizontal_add_uint16x4(ref_u16); } static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride, const uint16x4_t dc) { - const uint16x4_t dc_dup = vdup_lane_u16(dc, 0); int i; for (i = 0; i < 4; ++i, dst += stride) { - vst1_u16(dst, dc_dup); + vst1_u16(dst, dc); } } @@ -37,21 +36,17 @@ void vpx_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int bd) { const uint16x4_t a = vld1_u16(above); const uint16x4_t l = vld1_u16(left); - uint16x4_t sum; - uint16x4_t dc; + const uint16_t sum = horizontal_add_uint16x4(vadd_u16(a, l)); + const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 3); (void)bd; - sum = vadd_u16(a, l); - sum = vpadd_u16(sum, sum); - sum = vpadd_u16(sum, sum); - dc = vrshr_n_u16(sum, 3); dc_store_4x4(dst, stride, dc); } void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_4(left); - const uint16x4_t dc = vrshr_n_u16(sum, 2); + const uint16_t sum = dc_sum_4(left); + const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 2); (void)above; 
(void)bd; dc_store_4x4(dst, stride, dc); @@ -60,8 +55,8 @@ void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_top_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_4(above); - const uint16x4_t dc = vrshr_n_u16(sum, 2); + const uint16_t sum = dc_sum_4(above); + const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 2); (void)left; (void)bd; dc_store_4x4(dst, stride, dc); @@ -79,19 +74,16 @@ void vpx_highbd_dc_128_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 8x8 -static INLINE uint16x4_t dc_sum_8(const uint16_t *ref) { +static INLINE uint16_t dc_sum_8(const uint16_t *ref) { const uint16x8_t ref_u16 = vld1q_u16(ref); - uint16x4_t sum = vadd_u16(vget_low_u16(ref_u16), vget_high_u16(ref_u16)); - sum = vpadd_u16(sum, sum); - return vpadd_u16(sum, sum); + return horizontal_add_uint16x8(ref_u16); } static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride, - const uint16x4_t dc) { - const uint16x8_t dc_dup = vdupq_lane_u16(dc, 0); + const uint16x8_t dc) { int i; for (i = 0; i < 8; ++i, dst += stride) { - vst1q_u16(dst, dc_dup); + vst1q_u16(dst, dc); } } @@ -101,20 +93,17 @@ void vpx_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16x8_t above_u16 = vld1q_u16(above); const uint16x8_t left_u16 = vld1q_u16(left); const uint16x8_t p0 = vaddq_u16(above_u16, left_u16); - uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - uint16x4_t dc; + const uint16_t sum = horizontal_add_uint16x8(p0); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4); (void)bd; - sum = vpadd_u16(sum, sum); - sum = vpadd_u16(sum, sum); - dc = vrshr_n_u16(sum, 4); dc_store_8x8(dst, stride, dc); } void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_8(left); - const uint16x4_t dc = vrshr_n_u16(sum, 3); + const uint16_t sum = dc_sum_8(left); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 3); (void)above; (void)bd; dc_store_8x8(dst, stride, dc); @@ -123,8 +112,8 @@ void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_8(above); - const uint16x4_t dc = vrshr_n_u16(sum, 3); + const uint16_t sum = dc_sum_8(above); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 3); (void)left; (void)bd; dc_store_8x8(dst, stride, dc); @@ -133,7 +122,7 @@ void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1)); (void)above; (void)left; dc_store_8x8(dst, stride, dc); @@ -142,22 +131,19 @@ void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 16x16 -static INLINE uint16x4_t dc_sum_16(const uint16_t *ref) { +static INLINE uint16_t dc_sum_16(const uint16_t *ref) { const uint16x8_t ref_u16_0 = vld1q_u16(ref + 0); const uint16x8_t ref_u16_1 = vld1q_u16(ref + 8); const uint16x8_t p0 = vaddq_u16(ref_u16_0, ref_u16_1); - uint16x4_t sum = 
vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - sum = vpadd_u16(sum, sum); - return vpadd_u16(sum, sum); + return horizontal_add_uint16x8(p0); } static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride, - const uint16x4_t dc) { - uint16x8_t dc_dup = vdupq_lane_u16(dc, 0); + const uint16x8_t dc) { int i; for (i = 0; i < 16; ++i, dst += stride) { - vst1q_u16(dst + 0, dc_dup); - vst1q_u16(dst + 8, dc_dup); + vst1q_u16(dst + 0, dc); + vst1q_u16(dst + 8, dc); } } @@ -171,21 +157,17 @@ void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16x8_t pa = vaddq_u16(a0, a1); const uint16x8_t pl = vaddq_u16(l0, l1); const uint16x8_t pal0 = vaddq_u16(pa, pl); - uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0)); - uint32x2_t sum; - uint16x4_t dc; + const uint32_t sum = horizontal_add_uint16x8(pal0); + const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0); (void)bd; - pal1 = vpadd_u16(pal1, pal1); - sum = vpaddl_u16(pal1); - dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); dc_store_16x16(dst, stride, dc); } void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_16(left); - const uint16x4_t dc = vrshr_n_u16(sum, 4); + const uint16_t sum = dc_sum_16(left); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4); (void)above; (void)bd; dc_store_16x16(dst, stride, dc); @@ -194,8 +176,8 @@ void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_16(above); - const uint16x4_t dc = vrshr_n_u16(sum, 4); + const uint16_t sum = dc_sum_16(above); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4); (void)left; (void)bd; dc_store_16x16(dst, stride, dc); @@ -204,7 +186,7 @@ void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1)); (void)above; (void)left; dc_store_16x16(dst, stride, dc); @@ -213,7 +195,7 @@ void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 32x32 -static INLINE uint32x2_t dc_sum_32(const uint16_t *ref) { +static INLINE uint32_t dc_sum_32(const uint16_t *ref) { const uint16x8_t r0 = vld1q_u16(ref + 0); const uint16x8_t r1 = vld1q_u16(ref + 8); const uint16x8_t r2 = vld1q_u16(ref + 16); @@ -221,20 +203,17 @@ static INLINE uint32x2_t dc_sum_32(const uint16_t *ref) { const uint16x8_t p0 = vaddq_u16(r0, r1); const uint16x8_t p1 = vaddq_u16(r2, r3); const uint16x8_t p2 = vaddq_u16(p0, p1); - uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - sum = vpadd_u16(sum, sum); - return vpaddl_u16(sum); + return horizontal_add_uint16x8(p2); } static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride, - const uint16x4_t dc) { - uint16x8_t dc_dup = vdupq_lane_u16(dc, 0); + const uint16x8_t dc) { int i; for (i = 0; i < 32; ++i) { - vst1q_u16(dst + 0, dc_dup); - vst1q_u16(dst + 8, dc_dup); - vst1q_u16(dst + 16, dc_dup); - vst1q_u16(dst + 24, dc_dup); + vst1q_u16(dst + 0, dc); + vst1q_u16(dst + 8, dc); + vst1q_u16(dst + 16, dc); + vst1q_u16(dst + 24, dc); dst += 
stride; } } @@ -257,20 +236,17 @@ void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16x8_t pa = vaddq_u16(pa0, pa1); const uint16x8_t pl = vaddq_u16(pl0, pl1); const uint16x8_t pal0 = vaddq_u16(pa, pl); - const uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0)); - uint32x2_t sum = vpaddl_u16(pal1); - uint16x4_t dc; + const uint32_t sum = horizontal_add_uint16x8(pal0); + const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 6), 0); (void)bd; - sum = vpadd_u32(sum, sum); - dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 6)); dc_store_32x32(dst, stride, dc); } void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint32x2_t sum = dc_sum_32(left); - const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); + const uint32_t sum = dc_sum_32(left); + const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0); (void)above; (void)bd; dc_store_32x32(dst, stride, dc); @@ -279,8 +255,8 @@ void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint32x2_t sum = dc_sum_32(above); - const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); + const uint32_t sum = dc_sum_32(above); + const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0); (void)left; (void)bd; dc_store_32x32(dst, stride, dc); @@ -289,7 +265,7 @@ void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1)); (void)above; (void)left; dc_store_32x32(dst, stride, dc); diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index d1f6f6da9f..d9b4db2eab 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -13,51 +13,46 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "mem_neon.h" +#include "sum_neon.h" #include "vpx/vpx_integer.h" //------------------------------------------------------------------------------ // DC 4x4 -static INLINE uint16x4_t dc_sum_4(const uint8_t *ref) { - const uint8x8_t ref_u8 = vld1_u8(ref); - const uint16x4_t p0 = vpaddl_u8(ref_u8); - return vpadd_u16(p0, p0); +static INLINE uint16_t dc_sum_4(const uint8_t *ref) { + return horizontal_add_uint8x4(load_unaligned_u8_4x1(ref)); } static INLINE void dc_store_4x4(uint8_t *dst, ptrdiff_t stride, const uint8x8_t dc) { - const uint8x8_t dc_dup = vdup_lane_u8(dc, 0); int i; for (i = 0; i < 4; ++i, dst += stride) { - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dc_dup), 0); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dc), 0); } } void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t a = vld1_u8(above); - const uint8x8_t l = vld1_u8(left); - const uint16x8_t al = vaddl_u8(a, l); - uint16x4_t sum; - uint8x8_t dc; - sum = vpadd_u16(vget_low_u16(al), vget_low_u16(al)); - sum = vpadd_u16(sum, sum); - dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); + const uint8x8_t a = load_unaligned_u8_4x1(above); + const uint8x8_t l = load_unaligned_u8_4x1(left); + const uint16x4_t al = vget_low_u16(vaddl_u8(a, l)); + const uint16_t sum = 
horizontal_add_uint16x4(al); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3); dc_store_4x4(dst, stride, dc); } void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_4(left); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2)); + const uint16_t sum = dc_sum_4(left); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 2); (void)above; dc_store_4x4(dst, stride, dc); } void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_4(above); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2)); + const uint16_t sum = dc_sum_4(above); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 2); (void)left; dc_store_4x4(dst, stride, dc); } @@ -73,19 +68,15 @@ void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 8x8 -static INLINE uint16x4_t dc_sum_8(const uint8_t *ref) { - const uint8x8_t ref_u8 = vld1_u8(ref); - uint16x4_t sum = vpaddl_u8(ref_u8); - sum = vpadd_u16(sum, sum); - return vpadd_u16(sum, sum); +static INLINE uint16_t dc_sum_8(const uint8_t *ref) { + return horizontal_add_uint8x8(vld1_u8(ref)); } static INLINE void dc_store_8x8(uint8_t *dst, ptrdiff_t stride, const uint8x8_t dc) { - const uint8x8_t dc_dup = vdup_lane_u8(dc, 0); int i; for (i = 0; i < 8; ++i, dst += stride) { - vst1_u8(dst, dc_dup); + vst1_u8(dst, dc); } } @@ -93,28 +84,24 @@ void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x8_t above_u8 = vld1_u8(above); const uint8x8_t left_u8 = vld1_u8(left); - const uint8x16_t above_and_left = vcombine_u8(above_u8, left_u8); - const uint16x8_t p0 = vpaddlq_u8(above_and_left); - uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - uint8x8_t dc; - sum = vpadd_u16(sum, sum); - sum = vpadd_u16(sum, sum); - dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); + const uint16x8_t al = vaddl_u8(above_u8, left_u8); + const uint16_t sum = horizontal_add_uint16x8(al); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 4); dc_store_8x8(dst, stride, dc); } void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_8(left); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); + const uint16_t sum = dc_sum_8(left); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3); (void)above; dc_store_8x8(dst, stride, dc); } void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_8(above); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); + const uint16_t sum = dc_sum_8(above); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3); (void)left; dc_store_8x8(dst, stride, dc); } @@ -130,20 +117,15 @@ void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 16x16 -static INLINE uint16x4_t dc_sum_16(const uint8_t *ref) { - const uint8x16_t ref_u8 = vld1q_u8(ref); - const uint16x8_t p0 = vpaddlq_u8(ref_u8); - uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - sum = vpadd_u16(sum, sum); - return vpadd_u16(sum, sum); +static INLINE uint16_t dc_sum_16(const uint8_t *ref) { + return horizontal_add_uint8x16(vld1q_u8(ref)); } 
static INLINE void dc_store_16x16(uint8_t *dst, ptrdiff_t stride, - const uint8x8_t dc) { - const uint8x16_t dc_dup = vdupq_lane_u8(dc, 0); + const uint8x16_t dc) { int i; for (i = 0; i < 16; ++i, dst += stride) { - vst1q_u8(dst, dc_dup); + vst1q_u8(dst + 0, dc); } } @@ -151,22 +133,19 @@ void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t ref0 = vld1q_u8(above); const uint8x16_t ref1 = vld1q_u8(left); - const uint16x8_t p0 = vpaddlq_u8(ref0); - const uint16x8_t p1 = vpaddlq_u8(ref1); - const uint16x8_t p2 = vaddq_u16(p0, p1); - uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - uint8x8_t dc; - sum = vpadd_u16(sum, sum); - sum = vpadd_u16(sum, sum); - dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); + const uint16x8_t a = vpaddlq_u8(ref0); + const uint16x8_t l = vpaddlq_u8(ref1); + const uint16x8_t al = vaddq_u16(a, l); + const uint16_t sum = horizontal_add_uint16x8(al); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0); dc_store_16x16(dst, stride, dc); } void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_16(left); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); + const uint16_t sum = dc_sum_16(left); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 4), 0); (void)above; dc_store_16x16(dst, stride, dc); } @@ -174,8 +153,8 @@ void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_16(above); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); + const uint16_t sum = dc_sum_16(above); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 4), 0); (void)left; dc_store_16x16(dst, stride, dc); } @@ -183,7 +162,7 @@ void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t dc = vdup_n_u8(0x80); + const uint8x16_t dc = vdupq_n_u8(0x80); (void)above; (void)left; dc_store_16x16(dst, stride, dc); @@ -192,24 +171,19 @@ void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 32x32 -static INLINE uint16x4_t dc_sum_32(const uint8_t *ref) { +static INLINE uint16_t dc_sum_32(const uint8_t *ref) { const uint8x16_t r0 = vld1q_u8(ref + 0); const uint8x16_t r1 = vld1q_u8(ref + 16); - const uint16x8_t p0 = vpaddlq_u8(r0); - const uint16x8_t p1 = vpaddlq_u8(r1); - const uint16x8_t p2 = vaddq_u16(p0, p1); - uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - sum = vpadd_u16(sum, sum); - return vpadd_u16(sum, sum); + const uint16x8_t r01 = vaddq_u16(vpaddlq_u8(r0), vpaddlq_u8(r1)); + return horizontal_add_uint16x8(r01); } static INLINE void dc_store_32x32(uint8_t *dst, ptrdiff_t stride, - const uint8x8_t dc) { - uint8x16_t dc_dup = vdupq_lane_u8(dc, 0); + const uint8x16_t dc) { int i; for (i = 0; i < 32; ++i, dst += stride) { - vst1q_u8(dst + 0, dc_dup); - vst1q_u8(dst + 16, dc_dup); + vst1q_u8(dst + 0, dc); + vst1q_u8(dst + 16, dc); } } @@ -219,26 +193,19 @@ void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8x16_t a1 = vld1q_u8(above + 16); const uint8x16_t l0 = vld1q_u8(left + 0); const uint8x16_t l1 = 
vld1q_u8(left + 16); - const uint16x8_t pa0 = vpaddlq_u8(a0); - const uint16x8_t pl0 = vpaddlq_u8(l0); - const uint16x8_t pa1 = vpaddlq_u8(a1); - const uint16x8_t pl1 = vpaddlq_u8(l1); - const uint16x8_t pa = vaddq_u16(pa0, pa1); - const uint16x8_t pl = vaddq_u16(pl0, pl1); - const uint16x8_t pal = vaddq_u16(pa, pl); - uint16x4_t sum = vadd_u16(vget_low_u16(pal), vget_high_u16(pal)); - uint8x8_t dc; - sum = vpadd_u16(sum, sum); - sum = vpadd_u16(sum, sum); - dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 6)); + const uint16x8_t a01 = vaddq_u16(vpaddlq_u8(a0), vpaddlq_u8(a1)); + const uint16x8_t l01 = vaddq_u16(vpaddlq_u8(l0), vpaddlq_u8(l1)); + const uint16x8_t al = vaddq_u16(a01, l01); + const uint16_t sum = horizontal_add_uint16x8(al); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 6), 0); dc_store_32x32(dst, stride, dc); } void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_32(left); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); + const uint16_t sum = dc_sum_32(left); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0); (void)above; dc_store_32x32(dst, stride, dc); } @@ -246,8 +213,8 @@ void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_32(above); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); + const uint16_t sum = dc_sum_32(above); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0); (void)left; dc_store_32x32(dst, stride, dc); } @@ -255,7 +222,7 @@ void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t dc = vdup_n_u8(0x80); + const uint8x16_t dc = vdupq_n_u8(0x80); (void)above; (void)left; dc_store_32x32(dst, stride, dc); diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index 8291f07296..1eb3484767 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -16,6 +16,49 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +static INLINE uint16_t horizontal_add_uint8x4(const uint8x8_t a) { +#if defined(__aarch64__) + return vaddlv_u8(a); +#else + const uint16x4_t b = vpaddl_u8(a); + const uint16x4_t c = vpadd_u16(b, b); + return vget_lane_u16(c, 0); +#endif +} + +static INLINE uint16_t horizontal_add_uint8x8(const uint8x8_t a) { +#if defined(__aarch64__) + return vaddlv_u8(a); +#else + const uint16x4_t b = vpaddl_u8(a); + const uint16x4_t c = vpadd_u16(b, b); + const uint16x4_t d = vpadd_u16(c, c); + return vget_lane_u16(d, 0); +#endif +} + +static INLINE uint16_t horizontal_add_uint8x16(const uint8x16_t a) { +#if defined(__aarch64__) + return vaddlvq_u8(a); +#else + const uint16x8_t b = vpaddlq_u8(a); + const uint16x4_t c = vadd_u16(vget_low_u16(b), vget_high_u16(b)); + const uint16x4_t d = vpadd_u16(c, c); + const uint16x4_t e = vpadd_u16(d, d); + return vget_lane_u16(e, 0); +#endif +} + +static INLINE uint16_t horizontal_add_uint16x4(const uint16x4_t a) { +#if defined(__aarch64__) + return vaddv_u16(a); +#else + const uint16x4_t b = vpadd_u16(a, a); + const uint16x4_t c = vpadd_u16(b, b); + return vget_lane_u16(c, 0); +#endif +} + static INLINE int32_t horizontal_add_int16x8(const int16x8_t a) { #if defined(__aarch64__) return vaddlvq_s16(a); From 
a257b4d6be525c50aea1e9f33f791fd4b627e92b Mon Sep 17 00:00:00 2001 From: George Steed Date: Tue, 28 Mar 2023 14:49:37 +0000 Subject: [PATCH 641/926] Avoid vshr and vget_{low,high} in Neon d135 predictor impl The shift instructions have marginally worse performance on some micro-architectures, and the vget_{low,high} instructions are unnecessary. This commit improves performance of the d135 predictors by 1.5% geomean averaged across a range of compilers and micro-architectures. Change-Id: Ied4c3eecc12fc973841696459d868ce403ed4e6c --- vpx_dsp/arm/intrapred_neon.c | 57 +++++++++++------------------------- 1 file changed, 17 insertions(+), 40 deletions(-) diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index d9b4db2eab..4f909e4935 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -866,22 +866,14 @@ void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8x8_t L3210 = vrev64_u8(L0123); const uint8x8_t L3210XA012 = vext_u8(L3210, XA0123, 4); const uint8x8_t L210XA0123 = vext_u8(L3210, XA0123, 5); - const uint8x8_t L10XA0123_ = - vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(L210XA0123), 8)); + const uint8x8_t L10XA0123_ = vext_u8(L210XA0123, L210XA0123, 1); const uint8x8_t avg1 = vhadd_u8(L10XA0123_, L3210XA012); const uint8x8_t avg2 = vrhadd_u8(avg1, L210XA0123); - const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); - const uint32x2_t r3 = vreinterpret_u32_u8(avg2); - const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); - const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); - const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); - vst1_lane_u32((uint32_t *)dst, r0, 0); - dst += stride; - vst1_lane_u32((uint32_t *)dst, r1, 0); - dst += stride; - vst1_lane_u32((uint32_t *)dst, r2, 0); - dst += stride; - vst1_lane_u32((uint32_t *)dst, r3, 0); + + store_u8_4x1(dst + 0 * stride, vext_u8(avg2, avg2, 3)); + store_u8_4x1(dst + 1 * stride, vext_u8(avg2, avg2, 2)); + store_u8_4x1(dst + 2 * stride, vext_u8(avg2, avg2, 1)); + store_u8_4x1(dst + 3 * stride, avg2); } void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, @@ -898,31 +890,15 @@ void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8x16_t L543210XA01234567_ = vcombine_u8(L543210XA0, A1234567_); const uint8x16_t avg = vhaddq_u8(L76543210XA0123456, L543210XA01234567_); const uint8x16_t row = vrhaddq_u8(avg, L6543210XA01234567); - const uint8x8_t row_0 = vget_low_u8(row); - const uint8x8_t row_1 = vget_high_u8(row); - const uint8x8_t r0 = vext_u8(row_0, row_1, 7); - const uint8x8_t r1 = vext_u8(row_0, row_1, 6); - const uint8x8_t r2 = vext_u8(row_0, row_1, 5); - const uint8x8_t r3 = vext_u8(row_0, row_1, 4); - const uint8x8_t r4 = vext_u8(row_0, row_1, 3); - const uint8x8_t r5 = vext_u8(row_0, row_1, 2); - const uint8x8_t r6 = vext_u8(row_0, row_1, 1); - - vst1_u8(dst, r0); - dst += stride; - vst1_u8(dst, r1); - dst += stride; - vst1_u8(dst, r2); - dst += stride; - vst1_u8(dst, r3); - dst += stride; - vst1_u8(dst, r4); - dst += stride; - vst1_u8(dst, r5); - dst += stride; - vst1_u8(dst, r6); - dst += stride; - vst1_u8(dst, row_0); + + vst1_u8(dst + 0 * stride, vget_low_u8(vextq_u8(row, row, 7))); + vst1_u8(dst + 1 * stride, vget_low_u8(vextq_u8(row, row, 6))); + vst1_u8(dst + 2 * stride, vget_low_u8(vextq_u8(row, row, 5))); + vst1_u8(dst + 3 * stride, vget_low_u8(vextq_u8(row, row, 4))); + vst1_u8(dst + 4 * stride, vget_low_u8(vextq_u8(row, row, 3))); + vst1_u8(dst + 5 * stride, 
vget_low_u8(vextq_u8(row, row, 2))); + vst1_u8(dst + 6 * stride, vget_low_u8(vextq_u8(row, row, 1))); + vst1_u8(dst + 7 * stride, vget_low_u8(row)); } static INLINE void d135_store_16x8( @@ -965,6 +941,7 @@ void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8x16_t avg_1 = vhaddq_u8(XA0123456789abcde, A123456789abcdef_); const uint8x16_t row_0 = vrhaddq_u8(avg_0, Ledcba9876543210X); const uint8x16_t row_1 = vrhaddq_u8(avg_1, A0123456789abcdef); + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 15); const uint8x16_t r_1 = vextq_u8(row_0, row_1, 14); const uint8x16_t r_2 = vextq_u8(row_0, row_1, 13); @@ -972,7 +949,7 @@ void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8x16_t r_4 = vextq_u8(row_0, row_1, 11); const uint8x16_t r_5 = vextq_u8(row_0, row_1, 10); const uint8x16_t r_6 = vextq_u8(row_0, row_1, 9); - const uint8x16_t r_7 = vcombine_u8(vget_high_u8(row_0), vget_low_u8(row_1)); + const uint8x16_t r_7 = vextq_u8(row_0, row_1, 8); const uint8x16_t r_8 = vextq_u8(row_0, row_1, 7); const uint8x16_t r_9 = vextq_u8(row_0, row_1, 6); const uint8x16_t r_a = vextq_u8(row_0, row_1, 5); From 1025d37b03247c790723dac7f4084e04fd45f2b3 Mon Sep 17 00:00:00 2001 From: Cherma Rajan A Date: Wed, 8 Mar 2023 17:50:06 +0530 Subject: [PATCH 642/926] Prune single ref modes based on mv difference and mode rate This patch introduces a speed feature to prune single reference modes - NEARMV and ZEROMV based on motion vector difference and mode rate w.r.t previously evaluated single reference modes corresponding to the same reference frame. Instruction Count BD-Rate Loss(%) cpu Resolution Reduction(%) avg.psnr ovr.psnr ssim 0 LOWRES2 1.686 -0.0039 -0.0105 -0.0098 0 MIDRES2 1.026 -0.0234 0.0029 0.0120 0 HDRES2 0.000 0.0000 0.0000 0.0000 0 Average 0.889 -0.0091 -0.0025 0.0007 STATS_CHANGED Change-Id: I387acd3a73d8256904a7ce684b198d251cf3dd04 --- vp9/encoder/vp9_rdopt.c | 71 +++++++++++++++++++++++++++++--- vp9/encoder/vp9_speed_features.c | 9 ++++ vp9/encoder/vp9_speed_features.h | 4 ++ 3 files changed, 79 insertions(+), 5 deletions(-) diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 76d545cd96..a6a7befc11 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -1854,6 +1854,52 @@ static INLINE int skip_iters(const int_mv iter_mvs[][2], int ite, int id) { return 0; } +// Compares motion vector and mode rate of current mode and given mode. +static INLINE int compare_mv_mode_rate(MV this_mv, MV mode_mv, + int this_mode_rate, int mode_rate, + int mv_thresh) { + const int mv_diff = + abs(mode_mv.col - this_mv.col) + abs(mode_mv.row - this_mv.row); + if (mv_diff <= mv_thresh && mode_rate < this_mode_rate) return 1; + return 0; +} + +// Skips single reference inter modes NEARMV and ZEROMV based on motion vector +// difference and mode rate. +static INLINE int skip_single_mode_based_on_mode_rate( + int_mv (*mode_mv)[MAX_REF_FRAMES], int *single_mode_rate, int this_mode, + int ref0, int this_mode_rate, int best_mode_index) { + MV this_mv = mode_mv[this_mode][ref0].as_mv; + const int mv_thresh = 3; + + // Pruning is not applicable for NEARESTMV or NEWMV modes. + if (this_mode == NEARESTMV || this_mode == NEWMV) return 0; + // Pruning is not done when reference frame of the mode is same as best + // reference so far. 
+ if (best_mode_index > 0 && + ref0 == vp9_mode_order[best_mode_index].ref_frame[0]) + return 0; + + // Check absolute mv difference and mode rate of current mode w.r.t NEARESTMV + if (compare_mv_mode_rate( + this_mv, mode_mv[NEARESTMV][ref0].as_mv, this_mode_rate, + single_mode_rate[INTER_OFFSET(NEARESTMV)], mv_thresh)) + return 1; + + // Check absolute mv difference and mode rate of current mode w.r.t NEWMV + if (compare_mv_mode_rate(this_mv, mode_mv[NEWMV][ref0].as_mv, this_mode_rate, + single_mode_rate[INTER_OFFSET(NEWMV)], mv_thresh)) + return 1; + + // Pruning w.r.t NEARMV is applicable only for ZEROMV mode + if (this_mode == NEARMV) return 0; + // Check absolute mv difference and mode rate of current mode w.r.t NEARMV + if (compare_mv_mode_rate(this_mv, mode_mv[NEARMV][ref0].as_mv, this_mode_rate, + single_mode_rate[INTER_OFFSET(NEARMV)], mv_thresh)) + return 1; + return 0; +} + #define NUM_ITERS 4 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *frame_mv, int mi_row, int mi_col, @@ -2756,8 +2802,9 @@ static int64_t handle_inter_mode( struct buf_2d *recon, int *disable_skip, int_mv (*mode_mv)[MAX_REF_FRAMES], int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], INTERP_FILTER (*single_filter)[MAX_REF_FRAMES], - int (*single_skippable)[MAX_REF_FRAMES], int64_t *psse, - const int64_t ref_best_rd, int64_t *mask_filter, int64_t filter_cache[]) { + int (*single_skippable)[MAX_REF_FRAMES], int *single_mode_rate, + int64_t *psse, const int64_t ref_best_rd, int64_t *mask_filter, + int64_t filter_cache[], int best_mode_index) { VP9_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MODE_INFO *mi = xd->mi[0]; @@ -2914,6 +2961,15 @@ static int64_t handle_inter_mode( *rate2 += cost_mv_ref(cpi, this_mode, mbmi_ext->mode_context[refs[0]]); } + if (!is_comp_pred && cpi->sf.prune_single_mode_based_on_mv_diff_mode_rate) { + single_mode_rate[INTER_OFFSET(this_mode)] = *rate2; + // Prune NEARMV and ZEROMV modes based on motion vector difference and mode + // rate. 
+ if (skip_single_mode_based_on_mode_rate(mode_mv, single_mode_rate, + this_mode, refs[0], *rate2, + best_mode_index)) + return INT64_MAX; + } if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd && mi->mode != NEARESTMV) return INT64_MAX; @@ -3380,6 +3436,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } }; INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES]; int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES]; + int single_mode_rate[MAX_REF_FRAMES][INTER_MODES]; int64_t best_rd = best_rd_so_far; int64_t best_pred_diff[REFERENCE_MODES]; int64_t best_pred_rd[REFERENCE_MODES]; @@ -3578,6 +3635,10 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, second_ref_frame = vp9_mode_order[mode_index].ref_frame[1]; vp9_zero(x->sum_y_eobs); + comp_pred = second_ref_frame > INTRA_FRAME; + if (!comp_pred && ref_frame != INTRA_FRAME && + sf->prune_single_mode_based_on_mv_diff_mode_rate) + single_mode_rate[ref_frame][INTER_OFFSET(this_mode)] = INT_MAX; if (is_rect_partition) { if (ctx->skip_ref_frame_mask & (1 << ref_frame)) continue; @@ -3663,7 +3724,6 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, if (this_mode == NEARMV || this_mode == ZEROMV) continue; } - comp_pred = second_ref_frame > INTRA_FRAME; if (comp_pred) { if (!cpi->allow_comp_inter_inter) continue; @@ -3783,8 +3843,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, this_rd = handle_inter_mode( cpi, x, bsize, &rate2, &distortion2, &skippable, &rate_y, &rate_uv, recon, &disable_skip, frame_mv, mi_row, mi_col, single_newmv, - single_inter_filter, single_skippable, &total_sse, best_rd, - &mask_filter, filter_cache); + single_inter_filter, single_skippable, + &single_mode_rate[ref_frame][0], &total_sse, best_rd, &mask_filter, + filter_cache, best_mode_index); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, handle_inter_mode_time); #endif diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 3e121b799f..0522d4ec97 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -70,6 +70,7 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, const int is_720p_or_larger = min_frame_size >= 720; const int is_1080p_or_larger = min_frame_size >= 1080; const int is_2160p_or_larger = min_frame_size >= 2160; + const int boosted = frame_is_boosted(cpi); // speed 0 features sf->partition_search_breakout_thr.dist = (1 << 20); @@ -102,6 +103,13 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, } } + if (!is_720p_or_larger) { + if (is_480p_or_larger) + sf->prune_single_mode_based_on_mv_diff_mode_rate = boosted ? 
0 : 1; + else + sf->prune_single_mode_based_on_mv_diff_mode_rate = 1; + } + if (speed >= 1) { sf->rd_ml_partition.search_early_termination = 0; sf->rd_ml_partition.search_breakout = 1; @@ -926,6 +934,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) { sf->enhanced_full_pixel_motion_search = 1; sf->adaptive_pred_interp_filter = 0; sf->adaptive_mode_search = 0; + sf->prune_single_mode_based_on_mv_diff_mode_rate = 0; sf->cb_pred_filter_search = 0; sf->early_term_interp_search_plane_rd = 0; sf->cb_partition_search = 0; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index d32bf09e4e..e267e55c41 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -417,6 +417,10 @@ typedef struct SPEED_FEATURES { // Adaptive prediction mode search int adaptive_mode_search; + // Prune NEAREST and ZEROMV single reference modes based on motion vector + // difference and mode rate + int prune_single_mode_based_on_mv_diff_mode_rate; + // Chessboard pattern prediction for interp filter. Aggressiveness increases // with levels. // 0: disable From e2465dfc2515e0872524b627a647d1613dfeae13 Mon Sep 17 00:00:00 2001 From: Anupam Pandey Date: Tue, 28 Mar 2023 14:48:46 +0530 Subject: [PATCH 643/926] Add AVX2 intrinsic for variance function for block width 8 Added AVX2 intrinsic optimization for the following functions 1. vpx_variance8x4 2. vpx_variance8x8 3. vpx_variance8x16 This is a bit-exact change. Instruction Count cpu Resolution Reduction(%) 0 LOWRES2 0.698 0 MIDRES2 0.577 0 HDRES2 0.469 0 Average 0.582 Change-Id: Iae8fdf9344fd012cda4955ed140633141d60ba86 --- test/variance_test.cc | 5 ++- vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +-- vpx_dsp/x86/variance_avx2.c | 80 ++++++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+), 4 deletions(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index 1359bc4baf..df9a1c56f6 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1429,7 +1429,10 @@ INSTANTIATE_TEST_SUITE_P( VarianceParams(5, 4, &vpx_variance32x16_avx2), VarianceParams(4, 5, &vpx_variance16x32_avx2), VarianceParams(4, 4, &vpx_variance16x16_avx2), - VarianceParams(4, 3, &vpx_variance16x8_avx2))); + VarianceParams(4, 3, &vpx_variance16x8_avx2), + VarianceParams(3, 4, &vpx_variance8x16_avx2), + VarianceParams(3, 3, &vpx_variance8x8_avx2), + VarianceParams(3, 2, &vpx_variance8x4_avx2))); INSTANTIATE_TEST_SUITE_P( AVX2, VpxSubpelVarianceTest, diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 49bc9a6309..d63be5fb8f 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1123,13 +1123,13 @@ () specialize qw/vpx_variance16x8 sse2 avx2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x16 sse2 neon msa mmi vsx/; + specialize qw/vpx_variance8x16 sse2 avx2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x8 sse2 neon msa mmi vsx lsx/; + specialize qw/vpx_variance8x8 sse2 avx2 neon msa mmi vsx lsx/; add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x4 sse2 neon msa mmi vsx/; + specialize qw/vpx_variance8x4 sse2 avx2 neon msa mmi vsx/; add_proto 
qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_variance4x8 sse2 neon msa mmi vsx/; diff --git a/vpx_dsp/x86/variance_avx2.c b/vpx_dsp/x86/variance_avx2.c index 35925d5908..8305b9f20f 100644 --- a/vpx_dsp/x86/variance_avx2.c +++ b/vpx_dsp/x86/variance_avx2.c @@ -98,6 +98,41 @@ static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) { return _mm256_add_epi32(sum_lo, sum_hi); } +static INLINE void variance8_kernel_avx2( + const uint8_t *const src, const int src_stride, const uint8_t *const ref, + const int ref_stride, __m256i *const sse, __m256i *const sum) { + __m128i src0, src1, ref0, ref1; + __m256i ss, rr, diff; + + // 0 0 0.... 0 s07 s06 s05 s04 s03 s02 s01 s00 + src0 = _mm_loadl_epi64((const __m128i *)(src + 0 * src_stride)); + + // 0 0 0.... 0 s17 s16 s15 s14 s13 s12 s11 s10 + src1 = _mm_loadl_epi64((const __m128i *)(src + 1 * src_stride)); + + // s17 s16...s11 s10 s07 s06...s01 s00 (8bit) + src0 = _mm_unpacklo_epi64(src0, src1); + + // s17 s16...s11 s10 s07 s06...s01 s00 (16 bit) + ss = _mm256_cvtepu8_epi16(src0); + + // 0 0 0.... 0 r07 r06 r05 r04 r03 r02 r01 r00 + ref0 = _mm_loadl_epi64((const __m128i *)(ref + 0 * ref_stride)); + + // 0 0 0.... 0 r17 r16 r15 r14 r13 r12 r11 r10 + ref1 = _mm_loadl_epi64((const __m128i *)(ref + 1 * ref_stride)); + + // r17 r16...r11 r10 r07 r06...r01 r00 (8 bit) + ref0 = _mm_unpacklo_epi64(ref0, ref1); + + // r17 r16...r11 r10 r07 r06...r01 r00 (16 bit) + rr = _mm256_cvtepu8_epi16(ref0); + + diff = _mm256_sub_epi16(ss, rr); + *sse = _mm256_add_epi32(*sse, _mm256_madd_epi16(diff, diff)); + *sum = _mm256_add_epi16(*sum, diff); +} + static INLINE void variance16_kernel_avx2( const uint8_t *const src, const int src_stride, const uint8_t *const ref, const int ref_stride, __m256i *const sse, __m256i *const sum) { @@ -119,6 +154,21 @@ static INLINE void variance32_kernel_avx2(const uint8_t *const src, variance_kernel_avx2(s, r, sse, sum); } +static INLINE void variance8_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + int i; + *vsum = _mm256_setzero_si256(); + *vsse = _mm256_setzero_si256(); + + for (i = 0; i < h; i += 2) { + variance8_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum); + src += 2 * src_stride; + ref += 2 * ref_stride; + } +} + static INLINE void variance16_avx2(const uint8_t *src, const int src_stride, const uint8_t *ref, const int ref_stride, const int h, __m256i *const vsse, @@ -612,6 +662,36 @@ typedef void (*get_var_avx2)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m256i vsse, vsum; + int sum; + variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - ((sum * sum) >> 5); +} + +unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m256i vsse, vsum; + int sum; + variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - ((sum * sum) >> 6); +} + +unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, + const
uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m256i vsse, vsum; + int sum; + variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - ((sum * sum) >> 7); +} + unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { From a5801b00a8e3e440392c6bbc31754ba3c206ff41 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Tue, 4 Apr 2023 14:52:52 +0100 Subject: [PATCH 644/926] Optimize 4D Neon reduction for 4xh and 8xh SAD4D blocks Add a 4D reduction function operating on uint16x8_t vectors and use it to optimize the final reduction in standard bitdepth 4xh and 8xh SAD4D computations. Similar 4D reduction optimizations have already been implemented for all other standard bitdepth block sizes, and all high bitdepth block sizes.[1] [1] https://chromium-review.googlesource.com/c/webm/libvpx/+/4224681 Change-Id: I0aa0b6e0f70449776f316879cafc4b830e86ea51 --- vpx_dsp/arm/sad4d_neon.c | 10 ++-------- vpx_dsp/arm/sum_neon.h | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 9509573939..ab00e0e3a2 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -270,10 +270,7 @@ static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride, i++; } while (i < h); - res[0] = horizontal_add_uint16x8(sum[0]); - res[1] = horizontal_add_uint16x8(sum[1]); - res[2] = horizontal_add_uint16x8(sum[2]); - res[3] = horizontal_add_uint16x8(sum[3]); + vst1q_u32(res, horizontal_add_4d_uint16x8(sum)); } static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride, @@ -298,10 +295,7 @@ static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride, i += 2; } while (i < h); - res[0] = horizontal_add_uint16x8(sum[0]); - res[1] = horizontal_add_uint16x8(sum[1]); - res[2] = horizontal_add_uint16x8(sum[2]); - res[3] = horizontal_add_uint16x8(sum[3]); + vst1q_u32(res, horizontal_add_4d_uint16x8(sum)); } #define SAD_WXH_4D_NEON(w, h) \ diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index 1eb3484767..6259add4a4 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -83,6 +83,23 @@ static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) { #endif } +static INLINE uint32x4_t horizontal_add_4d_uint16x8(const uint16x8_t sum[4]) { +#if defined(__aarch64__) + const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); + const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); + const uint16x8_t b0 = vpaddq_u16(a0, a1); + return vpaddlq_u16(b0); +#else + const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); + const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); + const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); + const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3])); + const uint16x4_t b0 = vpadd_u16(a0, a1); + const uint16x4_t b1 = vpadd_u16(a2, a3); + return vpaddlq_u16(vcombine_u16(b0, b1)); +#endif +} + static INLINE uint32_t horizontal_long_add_uint16x8(const uint16x8_t vec_lo, const uint16x8_t vec_hi) { #if defined(__aarch64__) From ff8a9658568061e12e556d1f41754b94a8c30498 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 6 Apr 2023 16:14:51 +0100 Subject: [PATCH 645/926] Optimize Armv8.0 Neon SAD4D 16xh, 32xh, and 64xh functions Add a widening 4D reduction function operating on uint16x8_t vectors and use 
it to optimize the final reduction in Armv8.0 Neon standard bitdepth 16xh, 32xh and 64xh SAD4D computations. Also simplify the Armv8.0 Neon version of the sad64xhx4d_neon helper function since VP9 block sizes are not large enough to require widening to 32-bit accumulators before the final reduction. Change-Id: I32b0a283d7688d8cdf21791add9476ed24c66a28 --- vpx_dsp/arm/sad4d_neon.c | 88 ++++++++++++++++------------------ vpx_dsp/arm/sum_neon.h | 25 ++++++++++++ 2 files changed, 61 insertions(+), 52 deletions(-) diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index ab00e0e3a2..6ad6c96214 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -140,53 +140,43 @@ static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref, static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4], int h) { - int h_tmp = h > 64 ? 64 : h; - int i = 0; - vst1q_u32(res, vdupq_n_u32(0)); + uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + int i = 0; do { - uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; - uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; - - do { - uint8x16_t s0, s1, s2, s3; - - s0 = vld1q_u8(src + i * src_stride); - sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); - sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); - sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); - sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); - - s1 = vld1q_u8(src + i * src_stride + 16); - sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); - sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); - sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); - sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); - - s2 = vld1q_u8(src + i * src_stride + 32); - sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]); - sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]); - sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]); - sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]); - - s3 = vld1q_u8(src + i * src_stride + 48); - sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]); - sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]); - sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]); - sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]); - - i++; - } while (i < h_tmp); - - res[0] += horizontal_long_add_uint16x8(sum_lo[0], sum_hi[0]); - res[1] += horizontal_long_add_uint16x8(sum_lo[1], sum_hi[1]); - res[2] += horizontal_long_add_uint16x8(sum_lo[2], sum_hi[2]); - res[3] += horizontal_long_add_uint16x8(sum_lo[3], sum_hi[3]); - - h_tmp += 64; + uint8x16_t s0, s1, s2, s3; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1,
vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + s2 = vld1q_u8(src + i * src_stride + 32); + sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]); + sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]); + sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]); + sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]); + + s3 = vld1q_u8(src + i * src_stride + 48); + sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]); + sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]); + sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]); + sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]); + + i++; } while (i < h); + + vst1q_u32(res, horizontal_long_add_4d_uint16x8(sum_lo, sum_hi)); } static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride, @@ -216,10 +206,7 @@ static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride, i++; } while (i < h); - res[0] = horizontal_long_add_uint16x8(sum_lo[0], sum_hi[0]); - res[1] = horizontal_long_add_uint16x8(sum_lo[1], sum_hi[1]); - res[2] = horizontal_long_add_uint16x8(sum_lo[2], sum_hi[2]); - res[3] = horizontal_long_add_uint16x8(sum_lo[3], sum_hi[3]); + vst1q_u32(res, horizontal_long_add_4d_uint16x8(sum_lo, sum_hi)); } static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride, @@ -239,10 +226,7 @@ static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride, i++; } while (i < h); - res[0] = horizontal_add_uint16x8(sum[0]); - res[1] = horizontal_add_uint16x8(sum[1]); - res[2] = horizontal_add_uint16x8(sum[2]); - res[3] = horizontal_add_uint16x8(sum[3]); + vst1q_u32(res, horizontal_add_4d_uint16x8(sum)); } #endif // defined(__ARM_FEATURE_DOTPROD) diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index 6259add4a4..a0c72f92ce 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -117,6 +117,31 @@ static INLINE uint32_t horizontal_long_add_uint16x8(const uint16x8_t vec_lo, #endif } +static INLINE uint32x4_t horizontal_long_add_4d_uint16x8( + const uint16x8_t sum_lo[4], const uint16x8_t sum_hi[4]) { + const uint32x4_t a0 = vpaddlq_u16(sum_lo[0]); + const uint32x4_t a1 = vpaddlq_u16(sum_lo[1]); + const uint32x4_t a2 = vpaddlq_u16(sum_lo[2]); + const uint32x4_t a3 = vpaddlq_u16(sum_lo[3]); + const uint32x4_t b0 = vpadalq_u16(a0, sum_hi[0]); + const uint32x4_t b1 = vpadalq_u16(a1, sum_hi[1]); + const uint32x4_t b2 = vpadalq_u16(a2, sum_hi[2]); + const uint32x4_t b3 = vpadalq_u16(a3, sum_hi[3]); +#if defined(__aarch64__) + const uint32x4_t c0 = vpaddq_u32(b0, b1); + const uint32x4_t c1 = vpaddq_u32(b2, b3); + return vpaddq_u32(c0, c1); +#else + const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0)); + const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1)); + const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2)); + const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3)); + const uint32x2_t d0 = vpadd_u32(c0, c1); + const uint32x2_t d1 = vpadd_u32(c2, c3); + return vcombine_u32(d0, d1); +#endif +} + static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) { #if defined(__aarch64__) return vaddv_s32(a); From 868674d3300478865c7ed4ca40e6e28e249f1f11 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 6 Apr 2023 12:57:23 -0700 Subject: [PATCH 646/926] vpx_subpixel_8t_intrin_avx2: clear -Wshadow warning Bug: webm:1793 Change-Id: 
Icba4ad242dcd0cad736b9a203829361c5bd1ca3f --- vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 3b5ff04ee9..9ff67bd301 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -240,7 +240,7 @@ static void vpx_filter_block1d8_h8_avx2( // For the remaining height. if (y > 0) { - const __m128i srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + const __m128i src_reg_128 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); f[0] = _mm256_castsi256_si128(f1[0]); f[1] = _mm256_castsi256_si128(f1[1]); @@ -248,10 +248,10 @@ static void vpx_filter_block1d8_h8_avx2( f[3] = _mm256_castsi256_si128(f1[3]); // filter the source buffer - s[0] = _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0])); - s[1] = _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1])); - s[2] = _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2])); - s[3] = _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3])); + s[0] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[0])); + s[1] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[1])); + s[2] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[2])); + s[3] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[3])); s[0] = convolve8_8_ssse3(s, f); // Saturate 16bit value to 8bit. From bebc860915188e2fda3983c7d8504371beb2f518 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 6 Apr 2023 13:00:07 -0700 Subject: [PATCH 647/926] vp9_encoder: clear -Wshadow warning Bug: webm:1793 Change-Id: Id390c61f82b9f15063d0310a2c252b02b479d9c5 --- vp9/encoder/vp9_encoder.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 4cec02eb93..72a6189d13 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2465,11 +2465,11 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, cpi->svc.number_temporal_layers > 1) { FIRSTPASS_STATS *const stats = oxcf->two_pass_stats_in.buf; FIRSTPASS_STATS *stats_copy[VPX_SS_MAX_LAYERS] = { 0 }; - int i; + int n; - for (i = 0; i < oxcf->ss_number_layers; ++i) { + for (n = 0; n < oxcf->ss_number_layers; ++n) { FIRSTPASS_STATS *const last_packet_for_layer = - &stats[packets - oxcf->ss_number_layers + i]; + &stats[packets - oxcf->ss_number_layers + n]; const int layer_id = (int)last_packet_for_layer->spatial_layer_id; const int packets_in_layer = (int)last_packet_for_layer->count + 1; if (layer_id >= 0 && layer_id < oxcf->ss_number_layers) { @@ -2494,11 +2494,11 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, } } - for (i = 0; i < packets; ++i) { - const int layer_id = (int)stats[i].spatial_layer_id; + for (n = 0; n < packets; ++n) { + const int layer_id = (int)stats[n].spatial_layer_id; if (layer_id >= 0 && layer_id < oxcf->ss_number_layers && stats_copy[layer_id] != NULL) { - *stats_copy[layer_id] = stats[i]; + *stats_copy[layer_id] = stats[n]; ++stats_copy[layer_id]; } } From 12ab4af3aefb320996ace24083fd26e857f4c533 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 6 Apr 2023 13:00:47 -0700 Subject: [PATCH 648/926] vp9_dx_iface: clear -Wshadow warnings Bug: webm:1793 Change-Id: Ice6cd08f145e5813e24345d03e0913e5eda5289f --- vp9/vp9_dx_iface.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index 
bdfe217936..20e71cc227 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -348,7 +348,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, // Initialize the decoder on the first frame. if (ctx->pbi == NULL) { - const vpx_codec_err_t res = init_decoder(ctx); + res = init_decoder(ctx); if (res != VPX_CODEC_OK) return res; } @@ -367,7 +367,6 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, for (i = 0; i < frame_count; ++i) { const uint8_t *data_start_copy = data_start; const uint32_t frame_size = frame_sizes[i]; - vpx_codec_err_t res; if (data_start < data || frame_size > (uint32_t)(data_end - data_start)) { set_error_detail(ctx, "Invalid frame size in index"); return VPX_CODEC_CORRUPT_FRAME; @@ -382,8 +381,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, const uint8_t *const data_end = data + data_sz; while (data_start < data_end) { const uint32_t frame_size = (uint32_t)(data_end - data_start); - const vpx_codec_err_t res = - decode_one(ctx, &data_start, frame_size, user_priv, deadline); + res = decode_one(ctx, &data_start, frame_size, user_priv, deadline); if (res != VPX_CODEC_OK) return res; // Account for suboptimal termination by the encoder. From 61709a177aa8ef60dbc52e4409beb0d486095d55 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 10 Apr 2023 13:29:02 -0700 Subject: [PATCH 649/926] vp9_quantize_avx2,highbd_get_max_lane_eob: fix mask Pack nz_mask with zero. After the result is permuted this has the effect of ignoring the upper half of the iscan register which is only loaded with 128-bits. Depending on the optimization level and the load used the upper half of the ymm register may contain undefined values which can produce an incorrect eob. If this is large enough it can cause a crash. Bug: chromium:1431729 Change-Id: I4ebae9fa39f228bdd29dcc19935f3f07759d75f5 --- vp9/encoder/x86/vp9_quantize_avx2.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c index da285be8e7..e6aa71d58a 100644 --- a/vp9/encoder/x86/vp9_quantize_avx2.c +++ b/vp9/encoder/x86/vp9_quantize_avx2.c @@ -295,7 +295,8 @@ static VPX_FORCE_INLINE void highbd_load_fp_values( static VPX_FORCE_INLINE __m256i highbd_get_max_lane_eob( const int16_t *iscan_ptr, __m256i eobmax, __m256i nz_mask) { - const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask); + const __m256i packed_nz_mask = + _mm256_packs_epi32(nz_mask, _mm256_setzero_si256()); const __m256i packed_nz_mask_perm = _mm256_permute4x64_epi64(packed_nz_mask, 0xD8); const __m256i iscan = From 987ed6937bf27ec5a4e4cb136aa653104adfb068 Mon Sep 17 00:00:00 2001 From: Deepa K G Date: Mon, 3 Apr 2023 23:21:56 +0530 Subject: [PATCH 650/926] Avoid redundant start MV SAD calculation Avoided repeated calculation of start MV SAD during full pixel motion search. 
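In outline: the SAD at the starting MV used to be re-evaluated inside every vp9_diamond_search_sad() call, including each pass of the step-param refinement loop; it is now computed once by the caller and threaded through as start_mv_sad. A condensed caller-side sketch, following the full_pixel_diamond() change in the diff below (surrounding declarations omitted):

    const MV ref_mv_full = { ref_mv->row >> 3, ref_mv->col >> 3 };
    clamp_mv(mvp_full, x->mv_limits.col_min, x->mv_limits.col_max,
             x->mv_limits.row_min, x->mv_limits.row_max);
    start_mv_sad =
        get_start_mv_sad(x, mvp_full, &ref_mv_full, fn_ptr->sdf, sadpb);
    /* Both the initial call and every refinement call reuse this value. */
    bestsme =
        cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad,
                                &temp_mv, step_param, sadpb, &n, fn_ptr,
                                ref_mv);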
Instruction Count cpu Resolution Reduction(%) 0 LOWRES2 0.162 0 MIDRES2 0.246 0 HDRES2 0.325 0 Average 0.245 Change-Id: I2b4786901f254ce32ee8ca8a3d56f1c9f112f1d4 --- vp9/common/vp9_rtcd_defs.pl | 2 +- .../arm/neon/vp9_diamond_search_sad_neon.c | 40 +++------------- vp9/encoder/vp9_encoder.h | 34 +++++++++++++ vp9/encoder/vp9_firstpass.c | 18 +++++-- vp9/encoder/vp9_mcomp.c | 48 +++++++------------ vp9/encoder/vp9_mcomp.h | 6 +-- vp9/encoder/x86/vp9_diamond_search_sad_avx.c | 40 +++------------- 7 files changed, 82 insertions(+), 106 deletions(-) diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 7f77a36d6e..5e60792556 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -171,7 +171,7 @@ () # # Motion search # -add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; +add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; specialize qw/vp9_diamond_search_sad avx neon/; # diff --git a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c index 997775a668..15334b413b 100644 --- a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c +++ b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c @@ -30,30 +30,6 @@ static INLINE int_mv pack_int_mv(int16_t row, int16_t col) { return result; } -static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) { - // This is simplified from the C implementation to utilise that - // x->nmvjointsadcost[1] == x->nmvjointsadcost[2] and - // x->nmvjointsadcost[1] == x->nmvjointsadcost[3] - return mv.as_int == 0 ? 
0 : 1; -} - -static INLINE int mv_cost(const int_mv mv, const int *joint_cost, - int *const comp_cost[2]) { - assert(mv.as_mv.row >= -MV_MAX && mv.as_mv.row < MV_MAX); - assert(mv.as_mv.col >= -MV_MAX && mv.as_mv.col < MV_MAX); - return joint_cost[get_mv_joint(mv)] + comp_cost[0][mv.as_mv.row] + - comp_cost[1][mv.as_mv.col]; -} - -static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, - int sad_per_bit) { - const int_mv diff = - pack_int_mv(mv.as_mv.row - ref->row, mv.as_mv.col - ref->col); - return ROUND_POWER_OF_TWO( - (unsigned)mv_cost(diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, - VP9_PROB_COST_SHIFT); -} - /***************************************************************************** * This function utilizes 3 properties of the cost function lookup tables, * * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in * @@ -71,8 +47,9 @@ static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, *****************************************************************************/ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, - MV *best_mv, int search_param, int sad_per_bit, - int *num00, const vp9_variance_fn_ptr_t *fn_ptr, + uint32_t start_mv_sad, MV *best_mv, + int search_param, int sad_per_bit, int *num00, + const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv) { static const uint32_t data[4] = { 0, 1, 2, 3 }; const uint32x4_t v_idx_d = vld1q_u32((const uint32_t *)data); @@ -101,8 +78,8 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); const int16x8_t vfcmv = vreinterpretq_s16_s32(vdupq_n_s32(fcenter_mv.as_int)); - const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); - const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); + const int ref_row = ref_mv->row; + const int ref_col = ref_mv->col; int_mv bmv = pack_int_mv(ref_row, ref_col); int_mv new_bmv = bmv; @@ -122,7 +99,8 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, #else int32x4_t v_ba_d = vdupq_n_s32((intptr_t)best_address); #endif - unsigned int best_sad = INT_MAX; + // Starting position + unsigned int best_sad = start_mv_sad; int i, j, step; // Check the prerequisite cost function properties that are easy to check @@ -131,10 +109,6 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); - // Check the starting position - best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride); - best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit); - *num00 = 0; for (i = 0, step = 0; step < tot_steps; step++) { diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 442ef1899c..9e5e64629e 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1479,6 +1479,40 @@ static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) { } } +static INLINE int mv_cost(const MV *mv, const int *joint_cost, + int *const comp_cost[2]) { + assert(mv->row >= -MV_MAX && mv->row < MV_MAX); + assert(mv->col >= -MV_MAX && mv->col < MV_MAX); + return joint_cost[vp9_get_mv_joint(mv)] + comp_cost[0][mv->row] + + comp_cost[1][mv->col]; +} + +static INLINE int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, + const MV *ref, int sad_per_bit) { + MV diff; + diff.row = mv->row - ref->row; + diff.col = mv->col - ref->col; + return ROUND_POWER_OF_TWO( + 
(unsigned)mv_cost(&diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, + VP9_PROB_COST_SHIFT); +} + +static INLINE uint32_t get_start_mv_sad(const MACROBLOCK *x, const MV *mvp_full, + const MV *ref_mv_full, + vpx_sad_fn_t sad_fn_ptr, int sadpb) { + const int src_buf_stride = x->plane[0].src.stride; + const uint8_t *const src_buf = x->plane[0].src.buf; + const MACROBLOCKD *const xd = &x->e_mbd; + const int pred_buf_stride = xd->plane[0].pre[0].stride; + const uint8_t *const pred_buf = + xd->plane[0].pre[0].buf + mvp_full->row * pred_buf_stride + mvp_full->col; + uint32_t start_mv_sad = + sad_fn_ptr(src_buf, src_buf_stride, pred_buf, pred_buf_stride); + start_mv_sad += mvsad_err_cost(x, mvp_full, ref_mv_full, sadpb); + + return start_mv_sad; +} + static INLINE int num_4x4_to_edge(int plane_4x4_dim, int mb_to_edge_dim, int subsampling_dim, int blk_dim) { return plane_4x4_dim + (mb_to_edge_dim >> (5 + subsampling_dim)) - blk_dim; diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 08b68c93ee..0efa836aca 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -435,6 +435,8 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize = xd->mi[0]->sb_type; vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize]; const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY; + MV center_mv_full = ref_mv_full; + unsigned int start_mv_sad; int step_param = 3; int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; @@ -455,9 +457,15 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } #endif // CONFIG_VP9_HIGHBITDEPTH + // Calculate SAD of the start mv + clamp_mv(&ref_mv_full, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + start_mv_sad = get_start_mv_sad(x, &ref_mv_full, ¢er_mv_full, + cpi->fn_ptr[bsize].sdf, x->sadperbit16); + // Center the initial step/diamond search on best mv. 
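+  // Note: start_mv_sad computed above is reused unchanged by both the
+  // initial diamond search and the num00 refinement loop below, so the
+  // starting-position cost is evaluated only once per block.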
- tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv, - step_param, x->sadperbit16, &num00, + tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad, + &tmp_mv, step_param, x->sadperbit16, &num00, &v_fn_ptr, ref_mv); if (tmp_err < INT_MAX) tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); @@ -478,9 +486,9 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, if (num00) { --num00; } else { - tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv, - step_param + n, x->sadperbit16, &num00, - &v_fn_ptr, ref_mv); + tmp_err = cpi->diamond_search_sad( + x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad, &tmp_mv, step_param + n, + x->sadperbit16, &num00, &v_fn_ptr, ref_mv); if (tmp_err < INT_MAX) tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); if (tmp_err < INT_MAX - new_mv_mode_penalty) diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 207eb43949..4ff685b242 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -77,14 +77,6 @@ int vp9_init_search_range(int size) { return sr; } -static INLINE int mv_cost(const MV *mv, const int *joint_cost, - int *const comp_cost[2]) { - assert(mv->row >= -MV_MAX && mv->row < MV_MAX); - assert(mv->col >= -MV_MAX && mv->col < MV_MAX); - return joint_cost[vp9_get_mv_joint(mv)] + comp_cost[0][mv->row] + - comp_cost[1][mv->col]; -} - int vp9_mv_bit_cost(const MV *mv, const MV *ref, const int *mvjcost, int *mvcost[2], int weight) { const MV diff = { mv->row - ref->row, mv->col - ref->col }; @@ -103,15 +95,6 @@ static int mv_err_cost(const MV *mv, const MV *ref, const int *mvjcost, } return 0; } - -static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref, - int sad_per_bit) { - const MV diff = { mv->row - ref->row, mv->col - ref->col }; - return ROUND_POWER_OF_TWO( - (unsigned)mv_cost(&diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, - VP9_PROB_COST_SHIFT); -} - void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride) { int len; int ss_count = 0; @@ -2070,8 +2053,8 @@ int vp9_prepare_nb_full_mvs(const MotionField *motion_field, int mi_row, #endif // CONFIG_NON_GREEDY_MV int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, - MV *ref_mv, MV *best_mv, int search_param, - int sad_per_bit, int *num00, + MV *ref_mv, uint32_t start_mv_sad, MV *best_mv, + int search_param, int sad_per_bit, int *num00, const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv) { int i, j, step; @@ -2083,7 +2066,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, const int in_what_stride = xd->plane[0].pre[0].stride; const uint8_t *best_address; - unsigned int bestsad = INT_MAX; + unsigned int bestsad = start_mv_sad; int best_site = -1; int last_site = -1; @@ -2101,8 +2084,6 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, const int tot_steps = cfg->total_steps - search_param; const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max, - x->mv_limits.row_min, x->mv_limits.row_max); ref_row = ref_mv->row; ref_col = ref_mv->col; *num00 = 0; @@ -2113,10 +2094,6 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, in_what = xd->plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; best_address = in_what; - // Check the starting position - bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride) + - mvsad_err_cost(x, 
best_mv, &fcenter_mv, sad_per_bit); - i = 0; for (step = 0; step < tot_steps; step++) { @@ -2514,8 +2491,17 @@ static int full_pixel_diamond(const VP9_COMP *const cpi, const MV *ref_mv, MV *dst_mv) { MV temp_mv; int thissme, n, num00 = 0; - int bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv, - step_param, sadpb, &n, fn_ptr, ref_mv); + int bestsme; + unsigned int start_mv_sad; + const MV ref_mv_full = { ref_mv->row >> 3, ref_mv->col >> 3 }; + clamp_mv(mvp_full, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + start_mv_sad = + get_start_mv_sad(x, mvp_full, &ref_mv_full, fn_ptr->sdf, sadpb); + + bestsme = + cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad, &temp_mv, + step_param, sadpb, &n, fn_ptr, ref_mv); if (bestsme < INT_MAX) bestsme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); *dst_mv = temp_mv; @@ -2530,9 +2516,9 @@ static int full_pixel_diamond(const VP9_COMP *const cpi, if (num00) { num00--; } else { - thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv, - step_param + n, sadpb, &num00, fn_ptr, - ref_mv); + thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad, + &temp_mv, step_param + n, sadpb, &num00, + fn_ptr, ref_mv); if (thissme < INT_MAX) thissme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index bdaf2ce77d..62a7a047d4 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -94,9 +94,9 @@ extern fractional_mv_step_fp vp9_return_max_sub_pixel_mv; extern fractional_mv_step_fp vp9_return_min_sub_pixel_mv; typedef int (*vp9_diamond_search_fn_t)( - const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, MV *best_mv, - int search_param, int sad_per_bit, int *num00, - const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv); + const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, + uint32_t start_mv_sad, MV *best_mv, int search_param, int sad_per_bit, + int *num00, const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv); int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c index 0e04a2f41f..719ab40f90 100644 --- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c +++ b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c @@ -32,29 +32,6 @@ static INLINE int_mv pack_int_mv(int16_t row, int16_t col) { result.as_mv.col = col; return result; } - -static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) { - // This is simplified from the C implementation to utilise that - // x->nmvjointsadcost[1] == x->nmvjointsadcost[2] and - // x->nmvjointsadcost[1] == x->nmvjointsadcost[3] - return mv.as_int == 0 ? 
0 : 1; -} - -static INLINE int mv_cost(const int_mv mv, const int *joint_cost, - int *const comp_cost[2]) { - return joint_cost[get_mv_joint(mv)] + comp_cost[0][mv.as_mv.row] + - comp_cost[1][mv.as_mv.col]; -} - -static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, - int sad_per_bit) { - const int_mv diff = - pack_int_mv(mv.as_mv.row - ref->row, mv.as_mv.col - ref->col); - return ROUND_POWER_OF_TWO( - (unsigned)mv_cost(diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, - VP9_PROB_COST_SHIFT); -} - /***************************************************************************** * This function utilizes 3 properties of the cost function lookup tables, * * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in * @@ -72,8 +49,9 @@ static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, *****************************************************************************/ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, - MV *best_mv, int search_param, int sad_per_bit, - int *num00, const vp9_variance_fn_ptr_t *fn_ptr, + uint32_t start_mv_sad, MV *best_mv, + int search_param, int sad_per_bit, int *num00, + const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv) { const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max); const __m128i v_max_mv_w = _mm_set1_epi32((int)maxmv.as_int); @@ -98,8 +76,8 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); const __m128i vfcmv = _mm_set1_epi32((int)fcenter_mv.as_int); - const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); - const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); + const int ref_row = ref_mv->row; + const int ref_col = ref_mv->col; int_mv bmv = pack_int_mv(ref_row, ref_col); int_mv new_bmv = bmv; @@ -119,8 +97,8 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, #else __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address); #endif - - unsigned int best_sad; + // Starting position + unsigned int best_sad = start_mv_sad; int i, j, step; // Check the prerequisite cost function properties that are easy to check @@ -129,10 +107,6 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); - // Check the starting position - best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride); - best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit); - *num00 = 0; for (i = 0, step = 0; step < tot_steps; step++) { From 35c32b1d223edbf70b61d5fd247d3890acce2025 Mon Sep 17 00:00:00 2001 From: Cherma Rajan A Date: Tue, 11 Apr 2023 14:50:18 +0530 Subject: [PATCH 651/926] Add assert to ensure NEARESTMV or NEWMV modes are not skipped Added an assert for prune_single_mode_based_on_mv_diff_mode_rate speed feature. This ensures NEARMV or ZEROMV modes are pruned only when NEARESTMV and NEWMV modes are not early terminated. Change-Id: Id8b03eef6d1ef3f16714a9cbfde0c171c0c6fe0b --- vp9/encoder/vp9_rdopt.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 05811bd828..9121eeac15 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -2967,8 +2967,14 @@ static int64_t handle_inter_mode( // rate. 
if (skip_single_mode_based_on_mode_rate(mode_mv, single_mode_rate, this_mode, refs[0], *rate2, - best_mode_index)) + best_mode_index)) { + // Check when the single inter mode is pruned, NEARESTMV or NEWMV modes + // are not early terminated. This ensures all single modes are not getting + // skipped when the speed feature is enabled. + assert(single_mode_rate[INTER_OFFSET(NEARESTMV)] != INT_MAX || + single_mode_rate[INTER_OFFSET(NEWMV)] != INT_MAX); return INT64_MAX; + } } if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd && mi->mode != NEARESTMV) From 232f8659aafec1461cac76f76885c8663755957f Mon Sep 17 00:00:00 2001 From: Deepa K G Date: Wed, 5 Apr 2023 16:42:54 +0530 Subject: [PATCH 652/926] Downsample SAD computation in motion search Added a speed feature to skip every other row in SAD computation during motion search. Instruction Count BD-Rate Loss(%) cpu Resolution Reduction(%) avg.psnr ovr.psnr ssim 0 LOWRES2 0.958 0.0204 0.0095 0.0275 0 MIDRES2 1.891 -0.0636 0.0032 0.0247 0 HDRES2 2.869 0.0434 0.0345 0.0686 0 Average 1.905 0.0000 0.0157 0.0403 STATS_CHANGED Change-Id: I1a8692757ed0cbcb2259729b3ecfb0436cdf49ce --- test/sad_test.cc | 515 ++++++++++++++ vp9/common/vp9_rtcd_defs.pl | 4 +- .../arm/neon/vp9_diamond_search_sad_neon.c | 6 +- vp9/encoder/vp9_encoder.c | 647 ++++++++++-------- vp9/encoder/vp9_firstpass.c | 7 +- vp9/encoder/vp9_mcomp.c | 97 ++- vp9/encoder/vp9_mcomp.h | 10 +- vp9/encoder/vp9_speed_features.c | 2 + vp9/encoder/vp9_speed_features.h | 4 + vp9/encoder/x86/vp9_diamond_search_sad_avx.c | 6 +- vpx_dsp/sad.c | 30 + vpx_dsp/variance.h | 4 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 147 ++++ vpx_dsp/x86/highbd_sad4d_avx2.c | 313 +++++---- vpx_dsp/x86/highbd_sad4d_sse2.asm | 43 +- vpx_dsp/x86/highbd_sad_avx2.c | 188 +++-- vpx_dsp/x86/highbd_sad_sse2.asm | 59 +- vpx_dsp/x86/sad4d_avx2.c | 66 +- vpx_dsp/x86/sad4d_sse2.asm | 43 +- vpx_dsp/x86/sad_avx2.c | 145 ++-- vpx_dsp/x86/sad_sse2.asm | 70 +- 21 files changed, 1824 insertions(+), 582 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 0896c77f12..561da5ddfb 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -42,6 +42,10 @@ typedef unsigned int (*SadMxNFunc)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); typedef TestParams SadMxNParam; +typedef unsigned int (*SadSkipMxNFunc)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride); +typedef TestParams SadSkipMxNParam; + typedef unsigned int (*SadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); @@ -52,6 +56,11 @@ typedef void (*SadMxNx4Func)(const uint8_t *src_ptr, int src_stride, unsigned int *sad_array); typedef TestParams SadMxNx4Param; +typedef void (*SadSkipMxNx4Func)(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[], int ref_stride, + unsigned int *sad_array); +typedef TestParams SadSkipMxNx4Param; + typedef void (*SadMxNx8Func)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); @@ -170,6 +179,34 @@ class SADTestBase : public ::testing::TestWithParam { return sad; } + // Sum of Absolute Differences Skip rows. Given two blocks, calculate the + // absolute difference between two pixels in the same relative location every + // other row; accumulate and double the result at the end. 
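+  // For example, with params_.height == 8 only rows 0, 2, 4 and 6 are
+  // summed; doubling that partial sum is the reference value the
+  // vpx_sad_skip_* kernels under test must reproduce bit-exactly.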
+ uint32_t ReferenceSADSkip(int ref_offset) const { + uint32_t sad = 0; + const uint8_t *const reference8 = GetReferenceFromOffset(ref_offset); + const uint8_t *const source8 = source_data_; +#if CONFIG_VP9_HIGHBITDEPTH + const uint16_t *const reference16 = + CONVERT_TO_SHORTPTR(GetReferenceFromOffset(ref_offset)); + const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_); +#endif // CONFIG_VP9_HIGHBITDEPTH + for (int h = 0; h < params_.height; h += 2) { + for (int w = 0; w < params_.width; ++w) { + if (!use_high_bit_depth_) { + sad += abs(source8[h * source_stride_ + w] - + reference8[h * reference_stride_ + w]); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + sad += abs(source16[h * source_stride_ + w] - + reference16[h * reference_stride_ + w]); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + } + return sad * 2; + } + // Sum of Absolute Differences Average. Given two blocks, and a prediction // calculate the absolute difference between one pixel and average of the // corresponding and predicted pixels; accumulate. @@ -290,6 +327,32 @@ class SADx4Test : public SADTestBase { } }; +class SADSkipx4Test : public SADTestBase { + public: + SADSkipx4Test() : SADTestBase(GetParam()) {} + + protected: + void SADs(unsigned int *results) const { + const uint8_t *references[] = { GetReference(0), GetReference(1), + GetReference(2), GetReference(3) }; + + ASM_REGISTER_STATE_CHECK(params_.func( + source_data_, source_stride_, references, reference_stride_, results)); + } + + void CheckSADs() const { + uint32_t reference_sad; + DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]); + + SADs(exp_sad); + for (int block = 0; block < 4; ++block) { + reference_sad = ReferenceSADSkip(GetBlockRefOffset(block)); + + EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block; + } + } +}; + class SADTest : public AbstractBench, public SADTestBase { public: SADTest() : SADTestBase(GetParam()) {} @@ -317,6 +380,33 @@ class SADTest : public AbstractBench, public SADTestBase { } }; +class SADSkipTest : public AbstractBench, public SADTestBase { + public: + SADSkipTest() : SADTestBase(GetParam()) {} + + protected: + unsigned int SAD(int block_idx) const { + unsigned int ret; + const uint8_t *const reference = GetReference(block_idx); + + ASM_REGISTER_STATE_CHECK(ret = params_.func(source_data_, source_stride_, + reference, reference_stride_)); + return ret; + } + + void CheckSAD() const { + const unsigned int reference_sad = ReferenceSADSkip(GetBlockRefOffset(0)); + const unsigned int exp_sad = SAD(0); + + ASSERT_EQ(reference_sad, exp_sad); + } + + void Run() override { + params_.func(source_data_, source_stride_, reference_data_, + reference_stride_); + } +}; + class SADavgTest : public AbstractBench, public SADTestBase { public: SADavgTest() : SADTestBase(GetParam()) {} @@ -397,6 +487,58 @@ TEST_P(SADTest, DISABLED_Speed) { PrintMedian(title); } +TEST_P(SADSkipTest, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(reference_data_, reference_stride_, mask_); + CheckSAD(); +} + +TEST_P(SADSkipTest, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(reference_data_, reference_stride_, 0); + CheckSAD(); +} + +TEST_P(SADSkipTest, ShortRef) { + const int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipTest, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for 
+ // certain types of searches. + const int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipTest, ShortSrc) { + const int tmp_stride = source_stride_; + source_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + source_stride_ = tmp_stride; +} + +TEST_P(SADSkipTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 50000000 / (params_.width * params_.height); + FillRandom(source_data_, source_stride_); + + RunNTimes(kCountSpeedTestBlock); + + char title[16]; + snprintf(title, sizeof(title), "%dx%d", params_.width, params_.height); + PrintMedian(title); +} + TEST_P(SADavgTest, MaxRef) { FillConstant(source_data_, source_stride_, 0); FillConstant(reference_data_, reference_stride_, mask_); @@ -554,6 +696,105 @@ TEST_P(SADx4Test, DISABLED_Speed) { reference_stride_ = tmp_stride; } +TEST_P(SADSkipx4Test, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(GetReference(0), reference_stride_, mask_); + FillConstant(GetReference(1), reference_stride_, mask_); + FillConstant(GetReference(2), reference_stride_, mask_); + FillConstant(GetReference(3), reference_stride_, mask_); + CheckSADs(); +} + +TEST_P(SADSkipx4Test, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(GetReference(0), reference_stride_, 0); + FillConstant(GetReference(1), reference_stride_, 0); + FillConstant(GetReference(2), reference_stride_, 0); + FillConstant(GetReference(3), reference_stride_, 0); + CheckSADs(); +} + +TEST_P(SADSkipx4Test, ShortRef) { + int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipx4Test, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. 
+ int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipx4Test, ShortSrc) { + int tmp_stride = source_stride_; + source_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + source_stride_ = tmp_stride; +} + +TEST_P(SADSkipx4Test, SrcAlignedByWidth) { + uint8_t *tmp_source_data = source_data_; + source_data_ += params_.width; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + source_data_ = tmp_source_data; +} + +TEST_P(SADSkipx4Test, DISABLED_Speed) { + int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + const int kCountSpeedTestBlock = 500000000 / (params_.width * params_.height); + uint32_t reference_sad[4]; + DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]); + vpx_usec_timer timer; + for (int block = 0; block < 4; ++block) { + reference_sad[block] = ReferenceSADSkip(GetBlockRefOffset(block)); + } + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + SADs(exp_sad); + } + vpx_usec_timer_mark(&timer); + for (int block = 0; block < 4; ++block) { + EXPECT_EQ(reference_sad[block], exp_sad[block]) << "block " << block; + } + const int elapsed_time = + static_cast(vpx_usec_timer_elapsed(&timer) / 1000); + printf("sad%dx%dx4 (%2dbit) time: %5d ms\n", params_.width, params_.height, + bit_depth_, elapsed_time); + + reference_stride_ = tmp_stride; +} + //------------------------------------------------------------------------------ // C functions const SadMxNParam c_tests[] = { @@ -614,6 +855,56 @@ const SadMxNParam c_tests[] = { }; INSTANTIATE_TEST_SUITE_P(C, SADTest, ::testing::ValuesIn(c_tests)); +const SadSkipMxNParam skip_c_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_c), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_c), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_c), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_c), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_c), + SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_c), + SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_c), + SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_c), + SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_c), + SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_c), + SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_c), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 8), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 8), + SadSkipMxNParam(16, 
16, &vpx_highbd_sad_skip_16x16_c, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 8), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 8), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 8), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 8), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 10), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 10), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 10), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 10), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 10), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 12), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 12), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 12), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(C, SADSkipTest, ::testing::ValuesIn(skip_c_tests)); + const SadMxNAvgParam avg_c_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_c), SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_c), @@ -730,6 +1021,57 @@ const SadMxNx4Param x4d_c_tests[] = { }; INSTANTIATE_TEST_SUITE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests)); +const SadSkipMxNx4Param skip_x4d_c_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_c), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_c), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_c), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_c), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_c), + SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_c), + SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_c), + SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_c), + SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_c), + SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_c), + SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_c), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 8), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 8), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 8), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 8), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 8), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 10), + 
SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 10), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 10), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 10), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 10), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 12), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 12), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 12), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 12), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 12), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(C, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_c_tests)); + //------------------------------------------------------------------------------ // ARM functions #if HAVE_NEON @@ -956,6 +1298,54 @@ const SadMxNParam sse2_tests[] = { }; INSTANTIATE_TEST_SUITE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests)); +const SadSkipMxNParam skip_sse2_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_sse2), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_sse2), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_sse2), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_sse2), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_sse2), + SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_sse2), + SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_sse2), + SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_sse2), + SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_sse2), + SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_sse2), + SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_sse2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 8), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 8), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 8), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 8), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 8), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 10), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 10), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 10), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 10), + SadSkipMxNParam(64, 64, 
&vpx_highbd_sad_skip_64x64_sse2, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 12), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 12), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipTest, + ::testing::ValuesIn(skip_sse2_tests)); + const SadMxNAvgParam avg_sse2_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_sse2), SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_sse2), @@ -1065,6 +1455,57 @@ const SadMxNx4Param x4d_sse2_tests[] = { #endif // CONFIG_VP9_HIGHBITDEPTH }; INSTANTIATE_TEST_SUITE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests)); + +const SadSkipMxNx4Param skip_x4d_sse2_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_sse2), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_sse2), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_sse2), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_sse2), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_sse2), + SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_sse2), + SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_sse2), + SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_sse2), + SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_sse2), + SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_sse2), + SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_sse2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 8), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 8), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 8), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 8), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 8), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 10), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 10), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 10), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 10), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 10), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 12), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 12), + SadSkipMxNx4Param(32, 16, 
&vpx_highbd_sad_skip_32x16x4d_sse2, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 12), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 12), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 12), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_sse2_tests)); #endif // HAVE_SSE2 #if HAVE_SSE3 @@ -1113,6 +1554,44 @@ const SadMxNParam avx2_tests[] = { }; INSTANTIATE_TEST_SUITE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests)); +const SadSkipMxNParam skip_avx2_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_avx2), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_avx2), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_avx2), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_avx2), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 8), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 8), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 8), + + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 10), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 10), + + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipTest, + ::testing::ValuesIn(skip_avx2_tests)); + const SadMxNAvgParam avg_avx2_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_avx2), SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_avx2), @@ -1180,6 +1659,42 @@ const SadMxNx4Param x4d_avx2_tests[] = { }; INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests)); +const SadSkipMxNx4Param skip_x4d_avx2_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_avx2), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_avx2), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_avx2), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_avx2), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 8), + SadSkipMxNx4Param(32, 64, 
&vpx_highbd_sad_skip_32x64x4d_avx2, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 8), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 10), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 10), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 12), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 12), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_avx2_tests)); + #endif // HAVE_AVX2 #if HAVE_AVX512 diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 5e60792556..4b94c31f15 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -23,7 +23,7 @@ () /* Encoder forward decls */ struct macroblock; -struct vp9_variance_vtable; +struct vp9_sad_table; struct search_site_config; struct mv; union int_mv; @@ -171,7 +171,7 @@ () # # Motion search # -add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; +add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv"; specialize qw/vp9_diamond_search_sad avx neon/; # diff --git a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c index 15334b413b..255e6fbc4a 100644 --- a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c +++ b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c @@ -49,7 +49,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, uint32_t start_mv_sad, MV *best_mv, int search_param, int sad_per_bit, int *num00, - const vp9_variance_fn_ptr_t *fn_ptr, + const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv) { static const uint32_t data[4] = { 0, 1, 2, 3 }; const uint32x4_t v_idx_d = vld1q_u32((const uint32_t *)data); @@ -188,8 +188,8 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, #endif } - fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], - in_what_stride, (uint32_t *)&v_sad_d); + 
sad_fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], + in_what_stride, (uint32_t *)&v_sad_d); // Look up the component cost of the residual motion vector { diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 4cec02eb93..a0b1f2c7c6 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1561,13 +1561,15 @@ void vp9_set_rc_buffer_sizes(VP9_COMP *cpi) { } #if CONFIG_VP9_HIGHBITDEPTH -#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \ - cpi->fn_ptr[BT].sdf = SDF; \ - cpi->fn_ptr[BT].sdaf = SDAF; \ - cpi->fn_ptr[BT].vf = VF; \ - cpi->fn_ptr[BT].svf = SVF; \ - cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx4df = SDX4DF; +#define HIGHBD_BFP(BT, SDF, SDSF, SDAF, VF, SVF, SVAF, SDX4DF, SDSX4DF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdsf = SDSF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ + cpi->fn_ptr[BT].sdx4df = SDX4DF; \ + cpi->fn_ptr[BT].sdsx4df = SDSX4DF; #define MAKE_BFP_SAD_WRAPPER(fnname) \ static unsigned int fnname##_bits8(const uint8_t *src_ptr, \ @@ -1627,284 +1629,361 @@ void vp9_set_rc_buffer_sizes(VP9_COMP *cpi) { } MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x16) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x16) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x16_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x16x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x16x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x32) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x32) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x32_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x32x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x32x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x32) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_64x32) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x32_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x32x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_64x32x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x64) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x64) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x64_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x64x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x64x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x32) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x32) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x32_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x32x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x32x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x64) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_64x64) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x64_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x64x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_64x64x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x16) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x16) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x16_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x16x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x16x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x8) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x8_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x8x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x8x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x16) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x16) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x16_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x16x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x16x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x8) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x8_avg) 
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x8x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x8x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x4) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x4) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x4_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x4x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x4x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x8) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_4x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x8_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x8x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_4x8x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x4) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_4x4) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x4_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x4x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_4x4x4d) static void highbd_set_var_fns(VP9_COMP *const cpi) { VP9_COMMON *const cm = &cpi->common; if (cm->use_highbitdepth) { switch (cm->bit_depth) { case VPX_BITS_8: - HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits8, - vpx_highbd_sad32x16_avg_bits8, vpx_highbd_8_variance32x16, - vpx_highbd_8_sub_pixel_variance32x16, - vpx_highbd_8_sub_pixel_avg_variance32x16, - vpx_highbd_sad32x16x4d_bits8) - - HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits8, - vpx_highbd_sad16x32_avg_bits8, vpx_highbd_8_variance16x32, - vpx_highbd_8_sub_pixel_variance16x32, - vpx_highbd_8_sub_pixel_avg_variance16x32, - vpx_highbd_sad16x32x4d_bits8) - - HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits8, - vpx_highbd_sad64x32_avg_bits8, vpx_highbd_8_variance64x32, - vpx_highbd_8_sub_pixel_variance64x32, - vpx_highbd_8_sub_pixel_avg_variance64x32, - vpx_highbd_sad64x32x4d_bits8) - - HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits8, - vpx_highbd_sad32x64_avg_bits8, vpx_highbd_8_variance32x64, - vpx_highbd_8_sub_pixel_variance32x64, - vpx_highbd_8_sub_pixel_avg_variance32x64, - vpx_highbd_sad32x64x4d_bits8) - - HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits8, - vpx_highbd_sad32x32_avg_bits8, vpx_highbd_8_variance32x32, - vpx_highbd_8_sub_pixel_variance32x32, - vpx_highbd_8_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x4d_bits8) - - HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits8, - vpx_highbd_sad64x64_avg_bits8, vpx_highbd_8_variance64x64, - vpx_highbd_8_sub_pixel_variance64x64, - vpx_highbd_8_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x4d_bits8) - - HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits8, - vpx_highbd_sad16x16_avg_bits8, vpx_highbd_8_variance16x16, - vpx_highbd_8_sub_pixel_variance16x16, - vpx_highbd_8_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x4d_bits8) - - HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits8, - vpx_highbd_sad16x8_avg_bits8, vpx_highbd_8_variance16x8, - vpx_highbd_8_sub_pixel_variance16x8, - vpx_highbd_8_sub_pixel_avg_variance16x8, - vpx_highbd_sad16x8x4d_bits8) - - HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits8, - vpx_highbd_sad8x16_avg_bits8, vpx_highbd_8_variance8x16, - vpx_highbd_8_sub_pixel_variance8x16, - vpx_highbd_8_sub_pixel_avg_variance8x16, - vpx_highbd_sad8x16x4d_bits8) + HIGHBD_BFP( + BLOCK_32X16, vpx_highbd_sad32x16_bits8, + vpx_highbd_sad_skip_32x16_bits8, vpx_highbd_sad32x16_avg_bits8, + vpx_highbd_8_variance32x16, vpx_highbd_8_sub_pixel_variance32x16, + vpx_highbd_8_sub_pixel_avg_variance32x16, + vpx_highbd_sad32x16x4d_bits8, vpx_highbd_sad_skip_32x16x4d_bits8) + + HIGHBD_BFP( + BLOCK_16X32, vpx_highbd_sad16x32_bits8, + vpx_highbd_sad_skip_16x32_bits8, vpx_highbd_sad16x32_avg_bits8, + vpx_highbd_8_variance16x32, vpx_highbd_8_sub_pixel_variance16x32, + 
vpx_highbd_8_sub_pixel_avg_variance16x32, + vpx_highbd_sad16x32x4d_bits8, vpx_highbd_sad_skip_16x32x4d_bits8) + + HIGHBD_BFP( + BLOCK_64X32, vpx_highbd_sad64x32_bits8, + vpx_highbd_sad_skip_64x32_bits8, vpx_highbd_sad64x32_avg_bits8, + vpx_highbd_8_variance64x32, vpx_highbd_8_sub_pixel_variance64x32, + vpx_highbd_8_sub_pixel_avg_variance64x32, + vpx_highbd_sad64x32x4d_bits8, vpx_highbd_sad_skip_64x32x4d_bits8) + + HIGHBD_BFP( + BLOCK_32X64, vpx_highbd_sad32x64_bits8, + vpx_highbd_sad_skip_32x64_bits8, vpx_highbd_sad32x64_avg_bits8, + vpx_highbd_8_variance32x64, vpx_highbd_8_sub_pixel_variance32x64, + vpx_highbd_8_sub_pixel_avg_variance32x64, + vpx_highbd_sad32x64x4d_bits8, vpx_highbd_sad_skip_32x64x4d_bits8) + + HIGHBD_BFP( + BLOCK_32X32, vpx_highbd_sad32x32_bits8, + vpx_highbd_sad_skip_32x32_bits8, vpx_highbd_sad32x32_avg_bits8, + vpx_highbd_8_variance32x32, vpx_highbd_8_sub_pixel_variance32x32, + vpx_highbd_8_sub_pixel_avg_variance32x32, + vpx_highbd_sad32x32x4d_bits8, vpx_highbd_sad_skip_32x32x4d_bits8) HIGHBD_BFP( - BLOCK_8X8, vpx_highbd_sad8x8_bits8, vpx_highbd_sad8x8_avg_bits8, - vpx_highbd_8_variance8x8, vpx_highbd_8_sub_pixel_variance8x8, - vpx_highbd_8_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x4d_bits8) + BLOCK_64X64, vpx_highbd_sad64x64_bits8, + vpx_highbd_sad_skip_64x64_bits8, vpx_highbd_sad64x64_avg_bits8, + vpx_highbd_8_variance64x64, vpx_highbd_8_sub_pixel_variance64x64, + vpx_highbd_8_sub_pixel_avg_variance64x64, + vpx_highbd_sad64x64x4d_bits8, vpx_highbd_sad_skip_64x64x4d_bits8) HIGHBD_BFP( - BLOCK_8X4, vpx_highbd_sad8x4_bits8, vpx_highbd_sad8x4_avg_bits8, - vpx_highbd_8_variance8x4, vpx_highbd_8_sub_pixel_variance8x4, - vpx_highbd_8_sub_pixel_avg_variance8x4, vpx_highbd_sad8x4x4d_bits8) + BLOCK_16X16, vpx_highbd_sad16x16_bits8, + vpx_highbd_sad_skip_16x16_bits8, vpx_highbd_sad16x16_avg_bits8, + vpx_highbd_8_variance16x16, vpx_highbd_8_sub_pixel_variance16x16, + vpx_highbd_8_sub_pixel_avg_variance16x16, + vpx_highbd_sad16x16x4d_bits8, vpx_highbd_sad_skip_16x16x4d_bits8) HIGHBD_BFP( - BLOCK_4X8, vpx_highbd_sad4x8_bits8, vpx_highbd_sad4x8_avg_bits8, - vpx_highbd_8_variance4x8, vpx_highbd_8_sub_pixel_variance4x8, - vpx_highbd_8_sub_pixel_avg_variance4x8, vpx_highbd_sad4x8x4d_bits8) + BLOCK_16X8, vpx_highbd_sad16x8_bits8, + vpx_highbd_sad_skip_16x8_bits8, vpx_highbd_sad16x8_avg_bits8, + vpx_highbd_8_variance16x8, vpx_highbd_8_sub_pixel_variance16x8, + vpx_highbd_8_sub_pixel_avg_variance16x8, + vpx_highbd_sad16x8x4d_bits8, vpx_highbd_sad_skip_16x8x4d_bits8) HIGHBD_BFP( - BLOCK_4X4, vpx_highbd_sad4x4_bits8, vpx_highbd_sad4x4_avg_bits8, - vpx_highbd_8_variance4x4, vpx_highbd_8_sub_pixel_variance4x4, - vpx_highbd_8_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x4d_bits8) + BLOCK_8X16, vpx_highbd_sad8x16_bits8, + vpx_highbd_sad_skip_8x16_bits8, vpx_highbd_sad8x16_avg_bits8, + vpx_highbd_8_variance8x16, vpx_highbd_8_sub_pixel_variance8x16, + vpx_highbd_8_sub_pixel_avg_variance8x16, + vpx_highbd_sad8x16x4d_bits8, vpx_highbd_sad_skip_8x16x4d_bits8) + + HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits8, + vpx_highbd_sad_skip_8x8_bits8, vpx_highbd_sad8x8_avg_bits8, + vpx_highbd_8_variance8x8, vpx_highbd_8_sub_pixel_variance8x8, + vpx_highbd_8_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits8, vpx_highbd_sad_skip_8x8x4d_bits8) + + HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits8, + vpx_highbd_sad_skip_8x4_bits8, vpx_highbd_sad8x4_avg_bits8, + vpx_highbd_8_variance8x4, vpx_highbd_8_sub_pixel_variance8x4, + vpx_highbd_8_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits8, 
vpx_highbd_sad_skip_8x4x4d_bits8) + + HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits8, + vpx_highbd_sad_skip_4x8_bits8, vpx_highbd_sad4x8_avg_bits8, + vpx_highbd_8_variance4x8, vpx_highbd_8_sub_pixel_variance4x8, + vpx_highbd_8_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits8, vpx_highbd_sad_skip_4x8x4d_bits8) + + HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits8, + vpx_highbd_sad_skip_4x4_bits8, vpx_highbd_sad4x4_avg_bits8, + vpx_highbd_8_variance4x4, vpx_highbd_8_sub_pixel_variance4x4, + vpx_highbd_8_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits8, vpx_highbd_sad_skip_4x4x4d_bits8) break; case VPX_BITS_10: - HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits10, - vpx_highbd_sad32x16_avg_bits10, vpx_highbd_10_variance32x16, - vpx_highbd_10_sub_pixel_variance32x16, - vpx_highbd_10_sub_pixel_avg_variance32x16, - vpx_highbd_sad32x16x4d_bits10) - - HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits10, - vpx_highbd_sad16x32_avg_bits10, vpx_highbd_10_variance16x32, - vpx_highbd_10_sub_pixel_variance16x32, - vpx_highbd_10_sub_pixel_avg_variance16x32, - vpx_highbd_sad16x32x4d_bits10) - - HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits10, - vpx_highbd_sad64x32_avg_bits10, vpx_highbd_10_variance64x32, - vpx_highbd_10_sub_pixel_variance64x32, - vpx_highbd_10_sub_pixel_avg_variance64x32, - vpx_highbd_sad64x32x4d_bits10) - - HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits10, - vpx_highbd_sad32x64_avg_bits10, vpx_highbd_10_variance32x64, - vpx_highbd_10_sub_pixel_variance32x64, - vpx_highbd_10_sub_pixel_avg_variance32x64, - vpx_highbd_sad32x64x4d_bits10) - - HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits10, - vpx_highbd_sad32x32_avg_bits10, vpx_highbd_10_variance32x32, - vpx_highbd_10_sub_pixel_variance32x32, - vpx_highbd_10_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x4d_bits10) - - HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits10, - vpx_highbd_sad64x64_avg_bits10, vpx_highbd_10_variance64x64, - vpx_highbd_10_sub_pixel_variance64x64, - vpx_highbd_10_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x4d_bits10) - - HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits10, - vpx_highbd_sad16x16_avg_bits10, vpx_highbd_10_variance16x16, - vpx_highbd_10_sub_pixel_variance16x16, - vpx_highbd_10_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x4d_bits10) - - HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits10, - vpx_highbd_sad16x8_avg_bits10, vpx_highbd_10_variance16x8, - vpx_highbd_10_sub_pixel_variance16x8, - vpx_highbd_10_sub_pixel_avg_variance16x8, - vpx_highbd_sad16x8x4d_bits10) - - HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits10, - vpx_highbd_sad8x16_avg_bits10, vpx_highbd_10_variance8x16, - vpx_highbd_10_sub_pixel_variance8x16, - vpx_highbd_10_sub_pixel_avg_variance8x16, - vpx_highbd_sad8x16x4d_bits10) - - HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits10, - vpx_highbd_sad8x8_avg_bits10, vpx_highbd_10_variance8x8, - vpx_highbd_10_sub_pixel_variance8x8, - vpx_highbd_10_sub_pixel_avg_variance8x8, - vpx_highbd_sad8x8x4d_bits10) - - HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits10, - vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4, - vpx_highbd_10_sub_pixel_variance8x4, - vpx_highbd_10_sub_pixel_avg_variance8x4, - vpx_highbd_sad8x4x4d_bits10) - - HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits10, - vpx_highbd_sad4x8_avg_bits10, vpx_highbd_10_variance4x8, - vpx_highbd_10_sub_pixel_variance4x8, - vpx_highbd_10_sub_pixel_avg_variance4x8, - vpx_highbd_sad4x8x4d_bits10) - - HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits10, - vpx_highbd_sad4x4_avg_bits10, vpx_highbd_10_variance4x4, - 
vpx_highbd_10_sub_pixel_variance4x4, - vpx_highbd_10_sub_pixel_avg_variance4x4, - vpx_highbd_sad4x4x4d_bits10) + HIGHBD_BFP( + BLOCK_32X16, vpx_highbd_sad32x16_bits10, + vpx_highbd_sad_skip_32x16_bits10, vpx_highbd_sad32x16_avg_bits10, + vpx_highbd_10_variance32x16, vpx_highbd_10_sub_pixel_variance32x16, + vpx_highbd_10_sub_pixel_avg_variance32x16, + vpx_highbd_sad32x16x4d_bits10, vpx_highbd_sad_skip_32x16x4d_bits10) + + HIGHBD_BFP( + BLOCK_16X32, vpx_highbd_sad16x32_bits10, + vpx_highbd_sad_skip_16x32_bits10, vpx_highbd_sad16x32_avg_bits10, + vpx_highbd_10_variance16x32, vpx_highbd_10_sub_pixel_variance16x32, + vpx_highbd_10_sub_pixel_avg_variance16x32, + vpx_highbd_sad16x32x4d_bits10, vpx_highbd_sad_skip_16x32x4d_bits10) + + HIGHBD_BFP( + BLOCK_64X32, vpx_highbd_sad64x32_bits10, + vpx_highbd_sad_skip_64x32_bits10, vpx_highbd_sad64x32_avg_bits10, + vpx_highbd_10_variance64x32, vpx_highbd_10_sub_pixel_variance64x32, + vpx_highbd_10_sub_pixel_avg_variance64x32, + vpx_highbd_sad64x32x4d_bits10, vpx_highbd_sad_skip_64x32x4d_bits10) + + HIGHBD_BFP( + BLOCK_32X64, vpx_highbd_sad32x64_bits10, + vpx_highbd_sad_skip_32x64_bits10, vpx_highbd_sad32x64_avg_bits10, + vpx_highbd_10_variance32x64, vpx_highbd_10_sub_pixel_variance32x64, + vpx_highbd_10_sub_pixel_avg_variance32x64, + vpx_highbd_sad32x64x4d_bits10, vpx_highbd_sad_skip_32x64x4d_bits10) + + HIGHBD_BFP( + BLOCK_32X32, vpx_highbd_sad32x32_bits10, + vpx_highbd_sad_skip_32x32_bits10, vpx_highbd_sad32x32_avg_bits10, + vpx_highbd_10_variance32x32, vpx_highbd_10_sub_pixel_variance32x32, + vpx_highbd_10_sub_pixel_avg_variance32x32, + vpx_highbd_sad32x32x4d_bits10, vpx_highbd_sad_skip_32x32x4d_bits10) + + HIGHBD_BFP( + BLOCK_64X64, vpx_highbd_sad64x64_bits10, + vpx_highbd_sad_skip_64x64_bits10, vpx_highbd_sad64x64_avg_bits10, + vpx_highbd_10_variance64x64, vpx_highbd_10_sub_pixel_variance64x64, + vpx_highbd_10_sub_pixel_avg_variance64x64, + vpx_highbd_sad64x64x4d_bits10, vpx_highbd_sad_skip_64x64x4d_bits10) + + HIGHBD_BFP( + BLOCK_16X16, vpx_highbd_sad16x16_bits10, + vpx_highbd_sad_skip_16x16_bits10, vpx_highbd_sad16x16_avg_bits10, + vpx_highbd_10_variance16x16, vpx_highbd_10_sub_pixel_variance16x16, + vpx_highbd_10_sub_pixel_avg_variance16x16, + vpx_highbd_sad16x16x4d_bits10, vpx_highbd_sad_skip_16x16x4d_bits10) + + HIGHBD_BFP( + BLOCK_16X8, vpx_highbd_sad16x8_bits10, + vpx_highbd_sad_skip_16x8_bits10, vpx_highbd_sad16x8_avg_bits10, + vpx_highbd_10_variance16x8, vpx_highbd_10_sub_pixel_variance16x8, + vpx_highbd_10_sub_pixel_avg_variance16x8, + vpx_highbd_sad16x8x4d_bits10, vpx_highbd_sad_skip_16x8x4d_bits10) + + HIGHBD_BFP( + BLOCK_8X16, vpx_highbd_sad8x16_bits10, + vpx_highbd_sad_skip_8x16_bits10, vpx_highbd_sad8x16_avg_bits10, + vpx_highbd_10_variance8x16, vpx_highbd_10_sub_pixel_variance8x16, + vpx_highbd_10_sub_pixel_avg_variance8x16, + vpx_highbd_sad8x16x4d_bits10, vpx_highbd_sad_skip_8x16x4d_bits10) + + HIGHBD_BFP( + BLOCK_8X8, vpx_highbd_sad8x8_bits10, vpx_highbd_sad_skip_8x8_bits10, + vpx_highbd_sad8x8_avg_bits10, vpx_highbd_10_variance8x8, + vpx_highbd_10_sub_pixel_variance8x8, + vpx_highbd_10_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits10, vpx_highbd_sad_skip_8x8x4d_bits10) + + HIGHBD_BFP( + BLOCK_8X4, vpx_highbd_sad8x4_bits10, vpx_highbd_sad_skip_8x4_bits10, + vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4, + vpx_highbd_10_sub_pixel_variance8x4, + vpx_highbd_10_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits10, vpx_highbd_sad_skip_8x4x4d_bits10) + + HIGHBD_BFP( + BLOCK_4X8, vpx_highbd_sad4x8_bits10, 
vpx_highbd_sad_skip_4x8_bits10, + vpx_highbd_sad4x8_avg_bits10, vpx_highbd_10_variance4x8, + vpx_highbd_10_sub_pixel_variance4x8, + vpx_highbd_10_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits10, vpx_highbd_sad_skip_4x8x4d_bits10) + + HIGHBD_BFP( + BLOCK_4X4, vpx_highbd_sad4x4_bits10, vpx_highbd_sad_skip_4x4_bits10, + vpx_highbd_sad4x4_avg_bits10, vpx_highbd_10_variance4x4, + vpx_highbd_10_sub_pixel_variance4x4, + vpx_highbd_10_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits10, vpx_highbd_sad_skip_4x4x4d_bits10) break; default: assert(cm->bit_depth == VPX_BITS_12); - HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits12, - vpx_highbd_sad32x16_avg_bits12, vpx_highbd_12_variance32x16, - vpx_highbd_12_sub_pixel_variance32x16, - vpx_highbd_12_sub_pixel_avg_variance32x16, - vpx_highbd_sad32x16x4d_bits12) - - HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits12, - vpx_highbd_sad16x32_avg_bits12, vpx_highbd_12_variance16x32, - vpx_highbd_12_sub_pixel_variance16x32, - vpx_highbd_12_sub_pixel_avg_variance16x32, - vpx_highbd_sad16x32x4d_bits12) - - HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits12, - vpx_highbd_sad64x32_avg_bits12, vpx_highbd_12_variance64x32, - vpx_highbd_12_sub_pixel_variance64x32, - vpx_highbd_12_sub_pixel_avg_variance64x32, - vpx_highbd_sad64x32x4d_bits12) - - HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits12, - vpx_highbd_sad32x64_avg_bits12, vpx_highbd_12_variance32x64, - vpx_highbd_12_sub_pixel_variance32x64, - vpx_highbd_12_sub_pixel_avg_variance32x64, - vpx_highbd_sad32x64x4d_bits12) - - HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits12, - vpx_highbd_sad32x32_avg_bits12, vpx_highbd_12_variance32x32, - vpx_highbd_12_sub_pixel_variance32x32, - vpx_highbd_12_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x4d_bits12) - - HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits12, - vpx_highbd_sad64x64_avg_bits12, vpx_highbd_12_variance64x64, - vpx_highbd_12_sub_pixel_variance64x64, - vpx_highbd_12_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x4d_bits12) - - HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits12, - vpx_highbd_sad16x16_avg_bits12, vpx_highbd_12_variance16x16, - vpx_highbd_12_sub_pixel_variance16x16, - vpx_highbd_12_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x4d_bits12) - - HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits12, - vpx_highbd_sad16x8_avg_bits12, vpx_highbd_12_variance16x8, - vpx_highbd_12_sub_pixel_variance16x8, - vpx_highbd_12_sub_pixel_avg_variance16x8, - vpx_highbd_sad16x8x4d_bits12) - - HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits12, - vpx_highbd_sad8x16_avg_bits12, vpx_highbd_12_variance8x16, - vpx_highbd_12_sub_pixel_variance8x16, - vpx_highbd_12_sub_pixel_avg_variance8x16, - vpx_highbd_sad8x16x4d_bits12) - - HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits12, - vpx_highbd_sad8x8_avg_bits12, vpx_highbd_12_variance8x8, - vpx_highbd_12_sub_pixel_variance8x8, - vpx_highbd_12_sub_pixel_avg_variance8x8, - vpx_highbd_sad8x8x4d_bits12) - - HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits12, - vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4, - vpx_highbd_12_sub_pixel_variance8x4, - vpx_highbd_12_sub_pixel_avg_variance8x4, - vpx_highbd_sad8x4x4d_bits12) - - HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits12, - vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8, - vpx_highbd_12_sub_pixel_variance4x8, - vpx_highbd_12_sub_pixel_avg_variance4x8, - vpx_highbd_sad4x8x4d_bits12) - - HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits12, - vpx_highbd_sad4x4_avg_bits12, vpx_highbd_12_variance4x4, - vpx_highbd_12_sub_pixel_variance4x4, - 
vpx_highbd_12_sub_pixel_avg_variance4x4, - vpx_highbd_sad4x4x4d_bits12) + HIGHBD_BFP( + BLOCK_32X16, vpx_highbd_sad32x16_bits12, + vpx_highbd_sad_skip_32x16_bits12, vpx_highbd_sad32x16_avg_bits12, + vpx_highbd_12_variance32x16, vpx_highbd_12_sub_pixel_variance32x16, + vpx_highbd_12_sub_pixel_avg_variance32x16, + vpx_highbd_sad32x16x4d_bits12, vpx_highbd_sad_skip_32x16x4d_bits12) + + HIGHBD_BFP( + BLOCK_16X32, vpx_highbd_sad16x32_bits12, + vpx_highbd_sad_skip_16x32_bits12, vpx_highbd_sad16x32_avg_bits12, + vpx_highbd_12_variance16x32, vpx_highbd_12_sub_pixel_variance16x32, + vpx_highbd_12_sub_pixel_avg_variance16x32, + vpx_highbd_sad16x32x4d_bits12, vpx_highbd_sad_skip_16x32x4d_bits12) + + HIGHBD_BFP( + BLOCK_64X32, vpx_highbd_sad64x32_bits12, + vpx_highbd_sad_skip_64x32_bits12, vpx_highbd_sad64x32_avg_bits12, + vpx_highbd_12_variance64x32, vpx_highbd_12_sub_pixel_variance64x32, + vpx_highbd_12_sub_pixel_avg_variance64x32, + vpx_highbd_sad64x32x4d_bits12, vpx_highbd_sad_skip_64x32x4d_bits12) + + HIGHBD_BFP( + BLOCK_32X64, vpx_highbd_sad32x64_bits12, + vpx_highbd_sad_skip_32x64_bits12, vpx_highbd_sad32x64_avg_bits12, + vpx_highbd_12_variance32x64, vpx_highbd_12_sub_pixel_variance32x64, + vpx_highbd_12_sub_pixel_avg_variance32x64, + vpx_highbd_sad32x64x4d_bits12, vpx_highbd_sad_skip_32x64x4d_bits12) + + HIGHBD_BFP( + BLOCK_32X32, vpx_highbd_sad32x32_bits12, + vpx_highbd_sad_skip_32x32_bits12, vpx_highbd_sad32x32_avg_bits12, + vpx_highbd_12_variance32x32, vpx_highbd_12_sub_pixel_variance32x32, + vpx_highbd_12_sub_pixel_avg_variance32x32, + vpx_highbd_sad32x32x4d_bits12, vpx_highbd_sad_skip_32x32x4d_bits12) + + HIGHBD_BFP( + BLOCK_64X64, vpx_highbd_sad64x64_bits12, + vpx_highbd_sad_skip_64x64_bits12, vpx_highbd_sad64x64_avg_bits12, + vpx_highbd_12_variance64x64, vpx_highbd_12_sub_pixel_variance64x64, + vpx_highbd_12_sub_pixel_avg_variance64x64, + vpx_highbd_sad64x64x4d_bits12, vpx_highbd_sad_skip_64x64x4d_bits12) + + HIGHBD_BFP( + BLOCK_16X16, vpx_highbd_sad16x16_bits12, + vpx_highbd_sad_skip_16x16_bits12, vpx_highbd_sad16x16_avg_bits12, + vpx_highbd_12_variance16x16, vpx_highbd_12_sub_pixel_variance16x16, + vpx_highbd_12_sub_pixel_avg_variance16x16, + vpx_highbd_sad16x16x4d_bits12, vpx_highbd_sad_skip_16x16x4d_bits12) + + HIGHBD_BFP( + BLOCK_16X8, vpx_highbd_sad16x8_bits12, + vpx_highbd_sad_skip_16x8_bits12, vpx_highbd_sad16x8_avg_bits12, + vpx_highbd_12_variance16x8, vpx_highbd_12_sub_pixel_variance16x8, + vpx_highbd_12_sub_pixel_avg_variance16x8, + vpx_highbd_sad16x8x4d_bits12, vpx_highbd_sad_skip_16x8x4d_bits12) + + HIGHBD_BFP( + BLOCK_8X16, vpx_highbd_sad8x16_bits12, + vpx_highbd_sad_skip_8x16_bits12, vpx_highbd_sad8x16_avg_bits12, + vpx_highbd_12_variance8x16, vpx_highbd_12_sub_pixel_variance8x16, + vpx_highbd_12_sub_pixel_avg_variance8x16, + vpx_highbd_sad8x16x4d_bits12, vpx_highbd_sad_skip_8x16x4d_bits12) + + HIGHBD_BFP( + BLOCK_8X8, vpx_highbd_sad8x8_bits12, vpx_highbd_sad_skip_8x8_bits12, + vpx_highbd_sad8x8_avg_bits12, vpx_highbd_12_variance8x8, + vpx_highbd_12_sub_pixel_variance8x8, + vpx_highbd_12_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits12, vpx_highbd_sad_skip_8x8x4d_bits12) + + HIGHBD_BFP( + BLOCK_8X4, vpx_highbd_sad8x4_bits12, vpx_highbd_sad_skip_8x4_bits12, + vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4, + vpx_highbd_12_sub_pixel_variance8x4, + vpx_highbd_12_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits12, vpx_highbd_sad_skip_8x4x4d_bits12) + + HIGHBD_BFP( + BLOCK_4X8, vpx_highbd_sad4x8_bits12, vpx_highbd_sad_skip_4x8_bits12, + 
vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8, + vpx_highbd_12_sub_pixel_variance4x8, + vpx_highbd_12_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits12, vpx_highbd_sad_skip_4x8x4d_bits12) + + HIGHBD_BFP( + BLOCK_4X4, vpx_highbd_sad4x4_bits12, vpx_highbd_sad_skip_4x4_bits12, + vpx_highbd_sad4x4_avg_bits12, vpx_highbd_12_variance4x4, + vpx_highbd_12_sub_pixel_variance4x4, + vpx_highbd_12_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits12, vpx_highbd_sad_skip_4x4x4d_bits12) break; } } @@ -2550,61 +2629,67 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, vpx_calloc(cm->MBs, sizeof(cpi->source_diff_var))); cpi->source_var_thresh = 0; cpi->frames_till_next_var_check = 0; -#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \ - cpi->fn_ptr[BT].sdf = SDF; \ - cpi->fn_ptr[BT].sdaf = SDAF; \ - cpi->fn_ptr[BT].vf = VF; \ - cpi->fn_ptr[BT].svf = SVF; \ - cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx4df = SDX4DF; - - BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg, vpx_variance32x16, - vpx_sub_pixel_variance32x16, vpx_sub_pixel_avg_variance32x16, - vpx_sad32x16x4d) - - BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg, vpx_variance16x32, - vpx_sub_pixel_variance16x32, vpx_sub_pixel_avg_variance16x32, - vpx_sad16x32x4d) - - BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg, vpx_variance64x32, - vpx_sub_pixel_variance64x32, vpx_sub_pixel_avg_variance64x32, - vpx_sad64x32x4d) - - BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg, vpx_variance32x64, - vpx_sub_pixel_variance32x64, vpx_sub_pixel_avg_variance32x64, - vpx_sad32x64x4d) - - BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg, vpx_variance32x32, - vpx_sub_pixel_variance32x32, vpx_sub_pixel_avg_variance32x32, - vpx_sad32x32x4d) - - BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg, vpx_variance64x64, - vpx_sub_pixel_variance64x64, vpx_sub_pixel_avg_variance64x64, - vpx_sad64x64x4d) - - BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg, vpx_variance16x16, - vpx_sub_pixel_variance16x16, vpx_sub_pixel_avg_variance16x16, - vpx_sad16x16x4d) - - BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg, vpx_variance16x8, - vpx_sub_pixel_variance16x8, vpx_sub_pixel_avg_variance16x8, - vpx_sad16x8x4d) - - BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg, vpx_variance8x16, - vpx_sub_pixel_variance8x16, vpx_sub_pixel_avg_variance8x16, - vpx_sad8x16x4d) - - BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg, vpx_variance8x8, - vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d) - - BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg, vpx_variance8x4, - vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d) - - BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg, vpx_variance4x8, - vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d) - - BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg, vpx_variance4x4, - vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d) +#define BFP(BT, SDF, SDSF, SDAF, VF, SVF, SVAF, SDX4DF, SDSX4DF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdsf = SDSF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ + cpi->fn_ptr[BT].sdx4df = SDX4DF; \ + cpi->fn_ptr[BT].sdsx4df = SDSX4DF; + + BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad_skip_32x16, vpx_sad32x16_avg, + vpx_variance32x16, vpx_sub_pixel_variance32x16, + vpx_sub_pixel_avg_variance32x16, vpx_sad32x16x4d, vpx_sad_skip_32x16x4d) + + BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad_skip_16x32, vpx_sad16x32_avg, + vpx_variance16x32, vpx_sub_pixel_variance16x32, + 
vpx_sub_pixel_avg_variance16x32, vpx_sad16x32x4d, vpx_sad_skip_16x32x4d)
+
+  BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad_skip_64x32, vpx_sad64x32_avg,
+      vpx_variance64x32, vpx_sub_pixel_variance64x32,
+      vpx_sub_pixel_avg_variance64x32, vpx_sad64x32x4d, vpx_sad_skip_64x32x4d)
+
+  BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad_skip_32x64, vpx_sad32x64_avg,
+      vpx_variance32x64, vpx_sub_pixel_variance32x64,
+      vpx_sub_pixel_avg_variance32x64, vpx_sad32x64x4d, vpx_sad_skip_32x64x4d)
+
+  BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad_skip_32x32, vpx_sad32x32_avg,
+      vpx_variance32x32, vpx_sub_pixel_variance32x32,
+      vpx_sub_pixel_avg_variance32x32, vpx_sad32x32x4d, vpx_sad_skip_32x32x4d)
+
+  BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad_skip_64x64, vpx_sad64x64_avg,
+      vpx_variance64x64, vpx_sub_pixel_variance64x64,
+      vpx_sub_pixel_avg_variance64x64, vpx_sad64x64x4d, vpx_sad_skip_64x64x4d)
+
+  BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad_skip_16x16, vpx_sad16x16_avg,
+      vpx_variance16x16, vpx_sub_pixel_variance16x16,
+      vpx_sub_pixel_avg_variance16x16, vpx_sad16x16x4d, vpx_sad_skip_16x16x4d)
+
+  BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad_skip_16x8, vpx_sad16x8_avg,
+      vpx_variance16x8, vpx_sub_pixel_variance16x8,
+      vpx_sub_pixel_avg_variance16x8, vpx_sad16x8x4d, vpx_sad_skip_16x8x4d)
+
+  BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad_skip_8x16, vpx_sad8x16_avg,
+      vpx_variance8x16, vpx_sub_pixel_variance8x16,
+      vpx_sub_pixel_avg_variance8x16, vpx_sad8x16x4d, vpx_sad_skip_8x16x4d)
+
+  BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad_skip_8x8, vpx_sad8x8_avg, vpx_variance8x8,
+      vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d,
+      vpx_sad_skip_8x8x4d)
+
+  BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad_skip_8x4, vpx_sad8x4_avg, vpx_variance8x4,
+      vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d,
+      vpx_sad_skip_8x4x4d)
+
+  BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad_skip_4x8, vpx_sad4x8_avg, vpx_variance4x8,
+      vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d,
+      vpx_sad_skip_4x8x4d)
+
+  BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad_skip_4x4, vpx_sad4x4_avg, vpx_variance4x4,
+      vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d,
+      vpx_sad_skip_4x4x4d)

 #if CONFIG_VP9_HIGHBITDEPTH
   highbd_set_var_fns(cpi);
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 0efa836aca..71d8775ea5 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -437,6 +437,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY;
   MV center_mv_full = ref_mv_full;
   unsigned int start_mv_sad;
+  vp9_sad_fn_ptr_t sad_fn_ptr;
   int step_param = 3;
   int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
@@ -462,11 +463,13 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                    x->mv_limits.row_min, x->mv_limits.row_max);
   start_mv_sad = get_start_mv_sad(x, &ref_mv_full, &center_mv_full,
                                   cpi->fn_ptr[bsize].sdf, x->sadperbit16);
+  sad_fn_ptr.sdf = cpi->fn_ptr[bsize].sdf;
+  sad_fn_ptr.sdx4df = cpi->fn_ptr[bsize].sdx4df;

   // Center the initial step/diamond search on best mv.
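Because vp9_diamond_search_sad now takes the slim vp9_sad_table rather than the whole variance vtable (see the vp9_rtcd_defs.pl hunk earlier), the first pass simply bundles the two callbacks it already owns; the downsampling variants are never installed here, and the variance-based refinement that follows keeps using v_fn_ptr. A minimal sketch of that division of labor, with names taken from this patch:

    /* Diamond search: slim SAD table, full-resolution functions only. */
    cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad,
                            &tmp_mv, step_param, x->sadperbit16, &num00,
                            &sad_fn_ptr, ref_mv);
    /* Refinement: full variance vtable, unchanged by this patch. */
    vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
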
tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad, &tmp_mv, step_param, x->sadperbit16, &num00, - &v_fn_ptr, ref_mv); + &sad_fn_ptr, ref_mv); if (tmp_err < INT_MAX) tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); if (tmp_err < INT_MAX - new_mv_mode_penalty) tmp_err += new_mv_mode_penalty; @@ -488,7 +491,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } else { tmp_err = cpi->diamond_search_sad( x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad, &tmp_mv, step_param + n, - x->sadperbit16, &num00, &v_fn_ptr, ref_mv); + x->sadperbit16, &num00, &sad_fn_ptr, ref_mv); if (tmp_err < INT_MAX) tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); if (tmp_err < INT_MAX - new_mv_mode_penalty) diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 4ff685b242..64e9ef0f91 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -2055,7 +2055,7 @@ int vp9_prepare_nb_full_mvs(const MotionField *motion_field, int mi_row, int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, uint32_t start_mv_sad, MV *best_mv, int search_param, int sad_per_bit, int *num00, - const vp9_variance_fn_ptr_t *fn_ptr, + const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv) { int i, j, step; @@ -2117,8 +2117,8 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, for (t = 0; t < 4; t++) block_offset[t] = ss_os[i + t] + best_address; - fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, - sad_array); + sad_fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, + sad_array); for (t = 0; t < 4; t++, i++) { if (sad_array[t] < bestsad) { @@ -2142,7 +2142,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, if (is_mv_in(&x->mv_limits, &this_mv)) { const uint8_t *const check_here = ss_os[i] + best_address; unsigned int thissad = - fn_ptr->sdf(what, what_stride, check_here, in_what_stride); + sad_fn_ptr->sdf(what, what_stride, check_here, in_what_stride); if (thissad < bestsad) { thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); @@ -2484,24 +2484,54 @@ int vp9_full_pixel_diamond_new(const VP9_COMP *cpi, MACROBLOCK *x, point as the best match, we will do a final 1-away diamond refining search */ static int full_pixel_diamond(const VP9_COMP *const cpi, - const MACROBLOCK *const x, MV *mvp_full, - int step_param, int sadpb, int further_steps, - int do_refine, int *cost_list, + const MACROBLOCK *const x, BLOCK_SIZE bsize, + MV *mvp_full, int step_param, int sadpb, + int further_steps, int do_refine, + int use_downsampled_sad, int *cost_list, const vp9_variance_fn_ptr_t *fn_ptr, const MV *ref_mv, MV *dst_mv) { MV temp_mv; int thissme, n, num00 = 0; int bestsme; - unsigned int start_mv_sad; + const int src_buf_stride = x->plane[0].src.stride; + const uint8_t *const src_buf = x->plane[0].src.buf; + const MACROBLOCKD *const xd = &x->e_mbd; + const int pred_buf_stride = xd->plane[0].pre[0].stride; + uint8_t *pred_buf; + vp9_sad_fn_ptr_t sad_fn_ptr; + unsigned int start_mv_sad, start_mv_sad_even_rows, start_mv_sad_odd_rows; const MV ref_mv_full = { ref_mv->row >> 3, ref_mv->col >> 3 }; clamp_mv(mvp_full, x->mv_limits.col_min, x->mv_limits.col_max, x->mv_limits.row_min, x->mv_limits.row_max); - start_mv_sad = - get_start_mv_sad(x, mvp_full, &ref_mv_full, fn_ptr->sdf, sadpb); + + pred_buf = + xd->plane[0].pre[0].buf + mvp_full->row * pred_buf_stride + mvp_full->col; + start_mv_sad_even_rows = + fn_ptr->sdsf(src_buf, 
src_buf_stride, pred_buf, pred_buf_stride); + start_mv_sad_odd_rows = + fn_ptr->sdsf(src_buf + src_buf_stride, src_buf_stride, + pred_buf + pred_buf_stride, pred_buf_stride); + start_mv_sad = (start_mv_sad_even_rows + start_mv_sad_odd_rows) >> 1; + start_mv_sad += mvsad_err_cost(x, mvp_full, &ref_mv_full, sadpb); + + sad_fn_ptr.sdf = fn_ptr->sdf; + sad_fn_ptr.sdx4df = fn_ptr->sdx4df; + if (use_downsampled_sad && num_4x4_blocks_high_lookup[bsize] >= 2) { + // If the absolute difference between the pred-to-src SAD of even rows and + // the pred-to-src SAD of odd rows is small, skip every other row in sad + // computation. + const int odd_to_even_diff_sad = + abs((int)start_mv_sad_even_rows - (int)start_mv_sad_odd_rows); + const int mult_thresh = 10; + if (odd_to_even_diff_sad * mult_thresh < (int)start_mv_sad_even_rows) { + sad_fn_ptr.sdf = fn_ptr->sdsf; + sad_fn_ptr.sdx4df = fn_ptr->sdsx4df; + } + } bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad, &temp_mv, - step_param, sadpb, &n, fn_ptr, ref_mv); + step_param, sadpb, &n, &sad_fn_ptr, ref_mv); if (bestsme < INT_MAX) bestsme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); *dst_mv = temp_mv; @@ -2518,7 +2548,7 @@ static int full_pixel_diamond(const VP9_COMP *const cpi, } else { thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad, &temp_mv, step_param + n, sadpb, &num00, - fn_ptr, ref_mv); + &sad_fn_ptr, ref_mv); if (thissme < INT_MAX) thissme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); @@ -2536,8 +2566,8 @@ static int full_pixel_diamond(const VP9_COMP *const cpi, if (do_refine) { const int search_range = 8; MV best_mv = *dst_mv; - thissme = vp9_refining_search_sad(x, &best_mv, sadpb, search_range, fn_ptr, - ref_mv); + thissme = vp9_refining_search_sad(x, &best_mv, sadpb, search_range, + &sad_fn_ptr, ref_mv); if (thissme < INT_MAX) thissme = vp9_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1); if (thissme < bestsme) { @@ -2546,6 +2576,27 @@ static int full_pixel_diamond(const VP9_COMP *const cpi, } } + if (sad_fn_ptr.sdf != fn_ptr->sdf) { + // If we are skipping rows when we perform the motion search, we need to + // check the quality of skipping. If it's bad, then we run search with + // skip row features off. + const uint8_t *best_address = get_buf_from_mv(&xd->plane[0].pre[0], dst_mv); + const int sad = + fn_ptr->sdf(src_buf, src_buf_stride, best_address, pred_buf_stride); + const int skip_sad = + fn_ptr->sdsf(src_buf, src_buf_stride, best_address, pred_buf_stride); + // We will keep the result of skipping rows if it's good enough. + const int kSADThresh = + 1 << (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + if (sad > kSADThresh && abs(skip_sad - sad) * 10 >= VPXMAX(sad, 1) * 9) { + // There is a large discrepancy between skipping and not skipping, so we + // need to redo the motion search. + return full_pixel_diamond(cpi, x, bsize, mvp_full, step_param, sadpb, + further_steps, do_refine, 0, cost_list, fn_ptr, + ref_mv, dst_mv); + } + } + // Return cost list. 
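To make the two thresholds concrete: with start_mv_sad_even_rows = 1000 and start_mv_sad_odd_rows = 1050, the gap of 50 satisfies 50 * 10 < 1000, so the skip variants are installed. If the search then settles on a match whose full SAD is 400 while the skip SAD reports 900, the check above fires (400 exceeds kSADThresh, which is at most 256 even for 64x64, and |900 - 400| * 10 = 5000 >= 400 * 9 = 3600) and the search reruns at full resolution. A standalone sketch of that final predicate, with illustrative names:

    /* Returns 1 when the downsampled search result can be trusted.
     * Mirrors the check in full_pixel_diamond(); names are illustrative. */
    static int keep_downsampled_result(int full_sad, int skip_sad,
                                       BLOCK_SIZE bsize) {
      const int sad_thresh =
          1 << (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
      if (full_sad <= sad_thresh) return 1; /* too small to judge reliably */
      return abs(skip_sad - full_sad) * 10 < VPXMAX(full_sad, 1) * 9;
    }
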
if (cost_list) { calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list); @@ -2697,7 +2748,7 @@ int64_t vp9_refining_search_sad_new(const MACROBLOCK *x, MV *best_full_mv, int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, - const vp9_variance_fn_ptr_t *fn_ptr, + const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv) { const MACROBLOCKD *const xd = &x->e_mbd; const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } }; @@ -2706,7 +2757,7 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; const uint8_t *best_address = get_buf_from_mv(in_what, ref_mv); unsigned int best_sad = - fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride) + + sad_fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride) + mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit); int i, j; @@ -2723,7 +2774,8 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, best_address - 1, best_address + 1, best_address + in_what->stride }; - fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, sads); + sad_fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, + sads); for (j = 0; j < 4; ++j) { if (sads[j] < best_sad) { @@ -2743,8 +2795,8 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, if (is_mv_in(&x->mv_limits, &mv)) { unsigned int sad = - fn_ptr->sdf(what->buf, what->stride, - get_buf_from_mv(in_what, &mv), in_what->stride); + sad_fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), in_what->stride); if (sad < best_sad) { sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit); if (sad < best_sad) { @@ -2861,9 +2913,10 @@ int vp9_full_pixel_search(const VP9_COMP *const cpi, const MACROBLOCK *const x, break; case NSTEP: case MESH: - var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit, - MAX_MVSEARCH_STEPS - 1 - step_param, 1, - cost_list, fn_ptr, ref_mv, tmp_mv); + var = full_pixel_diamond( + cpi, x, bsize, mvp_full, step_param, error_per_bit, + MAX_MVSEARCH_STEPS - 1 - step_param, 1, + cpi->sf.mv.use_downsampled_sad, cost_list, fn_ptr, ref_mv, tmp_mv); break; default: assert(0 && "Unknown search method"); } diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index 62a7a047d4..fd6a8b9aca 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -41,6 +41,11 @@ typedef struct search_site_config { int total_steps; } search_site_config; +typedef struct vp9_sad_table { + vpx_sad_fn_t sdf; + vpx_sad_multi_d_fn_t sdx4df; +} vp9_sad_fn_ptr_t; + static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf, const MV *mv) { return &buf->buf[mv->row * buf->stride + mv->col]; @@ -63,12 +68,13 @@ int vp9_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv, struct VP9_COMP; struct SPEED_FEATURES; +struct vp9_sad_table; int vp9_init_search_range(int size); int vp9_refining_search_sad(const struct macroblock *x, struct mv *ref_mv, int error_per_bit, int search_range, - const struct vp9_variance_vtable *fn_ptr, + const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); // Perform integral projection based motion estimation. 
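The new vp9_sad_table deliberately carries only the two entry points the diamond and refining searches actually call, so headers can forward-declare it (struct vp9_sad_table above) without pulling in the variance vtable, and a caller can retarget both SAD functions in one place. A minimal sketch of the swap this enables, assuming a hypothetical use_skip flag:

    vp9_sad_fn_ptr_t sad_fn_ptr;
    sad_fn_ptr.sdf = use_skip ? fn_ptr->sdsf : fn_ptr->sdf;
    sad_fn_ptr.sdx4df = use_skip ? fn_ptr->sdsx4df : fn_ptr->sdx4df;
    /* Every sdf/sdx4df call site inside the search loops is unchanged. */
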
@@ -96,7 +102,7 @@ extern fractional_mv_step_fp vp9_return_min_sub_pixel_mv; typedef int (*vp9_diamond_search_fn_t)( const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, uint32_t start_mv_sad, MV *best_mv, int search_param, int sad_per_bit, - int *num00, const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv); + int *num00, const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv); int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 3e121b799f..2aa3140052 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -223,6 +223,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->allow_skip_recode = 1; sf->less_rectangular_check = 1; sf->mv.auto_mv_step_size = 1; + sf->mv.use_downsampled_sad = 1; sf->prune_ref_frame_for_rect_partitions = 1; sf->temporal_filter_search_method = NSTEP; sf->tx_size_search_breakout = 1; @@ -919,6 +920,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) { sf->coeff_prob_appx_step = 1; sf->mv.auto_mv_step_size = 0; sf->mv.fullpel_search_step_param = 6; + sf->mv.use_downsampled_sad = 0; sf->comp_inter_joint_search_thresh = BLOCK_4X4; sf->tx_size_search_method = USE_FULL_RD; sf->use_lp32x32fdct = 0; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index d32bf09e4e..7b7290d714 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -210,6 +210,10 @@ typedef struct MV_SPEED_FEATURES { // This variable sets the step_param used in full pel motion search. int fullpel_search_step_param; + + // Whether to downsample the rows in sad calculation during motion search. + // This is only active when there are at least 8 rows. 
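+  // ("At least 8 rows" is the num_4x4_blocks_high_lookup[bsize] >= 2 guard
+  // in full_pixel_diamond(). The good-quality speed presets set this to 1;
+  // the defaults leave it 0.)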
+ int use_downsampled_sad; } MV_SPEED_FEATURES; typedef struct PARTITION_SEARCH_BREAKOUT_THR { diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c index 719ab40f90..80442e3594 100644 --- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c +++ b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c @@ -51,7 +51,7 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, uint32_t start_mv_sad, MV *best_mv, int search_param, int sad_per_bit, int *num00, - const vp9_variance_fn_ptr_t *fn_ptr, + const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv) { const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max); const __m128i v_max_mv_w = _mm_set1_epi32((int)maxmv.as_int); @@ -167,8 +167,8 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, #endif } - fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], - in_what_stride, (uint32_t *)&v_sad_d); + sad_fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], + in_what_stride, (uint32_t *)&v_sad_d); // Look up the component cost of the residual motion vector { diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c index b47c43430d..619d7aa956 100644 --- a/vpx_dsp/sad.c +++ b/vpx_dsp/sad.c @@ -43,6 +43,12 @@ static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride, DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]); \ vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \ return sad(src_ptr, src_stride, comp_pred, m, m, n); \ + } \ + unsigned int vpx_sad_skip_##m##x##n##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return 2 * sad(src_ptr, 2 * src_stride, ref_ptr, 2 * ref_stride, (m), \ + (n / 2)); \ } // Compare |src_ptr| to 4 distinct references in |ref_array[4]| @@ -54,6 +60,15 @@ static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride, for (i = 0; i < 4; ++i) \ sad_array[i] = \ vpx_sad##m##x##n##_c(src_ptr, src_stride, ref_array[i], ref_stride); \ + } \ + void vpx_sad_skip_##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = 2 * sad(src_ptr, 2 * src_stride, ref_array[i], \ + 2 * ref_stride, (m), (n / 2)); \ + } \ } /* clang-format off */ @@ -156,6 +171,12 @@ static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride, vpx_highbd_comp_avg_pred_c(comp_pred, CONVERT_TO_SHORTPTR(second_pred), m, \ n, CONVERT_TO_SHORTPTR(ref_ptr), ref_stride); \ return highbd_sadb(src_ptr, src_stride, comp_pred, m, m, n); \ + } \ + unsigned int vpx_highbd_sad_skip_##m##x##n##_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * \ + highbd_sad(src, 2 * src_stride, ref, 2 * ref_stride, (m), (n / 2)); \ } #define highbd_sadMxNx4D(m, n) \ @@ -167,6 +188,15 @@ static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride, sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride, \ ref_array[i], ref_stride); \ } \ + } \ + void vpx_highbd_sad_skip_##m##x##n##x4d_c( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = vpx_highbd_sad_skip_##m##x##n##_c( \ + src, src_stride, ref_array[i], ref_stride); \ + } \ } /* clang-format off */ diff --git a/vpx_dsp/variance.h b/vpx_dsp/variance.h index 755cb907d2..ccdb2f90ba 100644 
--- a/vpx_dsp/variance.h +++ b/vpx_dsp/variance.h @@ -69,11 +69,15 @@ typedef struct variance_vtable { #if CONFIG_VP9 typedef struct vp9_variance_vtable { vpx_sad_fn_t sdf; + // Same as normal sad, but downsample the rows by a factor of 2. + vpx_sad_fn_t sdsf; vpx_sad_avg_fn_t sdaf; vpx_variance_fn_t vf; vpx_subpixvariance_fn_t svf; vpx_subp_avg_variance_fn_t svaf; vpx_sad_multi_d_fn_t sdx4df; + // Same as sadx4, but downsample the rows by a factor of 2. + vpx_sad_multi_d_fn_t sdsx4df; } vp9_variance_fn_ptr_t; #endif // CONFIG_VP9 diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 49bc9a6309..346097dc79 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -786,6 +786,43 @@ () add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad4x4 neon msa sse2 mmi/; +add_proto qw/unsigned int vpx_sad_skip_64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_64x64 avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_64x32 avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_32x64 avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_32x32 avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_32x16 avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_16x32 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_16x16 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_16x8 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_8x16 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_8x8 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + +add_proto qw/unsigned int vpx_sad_skip_4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_4x8 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + # # Avg # @@ -928,6 +965,43 @@ () add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/; +add_proto qw/void vpx_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_64x64x4d avx2 sse2/; + +add_proto qw/void 
vpx_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_64x32x4d avx2 sse2/; + +add_proto qw/void vpx_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_32x64x4d avx2 sse2/; + +add_proto qw/void vpx_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_32x32x4d avx2 sse2/; + +add_proto qw/void vpx_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_32x16x4d avx2 sse2/; + +add_proto qw/void vpx_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_16x32x4d sse2/; + +add_proto qw/void vpx_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_16x16x4d sse2/; + +add_proto qw/void vpx_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_16x8x4d sse2/; + +add_proto qw/void vpx_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_8x16x4d sse2/; + +add_proto qw/void vpx_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_8x8x4d sse2/; + +add_proto qw/void vpx_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + +add_proto qw/void vpx_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_4x8x4d sse2/; + +add_proto qw/void vpx_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size"; specialize qw/vpx_sum_squares_2d_i16 neon sse2 msa/; @@ -991,6 +1065,42 @@ () add_proto qw/unsigned int vpx_highbd_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_highbd_sad4x4 neon/; + add_proto qw/unsigned int vpx_highbd_sad_skip_64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_64x64 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_64x32 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_32x64 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_32x32 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_32x16/, "const uint8_t 
*src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_32x16 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_16x32 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_16x16 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_16x8 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_8x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_8x8 sse2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + + add_proto qw/unsigned int vpx_highbd_sad_skip_4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + + add_proto qw/unsigned int vpx_highbd_sad_skip_4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + # # Avg # @@ -1084,6 +1194,43 @@ () add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad4x4x4d sse2 neon/; + add_proto qw/void vpx_highbd_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_64x64x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_64x32x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_32x64x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_32x32x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_32x16x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_16x32x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_16x16x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_16x8x4d sse2 avx2/; + + add_proto qw/void 
vpx_highbd_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_8x16x4d sse2/; + + add_proto qw/void vpx_highbd_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_8x8x4d sse2/; + + add_proto qw/void vpx_highbd_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + + add_proto qw/void vpx_highbd_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_4x8x4d sse2/; + + add_proto qw/void vpx_highbd_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + # # Structured Similarity (SSIM) # diff --git a/vpx_dsp/x86/highbd_sad4d_avx2.c b/vpx_dsp/x86/highbd_sad4d_avx2.c index 947b5e9772..e483fdce73 100644 --- a/vpx_dsp/x86/highbd_sad4d_avx2.c +++ b/vpx_dsp/x86/highbd_sad4d_avx2.c @@ -61,70 +61,79 @@ static VPX_FORCE_INLINE void highbd_sad64xHx4d(__m256i *sums_16 /*[4]*/, } } +static VPX_FORCE_INLINE void highbd_sad64xNx4d_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4], int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + __m256i sums_32[4]; + int i; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_32[0] = _mm256_setzero_si256(); + sums_32[1] = _mm256_setzero_si256(); + sums_32[2] = _mm256_setzero_si256(); + sums_32[3] = _mm256_setzero_si256(); + + for (i = 0; i < (n / 2); ++i) { + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2); + + /* sums_16 will outrange after 2 rows, so add current sums_16 to + * sums_32*/ + sums_32[0] = _mm256_add_epi32( + sums_32[0], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)))); + sums_32[1] = _mm256_add_epi32( + sums_32[1], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)))); + sums_32[2] = _mm256_add_epi32( + sums_32[2], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)))); + sums_32[3] = _mm256_add_epi32( + sums_32[3], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)))); + + src += src_stride << 1; + } + calc_final_4(sums_32, sad_array); +} + #define HIGHBD_SAD64XNX4D(n) \ - void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \ + void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src, int src_stride, \ const uint8_t *const ref_array[4], \ int ref_stride, uint32_t sad_array[4]) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *refs[4]; \ - __m256i sums_16[4]; \ - __m256i sums_32[4]; \ - int i; \ - 
\ - refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \ - refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \ - refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \ - refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \ - sums_32[0] = _mm256_setzero_si256(); \ - sums_32[1] = _mm256_setzero_si256(); \ - sums_32[2] = _mm256_setzero_si256(); \ - sums_32[3] = _mm256_setzero_si256(); \ - \ - for (i = 0; i < (n / 2); ++i) { \ - sums_16[0] = _mm256_setzero_si256(); \ - sums_16[1] = _mm256_setzero_si256(); \ - sums_16[2] = _mm256_setzero_si256(); \ - sums_16[3] = _mm256_setzero_si256(); \ - \ - highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2); \ - \ - /* sums_16 will outrange after 2 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32[0] = _mm256_add_epi32( \ - sums_32[0], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[0], 1)))); \ - sums_32[1] = _mm256_add_epi32( \ - sums_32[1], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[1], 1)))); \ - sums_32[2] = _mm256_add_epi32( \ - sums_32[2], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[2], 1)))); \ - sums_32[3] = _mm256_add_epi32( \ - sums_32[3], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[3], 1)))); \ - \ - src += src_stride << 1; \ - } \ - calc_final_4(sums_32, sad_array); \ + highbd_sad64xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ + n); \ } -// 64x64 -HIGHBD_SAD64XNX4D(64) - -// 64x32 -HIGHBD_SAD64XNX4D(32) +#define HIGHBD_SADSKIP64XNx4D(n) \ + void vpx_highbd_sad_skip_64x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad64xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, n / 2); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/, const uint16_t *src, @@ -171,73 +180,79 @@ static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/, } } +static VPX_FORCE_INLINE void highbd_sad32xNx4d_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4], int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + __m256i sums_32[4]; + int i; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_32[0] = _mm256_setzero_si256(); + sums_32[1] = _mm256_setzero_si256(); + sums_32[2] = _mm256_setzero_si256(); + sums_32[3] = _mm256_setzero_si256(); + + for (i = 0; i < (n / 8); ++i) { + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); + + /* sums_16 will outrange after 8 rows, so add current sums_16 to + * sums_32*/ + sums_32[0] = _mm256_add_epi32( + sums_32[0], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + 
_mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)))); + sums_32[1] = _mm256_add_epi32( + sums_32[1], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)))); + sums_32[2] = _mm256_add_epi32( + sums_32[2], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)))); + sums_32[3] = _mm256_add_epi32( + sums_32[3], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)))); + + src += src_stride << 3; + } + calc_final_4(sums_32, sad_array); +} + #define HIGHBD_SAD32XNX4D(n) \ - void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \ + void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src, int src_stride, \ const uint8_t *const ref_array[4], \ int ref_stride, uint32_t sad_array[4]) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *refs[4]; \ - __m256i sums_16[4]; \ - __m256i sums_32[4]; \ - int i; \ - \ - refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \ - refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \ - refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \ - refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \ - sums_32[0] = _mm256_setzero_si256(); \ - sums_32[1] = _mm256_setzero_si256(); \ - sums_32[2] = _mm256_setzero_si256(); \ - sums_32[3] = _mm256_setzero_si256(); \ - \ - for (i = 0; i < (n / 8); ++i) { \ - sums_16[0] = _mm256_setzero_si256(); \ - sums_16[1] = _mm256_setzero_si256(); \ - sums_16[2] = _mm256_setzero_si256(); \ - sums_16[3] = _mm256_setzero_si256(); \ - \ - highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); \ - \ - /* sums_16 will outrange after 8 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32[0] = _mm256_add_epi32( \ - sums_32[0], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[0], 1)))); \ - sums_32[1] = _mm256_add_epi32( \ - sums_32[1], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[1], 1)))); \ - sums_32[2] = _mm256_add_epi32( \ - sums_32[2], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[2], 1)))); \ - sums_32[3] = _mm256_add_epi32( \ - sums_32[3], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[3], 1)))); \ - \ - src += src_stride << 3; \ - } \ - calc_final_4(sums_32, sad_array); \ + highbd_sad32xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ + n); \ } -// 32x64 -HIGHBD_SAD32XNX4D(64) - -// 32x32 -HIGHBD_SAD32XNX4D(32) - -// 32x16 -HIGHBD_SAD32XNX4D(16) +#define HIGHBD_SADSKIP32XNx4D(n) \ + void vpx_highbd_sad_skip_32x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad32xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, n / 2); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/, const uint16_t *src, @@ -275,13 +290,15 @@ static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 
/*[4]*/, } } -void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], - int ref_stride, uint32_t sad_array[4]) { +static VPX_FORCE_INLINE void highbd_sad16xNx4d_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4], int n) { const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); uint16_t *refs[4]; __m256i sums_16[4]; __m256i sums_32[4]; + const int height = VPXMIN(16, n); + const int num_iters = n / height; int i; refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); @@ -293,13 +310,13 @@ void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride, sums_32[2] = _mm256_setzero_si256(); sums_32[3] = _mm256_setzero_si256(); - for (i = 0; i < 2; ++i) { + for (i = 0; i < num_iters; ++i) { sums_16[0] = _mm256_setzero_si256(); sums_16[1] = _mm256_setzero_si256(); sums_16[2] = _mm256_setzero_si256(); sums_16[3] = _mm256_setzero_si256(); - highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16); + highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, height); // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 sums_32[0] = _mm256_add_epi32( @@ -328,6 +345,26 @@ void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums_32, sad_array); } +#define HIGHBD_SAD16XNX4D(n) \ + void vpx_highbd_sad16x##n##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad16xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ + n); \ + } + +#define HIGHBD_SADSKIP16XNx4D(n) \ + void vpx_highbd_sad_skip_16x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad16xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, n / 2); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + void vpx_highbd_sad16x16x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]) { @@ -399,3 +436,27 @@ void vpx_highbd_sad16x8x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums_32, sad_array); } } + +// clang-format off +HIGHBD_SAD64XNX4D(64) +HIGHBD_SADSKIP64XNx4D(64) + +HIGHBD_SAD64XNX4D(32) +HIGHBD_SADSKIP64XNx4D(32) + +HIGHBD_SAD32XNX4D(64) +HIGHBD_SADSKIP32XNx4D(64) + +HIGHBD_SAD32XNX4D(32) +HIGHBD_SADSKIP32XNx4D(32) + +HIGHBD_SAD32XNX4D(16) +HIGHBD_SADSKIP32XNx4D(16) + +HIGHBD_SAD16XNX4D(32) +HIGHBD_SADSKIP16XNx4D(32) + +HIGHBD_SADSKIP16XNx4D(16) + +HIGHBD_SADSKIP16XNx4D(8) + // clang-format on diff --git a/vpx_dsp/x86/highbd_sad4d_sse2.asm b/vpx_dsp/x86/highbd_sad4d_sse2.asm index 6c2a61e019..a07892d811 100644 --- a/vpx_dsp/x86/highbd_sad4d_sse2.asm +++ b/vpx_dsp/x86/highbd_sad4d_sse2.asm @@ -213,7 +213,12 @@ SECTION .text ; uint8_t *ref[4], int ref_stride, ; uint32_t res[4]); ; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8 -%macro HIGH_SADNXN4D 2 +; Macro Arguments: +; 1: Width +; 2: Height +; 3: If 0, then normal sad, if 2, then skip every other row +%macro HIGH_SADNXN4D 2-3 0 +%if %3 == 0 ; normal sad %if UNIX64 cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ res, ref2, ref3, ref4 @@ -221,6 +226,15 @@ cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ ref2, ref3, ref4 %endif +%else ; 
%3 == 2, downsample +%if UNIX64 +cglobal highbd_sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal highbd_sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif ; +%endif ; sad/avg/skip ; set m1 push srcq @@ -229,6 +243,10 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ pshufd m1, m1, 0x0 pop srcq +%if %3 == 2 ; skip rows + lea src_strided, [2*src_strided] + lea ref_strided, [2*ref_strided] +%endif ; skip rows movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided mov ref2q, [ref1q+gprsize*1] @@ -244,9 +262,15 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ shl ref1q, 1 HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 -%rep (%2-4)/2 +%if %3 == 2 ; Downsampling by two +%define num_rep (%2-8)/4 +%else +%define num_rep (%2-4)/2 +%endif +%rep num_rep HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 %endrep +%undef rep HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 ; N.B. HIGH_PROCESS outputs dwords (32 bits) ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM @@ -265,6 +289,9 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ paddd m4, m0 paddd m6, m1 punpcklqdq m4, m6 +%if %3 == 2 ; skip rows + pslld m4, 1 +%endif movifnidn r4, r4mp movu [r4], m4 RET @@ -285,3 +312,15 @@ HIGH_SADNXN4D 8, 8 HIGH_SADNXN4D 8, 4 HIGH_SADNXN4D 4, 8 HIGH_SADNXN4D 4, 4 + +HIGH_SADNXN4D 64, 64, 2 +HIGH_SADNXN4D 64, 32, 2 +HIGH_SADNXN4D 32, 64, 2 +HIGH_SADNXN4D 32, 32, 2 +HIGH_SADNXN4D 32, 16, 2 +HIGH_SADNXN4D 16, 32, 2 +HIGH_SADNXN4D 16, 16, 2 +HIGH_SADNXN4D 16, 8, 2 +HIGH_SADNXN4D 8, 16, 2 +HIGH_SADNXN4D 8, 8, 2 +HIGH_SADNXN4D 4, 8, 2 diff --git a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c index 231b67f809..78f8eb8bfa 100644 --- a/vpx_dsp/x86/highbd_sad_avx2.c +++ b/vpx_dsp/x86/highbd_sad_avx2.c @@ -50,39 +50,49 @@ static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16, } } -#define HIGHBD_SAD64XN(n) \ - unsigned int vpx_highbd_sad64x##n##_avx2( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - __m256i sums_32 = _mm256_setzero_si256(); \ - int i; \ - \ - for (i = 0; i < (n / 2); ++i) { \ - __m256i sums_16 = _mm256_setzero_si256(); \ - \ - highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2); \ - \ - /* sums_16 will outrange after 2 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32 = _mm256_add_epi32( \ - sums_32, \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ - _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ - \ - src += src_stride << 1; \ - ref += ref_stride << 1; \ - } \ - return calc_final(sums_32); \ +static VPX_FORCE_INLINE unsigned int highbd_sad64xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_32 = _mm256_setzero_si256(); + int i; + + for (i = 0; i < (n / 2); ++i) { + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2); + + /* sums_16 will outrange after 2 rows, so add current sums_16 to + * sums_32*/ + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + 
_mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 1; + ref += ref_stride << 1; } + return calc_final(sums_32); +} -// 64x64 -HIGHBD_SAD64XN(64) +#define HIGHBD_SAD64XN(n) \ + unsigned int vpx_highbd_sad64x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad64xN_avx2(src, src_stride, ref, ref_stride, n); \ + } -// 64x32 -HIGHBD_SAD64XN(32) +#define HIGHBD_SADSKIP64xN(n) \ + unsigned int vpx_highbd_sad_skip_64x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad64xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16, const uint16_t *src, int src_stride, @@ -107,42 +117,49 @@ static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16, } } -#define HIGHBD_SAD32XN(n) \ - unsigned int vpx_highbd_sad32x##n##_avx2( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - __m256i sums_32 = _mm256_setzero_si256(); \ - int i; \ - \ - for (i = 0; i < (n / 8); ++i) { \ - __m256i sums_16 = _mm256_setzero_si256(); \ - \ - highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); \ - \ - /* sums_16 will outrange after 8 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32 = _mm256_add_epi32( \ - sums_32, \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ - _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ - \ - src += src_stride << 3; \ - ref += ref_stride << 3; \ - } \ - return calc_final(sums_32); \ - } +static VPX_FORCE_INLINE unsigned int highbd_sad32xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_32 = _mm256_setzero_si256(); + int i; -// 32x64 -HIGHBD_SAD32XN(64) + for (i = 0; i < (n / 8); ++i) { + __m256i sums_16 = _mm256_setzero_si256(); -// 32x32 -HIGHBD_SAD32XN(32) + highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); -// 32x16 -HIGHBD_SAD32XN(16) + /* sums_16 will outrange after 8 rows, so add current sums_16 to + * sums_32*/ + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 3; + ref += ref_stride << 3; + } + return calc_final(sums_32); +} + +#define HIGHBD_SAD32XN(n) \ + unsigned int vpx_highbd_sad32x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad32xN_avx2(src, src_stride, ref, ref_stride, n); \ + } + +#define HIGHBD_SADSKIP32xN(n) \ + unsigned int vpx_highbd_sad_skip_32x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad32xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, const uint16_t *src, int src_stride, @@ -167,17 +184,22 @@ static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, } } -unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride) { +static VPX_FORCE_INLINE unsigned 
int highbd_sad16xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); __m256i sums_32 = _mm256_setzero_si256(); + const int height = VPXMIN(16, n); + const int num_iters = n / height; int i; - for (i = 0; i < 2; ++i) { + for (i = 0; i < num_iters; ++i) { __m256i sums_16 = _mm256_setzero_si256(); - highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16); + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, height); // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 sums_32 = _mm256_add_epi32( @@ -192,6 +214,21 @@ unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride, return calc_final(sums_32); } +#define HIGHBD_SAD16XN(n) \ + unsigned int vpx_highbd_sad16x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad16xN_avx2(src, src_stride, ref, ref_stride, n); \ + } + +#define HIGHBD_SADSKIP16xN(n) \ + unsigned int vpx_highbd_sad_skip_16x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad16xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } + unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); @@ -224,6 +261,23 @@ unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride, } } +// clang-format off +HIGHBD_SAD64XN(64) +HIGHBD_SADSKIP64xN(64) +HIGHBD_SAD64XN(32) +HIGHBD_SADSKIP64xN(32) +HIGHBD_SAD32XN(64) +HIGHBD_SADSKIP32xN(64) +HIGHBD_SAD32XN(32) +HIGHBD_SADSKIP32xN(32) +HIGHBD_SAD32XN(16) +HIGHBD_SADSKIP32xN(16) +HIGHBD_SAD16XN(32) +HIGHBD_SADSKIP16xN(32) +HIGHBD_SADSKIP16xN(16) +HIGHBD_SADSKIP16xN(8) +//clang-format on + // AVG ------------------------------------------------------------------------- static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16, const uint16_t *src, diff --git a/vpx_dsp/x86/highbd_sad_sse2.asm b/vpx_dsp/x86/highbd_sad_sse2.asm index 6a1a6f3d62..62ad2237ff 100644 --- a/vpx_dsp/x86/highbd_sad_sse2.asm +++ b/vpx_dsp/x86/highbd_sad_sse2.asm @@ -12,6 +12,11 @@ SECTION .text +; Macro Arguments +; Arg 1: Width +; Arg 2: Height +; Arg 3: Number of general purpose registers +; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows %macro HIGH_SAD_FN 4 %if %4 == 0 %if %3 == 5 @@ -20,7 +25,7 @@ cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \ src_stride3, ref_stride3, n_rows %endif ; %3 == 5/7 -%else ; avg +%elif %4 == 1 ; avg %if %3 == 5 cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \ second_pred, n_rows @@ -35,7 +40,18 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \ %define n_rowsd dword r0m %endif ; x86-32/64 %endif ; %3 == 5/7 -%endif ; avg/sad +%else ; %4 == 2, skip rows +%if %3 == 5 +cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 +%endif ; sad/avg/skip +%if %4 == 2 ; double the stride if we are skipping rows + lea src_strided, [src_strided*2] + lea ref_strided, [ref_strided*2] +%endif movsxdifnidn src_strideq, src_strided 
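; For the skip variants the doubled strides above combine with a halved
; row count in each size macro below (e.g. mov n_rowsd, %1/2 for the
; 64-wide case) and a final pslld m0, 1, which doubles the accumulated
; SAD back to the full-height scale.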
movsxdifnidn ref_strideq, ref_strided %if %3 == 7 @@ -54,7 +70,11 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \ ; uint8_t *ref, int ref_stride); %macro HIGH_SAD64XN 1-2 0 HIGH_SAD_FN 64, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/2 +%else mov n_rowsd, %1 +%endif pxor m0, m0 pxor m6, m6 @@ -146,6 +166,9 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \ punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -155,13 +178,19 @@ HIGH_SAD64XN 64 ; highbd_sad64x64_sse2 HIGH_SAD64XN 32 ; highbd_sad64x32_sse2 HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2 HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2 +HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2 +HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2 ; unsigned int vpx_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro HIGH_SAD32XN 1-2 0 HIGH_SAD_FN 32, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/2 +%else mov n_rowsd, %1 +%endif pxor m0, m0 pxor m6, m6 @@ -213,6 +242,9 @@ HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2 punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -224,12 +256,19 @@ HIGH_SAD32XN 16 ; highbd_sad32x16_sse2 HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2 HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2 HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2 +HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2 +HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2 +HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2 ; unsigned int vpx_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro HIGH_SAD16XN 1-2 0 HIGH_SAD_FN 16, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/4 +%else mov n_rowsd, %1/2 +%endif pxor m0, m0 pxor m6, m6 @@ -281,6 +320,9 @@ HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2 punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -292,13 +334,19 @@ HIGH_SAD16XN 8 ; highbd_sad16x8_sse2 HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2 HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2 HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 - +HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2 +HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2 +HIGH_SAD16XN 8, 2 ; highbd_sad_skip_16x8_sse2 ; unsigned int vpx_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro HIGH_SAD8XN 1-2 0 HIGH_SAD_FN 8, %1, 7, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 pxor m6, m6 @@ -350,6 +398,9 @@ HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -361,3 +412,5 @@ HIGH_SAD8XN 4 ; highbd_sad8x4_sse2 HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2 HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2 HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2 +HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2 +HIGH_SAD8XN 8, 2 ; highbd_sad_skip_8x8_sse2 diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c index 399b67b3fb..c87fd3cd27 100644 --- a/vpx_dsp/x86/sad4d_avx2.c +++ 
b/vpx_dsp/x86/sad4d_avx2.c @@ -25,9 +25,10 @@ static INLINE void calc_final_4(const __m256i *const sums /*[4]*/, _mm_storeu_si128((__m128i *)sad_array, sum); } -void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { +static INLINE void sad32xhx4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, int h, + uint32_t sad_array[4]) { int i; const uint8_t *refs[4]; __m256i sums[4]; @@ -41,7 +42,7 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, sums[2] = _mm256_setzero_si256(); sums[3] = _mm256_setzero_si256(); - for (i = 0; i < 32; i++) { + for (i = 0; i < h; i++) { __m256i r[4]; // load src and all ref[] @@ -73,9 +74,10 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums, sad_array); } -void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { +static INLINE void sad64xhx4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, int h, + uint32_t sad_array[4]) { __m256i sums[4]; int i; const uint8_t *refs[4]; @@ -89,7 +91,7 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, sums[2] = _mm256_setzero_si256(); sums[3] = _mm256_setzero_si256(); - for (i = 0; i < 64; i++) { + for (i = 0; i < h; i++) { __m256i r_lo[4], r_hi[4]; // load 64 bytes from src and all ref[] const __m256i s_lo = _mm256_load_si256((const __m256i *)src_ptr); @@ -132,3 +134,51 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums, sad_array); } + +#define SAD64_H(h) \ + void vpx_sad64x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], int ref_stride, \ + uint32_t res[4]) { \ + sad64xhx4d_avx2(src, src_stride, ref, ref_stride, h, res); \ + } + +#define SAD32_H(h) \ + void vpx_sad32x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], int ref_stride, \ + uint32_t res[4]) { \ + sad32xhx4d_avx2(src, src_stride, ref, ref_stride, h, res); \ + } + +SAD64_H(64) +SAD32_H(32) + +#define SADS64_H(h) \ + void vpx_sad_skip_64x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], \ + int ref_stride, uint32_t res[4]) { \ + sad64xhx4d_avx2(src, 2 * src_stride, ref, 2 * ref_stride, ((h) >> 1), \ + res); \ + res[0] <<= 1; \ + res[1] <<= 1; \ + res[2] <<= 1; \ + res[3] <<= 1; \ + } + +#define SADS32_H(h) \ + void vpx_sad_skip_32x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], \ + int ref_stride, uint32_t res[4]) { \ + sad32xhx4d_avx2(src, 2 * src_stride, ref, 2 * ref_stride, ((h) >> 1), \ + res); \ + res[0] <<= 1; \ + res[1] <<= 1; \ + res[2] <<= 1; \ + res[3] <<= 1; \ + } + +SADS64_H(64) +SADS64_H(32) + +SADS32_H(64) +SADS32_H(32) +SADS32_H(16) diff --git a/vpx_dsp/x86/sad4d_sse2.asm b/vpx_dsp/x86/sad4d_sse2.asm index 3f6e55ce9a..ed4ea3ef9b 100644 --- a/vpx_dsp/x86/sad4d_sse2.asm +++ b/vpx_dsp/x86/sad4d_sse2.asm @@ -179,13 +179,27 @@ SECTION .text ; uint8_t *ref[4], int ref_stride, ; uint32_t res[4]); ; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4 -%macro SADNXN4D 2 +%macro SADNXN4D 2-3 0 +%if %3 == 1 ; skip rows +%if UNIX64 +cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif 
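+; skip variant: the strides are doubled just after this prologue, the
+; %rep count drops from (%2-4)/2 to (%2-8)/4, and the four results are
+; doubled with pslld before being stored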
+%else ; normal sad %if UNIX64 cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ res, ref2, ref3, ref4 %else cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ ref2, ref3, ref4 +%endif +%endif +%if %3 == 1 + lea src_strided, [2*src_strided] + lea ref_strided, [2*ref_strided] %endif movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided @@ -195,9 +209,15 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ mov ref1q, [ref1q+gprsize*0] PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 -%rep (%2-4)/2 +%if %3 == 1 ; downsample number of rows by 2 +%define num_rep (%2-8)/4 +%else +%define num_rep (%2-4)/2 +%endif +%rep num_rep PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 %endrep +%undef num_rep PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 %if %1 > 4 @@ -211,12 +231,19 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ punpckhqdq m5, m7 movifnidn r4, r4mp paddd m4, m5 +%if %3 == 1 + pslld m4, 1 +%endif movu [r4], m4 RET %else movifnidn r4, r4mp pshufd m6, m6, 0x08 pshufd m7, m7, 0x08 +%if %3 == 1 + pslld m6, 1 + pslld m7, 1 +%endif movq [r4+0], m6 movq [r4+8], m7 RET @@ -237,3 +264,15 @@ SADNXN4D 8, 8 SADNXN4D 8, 4 SADNXN4D 4, 8 SADNXN4D 4, 4 + +SADNXN4D 64, 64, 1 +SADNXN4D 64, 32, 1 +SADNXN4D 32, 64, 1 +SADNXN4D 32, 32, 1 +SADNXN4D 32, 16, 1 +SADNXN4D 16, 32, 1 +SADNXN4D 16, 16, 1 +SADNXN4D 16, 8, 1 +SADNXN4D 8, 16, 1 +SADNXN4D 8, 8, 1 +SADNXN4D 4, 8, 1 diff --git a/vpx_dsp/x86/sad_avx2.c b/vpx_dsp/x86/sad_avx2.c index 29bedb0e6e..e00494d766 100644 --- a/vpx_dsp/x86/sad_avx2.c +++ b/vpx_dsp/x86/sad_avx2.c @@ -11,73 +11,104 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" +static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + int i, res; + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; + __m256i sum_sad = _mm256_setzero_si256(); + __m256i sum_sad_h; + __m128i sum_sad128; + for (i = 0; i < h; i++) { + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); + sad1_reg = + _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); + sad2_reg = _mm256_sad_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); + sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); + ref_ptr += ref_stride; + src_ptr += src_stride; + } + sum_sad_h = _mm256_srli_si256(sum_sad, 8); + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); + res = _mm_cvtsi128_si32(sum_sad128); + return res; +} + +static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + int i, res; + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; + __m256i sum_sad = _mm256_setzero_si256(); + __m256i sum_sad_h; + __m128i sum_sad128; + const int ref2_stride = ref_stride << 1; + const int src2_stride = src_stride << 1; + const int max = h >> 1; + for (i = 0; i < max; i++) { + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); + sad1_reg = + _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); + sad2_reg = _mm256_sad_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); + sum_sad = _mm256_add_epi32(sum_sad, 
_mm256_add_epi32(sad1_reg, sad2_reg)); + ref_ptr += ref2_stride; + src_ptr += src2_stride; + } + sum_sad_h = _mm256_srli_si256(sum_sad, 8); + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); + res = _mm_cvtsi128_si32(sum_sad128); + return res; +} + #define FSAD64_H(h) \ unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ - int i; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - for (i = 0; i < h; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ - sad1_reg = _mm256_sad_epu8( \ - ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8( \ - ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ - sum_sad = \ - _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr += ref_stride; \ - src_ptr += src_stride; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ + return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \ + } + +#define FSADS64_H(h) \ + unsigned int vpx_sad_skip_64x##h##_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \ + h / 2); \ } #define FSAD32_H(h) \ unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ - int i, res; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - int ref2_stride = ref_stride << 1; \ - int src2_stride = src_stride << 1; \ - int max = h >> 1; \ - for (i = 0; i < max; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ - sad1_reg = _mm256_sad_epu8( \ - ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8( \ - ref2_reg, \ - _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ - sum_sad = \ - _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr += ref2_stride; \ - src_ptr += src2_stride; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ + return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \ + } + +#define FSADS32_H(h) \ + unsigned int vpx_sad_skip_32x##h##_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \ + h / 2); \ } -#define FSAD64 \ - FSAD64_H(64) \ - FSAD64_H(32) +#define FSAD64 \ + FSAD64_H(64) \ + FSAD64_H(32) \ + FSADS64_H(64) \ + FSADS64_H(32) -#define FSAD32 \ - FSAD32_H(64) \ - FSAD32_H(32) \ - FSAD32_H(16) +#define FSAD32 \ + 
FSAD32_H(64) \ + FSAD32_H(32) \ + FSAD32_H(16) \ + FSADS32_H(64) \ + FSADS32_H(32) \ + FSADS32_H(16) FSAD64 FSAD32 @@ -86,6 +117,8 @@ FSAD32 #undef FSAD32 #undef FSAD64_H #undef FSAD32_H +#undef FSADS64_H +#undef FSADS32_H #define FSADAVG64_H(h) \ unsigned int vpx_sad64x##h##_avg_avx2( \ diff --git a/vpx_dsp/x86/sad_sse2.asm b/vpx_dsp/x86/sad_sse2.asm index e4e1bc3e98..627e463bf8 100644 --- a/vpx_dsp/x86/sad_sse2.asm +++ b/vpx_dsp/x86/sad_sse2.asm @@ -12,15 +12,29 @@ SECTION .text +; Macro Arguments +; Arg 1: Width +; Arg 2: Height +; Arg 3: Number of general purpose registers +; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows %macro SAD_FN 4 -%if %4 == 0 +%if %4 == 0 ; normal sad %if %3 == 5 cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows %else ; %3 == 7 cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ src_stride3, ref_stride3, n_rows %endif ; %3 == 5/7 -%else ; avg + +%elif %4 == 2 ; skip +%if %3 == 5 +cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 + +%else %if %3 == 5 cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \ second_pred, n_rows @@ -35,7 +49,11 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \ %define n_rowsd dword r0m %endif ; x86-32/64 %endif ; %3 == 5/7 -%endif ; avg/sad +%endif ; sad/avg/skip +%if %4 == 2; skip rows so double the stride +lea src_strided, [src_strided*2] +lea ref_strided, [ref_strided*2] +%endif ; %4 skip movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided %if %3 == 7 @@ -48,7 +66,11 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \ ; uint8_t *ref, int ref_stride); %macro SAD64XN 1-2 0 SAD_FN 64, %1, 5, %2 +%if %2 == 2 + mov n_rowsd, %1/2 +%else mov n_rowsd, %1 +%endif pxor m0, m0 .loop: movu m1, [refq] @@ -77,6 +99,9 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \ movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -86,12 +111,18 @@ SAD64XN 64 ; sad64x64_sse2 SAD64XN 32 ; sad64x32_sse2 SAD64XN 64, 1 ; sad64x64_avg_sse2 SAD64XN 32, 1 ; sad64x32_avg_sse2 +SAD64XN 64, 2 ; sad64x64_skip_sse2 +SAD64XN 32, 2 ; sad64x32_skip_sse2 ; unsigned int vpx_sad32x32_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD32XN 1-2 0 SAD_FN 32, %1, 5, %2 +%if %2 == 2 + mov n_rowsd, %1/4 +%else mov n_rowsd, %1/2 +%endif pxor m0, m0 .loop: movu m1, [refq] @@ -120,6 +151,9 @@ SAD64XN 32, 1 ; sad64x32_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -131,12 +165,19 @@ SAD32XN 16 ; sad32x16_sse2 SAD32XN 64, 1 ; sad32x64_avg_sse2 SAD32XN 32, 1 ; sad32x32_avg_sse2 SAD32XN 16, 1 ; sad32x16_avg_sse2 +SAD32XN 64, 2 ; sad32x64_skip_sse2 +SAD32XN 32, 2 ; sad32x32_skip_sse2 +SAD32XN 16, 2 ; sad32x16_skip_sse2 ; unsigned int vpx_sad16x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD16XN 1-2 0 SAD_FN 16, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 .loop: @@ -166,6 +207,9 @@ SAD32XN 16, 1 ; sad32x16_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -177,12 
+221,19 @@ SAD16XN 8 ; sad16x8_sse2 SAD16XN 32, 1 ; sad16x32_avg_sse2 SAD16XN 16, 1 ; sad16x16_avg_sse2 SAD16XN 8, 1 ; sad16x8_avg_sse2 +SAD16XN 32, 2 ; sad16x32_skip_sse2 +SAD16XN 16, 2 ; sad16x16_skip_sse2 +SAD16XN 8, 2 ; sad16x8_skip_sse2 ; unsigned int vpx_sad8x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD8XN 1-2 0 SAD_FN 8, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 .loop: @@ -210,6 +261,9 @@ SAD16XN 8, 1 ; sad16x8_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -221,12 +275,18 @@ SAD8XN 4 ; sad8x4_sse2 SAD8XN 16, 1 ; sad8x16_avg_sse2 SAD8XN 8, 1 ; sad8x8_avg_sse2 SAD8XN 4, 1 ; sad8x4_avg_sse2 +SAD8XN 16, 2 ; sad8x16_skip_sse2 +SAD8XN 8, 2 ; sad8x8_skip_sse2 ; unsigned int vpx_sad4x{4, 8}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD4XN 1-2 0 SAD_FN 4, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 .loop: @@ -257,6 +317,9 @@ SAD8XN 4, 1 ; sad8x4_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -266,3 +329,4 @@ SAD4XN 8 ; sad4x8_sse SAD4XN 4 ; sad4x4_sse SAD4XN 8, 1 ; sad4x8_avg_sse SAD4XN 4, 1 ; sad4x4_avg_sse +SAD4XN 8, 2 ; sad4x8_skip_sse From e3c458149cd46eefb601ec684deb0352b52b77a1 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 11 Apr 2023 19:16:28 -0700 Subject: [PATCH 653/926] vp9_mbgraph: clear -Wshadow warnings Bug: webm:1793 Change-Id: Ibffb62775f09922d37f7d0460aa2751e74c36738 --- vp9/encoder/vp9_mbgraph.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c index 7c2790cb98..9487fc5fae 100644 --- a/vp9/encoder/vp9_mbgraph.c +++ b/vp9/encoder/vp9_mbgraph.c @@ -98,8 +98,7 @@ static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv, // If the current best reference mv is not centered on 0,0 then do a 0,0 // based search as well. 
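// Illustrative only (not from the patch): -Wshadow fires when an inner
// declaration hides an outer name, e.g.
//   unsigned int tmp_err;        // outer declaration
//   if (...) {
//     unsigned int tmp_err;      // shadows the outer tmp_err
//   }
// The change below drops the inner tmp_err/tmp_mv declarations and
// reuses the outer variables instead.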
if (ref_mv->row != 0 || ref_mv->col != 0) { - unsigned int tmp_err; - MV zero_ref_mv = { 0, 0 }, tmp_mv; + MV zero_ref_mv = { 0, 0 }; tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv, mb_row, mb_col); From bde26b99611b5534ad4a67990817882e74a723fe Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 11 Apr 2023 19:23:27 -0700 Subject: [PATCH 654/926] vp9_ratectrl: clear -Wshadow warnings Bug: webm:1793 Change-Id: I2476a9d8e1d62414fdbe6feee87d5167058f499b --- vp9/encoder/vp9_ratectrl.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index d9207f7a2f..9e152629fb 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -1150,8 +1150,9 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, if (frame_is_intra_only(cm)) { if (oxcf->rc_mode == VPX_Q) { int qindex = cq_level; - double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); - int delta_qindex = vp9_compute_qdelta(rc, q, q * 0.25, cm->bit_depth); + double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth); + int delta_qindex = + vp9_compute_qdelta(rc, qstart, qstart * 0.25, cm->bit_depth); active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); } else if (rc->this_key_frame_forced) { // Handle the special case for key frames forced when we have reached @@ -1206,12 +1207,14 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, } else if (oxcf->rc_mode == VPX_Q) { int qindex = cq_level; - double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); + double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth); int delta_qindex; if (cpi->refresh_alt_ref_frame) - delta_qindex = vp9_compute_qdelta(rc, q, q * 0.40, cm->bit_depth); + delta_qindex = + vp9_compute_qdelta(rc, qstart, qstart * 0.40, cm->bit_depth); else - delta_qindex = vp9_compute_qdelta(rc, q, q * 0.50, cm->bit_depth); + delta_qindex = + vp9_compute_qdelta(rc, qstart, qstart * 0.50, cm->bit_depth); active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); } else { active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); @@ -1219,11 +1222,12 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, } else { if (oxcf->rc_mode == VPX_Q) { int qindex = cq_level; - double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); + double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth); double delta_rate[FIXED_GF_INTERVAL] = { 0.50, 1.0, 0.85, 1.0, 0.70, 1.0, 0.85, 1.0 }; int delta_qindex = vp9_compute_qdelta( - rc, q, q * delta_rate[cm->current_video_frame % FIXED_GF_INTERVAL], + rc, qstart, + qstart * delta_rate[cm->current_video_frame % FIXED_GF_INTERVAL], cm->bit_depth); active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); } else { @@ -1859,8 +1863,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { rc->avg_frame_qindex[KEY_FRAME] = ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2); if (cpi->use_svc) { - int i = 0; - SVC *svc = &cpi->svc; + int i; for (i = 0; i < svc->number_temporal_layers; ++i) { const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers); From f254e6da84d564a5fbd2da7e0e1b31d81ba1dfba Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 11 Apr 2023 19:27:03 -0700 Subject: [PATCH 655/926] vp9_speed_features: clear -Wshadow warning Bug: webm:1793 Change-Id: I9f509c4461631e358f80b98afbb745ce88e9d7a2 --- vp9/encoder/vp9_speed_features.c | 1 - 1 file changed, 1 
deletion(-) diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 0522d4ec97..034673b491 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -396,7 +396,6 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, } if (speed >= 5) { - int i; sf->optimize_coefficients = 0; sf->mv.search_method = HEX; sf->disable_filter_search_var_thresh = 500; From aaffc6e306a2b5d6aeb0b673677a3b1bc1a6ff6d Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 12 Apr 2023 13:31:35 -0700 Subject: [PATCH 656/926] vp9_pickmode: clear -Wshadow warnings Bug: webm:1793 Change-Id: I26c063818144d11c4c91165c3fcbf6f258453cc7 --- vp9/encoder/vp9_pickmode.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index c19d57d15d..fa88cd79da 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -566,23 +566,26 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, // Transform skipping test in UV planes. for (i = 1; i <= 2; i++) { - struct macroblock_plane *const p = &x->plane[i]; - struct macroblockd_plane *const pd = &xd->plane[i]; - const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd); + struct macroblock_plane *const p_uv = &x->plane[i]; + struct macroblockd_plane *const pd_uv = &xd->plane[i]; + const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd_uv); const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size]; - const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd); + const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd_uv); const int uv_bw = b_width_log2_lookup[uv_bsize]; const int uv_bh = b_height_log2_lookup[uv_bsize]; const int sf = (uv_bw - b_width_log2_lookup[unit_size]) + (uv_bh - b_height_log2_lookup[unit_size]); - const uint32_t uv_dc_thr = pd->dequant[0] * pd->dequant[0] >> (6 - sf); - const uint32_t uv_ac_thr = pd->dequant[1] * pd->dequant[1] >> (6 - sf); + const uint32_t uv_dc_thr = + pd_uv->dequant[0] * pd_uv->dequant[0] >> (6 - sf); + const uint32_t uv_ac_thr = + pd_uv->dequant[1] * pd_uv->dequant[1] >> (6 - sf); int j = i - 1; vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i); flag_preduv_computed[i - 1] = 1; - var_uv[j] = cpi->fn_ptr[uv_bsize].vf( - p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse_uv[j]); + var_uv[j] = cpi->fn_ptr[uv_bsize].vf(p_uv->src.buf, p_uv->src.stride, + pd_uv->dst.buf, pd_uv->dst.stride, + &sse_uv[j]); if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) && (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j])) @@ -1933,15 +1936,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (cpi->use_svc && svc->force_zero_mode_spatial_ref && svc->spatial_layer_id > 0 && !gf_temporal_ref) { if (cpi->ref_frame_flags & VP9_LAST_FLAG) { - struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf; - if (vp9_is_scaled(sf)) { + struct scale_factors *const ref_sf = &cm->frame_refs[LAST_FRAME - 1].sf; + if (vp9_is_scaled(ref_sf)) { svc_force_zero_mode[LAST_FRAME - 1] = 1; inter_layer_ref = LAST_FRAME; } } if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { - struct scale_factors *const sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf; - if (vp9_is_scaled(sf)) { + struct scale_factors *const ref_sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf; + if (vp9_is_scaled(ref_sf)) { svc_force_zero_mode[GOLDEN_FRAME - 1] = 1; inter_layer_ref = GOLDEN_FRAME; } @@ -2772,9 +2775,10 @@ void 
vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && (yv12 != NULL)) { int_mv *const candidates = mbmi_ext->ref_mvs[ref_frame]; - const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; - vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, - sf); + const struct scale_factors *const ref_sf = + &cm->frame_refs[ref_frame - 1].sf; + vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, ref_sf, + ref_sf); vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame, candidates, mi_row, mi_col, mbmi_ext->mode_context); From 2513f6d5f4c9af6e3d715acf83c0e25d1560398e Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 12 Apr 2023 13:41:46 -0700 Subject: [PATCH 657/926] vp9_svc_layercontext: clear -Wshadow warnings Bug: webm:1793 Change-Id: I63669de9835713ec70dafa88ca8f2c2459e59698 --- vp9/encoder/vp9_svc_layercontext.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 83b6e5c99d..f08d668203 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -107,7 +107,6 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { int layer = LAYER_IDS_TO_IDX(sl, tl, oxcf->ts_number_layers); LAYER_CONTEXT *const lc = &svc->layer_context[layer]; RATE_CONTROL *const lrc = &lc->rc; - int i; lc->current_video_frame_in_layer = 0; lc->layer_size = 0; lc->frames_from_key_frame = 0; @@ -799,9 +798,9 @@ int vp9_one_pass_svc_start_layer(VP9_COMP *const cpi) { for (sl = svc->number_spatial_layers - 1; sl >= svc->first_spatial_layer_to_encode; sl--) { int layer = sl * svc->number_temporal_layers + svc->temporal_layer_id; - LAYER_CONTEXT *const lc = &svc->layer_context[layer]; - cpi->rc = lc->rc; - cpi->oxcf.target_bandwidth = lc->target_bandwidth; + LAYER_CONTEXT *const sl_lc = &svc->layer_context[layer]; + cpi->rc = sl_lc->rc; + cpi->oxcf.target_bandwidth = sl_lc->target_bandwidth; if (vp9_test_drop(cpi)) { int sl2; // Set flag to force drop in encoding for this mode. @@ -1050,17 +1049,17 @@ void vp9_svc_check_reset_layer_rc_flag(VP9_COMP *const cpi) { int sl, tl; for (sl = 0; sl < svc->number_spatial_layers; ++sl) { // Check for reset based on avg_frame_bandwidth for spatial layer sl. - int layer = LAYER_IDS_TO_IDX(sl, svc->number_temporal_layers - 1, - svc->number_temporal_layers); - LAYER_CONTEXT *lc = &svc->layer_context[layer]; + const int spatial_layer_idx = LAYER_IDS_TO_IDX( + sl, svc->number_temporal_layers - 1, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[spatial_layer_idx]; RATE_CONTROL *lrc = &lc->rc; if (lrc->avg_frame_bandwidth > (3 * lrc->last_avg_frame_bandwidth >> 1) || lrc->avg_frame_bandwidth < (lrc->last_avg_frame_bandwidth >> 1)) { // Reset for all temporal layers with spatial layer sl. 
for (tl = 0; tl < svc->number_temporal_layers; ++tl) { - int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); - LAYER_CONTEXT *lc = &svc->layer_context[layer]; - RATE_CONTROL *lrc = &lc->rc; + int temporal_layer_idx = + LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + lrc = &svc->layer_context[temporal_layer_idx].rc; lrc->rc_1_frame = 0; lrc->rc_2_frame = 0; lrc->bits_off_target = lrc->optimal_buffer_level; From 39a6b6c1364cf9c03ce947f9a98b40b63aef28ca Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 12 Apr 2023 13:46:26 -0700 Subject: [PATCH 658/926] vp9_temporal_filter: clear -Wshadow warnings Bug: webm:1793 Change-Id: Ia681ce636ae99f95b875ee1b0189bc6fa66a7608 --- vp9/encoder/vp9_temporal_filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index 8af30c42aa..986553a4a8 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -450,8 +450,6 @@ void vp9_highbd_apply_temporal_filter_c( // Apply the filter to luma for (row = 0; row < (int)block_height; row++) { for (col = 0; col < (int)block_width; col++) { - const int uv_row = row >> ss_y; - const int uv_col = col >> ss_x; const int filter_weight = get_filter_weight( row, col, block_height, block_width, blk_fw, use_32x32); @@ -476,6 +474,8 @@ void vp9_highbd_apply_temporal_filter_c( // Sum the corresponding uv pixels to the current y modifier // Note we are rounding down instead of rounding to the nearest pixel. + uv_row = row >> ss_y; + uv_col = col >> ss_x; y_mod += u_diff_sse[uv_row * uv_diff_stride + uv_col]; y_mod += v_diff_sse[uv_row * uv_diff_stride + uv_col]; From ff4123215df48bfb6e90eac1691dc70611446dfb Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 12 Apr 2023 13:54:02 -0700 Subject: [PATCH 659/926] vp9_frame_scale_ssse3: clear -Wshadow warnings Bug: webm:1793 Change-Id: I85608ac7bb6d3a61649ba342c13c3bf6a39a5dea --- vp9/encoder/x86/vp9_frame_scale_ssse3.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/vp9/encoder/x86/vp9_frame_scale_ssse3.c index bf0e8b121f..94506aad0f 100644 --- a/vp9/encoder/x86/vp9_frame_scale_ssse3.c +++ b/vp9/encoder/x86/vp9_frame_scale_ssse3.c @@ -469,18 +469,18 @@ static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, // It's used to choose the src offset and filter coefficient offset. const int offset_idx1 = (offset1_q4 >> 4) & 1; const int offset_idx2 = (offset2_q4 >> 4) & 1; - static const shuffle_filter_funcs shuffle_filter_funcs[2] = { + static const shuffle_filter_funcs kShuffleFilterFuncs[2] = { shuffle_filter_ssse3, shuffle_filter_odd_ssse3 }; - static const convolve8_funcs convolve8_funcs[2] = { + static const convolve8_funcs kConvolve8Funcs[2] = { convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3 }; assert(w && h); shuffle_filter_ssse3(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK], f0); - shuffle_filter_funcs[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1); - shuffle_filter_funcs[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2); + kShuffleFilterFuncs[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1); + kShuffleFilterFuncs[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2); // Sub 64 to avoid overflow. // Coef 128 would be treated as -128 in PMADDUBSW. Sub 64 here. 
@@ -522,11 +522,11 @@ static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, // 04 14 24 34 44 54 64 74 // 05 15 25 35 45 55 65 75 d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); - d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1); - d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2); + d[1] = kConvolve8Funcs[offset_idx1](&s[offset1_q4 >> 5], f1); + d[2] = kConvolve8Funcs[offset_idx2](&s[offset2_q4 >> 5], f2); d[3] = convolve8_8_even_offset_ssse3(&s[2], f0); - d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); - d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); + d[4] = kConvolve8Funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); + d[5] = kConvolve8Funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72 // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73 @@ -598,11 +598,11 @@ static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, loadu_8bit_16x4(t, stride_hor, &s[4]); d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); - d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1); - d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2); + d[1] = kConvolve8Funcs[offset_idx1](&s[offset1_q4 >> 5], f1); + d[2] = kConvolve8Funcs[offset_idx2](&s[offset2_q4 >> 5], f2); d[3] = convolve8_8_even_offset_ssse3(&s[2], f0); - d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); - d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); + d[4] = kConvolve8Funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); + d[5] = kConvolve8Funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37 From 698eb779f27d93b0c577009358cc94b1d4770b8e Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 12 Apr 2023 13:56:39 -0700 Subject: [PATCH 660/926] convolve_test: clear -Wshadow warning Bug: webm:1793 Change-Id: I22db73cb756c6c680b73684caef1e08bb6e729d8 --- test/convolve_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/convolve_test.cc b/test/convolve_test.cc index d569048691..5a17d80894 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -244,7 +244,7 @@ void highbd_filter_block2d_8_c(const uint16_t *src_ptr, // Vertical pass (transposed intermediate -> dst). 
  {
-    uint16_t *src_ptr = intermediate_buffer;
+    src_ptr = intermediate_buffer;
     const int dst_next_row_stride = dst_stride - output_width;
     unsigned int i, j;
     for (i = 0; i < output_height; ++i) {

From 968960c7b346c05c4a41bffc4f48952c4ef127b8 Mon Sep 17 00:00:00 2001
From: James Zern
Date: Wed, 12 Apr 2023 14:00:11 -0700
Subject: [PATCH 661/926] dct_test: clear -Wshadow warnings

Bug: webm:1793
Change-Id: I571a9d641b2f7f4b9d7c473ca815d4ea10b9f9af
---
 test/dct_test.cc | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/test/dct_test.cc b/test/dct_test.cc
index 9a150a24f1..235c407237 100644
--- a/test/dct_test.cc
+++ b/test/dct_test.cc
@@ -358,14 +358,6 @@ class TransTestBase : public ::testing::TestWithParam<DctParam> {
     ASSERT_TRUE(in.Init());
     Buffer<tran_low_t> coeff = Buffer<tran_low_t>(size_, size_, 0, 16);
     ASSERT_TRUE(coeff.Init());
-    Buffer<uint8_t> dst = Buffer<uint8_t>(size_, size_, 0, 16);
-    ASSERT_TRUE(dst.Init());
-    Buffer<uint8_t> src = Buffer<uint8_t>(size_, size_, 0);
-    ASSERT_TRUE(src.Init());
-    Buffer<uint16_t> dst16 = Buffer<uint16_t>(size_, size_, 0, 16);
-    ASSERT_TRUE(dst16.Init());
-    Buffer<uint16_t> src16 = Buffer<uint16_t>(size_, size_, 0);
-    ASSERT_TRUE(src16.Init());

     for (int i = 0; i < count_test_block; ++i) {
       InitMem();

From a3eb39ab6f22a09d06591db050bb07ede95fcd88 Mon Sep 17 00:00:00 2001
From: James Zern
Date: Wed, 12 Apr 2023 14:02:15 -0700
Subject: [PATCH 662/926] svc_encodeframe: clear -Wshadow warnings

Bug: webm:1793
Change-Id: Ib65a2dff124034d8e653572f8ada65984e55ed70
---
 examples/svc_encodeframe.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/svc_encodeframe.c b/examples/svc_encodeframe.c
index c2b3ec9798..1dd731765c 100644
--- a/examples/svc_encodeframe.c
+++ b/examples/svc_encodeframe.c
@@ -381,7 +381,7 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
                              vpx_codec_iface_t *iface,
                              vpx_codec_enc_cfg_t *enc_cfg) {
   vpx_codec_err_t res;
-  int i, sl, tl;
+  int sl, tl;
   SvcInternal_t *const si = get_svc_internal(svc_ctx);
   if (svc_ctx == NULL || codec_ctx == NULL || iface == NULL ||
       enc_cfg == NULL) {
@@ -433,7 +433,7 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
   }
   for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) {
     for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
-      i = sl * svc_ctx->temporal_layers + tl;
+      const int i = sl * svc_ctx->temporal_layers + tl;
       si->svc_params.max_quantizers[i] = MAX_QUANTIZER;
       si->svc_params.min_quantizers[i] = 0;
       if (enc_cfg->rc_end_usage == VPX_CBR &&
@@ -503,7 +503,7 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,

   for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) {
     for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
-      i = sl * svc_ctx->temporal_layers + tl;
+      const int i = sl * svc_ctx->temporal_layers + tl;
       if (enc_cfg->rc_end_usage == VPX_CBR &&
           enc_cfg->g_pass == VPX_RC_ONE_PASS) {
         si->svc_params.max_quantizers[i] = enc_cfg->rc_max_quantizer;

From 556e4f6cadef38727ab2e83050915a5eee6584a4 Mon Sep 17 00:00:00 2001
From: James Zern
Date: Wed, 12 Apr 2023 14:52:33 -0700
Subject: [PATCH 663/926] vpxdec: clear -Wshadow warnings

Bug: webm:1793
Change-Id: I0b7f013682229cde50df7c62db9dab6eab0fd341
---
 vpxdec.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/vpxdec.c b/vpxdec.c
index 84cef7dfd4..54a41f0799 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -996,7 +996,7 @@ static int main_loop(int argc, const char **argv_) {

     if (single_file) {
       if (use_y4m) {
-        char buf[Y4M_BUFFER_SIZE] = { 0 };
+        char y4m_buf[Y4M_BUFFER_SIZE] = { 0 };
         size_t len = 0;
         if (img->fmt == VPX_IMG_FMT_I440 ||
img->fmt == VPX_IMG_FMT_I44016) { fprintf(stderr, "Cannot produce y4m output for 440 sampling.\n"); @@ -1005,21 +1005,22 @@ static int main_loop(int argc, const char **argv_) { if (frame_out == 1) { // Y4M file header len = y4m_write_file_header( - buf, sizeof(buf), vpx_input_ctx.width, vpx_input_ctx.height, - &vpx_input_ctx.framerate, img->fmt, img->bit_depth); + y4m_buf, sizeof(y4m_buf), vpx_input_ctx.width, + vpx_input_ctx.height, &vpx_input_ctx.framerate, img->fmt, + img->bit_depth); if (do_md5) { - MD5Update(&md5_ctx, (md5byte *)buf, (unsigned int)len); + MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len); } else { - fputs(buf, outfile); + fputs(y4m_buf, outfile); } } // Y4M frame header - len = y4m_write_frame_header(buf, sizeof(buf)); + len = y4m_write_frame_header(y4m_buf, sizeof(y4m_buf)); if (do_md5) { - MD5Update(&md5_ctx, (md5byte *)buf, (unsigned int)len); + MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len); } else { - fputs(buf, outfile); + fputs(y4m_buf, outfile); } } else { if (frame_out == 1) { From 6c65608253ef6ff7786416d54985d0e27a286798 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 12 Apr 2023 14:52:44 -0700 Subject: [PATCH 664/926] vpxenc: clear -Wshadow warnings Bug: webm:1793 Change-Id: I2a26c9297016d3fa2c32e8974ef3d7dab1e524c4 --- vpxenc.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vpxenc.c b/vpxenc.c index 61672acadd..38d69a1923 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -1586,13 +1586,14 @@ static void test_decode(struct stream_state *stream, /* Get the internal reference frame */ if (strcmp(codec->name, "vp8") == 0) { struct vpx_ref_frame ref_enc, ref_dec; - int width, height; + int aligned_width = (stream->config.cfg.g_w + 15) & ~15; + int aligned_height = (stream->config.cfg.g_h + 15) & ~15; - width = (stream->config.cfg.g_w + 15) & ~15; - height = (stream->config.cfg.g_h + 15) & ~15; - vpx_img_alloc(&ref_enc.img, VPX_IMG_FMT_I420, width, height, 1); + vpx_img_alloc(&ref_enc.img, VPX_IMG_FMT_I420, aligned_width, aligned_height, + 1); enc_img = ref_enc.img; - vpx_img_alloc(&ref_dec.img, VPX_IMG_FMT_I420, width, height, 1); + vpx_img_alloc(&ref_dec.img, VPX_IMG_FMT_I420, aligned_width, aligned_height, + 1); dec_img = ref_dec.img; ref_enc.frame_type = VP8_LAST_FRAME; @@ -1969,10 +1970,9 @@ int main(int argc, const char **argv_) { } else { const int64_t input_pos = ftello(input.file); const int64_t input_pos_lagged = input_pos - lagged_count; - const int64_t limit = input.length; rate = cx_time ? input_pos_lagged * (int64_t)1000000 / cx_time : 0; - remaining = limit - input_pos + lagged_count; + remaining = input.length - input_pos + lagged_count; } average_rate = From 536c9867644c2307986345efb10f3b566158ab63 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 13 Apr 2023 16:52:51 -0400 Subject: [PATCH 665/926] Add VP8RateControlRTC::GetLoopfilterLevel New linear model to calculate loopfilter level from frame qp. Linear regression was done on qvga, vga, and hd clips. 
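For reference, the intended call order from the caller's side is sketched
below (illustrative only, not part of the patch: it assumes an already-created
VP8RateControlRTC instance `rc`, a filled-in VP8FrameParamsQpRTC
`frame_params`, and an `encoded_frame_size` reported by the external encoder;
only the methods declared in the header diff below are used):

    rc->ComputeQP(frame_params);               // derive the QP for this frame
    const int qp = rc->GetQP();                // valid only after ComputeQP()
    const int lf = rc->GetLoopfilterLevel();   // linear model of qp, clamped
    // ... encode the frame externally using qp and lf ...
    rc->PostEncodeUpdate(encoded_frame_size);  // feed back the coded size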
Bug: b/275304642
Change-Id: I552b312212bb4de21b53b762d139aa9588c64ae2
---
 vp8/vp8_ratectrl_rtc.cc | 28 ++++++++++++++++++++++++++++
 vp8/vp8_ratectrl_rtc.h  |  3 +++
 2 files changed, 31 insertions(+)

diff --git a/vp8/vp8_ratectrl_rtc.cc b/vp8/vp8_ratectrl_rtc.cc
index 65c58536aa..60bc258a6f 100644
--- a/vp8/vp8_ratectrl_rtc.cc
+++ b/vp8/vp8_ratectrl_rtc.cc
@@ -294,6 +294,34 @@ void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) {

 int VP8RateControlRTC::GetQP() const { return q_; }

+int VP8RateControlRTC::GetLoopfilterLevel() const {
+  VP8_COMMON *cm = &cpi_->common;
+  const double qp = q_;
+
+  // This model is from linear regression
+  if (cm->Width * cm->Height <= 320 * 240) {
+    cm->filter_level = static_cast<int>(0.352685 * qp + 2.957774);
+  } else if (cm->Width * cm->Height <= 640 * 480) {
+    cm->filter_level = static_cast<int>(0.485069 * qp - 0.534462);
+  } else {
+    cm->filter_level = static_cast<int>(0.314875 * qp + 7.959003);
+  }
+
+  int min_filter_level = 0;
+  // This logic is from get_min_filter_level() in picklpf.c
+  if (q_ > 6 && q_ <= 16) {
+    min_filter_level = 1;
+  } else {
+    min_filter_level = (q_ / 8);
+  }
+
+  const int max_filter_level = 63;
+  if (cm->filter_level < min_filter_level) cm->filter_level = min_filter_level;
+  if (cm->filter_level > max_filter_level) cm->filter_level = max_filter_level;
+
+  return cm->filter_level;
+}
+
 void VP8RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) {
   VP8_COMMON *const cm = &cpi_->common;
   vpx_clear_system_state();
diff --git a/vp8/vp8_ratectrl_rtc.h b/vp8/vp8_ratectrl_rtc.h
index a8a886c56e..496ef9eaad 100644
--- a/vp8/vp8_ratectrl_rtc.h
+++ b/vp8/vp8_ratectrl_rtc.h
@@ -42,6 +42,9 @@ class VP8RateControlRTC {
   bool UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg);
   // GetQP() needs to be called after ComputeQP() to get the latest QP
   int GetQP() const;
+  // GetLoopfilterLevel() needs to be called after ComputeQP() since loopfilter
+  // level is calculated from frame qp.
+  int GetLoopfilterLevel() const;
   void ComputeQP(const VP8FrameParamsQpRTC &frame_params);
   // Feedback to rate control with the size of current encoded frame

From dca0a8b86052f9ed55173a315cef97c584b5e1cb Mon Sep 17 00:00:00 2001
From: "L. E. Segovia"
Date: Mon, 10 Apr 2023 19:08:54 -0300
Subject: [PATCH 666/926] libs.mk: Fix wrong scope end comments

I believe the following comments are wrongly scoped, possibly left over
from previous changesets. This made me very confused when reading the
test suite Makefile in order to port it to Meson.

Change-Id: Ice3c7ba50c6909a9c7dfd4001afa1e1ddfa4b5ce
---
 libs.mk | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libs.mk b/libs.mk
index 1f7f03aa38..92cf5509fb 100644
--- a/libs.mk
+++ b/libs.mk
@@ -631,8 +631,8 @@ test_rc_interface.$(VCPROJ_SFX): $(RC_INTERFACE_TEST_SRCS) vpx.$(VCPROJ_SFX) \
     -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \
     -L. -l$(CODEC_LIB) -l$(RC_RTC_LIB) -l$(GTEST_LIB) $^
 endif # RC_INTERFACE_TEST
-endif # CONFIG_VP9_ENCODER
-endif
+endif # CONFIG_ENCODERS
+endif # CONFIG_MSVS
 else

 include $(SRC_PATH_BARE)/third_party/googletest/gtest.mk
@@ -699,7 +699,7 @@ $(eval $(call linkerxx_template,$(SIMPLE_ENCODE_TEST_BIN), \
-lsimple_encode -lvpx -lgtest $(extralibs) -lm)) endif # SIMPLE_ENCODE_TEST -endif # CONFIG_UNIT_TESTS +endif # CONFIG_EXTERNAL_BUILD # Install test sources only if codec source is included INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(patsubst $(SRC_PATH_BARE)/%,%,\ @@ -724,7 +724,7 @@ NUM_SHARDS := 10 SHARDS := 0 1 2 3 4 5 6 7 8 9 $(foreach s,$(SHARDS),$(eval $(call test_shard_template,$(s),$(NUM_SHARDS)))) -endif +endif # CONFIG_UNIT_TESTS ## ## documentation directives From bdba4591a7daaeb8068fcb99d86d010fa59d6c94 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 12 Apr 2023 11:47:49 -0700 Subject: [PATCH 667/926] vp9_rdcost: clear -Wshadow warnings Bug: webm:1793 Change-Id: I6d48038d74e510ecb5773dfffbdc4c10b765c2aa --- vp9/encoder/vp9_rdopt.c | 157 ++++++++++++++++++++-------------------- 1 file changed, 77 insertions(+), 80 deletions(-) diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 9121eeac15..c68cfefdea 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -588,15 +588,15 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, if (x->skip_encode && !is_inter_block(xd->mi[0])) { // TODO(jingning): tune the model to better capture the distortion. - const int64_t p = + const int64_t mean_quant_error = (pd->dequant[1] * pd->dequant[1] * (1 << ss_txfrm_size)) >> #if CONFIG_VP9_HIGHBITDEPTH (shift + 2 + (bd - 8) * 2); #else (shift + 2); #endif // CONFIG_VP9_HIGHBITDEPTH - *out_dist += (p >> 4); - *out_sse += p; + *out_dist += (mean_quant_error >> 4); + *out_sse += mean_quant_error; } } else { const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; @@ -785,13 +785,12 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; const int16_t *const diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)]; - const int enable_trellis_opt = + const int use_trellis_opt = do_trellis_opt(pd, diff, diff_stride, blk_row, blk_col, plane_bsize, tx_size, &encode_b_arg); // full forward transform and quantization vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size); - if (enable_trellis_opt) - vp9_optimize_b(x, plane, block, tx_size, coeff_ctx); + if (use_trellis_opt) vp9_optimize_b(x, plane, block, tx_size, coeff_ctx); dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size, &dist, &sse, recon, sse_calc_done); } else if (skip_txfm_flag == SKIP_TXFM_AC_ONLY) { @@ -1436,7 +1435,6 @@ static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, if (ref_best_rd < 0) is_cost_valid = 0; if (is_inter_block(mi) && is_cost_valid) { - int plane; for (plane = 1; plane < MAX_MB_PLANE; ++plane) vp9_subtract_plane(x, bsize, plane); } @@ -2070,7 +2068,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, static int64_t rd_pick_best_sub8x8_mode( VP9_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv, - int_mv *second_best_ref_mv, int64_t best_rd, int *returntotrate, + int_mv *second_best_ref_mv, int64_t best_rd_so_far, int *returntotrate, int *returnyrate, int64_t *returndistortion, int *skippable, int64_t *psse, int mvthresh, int_mv seg_mvs[4][MAX_REF_FRAMES], BEST_SEG_INFO *bsi_buf, int filter_idx, int mi_row, int mi_col) { @@ -2103,7 +2101,7 @@ static int64_t rd_pick_best_sub8x8_mode( vp9_zero(*bsi); - bsi->segment_rd = best_rd; + bsi->segment_rd = best_rd_so_far; bsi->ref_mv[0] = best_ref_mv; bsi->ref_mv[1] = second_best_ref_mv; bsi->mvp.as_int = best_ref_mv->as_int; @@ -2129,14 +2127,14 @@ static 
int64_t rd_pick_best_sub8x8_mode( int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; PREDICTION_MODE mode_selected = ZEROMV; int64_t best_rd = INT64_MAX; - const int i = idy * 2 + idx; + const int block = idy * 2 + idx; int ref; for (ref = 0; ref < 1 + has_second_rf; ++ref) { const MV_REFERENCE_FRAME frame = mi->ref_frame[ref]; frame_mv[ZEROMV][frame].as_int = 0; vp9_append_sub8x8_mvs_for_idx( - cm, xd, i, ref, mi_row, mi_col, &frame_mv[NEARESTMV][frame], + cm, xd, block, ref, mi_row, mi_col, &frame_mv[NEARESTMV][frame], &frame_mv[NEARMV][frame], mbmi_ext->mode_context); } @@ -2146,7 +2144,7 @@ static int64_t rd_pick_best_sub8x8_mode( struct buf_2d orig_pre[2]; mode_idx = INTER_OFFSET(this_mode); - bsi->rdstat[i][mode_idx].brdcost = INT64_MAX; + bsi->rdstat[block][mode_idx].brdcost = INT64_MAX; if (!(inter_mode_mask & (1 << this_mode))) continue; if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, frame_mv, @@ -2154,14 +2152,14 @@ static int64_t rd_pick_best_sub8x8_mode( continue; memcpy(orig_pre, pd->pre, sizeof(orig_pre)); - memcpy(bsi->rdstat[i][mode_idx].ta, t_above, - sizeof(bsi->rdstat[i][mode_idx].ta)); - memcpy(bsi->rdstat[i][mode_idx].tl, t_left, - sizeof(bsi->rdstat[i][mode_idx].tl)); + memcpy(bsi->rdstat[block][mode_idx].ta, t_above, + sizeof(bsi->rdstat[block][mode_idx].ta)); + memcpy(bsi->rdstat[block][mode_idx].tl, t_left, + sizeof(bsi->rdstat[block][mode_idx].tl)); // motion search for newmv (single predictor case only) if (!has_second_rf && this_mode == NEWMV && - seg_mvs[i][mi->ref_frame[0]].as_int == INVALID_MV) { + seg_mvs[block][mi->ref_frame[0]].as_int == INVALID_MV) { MV *const new_mv = &mode_mv[NEWMV][0].as_mv; int step_param = 0; uint32_t bestsme = UINT_MAX; @@ -2177,12 +2175,13 @@ static int64_t rd_pick_best_sub8x8_mode( if (cpi->oxcf.mode != BEST) { // use previous block's result as next block's MV predictor. 
- if (i > 0) { - bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int; - if (i == 2) bsi->mvp.as_int = mi->bmi[i - 2].as_mv[0].as_int; + if (block > 0) { + bsi->mvp.as_int = mi->bmi[block - 1].as_mv[0].as_int; + if (block == 2) + bsi->mvp.as_int = mi->bmi[block - 2].as_mv[0].as_int; } } - if (i == 0) + if (block == 0) max_mv = x->max_mv_context[mi->ref_frame[0]]; else max_mv = @@ -2211,7 +2210,7 @@ static int64_t rd_pick_best_sub8x8_mode( } // adjust src pointer for this block - mi_buf_shift(x, i); + mi_buf_shift(x, block); vp9_set_mv_search_range(&x->mv_limits, &bsi->ref_mv[0]->as_mv); @@ -2234,7 +2233,7 @@ static int64_t rd_pick_best_sub8x8_mode( cpi->sf.use_accurate_subpel_search); // save motion search result for use in compound prediction - seg_mvs[i][mi->ref_frame[0]].as_mv = *new_mv; + seg_mvs[block][mi->ref_frame[0]].as_mv = *new_mv; } x->pred_mv[mi->ref_frame[0]] = *new_mv; @@ -2244,40 +2243,40 @@ static int64_t rd_pick_best_sub8x8_mode( } if (has_second_rf) { - if (seg_mvs[i][mi->ref_frame[1]].as_int == INVALID_MV || - seg_mvs[i][mi->ref_frame[0]].as_int == INVALID_MV) + if (seg_mvs[block][mi->ref_frame[1]].as_int == INVALID_MV || + seg_mvs[block][mi->ref_frame[0]].as_int == INVALID_MV) continue; } if (has_second_rf && this_mode == NEWMV && mi->interp_filter == EIGHTTAP) { // adjust src pointers - mi_buf_shift(x, i); + mi_buf_shift(x, block); if (sf->comp_inter_joint_search_thresh <= bsize) { int rate_mv; joint_motion_search(cpi, x, bsize, frame_mv[this_mode], mi_row, - mi_col, seg_mvs[i], &rate_mv); - seg_mvs[i][mi->ref_frame[0]].as_int = + mi_col, seg_mvs[block], &rate_mv); + seg_mvs[block][mi->ref_frame[0]].as_int = frame_mv[this_mode][mi->ref_frame[0]].as_int; - seg_mvs[i][mi->ref_frame[1]].as_int = + seg_mvs[block][mi->ref_frame[1]].as_int = frame_mv[this_mode][mi->ref_frame[1]].as_int; } // restore src pointers mi_buf_restore(x, orig_src, orig_pre); } - bsi->rdstat[i][mode_idx].brate = set_and_cost_bmi_mvs( - cpi, x, xd, i, this_mode, mode_mv[this_mode], frame_mv, seg_mvs[i], - bsi->ref_mv, x->nmvjointcost, x->mvcost); + bsi->rdstat[block][mode_idx].brate = set_and_cost_bmi_mvs( + cpi, x, xd, block, this_mode, mode_mv[this_mode], frame_mv, + seg_mvs[block], bsi->ref_mv, x->nmvjointcost, x->mvcost); for (ref = 0; ref < 1 + has_second_rf; ++ref) { - bsi->rdstat[i][mode_idx].mvs[ref].as_int = + bsi->rdstat[block][mode_idx].mvs[ref].as_int = mode_mv[this_mode][ref].as_int; if (num_4x4_blocks_wide > 1) - bsi->rdstat[i + 1][mode_idx].mvs[ref].as_int = + bsi->rdstat[block + 1][mode_idx].mvs[ref].as_int = mode_mv[this_mode][ref].as_int; if (num_4x4_blocks_high > 1) - bsi->rdstat[i + 2][mode_idx].mvs[ref].as_int = + bsi->rdstat[block + 2][mode_idx].mvs[ref].as_int = mode_mv[this_mode][ref].as_int; } @@ -2295,7 +2294,7 @@ static int64_t rd_pick_best_sub8x8_mode( for (ref = 0; ref < 1 + has_second_rf; ++ref) { subpelmv |= mv_has_subpel(&mode_mv[this_mode][ref].as_mv); have_ref &= mode_mv[this_mode][ref].as_int == - ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int; + ref_bsi->rdstat[block][mode_idx].mvs[ref].as_int; } if (filter_idx > 1 && !subpelmv && !have_ref) { @@ -2303,53 +2302,55 @@ static int64_t rd_pick_best_sub8x8_mode( have_ref = 1; for (ref = 0; ref < 1 + has_second_rf; ++ref) have_ref &= mode_mv[this_mode][ref].as_int == - ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int; + ref_bsi->rdstat[block][mode_idx].mvs[ref].as_int; } if (!subpelmv && have_ref && - ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) { - memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx], - 
sizeof(SEG_RDSTAT)); + ref_bsi->rdstat[block][mode_idx].brdcost < INT64_MAX) { + memcpy(&bsi->rdstat[block][mode_idx], + &ref_bsi->rdstat[block][mode_idx], sizeof(SEG_RDSTAT)); if (num_4x4_blocks_wide > 1) - bsi->rdstat[i + 1][mode_idx].eobs = - ref_bsi->rdstat[i + 1][mode_idx].eobs; + bsi->rdstat[block + 1][mode_idx].eobs = + ref_bsi->rdstat[block + 1][mode_idx].eobs; if (num_4x4_blocks_high > 1) - bsi->rdstat[i + 2][mode_idx].eobs = - ref_bsi->rdstat[i + 2][mode_idx].eobs; + bsi->rdstat[block + 2][mode_idx].eobs = + ref_bsi->rdstat[block + 2][mode_idx].eobs; - if (bsi->rdstat[i][mode_idx].brdcost < best_rd) { + if (bsi->rdstat[block][mode_idx].brdcost < best_rd) { mode_selected = this_mode; - best_rd = bsi->rdstat[i][mode_idx].brdcost; + best_rd = bsi->rdstat[block][mode_idx].brdcost; } continue; } } - bsi->rdstat[i][mode_idx].brdcost = encode_inter_mb_segment( - cpi, x, bsi->segment_rd - this_segment_rd, i, - &bsi->rdstat[i][mode_idx].byrate, &bsi->rdstat[i][mode_idx].bdist, - &bsi->rdstat[i][mode_idx].bsse, bsi->rdstat[i][mode_idx].ta, - bsi->rdstat[i][mode_idx].tl, mi_row, mi_col); - if (bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) { - bsi->rdstat[i][mode_idx].brdcost += - RDCOST(x->rdmult, x->rddiv, bsi->rdstat[i][mode_idx].brate, 0); - bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate; - bsi->rdstat[i][mode_idx].eobs = p->eobs[i]; + bsi->rdstat[block][mode_idx].brdcost = encode_inter_mb_segment( + cpi, x, bsi->segment_rd - this_segment_rd, block, + &bsi->rdstat[block][mode_idx].byrate, + &bsi->rdstat[block][mode_idx].bdist, + &bsi->rdstat[block][mode_idx].bsse, bsi->rdstat[block][mode_idx].ta, + bsi->rdstat[block][mode_idx].tl, mi_row, mi_col); + if (bsi->rdstat[block][mode_idx].brdcost < INT64_MAX) { + bsi->rdstat[block][mode_idx].brdcost += RDCOST( + x->rdmult, x->rddiv, bsi->rdstat[block][mode_idx].brate, 0); + bsi->rdstat[block][mode_idx].brate += + bsi->rdstat[block][mode_idx].byrate; + bsi->rdstat[block][mode_idx].eobs = p->eobs[block]; if (num_4x4_blocks_wide > 1) - bsi->rdstat[i + 1][mode_idx].eobs = p->eobs[i + 1]; + bsi->rdstat[block + 1][mode_idx].eobs = p->eobs[block + 1]; if (num_4x4_blocks_high > 1) - bsi->rdstat[i + 2][mode_idx].eobs = p->eobs[i + 2]; + bsi->rdstat[block + 2][mode_idx].eobs = p->eobs[block + 2]; } - if (bsi->rdstat[i][mode_idx].brdcost < best_rd) { + if (bsi->rdstat[block][mode_idx].brdcost < best_rd) { mode_selected = this_mode; - best_rd = bsi->rdstat[i][mode_idx].brdcost; + best_rd = bsi->rdstat[block][mode_idx].brdcost; } } /*for each 4x4 mode*/ if (best_rd == INT64_MAX) { int iy, midx; - for (iy = i + 1; iy < 4; ++iy) + for (iy = block + 1; iy < 4; ++iy) for (midx = 0; midx < INTER_MODES; ++midx) bsi->rdstat[iy][midx].brdcost = INT64_MAX; bsi->segment_rd = INT64_MAX; @@ -2357,22 +2358,22 @@ static int64_t rd_pick_best_sub8x8_mode( } mode_idx = INTER_OFFSET(mode_selected); - memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above)); - memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left)); + memcpy(t_above, bsi->rdstat[block][mode_idx].ta, sizeof(t_above)); + memcpy(t_left, bsi->rdstat[block][mode_idx].tl, sizeof(t_left)); - set_and_cost_bmi_mvs(cpi, x, xd, i, mode_selected, mode_mv[mode_selected], - frame_mv, seg_mvs[i], bsi->ref_mv, x->nmvjointcost, - x->mvcost); + set_and_cost_bmi_mvs(cpi, x, xd, block, mode_selected, + mode_mv[mode_selected], frame_mv, seg_mvs[block], + bsi->ref_mv, x->nmvjointcost, x->mvcost); - br += bsi->rdstat[i][mode_idx].brate; - bd += bsi->rdstat[i][mode_idx].bdist; - block_sse += 
bsi->rdstat[i][mode_idx].bsse; - segmentyrate += bsi->rdstat[i][mode_idx].byrate; - this_segment_rd += bsi->rdstat[i][mode_idx].brdcost; + br += bsi->rdstat[block][mode_idx].brate; + bd += bsi->rdstat[block][mode_idx].bdist; + block_sse += bsi->rdstat[block][mode_idx].bsse; + segmentyrate += bsi->rdstat[block][mode_idx].byrate; + this_segment_rd += bsi->rdstat[block][mode_idx].brdcost; if (this_segment_rd > bsi->segment_rd) { int iy, midx; - for (iy = i + 1; iy < 4; ++iy) + for (iy = block + 1; iy < 4; ++iy) for (midx = 0; midx < INTER_MODES; ++midx) bsi->rdstat[iy][midx].brdcost = INT64_MAX; bsi->segment_rd = INT64_MAX; @@ -2390,7 +2391,7 @@ static int64_t rd_pick_best_sub8x8_mode( // update the coding decisions for (k = 0; k < 4; ++k) bsi->modes[k] = mi->bmi[k].as_mode; - if (bsi->segment_rd > best_rd) return INT64_MAX; + if (bsi->segment_rd > best_rd_so_far) return INT64_MAX; /* set it to the best */ for (i = 0; i < 4; i++) { mode_idx = INTER_OFFSET(bsi->modes[i]); @@ -2635,9 +2636,9 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, tmp_mv->as_int = INVALID_MV; if (scaled_ref_frame) { - int i; - for (i = 0; i < MAX_MB_PLANE; ++i) - xd->plane[i].pre[0] = backup_yv12[i]; + int j; + for (j = 0; j < MAX_MB_PLANE; ++j) + xd->plane[j].pre[0] = backup_yv12[j]; } return; } @@ -4352,7 +4353,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, int rate2 = 0, rate_y = 0, rate_uv = 0; int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0; int skippable = 0; - int i; int this_skip2 = 0; int64_t total_sse = INT_MAX; int early_term = 0; @@ -4513,7 +4513,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, : NULL; if (scaled_ref_frame[ref]) { - int i; // Swap out the reference frame for a version that's been scaled to // match the resolution of the current frame, allowing the existing // motion search code to be used without additional modifications. @@ -4657,7 +4656,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, &uv_sse, BLOCK_8X8, tmp_best_rdu)) { for (ref = 0; ref < 2; ++ref) { if (scaled_ref_frame[ref]) { - int i; for (i = 0; i < MAX_MB_PLANE; ++i) xd->plane[i].pre[ref] = backup_yv12[ref][i]; } @@ -4674,7 +4672,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, for (ref = 0; ref < 2; ++ref) { if (scaled_ref_frame[ref]) { // Restore the prediction frame pointers to their unscaled versions. - int i; for (i = 0; i < MAX_MB_PLANE; ++i) xd->plane[i].pre[ref] = backup_yv12[ref][i]; } From e15c2e34451b4177e4119cce47bf73dac9864de8 Mon Sep 17 00:00:00 2001 From: Anupam Pandey Date: Tue, 4 Apr 2023 17:24:27 +0530 Subject: [PATCH 668/926] Add AVX2 intrinsic for vpx_fdct16x16() function Introduced AVX2 intrinsic to compute FDCT for block size 16x16 case. This is a bit-exact change. 
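As background for the structure of the new kernel, the 16x16 forward DCT is
separable: a 1-D 16-point transform over rows, then the same transform over
columns. A floating-point sketch of that two-pass structure follows
(illustration only; the names here are ours, and the fixed-point
scaling/rounding conventions of the actual kernel differ):

    #include <math.h>

    /* Unnormalized DCT-II reference, not the patch's code. */
    static void dct16_1d(const double *in, double *out) {
      const double kPi = 3.14159265358979323846;
      int k, n;
      for (k = 0; k < 16; ++k) {
        double sum = 0;
        for (n = 0; n < 16; ++n)
          sum += in[n] * cos(kPi * (2 * n + 1) * k / 32.0);
        out[k] = sum;
      }
    }

    static void dct16x16_2d(const double in[16][16], double out[16][16]) {
      double tmp[16][16], col[16], res[16];
      int r, c, i;
      for (r = 0; r < 16; ++r) dct16_1d(in[r], tmp[r]); /* pass 1: rows */
      for (c = 0; c < 16; ++c) {                        /* pass 2: columns */
        for (i = 0; i < 16; ++i) col[i] = tmp[i][c];
        dct16_1d(col, res);
        for (i = 0; i < 16; ++i) out[i][c] = res[i];
      }
    }

The AVX2 kernel follows the same two-pass shape, but keeps one 16-sample row
per 256-bit register, runs the 1-D butterflies across all 16 lanes at once,
and transposes between the passes.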
Please check the module level scaling w.r.t C function (timer based)
for existing (SSE2) and new AVX2 intrinsics:

Scaling    SSE2     AVX2
           3.88x    5.95x

Change-Id: I02299c3746fcb52d808e2a75d30aa62652c816dc
---
 test/dct16x16_test.cc        |  48 +++++
 vpx_dsp/vpx_dsp_rtcd_defs.pl |   2 +-
 vpx_dsp/x86/fwd_txfm_avx2.c  | 373 +++++++++++++++++++++++++++++++++++
 3 files changed, 422 insertions(+), 1 deletion(-)

diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index d4ef7ae13d..3c104f3a44 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -27,6 +27,7 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/msvc.h"  // for round()
+#include "vpx_ports/vpx_timer.h"

 using libvpx_test::ACMRandom;

@@ -548,6 +549,44 @@ class Trans16x16TestBase {
     }
   }

+  void RunSpeedTest() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 10000;
+    int c_sum_time = 0;
+    int simd_sum_time = 0;
+
+    DECLARE_ALIGNED(32, int16_t, input_block[kNumCoeffs]);
+    DECLARE_ALIGNED(32, tran_low_t, output_ref_block[kNumCoeffs]);
+    DECLARE_ALIGNED(32, tran_low_t, output_block[kNumCoeffs]);
+
+    // Initialize a test block with input range [-mask_, mask_].
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+    }
+
+    vpx_usec_timer timer_c;
+    vpx_usec_timer_start(&timer_c);
+    for (int i = 0; i < count_test_block; ++i) {
+      vpx_fdct16x16_c(input_block, output_ref_block, pitch_);
+    }
+    vpx_usec_timer_mark(&timer_c);
+    c_sum_time += static_cast<int>(vpx_usec_timer_elapsed(&timer_c));
+
+    vpx_usec_timer timer_mod;
+    vpx_usec_timer_start(&timer_mod);
+    for (int i = 0; i < count_test_block; ++i) {
+      RunFwdTxfm(input_block, output_block, pitch_);
+    }
+
+    vpx_usec_timer_mark(&timer_mod);
+    simd_sum_time += static_cast<int>(vpx_usec_timer_elapsed(&timer_mod));
+
+    printf(
+        "c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
+        simd_sum_time,
+        (static_cast<double>(c_sum_time) / static_cast<double>(simd_sum_time)));
+  }
+
   void CompareInvReference(IdctFunc ref_txfm, int thresh) {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
     const int count_test_block = 10000;
@@ -664,6 +703,8 @@ TEST_P(Trans16x16DCT, QuantCheck) {

 TEST_P(Trans16x16DCT, InvAccuracyCheck) { RunInvAccuracyCheck(); }

+TEST_P(Trans16x16DCT, DISABLED_Speed) { RunSpeedTest(); }
+
 class Trans16x16HT : public Trans16x16TestBase,
                      public ::testing::TestWithParam<Ht16x16Param> {
  public:
@@ -823,6 +864,13 @@ INSTANTIATE_TEST_SUITE_P(
                       3, VPX_BITS_8)));
 #endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE

+#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, Trans16x16DCT,
+    ::testing::Values(make_tuple(&vpx_fdct16x16_avx2,
+                                 &vpx_idct16x16_256_add_sse2, 0, VPX_BITS_8)));
+#endif  // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
 #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_SUITE_P(
     SSE2, Trans16x16DCT,
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 49bc9a6309..f825e5a399 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -597,7 +597,7 @@ ()
 specialize qw/vpx_fdct8x8_1 sse2 neon msa/;

 add_proto qw/void vpx_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-specialize qw/vpx_fdct16x16 neon sse2 msa lsx/;
+specialize qw/vpx_fdct16x16 neon sse2 avx2 msa lsx/;

 add_proto qw/void vpx_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
 specialize qw/vpx_fdct16x16_1 sse2 neon msa/;

diff --git a/vpx_dsp/x86/fwd_txfm_avx2.c b/vpx_dsp/x86/fwd_txfm_avx2.c
index a2ed420e37..c8f54a49cb 100644
--- a/vpx_dsp/x86/fwd_txfm_avx2.c
+++ b/vpx_dsp/x86/fwd_txfm_avx2.c
@@ -8,9 +8,382 @@
  * be found in the AUTHORS file in the root of the source tree.
  */

+#include <immintrin.h>  // AVX2
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"

+#define ADD256_EPI16 _mm256_add_epi16
+#define SUB256_EPI16 _mm256_sub_epi16
+
+static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in,
+                                                   int stride, __m256i *out,
+                                                   int out_size, int pass) {
+  int i;
+  const __m256i kOne = _mm256_set1_epi16(1);
+  if (pass == 0) {
+    for (i = 0; i < out_size; i++) {
+      out[i] = _mm256_loadu_si256((const __m256i *)(in + i * stride));
+      // x = x << 2
+      out[i] = _mm256_slli_epi16(out[i], 2);
+    }
+  } else {
+    for (i = 0; i < out_size; i++) {
+      out[i] = _mm256_loadu_si256((const __m256i *)(in + i * 16));
+      // x = (x + 1) >> 2
+      out[i] = _mm256_add_epi16(out[i], kOne);
+      out[i] = _mm256_srai_epi16(out[i], 2);
+    }
+  }
+}
+
+static INLINE void transpose2_8x8_avx2(const __m256i *const in,
+                                       __m256i *const out) {
+  int i;
+  __m256i t[16], u[16];
+  // (1st, 2nd) ==> (lo, hi)
+  //   (0, 1)   ==>  (0, 1)
+  //   (2, 3)   ==>  (2, 3)
+  //   (4, 5)   ==>  (4, 5)
+  //   (6, 7)   ==>  (6, 7)
+  for (i = 0; i < 4; i++) {
+    t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]);
+    t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]);
+  }
+
+  // (1st, 2nd) ==> (lo, hi)
+  //   (0, 2)   ==>  (0, 2)
+  //   (1, 3)   ==>  (1, 3)
+  //   (4, 6)   ==>  (4, 6)
+  //   (5, 7)   ==>  (5, 7)
+  for (i = 0; i < 2; i++) {
+    u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]);
+    u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]);
+
+    u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]);
+    u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]);
+  }
+
+  // (1st, 2nd) ==> (lo, hi)
+  //   (0, 4)   ==>  (0, 1)
+  //   (1, 5)   ==>  (4, 5)
+  //   (2, 6)   ==>  (2, 3)
+  //   (3, 7)   ==>  (6, 7)
+  for (i = 0; i < 2; i++) {
+    out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]);
+    out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]);
+
+    out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]);
+    out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]);
+  }
+}
+
+static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in,
+                                              __m256i *const out) {
+  __m256i t[16];
+
+#define LOADL(idx)                                                            \
+  t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
+  t[idx] = _mm256_inserti128_si256(                                           \
+      t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1);
+
+#define LOADR(idx)                                                           \
+  t[8 + idx] =                                                               \
+      _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
+  t[8 + idx] = _mm256_inserti128_si256(                                      \
+      t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1);
+
+  // load left 8x16
+  LOADL(0)
+  LOADL(1)
+  LOADL(2)
+  LOADL(3)
+  LOADL(4)
+  LOADL(5)
+  LOADL(6)
+  LOADL(7)
+
+  // load right 8x16
+  LOADR(0)
+  LOADR(1)
+  LOADR(2)
+  LOADR(3)
+  LOADR(4)
+  LOADR(5)
+  LOADR(6)
+  LOADR(7)
+
+  // get the top 16x8 result
+  transpose2_8x8_avx2(t, out);
+  // get the bottom 16x8 result
+  transpose2_8x8_avx2(&t[8], &out[8]);
+}
+
+// Store 8 16-bit values. Sign extend the values.
+static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in, + tran_low_t *out, + const int stride, + const int out_size) { + int i; + for (i = 0; i < out_size; ++i) { + _mm256_storeu_si256((__m256i *)(out), in[i]); + out += stride; + } +} + +#define PAIR256_SET_EPI16(a, b) \ + _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) + +static INLINE __m256i mult256_round_shift(const __m256i *pin0, + const __m256i *pin1, + const __m256i *pmultiplier, + const __m256i *prounding, + const int shift) { + const __m256i u0 = _mm256_madd_epi16(*pin0, *pmultiplier); + const __m256i u1 = _mm256_madd_epi16(*pin1, *pmultiplier); + const __m256i v0 = _mm256_add_epi32(u0, *prounding); + const __m256i v1 = _mm256_add_epi32(u1, *prounding); + const __m256i w0 = _mm256_srai_epi32(v0, shift); + const __m256i w1 = _mm256_srai_epi32(v1, shift); + return _mm256_packs_epi32(w0, w1); +} + +static INLINE void fdct16x16_1D_avx2(__m256i *input, __m256i *output) { + int i; + __m256i step2[4]; + __m256i in[8]; + __m256i step1[8]; + __m256i step3[8]; + + const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(cospi_16_64); + const __m256i k__cospi_p16_m16 = PAIR256_SET_EPI16(cospi_16_64, -cospi_16_64); + const __m256i k__cospi_p24_p08 = PAIR256_SET_EPI16(cospi_24_64, cospi_8_64); + const __m256i k__cospi_p08_m24 = PAIR256_SET_EPI16(cospi_8_64, -cospi_24_64); + const __m256i k__cospi_m08_p24 = PAIR256_SET_EPI16(-cospi_8_64, cospi_24_64); + const __m256i k__cospi_p28_p04 = PAIR256_SET_EPI16(cospi_28_64, cospi_4_64); + const __m256i k__cospi_m04_p28 = PAIR256_SET_EPI16(-cospi_4_64, cospi_28_64); + const __m256i k__cospi_p12_p20 = PAIR256_SET_EPI16(cospi_12_64, cospi_20_64); + const __m256i k__cospi_m20_p12 = PAIR256_SET_EPI16(-cospi_20_64, cospi_12_64); + const __m256i k__cospi_p30_p02 = PAIR256_SET_EPI16(cospi_30_64, cospi_2_64); + const __m256i k__cospi_p14_p18 = PAIR256_SET_EPI16(cospi_14_64, cospi_18_64); + const __m256i k__cospi_m02_p30 = PAIR256_SET_EPI16(-cospi_2_64, cospi_30_64); + const __m256i k__cospi_m18_p14 = PAIR256_SET_EPI16(-cospi_18_64, cospi_14_64); + const __m256i k__cospi_p22_p10 = PAIR256_SET_EPI16(cospi_22_64, cospi_10_64); + const __m256i k__cospi_p06_p26 = PAIR256_SET_EPI16(cospi_6_64, cospi_26_64); + const __m256i k__cospi_m10_p22 = PAIR256_SET_EPI16(-cospi_10_64, cospi_22_64); + const __m256i k__cospi_m26_p06 = PAIR256_SET_EPI16(-cospi_26_64, cospi_6_64); + const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING); + + // Calculate input for the first 8 results. + for (i = 0; i < 8; i++) { + in[i] = ADD256_EPI16(input[i], input[15 - i]); + } + + // Calculate input for the next 8 results. 
+ for (i = 0; i < 8; i++) { + step1[i] = SUB256_EPI16(input[7 - i], input[8 + i]); + } + + // Work on the first eight values; fdct8(input, even_results); + { + // Add/subtract + const __m256i q0 = ADD256_EPI16(in[0], in[7]); + const __m256i q1 = ADD256_EPI16(in[1], in[6]); + const __m256i q2 = ADD256_EPI16(in[2], in[5]); + const __m256i q3 = ADD256_EPI16(in[3], in[4]); + const __m256i q4 = SUB256_EPI16(in[3], in[4]); + const __m256i q5 = SUB256_EPI16(in[2], in[5]); + const __m256i q6 = SUB256_EPI16(in[1], in[6]); + const __m256i q7 = SUB256_EPI16(in[0], in[7]); + + // Work on first four results + { + // Add/subtract + const __m256i r0 = ADD256_EPI16(q0, q3); + const __m256i r1 = ADD256_EPI16(q1, q2); + const __m256i r2 = SUB256_EPI16(q1, q2); + const __m256i r3 = SUB256_EPI16(q0, q3); + + // Interleave to do the multiply by constants which gets us + // into 32 bits. + { + const __m256i t0 = _mm256_unpacklo_epi16(r0, r1); + const __m256i t1 = _mm256_unpackhi_epi16(r0, r1); + const __m256i t2 = _mm256_unpacklo_epi16(r2, r3); + const __m256i t3 = _mm256_unpackhi_epi16(r2, r3); + + output[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[8] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[4] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[12] = + mult256_round_shift(&t2, &t3, &k__cospi_m08_p24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + } + + // Work on next four results + { + // Interleave to do the multiply by constants which gets us + // into 32 bits. + const __m256i d0 = _mm256_unpacklo_epi16(q6, q5); + const __m256i d1 = _mm256_unpackhi_epi16(q6, q5); + const __m256i r0 = mult256_round_shift( + &d0, &d1, &k__cospi_p16_m16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + const __m256i r1 = mult256_round_shift( + &d0, &d1, &k__cospi_p16_p16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + + { + // Add/subtract + const __m256i x0 = ADD256_EPI16(q4, r0); + const __m256i x1 = SUB256_EPI16(q4, r0); + const __m256i x2 = SUB256_EPI16(q7, r1); + const __m256i x3 = ADD256_EPI16(q7, r1); + + // Interleave to do the multiply by constants which gets us + // into 32 bits. 
+ { + const __m256i t0 = _mm256_unpacklo_epi16(x0, x3); + const __m256i t1 = _mm256_unpackhi_epi16(x0, x3); + const __m256i t2 = _mm256_unpacklo_epi16(x1, x2); + const __m256i t3 = _mm256_unpackhi_epi16(x1, x2); + output[2] = + mult256_round_shift(&t0, &t1, &k__cospi_p28_p04, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[14] = + mult256_round_shift(&t0, &t1, &k__cospi_m04_p28, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[10] = + mult256_round_shift(&t2, &t3, &k__cospi_p12_p20, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[6] = + mult256_round_shift(&t2, &t3, &k__cospi_m20_p12, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + } + } + } + // Work on the next eight values; step1 -> odd_results + { // step 2 + { + const __m256i t0 = _mm256_unpacklo_epi16(step1[5], step1[2]); + const __m256i t1 = _mm256_unpackhi_epi16(step1[5], step1[2]); + const __m256i t2 = _mm256_unpacklo_epi16(step1[4], step1[3]); + const __m256i t3 = _mm256_unpackhi_epi16(step1[4], step1[3]); + step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + // step 3 + { + step3[0] = ADD256_EPI16(step1[0], step2[1]); + step3[1] = ADD256_EPI16(step1[1], step2[0]); + step3[2] = SUB256_EPI16(step1[1], step2[0]); + step3[3] = SUB256_EPI16(step1[0], step2[1]); + step3[4] = SUB256_EPI16(step1[7], step2[3]); + step3[5] = SUB256_EPI16(step1[6], step2[2]); + step3[6] = ADD256_EPI16(step1[6], step2[2]); + step3[7] = ADD256_EPI16(step1[7], step2[3]); + } + // step 4 + { + const __m256i t0 = _mm256_unpacklo_epi16(step3[1], step3[6]); + const __m256i t1 = _mm256_unpackhi_epi16(step3[1], step3[6]); + const __m256i t2 = _mm256_unpacklo_epi16(step3[2], step3[5]); + const __m256i t3 = _mm256_unpackhi_epi16(step3[2], step3[5]); + step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_m08_p24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p08_m24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + // step 5 + { + step1[0] = ADD256_EPI16(step3[0], step2[0]); + step1[1] = SUB256_EPI16(step3[0], step2[0]); + step1[2] = ADD256_EPI16(step3[3], step2[1]); + step1[3] = SUB256_EPI16(step3[3], step2[1]); + step1[4] = SUB256_EPI16(step3[4], step2[3]); + step1[5] = ADD256_EPI16(step3[4], step2[3]); + step1[6] = SUB256_EPI16(step3[7], step2[2]); + step1[7] = ADD256_EPI16(step3[7], step2[2]); + } + // step 6 + { + const __m256i t0 = _mm256_unpacklo_epi16(step1[0], step1[7]); + const __m256i t1 = _mm256_unpackhi_epi16(step1[0], step1[7]); + const __m256i t2 = _mm256_unpacklo_epi16(step1[1], step1[6]); + const __m256i t3 = _mm256_unpackhi_epi16(step1[1], step1[6]); + output[1] = mult256_round_shift(&t0, &t1, &k__cospi_p30_p02, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[9] = mult256_round_shift(&t2, &t3, &k__cospi_p14_p18, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[15] = mult256_round_shift(&t0, &t1, &k__cospi_m02_p30, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[7] = mult256_round_shift(&t2, &t3, 
&k__cospi_m18_p14, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + { + const __m256i t0 = _mm256_unpacklo_epi16(step1[2], step1[5]); + const __m256i t1 = _mm256_unpackhi_epi16(step1[2], step1[5]); + const __m256i t2 = _mm256_unpacklo_epi16(step1[3], step1[4]); + const __m256i t3 = _mm256_unpackhi_epi16(step1[3], step1[4]); + output[5] = mult256_round_shift(&t0, &t1, &k__cospi_p22_p10, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[13] = mult256_round_shift(&t2, &t3, &k__cospi_p06_p26, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[11] = mult256_round_shift(&t0, &t1, &k__cospi_m10_p22, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[3] = mult256_round_shift(&t2, &t3, &k__cospi_m26_p06, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + } +} + +void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride) { + int pass; + DECLARE_ALIGNED(32, int16_t, intermediate[256]); + int16_t *out0 = intermediate; + tran_low_t *out1 = output; + const int width = 16; + const int height = 16; + __m256i buf0[16], buf1[16]; + + // Two transform and transpose passes + // Process 16 columns (transposed rows in second pass) at a time. + for (pass = 0; pass < 2; ++pass) { + // Load and pre-condition input. + load_buffer_16bit_to_16bit_avx2(input, stride, buf1, height, pass); + + // Calculate dct for 16x16 values + fdct16x16_1D_avx2(buf1, buf0); + + // Transpose the results. + transpose_16bit_16x16_avx2(buf0, buf1); + + if (pass == 0) { + store_buffer_16bit_to_32bit_w16_avx2(buf1, out0, width, height); + } else { + store_buffer_16bit_to_32bit_w16_avx2(buf1, out1, width, height); + } + // Setup in/out for next pass. + input = intermediate; + } +} + #if !CONFIG_VP9_HIGHBITDEPTH #define FDCT32x32_2D_AVX2 vpx_fdct32x32_rd_avx2 #define FDCT32x32_HIGH_PRECISION 0 From 7bdce0887b7e1acd62093cd2315ce0e93e75ba5f Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 17 Apr 2023 21:57:59 -0700 Subject: [PATCH 669/926] onyx_if: clear -Wshadow warning with --enable-internal-stats Bug: webm:1793 Change-Id: I9d375e4cb45f78b82afe455f2c7ad2b56e217f7d --- vp8/encoder/onyx_if.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index bcf5227029..44a02b6ddc 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -2109,7 +2109,6 @@ void vp8_remove_compressor(VP8_COMP **comp) { double time_encoded = (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) / 10000000.000; - double dr = (double)cpi->bytes * 8.0 / 1000.0 / time_encoded; if (cpi->b_calculate_psnr) { if (cpi->oxcf.number_of_layers > 1) { @@ -2138,6 +2137,7 @@ void vp8_remove_compressor(VP8_COMP **comp) { total_psnr2, total_ssim); } } else { + double dr = (double)cpi->bytes * 8.0 / 1000.0 / time_encoded; double samples = 3.0 / 2 * cpi->count * cpi->common.Width * cpi->common.Height; double total_psnr = From eef765751a52ad12c6e681db71ad58e9b9c26c2e Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 17 Apr 2023 22:01:10 -0700 Subject: [PATCH 670/926] mr_dissim: clear -Wshadow warning Bug: webm:1793 Change-Id: I73ced43aba45215264134f917fd69ab0b1f10d01 --- vp8/encoder/mr_dissim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp8/encoder/mr_dissim.c b/vp8/encoder/mr_dissim.c index 011b62a08f..b1bfb4b54a 100644 --- a/vp8/encoder/mr_dissim.c +++ b/vp8/encoder/mr_dissim.c @@ -49,7 +49,6 @@ void vp8_cal_low_res_mb_cols(VP8_COMP *cpi) { void vp8_cal_dissimilarity(VP8_COMP *cpi) { VP8_COMMON *cm = &cpi->common; - int i; /* Note: The first row & first column in mip 
are outside the frame, which * were initialized to all 0.(ref_frame, mode, mv...) @@ -67,6 +66,7 @@ void vp8_cal_dissimilarity(VP8_COMP *cpi) { store_info->frame_type = cm->frame_type; if (cm->frame_type != KEY_FRAME) { + int i; store_info->is_frame_dropped = 0; for (i = 1; i < MAX_REF_FRAMES; ++i) store_info->low_res_ref_frames[i] = cpi->current_ref_frames[i]; From d725bdd8a1fc34346245b89a27eb0b377fe73119 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 17 Apr 2023 22:05:46 -0700 Subject: [PATCH 671/926] vp9_tpl_model: clear -Wshadow warning with --enable-experimental --enable-non-greedy-mv Bug: webm:1793 Change-Id: I19e38d7196291ae1ffbb5fb3daa70a4fefd54c55 --- vp9/encoder/vp9_tpl_model.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index 53ef356981..624bb1901f 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -1089,10 +1089,6 @@ static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, const int mi_height = num_8x8_blocks_high_lookup[bsize]; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; int64_t recon_error, sse; -#if CONFIG_NON_GREEDY_MV - int square_block_idx; - int rf_idx; -#endif // Setup scaling factor #if CONFIG_VP9_HIGHBITDEPTH @@ -1133,21 +1129,25 @@ static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, vp9_frame_init_quantizer(cpi); #if CONFIG_NON_GREEDY_MV - for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES; - ++square_block_idx) { - BLOCK_SIZE square_bsize = square_block_idx_to_bsize(square_block_idx); - build_motion_field(cpi, frame_idx, ref_frame, square_bsize); - } - for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { - int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; - if (ref_frame_idx != -1) { - MotionField *motion_field = vp9_motion_field_info_get_motion_field( - &cpi->motion_field_info, frame_idx, rf_idx, bsize); - predict_mv_mode_arr(cpi, x, gf_picture, motion_field, frame_idx, - tpl_frame, rf_idx, bsize); + { + int square_block_idx; + int rf_idx; + for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES; + ++square_block_idx) { + BLOCK_SIZE square_bsize = square_block_idx_to_bsize(square_block_idx); + build_motion_field(cpi, frame_idx, ref_frame, square_bsize); + } + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + MotionField *motion_field = vp9_motion_field_info_get_motion_field( + &cpi->motion_field_info, frame_idx, rf_idx, bsize); + predict_mv_mode_arr(cpi, x, gf_picture, motion_field, frame_idx, + tpl_frame, rf_idx, bsize); + } } } -#endif +#endif // CONFIG_NON_GREEDY_MV for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { From 7b7f84fe148168532bbf9add7b738d125588c926 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 12 Apr 2023 14:35:50 +0100 Subject: [PATCH 672/926] Add Neon implementation of vpx_sad_skip_x functions Add Neon implementations of standard bitdepth downsampling SAD functions for all block sizes. Also add corresponding unit tests. 
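The wrappers reuse the existing full-width SAD kernels: they compare only the
even rows (by doubling both strides and halving the row count) and then double
the result to approximate the full-height SAD. A scalar sketch of what each
vpx_sad_skip_WxH_neon() wrapper computes (illustrative only, not the shipped
Neon kernel):

  #include <stdint.h>
  #include <stdlib.h> /* abs */

  static unsigned int sad_skip_c(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride,
                                 int w, int h) {
    unsigned int sad = 0;
    int r, c;
    /* Visit only the even rows, then scale up to estimate the full SAD. */
    for (r = 0; r < h; r += 2) {
      for (c = 0; c < w; ++c) {
        sad += abs(src[r * src_stride + c] - ref[r * ref_stride + c]);
      }
    }
    return 2 * sad;
  }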
Change-Id: Ibda734c270278d947673ffcc29ef17a2f4970b01 --- test/sad_test.cc | 18 ++++++++++++++++++ vpx_dsp/arm/sad_neon.c | 30 ++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 24 +++++++++++++----------- 3 files changed, 61 insertions(+), 11 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 561da5ddfb..e43d9ac41e 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1129,6 +1129,24 @@ const SadMxNParam neon_tests[] = { }; INSTANTIATE_TEST_SUITE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests)); +const SadSkipMxNParam skip_neon_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_neon), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_neon), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_neon), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_neon), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_neon), + SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_neon), + SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_neon), + SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_neon), + SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_neon), + SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_neon), + SadSkipMxNParam(8, 4, &vpx_sad_skip_8x4_neon), + SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_neon), + SadSkipMxNParam(4, 4, &vpx_sad_skip_4x4_neon) +}; +INSTANTIATE_TEST_SUITE_P(NEON, SADSkipTest, + ::testing::ValuesIn(skip_neon_tests)); + const SadMxNAvgParam avg_neon_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_neon), SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_neon), diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index 9382b80626..566a1f81db 100644 --- a/vpx_dsp/arm/sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -250,6 +250,36 @@ SAD_WXH_NEON(32, 64) SAD_WXH_NEON(64, 32) SAD_WXH_NEON(64, 64) +#undef SAD_WXH_NEON + +#define SAD_SKIP_WXH_NEON(w, h) \ + unsigned int vpx_sad_skip_##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * \ + sad##w##xh_neon(src, 2 * src_stride, ref, 2 * ref_stride, (h) / 2); \ + } + +SAD_SKIP_WXH_NEON(4, 4) +SAD_SKIP_WXH_NEON(4, 8) + +SAD_SKIP_WXH_NEON(8, 4) +SAD_SKIP_WXH_NEON(8, 8) +SAD_SKIP_WXH_NEON(8, 16) + +SAD_SKIP_WXH_NEON(16, 8) +SAD_SKIP_WXH_NEON(16, 16) +SAD_SKIP_WXH_NEON(16, 32) + +SAD_SKIP_WXH_NEON(32, 16) +SAD_SKIP_WXH_NEON(32, 32) +SAD_SKIP_WXH_NEON(32, 64) + +SAD_SKIP_WXH_NEON(64, 32) +SAD_SKIP_WXH_NEON(64, 64) + +#undef SAD_SKIP_WXH_NEON + #if defined(__ARM_FEATURE_DOTPROD) static INLINE unsigned int sadwxh_avg_neon(const uint8_t *src_ptr, diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index e3d48f493e..05d031b8cd 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -787,41 +787,43 @@ () specialize qw/vpx_sad4x4 neon msa sse2 mmi/; add_proto qw/unsigned int vpx_sad_skip_64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_64x64 avx2 sse2/; +specialize qw/vpx_sad_skip_64x64 neon avx2 sse2/; add_proto qw/unsigned int vpx_sad_skip_64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_64x32 avx2 sse2/; +specialize qw/vpx_sad_skip_64x32 neon avx2 sse2/; add_proto qw/unsigned int vpx_sad_skip_32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_32x64 avx2 sse2/; +specialize qw/vpx_sad_skip_32x64 neon avx2 sse2/; add_proto qw/unsigned int vpx_sad_skip_32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize 
qw/vpx_sad_skip_32x32 avx2 sse2/; +specialize qw/vpx_sad_skip_32x32 neon avx2 sse2/; add_proto qw/unsigned int vpx_sad_skip_32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_32x16 avx2 sse2/; +specialize qw/vpx_sad_skip_32x16 neon avx2 sse2/; add_proto qw/unsigned int vpx_sad_skip_16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_16x32 sse2/; +specialize qw/vpx_sad_skip_16x32 neon sse2/; add_proto qw/unsigned int vpx_sad_skip_16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_16x16 sse2/; +specialize qw/vpx_sad_skip_16x16 neon sse2/; add_proto qw/unsigned int vpx_sad_skip_16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_16x8 sse2/; +specialize qw/vpx_sad_skip_16x8 neon sse2/; add_proto qw/unsigned int vpx_sad_skip_8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_8x16 sse2/; +specialize qw/vpx_sad_skip_8x16 neon sse2/; add_proto qw/unsigned int vpx_sad_skip_8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_8x8 sse2/; +specialize qw/vpx_sad_skip_8x8 neon sse2/; add_proto qw/unsigned int vpx_sad_skip_8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_8x4 neon/; add_proto qw/unsigned int vpx_sad_skip_4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_4x8 sse2/; +specialize qw/vpx_sad_skip_4x8 neon sse2/; add_proto qw/unsigned int vpx_sad_skip_4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_4x4 neon/; # # Avg From 05b244af52e87ff7dacce78a6db3eab1765e84c8 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 12 Apr 2023 14:48:21 +0100 Subject: [PATCH 673/926] Add Neon implementation of vpx_highbd_sad_skip_x functions Add Neon implementations of high bitdepth downsampling SAD functions for all block sizes. Also add corresponding unit tests. 
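The reduction is the same as in the standard bitdepth case: run the full-width
SAD kernel over every other row and double the result. The only difference is
that the uint8_t pointers carry 16-bit samples, following the library's
CONVERT_TO_SHORTPTR convention. A scalar sketch (illustrative only, not the
actual Neon code):

  static unsigned int highbd_sad_skip_c(const uint8_t *src8, int src_stride,
                                        const uint8_t *ref8, int ref_stride,
                                        int w, int h) {
    /* High bitdepth buffers store uint16_t behind a uint8_t pointer. */
    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
    unsigned int sad = 0;
    int r, c;
    for (r = 0; r < h; r += 2) {
      for (c = 0; c < w; ++c) {
        sad += abs(src[r * src_stride + c] - ref[r * ref_stride + c]);
      }
    }
    return 2 * sad;
  }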
Change-Id: I56ea656e9bb5f8b2aedfdc4637c9ab4e1951b31b --- test/sad_test.cc | 43 ++++++++++++++++++++++++++++++++++- vpx_dsp/arm/highbd_sad_neon.c | 30 ++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 23 +++++++++++-------- 3 files changed, 85 insertions(+), 11 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index e43d9ac41e..eae23cbada 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1142,7 +1142,48 @@ const SadSkipMxNParam skip_neon_tests[] = { SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_neon), SadSkipMxNParam(8, 4, &vpx_sad_skip_8x4_neon), SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_neon), - SadSkipMxNParam(4, 4, &vpx_sad_skip_4x4_neon) + SadSkipMxNParam(4, 4, &vpx_sad_skip_4x4_neon), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(4, 4, &vpx_highbd_sad_skip_4x4_neon, 8), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_neon, 8), + SadSkipMxNParam(8, 4, &vpx_highbd_sad_skip_8x4_neon, 8), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_neon, 8), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_neon, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_neon, 8), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_neon, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_neon, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_neon, 8), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_neon, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_neon, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_neon, 8), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_neon, 8), + SadSkipMxNParam(4, 4, &vpx_highbd_sad_skip_4x4_neon, 10), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_neon, 10), + SadSkipMxNParam(8, 4, &vpx_highbd_sad_skip_8x4_neon, 10), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_neon, 10), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_neon, 10), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_neon, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_neon, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_neon, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_neon, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_neon, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_neon, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_neon, 10), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_neon, 10), + SadSkipMxNParam(4, 4, &vpx_highbd_sad_skip_4x4_neon, 12), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_neon, 12), + SadSkipMxNParam(8, 4, &vpx_highbd_sad_skip_8x4_neon, 12), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_neon, 12), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_neon, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_neon, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_neon, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_neon, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_neon, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_neon, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_neon, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_neon, 12), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_neon, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH }; INSTANTIATE_TEST_SUITE_P(NEON, SADSkipTest, ::testing::ValuesIn(skip_neon_tests)); diff --git a/vpx_dsp/arm/highbd_sad_neon.c b/vpx_dsp/arm/highbd_sad_neon.c index 813710040b..b99bac66cd 100644 --- a/vpx_dsp/arm/highbd_sad_neon.c +++ b/vpx_dsp/arm/highbd_sad_neon.c @@ -179,6 +179,36 @@ HBD_SAD_WXH_NEON(32, 64) HBD_SAD_WXH_NEON(64, 32) 
HBD_SAD_WXH_NEON(64, 64) +#undef HBD_SAD_WXH_NEON + +#define HBD_SAD_SKIP_WXH_NEON(w, h) \ + unsigned int vpx_highbd_sad_skip_##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad##w##xh_neon(src, 2 * src_stride, ref, \ + 2 * ref_stride, (h) / 2); \ + } + +HBD_SAD_SKIP_WXH_NEON(4, 4) +HBD_SAD_SKIP_WXH_NEON(4, 8) + +HBD_SAD_SKIP_WXH_NEON(8, 4) +HBD_SAD_SKIP_WXH_NEON(8, 8) +HBD_SAD_SKIP_WXH_NEON(8, 16) + +HBD_SAD_SKIP_WXH_NEON(16, 8) +HBD_SAD_SKIP_WXH_NEON(16, 16) +HBD_SAD_SKIP_WXH_NEON(16, 32) + +HBD_SAD_SKIP_WXH_NEON(32, 16) +HBD_SAD_SKIP_WXH_NEON(32, 32) +HBD_SAD_SKIP_WXH_NEON(32, 64) + +HBD_SAD_SKIP_WXH_NEON(64, 32) +HBD_SAD_SKIP_WXH_NEON(64, 64) + +#undef HBD_SAD_SKIP_WXH_NEON + static INLINE uint32_t highbd_sad4xh_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 05d031b8cd..7bea738952 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1068,40 +1068,43 @@ () specialize qw/vpx_highbd_sad4x4 neon/; add_proto qw/unsigned int vpx_highbd_sad_skip_64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad_skip_64x64 sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_64x64 neon sse2 avx2/; add_proto qw/unsigned int vpx_highbd_sad_skip_64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad_skip_64x32 sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_64x32 neon sse2 avx2/; add_proto qw/unsigned int vpx_highbd_sad_skip_32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad_skip_32x64 sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_32x64 neon sse2 avx2/; add_proto qw/unsigned int vpx_highbd_sad_skip_32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad_skip_32x32 sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_32x32 neon sse2 avx2/; add_proto qw/unsigned int vpx_highbd_sad_skip_32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad_skip_32x16 sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_32x16 neon sse2 avx2/; add_proto qw/unsigned int vpx_highbd_sad_skip_16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad_skip_16x32 sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_16x32 neon sse2 avx2/; add_proto qw/unsigned int vpx_highbd_sad_skip_16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad_skip_16x16 sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_16x16 neon sse2 avx2/; add_proto qw/unsigned int vpx_highbd_sad_skip_16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad_skip_16x8 sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_16x8 neon sse2 avx2/; add_proto qw/unsigned int vpx_highbd_sad_skip_8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad_skip_8x16 sse2/; + specialize qw/vpx_highbd_sad_skip_8x16 neon sse2/; add_proto qw/unsigned int vpx_highbd_sad_skip_8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad_skip_8x8 sse2/; + specialize 
qw/vpx_highbd_sad_skip_8x8 neon sse2/; add_proto qw/unsigned int vpx_highbd_sad_skip_8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_8x4 neon/; add_proto qw/unsigned int vpx_highbd_sad_skip_4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_4x8 neon/; add_proto qw/unsigned int vpx_highbd_sad_skip_4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_4x4 neon/; # # Avg From 42c0cbb9cb114af824083f4e6f0e757985b8942f Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 12 Apr 2023 17:38:24 +0100 Subject: [PATCH 674/926] Add Neon implementation of vpx_sad_skip_xx4d functions Add Neon implementations of standard bitdepth downsampling SAD4D functions for all block sizes. Also add corresponding unit tests. Change-Id: Ieb77661ea2bbe357529862a5fb54956e34e8d758 --- test/sad_test.cc | 18 ++++++++++++++++++ vpx_dsp/arm/sad4d_neon.c | 32 ++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 24 +++++++++++++----------- 3 files changed, 63 insertions(+), 11 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index eae23cbada..32787db79d 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1300,6 +1300,24 @@ const SadMxNx4Param x4d_neon_tests[] = { #endif // CONFIG_VP9_HIGHBITDEPTH }; INSTANTIATE_TEST_SUITE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests)); + +const SadSkipMxNx4Param skip_x4d_neon_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_neon), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_neon), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_neon), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_neon), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_neon), + SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_neon), + SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_neon), + SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_neon), + SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_neon), + SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_neon), + SadSkipMxNx4Param(8, 4, &vpx_sad_skip_8x4x4d_neon), + SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_neon), + SadSkipMxNx4Param(4, 4, &vpx_sad_skip_4x4x4d_neon), +}; +INSTANTIATE_TEST_SUITE_P(NEON, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_neon_tests)); #endif // HAVE_NEON //------------------------------------------------------------------------------ diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 6ad6c96214..44cd990280 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -308,3 +308,35 @@ SAD_WXH_4D_NEON(64, 32) SAD_WXH_4D_NEON(64, 64) #undef SAD_WXH_4D_NEON + +#define SAD_SKIP_WXH_4D_NEON(w, h) \ + void vpx_sad_skip_##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], \ + int ref_stride, uint32_t res[4]) { \ + sad##w##xhx4d_neon(src, 2 * src_stride, ref, 2 * ref_stride, res, \ + ((h) >> 1)); \ + res[0] <<= 1; \ + res[1] <<= 1; \ + res[2] <<= 1; \ + res[3] <<= 1; \ + } + +SAD_SKIP_WXH_4D_NEON(4, 4) +SAD_SKIP_WXH_4D_NEON(4, 8) + +SAD_SKIP_WXH_4D_NEON(8, 4) +SAD_SKIP_WXH_4D_NEON(8, 8) +SAD_SKIP_WXH_4D_NEON(8, 16) + +SAD_SKIP_WXH_4D_NEON(16, 8) +SAD_SKIP_WXH_4D_NEON(16, 16) +SAD_SKIP_WXH_4D_NEON(16, 32) + +SAD_SKIP_WXH_4D_NEON(32, 16) +SAD_SKIP_WXH_4D_NEON(32, 32) +SAD_SKIP_WXH_4D_NEON(32, 64) + +SAD_SKIP_WXH_4D_NEON(64, 32) +SAD_SKIP_WXH_4D_NEON(64, 64) + +#undef SAD_SKIP_WXH_4D_NEON diff --git 
a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 7bea738952..4c5fab3189 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -968,41 +968,43 @@ () specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/; add_proto qw/void vpx_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_64x64x4d avx2 sse2/; +specialize qw/vpx_sad_skip_64x64x4d neon avx2 sse2/; add_proto qw/void vpx_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_64x32x4d avx2 sse2/; +specialize qw/vpx_sad_skip_64x32x4d neon avx2 sse2/; add_proto qw/void vpx_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_32x64x4d avx2 sse2/; +specialize qw/vpx_sad_skip_32x64x4d neon avx2 sse2/; add_proto qw/void vpx_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_32x32x4d avx2 sse2/; +specialize qw/vpx_sad_skip_32x32x4d neon avx2 sse2/; add_proto qw/void vpx_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_32x16x4d avx2 sse2/; +specialize qw/vpx_sad_skip_32x16x4d neon avx2 sse2/; add_proto qw/void vpx_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_16x32x4d sse2/; +specialize qw/vpx_sad_skip_16x32x4d neon sse2/; add_proto qw/void vpx_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_16x16x4d sse2/; +specialize qw/vpx_sad_skip_16x16x4d neon sse2/; add_proto qw/void vpx_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_16x8x4d sse2/; +specialize qw/vpx_sad_skip_16x8x4d neon sse2/; add_proto qw/void vpx_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_8x16x4d sse2/; +specialize qw/vpx_sad_skip_8x16x4d neon sse2/; add_proto qw/void vpx_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_8x8x4d sse2/; +specialize qw/vpx_sad_skip_8x8x4d neon sse2/; add_proto qw/void vpx_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_8x4x4d neon/; add_proto qw/void vpx_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_4x8x4d sse2/; +specialize qw/vpx_sad_skip_4x8x4d neon sse2/; add_proto qw/void vpx_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_4x4x4d neon/; add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size"; specialize 
qw/vpx_sum_squares_2d_i16 neon sse2 msa/; From ab830fe6a1272bf84fdbc3337cf161f3dd433ce1 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 12 Apr 2023 17:50:01 +0100 Subject: [PATCH 675/926] Add Neon implementations of vpx_highbd_sad_skip_xx4d Add Neon implementations of high bitdepth downsampling SAD4D functions for all block sizes. Also add corresponding unit tests. Change-Id: Ib0c2f852e269cbd6cbb8f4dfb54349654abb0adb --- test/sad_test.cc | 38 +++++++++++++++++++++++++++++++++ vpx_dsp/arm/highbd_sad4d_neon.c | 34 +++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 24 +++++++++++---------- 3 files changed, 85 insertions(+), 11 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 32787db79d..92b3a14d68 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1315,6 +1315,44 @@ const SadSkipMxNx4Param skip_x4d_neon_tests[] = { SadSkipMxNx4Param(8, 4, &vpx_sad_skip_8x4x4d_neon), SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_neon), SadSkipMxNx4Param(4, 4, &vpx_sad_skip_4x4x4d_neon), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(4, 4, &vpx_highbd_sad_skip_4x4x4d_neon, 8), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_neon, 8), + SadSkipMxNx4Param(8, 4, &vpx_highbd_sad_skip_8x4x4d_neon, 8), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_neon, 8), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_neon, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_neon, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_neon, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_neon, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_neon, 8), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_neon, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_neon, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_neon, 8), + SadSkipMxNx4Param(4, 4, &vpx_highbd_sad_skip_4x4x4d_neon, 10), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_neon, 10), + SadSkipMxNx4Param(8, 4, &vpx_highbd_sad_skip_8x4x4d_neon, 10), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_neon, 10), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_neon, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_neon, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_neon, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_neon, 10), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_neon, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_neon, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_neon, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_neon, 10), + SadSkipMxNx4Param(4, 4, &vpx_highbd_sad_skip_4x4x4d_neon, 12), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_neon, 12), + SadSkipMxNx4Param(8, 4, &vpx_highbd_sad_skip_8x4x4d_neon, 12), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_neon, 12), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_neon, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_neon, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_neon, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_neon, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_neon, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_neon, 12), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_neon, 12), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_neon, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH }; INSTANTIATE_TEST_SUITE_P(NEON, SADSkipx4Test, 
::testing::ValuesIn(skip_x4d_neon_tests)); diff --git a/vpx_dsp/arm/highbd_sad4d_neon.c b/vpx_dsp/arm/highbd_sad4d_neon.c index 280d2087f7..62c4685a7a 100644 --- a/vpx_dsp/arm/highbd_sad4d_neon.c +++ b/vpx_dsp/arm/highbd_sad4d_neon.c @@ -236,3 +236,37 @@ HBD_SAD_WXH_4D_NEON(32, 64) HBD_SAD_WXH_4D_NEON(64, 32) HBD_SAD_WXH_4D_NEON(64, 64) + +#undef HBD_SAD_WXH_4D_NEON + +#define HBD_SAD_SKIP_WXH_4D_NEON(w, h) \ + void vpx_highbd_sad_skip_##w##x##h##x4d_neon( \ + const uint8_t *src, int src_stride, const uint8_t *const ref[4], \ + int ref_stride, uint32_t res[4]) { \ + highbd_sad##w##xhx4d_neon(src, 2 * src_stride, ref, 2 * ref_stride, res, \ + ((h) >> 1)); \ + res[0] <<= 1; \ + res[1] <<= 1; \ + res[2] <<= 1; \ + res[3] <<= 1; \ + } + +HBD_SAD_SKIP_WXH_4D_NEON(4, 4) +HBD_SAD_SKIP_WXH_4D_NEON(4, 8) + +HBD_SAD_SKIP_WXH_4D_NEON(8, 4) +HBD_SAD_SKIP_WXH_4D_NEON(8, 8) +HBD_SAD_SKIP_WXH_4D_NEON(8, 16) + +HBD_SAD_SKIP_WXH_4D_NEON(16, 8) +HBD_SAD_SKIP_WXH_4D_NEON(16, 16) +HBD_SAD_SKIP_WXH_4D_NEON(16, 32) + +HBD_SAD_SKIP_WXH_4D_NEON(32, 16) +HBD_SAD_SKIP_WXH_4D_NEON(32, 32) +HBD_SAD_SKIP_WXH_4D_NEON(32, 64) + +HBD_SAD_SKIP_WXH_4D_NEON(64, 32) +HBD_SAD_SKIP_WXH_4D_NEON(64, 64) + +#undef HBD_SAD_SKIP_WXH_4D_NEON diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 4c5fab3189..bde0115298 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1202,41 +1202,43 @@ () specialize qw/vpx_highbd_sad4x4x4d sse2 neon/; add_proto qw/void vpx_highbd_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad_skip_64x64x4d sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_64x64x4d neon sse2 avx2/; add_proto qw/void vpx_highbd_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad_skip_64x32x4d sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_64x32x4d neon sse2 avx2/; add_proto qw/void vpx_highbd_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad_skip_32x64x4d sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_32x64x4d neon sse2 avx2/; add_proto qw/void vpx_highbd_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad_skip_32x32x4d sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_32x32x4d neon sse2 avx2/; add_proto qw/void vpx_highbd_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad_skip_32x16x4d sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_32x16x4d neon sse2 avx2/; add_proto qw/void vpx_highbd_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad_skip_16x32x4d sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_16x32x4d neon sse2 avx2/; add_proto qw/void vpx_highbd_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad_skip_16x16x4d sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_16x16x4d neon sse2 avx2/; add_proto qw/void vpx_highbd_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const 
uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad_skip_16x8x4d sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_16x8x4d neon sse2 avx2/; add_proto qw/void vpx_highbd_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad_skip_8x16x4d sse2/; + specialize qw/vpx_highbd_sad_skip_8x16x4d neon sse2/; add_proto qw/void vpx_highbd_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad_skip_8x8x4d sse2/; + specialize qw/vpx_highbd_sad_skip_8x8x4d neon sse2/; add_proto qw/void vpx_highbd_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_8x4x4d neon/; add_proto qw/void vpx_highbd_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad_skip_4x8x4d sse2/; + specialize qw/vpx_highbd_sad_skip_4x8x4d neon sse2/; add_proto qw/void vpx_highbd_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_4x4x4d neon/; # # Structured Similarity (SSIM) From 933cf345dd0c09cb55e862d6413773b18e2f1404 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 19 Apr 2023 13:51:05 -0700 Subject: [PATCH 676/926] onyx_if,encode_frame_to_data_rate: rm unused var quiets -Wunused-but-set-variable with clang-17 Change-Id: Ia819beac84cbd57f4eeca6174c785fd320bc40c6 --- vp8/encoder/onyx_if.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 44a02b6ddc..a780048073 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -3203,7 +3203,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, int frame_under_shoot_limit; int Loop = 0; - int loop_count; VP8_COMMON *cm = &cpi->common; int active_worst_qchanged = 0; @@ -3769,8 +3768,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, vp8_save_coding_context(cpi); - loop_count = 0; - scale_and_extend_source(cpi->un_scaled_source, cpi); #if CONFIG_TEMPORAL_DENOISING && CONFIG_POSTPROC @@ -3993,7 +3990,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, q_low = cpi->active_best_quality; q_high = cpi->active_worst_quality; - loop_count++; Loop = 1; continue; @@ -4219,7 +4215,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, if (Loop == 1) { vp8_restore_coding_context(cpi); - loop_count++; #if CONFIG_INTERNAL_STATS cpi->tot_recode_hits++; #endif From 84b4dfa5ba9c588b13da60ace9c1ba5b85eeaa41 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 19 Apr 2023 13:53:34 -0700 Subject: [PATCH 677/926] vp9_encodeframe: rm unused vars in get_rdmult_delta() and compute_frame_aq_offset(). 
quiets -Wunused-but-set-variable with clang-17 Change-Id: I726852f3bc42afa80a18475de910040a9436b0bb --- vp9/encoder/vp9_encodeframe.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 26e419e3d5..3a042399cb 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -3710,7 +3710,6 @@ static int get_rdmult_delta(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int row, col; int dr = 0; - int count = 0; double r0, rk, beta; TplDepFrame *tpl_frame; @@ -3734,8 +3733,6 @@ static int get_rdmult_delta(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, intra_cost += this_stats->intra_cost; mc_dep_cost += this_stats->mc_dep_cost; - - ++count; } } @@ -6185,7 +6182,6 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) { int mi_row, mi_col; int sum_delta = 0; - int map_index = 0; int qdelta_index; int segment_id; @@ -6195,7 +6191,6 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) { segment_id = mi_8x8[0]->segment_id; qdelta_index = get_segdata(seg, segment_id, SEG_LVL_ALT_Q); sum_delta += qdelta_index; - map_index++; } mi_8x8_ptr += cm->mi_stride; } From 895317cdf122124a47fc2bfb4478504169c1bc3c Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 19 Apr 2023 13:55:35 -0700 Subject: [PATCH 678/926] vp9_ratectrl,vp9_encodedframe_overshoot: rm unused var quiets -Wunused-but-set-variable with clang-17 Change-Id: I5212a20286d0252e45a8e8813d15cb780494b0ad --- vp9/encoder/vp9_ratectrl.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 9e152629fb..13b43aa63a 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -3272,11 +3272,9 @@ int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) { MODE_INFO **mi = cm->mi_grid_visible; int sum_intra_usage = 0; int mi_row, mi_col; - int tot = 0; for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) { for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) { if (mi[0]->ref_frame[0] == INTRA_FRAME) sum_intra_usage++; - tot++; mi++; } mi += 8; From 4366ff722297e7e57c158db9b41d61cf1a056bf6 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 19 Apr 2023 13:58:39 -0700 Subject: [PATCH 679/926] vp9_spatial_svc_encoder: quiet -Wunused-but-set-variable with clang-17. Move frames_received under OUTPUT_FRAME_STATS; it's only used in a printf. 
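The per-frame log remains available; re-enable it by changing the new guard at
the top of the file:

  #define OUTPUT_FRAME_STATS 1 /* default is 0 */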
Change-Id: Idfdd59ccd04e43df1855203db82bb4c8a1d059fb --- examples/vp9_spatial_svc_encoder.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c index d287e58319..9d37ed0244 100644 --- a/examples/vp9_spatial_svc_encoder.c +++ b/examples/vp9_spatial_svc_encoder.c @@ -32,6 +32,7 @@ #include "vp9/encoder/vp9_encoder.h" #include "./y4minput.h" +#define OUTPUT_FRAME_STATS 0 #define OUTPUT_RC_STATS 1 #define SIMULCAST_MODE 0 @@ -880,7 +881,9 @@ int main(int argc, const char **argv) { int pts = 0; /* PTS starts at 0 */ int frame_duration = 1; /* 1 timebase tick per frame */ int end_of_stream = 0; +#if OUTPUT_FRAME_STATS int frames_received = 0; +#endif #if OUTPUT_RC_STATS VpxVideoWriter *outfile[VPX_SS_MAX_LAYERS] = { NULL }; struct RateControlStats rc; @@ -1126,14 +1129,14 @@ int main(int argc, const char **argv) { } #endif } - /* +#if OUTPUT_FRAME_STATS printf("SVC frame: %d, kf: %d, size: %d, pts: %d\n", frames_received, !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY), (int)cx_pkt->data.frame.sz, (int)cx_pkt->data.frame.pts); - */ + ++frames_received; +#endif if (enc_cfg.ss_number_layers == 1 && enc_cfg.ts_number_layers == 1) si->bytes_sum[0] += (int)cx_pkt->data.frame.sz; - ++frames_received; #if CONFIG_VP9_DECODER && !SIMULCAST_MODE if (vpx_codec_decode(&decoder, cx_pkt->data.frame.buf, (unsigned int)cx_pkt->data.frame.sz, NULL, 0)) From e8fa7a038b2a536eb49bdaf53a24e54f20044e7b Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 19 Apr 2023 18:58:59 -0700 Subject: [PATCH 680/926] libs.mk: quote $(LIBVPX_TEST_DATA_PATH) This allows the testdata target to work in environments like cygwin/msys when a Windows-style path is used. It may also fix using paths with spaces, though that's not generally recommended.
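For example, under msys a Windows-style data path (path shown is illustrative)
now works:

  make testdata LIBVPX_TEST_DATA_PATH="c:/Users/builder/libvpx-test-data"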
Change-Id: Id444c14468b05d589bce49c1f612aa712a3f0c8c --- libs.mk | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libs.mk b/libs.mk index 92cf5509fb..1411fee9a1 100644 --- a/libs.mk +++ b/libs.mk @@ -545,7 +545,7 @@ testdata: $(LIBVPX_TEST_DATA) echo "Checking test data:";\ for f in $(call enabled,LIBVPX_TEST_DATA); do\ grep $$f $(SRC_PATH_BARE)/test/test-data.sha1 |\ - (cd $(LIBVPX_TEST_DATA_PATH); $${sha1sum} -c);\ + (cd "$(LIBVPX_TEST_DATA_PATH)"; $${sha1sum} -c);\ done; \ else\ echo "Skipping test data integrity check, sha1sum not found.";\ @@ -764,10 +764,10 @@ TEST_BIN_PATH := $(addsuffix /$(TGT_OS:win64=x64)/Release, $(TEST_BIN_PATH)) endif utiltest utiltest-no-data-check: $(qexec)$(SRC_PATH_BARE)/test/vpxdec.sh \ - --test-data-path $(LIBVPX_TEST_DATA_PATH) \ + --test-data-path "$(LIBVPX_TEST_DATA_PATH)" \ --bin-path $(TEST_BIN_PATH) $(qexec)$(SRC_PATH_BARE)/test/vpxenc.sh \ - --test-data-path $(LIBVPX_TEST_DATA_PATH) \ + --test-data-path "$(LIBVPX_TEST_DATA_PATH)" \ --bin-path $(TEST_BIN_PATH) utiltest: testdata else @@ -791,7 +791,7 @@ EXAMPLES_BIN_PATH := $(TGT_OS:win64=x64)/Release endif exampletest exampletest-no-data-check: examples $(qexec)$(SRC_PATH_BARE)/test/examples.sh \ - --test-data-path $(LIBVPX_TEST_DATA_PATH) \ + --test-data-path "$(LIBVPX_TEST_DATA_PATH)" \ --bin-path $(EXAMPLES_BIN_PATH) exampletest: testdata else From f7d5c3eff865299e0915f9683fd28e325dcb75a9 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 20 Apr 2023 12:17:05 -0700 Subject: [PATCH 681/926] configure: skip arm64_neon.h workaround w/VS >= 2019 Visual Studio 2019+ include arm64_neon.h from arm_neon.h Bug: b/277255076 Change-Id: I52f42b69a5efe8214a4c541b68e940ad07499584 --- build/make/configure.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/build/make/configure.sh b/build/make/configure.sh index 4bf090f006..32105651ff 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -1066,8 +1066,11 @@ EOF enable_feature win_arm64_neon_h_workaround else # If a probe is not possible, assume this is the pure Windows - # SDK and so the workaround is necessary. - enable_feature win_arm64_neon_h_workaround + # SDK and so the workaround is necessary when using Visual + # Studio < 2019. + if [ ${tgt_cc##vs} -lt 16 ]; then + enable_feature win_arm64_neon_h_workaround + fi fi fi fi From f49879a2a3420fd140692f3b23c5c57b6298c954 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 20 Apr 2023 11:06:32 -0400 Subject: [PATCH 682/926] Store tpl stats before propagation Add two new structs TplBlockStats and TplFrameStats to store tpl stats before propagation Change-Id: I903db99326b199ed8f2d8b19ccb973a8c8910501 --- vp9/encoder/vp9_encoder.c | 5 ++++- vp9/encoder/vp9_encoder.h | 18 ++++++++++++++++++ vp9/encoder/vp9_tpl_model.c | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 354f08eae8..662ec24b83 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2622,7 +2622,10 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, #if CONFIG_NON_GREEDY_MV cpi->tpl_ready = 0; #endif // CONFIG_NON_GREEDY_MV - for (i = 0; i < MAX_ARF_GOP_SIZE; ++i) cpi->tpl_stats[i].tpl_stats_ptr = NULL; + for (i = 0; i < MAX_ARF_GOP_SIZE; ++i) { + cpi->tpl_stats[i].tpl_stats_ptr = NULL; + cpi->tpl_frame_stats[i].block_stats_list = NULL; + } // Allocate memory to store variances for a frame. 
CHECK_MEM_ERROR(cm, cpi->source_diff_var, diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 9e5e64629e..43789864c7 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -324,6 +324,22 @@ typedef struct TplDepFrame { #endif } TplDepFrame; +// Used to store the stats before propagation. +typedef struct TplBlockStats { + int64_t intra_cost; + int64_t inter_cost; + int_mv mv; + int64_t recrf_rate; + int64_t recrf_dist; + int ref_frame_index; +} TplBlockStats; + +typedef struct TplFrameStats { + int frame_width; + int frame_height; + TplBlockStats *block_stats_list; +} TplFrameStats; + #define TPL_DEP_COST_SCALE_LOG2 4 // TODO(jingning) All spatially adaptive variables should go to TileDataEnc. @@ -743,6 +759,8 @@ typedef struct VP9_COMP { BLOCK_SIZE tpl_bsize; TplDepFrame tpl_stats[MAX_ARF_GOP_SIZE]; + // Used to store TPL stats before propagation + TplFrameStats tpl_frame_stats[MAX_ARF_GOP_SIZE]; YV12_BUFFER_CONFIG *tpl_recon_frames[REF_FRAMES]; EncFrameBuf enc_frame_buf[REF_FRAMES]; #if CONFIG_MULTITHREAD diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index 53ef356981..c9565e8cab 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -157,9 +157,13 @@ static void init_tpl_stats(VP9_COMP *cpi) { int frame_idx; for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + TplFrameStats *tpl_frame_stats = &cpi->tpl_frame_stats[frame_idx]; memset(tpl_frame->tpl_stats_ptr, 0, tpl_frame->height * tpl_frame->width * sizeof(*tpl_frame->tpl_stats_ptr)); + memset(tpl_frame_stats->block_stats_list, 0, + tpl_frame->height * tpl_frame->width * + sizeof(*tpl_frame_stats->block_stats_list)); tpl_frame->is_valid = 0; } } @@ -355,6 +359,27 @@ static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col, } } +static void tpl_store_before_propagation(TplBlockStats *tpl_block_stats, + TplDepStats *tpl_stats, int mi_row, + int mi_col, BLOCK_SIZE bsize, + int stride) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col]; + int idx, idy; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplBlockStats *tpl_block_stats_ptr = + &tpl_block_stats[(mi_row + idy) * stride + mi_col + idx]; + tpl_block_stats_ptr->inter_cost = src_stats->inter_cost; + tpl_block_stats_ptr->intra_cost = src_stats->intra_cost; + tpl_block_stats_ptr->mv = src_stats->mv; + tpl_block_stats_ptr->ref_frame_index = src_stats->ref_frame_index; + } + } +} + static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, int mi_row, int mi_col, const BLOCK_SIZE bsize) { TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index]; @@ -1062,6 +1087,8 @@ static void build_motion_field( static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, int frame_idx, BLOCK_SIZE bsize) { TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + TplFrameStats *tpl_frame_stats_before_propagation = + &cpi->tpl_frame_stats[frame_idx]; YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame; YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES] = { NULL, NULL, NULL }; @@ -1158,6 +1185,10 @@ static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride); + tpl_store_before_propagation( + 
tpl_frame_stats_before_propagation->block_stats_list, + tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride); + tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize); } @@ -1294,6 +1325,11 @@ void vp9_init_tpl_buffer(VP9_COMP *cpi) { CHECK_MEM_ERROR(cm, cpi->tpl_stats[frame].tpl_stats_ptr, vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr))); + vpx_free(cpi->tpl_frame_stats[frame].block_stats_list); + CHECK_MEM_ERROR( + cm, cpi->tpl_frame_stats[frame].block_stats_list, + vpx_calloc(mi_rows * mi_cols, + sizeof(*cpi->tpl_frame_stats[frame].block_stats_list))); cpi->tpl_stats[frame].is_valid = 0; cpi->tpl_stats[frame].width = mi_cols; cpi->tpl_stats[frame].height = mi_rows; @@ -1324,6 +1360,7 @@ void vp9_free_tpl_buffer(VP9_COMP *cpi) { #endif vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); cpi->tpl_stats[frame].is_valid = 0; + vpx_free(cpi->tpl_frame_stats[frame].block_stats_list); } } From b27cf67c30eee263d1d41a5953e63c976fd82365 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 20 Apr 2023 14:17:49 -0700 Subject: [PATCH 683/926] register_state_check: clear -Wshadow warning with --target=x86_64-win64-gcc Bug: webm:1793 Change-Id: I265533af4e8d05adbe1d66a62b6dcb191ca48747 --- test/register_state_check.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/test/register_state_check.h b/test/register_state_check.h index 0b837dd042..ede86ef52f 100644 --- a/test/register_state_check.h +++ b/test/register_state_check.h @@ -184,13 +184,13 @@ class RegisterStateCheckMMX { uint16_t pre_fpu_env_[14]; }; -#define API_REGISTER_STATE_CHECK(statement) \ - do { \ - { \ - libvpx_test::RegisterStateCheckMMX reg_check; \ - ASM_REGISTER_STATE_CHECK(statement); \ - } \ - __asm__ volatile("" ::: "memory"); \ +#define API_REGISTER_STATE_CHECK(statement) \ + do { \ + { \ + libvpx_test::RegisterStateCheckMMX reg_check_mmx; \ + ASM_REGISTER_STATE_CHECK(statement); \ + } \ + __asm__ volatile("" ::: "memory"); \ } while (false) } // namespace libvpx_test From 3c59378e4eac2d241fba8b26e660318b850e5773 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 20 Apr 2023 15:09:00 -0400 Subject: [PATCH 684/926] Calculate recrf_dist and recrf_rate Change-Id: I74e74807436b92d729e2ccaab96149780f1f52d9 --- vp9/encoder/vp9_tpl_model.c | 52 ++++++++++++++++++++++++++++--------- vp9/encoder/vp9_tpl_model.h | 2 ++ 2 files changed, 42 insertions(+), 12 deletions(-) diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index 0f9df78462..81c319c9fd 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -362,7 +362,8 @@ static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col, static void tpl_store_before_propagation(TplBlockStats *tpl_block_stats, TplDepStats *tpl_stats, int mi_row, int mi_col, BLOCK_SIZE bsize, - int stride) { + int stride, int64_t recon_error, + int64_t rate_cost) { const int mi_height = num_8x8_blocks_high_lookup[bsize]; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col]; @@ -374,6 +375,8 @@ static void tpl_store_before_propagation(TplBlockStats *tpl_block_stats, &tpl_block_stats[(mi_row + idy) * stride + mi_col + idx]; tpl_block_stats_ptr->inter_cost = src_stats->inter_cost; tpl_block_stats_ptr->intra_cost = src_stats->intra_cost; + tpl_block_stats_ptr->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; + tpl_block_stats_ptr->recrf_rate = rate_cost; tpl_block_stats_ptr->mv = 
src_stats->mv; tpl_block_stats_ptr->ref_frame_index = src_stats->ref_frame_index; } @@ -455,12 +458,11 @@ static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, tran_low_t *qcoeff, tran_low_t *dqcoeff, TX_SIZE tx_size, int64_t *recon_error, - int64_t *sse) { + int64_t *sse, uint16_t *eob) { MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; - uint16_t eob; int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; const int shift = tx_size == TX_32X32 ? 0 : 2; @@ -470,16 +472,16 @@ static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { vp9_highbd_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, &eob, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } else { vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, &eob, scan_order->scan, + dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } #else vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, &eob, scan_order->scan, + dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); #endif // CONFIG_VP9_HIGHBITDEPTH @@ -523,6 +525,19 @@ static void set_mv_limits(const VP9_COMMON *cm, MACROBLOCK *x, int mi_row, ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND); } +static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) { + const ScanOrder *const scan_order = &vp9_scan_orders[tx_size][DCT_DCT]; + int rate_cost = 1; + int idx; + assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob); + for (idx = 0; idx < eob; ++idx) { + unsigned int abs_level = abs(qcoeff[scan_order->scan[idx]]); + rate_cost += get_msb(abs_level + 1) + 1 + (abs_level > 0); + } + + return (rate_cost << VP9_PROB_COST_SHIFT); +} + static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, struct scale_factors *sf, GF_PICTURE *gf_picture, int frame_idx, TplDepFrame *tpl_frame, @@ -530,7 +545,8 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row, int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size, YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor, - int64_t *recon_error, int64_t *sse) { + int64_t *recon_error, int64_t *rate_cost, + int64_t *sse) { VP9_COMMON *cm = &cpi->common; ThreadData *td = &cpi->td; @@ -553,6 +569,7 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, const int mi_width = num_8x8_blocks_wide_lookup[bsize]; TplDepStats *tpl_stats = &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + uint16_t eob = 0; xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8; @@ -606,6 +623,8 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { int_mv mv; + int64_t this_recon_error = 0; + int64_t this_rate = 0; #if CONFIG_NON_GREEDY_MV MotionField *motion_field; #endif @@ -657,12 +676,17 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, inter_cost = vpx_satd(coeff, pix_num); #endif + 
get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, &this_recon_error, + sse, &eob); + + this_rate = rate_estimator(qcoeff, eob, tx_size); + *rate_cost += this_rate; + *recon_error += this_recon_error; + if (inter_cost < best_inter_cost) { best_rf_idx = rf_idx; best_inter_cost = inter_cost; best_mv.as_int = mv.as_int; - get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error, - sse); } } best_intra_cost = VPXMAX(best_intra_cost, 1); @@ -1115,7 +1139,6 @@ static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, const TX_SIZE tx_size = max_txsize_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - int64_t recon_error, sse; // Setup scaling factor #if CONFIG_VP9_HIGHBITDEPTH @@ -1178,16 +1201,21 @@ static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + int64_t recon_error = 0; + int64_t rate_cost = 0; + int64_t sse = 0; mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame, src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize, - tx_size, ref_frame, predictor, &recon_error, &sse); + tx_size, ref_frame, predictor, &recon_error, &rate_cost, + &sse); // Motion flow dependency dispenser. tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride); tpl_store_before_propagation( tpl_frame_stats_before_propagation->block_stats_list, - tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride); + tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride, + recon_error, rate_cost); tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize); diff --git a/vp9/encoder/vp9_tpl_model.h b/vp9/encoder/vp9_tpl_model.h index 86a7734f82..04beb22610 100644 --- a/vp9/encoder/vp9_tpl_model.h +++ b/vp9/encoder/vp9_tpl_model.h @@ -20,6 +20,8 @@ extern "C" { #endif #define log2f(x) (log(x) / (float)M_LOG2_E) +#define TPL_DEP_COST_SCALE_LOG2 4 + typedef struct GF_PICTURE { YV12_BUFFER_CONFIG *frame; int ref_frame[3]; From a425371ccdfc5a6faf17af216e16c2ad2ccb4d05 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 21 Apr 2023 18:10:46 +0000 Subject: [PATCH 685/926] Revert "Calculate recrf_dist and recrf_rate" This reverts commit 3c59378e4eac2d241fba8b26e660318b850e5773. Reason for revert: recon_error and recon_rate are summed by mistake across reference frames, as pointed out by Angie. It could also cause vp9 behavior changes.
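Concretely, the reverted change accumulated the per-reference values
unconditionally inside the rf_idx loop:

  *rate_cost += this_rate;
  *recon_error += this_recon_error;

where the intent is to keep only the values belonging to the chosen (best)
reference, e.g. (sketch of a possible fix, not part of this revert):

  if (inter_cost < best_inter_cost) {
    best_rf_idx = rf_idx;
    best_inter_cost = inter_cost;
    best_mv.as_int = mv.as_int;
    *recon_error = this_recon_error; /* keep the best reference only */
    *rate_cost = this_rate;
  }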
Original change's description: > Calculate recrf_dist and recrf_rate > > Change-Id: I74e74807436b92d729e2ccaab96149780f1f52d9 Change-Id: I6106ce77cb0fe8c12b2bcf070d01513ffa8dc613 No-Presubmit: true No-Tree-Checks: true No-Try: true --- vp9/encoder/vp9_tpl_model.c | 52 +++++++++---------------------------- vp9/encoder/vp9_tpl_model.h | 2 -- 2 files changed, 12 insertions(+), 42 deletions(-) diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index 81c319c9fd..0f9df78462 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -362,8 +362,7 @@ static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col, static void tpl_store_before_propagation(TplBlockStats *tpl_block_stats, TplDepStats *tpl_stats, int mi_row, int mi_col, BLOCK_SIZE bsize, - int stride, int64_t recon_error, - int64_t rate_cost) { + int stride) { const int mi_height = num_8x8_blocks_high_lookup[bsize]; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col]; @@ -375,8 +374,6 @@ static void tpl_store_before_propagation(TplBlockStats *tpl_block_stats, &tpl_block_stats[(mi_row + idy) * stride + mi_col + idx]; tpl_block_stats_ptr->inter_cost = src_stats->inter_cost; tpl_block_stats_ptr->intra_cost = src_stats->intra_cost; - tpl_block_stats_ptr->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; - tpl_block_stats_ptr->recrf_rate = rate_cost; tpl_block_stats_ptr->mv = src_stats->mv; tpl_block_stats_ptr->ref_frame_index = src_stats->ref_frame_index; } @@ -458,11 +455,12 @@ static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, tran_low_t *qcoeff, tran_low_t *dqcoeff, TX_SIZE tx_size, int64_t *recon_error, - int64_t *sse, uint16_t *eob) { + int64_t *sse) { MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; + uint16_t eob; int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; const int shift = tx_size == TX_32X32 ? 
0 : 2; @@ -472,16 +470,16 @@ static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { vp9_highbd_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, eob, + qcoeff, dqcoeff, pd->dequant, &eob, scan_order->scan, scan_order->iscan); } else { vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, + dqcoeff, pd->dequant, &eob, scan_order->scan, scan_order->iscan); } #else vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, + dqcoeff, pd->dequant, &eob, scan_order->scan, scan_order->iscan); #endif // CONFIG_VP9_HIGHBITDEPTH @@ -525,19 +523,6 @@ static void set_mv_limits(const VP9_COMMON *cm, MACROBLOCK *x, int mi_row, ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND); } -static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) { - const ScanOrder *const scan_order = &vp9_scan_orders[tx_size][DCT_DCT]; - int rate_cost = 1; - int idx; - assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob); - for (idx = 0; idx < eob; ++idx) { - unsigned int abs_level = abs(qcoeff[scan_order->scan[idx]]); - rate_cost += get_msb(abs_level + 1) + 1 + (abs_level > 0); - } - - return (rate_cost << VP9_PROB_COST_SHIFT); -} - static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, struct scale_factors *sf, GF_PICTURE *gf_picture, int frame_idx, TplDepFrame *tpl_frame, @@ -545,8 +530,7 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row, int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size, YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor, - int64_t *recon_error, int64_t *rate_cost, - int64_t *sse) { + int64_t *recon_error, int64_t *sse) { VP9_COMMON *cm = &cpi->common; ThreadData *td = &cpi->td; @@ -569,7 +553,6 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, const int mi_width = num_8x8_blocks_wide_lookup[bsize]; TplDepStats *tpl_stats = &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; - uint16_t eob = 0; xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8; @@ -623,8 +606,6 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { int_mv mv; - int64_t this_recon_error = 0; - int64_t this_rate = 0; #if CONFIG_NON_GREEDY_MV MotionField *motion_field; #endif @@ -676,17 +657,12 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, inter_cost = vpx_satd(coeff, pix_num); #endif - get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, &this_recon_error, - sse, &eob); - - this_rate = rate_estimator(qcoeff, eob, tx_size); - *rate_cost += this_rate; - *recon_error += this_recon_error; - if (inter_cost < best_inter_cost) { best_rf_idx = rf_idx; best_inter_cost = inter_cost; best_mv.as_int = mv.as_int; + get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error, + sse); } } best_intra_cost = VPXMAX(best_intra_cost, 1); @@ -1139,6 +1115,7 @@ static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, const TX_SIZE tx_size = max_txsize_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int64_t recon_error, sse; // 
Setup scaling factor #if CONFIG_VP9_HIGHBITDEPTH @@ -1201,21 +1178,16 @@ static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { - int64_t recon_error = 0; - int64_t rate_cost = 0; - int64_t sse = 0; mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame, src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize, - tx_size, ref_frame, predictor, &recon_error, &rate_cost, - &sse); + tx_size, ref_frame, predictor, &recon_error, &sse); // Motion flow dependency dispenser. tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride); tpl_store_before_propagation( tpl_frame_stats_before_propagation->block_stats_list, - tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride, - recon_error, rate_cost); + tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride); tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize); diff --git a/vp9/encoder/vp9_tpl_model.h b/vp9/encoder/vp9_tpl_model.h index 04beb22610..86a7734f82 100644 --- a/vp9/encoder/vp9_tpl_model.h +++ b/vp9/encoder/vp9_tpl_model.h @@ -20,8 +20,6 @@ extern "C" { #endif #define log2f(x) (log(x) / (float)M_LOG2_E) -#define TPL_DEP_COST_SCALE_LOG2 4 - typedef struct GF_PICTURE { YV12_BUFFER_CONFIG *frame; int ref_frame[3]; From ec2a75ce9c92798d0238575150e337a6f024fe3e Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 21 Apr 2023 13:03:34 -0700 Subject: [PATCH 686/926] vp9_highbd_iht16x16_add_neon: clear -Wshadow warning Bug: webm:1793 Change-Id: I4e79a4d7d41b6abf88e3e60c54ab48a92b0346d2 --- vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c b/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c index 219ff63cb8..aeb7e49c10 100644 --- a/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c +++ b/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c @@ -64,9 +64,9 @@ highbd_dct_const_round_shift_low_8(const int64x2x2_t *const in) { #define highbd_iadst_half_butterfly(in, c, lane, out) \ do { \ - int64x2x2_t t[2]; \ - vmull_lane_s32_dual(in, c, lane, t); \ - out = highbd_dct_const_round_shift_low_8(t); \ + int64x2x2_t _t[2]; \ + vmull_lane_s32_dual(in, c, lane, _t); \ + out = highbd_dct_const_round_shift_low_8(_t); \ } while (0) #define highbd_iadst_butterfly(in0, in1, c, lane0, lane1, s0, s1) \ From fed3de997ca639db0fd8a2a40b20300b16878292 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 21 Apr 2023 13:03:58 -0700 Subject: [PATCH 687/926] highbd_vpx_convolve8_neon: clear -Wshadow warning Bug: webm:1793 Change-Id: If1a46fe183cd18e05b5538b1eba098e420b745ec --- vpx_dsp/arm/highbd_vpx_convolve8_neon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vpx_dsp/arm/highbd_vpx_convolve8_neon.c b/vpx_dsp/arm/highbd_vpx_convolve8_neon.c index c46c016312..47684473ca 100644 --- a/vpx_dsp/arm/highbd_vpx_convolve8_neon.c +++ b/vpx_dsp/arm/highbd_vpx_convolve8_neon.c @@ -355,7 +355,6 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, } else { const int16x8_t filters = vld1q_s16(filter[x0_q4]); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); - uint16x8_t t0, t1, t2, t3; assert(!((intptr_t)dst & 3)); assert(!(dst_stride & 3)); @@ -365,6 +364,7 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, if (h == 4) { int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; int32x4_t d0, d1, d2, 
d3; + uint16x8_t t0, t1, t2, t3; uint16x8_t d01, d23, t01, t23; __builtin_prefetch(src + 0 * src_stride); From 24802201acd7dfa15928bcc47c1e270e7db5afac Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 20 Apr 2023 15:09:00 -0400 Subject: [PATCH 688/926] Reland "Calculate recrf_dist and recrf_rate" This is a reland of commit 3c59378e4eac2d241fba8b26e660318b850e5773 Addressed issues from the previous CL: - Both recon_error and rate_cost are scaled up - recon_error and rate_cost are not accumulated across ref frames, instead they are calculated with the best ref frame picked. - get_quantize_error() is put where it was, so there is no behavior change for vp9. Bug: b/273736974 Original change's description: > Calculate recrf_dist and recrf_rate > > Change-Id: I74e74807436b92d729e2ccaab96149780f1f52d9 Change-Id: I20e1f5543e83b576a074bd4e6b44d99da65f4b56 --- vp9/encoder/vp9_tpl_model.c | 46 ++++++++++++++++++++++++++++--------- vp9/encoder/vp9_tpl_model.h | 2 ++ 2 files changed, 37 insertions(+), 11 deletions(-) diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index 0f9df78462..d6ce480c89 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -362,7 +362,8 @@ static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col, static void tpl_store_before_propagation(TplBlockStats *tpl_block_stats, TplDepStats *tpl_stats, int mi_row, int mi_col, BLOCK_SIZE bsize, - int stride) { + int stride, int64_t recon_error, + int64_t rate_cost) { const int mi_height = num_8x8_blocks_high_lookup[bsize]; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col]; @@ -374,6 +375,8 @@ static void tpl_store_before_propagation(TplBlockStats *tpl_block_stats, &tpl_block_stats[(mi_row + idy) * stride + mi_col + idx]; tpl_block_stats_ptr->inter_cost = src_stats->inter_cost; tpl_block_stats_ptr->intra_cost = src_stats->intra_cost; + tpl_block_stats_ptr->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; + tpl_block_stats_ptr->recrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2; tpl_block_stats_ptr->mv = src_stats->mv; tpl_block_stats_ptr->ref_frame_index = src_stats->ref_frame_index; } @@ -455,12 +458,11 @@ static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, tran_low_t *qcoeff, tran_low_t *dqcoeff, TX_SIZE tx_size, int64_t *recon_error, - int64_t *sse) { + int64_t *sse, uint16_t *eob) { MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; - uint16_t eob; int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; const int shift = tx_size == TX_32X32 ? 
0 : 2; @@ -470,16 +472,16 @@ static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { vp9_highbd_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, &eob, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } else { vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, &eob, scan_order->scan, + dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } #else vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, &eob, scan_order->scan, + dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); #endif // CONFIG_VP9_HIGHBITDEPTH @@ -523,6 +525,19 @@ static void set_mv_limits(const VP9_COMMON *cm, MACROBLOCK *x, int mi_row, ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND); } +static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) { + const ScanOrder *const scan_order = &vp9_scan_orders[tx_size][DCT_DCT]; + int rate_cost = 1; + int idx; + assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob); + for (idx = 0; idx < eob; ++idx) { + unsigned int abs_level = abs(qcoeff[scan_order->scan[idx]]); + rate_cost += get_msb(abs_level + 1) + 1 + (abs_level > 0); + } + + return (rate_cost << VP9_PROB_COST_SHIFT); +} + static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, struct scale_factors *sf, GF_PICTURE *gf_picture, int frame_idx, TplDepFrame *tpl_frame, @@ -530,7 +545,8 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row, int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size, YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor, - int64_t *recon_error, int64_t *sse) { + int64_t *recon_error, int64_t *rate_cost, + int64_t *sse) { VP9_COMMON *cm = &cpi->common; ThreadData *td = &cpi->td; @@ -658,11 +674,15 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, #endif if (inter_cost < best_inter_cost) { + uint16_t eob = 0; best_rf_idx = rf_idx; best_inter_cost = inter_cost; best_mv.as_int = mv.as_int; + // Since best_inter_cost is initialized as INT64_MAX, recon_error and + // rate_cost will be calculated with the best reference frame. get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error, - sse); + sse, &eob); + *rate_cost = rate_estimator(qcoeff, eob, tx_size); } } best_intra_cost = VPXMAX(best_intra_cost, 1); @@ -1115,7 +1135,6 @@ static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, const TX_SIZE tx_size = max_txsize_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - int64_t recon_error, sse; // Setup scaling factor #if CONFIG_VP9_HIGHBITDEPTH @@ -1178,16 +1197,21 @@ static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + int64_t recon_error = 0; + int64_t rate_cost = 0; + int64_t sse = 0; mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame, src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize, - tx_size, ref_frame, predictor, &recon_error, &sse); + tx_size, ref_frame, predictor, &recon_error, &rate_cost, + &sse); // Motion flow dependency dispenser. 
 tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride);
 tpl_store_before_propagation(
 tpl_frame_stats_before_propagation->block_stats_list,
- tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride);
+ tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride,
+ recon_error, rate_cost);
 tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize);
diff --git a/vp9/encoder/vp9_tpl_model.h b/vp9/encoder/vp9_tpl_model.h
index 86a7734f82..04beb22610 100644
--- a/vp9/encoder/vp9_tpl_model.h
+++ b/vp9/encoder/vp9_tpl_model.h
@@ -20,6 +20,8 @@ extern "C" {
 #endif
 #define log2f(x) (log(x) / (float)M_LOG2_E)
+#define TPL_DEP_COST_SCALE_LOG2 4
+
 typedef struct GF_PICTURE {
 YV12_BUFFER_CONFIG *frame;
 int ref_frame[3];

From e7b58b69fd91a4288453c7c7003e1fc4cc48bb93 Mon Sep 17 00:00:00 2001
From: Neeraj Gadgil
Date: Wed, 19 Apr 2023 08:13:26 +0530
Subject: [PATCH 689/926] Reduce joint motion search iters based on bsize

Joint motion search during compound mode eval is optimized by reducing
the number of mv search iterations based on bsize. The sf
'comp_inter_joint_search_thresh' is renamed as
'comp_inter_joint_search_iter_level' and used to add the logic.

cpu  Testset  Instr. Cnt        BD Rate loss (%)
              Red (%)     avg. psnr  ovr.psnr   ssim
0    LOWRES2  5.373       0.0917     0.1088     0.0294
0    MIDRES2  3.395       0.0239     0.0520     0.0783
0    HDRES2   2.291       0.0223     0.0301     0.0053
0    Average  3.686       0.0460     0.0636     0.0377

STATS_CHANGED

Change-Id: I7ee8873ebc8af967382324ae8f5c70c26665d5e6
---
 vp9/encoder/vp9_rdopt.c | 40 +++++++++++++++++++++++++-------
 vp9/encoder/vp9_speed_features.c | 7 +++---
 vp9/encoder/vp9_speed_features.h | 19 +++++++++++----
 3 files changed, 49 insertions(+), 17 deletions(-)

diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index c68cfefdea..f051c62791 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1898,11 +1898,22 @@ static INLINE int skip_single_mode_based_on_mode_rate(
 return 0;
 }
-#define NUM_ITERS 4
+#define MAX_JOINT_MV_SEARCH_ITERS 4
+static INLINE int get_joint_search_iters(int sf_level, BLOCK_SIZE bsize) {
+ int num_iters = MAX_JOINT_MV_SEARCH_ITERS; // sf_level = 0
+ if (sf_level >= 2)
+ num_iters = 0;
+ else if (sf_level >= 1)
+ num_iters = bsize < BLOCK_8X8
+ ? 0
+ : (bsize <= BLOCK_16X16 ? 2 : MAX_JOINT_MV_SEARCH_ITERS);
+ return num_iters;
+}
+
 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
 int_mv *frame_mv, int mi_row, int mi_col,
 int_mv single_newmv[MAX_REF_FRAMES],
- int *rate_mv) {
+ int *rate_mv, int num_iters) {
 const VP9_COMMON *const cm = &cpi->common;
 const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
 const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
@@ -1911,7 +1922,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
 const int refs[2] = { mi->ref_frame[0], mi->ref_frame[1] < 0 ?
0 : mi->ref_frame[1] }; int_mv ref_mv[2]; - int_mv iter_mvs[NUM_ITERS][2]; + int_mv iter_mvs[MAX_JOINT_MV_SEARCH_ITERS][2]; int ite, ref; const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter]; struct scale_factors sf; @@ -1932,6 +1943,9 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, DECLARE_ALIGNED(16, uint8_t, second_pred[64 * 64]); #endif // CONFIG_VP9_HIGHBITDEPTH + // Check number of iterations do not exceed the max + assert(num_iters <= MAX_JOINT_MV_SEARCH_ITERS); + for (ref = 0; ref < 2; ++ref) { ref_mv[ref] = x->mbmi_ext->ref_mvs[refs[ref]][0]; @@ -1962,7 +1976,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, // Allow joint search multiple times iteratively for each reference frame // and break out of the search loop if it couldn't find a better mv. - for (ite = 0; ite < NUM_ITERS; ite++) { + for (ite = 0; ite < num_iters; ite++) { struct buf_2d ref_yv12[2]; uint32_t bestsme = UINT_MAX; int sadpb = x->sadperbit16; @@ -2044,7 +2058,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, } else { break; } - if (ite < NUM_ITERS - 1) { + if (ite < num_iters - 1) { iter_mvs[ite + 1][0].as_int = frame_mv[refs[0]].as_int; iter_mvs[ite + 1][1].as_int = frame_mv[refs[1]].as_int; } @@ -2250,12 +2264,16 @@ static int64_t rd_pick_best_sub8x8_mode( if (has_second_rf && this_mode == NEWMV && mi->interp_filter == EIGHTTAP) { + // Decide number of joint motion search iterations + const int num_joint_search_iters = get_joint_search_iters( + cpi->sf.comp_inter_joint_search_iter_level, bsize); // adjust src pointers mi_buf_shift(x, block); - if (sf->comp_inter_joint_search_thresh <= bsize) { + if (num_joint_search_iters) { int rate_mv; joint_motion_search(cpi, x, bsize, frame_mv[this_mode], mi_row, - mi_col, seg_mvs[block], &rate_mv); + mi_col, seg_mvs[block], &rate_mv, + num_joint_search_iters); seg_mvs[block][mi->ref_frame[0]].as_int = frame_mv[this_mode][mi->ref_frame[0]].as_int; seg_mvs[block][mi->ref_frame[1]].as_int = @@ -2878,16 +2896,20 @@ static int64_t handle_inter_mode( if (this_mode == NEWMV) { int rate_mv; if (is_comp_pred) { + // Decide number of joint motion search iterations + const int num_joint_search_iters = get_joint_search_iters( + cpi->sf.comp_inter_joint_search_iter_level, bsize); + // Initialize mv using single prediction mode result. frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int; frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int; - if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { + if (num_joint_search_iters) { #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, joint_motion_search_time); #endif joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, - single_newmv, &rate_mv); + single_newmv, &rate_mv, num_joint_search_iters); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, joint_motion_search_time); #endif diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 04804da1ca..60720e3ea6 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -244,6 +244,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->trellis_opt_tx_rd.thresh = boosted ? 4.0 : 3.0; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; + sf->comp_inter_joint_search_iter_level = 1; // Reference masking is not supported in dynamic scaling mode. 
 sf->reference_masking = oxcf->resize_mode != RESIZE_DYNAMIC;
@@ -331,7 +332,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
 : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER |
 FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR;
 sf->disable_filter_search_var_thresh = 100;
- sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+ sf->comp_inter_joint_search_iter_level = 2;
 sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
 sf->recode_tolerance_high = 45;
 sf->enhanced_full_pixel_motion_search = 0;
@@ -530,7 +531,7 @@ static void set_rt_speed_feature_framesize_independent(
 }
 sf->disable_filter_search_var_thresh = 50;
- sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+ sf->comp_inter_joint_search_iter_level = 2;
 sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
 sf->lf_motion_threshold = LOW_MOTION_THRESHOLD;
 sf->adjust_partitioning_from_last_frame = 1;
@@ -928,7 +929,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) {
 sf->mv.auto_mv_step_size = 0;
 sf->mv.fullpel_search_step_param = 6;
 sf->mv.use_downsampled_sad = 0;
- sf->comp_inter_joint_search_thresh = BLOCK_4X4;
+ sf->comp_inter_joint_search_iter_level = 0;
 sf->tx_size_search_method = USE_FULL_RD;
 sf->use_lp32x32fdct = 0;
 sf->adaptive_motion_search = 0;
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index 7cb3f3527d..70c61fe00d 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -286,11 +286,20 @@ typedef struct SPEED_FEATURES {
 // adds overhead.
 int static_segmentation;
- // If 1 we iterate finding a best reference for 2 ref frames together - via
- // a log search that iterates 4 times (check around mv for last for best
- // error of combined predictor then check around mv for alt). If 0 we
- // we just use the best motion vector found for each frame by itself.
- BLOCK_SIZE comp_inter_joint_search_thresh;
+ // The best compound predictor is found using an iterative log search process
+ // that searches for best ref0 mv using error of combined predictor and then
+ // searches for best ref1 mv. This sf determines the number of iterations of
+ // this process based on block size. The sf becomes more aggressive from level
+ // 0 to 2. The following table indicates the number of iterations w.r.t bsize:
+ // -----------------------------------------------
+ // |sf (level)|bsize < 8X8| [8X8, 16X16] | > 16X16 |
+ // |    0     |     4     |      4       |    4    |
+ // |    1     |     0     |      2       |    4    |
+ // |    2     |     0     |      0       |    0    |
+ // -----------------------------------------------
+ // Here, 0 iterations indicate using the best single motion vector selected
+ // for each ref frame without any iterative refinement.
+ int comp_inter_joint_search_iter_level;
 // This variable is used to cap the maximum number of times we skip testing a
 // mode to be evaluated. A high value means we will be faster.
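For illustration, the new iteration schedule can be exercised outside the encoder. The sketch below is hypothetical: the BLOCK_SIZE enum is a stand-in (only the relative ordering of the values matters here; the real definitions live in vp9's headers), while get_joint_search_iters() mirrors the function added in this patch, checked against the table documented in vp9_speed_features.h:

#include <assert.h>

/* Stand-in for the relevant prefix of vp9's BLOCK_SIZE enum; only the
 * relative ordering of the values is relied upon below. */
typedef enum {
  BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, BLOCK_8X8, BLOCK_8X16, BLOCK_16X8,
  BLOCK_16X16, BLOCK_16X32, BLOCK_32X16, BLOCK_32X32
} BLOCK_SIZE;

#define MAX_JOINT_MV_SEARCH_ITERS 4

/* Same logic as get_joint_search_iters() in the patch above. */
static int get_joint_search_iters(int sf_level, BLOCK_SIZE bsize) {
  int num_iters = MAX_JOINT_MV_SEARCH_ITERS; /* sf_level = 0 */
  if (sf_level >= 2)
    num_iters = 0;
  else if (sf_level >= 1)
    num_iters = bsize < BLOCK_8X8
                    ? 0
                    : (bsize <= BLOCK_16X16 ? 2 : MAX_JOINT_MV_SEARCH_ITERS);
  return num_iters;
}

int main(void) {
  assert(get_joint_search_iters(0, BLOCK_8X8) == 4);   /* level 0: always 4 */
  assert(get_joint_search_iters(1, BLOCK_4X8) == 0);   /* level 1: skip sub-8x8 */
  assert(get_joint_search_iters(1, BLOCK_16X16) == 2); /* level 1: 2 up to 16x16 */
  assert(get_joint_search_iters(1, BLOCK_32X32) == 4); /* level 1: full above */
  assert(get_joint_search_iters(2, BLOCK_32X32) == 0); /* level 2: always 0 */
  return 0;
}

A result of 0 iterations means joint_motion_search() is skipped entirely and the best single-prediction motion vector for each reference frame is used as-is, which is where the instruction-count savings reported in the commit message come from.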
From dbb1e8c7a6eafcd209a56c84b6f0111e74ec3ae5 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 27 Apr 2023 15:58:08 -0400 Subject: [PATCH 690/926] Clean up a stale TODO in tpl Change-Id: Ieccaff1cc94cbb2c5a294d83f3080f7407267016 --- vp9/encoder/vp9_tpl_model.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index d6ce480c89..1dff8d3fd2 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -40,9 +40,6 @@ static void init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture, memset(recon_frame_index, -1, sizeof(recon_frame_index)); stack_init(arf_index_stack, MAX_ARF_LAYERS); - // TODO(jingning): To be used later for gf frame type parsing. - (void)gf_group; - for (i = 0; i < FRAME_BUFFERS; ++i) { if (frame_bufs[i].ref_count == 0) { alloc_frame_mvs(cm, i); From 84a180fe858fd6de9c301cd884e2f1ff341781b3 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 27 Apr 2023 16:15:56 -0400 Subject: [PATCH 691/926] Move TplFrameStats to public header Get ready for changes to follow: - Custom reader/writer IO functions - Codec control to get TPL stats from the encoder Move the definition of TplFrameStats to public header so applications can use them directly. Bug: b/273736974 Change-Id: Ieb0db4560ddd966df1bc01f6a7e179cc97f9bac1 --- vp9/encoder/vp9_encoder.h | 16 ---------------- vp9/encoder/vp9_tpl_model.c | 3 ++- vpx/vpx_encoder.h | 19 +++++++++++++++++++ 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 43789864c7..7c22c807b7 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -324,22 +324,6 @@ typedef struct TplDepFrame { #endif } TplDepFrame; -// Used to store the stats before propagation. -typedef struct TplBlockStats { - int64_t intra_cost; - int64_t inter_cost; - int_mv mv; - int64_t recrf_rate; - int64_t recrf_dist; - int ref_frame_index; -} TplBlockStats; - -typedef struct TplFrameStats { - int frame_width; - int frame_height; - TplBlockStats *block_stats_list; -} TplFrameStats; - #define TPL_DEP_COST_SCALE_LOG2 4 // TODO(jingning) All spatially adaptive variables should go to TileDataEnc. diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index 1dff8d3fd2..dbd7482b0d 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -374,7 +374,8 @@ static void tpl_store_before_propagation(TplBlockStats *tpl_block_stats, tpl_block_stats_ptr->intra_cost = src_stats->intra_cost; tpl_block_stats_ptr->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; tpl_block_stats_ptr->recrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2; - tpl_block_stats_ptr->mv = src_stats->mv; + tpl_block_stats_ptr->mv_r = src_stats->mv.as_mv.row; + tpl_block_stats_ptr->mv_c = src_stats->mv.as_mv.col; tpl_block_stats_ptr->ref_frame_index = src_stats->ref_frame_index; } } diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index a0d2c87558..9247231328 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -252,6 +252,25 @@ enum vpx_kf_mode { VPX_KF_DISABLED = 0 /**< Encoder does not place keyframes. 
*/ }; +/*!\brief Temporal dependency model stats for each block before propagation */ +typedef struct TplBlockStats { + int64_t intra_cost; /**< Intra cost */ + int64_t inter_cost; /**< Inter cost */ + int16_t mv_r; /**< Motion vector row */ + int16_t mv_c; /**< Motion vector col */ + int64_t recrf_rate; /**< Rate from reconstructed ref frame */ + int64_t recrf_dist; /**< Distortion from reconstructed ref frame */ + int ref_frame_index; /**< Ref frame index */ +} TplBlockStats; + +/*!\brief Temporal dependency model stats for each frame before propagation */ +typedef struct TplFrameStats { + int frame_width; /**< Frame width */ + int frame_height; /**< Frame height */ + // Size of the list can be calculated from frame_width and frame_height. + TplBlockStats *block_stats_list; /**< List of tpl stats for each block */ +} TplFrameStats; + /*!\brief Encoded Frame Flags * * This type indicates a bitfield to be passed to vpx_codec_encode(), defining From 33aba6ecc16c71b6f380cfd3d24eab11690deb99 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 20 Apr 2023 18:15:23 -0700 Subject: [PATCH 692/926] configure: add aarch64 to ARCH_LIST This will allow identifying Windows Visual Studio targets as aarch64; the Microsoft compiler does not define __aarch64__. An alternative would be to define this in the code, checking for _M_ARM64 or _M_ARM64EC. For now we'll use the existing VPX_ARCH_* system. For compatibility VPX_ARCH_ARM will continue to be defined to 1 in this case. Bug: webm:1788 Bug: b/277255076 Change-Id: I12e25710891e86f0c7339ba96884c18ed90ba16f --- build/make/configure.sh | 4 ++++ configure | 1 + 2 files changed, 5 insertions(+) diff --git a/build/make/configure.sh b/build/make/configure.sh index 32105651ff..ec9af5e63d 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -842,6 +842,10 @@ process_common_toolchain() { # Enable the architecture family case ${tgt_isa} in + arm64 | armv8) + enable_feature arm + enable_feature aarch64 + ;; arm*) enable_feature arm ;; diff --git a/configure b/configure index 890ad3968a..20707727ef 100755 --- a/configure +++ b/configure @@ -243,6 +243,7 @@ CODEC_FAMILIES=" ARCH_LIST=" arm + aarch64 mips x86 x86_64 From 57b9afa58f849a8165ce3132c21087ae451d862c Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 2 May 2023 18:37:59 -0700 Subject: [PATCH 693/926] s/__aarch64__/VPX_ARCH_AARCH64/ This allows AArch64 to be correctly detected when building with Visual Studio (cl.exe) and fixes a crash in vp9_diamond_search_sad_neon.c. There are still test failures, however. Microsoft's compiler doesn't define __ARM_FEATURE_*. To use those paths we may need to rely on _M_ARM64_EXTENSION. 
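(For illustration only: the in-code alternative mentioned in the previous commit, keying off Microsoft's _M_ARM64 and _M_ARM64EC macros alongside __aarch64__, could look like the sketch below. This series instead derives VPX_ARCH_AARCH64 at configure time, so this is not the approach taken.)

/* Hypothetical in-code fallback, not part of this change. */
#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
#define VPX_ARCH_AARCH64 1
#else
#define VPX_ARCH_AARCH64 0
#endif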
Bug: webm:1788 Bug: b/277255076 Change-Id: I4d26f5f84dbd0cbcd1cdf0d7d932ebcf109febe5 --- vp8/encoder/arm/neon/fastquantizeb_neon.c | 8 ++--- vp9/encoder/arm/neon/vp9_denoiser_neon.c | 2 +- .../arm/neon/vp9_diamond_search_sad_neon.c | 14 ++++---- vp9/encoder/arm/neon/vp9_quantize_neon.c | 6 ++-- vpx_dsp/arm/avg_neon.c | 2 +- vpx_dsp/arm/highbd_avg_neon.c | 2 +- vpx_dsp/arm/highbd_quantize_neon.c | 8 ++--- vpx_dsp/arm/quantize_neon.c | 8 ++--- vpx_dsp/arm/sum_neon.h | 34 +++++++++---------- vpx_dsp/arm/transpose_neon.h | 10 +++--- vpx_dsp/arm/vpx_convolve8_neon.c | 6 ++-- vpx_dsp/arm/vpx_convolve8_neon.h | 8 ++--- 12 files changed, 54 insertions(+), 54 deletions(-) diff --git a/vp8/encoder/arm/neon/fastquantizeb_neon.c b/vp8/encoder/arm/neon/fastquantizeb_neon.c index 6fc60805f6..950c943343 100644 --- a/vp8/encoder/arm/neon/fastquantizeb_neon.c +++ b/vp8/encoder/arm/neon/fastquantizeb_neon.c @@ -28,11 +28,11 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) { zig_zag1 = vld1q_u16(inv_zig_zag + 8); int16x8_t x0, x1, sz0, sz1, y0, y1; uint16x8_t eob0, eob1; -#ifndef __aarch64__ +#if !VPX_ARCH_AARCH64 uint16x4_t eob_d16; uint32x2_t eob_d32; uint32x4_t eob_q32; -#endif // __arch64__ +#endif // !VPX_ARCH_AARCH64 /* sign of z: z >> 15 */ sz0 = vshrq_n_s16(z0, 15); @@ -70,7 +70,7 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) { /* select the largest value */ eob0 = vmaxq_u16(eob0, eob1); -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 *d->eob = (int8_t)vmaxvq_u16(eob0); #else eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0)); @@ -79,7 +79,7 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) { eob_d32 = vpmax_u32(eob_d32, eob_d32); vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0); -#endif // __aarch64__ +#endif // VPX_ARCH_AARCH64 /* qcoeff = x */ vst1q_s16(d->qcoeff, x0); diff --git a/vp9/encoder/arm/neon/vp9_denoiser_neon.c b/vp9/encoder/arm/neon/vp9_denoiser_neon.c index 53e8c7e498..d631cd437d 100644 --- a/vp9/encoder/arm/neon/vp9_denoiser_neon.c +++ b/vp9/encoder/arm/neon/vp9_denoiser_neon.c @@ -21,7 +21,7 @@ // Compute the sum of all pixel differences of this MB. 
static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddlvq_s8(v_sum_diff_total); #else const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total); diff --git a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c index 255e6fbc4a..b82b3f9db5 100644 --- a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c +++ b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c @@ -94,7 +94,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, // Work out the start point for the search const uint8_t *best_address = in_what; const uint8_t *new_best_address = best_address; -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 int64x2_t v_ba_q = vdupq_n_s64((intptr_t)best_address); #else int32x4_t v_ba_d = vdupq_n_s32((intptr_t)best_address); @@ -117,7 +117,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, int8x16_t v_inside_d; uint32x4_t v_outside_d; int32x4_t v_cost_d, v_sad_d; -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 int64x2_t v_blocka[2]; #else int32x4_t v_blocka[1]; @@ -138,7 +138,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, vreinterpretq_s32_s16(v_these_mv_w))); // If none of them are inside, then move on -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 horiz_max = vmaxvq_u32(vreinterpretq_u32_s8(v_inside_d)); #else horiz_max_0 = vmax_u32(vget_low_u32(vreinterpretq_u32_s8(v_inside_d)), @@ -167,7 +167,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, // Compute the SIMD pointer offsets. { -#if defined(__aarch64__) // sizeof(intptr_t) == 8 +#if VPX_ARCH_AARCH64 // sizeof(intptr_t) == 8 // Load the offsets int64x2_t v_bo10_q = vld1q_s64((const int64_t *)&ss_os[i + 0]); int64x2_t v_bo32_q = vld1q_s64((const int64_t *)&ss_os[i + 2]); @@ -234,7 +234,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, // Find the minimum value and index horizontally in v_sad_d { uint32_t local_best_sad; -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 local_best_sad = vminvq_u32(vreinterpretq_u32_s32(v_sad_d)); #else uint32x2_t horiz_min_0 = @@ -256,7 +256,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, uint32x4_t v_mask_d = vandq_u32(v_sel_d, v_idx_d); v_mask_d = vbslq_u32(v_sel_d, v_mask_d, vdupq_n_u32(0xffffffff)); -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 local_best_idx = vminvq_u32(v_mask_d); #else horiz_min_0 = @@ -280,7 +280,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, best_address = new_best_address; v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int)); -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 v_ba_q = vdupq_n_s64((intptr_t)best_address); #else v_ba_d = vdupq_n_s32((intptr_t)best_address); diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c index c2b55fcbaa..97ab13628e 100644 --- a/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -50,7 +50,7 @@ static VPX_FORCE_INLINE int16x8_t get_max_lane_eob(const int16_t *iscan_ptr, } static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) { -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 return (uint16_t)vmaxvq_s16(v_eobmax); #else const int16x4_t v_eobmax_3210 = @@ -65,7 +65,7 @@ static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) { vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); return (uint16_t)vget_lane_s16(v_eobmax_final, 0); -#endif // __aarch64__ +#endif // VPX_ARCH_AARCH64 } static VPX_FORCE_INLINE void load_fp_values(const int16_t *round_ptr, 
@@ -81,7 +81,7 @@ static VPX_FORCE_INLINE void load_fp_values(const int16_t *round_ptr, static VPX_FORCE_INLINE void update_fp_values(int16x8_t *v_round, int16x8_t *v_quant, int16x8_t *v_dequant) { -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 *v_round = vdupq_laneq_s16(*v_round, 1); *v_quant = vdupq_laneq_s16(*v_quant, 1); *v_dequant = vdupq_laneq_s16(*v_dequant, 1); diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c index d48115dd01..8c61fc26f4 100644 --- a/vpx_dsp/arm/avg_neon.c +++ b/vpx_dsp/arm/avg_neon.c @@ -210,7 +210,7 @@ void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b, const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max); const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min); -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 *min = *max = 0; // Clear high bits *((uint8_t *)max) = vmaxvq_u8(ab07_max); *((uint8_t *)min) = vminvq_u8(ab07_min); diff --git a/vpx_dsp/arm/highbd_avg_neon.c b/vpx_dsp/arm/highbd_avg_neon.c index fc10197d71..8939ee131e 100644 --- a/vpx_dsp/arm/highbd_avg_neon.c +++ b/vpx_dsp/arm/highbd_avg_neon.c @@ -114,7 +114,7 @@ void vpx_highbd_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint16x8_t min4567 = vminq_u16(min45, min67); const uint16x8_t min07 = vminq_u16(min0123, min4567); -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 *min = *max = 0; // Clear high bits *((uint16_t *)max) = vmaxvq_u16(max07); *((uint16_t *)min) = vminvq_u16(min07); diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c index 526447acf5..d2a7add60d 100644 --- a/vpx_dsp/arm/highbd_quantize_neon.c +++ b/vpx_dsp/arm/highbd_quantize_neon.c @@ -166,7 +166,7 @@ void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } while (n_coeffs > 0); } -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 *eob_ptr = vmaxvq_u16(eob_max); #else { @@ -176,7 +176,7 @@ void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); vst1_lane_u16(eob_ptr, eob_max_2, 0); } -#endif // __aarch64__ +#endif // VPX_ARCH_AARCH64 // Need these here, else the compiler complains about mixing declarations and // code in C90 (void)n_coeffs; @@ -291,7 +291,7 @@ void vpx_highbd_quantize_b_32x32_neon( } } -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 *eob_ptr = vmaxvq_u16(eob_max); #else { @@ -301,5 +301,5 @@ void vpx_highbd_quantize_b_32x32_neon( const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); vst1_lane_u16(eob_ptr, eob_max_2, 0); } -#endif // __aarch64__ +#endif // VPX_ARCH_AARCH64 } diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index cc8f623744..35c67f6075 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -134,7 +134,7 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } while (n_coeffs > 0); } -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 *eob_ptr = vmaxvq_u16(eob_max); #else { @@ -144,7 +144,7 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); vst1_lane_u16(eob_ptr, eob_max_2, 0); } -#endif // __aarch64__ +#endif // VPX_ARCH_AARCH64 // Need these here, else the compiler complains about mixing declarations and // code in C90 (void)scan; @@ -276,7 +276,7 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, } } -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 *eob_ptr = vmaxvq_u16(eob_max); #else { @@ -286,5 +286,5 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, const 
uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); vst1_lane_u16(eob_ptr, eob_max_2, 0); } -#endif // __aarch64__ +#endif // VPX_ARCH_AARCH64 } diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index a0c72f92ce..48a2fc05ca 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -17,7 +17,7 @@ #include "vpx/vpx_integer.h" static INLINE uint16_t horizontal_add_uint8x4(const uint8x8_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddlv_u8(a); #else const uint16x4_t b = vpaddl_u8(a); @@ -27,7 +27,7 @@ static INLINE uint16_t horizontal_add_uint8x4(const uint8x8_t a) { } static INLINE uint16_t horizontal_add_uint8x8(const uint8x8_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddlv_u8(a); #else const uint16x4_t b = vpaddl_u8(a); @@ -38,7 +38,7 @@ static INLINE uint16_t horizontal_add_uint8x8(const uint8x8_t a) { } static INLINE uint16_t horizontal_add_uint8x16(const uint8x16_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddlvq_u8(a); #else const uint16x8_t b = vpaddlq_u8(a); @@ -50,7 +50,7 @@ static INLINE uint16_t horizontal_add_uint8x16(const uint8x16_t a) { } static INLINE uint16_t horizontal_add_uint16x4(const uint16x4_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddv_u16(a); #else const uint16x4_t b = vpadd_u16(a, a); @@ -60,7 +60,7 @@ static INLINE uint16_t horizontal_add_uint16x4(const uint16x4_t a) { } static INLINE int32_t horizontal_add_int16x8(const int16x8_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddlvq_s16(a); #else const int32x4_t b = vpaddlq_s16(a); @@ -72,7 +72,7 @@ static INLINE int32_t horizontal_add_int16x8(const int16x8_t a) { } static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddlvq_u16(a); #else const uint32x4_t b = vpaddlq_u16(a); @@ -84,7 +84,7 @@ static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) { } static INLINE uint32x4_t horizontal_add_4d_uint16x8(const uint16x8_t sum[4]) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); const uint16x8_t b0 = vpaddq_u16(a0, a1); @@ -102,7 +102,7 @@ static INLINE uint32x4_t horizontal_add_4d_uint16x8(const uint16x8_t sum[4]) { static INLINE uint32_t horizontal_long_add_uint16x8(const uint16x8_t vec_lo, const uint16x8_t vec_hi) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi); #else const uint32x4_t vec_l_lo = @@ -127,7 +127,7 @@ static INLINE uint32x4_t horizontal_long_add_4d_uint16x8( const uint32x4_t b1 = vpadalq_u16(a1, sum_hi[1]); const uint32x4_t b2 = vpadalq_u16(a2, sum_hi[2]); const uint32x4_t b3 = vpadalq_u16(a3, sum_hi[3]); -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 const uint32x4_t c0 = vpaddq_u32(b0, b1); const uint32x4_t c1 = vpaddq_u32(b2, b3); return vpaddq_u32(c0, c1); @@ -143,7 +143,7 @@ static INLINE uint32x4_t horizontal_long_add_4d_uint16x8( } static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddv_s32(a); #else return vget_lane_s32(a, 0) + vget_lane_s32(a, 1); @@ -151,7 +151,7 @@ static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) { } static INLINE uint32_t horizontal_add_uint32x2(const uint32x2_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddv_u32(a); #else return vget_lane_u32(a, 0) + vget_lane_u32(a, 1); @@ -159,7 +159,7 @@ static INLINE uint32_t 
horizontal_add_uint32x2(const uint32x2_t a) { } static INLINE int32_t horizontal_add_int32x4(const int32x4_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddvq_s32(a); #else const int64x2_t b = vpaddlq_s32(a); @@ -170,7 +170,7 @@ static INLINE int32_t horizontal_add_int32x4(const int32x4_t a) { } static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddvq_u32(a); #else const uint64x2_t b = vpaddlq_u32(a); @@ -181,7 +181,7 @@ static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) { } static INLINE uint32x4_t horizontal_add_4d_uint32x4(const uint32x4_t sum[4]) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]); uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]); return vpaddq_u32(res01, res23); @@ -196,7 +196,7 @@ static INLINE uint32x4_t horizontal_add_4d_uint32x4(const uint32x4_t sum[4]) { } static INLINE uint64_t horizontal_long_add_uint32x4(const uint32x4_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddlvq_u32(a); #else const uint64x2_t b = vpaddlq_u32(a); @@ -205,7 +205,7 @@ static INLINE uint64_t horizontal_long_add_uint32x4(const uint32x4_t a) { } static INLINE int64_t horizontal_add_int64x2(const int64x2_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddvq_s64(a); #else return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1); @@ -213,7 +213,7 @@ static INLINE int64_t horizontal_add_int64x2(const int64x2_t a) { } static INLINE uint64_t horizontal_add_uint64x2(const uint64x2_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddvq_u64(a); #else return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h index 518278f303..74f85a6bb6 100644 --- a/vpx_dsp/arm/transpose_neon.h +++ b/vpx_dsp/arm/transpose_neon.h @@ -23,7 +23,7 @@ // b0.val[1]: 04 05 06 07 20 21 22 23 static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { int16x8x2_t b0; -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 b0.val[0] = vreinterpretq_s16_s64( vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); b0.val[1] = vreinterpretq_s16_s64( @@ -39,7 +39,7 @@ static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) { int32x4x2_t b0; -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 b0.val[0] = vreinterpretq_s32_s64( vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); b0.val[1] = vreinterpretq_s32_s64( @@ -53,7 +53,7 @@ static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) { static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) { int64x2x2_t b0; -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 b0.val[0] = vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)); b0.val[1] = vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)); #else @@ -67,7 +67,7 @@ static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) { static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) { uint8x16x2_t b0; -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 b0.val[0] = vreinterpretq_u8_u64( vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); b0.val[1] = vreinterpretq_u8_u64( @@ -83,7 +83,7 @@ static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) { static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) { uint16x8x2_t b0; -#if 
defined(__aarch64__) +#if VPX_ARCH_AARCH64 b0.val[0] = vreinterpretq_u16_u64( vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); b0.val[1] = vreinterpretq_u16_u64( diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index b4cdd58c70..b312cc747c 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -31,7 +31,7 @@ // instructions. This optimization is much faster in speed unit test, but slowed // down the whole decoder by 5%. -#if defined(__aarch64__) && \ +#if VPX_ARCH_AARCH64 && \ (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)) DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { @@ -1261,7 +1261,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, #endif // defined(__ARM_FEATURE_MATMUL_INT8) -#else // !(defined(__aarch64__) && +#else // !(VPX_ARCH_AARCH64 && // (defined(__ARM_FEATURE_DOTPROD) || // defined(__ARM_FEATURE_MATMUL_INT8))) @@ -2105,6 +2105,6 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, } } -#endif // #if defined(__aarch64__) && +#endif // #if VPX_ARCH_AARCH64 && // (defined(__ARM_FEATURE_DOTPROD) || // defined(__ARM_FEATURE_MATMUL_INT8)) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index ed7f180538..07cf8242d3 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -16,7 +16,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD) static INLINE int32x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, const int8x16_t samples_hi, @@ -114,9 +114,9 @@ static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples, return vqrshrun_n_s16(sum, 7); } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD) -#if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8) +#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8) static INLINE int32x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, const uint8x16_t samples_hi, @@ -199,7 +199,7 @@ static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples, return vqrshrun_n_s16(sum, 7); } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8) +#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8) static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, From a398b60d6c0253a99711b8496d691cdbdf635b2c Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 3 May 2023 10:09:03 -0700 Subject: [PATCH 694/926] fdct8x8_test: EXPECT_* -> ASSERT_* This avoids unnecessary logging when a block has multiple errors. Change-Id: If0f3e6f8ff5bd284655f7cabfd23c253c93d44c5 --- test/fdct8x8_test.cc | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index 83d1ff1429..fcc84690a0 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -170,7 +170,7 @@ class FwdTrans8x8TestBase { for (int j = 0; j < 64; ++j) { const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]); const int max_diff = kSignBiasMaxDiff255; - EXPECT_LT(diff, max_diff << (bit_depth_ - 8)) + ASSERT_LT(diff, max_diff << (bit_depth_ - 8)) << "Error: 8x8 FDCT/FHT has a sign bias > " << 1. 
* max_diff / count_test_block * 100 << "%" << " for input range [-255, 255] at index " << j @@ -201,7 +201,7 @@ class FwdTrans8x8TestBase { for (int j = 0; j < 64; ++j) { const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]); const int max_diff = kSignBiasMaxDiff15; - EXPECT_LT(diff, max_diff << (bit_depth_ - 8)) + ASSERT_LT(diff, max_diff << (bit_depth_ - 8)) << "Error: 8x8 FDCT/FHT has a sign bias > " << 1. * max_diff / count_test_block * 100 << "%" << " for input range [-15, 15] at index " << j @@ -275,11 +275,11 @@ class FwdTrans8x8TestBase { } } - EXPECT_GE(1 << 2 * (bit_depth_ - 8), max_error) + ASSERT_GE(1 << 2 * (bit_depth_ - 8), max_error) << "Error: 8x8 FDCT/IDCT or FHT/IHT has an individual" << " roundtrip error > 1"; - EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error) + ASSERT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error) << "Error: 8x8 FDCT/IDCT or FHT/IHT has average roundtrip " << "error > 1/5 per block"; } @@ -360,17 +360,17 @@ class FwdTrans8x8TestBase { total_coeff_error += abs(coeff_diff); } - EXPECT_GE(1 << 2 * (bit_depth_ - 8), max_error) + ASSERT_GE(1 << 2 * (bit_depth_ - 8), max_error) << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has" - << "an individual roundtrip error > 1"; + << " an individual roundtrip error > 1"; - EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error) + ASSERT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error) << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has average" << " roundtrip error > 1/5 per block"; - EXPECT_EQ(0, total_coeff_error) + ASSERT_EQ(0, total_coeff_error) << "Error: Extremal 8x8 FDCT/FHT has" - << "overflow issues in the intermediate steps > 1"; + << " overflow issues in the intermediate steps > 1"; } } @@ -426,7 +426,7 @@ class FwdTrans8x8TestBase { const int diff = dst[j] - src[j]; #endif const uint32_t error = diff * diff; - EXPECT_GE(1u << 2 * (bit_depth_ - 8), error) + ASSERT_GE(1u << 2 * (bit_depth_ - 8), error) << "Error: 8x8 IDCT has error " << error << " at index " << j; } } @@ -456,7 +456,7 @@ class FwdTrans8x8TestBase { for (int j = 0; j < kNumCoeffs; ++j) { const int32_t diff = coeff[j] - coeff_r[j]; const uint32_t error = diff * diff; - EXPECT_GE(9u << 2 * (bit_depth_ - 8), error) + ASSERT_GE(9u << 2 * (bit_depth_ - 8), error) << "Error: 8x8 DCT has error " << error << " at index " << j; } } @@ -512,7 +512,7 @@ class FwdTrans8x8TestBase { const int diff = dst[j] - ref[j]; #endif const uint32_t error = diff * diff; - EXPECT_EQ(0u, error) + ASSERT_EQ(0u, error) << "Error: 8x8 IDCT has error " << error << " at index " << j; } } From 3dbadd1b83ce83b804f7dc2034fb51870365e337 Mon Sep 17 00:00:00 2001 From: chiyotsai Date: Wed, 3 May 2023 13:28:41 -0700 Subject: [PATCH 695/926] Fix clang warning on const-qualification of parameters Change-Id: I900a0a48dde5fcb262157b191ac536e18269feb3 --- test/vp9_quantize_test.cc | 4 ++-- vpx_dsp/arm/quantize_neon.c | 4 ++-- vpx_dsp/quantize.c | 4 ++-- vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 ++-- vpx_dsp/x86/quantize_avx.c | 4 ++-- vpx_dsp/x86/quantize_avx2.c | 4 ++-- vpx_dsp/x86/quantize_ssse3.c | 4 ++-- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 84a5a58e4e..5e3a7c2701 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -39,10 +39,10 @@ namespace { const int number_of_iterations = 100; typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count, - const macroblock_plane *const mb_plane, + const 
macroblock_plane *mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const struct ScanOrder *const scan_order); + const struct ScanOrder *scan_order); typedef std::tuple QuantizeParam; diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index cc8f623744..7232b81703 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -216,10 +216,10 @@ quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, // Main difference is that zbin values are halved before comparison and dqcoeff // values are divided by 2. zbin is rounded but dqcoeff is not. void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, - const struct macroblock_plane *const mb_plane, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct ScanOrder *const scan_order) { + const struct ScanOrder *scan_order) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index d44ced20dc..7dff8c7a87 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -211,10 +211,10 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #endif void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, - const struct macroblock_plane *const mb_plane, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct ScanOrder *const scan_order) { + const struct ScanOrder *scan_order) { const int n_coeffs = 32 * 32; const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index bde0115298..494d7ba5e2 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -725,14 +725,14 @@ () add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/; - add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; + add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *scan_order"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b neon sse2 avx2/; - add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t 
*dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; + add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *scan_order"; specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index d289bf6ebf..6837a5cf28 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -143,10 +143,10 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, - const struct macroblock_plane *const mb_plane, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct ScanOrder *const scan_order) { + const struct ScanOrder *scan_order) { const __m128i zero = _mm_setzero_si128(); const __m256i big_zero = _mm256_setzero_si256(); int index; diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c index 5421dcf0ba..3d97b3fdae 100644 --- a/vpx_dsp/x86/quantize_avx2.c +++ b/vpx_dsp/x86/quantize_avx2.c @@ -253,10 +253,10 @@ static VPX_FORCE_INLINE __m256i quantize_b_32x32_16( } void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, - const struct macroblock_plane *const mb_plane, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct ScanOrder *const scan_order) { + const struct ScanOrder *scan_order) { __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; __m256i v_eobmax = _mm256_setzero_si256(); intptr_t count; diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 556f4ca617..641f23298b 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -110,10 +110,10 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, - const struct macroblock_plane *const mb_plane, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct ScanOrder *const scan_order) { + const struct ScanOrder *scan_order) { const __m128i zero = _mm_setzero_si128(); int index; const int16_t *iscan = scan_order->iscan; From 8782fd070df84baaa1a7eafc2848de5ddd513078 Mon Sep 17 00:00:00 2001 From: chiyotsai Date: Wed, 3 May 2023 13:34:41 -0700 Subject: [PATCH 696/926] Fix mismatched param names in vpx_dsp/arm/highbd_avg_neon.c Change-Id: Ibf00a6e1029284e637b10ef01ac9b31ffadc74ca --- vpx_dsp/arm/highbd_avg_neon.c | 59 +++++++++++++++++------------------ 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/vpx_dsp/arm/highbd_avg_neon.c b/vpx_dsp/arm/highbd_avg_neon.c index fc10197d71..0abdc037be 100644 --- a/vpx_dsp/arm/highbd_avg_neon.c +++ b/vpx_dsp/arm/highbd_avg_neon.c @@ -16,18 +16,18 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" -uint32_t vpx_highbd_avg_4x4_neon(const uint8_t *a, int a_stride) { - const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a); - const uint16x8_t a0 = load_unaligned_u16q(a_ptr + 0 * a_stride, a_stride); - const uint16x8_t a1 = load_unaligned_u16q(a_ptr + 2 * a_stride, a_stride); +uint32_t vpx_highbd_avg_4x4_neon(const 
uint8_t *s8, int p) { + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8); + const uint16x8_t a0 = load_unaligned_u16q(a_ptr + 0 * p, p); + const uint16x8_t a1 = load_unaligned_u16q(a_ptr + 2 * p, p); return (horizontal_add_uint16x8(vaddq_u16(a0, a1)) + (1 << 3)) >> 4; } -uint32_t vpx_highbd_avg_8x8_neon(const uint8_t *a, int a_stride) { - const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a); +uint32_t vpx_highbd_avg_8x8_neon(const uint8_t *s8, int p) { + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8); uint16x8_t sum, a0, a1, a2, a3, a4, a5, a6, a7; - load_u16_8x8(a_ptr, a_stride, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + load_u16_8x8(a_ptr, p, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); sum = vaddq_u16(a0, a1); sum = vaddq_u16(sum, a2); @@ -63,29 +63,28 @@ int vpx_highbd_satd_neon(const tran_low_t *coeff, int length) { return (int)horizontal_add_int64x2(vaddq_s64(sum_s64[0], sum_s64[1])); } -void vpx_highbd_minmax_8x8_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, int *min, - int *max) { - const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a); - const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b); - - const uint16x8_t a0 = vld1q_u16(a_ptr + 0 * a_stride); - const uint16x8_t a1 = vld1q_u16(a_ptr + 1 * a_stride); - const uint16x8_t a2 = vld1q_u16(a_ptr + 2 * a_stride); - const uint16x8_t a3 = vld1q_u16(a_ptr + 3 * a_stride); - const uint16x8_t a4 = vld1q_u16(a_ptr + 4 * a_stride); - const uint16x8_t a5 = vld1q_u16(a_ptr + 5 * a_stride); - const uint16x8_t a6 = vld1q_u16(a_ptr + 6 * a_stride); - const uint16x8_t a7 = vld1q_u16(a_ptr + 7 * a_stride); - - const uint16x8_t b0 = vld1q_u16(b_ptr + 0 * b_stride); - const uint16x8_t b1 = vld1q_u16(b_ptr + 1 * b_stride); - const uint16x8_t b2 = vld1q_u16(b_ptr + 2 * b_stride); - const uint16x8_t b3 = vld1q_u16(b_ptr + 3 * b_stride); - const uint16x8_t b4 = vld1q_u16(b_ptr + 4 * b_stride); - const uint16x8_t b5 = vld1q_u16(b_ptr + 5 * b_stride); - const uint16x8_t b6 = vld1q_u16(b_ptr + 6 * b_stride); - const uint16x8_t b7 = vld1q_u16(b_ptr + 7 * b_stride); +void vpx_highbd_minmax_8x8_neon(const uint8_t *s8, int p, const uint8_t *d8, + int dp, int *min, int *max) { + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(d8); + + const uint16x8_t a0 = vld1q_u16(a_ptr + 0 * p); + const uint16x8_t a1 = vld1q_u16(a_ptr + 1 * p); + const uint16x8_t a2 = vld1q_u16(a_ptr + 2 * p); + const uint16x8_t a3 = vld1q_u16(a_ptr + 3 * p); + const uint16x8_t a4 = vld1q_u16(a_ptr + 4 * p); + const uint16x8_t a5 = vld1q_u16(a_ptr + 5 * p); + const uint16x8_t a6 = vld1q_u16(a_ptr + 6 * p); + const uint16x8_t a7 = vld1q_u16(a_ptr + 7 * p); + + const uint16x8_t b0 = vld1q_u16(b_ptr + 0 * dp); + const uint16x8_t b1 = vld1q_u16(b_ptr + 1 * dp); + const uint16x8_t b2 = vld1q_u16(b_ptr + 2 * dp); + const uint16x8_t b3 = vld1q_u16(b_ptr + 3 * dp); + const uint16x8_t b4 = vld1q_u16(b_ptr + 4 * dp); + const uint16x8_t b5 = vld1q_u16(b_ptr + 5 * dp); + const uint16x8_t b6 = vld1q_u16(b_ptr + 6 * dp); + const uint16x8_t b7 = vld1q_u16(b_ptr + 7 * dp); const uint16x8_t abs_diff0 = vabdq_u16(a0, b0); const uint16x8_t abs_diff1 = vabdq_u16(a1, b1); From 701392c1b09cd6db8298520caa360f0f7235d85b Mon Sep 17 00:00:00 2001 From: chiyotsai Date: Wed, 3 May 2023 13:38:46 -0700 Subject: [PATCH 697/926] Fix mismatched param names in vpx_dsp/arm/sad4d_neon.c Change-Id: If621944684cf9bb9f353db5961ed8b4b4ae38f24 --- vpx_dsp/arm/sad4d_neon.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git 
a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 44cd990280..3a548d0f9f 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -282,11 +282,12 @@ static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride, vst1q_u32(res, horizontal_add_4d_uint16x8(sum)); } -#define SAD_WXH_4D_NEON(w, h) \ - void vpx_sad##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \ - const uint8_t *const ref[4], int ref_stride, \ - uint32_t res[4]) { \ - sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, res, (h)); \ +#define SAD_WXH_4D_NEON(w, h) \ + void vpx_sad##w##x##h##x4d_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad##w##xhx4d_neon(src_ptr, src_stride, ref_array, ref_stride, sad_array, \ + (h)); \ } SAD_WXH_4D_NEON(4, 4) @@ -309,16 +310,17 @@ SAD_WXH_4D_NEON(64, 64) #undef SAD_WXH_4D_NEON -#define SAD_SKIP_WXH_4D_NEON(w, h) \ - void vpx_sad_skip_##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \ - const uint8_t *const ref[4], \ - int ref_stride, uint32_t res[4]) { \ - sad##w##xhx4d_neon(src, 2 * src_stride, ref, 2 * ref_stride, res, \ - ((h) >> 1)); \ - res[0] <<= 1; \ - res[1] <<= 1; \ - res[2] <<= 1; \ - res[3] <<= 1; \ +#define SAD_SKIP_WXH_4D_NEON(w, h) \ + void vpx_sad_skip_##w##x##h##x4d_neon( \ + const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], int ref_stride, \ + uint32_t sad_array[4]) { \ + sad##w##xhx4d_neon(src_ptr, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, ((h) >> 1)); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ } SAD_SKIP_WXH_4D_NEON(4, 4) From 174e782fe5c0f4b01c161eeb0500c88d84a26c42 Mon Sep 17 00:00:00 2001 From: chiyotsai Date: Wed, 3 May 2023 14:44:08 -0700 Subject: [PATCH 698/926] Fix mismatched param names in vpx_dsp/arm/highbd_sad4d_neon.c Change-Id: Ia4918eb0bac3b28b27e1ef205b9171680b2eb9a4 --- vpx_dsp/arm/highbd_sad4d_neon.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/vpx_dsp/arm/highbd_sad4d_neon.c b/vpx_dsp/arm/highbd_sad4d_neon.c index 62c4685a7a..a6684b0534 100644 --- a/vpx_dsp/arm/highbd_sad4d_neon.c +++ b/vpx_dsp/arm/highbd_sad4d_neon.c @@ -213,10 +213,11 @@ static INLINE void highbd_sad32xhx4d_neon(const uint8_t *src_ptr, } #define HBD_SAD_WXH_4D_NEON(w, h) \ - void vpx_highbd_sad##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \ - const uint8_t *const ref[4], \ - int ref_stride, uint32_t res[4]) { \ - highbd_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, res, (h)); \ + void vpx_highbd_sad##w##x##h##x4d_neon( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad##w##xhx4d_neon(src, src_stride, ref_array, ref_stride, \ + sad_array, (h)); \ } HBD_SAD_WXH_4D_NEON(4, 4) @@ -239,16 +240,16 @@ HBD_SAD_WXH_4D_NEON(64, 64) #undef HBD_SAD_WXH_4D_NEON -#define HBD_SAD_SKIP_WXH_4D_NEON(w, h) \ - void vpx_highbd_sad_skip_##w##x##h##x4d_neon( \ - const uint8_t *src, int src_stride, const uint8_t *const ref[4], \ - int ref_stride, uint32_t res[4]) { \ - highbd_sad##w##xhx4d_neon(src, 2 * src_stride, ref, 2 * ref_stride, res, \ - ((h) >> 1)); \ - res[0] <<= 1; \ - res[1] <<= 1; \ - res[2] <<= 1; \ - res[3] <<= 1; \ +#define HBD_SAD_SKIP_WXH_4D_NEON(w, h) \ + void vpx_highbd_sad_skip_##w##x##h##x4d_neon( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t 
sad_array[4]) { \ + highbd_sad##w##xhx4d_neon(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, ((h) >> 1)); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ } HBD_SAD_SKIP_WXH_4D_NEON(4, 4) From 2c03388231cf545e1245746a6ee4edfe0322e71e Mon Sep 17 00:00:00 2001 From: chiyotsai Date: Wed, 3 May 2023 14:45:13 -0700 Subject: [PATCH 699/926] Fix mismatched param names in vpx_dsp/x86/sad4d_avx2.c Change-Id: I226215a2ff8798b72abe0c2caf3d18875595caa5 --- vpx_dsp/x86/sad4d_avx2.c | 60 ++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c index c87fd3cd27..cf7111983b 100644 --- a/vpx_dsp/x86/sad4d_avx2.c +++ b/vpx_dsp/x86/sad4d_avx2.c @@ -135,45 +135,45 @@ static INLINE void sad64xhx4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums, sad_array); } -#define SAD64_H(h) \ - void vpx_sad64x##h##x4d_avx2(const uint8_t *src, int src_stride, \ - const uint8_t *const ref[4], int ref_stride, \ - uint32_t res[4]) { \ - sad64xhx4d_avx2(src, src_stride, ref, ref_stride, h, res); \ +#define SAD64_H(h) \ + void vpx_sad64x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad64xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \ } -#define SAD32_H(h) \ - void vpx_sad32x##h##x4d_avx2(const uint8_t *src, int src_stride, \ - const uint8_t *const ref[4], int ref_stride, \ - uint32_t res[4]) { \ - sad32xhx4d_avx2(src, src_stride, ref, ref_stride, h, res); \ +#define SAD32_H(h) \ + void vpx_sad32x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad32xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \ } SAD64_H(64) SAD32_H(32) -#define SADS64_H(h) \ - void vpx_sad_skip_64x##h##x4d_avx2(const uint8_t *src, int src_stride, \ - const uint8_t *const ref[4], \ - int ref_stride, uint32_t res[4]) { \ - sad64xhx4d_avx2(src, 2 * src_stride, ref, 2 * ref_stride, ((h) >> 1), \ - res); \ - res[0] <<= 1; \ - res[1] <<= 1; \ - res[2] <<= 1; \ - res[3] <<= 1; \ +#define SADS64_H(h) \ + void vpx_sad_skip_64x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad64xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + ((h) >> 1), sad_array); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ } -#define SADS32_H(h) \ - void vpx_sad_skip_32x##h##x4d_avx2(const uint8_t *src, int src_stride, \ - const uint8_t *const ref[4], \ - int ref_stride, uint32_t res[4]) { \ - sad32xhx4d_avx2(src, 2 * src_stride, ref, 2 * ref_stride, ((h) >> 1), \ - res); \ - res[0] <<= 1; \ - res[1] <<= 1; \ - res[2] <<= 1; \ - res[3] <<= 1; \ +#define SADS32_H(h) \ + void vpx_sad_skip_32x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad32xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + ((h) >> 1), sad_array); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ } SADS64_H(64) From de45e4b612bb576f76d35770afc62ae799e6c0fd Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 28 Apr 2023 14:32:47 -0400 Subject: [PATCH 700/926] Add codec control to export TPL stats new codec control: VP9E_GET_TPL_STATS 
with unit test Bug: b/273736974 Change-Id: I27343bd3f6dffafc86925234537bcdb557bc4079 --- test/encode_api_test.cc | 80 ++++++++++++++++++++++++++++++++++++- test/encode_test_driver.h | 5 +++ vp9/encoder/vp9_encoder.h | 1 + vp9/encoder/vp9_tpl_model.c | 2 + vp9/vp9_cx_iface.c | 18 +++++++++ vpx/vp8cx.h | 14 +++++++ vpx/vpx_encoder.h | 2 +- 7 files changed, 120 insertions(+), 2 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index ecdf928343..eac9626052 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -13,9 +13,12 @@ #include #include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/video_source.h" #include "./vpx_config.h" -#include "test/video_source.h" #include "vpx/vp8cx.h" #include "vpx/vpx_encoder.h" @@ -360,4 +363,79 @@ TEST(EncodeAPI, ConfigChangeThreadCount) { } } +#if CONFIG_VP9_ENCODER +class EncodeApiGetTplStatsTest + : public ::libvpx_test::EncoderTest, + public ::testing::TestWithParam { + public: + EncodeApiGetTplStatsTest() : EncoderTest(GetParam()) {} + ~EncodeApiGetTplStatsTest() override {} + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kTwoPassGood); + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP9E_SET_TPL, 1); + } + } + + vpx_codec_err_t AllocateTplList(TplFrameStats **data) { + // Allocate MAX_ARF_GOP_SIZE * sizeof(TplFrameStats) that will be filled + // by VP9E_GET_TPL_STATS + *data = static_cast(calloc(50, sizeof(TplFrameStats))); + if (*data == nullptr) return VPX_CODEC_MEM_ERROR; + return VPX_CODEC_OK; + } + + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { + ::libvpx_test::CxDataIterator iter = encoder->GetCxData(); + while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { + switch (pkt->kind) { + case VPX_CODEC_CX_FRAME_PKT: { + TplFrameStats *tpl_stats = NULL; + EXPECT_EQ(AllocateTplList(&tpl_stats), VPX_CODEC_OK); + encoder->Control(VP9E_GET_TPL_STATS, tpl_stats); + bool stats_not_all_zero = false; + for (unsigned int i = 0; i < cfg_.g_lag_in_frames; i++) { + if (tpl_stats[i].frame_width != 0) { + ASSERT_EQ(tpl_stats[i].frame_width, width_); + ASSERT_EQ(tpl_stats[i].frame_height, height_); + ASSERT_NE(tpl_stats[i].block_stats_list, nullptr); + stats_not_all_zero = true; + } + } + ASSERT_TRUE(stats_not_all_zero); + // Free the memory right away now as this is only a test. 
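+          // Note: each entry's block_stats_list points at encoder-owned
+          // memory (the control copies the internal pointer), so only the
+          // caller-allocated array itself is freed here.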
+ free(tpl_stats); + break; + } + default: break; + } + } + } + + int width_; + int height_; +}; + +TEST_P(EncodeApiGetTplStatsTest, GetTplStats) { + cfg_.g_lag_in_frames = 25; + width_ = 352; + height_ = 288; + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", width_, + height_, 30, 1, 0, 150); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +INSTANTIATE_TEST_SUITE_P( + VP9, EncodeApiGetTplStatsTest, + ::testing::Values( + static_cast(&libvpx_test::kVP9))); +#endif // CONFIG_VP9_ENCODER + } // namespace diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index b57df85291..27a78e68d2 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -153,6 +153,11 @@ class Encoder { const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); } + + void Control(int ctrl_id, TplFrameStats *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } #endif // CONFIG_VP9_ENCODER #if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 7c22c807b7..742cf0b6dd 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -505,6 +505,7 @@ typedef struct EncFrameBuf { } EncFrameBuf; // Maximum operating frame buffer size needed for a GOP using ARF reference. +// This is used to allocate the memory for TPL stats for a GOP. #define MAX_ARF_GOP_SIZE (2 * MAX_LAG_BUFFERS) #define MAX_KMEANS_GROUPS 8 diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index dbd7482b0d..b62c66b6ce 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -1134,6 +1134,8 @@ static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, const int mi_height = num_8x8_blocks_high_lookup[bsize]; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + tpl_frame_stats_before_propagation->frame_width = cm->width; + tpl_frame_stats_before_propagation->frame_height = cm->height; // Setup scaling factor #if CONFIG_VP9_HIGHBITDEPTH vp9_setup_scale_factors_for_frame( diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 4c7eaed725..8bd880c7b5 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1788,6 +1788,23 @@ static vpx_codec_err_t ctrl_get_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_get_tpl_stats(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + TplFrameStats *data = va_arg(args, TplFrameStats *); + int i; + if (data == NULL) { + return VPX_CODEC_INVALID_PARAM; + } + for (i = 0; i < MAX_ARF_GOP_SIZE; i++) { + data[i].frame_width = cpi->tpl_frame_stats[i].frame_width; + data[i].frame_height = cpi->tpl_frame_stats[i].frame_height; + data[i].block_stats_list = cpi->tpl_frame_stats[i].block_stats_list; + } + + return VPX_CODEC_OK; +} + static vpx_codec_err_t ctrl_set_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, va_list args) { VP9_COMP *const cpi = ctx->cpi; @@ -2035,6 +2052,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP9E_GET_ACTIVEMAP, ctrl_get_active_map }, { VP9E_GET_LEVEL, ctrl_get_level }, { VP9E_GET_SVC_REF_FRAME_CONFIG, ctrl_get_svc_ref_frame_config }, + { VP9E_GET_TPL_STATS, ctrl_get_tpl_stats }, { -1, NULL }, }; diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index e0b679fbb7..123a645d91 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -767,6 +767,18 @@ enum vp8e_enc_control_id { * */ VP9E_SET_QUANTIZER_ONE_PASS, + + 
/*!\brief Codec control to get TPL stats for the current GOP. + * + * Allocation and free of memory of size MAX_ARF_GOP_SIZE (50) * + * sizeof(TplFrameStats) should be done by applications. + * + * VPX_CODEC_INVALID_PARAM will be returned if the pointer passed in is NULL. + * + * Supported in codecs: VP9 + * + */ + VP9E_GET_TPL_STATS, }; /*!\brief vpx 1-D scaling mode @@ -1097,6 +1109,8 @@ VPX_CTRL_USE_TYPE(VP8E_SET_RTC_EXTERNAL_RATECTRL, int) #define VPX_CTRL_VP8E_SET_RTC_EXTERNAL_RATECTRL VPX_CTRL_USE_TYPE(VP9E_SET_QUANTIZER_ONE_PASS, int) #define VPX_CTRL_VP9E_SET_QUANTIZER_ONE_PASS +VPX_CTRL_USE_TYPE(VP9E_GET_TPL_STATS, void *) +#define VPX_CTRL_VP9E_GET_TPL_STATS /*!\endcond */ /*! @} - end defgroup vp8_encoder */ diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index 9247231328..1910a19040 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -58,7 +58,7 @@ extern "C" { * fields to structures */ #define VPX_ENCODER_ABI_VERSION \ - (15 + VPX_CODEC_ABI_VERSION + \ + (16 + VPX_CODEC_ABI_VERSION + \ VPX_EXT_RATECTRL_ABI_VERSION) /**<\hideinitializer*/ /*! \brief Encoder capabilities bitfield From f059f9ee2df114ffa475aaef064f93396ebf39cc Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 4 May 2023 10:32:03 -0400 Subject: [PATCH 701/926] Add Vpx* prefix to Tpl{Block,Frame}Stats This is to avoid symbol redefinition when integrating with other libraries. Bug: b/273736974 Change-Id: I891af78b1907504d5bb9f735164aea18c2aba944 --- test/encode_api_test.cc | 9 +++++---- test/encode_test_driver.h | 2 +- vp9/encoder/vp9_encoder.h | 2 +- vp9/encoder/vp9_tpl_model.c | 8 ++++---- vp9/vp9_cx_iface.c | 2 +- vpx/vp8cx.h | 2 +- vpx/vpx_encoder.h | 10 +++++----- 7 files changed, 18 insertions(+), 17 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index eac9626052..0514cd828f 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -384,10 +384,11 @@ class EncodeApiGetTplStatsTest } } - vpx_codec_err_t AllocateTplList(TplFrameStats **data) { - // Allocate MAX_ARF_GOP_SIZE * sizeof(TplFrameStats) that will be filled + vpx_codec_err_t AllocateTplList(VpxTplFrameStats **data) { + // Allocate MAX_ARF_GOP_SIZE * sizeof(VpxTplFrameStats) that will be filled // by VP9E_GET_TPL_STATS - *data = static_cast<TplFrameStats *>(calloc(50, sizeof(TplFrameStats))); + *data = + static_cast<VpxTplFrameStats *>(calloc(50, sizeof(VpxTplFrameStats))); if (*data == nullptr) return VPX_CODEC_MEM_ERROR; return VPX_CODEC_OK; } @@ -397,7 +398,7 @@ class EncodeApiGetTplStatsTest while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { switch (pkt->kind) { case VPX_CODEC_CX_FRAME_PKT: { - TplFrameStats *tpl_stats = NULL; + VpxTplFrameStats *tpl_stats = NULL; EXPECT_EQ(AllocateTplList(&tpl_stats), VPX_CODEC_OK); encoder->Control(VP9E_GET_TPL_STATS, tpl_stats); bool stats_not_all_zero = false; diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index 27a78e68d2..a5cd8306ef 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -154,7 +154,7 @@ class Encoder { ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); } - void Control(int ctrl_id, TplFrameStats *arg) { + void Control(int ctrl_id, VpxTplFrameStats *arg) { const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); } diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 742cf0b6dd..cca1617830 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -745,7 +745,7 @@ typedef struct VP9_COMP { BLOCK_SIZE tpl_bsize; TplDepFrame
tpl_stats[MAX_ARF_GOP_SIZE]; // Used to store TPL stats before propagation - TplFrameStats tpl_frame_stats[MAX_ARF_GOP_SIZE]; + VpxTplFrameStats tpl_frame_stats[MAX_ARF_GOP_SIZE]; YV12_BUFFER_CONFIG *tpl_recon_frames[REF_FRAMES]; EncFrameBuf enc_frame_buf[REF_FRAMES]; #if CONFIG_MULTITHREAD diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index b62c66b6ce..ea5d61e326 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -154,7 +154,7 @@ static void init_tpl_stats(VP9_COMP *cpi) { int frame_idx; for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - TplFrameStats *tpl_frame_stats = &cpi->tpl_frame_stats[frame_idx]; + VpxTplFrameStats *tpl_frame_stats = &cpi->tpl_frame_stats[frame_idx]; memset(tpl_frame->tpl_stats_ptr, 0, tpl_frame->height * tpl_frame->width * sizeof(*tpl_frame->tpl_stats_ptr)); @@ -356,7 +356,7 @@ static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col, } } -static void tpl_store_before_propagation(TplBlockStats *tpl_block_stats, +static void tpl_store_before_propagation(VpxTplBlockStats *tpl_block_stats, TplDepStats *tpl_stats, int mi_row, int mi_col, BLOCK_SIZE bsize, int stride, int64_t recon_error, @@ -368,7 +368,7 @@ static void tpl_store_before_propagation(TplBlockStats *tpl_block_stats, for (idy = 0; idy < mi_height; ++idy) { for (idx = 0; idx < mi_width; ++idx) { - TplBlockStats *tpl_block_stats_ptr = + VpxTplBlockStats *tpl_block_stats_ptr = &tpl_block_stats[(mi_row + idy) * stride + mi_col + idx]; tpl_block_stats_ptr->inter_cost = src_stats->inter_cost; tpl_block_stats_ptr->intra_cost = src_stats->intra_cost; @@ -1105,7 +1105,7 @@ static void build_motion_field( static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, int frame_idx, BLOCK_SIZE bsize) { TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - TplFrameStats *tpl_frame_stats_before_propagation = + VpxTplFrameStats *tpl_frame_stats_before_propagation = &cpi->tpl_frame_stats[frame_idx]; YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame; YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES] = { NULL, NULL, NULL }; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 8bd880c7b5..66efba181f 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1791,7 +1791,7 @@ static vpx_codec_err_t ctrl_get_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t ctrl_get_tpl_stats(vpx_codec_alg_priv_t *ctx, va_list args) { VP9_COMP *const cpi = ctx->cpi; - TplFrameStats *data = va_arg(args, TplFrameStats *); + VpxTplFrameStats *data = va_arg(args, VpxTplFrameStats *); int i; if (data == NULL) { return VPX_CODEC_INVALID_PARAM; diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 123a645d91..c4e04084c8 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -771,7 +771,7 @@ enum vp8e_enc_control_id { /*!\brief Codec control to get TPL stats for the current GOP. * * Allocation and free of memory of size MAX_ARF_GOP_SIZE (50) * - * sizeof(TplFrameStats) should be done by applications. + * sizeof(VpxTplFrameStats) should be done by applications. * * VPX_CODEC_INVALID_PARAM will be returned if the pointer passed in is NULL. 
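 *
 * A minimal usage sketch (hypothetical application code; 50 is the
 * documented MAX_ARF_GOP_SIZE):
 * \code
 *   VpxTplFrameStats *stats =
 *       (VpxTplFrameStats *)calloc(50, sizeof(*stats));
 *   if (stats != NULL &&
 *       vpx_codec_control(&ctx, VP9E_GET_TPL_STATS, stats) == VPX_CODEC_OK) {
 *     // Entries with a nonzero frame_width hold valid stats; their
 *     // block_stats_list pointers reference encoder-owned memory.
 *   }
 *   free(stats);
 * \endcode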
* diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index 1910a19040..66c5a68b86 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -253,7 +253,7 @@ enum vpx_kf_mode { }; /*!\brief Temporal dependency model stats for each block before propagation */ -typedef struct TplBlockStats { +typedef struct VpxTplBlockStats { int64_t intra_cost; /**< Intra cost */ int64_t inter_cost; /**< Inter cost */ int16_t mv_r; /**< Motion vector row */ @@ -261,15 +261,15 @@ typedef struct TplBlockStats { int64_t recrf_rate; /**< Rate from reconstructed ref frame */ int64_t recrf_dist; /**< Distortion from reconstructed ref frame */ int ref_frame_index; /**< Ref frame index */ -} TplBlockStats; +} VpxTplBlockStats; /*!\brief Temporal dependency model stats for each frame before propagation */ -typedef struct TplFrameStats { +typedef struct VpxTplFrameStats { int frame_width; /**< Frame width */ int frame_height; /**< Frame height */ // Size of the list can be calculated from frame_width and frame_height. - TplBlockStats *block_stats_list; /**< List of tpl stats for each block */ -} TplFrameStats; + VpxTplBlockStats *block_stats_list; /**< List of tpl stats for each block */ +} VpxTplFrameStats; /*!\brief Encoded Frame Flags * From 2e5261647f0440e78081f9bfaaaa5ef9d10140d6 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 4 May 2023 10:59:46 -0400 Subject: [PATCH 702/926] Add num_blocks to VpxTplFrameStats I realized the calculation of the size of the list of VpxTplBlockStats is non-trivial. So it's better to add the field for the size. Bug: b/273736974 Change-Id: Ic1b50597c1f89a8f866b5669ca676407be6dc9d8 --- test/encode_api_test.cc | 1 + vp9/encoder/vp9_tpl_model.c | 1 + vp9/vp9_cx_iface.c | 1 + vpx/vpx_encoder.h | 2 +- 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 0514cd828f..e435ed872f 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -406,6 +406,7 @@ class EncodeApiGetTplStatsTest if (tpl_stats[i].frame_width != 0) { ASSERT_EQ(tpl_stats[i].frame_width, width_); ASSERT_EQ(tpl_stats[i].frame_height, height_); + ASSERT_GT(tpl_stats[i].num_blocks, 0); ASSERT_NE(tpl_stats[i].block_stats_list, nullptr); stats_not_all_zero = true; } diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index ea5d61e326..ed771dcb4b 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -1354,6 +1354,7 @@ void vp9_init_tpl_buffer(VP9_COMP *cpi) { cm, cpi->tpl_frame_stats[frame].block_stats_list, vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->tpl_frame_stats[frame].block_stats_list))); + cpi->tpl_frame_stats[frame].num_blocks = mi_rows * mi_cols; cpi->tpl_stats[frame].is_valid = 0; cpi->tpl_stats[frame].width = mi_cols; cpi->tpl_stats[frame].height = mi_rows; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 66efba181f..e264ae9bd9 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1799,6 +1799,7 @@ static vpx_codec_err_t ctrl_get_tpl_stats(vpx_codec_alg_priv_t *ctx, for (i = 0; i < MAX_ARF_GOP_SIZE; i++) { data[i].frame_width = cpi->tpl_frame_stats[i].frame_width; data[i].frame_height = cpi->tpl_frame_stats[i].frame_height; + data[i].num_blocks = cpi->tpl_frame_stats[i].num_blocks; data[i].block_stats_list = cpi->tpl_frame_stats[i].block_stats_list; } diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index 66c5a68b86..a7f1552de0 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -267,7 +267,7 @@ typedef struct VpxTplBlockStats { typedef struct VpxTplFrameStats { int 
frame_width; /**< Frame width */ int frame_height; /**< Frame height */ - // Size of the list can be calculated from frame_width and frame_height. + int num_blocks; /**< Number of blocks. Size of block_stats_list */ VpxTplBlockStats *block_stats_list; /**< List of tpl stats for each block */ } VpxTplFrameStats; From 4e23e7abfe02f7030f193327bc9adacfefaef991 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 4 May 2023 17:17:10 -0700 Subject: [PATCH 703/926] vpx_subpixel_8t_intrin_avx2,cosmetics: shorten long comment Change-Id: I8badedc2ad07d60896e45de28b707ad9f6c4d499 --- vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 9ff67bd301..2498bba173 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -1078,7 +1078,7 @@ static void vpx_filter_block1d4_h8_avx2( // f4|f7 f6 f5 f4|f7 f6 f5 f4 secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55); - // s6 s5 s4 s3 s5 s4 s3 s2 s4 s3 s2 s1 s3 s2 s1 s0 | s6 s5 s4 s3 s5 s4 s3 + // s6 s5 s4 s3 s5 s4 s3 s2 s4 s3 s2 s1 s3 s2 s1 s0 | s6 s5 s4 s3 s5 s4 s3 // s2 s4 s3 s2 s1 s3 s2 s1 s0 filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2); @@ -1102,7 +1102,7 @@ static void vpx_filter_block1d4_h8_avx2( srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); // multiply 4 adjacent elements with the filter and add the result - // .....|f3*r14+f2*r13|f1*r13+f0*r12|f3*r13+f2*r12|f1*r11+f0*r10||......... + // ...|f3*r14+f2*r13|f1*r13+f0*r12|f3*r13+f2*r12|f1*r11+f0*r10||... // |f1*r03+f0*r02|f3*r04+f2*r03|f1*r02+f0*r01|f3*r03+f2*r02|f1*r01+f0*r00 srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); From 601a98b1542fa1fb439c715a9e2e8559338d33f8 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 23 Mar 2023 14:50:10 -0700 Subject: [PATCH 704/926] Add comments about vpx_codec_enc_init_ver failure Address the questions: 1. If vpx_codec_enc_init_ver() fails, should I still call vpx_codec_destroy() on the encoder context? 2. Is it safe to call vpx_codec_error_detail() when vpx_codec_enc_init_ver() failed? Change-Id: I1b0e090d11dd9f853fe203f4cbb6080c3c7b0506 --- vp9/vp9_cx_iface.c | 2 ++ vpx/src/vpx_encoder.c | 4 ++++ vpx/vpx_encoder.h | 5 ++++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index e264ae9bd9..f067efdf79 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -129,6 +129,8 @@ struct vpx_codec_alg_priv { BufferPool *buffer_pool; }; +// Called by encoder_set_config() and encoder_encode() only. Must not be called +// by encoder_init(). static vpx_codec_err_t update_error_state( vpx_codec_alg_priv_t *ctx, const struct vpx_internal_error_info *error) { const vpx_codec_err_t res = error->error_code; diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c index 846638fe55..0d6e48015a 100644 --- a/vpx/src/vpx_encoder.c +++ b/vpx/src/vpx_encoder.c @@ -54,6 +54,10 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, res = ctx->iface->init(ctx, NULL); if (res) { + // IMPORTANT: ctx->priv->err_detail must be null or point to a string + // that remains valid after ctx->priv is destroyed, such as a C string + // literal. This makes it safe to call vpx_codec_error_detail() after + // vpx_codec_enc_init_ver() failed. ctx->err_detail = ctx->priv ? 
ctx->priv->err_detail : NULL; vpx_codec_destroy(ctx); } diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index a7f1552de0..2de8089736 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -877,7 +877,7 @@ typedef struct vpx_svc_parameters { /*!\brief Initialize an encoder instance * - * Initializes a encoder context using the given interface. Applications + * Initializes an encoder context using the given interface. Applications * should call the vpx_codec_enc_init convenience macro instead of this * function directly, to ensure that the ABI version number parameter * is properly initialized. @@ -886,6 +886,9 @@ typedef struct vpx_svc_parameters { * is not thread safe and should be guarded with a lock if being used * in a multithreaded context. * + * If vpx_codec_enc_init_ver() fails, it is not necessary to call + * vpx_codec_destroy() on the encoder context. + * * \param[in] ctx Pointer to this instance's context. * \param[in] iface Pointer to the algorithm interface to use. * \param[in] cfg Configuration to use, if known. May be NULL. From 8e47341b0ea4bf2a37f968cf260d6dbfd1f0062a Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Tue, 28 Mar 2023 18:07:41 -0700 Subject: [PATCH 705/926] Have vpx_codec_error take const vpx_codec_ctx_t * Also have vpx_codec_error_detail take vpx_codec_ctx_t *. Both functions are getter functions that don't modify the codec context. Change-Id: I4689022425efbf7b1da5034255ac052fce5e5b4f --- vpx/src/vpx_codec.c | 6 +++--- vpx/vpx_codec.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vpx/src/vpx_codec.c b/vpx/src/vpx_codec.c index 114b94e194..24528d860a 100644 --- a/vpx/src/vpx_codec.c +++ b/vpx/src/vpx_codec.c @@ -50,12 +50,12 @@ const char *vpx_codec_err_to_string(vpx_codec_err_t err) { return "Unrecognized error code"; } -const char *vpx_codec_error(vpx_codec_ctx_t *ctx) { +const char *vpx_codec_error(const vpx_codec_ctx_t *ctx) { return (ctx) ? vpx_codec_err_to_string(ctx->err) : vpx_codec_err_to_string(VPX_CODEC_INVALID_PARAM); } -const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx) { +const char *vpx_codec_error_detail(const vpx_codec_ctx_t *ctx) { if (ctx && ctx->err) return ctx->priv ? ctx->priv->err_detail : ctx->err_detail; @@ -82,7 +82,7 @@ vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx) { } vpx_codec_caps_t vpx_codec_get_caps(vpx_codec_iface_t *iface) { - return (iface) ? iface->caps : 0; + return iface ? iface->caps : 0; } vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...) { diff --git a/vpx/vpx_codec.h b/vpx/vpx_codec.h index 11bf8aaa22..ca18d90cb7 100644 --- a/vpx/vpx_codec.h +++ b/vpx/vpx_codec.h @@ -318,7 +318,7 @@ const char *vpx_codec_err_to_string(vpx_codec_err_t err); * \param[in] ctx Pointer to this instance's context. * */ -const char *vpx_codec_error(vpx_codec_ctx_t *ctx); +const char *vpx_codec_error(const vpx_codec_ctx_t *ctx); /*!\brief Retrieve detailed error information for codec context * @@ -330,7 +330,7 @@ const char *vpx_codec_error(vpx_codec_ctx_t *ctx); * \retval NULL * No detailed information is available. */ -const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx); +const char *vpx_codec_error_detail(const vpx_codec_ctx_t *ctx); /* REQUIRED FUNCTIONS * From 3d6b86e7045481c55b35d0daa4f19202bbe99dc1 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Fri, 24 Mar 2023 11:32:36 -0700 Subject: [PATCH 706/926] Overwrite cm->error->detail before freeing Help detect use after free of the return value of vpx_codec_error_detail(). 
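For illustration, a hypothetical caller with the bug this change makes
visible:

    const char *detail = vpx_codec_error_detail(&ctx);
    vpx_codec_destroy(&ctx);
    printf("%s\n", detail);  // Use after free: the string now reads "AAA..."
                             // (if the freed memory has not been reused).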
If vpx_codec_error_detail() is called after vpx_codec_encode() fails, the return value may be equal to cm->error->detail, which is freed when vpx_codec_destroy() is called. Document the lifetime of the string returned by vpx_codec_error_detail(). Change-Id: I8089e90a4499b4f3cc5b9cfdbb25d72368faa319 --- vp9/encoder/vp9_encoder.c | 5 +++++ vpx/vpx_codec.h | 4 +++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 662ec24b83..f76eec2b57 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -12,6 +12,7 @@ #include #include #include +#include <string.h> #include "./vp9_rtcd.h" #include "./vpx_config.h" @@ -2873,6 +2874,10 @@ void vp9_remove_compressor(VP9_COMP *cpi) { vp9_extrc_delete(&cpi->ext_ratectrl); + // Help detect use after free of the error detail string. + memset(cm->error.detail, 'A', sizeof(cm->error.detail) - 1); + cm->error.detail[sizeof(cm->error.detail) - 1] = '\0'; + vp9_remove_common(cm); vp9_free_ref_frame_buffers(cm->buffer_pool); #if CONFIG_VP9_POSTPROC @@ -323,7 +323,9 @@ const char *vpx_codec_error(const vpx_codec_ctx_t *ctx); /*!\brief Retrieve detailed error information for codec context * * Returns a human readable string providing detailed information about - * the last error. + * the last error. The returned string is only valid until the next + * vpx_codec_* function call (except vpx_codec_error and + * vpx_codec_error_detail) on the codec context. * * \param[in] ctx Pointer to this instance's context. * From 255ee1888589aa15ae909b992fe123c0358b1730 Mon Sep 17 00:00:00 2001 From: Anupam Pandey Date: Tue, 18 Apr 2023 14:46:56 +0530 Subject: [PATCH 707/926] Add AVX2 intrinsic for idct16x16 and idct32x32 functions

Added AVX2 intrinsic optimization for the following functions:
1. vpx_idct16x16_256_add
2. vpx_idct32x32_1024_add
3. vpx_idct32x32_135_add

The module level scaling w.r.t. the C function (timer based) for the
existing (SSE2) and new (AVX2) intrinsics:

                             Scaling
  Function Name            SSE2    AVX2
  vpx_idct32x32_1024_add   3.62x   7.49x
  vpx_idct32x32_135_add    4.85x   9.41x
  vpx_idct16x16_256_add    4.82x   7.70x

This is a bit-exact change.
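For reference, the scaling figures above come from the timer-based speed
tests added in this change; the measurement loop is essentially the
following sketch (simplified from the test code):

    vpx_usec_timer timer;
    vpx_usec_timer_start(&timer);
    for (i = 0; i < count_test_block; ++i) ref_txfm(coeff, ref, pitch);
    vpx_usec_timer_mark(&timer);
    c_time = vpx_usec_timer_elapsed(&timer);
    // The same loop is timed for the SIMD version; gain = c_time / simd_time.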
Change-Id: Id9dda933aa1f5093bb6b35ac3b8a41846afca9d2 --- test/dct16x16_test.cc | 98 +++++- test/dct32x32_test.cc | 197 +++++++++++ vp9/common/vp9_idct.c | 2 + vp9/decoder/vp9_decoder.c | 2 +- vp9/decoder/vp9_decoder.h | 2 +- vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +- vpx_dsp/x86/inv_txfm_avx2.c | 626 +++++++++++++++++++++++++++++++++++ 8 files changed, 926 insertions(+), 8 deletions(-) create mode 100644 vpx_dsp/x86/inv_txfm_avx2.c diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc index 3c104f3a44..4ad2263cfc 100644 --- a/test/dct16x16_test.cc +++ b/test/dct16x16_test.cc @@ -592,7 +592,7 @@ class Trans16x16TestBase { const int count_test_block = 10000; const int eob = 10; const int16_t *scan = vp9_default_scan_orders[TX_16X16].scan; - DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]); + DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]); DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]); #if CONFIG_VP9_HIGHBITDEPTH @@ -643,6 +643,80 @@ class Trans16x16TestBase { } } + void RunInvTrans16x16SpeedTest(IdctFunc ref_txfm, int thresh) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 10000; + const int eob = 10; + const int16_t *scan = vp9_default_scan_orders[TX_16X16].scan; + int64_t c_sum_time = 0; + int64_t simd_sum_time = 0; + DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); + DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]); +#endif // CONFIG_VP9_HIGHBITDEPTH + + for (int j = 0; j < kNumCoeffs; ++j) { + if (j < eob) { + // Random values less than the threshold, either positive or negative + coeff[scan[j]] = rnd(thresh); + } else { + coeff[scan[j]] = 0; + } + if (bit_depth_ == VPX_BITS_8) { + dst[j] = 0; + ref[j] = 0; +#if CONFIG_VP9_HIGHBITDEPTH + } else { + dst16[j] = 0; + ref16[j] = 0; +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + + if (bit_depth_ == VPX_BITS_8) { + vpx_usec_timer timer_c; + vpx_usec_timer_start(&timer_c); + for (int i = 0; i < count_test_block; ++i) { + ref_txfm(coeff, ref, pitch_); + } + vpx_usec_timer_mark(&timer_c); + c_sum_time += vpx_usec_timer_elapsed(&timer_c); + + vpx_usec_timer timer_mod; + vpx_usec_timer_start(&timer_mod); + for (int i = 0; i < count_test_block; ++i) { + RunInvTxfm(coeff, dst, pitch_); + } + vpx_usec_timer_mark(&timer_mod); + simd_sum_time += vpx_usec_timer_elapsed(&timer_mod); + } else { +#if CONFIG_VP9_HIGHBITDEPTH + vpx_usec_timer timer_c; + vpx_usec_timer_start(&timer_c); + for (int i = 0; i < count_test_block; ++i) { + ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_); + } + vpx_usec_timer_mark(&timer_c); + c_sum_time += vpx_usec_timer_elapsed(&timer_c); + + vpx_usec_timer timer_mod; + vpx_usec_timer_start(&timer_mod); + for (int i = 0; i < count_test_block; ++i) { + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_); + } + vpx_usec_timer_mark(&timer_mod); + simd_sum_time += vpx_usec_timer_elapsed(&timer_mod); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + printf( + "c_time = %" PRId64 " \t simd_time = %" PRId64 " \t Gain = %4.2f \n", + c_sum_time, simd_sum_time, + (static_cast(c_sum_time) / static_cast(simd_sum_time))); + } + int pitch_; int tx_type_; vpx_bit_depth_t bit_depth_; @@ -755,7 +829,6 @@ TEST_P(Trans16x16HT, QuantCheck) { RunQuantCheck(429, 729); } -#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE class InvTrans16x16DCT : public 
Trans16x16TestBase, public ::testing::TestWithParam { public: @@ -786,7 +859,10 @@ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InvTrans16x16DCT); TEST_P(InvTrans16x16DCT, CompareReference) { CompareInvReference(ref_txfm_, thresh_); } -#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +TEST_P(InvTrans16x16DCT, DISABLED_Speed) { + RunInvTrans16x16SpeedTest(ref_txfm_, thresh_); +} using std::make_tuple; @@ -828,6 +904,12 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8), make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8), make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P(C, InvTrans16x16DCT, + ::testing::Values(make_tuple(&vpx_idct16x16_256_add_c, + &vpx_idct16x16_256_add_c, + 6225, VPX_BITS_8))); + #endif // CONFIG_VP9_HIGHBITDEPTH #if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE @@ -862,6 +944,11 @@ INSTANTIATE_TEST_SUITE_P( 2, VPX_BITS_8), make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3, VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P(SSE2, InvTrans16x16DCT, + ::testing::Values(make_tuple( + &vpx_idct16x16_256_add_c, + &vpx_idct16x16_256_add_sse2, 6225, VPX_BITS_8))); #endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE @@ -869,6 +956,11 @@ INSTANTIATE_TEST_SUITE_P( AVX2, Trans16x16DCT, ::testing::Values(make_tuple(&vpx_fdct16x16_avx2, &vpx_idct16x16_256_add_sse2, 0, VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P(AVX2, InvTrans16x16DCT, + ::testing::Values(make_tuple( + &vpx_idct16x16_256_add_c, + &vpx_idct16x16_256_add_avx2, 6225, VPX_BITS_8))); #endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc index 91bb8e01ea..1167038b5f 100644 --- a/test/dct32x32_test.cc +++ b/test/dct32x32_test.cc @@ -24,10 +24,12 @@ #include "test/register_state_check.h" #include "test/util.h" #include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_scan.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" #include "vpx_ports/msvc.h" // for round() +#include "vpx_ports/vpx_timer.h" using libvpx_test::ACMRandom; @@ -71,6 +73,9 @@ typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride); typedef std::tuple Trans32x32Param; +typedef std::tuple + InvTrans32x32Param; + #if CONFIG_VP9_HIGHBITDEPTH void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) { vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); @@ -314,6 +319,174 @@ TEST_P(Trans32x32Test, InverseAccuracy) { } } +class InvTrans32x32Test : public ::testing::TestWithParam { + public: + virtual ~InvTrans32x32Test() {} + virtual void SetUp() { + ref_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + version_ = GET_PARAM(2); // 0: high precision forward transform + // 1: low precision version for rd loop + bit_depth_ = GET_PARAM(3); + eob_ = GET_PARAM(4); + thresh_ = GET_PARAM(4); + mask_ = (1 << bit_depth_) - 1; + pitch_ = 32; + } + + virtual void TearDown() { libvpx_test::ClearSystemState(); } + + protected: + void RunRefTxfm(tran_low_t *out, uint8_t *dst, int stride) { + ref_txfm_(out, dst, stride); + } + void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) { + inv_txfm_(out, dst, stride); + } + int version_; + vpx_bit_depth_t bit_depth_; + int mask_; + int eob_; + 
int thresh_; + + InvTxfmFunc ref_txfm_; + InvTxfmFunc inv_txfm_; + int pitch_; + + void RunInvTrans32x32SpeedTest() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 10000; + int64_t c_sum_time = 0; + int64_t simd_sum_time = 0; + const int16_t *scan = vp9_default_scan_orders[TX_32X32].scan; + DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); + DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]); +#endif // CONFIG_VP9_HIGHBITDEPTH + + for (int j = 0; j < kNumCoeffs; ++j) { + if (j < eob_) { + // Random values less than the threshold, either positive or negative + coeff[scan[j]] = rnd(thresh_); + } else { + coeff[scan[j]] = 0; + } + if (bit_depth_ == VPX_BITS_8) { + dst[j] = 0; + ref[j] = 0; +#if CONFIG_VP9_HIGHBITDEPTH + } else { + dst16[j] = 0; + ref16[j] = 0; +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + + if (bit_depth_ == VPX_BITS_8) { + vpx_usec_timer timer_c; + vpx_usec_timer_start(&timer_c); + for (int i = 0; i < count_test_block; ++i) { + RunRefTxfm(coeff, ref, pitch_); + } + vpx_usec_timer_mark(&timer_c); + c_sum_time += vpx_usec_timer_elapsed(&timer_c); + + vpx_usec_timer timer_mod; + vpx_usec_timer_start(&timer_mod); + for (int i = 0; i < count_test_block; ++i) { + RunInvTxfm(coeff, dst, pitch_); + } + vpx_usec_timer_mark(&timer_mod); + simd_sum_time += vpx_usec_timer_elapsed(&timer_mod); + } else { +#if CONFIG_VP9_HIGHBITDEPTH + vpx_usec_timer timer_c; + vpx_usec_timer_start(&timer_c); + for (int i = 0; i < count_test_block; ++i) { + RunRefTxfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_); + } + vpx_usec_timer_mark(&timer_c); + c_sum_time += vpx_usec_timer_elapsed(&timer_c); + + vpx_usec_timer timer_mod; + vpx_usec_timer_start(&timer_mod); + for (int i = 0; i < count_test_block; ++i) { + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_); + } + vpx_usec_timer_mark(&timer_mod); + simd_sum_time += vpx_usec_timer_elapsed(&timer_mod); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + printf( + "c_time = %" PRId64 " \t simd_time = %" PRId64 " \t Gain = %4.2f \n", + c_sum_time, simd_sum_time, + (static_cast(c_sum_time) / static_cast(simd_sum_time))); + } + + void CompareInvReference32x32() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 10000; + const int eob = 31; + const int16_t *scan = vp9_default_scan_orders[TX_32X32].scan; + DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); + DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]); +#endif // CONFIG_VP9_HIGHBITDEPTH + + for (int i = 0; i < count_test_block; ++i) { + for (int j = 0; j < kNumCoeffs; ++j) { + if (j < eob) { + coeff[scan[j]] = rnd.Rand8Extremes(); + } else { + coeff[scan[j]] = 0; + } + if (bit_depth_ == VPX_BITS_8) { + dst[j] = 0; + ref[j] = 0; +#if CONFIG_VP9_HIGHBITDEPTH + } else { + dst16[j] = 0; + ref16[j] = 0; +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + if (bit_depth_ == VPX_BITS_8) { + RunRefTxfm(coeff, ref, pitch_); + RunInvTxfm(coeff, dst, pitch_); + } else { +#if CONFIG_VP9_HIGHBITDEPTH + RunRefTxfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_); + ASM_REGISTER_STATE_CHECK( + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_)); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + for (int j = 0; j < kNumCoeffs; ++j) { +#if 
CONFIG_VP9_HIGHBITDEPTH + const uint32_t diff = + bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j]; +#else + const uint32_t diff = dst[j] - ref[j]; +#endif // CONFIG_VP9_HIGHBITDEPTH + const uint32_t error = diff * diff; + EXPECT_EQ(0u, error) << "Error: 32x32 IDCT Comparison has error " + << error << " at index " << j; + } + } + } +}; + +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InvTrans32x32Test); + +TEST_P(InvTrans32x32Test, DISABLED_Speed) { RunInvTrans32x32SpeedTest(); } +TEST_P(InvTrans32x32Test, CompareReference) { CompareInvReference32x32(); } + using std::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH @@ -334,6 +507,14 @@ INSTANTIATE_TEST_SUITE_P( VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_c, &vpx_idct32x32_1024_add_c, 1, VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P( + C, InvTrans32x32Test, + ::testing::Values( + (make_tuple(&vpx_idct32x32_1024_add_c, &vpx_idct32x32_1024_add_c, 0, + VPX_BITS_8, 32, 6225)), + make_tuple(&vpx_idct32x32_135_add_c, &vpx_idct32x32_135_add_c, 0, + VPX_BITS_8, 16, 6255))); #endif // CONFIG_VP9_HIGHBITDEPTH #if HAVE_NEON && !CONFIG_EMULATE_HARDWARE @@ -352,6 +533,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_sse2, &vpx_idct32x32_1024_add_sse2, 1, VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P( + SSE2, InvTrans32x32Test, + ::testing::Values( + (make_tuple(&vpx_idct32x32_1024_add_c, &vpx_idct32x32_1024_add_sse2, 0, + VPX_BITS_8, 32, 6225)), + make_tuple(&vpx_idct32x32_135_add_c, &vpx_idct32x32_135_add_sse2, 0, + VPX_BITS_8, 16, 6225))); #endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE @@ -377,6 +566,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_avx2, &vpx_idct32x32_1024_add_sse2, 1, VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P( + AVX2, InvTrans32x32Test, + ::testing::Values( + (make_tuple(&vpx_idct32x32_1024_add_c, &vpx_idct32x32_1024_add_avx2, 0, + VPX_BITS_8, 32, 6225)), + make_tuple(&vpx_idct32x32_135_add_c, &vpx_idct32x32_135_add_avx2, 0, + VPX_BITS_8, 16, 6225))); #endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index 69069042cc..71be0f310d 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c @@ -150,6 +150,7 @@ void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, int eob) { + assert(((intptr_t)input) % 32 == 0); /* The calculation can be simplified if there are not many non-zero dct * coefficients. Use eobs to separate different cases. */ if (eob == 1) /* DC only DCT coefficient. 
*/ @@ -164,6 +165,7 @@ void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride, int eob) { + assert(((intptr_t)input) % 32 == 0); if (eob == 1) vpx_idct32x32_1_add(input, dest, stride); else if (eob <= 34) diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c index 7db8ed72d5..92cd91f1e3 100644 --- a/vp9/decoder/vp9_decoder.c +++ b/vp9/decoder/vp9_decoder.c @@ -87,7 +87,7 @@ void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data, row_mt_worker_data->num_sbs = num_sbs; for (plane = 0; plane < 3; ++plane) { CHECK_MEM_ERROR(cm, row_mt_worker_data->dqcoeff[plane], - vpx_memalign(16, dqcoeff_size)); + vpx_memalign(32, dqcoeff_size)); memset(row_mt_worker_data->dqcoeff[plane], 0, dqcoeff_size); CHECK_MEM_ERROR(cm, row_mt_worker_data->eob[plane], vpx_calloc(num_sbs << EOBS_PER_SB_LOG2, diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h index b0ef83c73d..2e198d552e 100644 --- a/vp9/decoder/vp9_decoder.h +++ b/vp9/decoder/vp9_decoder.h @@ -54,7 +54,7 @@ typedef struct TileWorkerData { VP9LfSync *lf_sync; DECLARE_ALIGNED(16, MACROBLOCKD, xd); /* dqcoeff are shared by all the planes. So planes must be decoded serially */ - DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); + DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]); DECLARE_ALIGNED(16, uint16_t, extend_and_predict_buf[80 * 2 * 80 * 2]); struct vpx_internal_error_info error_info; } TileWorkerData; diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 207cda6310..67d3fb0e29 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -252,6 +252,7 @@ DSP_SRCS-yes += inv_txfm.h DSP_SRCS-yes += inv_txfm.c DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.h DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/inv_txfm_avx2.c DSP_SRCS-$(HAVE_SSE2) += x86/inv_wht_sse2.asm DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.h DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index bde0115298..a872d17973 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -641,12 +641,12 @@ () specialize qw/vpx_idct8x8_64_add neon sse2 vsx/; specialize qw/vpx_idct8x8_12_add neon sse2 ssse3/; specialize qw/vpx_idct8x8_1_add neon sse2/; - specialize qw/vpx_idct16x16_256_add neon sse2 vsx/; + specialize qw/vpx_idct16x16_256_add neon sse2 avx2 vsx/; specialize qw/vpx_idct16x16_38_add neon sse2/; specialize qw/vpx_idct16x16_10_add neon sse2/; specialize qw/vpx_idct16x16_1_add neon sse2/; - specialize qw/vpx_idct32x32_1024_add neon sse2 vsx/; - specialize qw/vpx_idct32x32_135_add neon sse2 ssse3/; + specialize qw/vpx_idct32x32_1024_add neon sse2 avx2 vsx/; + specialize qw/vpx_idct32x32_135_add neon sse2 ssse3 avx2/; specialize qw/vpx_idct32x32_34_add neon sse2 ssse3/; specialize qw/vpx_idct32x32_1_add neon sse2/; specialize qw/vpx_iwht4x4_16_add sse2 vsx/; diff --git a/vpx_dsp/x86/inv_txfm_avx2.c b/vpx_dsp/x86/inv_txfm_avx2.c new file mode 100644 index 0000000000..752435d240 --- /dev/null +++ b/vpx_dsp/x86/inv_txfm_avx2.c @@ -0,0 +1,626 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <immintrin.h>  // AVX2 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" + +#define PAIR256_SET_EPI16(a, b) \ + _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) + +static INLINE void idct_load16x16(const tran_low_t *input, __m256i *in, + int stride) { + int i; + // Load 16x16 values + for (i = 0; i < 16; i++) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i in0 = _mm_loadu_si128((const __m128i *)(input + i * stride)); + const __m128i in1 = + _mm_loadu_si128((const __m128i *)((input + i * stride) + 4)); + const __m128i in2 = + _mm_loadu_si128((const __m128i *)((input + i * stride) + 8)); + const __m128i in3 = + _mm_loadu_si128((const __m128i *)((input + i * stride) + 12)); + const __m128i ls = _mm_packs_epi32(in0, in1); + const __m128i rs = _mm_packs_epi32(in2, in3); + in[i] = _mm256_inserti128_si256(_mm256_castsi128_si256(ls), rs, 1); +#else + in[i] = _mm256_load_si256((const __m256i *)(input + i * stride)); +#endif + } +} + +static INLINE __m256i dct_round_shift_avx2(__m256i in) { + const __m256i t = _mm256_add_epi32(in, _mm256_set1_epi32(DCT_CONST_ROUNDING)); + return _mm256_srai_epi32(t, DCT_CONST_BITS); +} + +static INLINE __m256i idct_madd_round_shift_avx2(__m256i *in, __m256i *cospi) { + const __m256i t = _mm256_madd_epi16(*in, *cospi); + return dct_round_shift_avx2(t); +} + +// Calculate the dot product between in0/1 and x and wrap to short. +static INLINE __m256i idct_calc_wraplow_avx2(__m256i *in0, __m256i *in1, + __m256i *x) { + const __m256i t0 = idct_madd_round_shift_avx2(in0, x); + const __m256i t1 = idct_madd_round_shift_avx2(in1, x); + return _mm256_packs_epi32(t0, t1); +} + +// Multiply elements by constants and add them together.
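+// For a constant pair (c0, c1), each pair of 16-bit lanes (in0, in1) is
+// rotated as
+//   out0 = (in0 * c0 - in1 * c1 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS
+//   out1 = (in0 * c1 + in1 * c0 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS
+// using _mm256_madd_epi16 on the interleaved (unpacked) inputs.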
+static INLINE void butterfly16(__m256i in0, __m256i in1, int c0, int c1, + __m256i *out0, __m256i *out1) { + __m256i cst0 = PAIR256_SET_EPI16(c0, -c1); + __m256i cst1 = PAIR256_SET_EPI16(c1, c0); + __m256i lo = _mm256_unpacklo_epi16(in0, in1); + __m256i hi = _mm256_unpackhi_epi16(in0, in1); + *out0 = idct_calc_wraplow_avx2(&lo, &hi, &cst0); + *out1 = idct_calc_wraplow_avx2(&lo, &hi, &cst1); +} + +static INLINE void idct16_16col(__m256i *in, __m256i *out) { + __m256i step1[16], step2[16]; + + // stage 2 + butterfly16(in[1], in[15], cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly16(in[9], in[7], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); + butterfly16(in[5], in[11], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); + butterfly16(in[13], in[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + butterfly16(in[2], in[14], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly16(in[10], in[6], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + step1[8] = _mm256_add_epi16(step2[8], step2[9]); + step1[9] = _mm256_sub_epi16(step2[8], step2[9]); + step1[10] = _mm256_sub_epi16(step2[11], step2[10]); + step1[11] = _mm256_add_epi16(step2[10], step2[11]); + step1[12] = _mm256_add_epi16(step2[12], step2[13]); + step1[13] = _mm256_sub_epi16(step2[12], step2[13]); + step1[14] = _mm256_sub_epi16(step2[15], step2[14]); + step1[15] = _mm256_add_epi16(step2[14], step2[15]); + + // stage 4 + butterfly16(in[0], in[8], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly16(in[4], in[12], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + butterfly16(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly16(step1[10], step1[13], -cospi_8_64, -cospi_24_64, &step2[13], + &step2[10]); + step2[5] = _mm256_sub_epi16(step1[4], step1[5]); + step1[4] = _mm256_add_epi16(step1[4], step1[5]); + step2[6] = _mm256_sub_epi16(step1[7], step1[6]); + step1[7] = _mm256_add_epi16(step1[6], step1[7]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = _mm256_add_epi16(step2[0], step2[3]); + step1[1] = _mm256_add_epi16(step2[1], step2[2]); + step1[2] = _mm256_sub_epi16(step2[1], step2[2]); + step1[3] = _mm256_sub_epi16(step2[0], step2[3]); + butterfly16(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[8] = _mm256_add_epi16(step2[8], step2[11]); + step1[9] = _mm256_add_epi16(step2[9], step2[10]); + step1[10] = _mm256_sub_epi16(step2[9], step2[10]); + step1[11] = _mm256_sub_epi16(step2[8], step2[11]); + step1[12] = _mm256_sub_epi16(step2[15], step2[12]); + step1[13] = _mm256_sub_epi16(step2[14], step2[13]); + step1[14] = _mm256_add_epi16(step2[14], step2[13]); + step1[15] = _mm256_add_epi16(step2[15], step2[12]); + + // stage 6 + step2[0] = _mm256_add_epi16(step1[0], step1[7]); + step2[1] = _mm256_add_epi16(step1[1], step1[6]); + step2[2] = _mm256_add_epi16(step1[2], step1[5]); + step2[3] = _mm256_add_epi16(step1[3], step1[4]); + step2[4] = _mm256_sub_epi16(step1[3], step1[4]); + step2[5] = _mm256_sub_epi16(step1[2], step1[5]); + step2[6] = _mm256_sub_epi16(step1[1], step1[6]); + step2[7] = _mm256_sub_epi16(step1[0], step1[7]); + butterfly16(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10], + &step2[13]); + butterfly16(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11], + &step2[12]); + + // stage 7 + out[0] = _mm256_add_epi16(step2[0], step1[15]); + out[1] = _mm256_add_epi16(step2[1], step1[14]); + out[2] = _mm256_add_epi16(step2[2], 
step2[13]); + out[3] = _mm256_add_epi16(step2[3], step2[12]); + out[4] = _mm256_add_epi16(step2[4], step2[11]); + out[5] = _mm256_add_epi16(step2[5], step2[10]); + out[6] = _mm256_add_epi16(step2[6], step1[9]); + out[7] = _mm256_add_epi16(step2[7], step1[8]); + out[8] = _mm256_sub_epi16(step2[7], step1[8]); + out[9] = _mm256_sub_epi16(step2[6], step1[9]); + out[10] = _mm256_sub_epi16(step2[5], step2[10]); + out[11] = _mm256_sub_epi16(step2[4], step2[11]); + out[12] = _mm256_sub_epi16(step2[3], step2[12]); + out[13] = _mm256_sub_epi16(step2[2], step2[13]); + out[14] = _mm256_sub_epi16(step2[1], step1[14]); + out[15] = _mm256_sub_epi16(step2[0], step1[15]); +} + +static INLINE void recon_and_store16(uint8_t *dest, __m256i in_x) { + const __m256i zero = _mm256_setzero_si256(); + __m256i d0 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dest))); + d0 = _mm256_permute4x64_epi64(d0, 0xd8); + d0 = _mm256_unpacklo_epi8(d0, zero); + d0 = _mm256_add_epi16(in_x, d0); + d0 = _mm256_packus_epi16( + d0, _mm256_castsi128_si256(_mm256_extractf128_si256(d0, 1))); + + _mm_storeu_si128((__m128i *)dest, _mm256_castsi256_si128(d0)); +} + +static INLINE void write_buffer_16x1(uint8_t *dest, __m256i in) { + const __m256i final_rounding = _mm256_set1_epi16(1 << 5); + __m256i out; + out = _mm256_adds_epi16(in, final_rounding); + out = _mm256_srai_epi16(out, 6); + recon_and_store16(dest, out); +} + +static INLINE void store_buffer_16x32(__m256i *in, uint8_t *dst, int stride) { + const __m256i final_rounding = _mm256_set1_epi16(1 << 5); + int j = 0; + while (j < 32) { + in[j] = _mm256_adds_epi16(in[j], final_rounding); + in[j + 1] = _mm256_adds_epi16(in[j + 1], final_rounding); + + in[j] = _mm256_srai_epi16(in[j], 6); + in[j + 1] = _mm256_srai_epi16(in[j + 1], 6); + + recon_and_store16(dst, in[j]); + dst += stride; + recon_and_store16(dst, in[j + 1]); + dst += stride; + j += 2; + } +} + +static INLINE void transpose2_8x8_avx2(__m256i *in, __m256i *out) { + int i; + __m256i t[16], u[16]; + // (1st, 2nd) ==> (lo, hi) + // (0, 1) ==> (0, 1) + // (2, 3) ==> (2, 3) + // (4, 5) ==> (4, 5) + // (6, 7) ==> (6, 7) + for (i = 0; i < 4; i++) { + t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]); + t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 2) ==> (0, 2) + // (1, 3) ==> (1, 3) + // (4, 6) ==> (4, 6) + // (5, 7) ==> (5, 7) + for (i = 0; i < 2; i++) { + u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]); + u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]); + + u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]); + u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 4) ==> (0, 1) + // (1, 5) ==> (4, 5) + // (2, 6) ==> (2, 3) + // (3, 7) ==> (6, 7) + for (i = 0; i < 2; i++) { + out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]); + out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]); + + out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]); + out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]); + } +} + +static INLINE void transpose_16bit_16x16_avx2(__m256i *in, __m256i *out) { + __m256i t[16]; + +#define LOADL(idx) \ + t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \ + t[idx] = _mm256_inserti128_si256( \ + t[idx], _mm_load_si128((__m128i const *)&in[(idx) + 8]), 1); + +#define LOADR(idx) \ + t[8 + (idx)] = \ + _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \ + t[8 + (idx)] = _mm256_inserti128_si256( \ + t[8 + 
(idx)], _mm_load_si128((__m128i const *)&in[(idx) + 8] + 1), 1); + + // load left 8x16 + LOADL(0) + LOADL(1) + LOADL(2) + LOADL(3) + LOADL(4) + LOADL(5) + LOADL(6) + LOADL(7) + + // load right 8x16 + LOADR(0) + LOADR(1) + LOADR(2) + LOADR(3) + LOADR(4) + LOADR(5) + LOADR(6) + LOADR(7) + + // get the top 16x8 result + transpose2_8x8_avx2(t, out); + // get the bottom 16x8 result + transpose2_8x8_avx2(&t[8], &out[8]); +} + +void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride) { + int i; + __m256i in[16]; + + // Load 16x16 values + idct_load16x16(input, in, 16); + + transpose_16bit_16x16_avx2(in, in); + idct16_16col(in, in); + + transpose_16bit_16x16_avx2(in, in); + idct16_16col(in, in); + + for (i = 0; i < 16; ++i) { + write_buffer_16x1(dest + i * stride, in[i]); + } +} + +// Only do addition and subtraction butterfly, size = 16, 32 +static INLINE void add_sub_butterfly_avx2(__m256i *in, __m256i *out, int size) { + int i = 0; + const int num = size >> 1; + const int bound = size - 1; + while (i < num) { + out[i] = _mm256_add_epi16(in[i], in[bound - i]); + out[bound - i] = _mm256_sub_epi16(in[i], in[bound - i]); + i++; + } +} + +// For each 16x32 block __m256i in[32], +// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 +// output pixels: 0-7 in __m256i out[32] +static INLINE void idct32_1024_16x32_quarter_1(__m256i *in, __m256i *out) { + __m256i step1[8], step2[8]; + + // stage 3 + butterfly16(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly16(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + + // stage 4 + butterfly16(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly16(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + step2[4] = _mm256_add_epi16(step1[4], step1[5]); + step2[5] = _mm256_sub_epi16(step1[4], step1[5]); + step2[6] = _mm256_sub_epi16(step1[7], step1[6]); + step2[7] = _mm256_add_epi16(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm256_add_epi16(step2[0], step2[3]); + step1[1] = _mm256_add_epi16(step2[1], step2[2]); + step1[2] = _mm256_sub_epi16(step2[1], step2[2]); + step1[3] = _mm256_sub_epi16(step2[0], step2[3]); + step1[4] = step2[4]; + butterfly16(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm256_add_epi16(step1[0], step1[7]); + out[1] = _mm256_add_epi16(step1[1], step1[6]); + out[2] = _mm256_add_epi16(step1[2], step1[5]); + out[3] = _mm256_add_epi16(step1[3], step1[4]); + out[4] = _mm256_sub_epi16(step1[3], step1[4]); + out[5] = _mm256_sub_epi16(step1[2], step1[5]); + out[6] = _mm256_sub_epi16(step1[1], step1[6]); + out[7] = _mm256_sub_epi16(step1[0], step1[7]); +} + +static INLINE void idct32_16x32_quarter_2_stage_4_to_6(__m256i *step1, + __m256i *out) { + __m256i step2[32]; + + // stage 4 + step2[8] = step1[8]; + step2[15] = step1[15]; + butterfly16(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly16(step1[13], step1[10], -cospi_8_64, cospi_24_64, &step2[10], + &step2[13]); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[8] = _mm256_add_epi16(step2[8], step2[11]); + step1[9] = _mm256_add_epi16(step2[9], step2[10]); + step1[10] = _mm256_sub_epi16(step2[9], step2[10]); + step1[11] = _mm256_sub_epi16(step2[8], step2[11]); + step1[12] = _mm256_sub_epi16(step2[15], step2[12]); + step1[13] = _mm256_sub_epi16(step2[14], step2[13]); + step1[14] = _mm256_add_epi16(step2[14], step2[13]); + step1[15] = _mm256_add_epi16(step2[15], 
step2[12]); + + // stage 6 + out[8] = step1[8]; + out[9] = step1[9]; + butterfly16(step1[13], step1[10], cospi_16_64, cospi_16_64, &out[10], + &out[13]); + butterfly16(step1[12], step1[11], cospi_16_64, cospi_16_64, &out[11], + &out[12]); + out[14] = step1[14]; + out[15] = step1[15]; +} + +// For each 16x32 block __m256i in[32], +// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 +// output pixels: 8-15 in __m256i out[32] +static INLINE void idct32_1024_16x32_quarter_2(__m256i *in, __m256i *out) { + __m256i step1[16], step2[16]; + + // stage 2 + butterfly16(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly16(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); + butterfly16(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); + butterfly16(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + step1[8] = _mm256_add_epi16(step2[8], step2[9]); + step1[9] = _mm256_sub_epi16(step2[8], step2[9]); + step1[10] = _mm256_sub_epi16(step2[11], step2[10]); + step1[11] = _mm256_add_epi16(step2[11], step2[10]); + step1[12] = _mm256_add_epi16(step2[12], step2[13]); + step1[13] = _mm256_sub_epi16(step2[12], step2[13]); + step1[14] = _mm256_sub_epi16(step2[15], step2[14]); + step1[15] = _mm256_add_epi16(step2[15], step2[14]); + + idct32_16x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void idct32_16x32_quarter_3_4_stage_4_to_7(__m256i *step1, + __m256i *out) { + __m256i step2[32]; + + // stage 4 + step2[16] = _mm256_add_epi16(step1[16], step1[19]); + step2[17] = _mm256_add_epi16(step1[17], step1[18]); + step2[18] = _mm256_sub_epi16(step1[17], step1[18]); + step2[19] = _mm256_sub_epi16(step1[16], step1[19]); + step2[20] = _mm256_sub_epi16(step1[23], step1[20]); + step2[21] = _mm256_sub_epi16(step1[22], step1[21]); + step2[22] = _mm256_add_epi16(step1[22], step1[21]); + step2[23] = _mm256_add_epi16(step1[23], step1[20]); + + step2[24] = _mm256_add_epi16(step1[24], step1[27]); + step2[25] = _mm256_add_epi16(step1[25], step1[26]); + step2[26] = _mm256_sub_epi16(step1[25], step1[26]); + step2[27] = _mm256_sub_epi16(step1[24], step1[27]); + step2[28] = _mm256_sub_epi16(step1[31], step1[28]); + step2[29] = _mm256_sub_epi16(step1[30], step1[29]); + step2[30] = _mm256_add_epi16(step1[29], step1[30]); + step2[31] = _mm256_add_epi16(step1[28], step1[31]); + + // stage 5 + step1[16] = step2[16]; + step1[17] = step2[17]; + butterfly16(step2[29], step2[18], cospi_24_64, cospi_8_64, &step1[18], + &step1[29]); + butterfly16(step2[28], step2[19], cospi_24_64, cospi_8_64, &step1[19], + &step1[28]); + butterfly16(step2[27], step2[20], -cospi_8_64, cospi_24_64, &step1[20], + &step1[27]); + butterfly16(step2[26], step2[21], -cospi_8_64, cospi_24_64, &step1[21], + &step1[26]); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + out[16] = _mm256_add_epi16(step1[16], step1[23]); + out[17] = _mm256_add_epi16(step1[17], step1[22]); + out[18] = _mm256_add_epi16(step1[18], step1[21]); + out[19] = _mm256_add_epi16(step1[19], step1[20]); + step2[20] = _mm256_sub_epi16(step1[19], step1[20]); + step2[21] = _mm256_sub_epi16(step1[18], step1[21]); + step2[22] = _mm256_sub_epi16(step1[17], step1[22]); + step2[23] = _mm256_sub_epi16(step1[16], step1[23]); + + step2[24] = _mm256_sub_epi16(step1[31], step1[24]); + step2[25] = _mm256_sub_epi16(step1[30], step1[25]); + step2[26] = _mm256_sub_epi16(step1[29], step1[26]); + step2[27] = 
_mm256_sub_epi16(step1[28], step1[27]); + out[28] = _mm256_add_epi16(step1[27], step1[28]); + out[29] = _mm256_add_epi16(step1[26], step1[29]); + out[30] = _mm256_add_epi16(step1[25], step1[30]); + out[31] = _mm256_add_epi16(step1[24], step1[31]); + + // stage 7 + butterfly16(step2[27], step2[20], cospi_16_64, cospi_16_64, &out[20], + &out[27]); + butterfly16(step2[26], step2[21], cospi_16_64, cospi_16_64, &out[21], + &out[26]); + butterfly16(step2[25], step2[22], cospi_16_64, cospi_16_64, &out[22], + &out[25]); + butterfly16(step2[24], step2[23], cospi_16_64, cospi_16_64, &out[23], + &out[24]); +} + +static INLINE void idct32_1024_16x32_quarter_1_2(__m256i *in, __m256i *out) { + __m256i temp[16]; + + // For each 16x32 block __m256i in[32], + // Input with index, 0, 4, 8, 12, 16, 20, 24, 28 + // output pixels: 0-7 in __m256i out[32] + idct32_1024_16x32_quarter_1(in, temp); + + // Input with index, 2, 6, 10, 14, 18, 22, 26, 30 + // output pixels: 8-15 in __m256i out[32] + idct32_1024_16x32_quarter_2(in, temp); + + // stage 7 + add_sub_butterfly_avx2(temp, out, 16); +} + +// For each 16x32 block __m256i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +// output pixels: 16-23, 24-31 in __m256i out[32] +static INLINE void idct32_1024_16x32_quarter_3_4(__m256i *in, __m256i *out) { + __m256i step1[32], step2[32]; + + // stage 1 + butterfly16(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], &step1[31]); + butterfly16(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], &step1[30]); + butterfly16(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], &step1[29]); + butterfly16(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]); + + butterfly16(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], &step1[27]); + butterfly16(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], &step1[26]); + + butterfly16(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], &step1[25]); + butterfly16(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]); + + // stage 2 + step2[16] = _mm256_add_epi16(step1[16], step1[17]); + step2[17] = _mm256_sub_epi16(step1[16], step1[17]); + step2[18] = _mm256_sub_epi16(step1[19], step1[18]); + step2[19] = _mm256_add_epi16(step1[19], step1[18]); + step2[20] = _mm256_add_epi16(step1[20], step1[21]); + step2[21] = _mm256_sub_epi16(step1[20], step1[21]); + step2[22] = _mm256_sub_epi16(step1[23], step1[22]); + step2[23] = _mm256_add_epi16(step1[23], step1[22]); + + step2[24] = _mm256_add_epi16(step1[24], step1[25]); + step2[25] = _mm256_sub_epi16(step1[24], step1[25]); + step2[26] = _mm256_sub_epi16(step1[27], step1[26]); + step2[27] = _mm256_add_epi16(step1[27], step1[26]); + step2[28] = _mm256_add_epi16(step1[28], step1[29]); + step2[29] = _mm256_sub_epi16(step1[28], step1[29]); + step2[30] = _mm256_sub_epi16(step1[31], step1[30]); + step2[31] = _mm256_add_epi16(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + butterfly16(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly16(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + butterfly16(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly16(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + idct32_16x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static 
INLINE void idct32_1024_16x32(__m256i *in, __m256i *out) { + __m256i temp[32]; + + // For each 16x32 block __m256i in[32], + // Input with index, 0, 4, 8, 12, 16, 20, 24, 28 + // output pixels: 0-7 in __m256i out[32] + // AND + // Input with index, 2, 6, 10, 14, 18, 22, 26, 30 + // output pixels: 8-15 in __m256i out[32] + idct32_1024_16x32_quarter_1_2(in, temp); + + // For each 16x32 block __m256i in[32], + // Input with odd index, + // 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 + // output pixels: 16-23, 24-31 in __m256i out[32] + idct32_1024_16x32_quarter_3_4(in, temp); + + // final stage + add_sub_butterfly_avx2(temp, out, 32); +} + +void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m256i l[32], r[32], out[32], *in; + int i; + + in = l; + + for (i = 0; i < 2; i++) { + idct_load16x16(input, in, 32); + transpose_16bit_16x16_avx2(in, in); + + idct_load16x16(input + 16, in + 16, 32); + transpose_16bit_16x16_avx2(in + 16, in + 16); + idct32_1024_16x32(in, in); + + in = r; + input += 32 << 4; + } + + for (i = 0; i < 32; i += 16) { + transpose_16bit_16x16_avx2(l + i, out); + transpose_16bit_16x16_avx2(r + i, out + 16); + idct32_1024_16x32(out, out); + + store_buffer_16x32(out, dest, stride); + dest += 16; + } +} + +// Case when only upper-left 16x16 has non-zero coeff +void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m256i in[32], io[32], out[32]; + int i; + + for (i = 16; i < 32; i++) { + in[i] = _mm256_setzero_si256(); + } + + // rows + idct_load16x16(input, in, 32); + transpose_16bit_16x16_avx2(in, in); + idct32_1024_16x32(in, io); + + // columns + for (i = 0; i < 32; i += 16) { + transpose_16bit_16x16_avx2(io + i, in); + idct32_1024_16x32(in, out); + + store_buffer_16x32(out, dest, stride); + dest += 16; + } +} From c85b7331a5960bbb4074658641482bb40c902354 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 4 May 2023 19:22:17 -0700 Subject: [PATCH 708/926] macros_msa.h: clear -Wshadow warnings Bug: webm:1793 Change-Id: Ib2e3bd3c52632cdd4410cb2c54d69750e64e5201 --- vpx_dsp/mips/macros_msa.h | 56 +++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/vpx_dsp/mips/macros_msa.h b/vpx_dsp/mips/macros_msa.h index d54ce53684..53462b59f4 100644 --- a/vpx_dsp/mips/macros_msa.h +++ b/vpx_dsp/mips/macros_msa.h @@ -774,16 +774,16 @@ Details : 4 signed word elements of 'in' vector are added together and the resulting integer sum is returned */ -#define HADD_SW_S32(in) \ - ({ \ - v2i64 res0_m, res1_m; \ - int32_t sum_m; \ - \ - res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ - res1_m = __msa_splati_d(res0_m, 1); \ - res0_m = res0_m + res1_m; \ - sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \ - sum_m; \ +#define HADD_SW_S32(in) \ + ({ \ + v2i64 hadd_sw_s32_res0_m, hadd_sw_s32_res1_m; \ + int32_t hadd_sw_s32_sum_m; \ + \ + hadd_sw_s32_res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ + hadd_sw_s32_res1_m = __msa_splati_d(hadd_sw_s32_res0_m, 1); \ + hadd_sw_s32_res0_m = hadd_sw_s32_res0_m + hadd_sw_s32_res1_m; \ + hadd_sw_s32_sum_m = __msa_copy_s_w((v4i32)hadd_sw_s32_res0_m, 0); \ + hadd_sw_s32_sum_m; \ }) /* Description : Horizontal addition of 4 unsigned word elements @@ -793,16 +793,16 @@ Details : 4 unsigned word elements of 'in' vector are added together and the resulting integer sum is returned */ -#define HADD_UW_U32(in) \ - ({ \ - v2u64 res0_m, res1_m; \ - uint32_t sum_m; \ - \ - res0_m = __msa_hadd_u_d((v4u32)in, (v4u32)in); \ - res1_m = 
(v2u64)__msa_splati_d((v2i64)res0_m, 1); \ - res0_m += res1_m; \ - sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \ - sum_m; \ +#define HADD_UW_U32(in) \ + ({ \ + v2u64 hadd_uw_u32_res0_m, hadd_uw_u32_res1_m; \ + uint32_t hadd_uw_u32_sum_m; \ + \ + hadd_uw_u32_res0_m = __msa_hadd_u_d((v4u32)in, (v4u32)in); \ + hadd_uw_u32_res1_m = (v2u64)__msa_splati_d((v2i64)hadd_uw_u32_res0_m, 1); \ + hadd_uw_u32_res0_m += hadd_uw_u32_res1_m; \ + hadd_uw_u32_sum_m = __msa_copy_u_w((v4i32)hadd_uw_u32_res0_m, 0); \ + hadd_uw_u32_sum_m; \ }) /* Description : Horizontal addition of 8 unsigned halfword elements @@ -812,14 +812,14 @@ Details : 8 unsigned halfword elements of 'in' vector are added together and the resulting integer sum is returned */ -#define HADD_UH_U32(in) \ - ({ \ - v4u32 res_m; \ - uint32_t sum_m; \ - \ - res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ - sum_m = HADD_UW_U32(res_m); \ - sum_m; \ +#define HADD_UH_U32(in) \ + ({ \ + v4u32 hadd_uh_u32_res_m; \ + uint32_t hadd_uh_u32_sum_m; \ + \ + hadd_uh_u32_res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ + hadd_uh_u32_sum_m = HADD_UW_U32(hadd_uh_u32_res_m); \ + hadd_uh_u32_sum_m; \ }) /* Description : Horizontal addition of unsigned byte vector elements From 28c5d70650356bf0c60c80d387608047bfdf6e09 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 5 May 2023 09:44:23 -0700 Subject: [PATCH 709/926] vp9_encoder: clear -Wshadow warning with --enable-experimental --enable-rate-ctrl Bug: webm:1793 Change-Id: I9ca664538bcf0c2aca8aea73283bbb0232eb86e9 --- vp9/encoder/vp9_encoder.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 662ec24b83..5241f5b489 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4443,10 +4443,13 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest const int orig_rc_max_frame_bandwidth = rc->max_frame_bandwidth; #if CONFIG_RATE_CTRL - const FRAME_UPDATE_TYPE update_type = - cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index]; - const ENCODE_FRAME_TYPE frame_type = get_encode_frame_type(update_type); - RATE_QSTEP_MODEL *rq_model = &cpi->rq_model[frame_type]; + RATE_QSTEP_MODEL *rq_model; + { + const FRAME_UPDATE_TYPE update_type = + cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index]; + const ENCODE_FRAME_TYPE frame_type = get_encode_frame_type(update_type); + rq_model = &cpi->rq_model[frame_type]; + } init_rq_history(rq_history); #endif // CONFIG_RATE_CTRL From b030d033b81378a8fe8b5f34eaee375acf323c58 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 5 May 2023 11:01:50 -0700 Subject: [PATCH 710/926] vp9,encoder_set_config: set setjmp flag after setjmp() Change-Id: I6858e574d24aaff64f725404706f58e04e43717d --- vp9/vp9_cx_iface.c | 1 + 1 file changed, 1 insertion(+) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index e264ae9bd9..298112b084 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -813,6 +813,7 @@ static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, assert(codec_err != VPX_CODEC_OK); return codec_err; } + ctx->cpi->common.error.setjmp = 1; ctx->cfg = *cfg; set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); From eb7014c80c3219c15ee3e2abd3543588f96abd63 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 5 May 2023 11:03:19 -0700 Subject: [PATCH 711/926] vp9_decodeframe,tile_worker_hook: relocate setjmp=1 after the call to setjmp(); this is more correct and consistent with other code. 
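For reference, the setjmp patches in this series (710 through 712) converge on a single ordering for the jump-buffer setup. A minimal sketch of that pattern, assuming vpx/internal/vpx_codec_internal.h for struct vpx_internal_error_info; guarded_call() and do_work() are hypothetical stand-ins for the real entry point and the guarded encode/decode body:

  static vpx_codec_err_t guarded_call(struct vpx_internal_error_info *error) {
    if (setjmp(error->jmp)) {
      /* Reached via longjmp() from vpx_internal_error(). */
      error->setjmp = 0; /* errors during unwinding must not longjmp() again */
      vpx_clear_system_state();
      return VPX_CODEC_CORRUPT_FRAME;
    }
    /* Raise the flag only after setjmp() has recorded the jump point: if an
     * error fired while the flag was set but before setjmp() ran, longjmp()
     * would use an uninitialized jmp_buf. */
    error->setjmp = 1;
    do_work(); /* hypothetical; may call vpx_internal_error() on failure */
    error->setjmp = 0; /* clear on normal exit, as patch 712 below does */
    return VPX_CODEC_OK;
  }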
Change-Id: I6d9bb8daad6a959bfe4f25484f9d6664b99da19e --- vp9/decoder/vp9_decodeframe.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 6eae41fcfb..73420bacfb 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -2192,8 +2192,6 @@ static int tile_worker_hook(void *arg1, void *arg2) { volatile int mi_row = 0; volatile int n = tile_data->buf_start; - tile_data->error_info.setjmp = 1; - if (setjmp(tile_data->error_info.jmp)) { tile_data->error_info.setjmp = 0; tile_data->xd.corrupted = 1; @@ -2206,6 +2204,7 @@ static int tile_worker_hook(void *arg1, void *arg2) { } return 0; } + tile_data->error_info.setjmp = 1; tile_data->xd.corrupted = 0;
From 851a76ff65905ea8961d8221a358ffb91c1f5f98 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 5 May 2023 11:04:25 -0700 Subject: [PATCH 712/926] vp8_[cd]x_iface: clear setjmp flag on function exit in vp8e_encode, also move the setjmp() call closer to setting the flag. Change-Id: Ie165d4100b84776f9c34eddcf64657bd78cce4f5 --- vp8/vp8_cx_iface.c | 16 ++++++++-------- vp8/vp8_dx_iface.c | 2 ++ 2 files changed, 10 insertions(+), 8 deletions(-)
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index a9d1f8005d..0821eef026 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -911,12 +911,6 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, } } - if (setjmp(ctx->cpi->common.error.jmp)) { - ctx->cpi->common.error.setjmp = 0; - vpx_clear_system_state(); - return VPX_CODEC_CORRUPT_FRAME; - } - /* Initialize the encoder instance on the first frame*/ if (!res && ctx->cpi) { unsigned int lib_flags; @@ -927,6 +921,13 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, unsigned char *cx_data_end; int comp_data_state = 0; + if (setjmp(ctx->cpi->common.error.jmp)) { + ctx->cpi->common.error.setjmp = 0; + vpx_clear_system_state(); + return VPX_CODEC_CORRUPT_FRAME; + } + ctx->cpi->common.error.setjmp = 1; + /* Set up internal flags */ if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) { ((VP8_COMP *)ctx->cpi)->b_calculate_psnr = 1; @@ -962,8 +963,6 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, cx_data_end = ctx->cx_data + cx_data_sz; lib_flags = 0; - ctx->cpi->common.error.setjmp = 1; - while (cx_data_sz >= ctx->cx_data_sz / 2) { comp_data_state = vp8_get_compressed_data( ctx->cpi, &lib_flags, &size, cx_data, cx_data_end, &dst_time_stamp, @@ -1059,6 +1058,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, } } } + ctx->cpi->common.error.setjmp = 0; } return res;
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index 55a77ba7e5..fdc0b35dd4 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -310,6 +310,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0]; VP8_COMMON *const pc = &pbi->common; if (setjmp(pbi->common.error.jmp)) { + pbi->common.error.setjmp = 0; vp8_remove_decoder_instances(fb); vp8_zero(fb->pbi); vpx_clear_system_state(); @@ -494,6 +495,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, /* get ready for the next series of fragments */ ctx->fragments.count = 0; + pbi->common.error.setjmp = 0; } return res;
From 497f246d2925a2866644e542df158b4421ebab0d Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 4 May 2023 19:23:20 -0700 Subject: [PATCH 713/926] sixtap_filter_msa.c: clear -Wshadow warnings Bug: webm:1793 Change-Id: I5f9c09f31b06fecc123c6a9d01f5fbed39142356 ---
vp8/common/mips/msa/sixtap_filter_msa.c | 181 ++++++++++++++---------- 1 file changed, 107 insertions(+), 74 deletions(-) diff --git a/vp8/common/mips/msa/sixtap_filter_msa.c b/vp8/common/mips/msa/sixtap_filter_msa.c index b0affcff01..3a1bb7cd57 100644 --- a/vp8/common/mips/msa/sixtap_filter_msa.c +++ b/vp8/common/mips/msa/sixtap_filter_msa.c @@ -35,101 +35,134 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { #define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_h0, filt_h1, \ filt_h2) \ ({ \ - v16i8 vec0_m, vec1_m, vec2_m; \ - v8i16 hz_out_m; \ + v16i8 _6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m; \ + v8i16 _6tap_out_m; \ \ VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2, \ - vec0_m, vec1_m, vec2_m); \ - hz_out_m = \ - DPADD_SH3_SH(vec0_m, vec1_m, vec2_m, filt_h0, filt_h1, filt_h2); \ + _6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m); \ + _6tap_out_m = DPADD_SH3_SH(_6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m, \ + filt_h0, filt_h1, filt_h2); \ \ - hz_out_m = __msa_srari_h(hz_out_m, VP8_FILTER_SHIFT); \ - hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ + _6tap_out_m = __msa_srari_h(_6tap_out_m, VP8_FILTER_SHIFT); \ + _6tap_out_m = __msa_sat_s_h(_6tap_out_m, 7); \ \ - hz_out_m; \ + _6tap_out_m; \ }) #define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ mask2, filt0, filt1, filt2, out0, out1) \ { \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m; \ + v16i8 _6tap_4wid_vec0_m, _6tap_4wid_vec1_m, _6tap_4wid_vec2_m, \ + _6tap_4wid_vec3_m, _6tap_4wid_vec4_m, _6tap_4wid_vec5_m; \ \ - VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ - DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \ - VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ - DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \ - VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \ - DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1); \ + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, _6tap_4wid_vec0_m, \ + _6tap_4wid_vec1_m); \ + DOTP_SB2_SH(_6tap_4wid_vec0_m, _6tap_4wid_vec1_m, filt0, filt0, out0, \ + out1); \ + VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, _6tap_4wid_vec2_m, \ + _6tap_4wid_vec3_m); \ + DPADD_SB2_SH(_6tap_4wid_vec2_m, _6tap_4wid_vec3_m, filt1, filt1, out0, \ + out1); \ + VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, _6tap_4wid_vec4_m, \ + _6tap_4wid_vec5_m); \ + DPADD_SB2_SH(_6tap_4wid_vec4_m, _6tap_4wid_vec5_m, filt2, filt2, out0, \ + out1); \ } -#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ - mask2, filt0, filt1, filt2, out0, out1, \ - out2, out3) \ - { \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - \ - VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ - DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ - out0, out1, out2, out3); \ - VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \ - VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec4_m, vec5_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec6_m, vec7_m); \ - DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \ - out0, out1, out2, out3); \ - DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt2, filt2, filt2, filt2, \ - out0, out1, out2, out3); \ +#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, filt0, filt1, filt2, out0, out1, \ 
+ out2, out3) \ + { \ + v16i8 _6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m, _6tap_8wid_vec4_m, _6tap_8wid_vec5_m, \ + _6tap_8wid_vec6_m, _6tap_8wid_vec7_m; \ + \ + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, _6tap_8wid_vec0_m, \ + _6tap_8wid_vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m); \ + DOTP_SB4_SH(_6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m, filt0, filt0, filt0, filt0, out0, out1, \ + out2, out3); \ + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, _6tap_8wid_vec0_m, \ + _6tap_8wid_vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, _6tap_8wid_vec4_m, \ + _6tap_8wid_vec5_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, _6tap_8wid_vec6_m, \ + _6tap_8wid_vec7_m); \ + DPADD_SB4_SH(_6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m, filt1, filt1, filt1, filt1, out0, out1, \ + out2, out3); \ + DPADD_SB4_SH(_6tap_8wid_vec4_m, _6tap_8wid_vec5_m, _6tap_8wid_vec6_m, \ + _6tap_8wid_vec7_m, filt2, filt2, filt2, filt2, out0, out1, \ + out2, out3); \ } -#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1) \ - ({ \ - v8i16 tmp0; \ - \ - tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ - tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1); \ - \ - tmp0; \ - }) - -#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1) \ +#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1) \ ({ \ - v16i8 vec0_m, vec1_m; \ - v8i16 hz_out_m; \ - \ - VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0_m, vec1_m); \ - hz_out_m = FILT_4TAP_DPADD_S_H(vec0_m, vec1_m, filt_h0, filt_h1); \ + v8i16 _4tap_dpadd_tmp0; \ \ - hz_out_m = __msa_srari_h(hz_out_m, VP8_FILTER_SHIFT); \ - hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ + _4tap_dpadd_tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ + _4tap_dpadd_tmp0 = \ + __msa_dpadd_s_h(_4tap_dpadd_tmp0, (v16i8)vec1, (v16i8)filt1); \ \ - hz_out_m; \ + _4tap_dpadd_tmp0; \ }) -#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ - filt0, filt1, out0, out1) \ - { \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ - \ - VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ - DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \ - VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ - DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \ +#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1) \ + ({ \ + v16i8 _4tap_vec0_m, _4tap_vec1_m; \ + v8i16 _4tap_out_m; \ + \ + VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, _4tap_vec0_m, \ + _4tap_vec1_m); \ + _4tap_out_m = \ + FILT_4TAP_DPADD_S_H(_4tap_vec0_m, _4tap_vec1_m, filt_h0, filt_h1); \ + \ + _4tap_out_m = __msa_srari_h(_4tap_out_m, VP8_FILTER_SHIFT); \ + _4tap_out_m = __msa_sat_s_h(_4tap_out_m, 7); \ + \ + _4tap_out_m; \ + }) + +#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + filt0, filt1, out0, out1) \ + { \ + v16i8 _4tap_4wid_vec0_m, _4tap_4wid_vec1_m, _4tap_4wid_vec2_m, \ + _4tap_4wid_vec3_m; \ + \ + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, _4tap_4wid_vec0_m, \ + _4tap_4wid_vec1_m); \ + DOTP_SB2_SH(_4tap_4wid_vec0_m, _4tap_4wid_vec1_m, filt0, filt0, out0, \ + out1); \ + VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, _4tap_4wid_vec2_m, \ + _4tap_4wid_vec3_m); \ + DPADD_SB2_SH(_4tap_4wid_vec2_m, _4tap_4wid_vec3_m, filt1, filt1, out0, \ + out1); \ 
} -#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ - filt0, filt1, out0, out1, out2, out3) \ - { \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ - \ - VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ - DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ - out0, out1, out2, out3); \ - VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \ - DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \ - out0, out1, out2, out3); \ +#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + filt0, filt1, out0, out1, out2, out3) \ + { \ + v16i8 _4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m; \ + \ + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, _4tap_8wid_vec0_m, \ + _4tap_8wid_vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m); \ + DOTP_SB4_SH(_4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m, filt0, filt0, filt0, filt0, out0, out1, \ + out2, out3); \ + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, _4tap_8wid_vec0_m, \ + _4tap_8wid_vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m); \ + DPADD_SB4_SH(_4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m, filt1, filt1, filt1, filt1, out0, out1, \ + out2, out3); \ } static void common_hz_6t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride, From 5636f098b38e0c013f802c1b411e2cf54d32b183 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 5 May 2023 13:40:50 -0400 Subject: [PATCH 714/926] Set setjmp flag in VP9 RTC rate control library Change-Id: Ic5ec8dc7d9637091d4137a47d793cf29e76fdc45 --- vp9/ratectrl_rtc.cc | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index 29033d4ba5..d92b095714 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -140,7 +140,16 @@ bool VP9RateControlRTC::UpdateRateControl( cpi_->framerate = rc_cfg.framerate; cpi_->svc.number_spatial_layers = rc_cfg.ss_number_layers; cpi_->svc.number_temporal_layers = rc_cfg.ts_number_layers; + vp9_set_mb_mi(cm, cm->width, cm->height); + + if (setjmp(cpi_->common.error.jmp)) { + cpi_->common.error.setjmp = 0; + vpx_clear_system_state(); + return false; + } + cpi_->common.error.setjmp = 1; + for (int tl = 0; tl < cpi_->svc.number_temporal_layers; ++tl) { oxcf->ts_rate_decimator[tl] = rc_cfg.ts_rate_decimator[tl]; } @@ -168,6 +177,8 @@ bool VP9RateControlRTC::UpdateRateControl( (int)cpi_->oxcf.target_bandwidth); } vp9_check_reset_rc_flag(cpi_); + + cpi_->common.error.setjmp = 0; return true; } From 3d57fb69afbd9e299dc18620f23ec9deb0adfb54 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 5 May 2023 18:56:59 -0700 Subject: [PATCH 715/926] README: update target list Change-Id: If2d5811a55f6bb60eeba7d28b69c78157a17e87f --- README | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README b/README index e360df05f6..87e46f99d4 100644 --- a/README +++ b/README @@ -64,6 +64,8 @@ COMPILING THE APPLICATIONS/LIBRARIES: arm64-android-gcc arm64-darwin-gcc arm64-darwin20-gcc + arm64-darwin21-gcc + arm64-darwin22-gcc arm64-linux-gcc arm64-win64-gcc arm64-win64-vs15 @@ -77,6 +79,8 @@ COMPILING THE APPLICATIONS/LIBRARIES: armv7-win32-vs15 armv7s-darwin-gcc armv8-linux-gcc + loongarch32-linux-gcc 
+ loongarch64-linux-gcc mips32-linux-gcc mips64-linux-gcc ppc64le-linux-gcc @@ -117,6 +121,8 @@ COMPILING THE APPLICATIONS/LIBRARIES: x86_64-darwin18-gcc x86_64-darwin19-gcc x86_64-darwin20-gcc + x86_64-darwin21-gcc + x86_64-darwin22-gcc x86_64-iphonesimulator-gcc x86_64-linux-gcc x86_64-linux-icc From b14d20b47004e58ee4100fc10ccf0cfa8dfe4fe6 Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 6 May 2023 15:48:58 -0700 Subject: [PATCH 716/926] examples.mk,vpxdec: rm libwebm muxer dependency vpxdec only requires the parser. Change-Id: I54ead453d4af400ca5c3412a3211d6d0b1383046 --- examples.mk | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples.mk b/examples.mk index 42886f1e15..9f83230eca 100644 --- a/examples.mk +++ b/examples.mk @@ -81,8 +81,6 @@ ifeq ($(CONFIG_LIBYUV),yes) $(BUILD_PFX)third_party/libyuv/%.cc.o: CXXFLAGS += ${LIBYUV_CXXFLAGS} endif ifeq ($(CONFIG_WEBM_IO),yes) - vpxdec.SRCS += $(LIBWEBM_COMMON_SRCS) - vpxdec.SRCS += $(LIBWEBM_MUXER_SRCS) vpxdec.SRCS += $(LIBWEBM_PARSER_SRCS) vpxdec.SRCS += webmdec.cc webmdec.h endif From 75f9551efbef322b85534e91dd0b26c0c3bf18f4 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 8 May 2023 10:37:54 -0400 Subject: [PATCH 717/926] CHECK_MEM_ERROR to return in vp9_set_roi_map Also change the return type of vp9_set_roi_map to vpx_codec_err_t Change-Id: I60d9ff45f2d3dfc44cd6e2aab2cb1ba389ff15f3 --- vp9/encoder/vp9_encoder.c | 18 ++++++++++-------- vp9/encoder/vp9_encoder.h | 7 ++++--- vp9/vp9_cx_iface.c | 10 +++------- 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index f76eec2b57..b54d0d5f73 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -681,9 +681,10 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) { return (i == VP9_LEVELS) ? LEVEL_UNKNOWN : vp9_level_defs[i].level; } -int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, - unsigned int cols, int delta_q[8], int delta_lf[8], - int skip[8], int ref_frame[8]) { +vpx_codec_err_t vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, + unsigned int rows, unsigned int cols, + int delta_q[8], int delta_lf[8], int skip[8], + int ref_frame[8]) { VP9_COMMON *cm = &cpi->common; vpx_roi_map_t *roi = &cpi->roi; const int range = 63; @@ -694,13 +695,13 @@ int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, // Check number of rows and columns match if (frame_rows != (int)rows || frame_cols != (int)cols) { - return -1; + return VPX_CODEC_INVALID_PARAM; } if (!check_seg_range(delta_q, range) || !check_seg_range(delta_lf, range) || !check_seg_range(ref_frame, ref_frame_range) || !check_seg_range(skip, skip_range)) - return -1; + return VPX_CODEC_INVALID_PARAM; // Also disable segmentation if no deltas are specified. if (!map || @@ -714,14 +715,15 @@ int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, ref_frame[6] == -1 && ref_frame[7] == -1))) { vp9_disable_segmentation(&cm->seg); cpi->roi.enabled = 0; - return 0; + return VPX_CODEC_OK; } if (roi->roi_map) { vpx_free(roi->roi_map); roi->roi_map = NULL; } - CHECK_MEM_ERROR(cm, roi->roi_map, vpx_malloc(rows * cols)); + roi->roi_map = vpx_malloc(rows * cols); + if (!roi->roi_map) return VPX_CODEC_MEM_ERROR; // Copy to ROI structure in the compressor. 
memcpy(roi->roi_map, map, rows * cols); @@ -733,7 +735,7 @@ int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, roi->rows = rows; roi->cols = cols; - return 0; + return VPX_CODEC_OK; } int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows,
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index cca1617830..8effe8741e 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1457,9 +1457,10 @@ static INLINE int log_tile_cols_from_picsize_level(uint32_t width, VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec); -int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, - unsigned int cols, int delta_q[8], int delta_lf[8], - int skip[8], int ref_frame[8]); +vpx_codec_err_t vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, + unsigned int rows, unsigned int cols, + int delta_q[8], int delta_lf[8], int skip[8], + int ref_frame[8]); void vp9_new_framerate(VP9_COMP *cpi, double framerate);
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index f067efdf79..7150f74284 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1633,13 +1633,9 @@ static vpx_codec_err_t ctrl_set_roi_map(vpx_codec_alg_priv_t *ctx, if (data) { vpx_roi_map_t *roi = (vpx_roi_map_t *)data; - - if (!vp9_set_roi_map(ctx->cpi, roi->roi_map, roi->rows, roi->cols, - roi->delta_q, roi->delta_lf, roi->skip, - roi->ref_frame)) { - return VPX_CODEC_OK; - } - return VPX_CODEC_INVALID_PARAM; + return vp9_set_roi_map(ctx->cpi, roi->roi_map, roi->rows, roi->cols, + roi->delta_q, roi->delta_lf, roi->skip, + roi->ref_frame); } return VPX_CODEC_INVALID_PARAM; }
From 1710c9282a11aaaccdf42b7507f570f58e39b7fd Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 4 May 2023 22:03:27 -0400 Subject: [PATCH 718/926] Unify implementation of CHECK_MEM_ERROR There were multiple implementations of CHECK_MEM_ERROR across the library that took different arguments and were used in different places. This CL will unify them and have only one implementation that takes vpx_internal_error_info.
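The unified macro keeps the shape of the per-codec copies removed below, but keys off a vpx_internal_error_info pointer. A sketch of the non-debug form, an approximation of the new definition in vpx/internal/vpx_codec_internal.h (the CONFIG_DEBUG variant additionally reports __FILE__ and __LINE__); assumes assert.h and the vpx error declarations are in scope:

  #define CHECK_MEM_ERROR(error, lval, expr)                 \
    do {                                                     \
      assert((error)->setjmp); /* longjmp() target armed */  \
      (lval) = (expr);                                       \
      if (!(lval))                                           \
        vpx_internal_error(error, VPX_CODEC_MEM_ERROR,       \
                           "Failed to allocate " #lval);     \
    } while (0)

A typical call site after the change, as in the diffs below:

  CHECK_MEM_ERROR(&cm->error, cpi->segmentation_map,
                  vpx_calloc(cm->mi_rows * cm->mi_cols, 1));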
Change-Id: I2c568639473815bc00b1fc2b72be56e5ccba1a35 --- vp8/decoder/onyxd_int.h | 21 ---------- vp8/decoder/threading.c | 26 ++++++------ vp8/encoder/encodeframe.c | 2 +- vp8/encoder/ethreading.c | 10 ++--- vp8/encoder/onyx_if.c | 44 +++++++++++--------- vp8/encoder/onyx_int.h | 20 --------- vp9/common/vp9_common.h | 21 ---------- vp9/common/vp9_thread_common.c | 16 +++---- vp9/decoder/vp9_decodeframe.c | 16 +++---- vp9/decoder/vp9_decoder.c | 19 +++++---- vp9/encoder/vp9_bitstream.c | 4 +- vp9/encoder/vp9_context_tree.c | 15 +++---- vp9/encoder/vp9_denoiser.c | 4 +- vp9/encoder/vp9_encodeframe.c | 6 +-- vp9/encoder/vp9_encoder.c | 67 +++++++++++++++--------------- vp9/encoder/vp9_encoder.h | 10 ++--- vp9/encoder/vp9_ethread.c | 14 +++---- vp9/encoder/vp9_firstpass.c | 2 +- vp9/encoder/vp9_mbgraph.c | 2 +- vp9/encoder/vp9_multi_thread.c | 4 +- vp9/encoder/vp9_speed_features.c | 6 +-- vp9/encoder/vp9_svc_layercontext.c | 6 +-- vp9/encoder/vp9_tpl_model.c | 10 ++--- vpx/internal/vpx_codec_internal.h | 23 ++++++++++ 24 files changed, 171 insertions(+), 197 deletions(-) diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h index a6bedc4faf..56500a8506 100644 --- a/vp8/decoder/onyxd_int.h +++ b/vp8/decoder/onyxd_int.h @@ -135,27 +135,6 @@ int vp8_decode_frame(VP8D_COMP *pbi); int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf); int vp8_remove_decoder_instances(struct frame_buffers *fb); -#if CONFIG_DEBUG -#define CHECK_MEM_ERROR(lval, expr) \ - do { \ - assert(pbi->common.error.setjmp); \ - (lval) = (expr); \ - if (!(lval)) \ - vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval " at %s:%d", __FILE__, \ - __LINE__); \ - } while (0) -#else -#define CHECK_MEM_ERROR(lval, expr) \ - do { \ - assert(pbi->common.error.setjmp); \ - (lval) = (expr); \ - if (!(lval)) \ - vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval); \ - } while (0) -#endif - #ifdef __cplusplus } // extern "C" #endif diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c index 490f62d1b3..9ea6a4f34a 100644 --- a/vp8/decoder/threading.c +++ b/vp8/decoder/threading.c @@ -30,11 +30,13 @@ #include "error_concealment.h" #endif -#define CALLOC_ARRAY(p, n) CHECK_MEM_ERROR((p), vpx_calloc(sizeof(*(p)), (n))) -#define CALLOC_ARRAY_ALIGNED(p, n, algn) \ - do { \ - CHECK_MEM_ERROR((p), vpx_memalign((algn), sizeof(*(p)) * (n))); \ - memset((p), 0, (n) * sizeof(*(p))); \ +#define CALLOC_ARRAY(p, n) \ + CHECK_MEM_ERROR(&pbi->common.error, (p), vpx_calloc(sizeof(*(p)), (n))) +#define CALLOC_ARRAY_ALIGNED(p, n, algn) \ + do { \ + CHECK_MEM_ERROR(&pbi->common.error, (p), \ + vpx_memalign((algn), sizeof(*(p)) * (n))); \ + memset((p), 0, (n) * sizeof(*(p))); \ } while (0) static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, @@ -754,7 +756,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { uv_width = width >> 1; /* Allocate a vpx_atomic_int for each mb row. */ - CHECK_MEM_ERROR(pbi->mt_current_mb_col, + CHECK_MEM_ERROR(&pc->error, pbi->mt_current_mb_col, vpx_malloc(sizeof(*pbi->mt_current_mb_col) * pc->mb_rows)); for (i = 0; i < pc->mb_rows; ++i) vpx_atomic_init(&pbi->mt_current_mb_col[i], 0); @@ -762,7 +764,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { /* Allocate memory for above_row buffers. 
*/ CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) { - CHECK_MEM_ERROR(pbi->mt_yabove_row[i], + CHECK_MEM_ERROR(&pc->error, pbi->mt_yabove_row[i], vpx_memalign(16, sizeof(unsigned char) * (width + (VP8BORDERINPIXELS << 1)))); vp8_zero_array(pbi->mt_yabove_row[i], width + (VP8BORDERINPIXELS << 1)); @@ -770,7 +772,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) { - CHECK_MEM_ERROR(pbi->mt_uabove_row[i], + CHECK_MEM_ERROR(&pc->error, pbi->mt_uabove_row[i], vpx_memalign(16, sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); vp8_zero_array(pbi->mt_uabove_row[i], uv_width + VP8BORDERINPIXELS); @@ -778,7 +780,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) { - CHECK_MEM_ERROR(pbi->mt_vabove_row[i], + CHECK_MEM_ERROR(&pc->error, pbi->mt_vabove_row[i], vpx_memalign(16, sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); vp8_zero_array(pbi->mt_vabove_row[i], uv_width + VP8BORDERINPIXELS); @@ -787,17 +789,17 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { /* Allocate memory for left_col buffers. */ CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_yleft_col[i], + CHECK_MEM_ERROR(&pc->error, pbi->mt_yleft_col[i], vpx_calloc(sizeof(unsigned char) * 16, 1)); CALLOC_ARRAY(pbi->mt_uleft_col, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_uleft_col[i], + CHECK_MEM_ERROR(&pc->error, pbi->mt_uleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1)); CALLOC_ARRAY(pbi->mt_vleft_col, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_vleft_col[i], + CHECK_MEM_ERROR(&pc->error, pbi->mt_vleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1)); } } diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index 620107500a..dc29945729 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -123,7 +123,7 @@ static void calc_av_activity(VP8_COMP *cpi, int64_t activity_sum) { unsigned int tmp; /* Create a list to sort to */ - CHECK_MEM_ERROR(sortlist, + CHECK_MEM_ERROR(&cpi->common.error, sortlist, vpx_calloc(sizeof(unsigned int), cpi->common.MBs)); /* Copy map to sort list */ diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c index cb35f4f491..2583cb0ac3 100644 --- a/vp8/encoder/ethreading.c +++ b/vp8/encoder/ethreading.c @@ -510,16 +510,16 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { if (th_count == 0) return 0; - CHECK_MEM_ERROR(cpi->h_encoding_thread, + CHECK_MEM_ERROR(&cpi->common.error, cpi->h_encoding_thread, vpx_malloc(sizeof(pthread_t) * th_count)); - CHECK_MEM_ERROR(cpi->h_event_start_encoding, + CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_start_encoding, vpx_malloc(sizeof(sem_t) * th_count)); - CHECK_MEM_ERROR(cpi->h_event_end_encoding, + CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_end_encoding, vpx_malloc(sizeof(sem_t) * th_count)); - CHECK_MEM_ERROR(cpi->mb_row_ei, + CHECK_MEM_ERROR(&cpi->common.error, cpi->mb_row_ei, vpx_memalign(32, sizeof(MB_ROW_COMP) * th_count)); memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count); - CHECK_MEM_ERROR(cpi->en_thread_data, + CHECK_MEM_ERROR(&cpi->common.error, cpi->en_thread_data, vpx_malloc(sizeof(ENCODETHREAD_DATA) * th_count)); vpx_atomic_store_release(&cpi->b_multi_threaded, 1); diff --git 
a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index a780048073..8941329419 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -1169,7 +1169,8 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { #else unsigned int tokens = cm->mb_rows * cm->mb_cols * 24 * 16; #endif - CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok))); + CHECK_MEM_ERROR(&cpi->common.error, cpi->tok, + vpx_calloc(tokens, sizeof(*cpi->tok))); } /* Data used for real time vc mode to see if gf needs refreshing */ @@ -1178,37 +1179,39 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { /* Structures used to monitor GF usage */ vpx_free(cpi->gf_active_flags); CHECK_MEM_ERROR( - cpi->gf_active_flags, + &cpi->common.error, cpi->gf_active_flags, vpx_calloc(sizeof(*cpi->gf_active_flags), cm->mb_rows * cm->mb_cols)); cpi->gf_active_count = cm->mb_rows * cm->mb_cols; vpx_free(cpi->mb_activity_map); CHECK_MEM_ERROR( - cpi->mb_activity_map, + &cpi->common.error, cpi->mb_activity_map, vpx_calloc(sizeof(*cpi->mb_activity_map), cm->mb_rows * cm->mb_cols)); /* allocate memory for storing last frame's MVs for MV prediction. */ vpx_free(cpi->lfmv); - CHECK_MEM_ERROR(cpi->lfmv, vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2), - sizeof(*cpi->lfmv))); + CHECK_MEM_ERROR( + &cpi->common.error, cpi->lfmv, + vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2), sizeof(*cpi->lfmv))); vpx_free(cpi->lf_ref_frame_sign_bias); - CHECK_MEM_ERROR(cpi->lf_ref_frame_sign_bias, + CHECK_MEM_ERROR(&cpi->common.error, cpi->lf_ref_frame_sign_bias, vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2), sizeof(*cpi->lf_ref_frame_sign_bias))); vpx_free(cpi->lf_ref_frame); - CHECK_MEM_ERROR(cpi->lf_ref_frame, + CHECK_MEM_ERROR(&cpi->common.error, cpi->lf_ref_frame, vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2), sizeof(*cpi->lf_ref_frame))); /* Create the encoder segmentation map and set all entries to 0 */ vpx_free(cpi->segmentation_map); CHECK_MEM_ERROR( - cpi->segmentation_map, + &cpi->common.error, cpi->segmentation_map, vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->segmentation_map))); cpi->cyclic_refresh_mode_index = 0; vpx_free(cpi->active_map); - CHECK_MEM_ERROR(cpi->active_map, vpx_calloc(cm->mb_rows * cm->mb_cols, - sizeof(*cpi->active_map))); + CHECK_MEM_ERROR( + &cpi->common.error, cpi->active_map, + vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->active_map))); memset(cpi->active_map, 1, (cm->mb_rows * cm->mb_cols)); #if CONFIG_MULTITHREAD @@ -1226,7 +1229,7 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { int i; vpx_free(cpi->mt_current_mb_col); - CHECK_MEM_ERROR(cpi->mt_current_mb_col, + CHECK_MEM_ERROR(&cpi->common.error, cpi->mt_current_mb_col, vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows)); for (i = 0; i < cm->mb_rows; ++i) vpx_atomic_init(&cpi->mt_current_mb_col[i], 0); @@ -1235,7 +1238,8 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { #endif vpx_free(cpi->tplist); - CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cm->mb_rows)); + CHECK_MEM_ERROR(&cpi->common.error, cpi->tplist, + vpx_malloc(sizeof(TOKENLIST) * cm->mb_rows)); #if CONFIG_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0) { @@ -1773,8 +1777,9 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->common.error.setjmp = 1; - CHECK_MEM_ERROR(cpi->mb.ss, vpx_calloc(sizeof(search_site), - (MAX_MVSEARCH_STEPS * 8) + 1)); + CHECK_MEM_ERROR( + &cpi->common.error, cpi->mb.ss, + vpx_calloc(sizeof(search_site), (MAX_MVSEARCH_STEPS * 8) + 1)); vp8_create_common(&cpi->common); @@ -1879,18 +1884,19 @@ struct VP8_COMP 
*vp8_create_compressor(VP8_CONFIG *oxcf) { } if (cpi->cyclic_refresh_mode_enabled) { - CHECK_MEM_ERROR(cpi->cyclic_refresh_map, + CHECK_MEM_ERROR(&cpi->common.error, cpi->cyclic_refresh_map, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1)); } else { cpi->cyclic_refresh_map = (signed char *)NULL; } - CHECK_MEM_ERROR(cpi->skin_map, vpx_calloc(cm->mb_rows * cm->mb_cols, - sizeof(cpi->skin_map[0]))); + CHECK_MEM_ERROR( + &cpi->common.error, cpi->skin_map, + vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(cpi->skin_map[0]))); - CHECK_MEM_ERROR(cpi->consec_zero_last, + CHECK_MEM_ERROR(&cpi->common.error, cpi->consec_zero_last, vpx_calloc(cm->mb_rows * cm->mb_cols, 1)); - CHECK_MEM_ERROR(cpi->consec_zero_last_mvbias, + CHECK_MEM_ERROR(&cpi->common.error, cpi->consec_zero_last_mvbias, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1)); /*Initialize the feed-forward activity masking.*/ diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index 46a17913ad..bde5c2f69b 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -731,26 +731,6 @@ void vp8_tokenize_mb(VP8_COMP *, MACROBLOCK *, TOKENEXTRA **); void vp8_set_speed_features(VP8_COMP *cpi); -#if CONFIG_DEBUG -#define CHECK_MEM_ERROR(lval, expr) \ - do { \ - assert(cpi->common.error.setjmp); \ - (lval) = (expr); \ - if (!(lval)) \ - vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval " at %s:%d", __FILE__, \ - __LINE__); \ - } while (0) -#else -#define CHECK_MEM_ERROR(lval, expr) \ - do { \ - assert(cpi->common.error.setjmp); \ - (lval) = (expr); \ - if (!(lval)) \ - vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval); \ - } while (0) -#endif #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h index 8d2bed38e5..d63bad93d1 100644 --- a/vp9/common/vp9_common.h +++ b/vp9/common/vp9_common.h @@ -46,27 +46,6 @@ static INLINE int get_unsigned_bits(unsigned int num_values) { return num_values > 0 ? 
get_msb(num_values) + 1 : 0; } -#if CONFIG_DEBUG -#define CHECK_MEM_ERROR(cm, lval, expr) \ - do { \ - assert(&(cm)->error.setjmp); \ - (lval) = (expr); \ - if (!(lval)) \ - vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval " at %s:%d", __FILE__, \ - __LINE__); \ - } while (0) -#else -#define CHECK_MEM_ERROR(cm, lval, expr) \ - do { \ - assert(&(cm)->error.setjmp); \ - (lval) = (expr); \ - if (!(lval)) \ - vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval); \ - } while (0) -#endif - #define VP9_SYNC_CODE_0 0x49 #define VP9_SYNC_CODE_1 0x83 #define VP9_SYNC_CODE_2 0x42 diff --git a/vp9/common/vp9_thread_common.c b/vp9/common/vp9_thread_common.c index ad4478179e..1c6ecc0fe6 100644 --- a/vp9/common/vp9_thread_common.c +++ b/vp9/common/vp9_thread_common.c @@ -283,7 +283,7 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, { int i; - CHECK_MEM_ERROR(cm, lf_sync->mutex, + CHECK_MEM_ERROR(&cm->error, lf_sync->mutex, vpx_malloc(sizeof(*lf_sync->mutex) * rows)); if (lf_sync->mutex) { for (i = 0; i < rows; ++i) { @@ -291,7 +291,7 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, } } - CHECK_MEM_ERROR(cm, lf_sync->cond, + CHECK_MEM_ERROR(&cm->error, lf_sync->cond, vpx_malloc(sizeof(*lf_sync->cond) * rows)); if (lf_sync->cond) { for (i = 0; i < rows; ++i) { @@ -299,11 +299,11 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, } } - CHECK_MEM_ERROR(cm, lf_sync->lf_mutex, + CHECK_MEM_ERROR(&cm->error, lf_sync->lf_mutex, vpx_malloc(sizeof(*lf_sync->lf_mutex))); pthread_mutex_init(lf_sync->lf_mutex, NULL); - CHECK_MEM_ERROR(cm, lf_sync->recon_done_mutex, + CHECK_MEM_ERROR(&cm->error, lf_sync->recon_done_mutex, vpx_malloc(sizeof(*lf_sync->recon_done_mutex) * rows)); if (lf_sync->recon_done_mutex) { for (i = 0; i < rows; ++i) { @@ -311,7 +311,7 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, } } - CHECK_MEM_ERROR(cm, lf_sync->recon_done_cond, + CHECK_MEM_ERROR(&cm->error, lf_sync->recon_done_cond, vpx_malloc(sizeof(*lf_sync->recon_done_cond) * rows)); if (lf_sync->recon_done_cond) { for (i = 0; i < rows; ++i) { @@ -321,15 +321,15 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, } #endif // CONFIG_MULTITHREAD - CHECK_MEM_ERROR(cm, lf_sync->lfdata, + CHECK_MEM_ERROR(&cm->error, lf_sync->lfdata, vpx_malloc(num_workers * sizeof(*lf_sync->lfdata))); lf_sync->num_workers = num_workers; lf_sync->num_active_workers = lf_sync->num_workers; - CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col, + CHECK_MEM_ERROR(&cm->error, lf_sync->cur_sb_col, vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows)); - CHECK_MEM_ERROR(cm, lf_sync->num_tiles_done, + CHECK_MEM_ERROR(&cm->error, lf_sync->num_tiles_done, vpx_malloc(sizeof(*lf_sync->num_tiles_done) * mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2)); diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 6eae41fcfb..10a7f9b124 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -1469,7 +1469,7 @@ static void resize_mv_buffer(VP9_COMMON *cm) { vpx_free(cm->cur_frame->mvs); cm->cur_frame->mi_rows = cm->mi_rows; cm->cur_frame->mi_cols = cm->mi_cols; - CHECK_MEM_ERROR(cm, cm->cur_frame->mvs, + CHECK_MEM_ERROR(&cm->error, cm->cur_frame->mvs, (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cm->cur_frame->mvs))); } @@ -1776,7 +1776,8 @@ static void vp9_jobq_alloc(VP9Decoder *pbi) { if (jobq_size > 
row_mt_worker_data->jobq_size) { vpx_free(row_mt_worker_data->jobq_buf); - CHECK_MEM_ERROR(cm, row_mt_worker_data->jobq_buf, vpx_calloc(1, jobq_size)); + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->jobq_buf, + vpx_calloc(1, jobq_size)); vp9_jobq_init(&row_mt_worker_data->jobq, row_mt_worker_data->jobq_buf, jobq_size); row_mt_worker_data->jobq_size = jobq_size; @@ -1923,7 +1924,7 @@ static int row_decode_worker_hook(void *arg1, void *arg2) { const int is_last_row = sb_rows - 1 == cur_sb_row; int mi_col_start, mi_col_end; if (!tile_data_recon) - CHECK_MEM_ERROR(cm, tile_data_recon, + CHECK_MEM_ERROR(&cm->error, tile_data_recon, vpx_memalign(32, sizeof(TileWorkerData))); tile_data_recon->xd = pbi->mb; @@ -2025,7 +2026,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, if (cm->lf.filter_level && !cm->skip_loop_filter && pbi->lf_worker.data1 == NULL) { - CHECK_MEM_ERROR(cm, pbi->lf_worker.data1, + CHECK_MEM_ERROR(&cm->error, pbi->lf_worker.data1, vpx_memalign(32, sizeof(LFWorkerData))); pbi->lf_worker.hook = vp9_loop_filter_worker; if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) { @@ -2285,7 +2286,7 @@ static INLINE void init_mt(VP9Decoder *pbi) { if (pbi->num_tile_workers == 0) { const int num_threads = pbi->max_threads; - CHECK_MEM_ERROR(cm, pbi->tile_workers, + CHECK_MEM_ERROR(&cm->error, pbi->tile_workers, vpx_malloc(num_threads * sizeof(*pbi->tile_workers))); for (n = 0; n < num_threads; ++n) { VPxWorker *const worker = &pbi->tile_workers[n]; @@ -2824,7 +2825,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, const int num_jobs = sb_rows << cm->log2_tile_cols; if (pbi->row_mt_worker_data == NULL) { - CHECK_MEM_ERROR(cm, pbi->row_mt_worker_data, + CHECK_MEM_ERROR(&cm->error, pbi->row_mt_worker_data, vpx_calloc(1, sizeof(*pbi->row_mt_worker_data))); #if CONFIG_MULTITHREAD pthread_mutex_init(&pbi->row_mt_worker_data->recon_done_mutex, NULL); @@ -3006,7 +3007,8 @@ void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data, // platforms without DECLARE_ALIGNED(). 
assert((sizeof(*pbi->tile_worker_data) % 16) == 0); vpx_free(pbi->tile_worker_data); - CHECK_MEM_ERROR(cm, pbi->tile_worker_data, vpx_memalign(32, twd_size)); + CHECK_MEM_ERROR(&cm->error, pbi->tile_worker_data, + vpx_memalign(32, twd_size)); pbi->total_tiles = tile_rows * tile_cols; } diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c index 92cd91f1e3..5a7e9f9ab3 100644 --- a/vp9/decoder/vp9_decoder.c +++ b/vp9/decoder/vp9_decoder.c @@ -66,7 +66,7 @@ void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data, { int i; CHECK_MEM_ERROR( - cm, row_mt_worker_data->recon_sync_mutex, + &cm->error, row_mt_worker_data->recon_sync_mutex, vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_mutex) * num_jobs)); if (row_mt_worker_data->recon_sync_mutex) { for (i = 0; i < num_jobs; ++i) { @@ -75,7 +75,7 @@ void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data, } CHECK_MEM_ERROR( - cm, row_mt_worker_data->recon_sync_cond, + &cm->error, row_mt_worker_data->recon_sync_cond, vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_cond) * num_jobs)); if (row_mt_worker_data->recon_sync_cond) { for (i = 0; i < num_jobs; ++i) { @@ -86,24 +86,24 @@ void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data, #endif row_mt_worker_data->num_sbs = num_sbs; for (plane = 0; plane < 3; ++plane) { - CHECK_MEM_ERROR(cm, row_mt_worker_data->dqcoeff[plane], + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->dqcoeff[plane], vpx_memalign(32, dqcoeff_size)); memset(row_mt_worker_data->dqcoeff[plane], 0, dqcoeff_size); - CHECK_MEM_ERROR(cm, row_mt_worker_data->eob[plane], + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->eob[plane], vpx_calloc(num_sbs << EOBS_PER_SB_LOG2, sizeof(*row_mt_worker_data->eob[plane]))); } - CHECK_MEM_ERROR(cm, row_mt_worker_data->partition, + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->partition, vpx_calloc(num_sbs * PARTITIONS_PER_SB, sizeof(*row_mt_worker_data->partition))); - CHECK_MEM_ERROR(cm, row_mt_worker_data->recon_map, + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->recon_map, vpx_calloc(num_sbs, sizeof(*row_mt_worker_data->recon_map))); // allocate memory for thread_data if (row_mt_worker_data->thread_data == NULL) { const size_t thread_size = max_threads * sizeof(*row_mt_worker_data->thread_data); - CHECK_MEM_ERROR(cm, row_mt_worker_data->thread_data, + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->thread_data, vpx_memalign(32, thread_size)); } } @@ -181,9 +181,10 @@ VP9Decoder *vp9_decoder_create(BufferPool *const pool) { cm->error.setjmp = 1; - CHECK_MEM_ERROR(cm, cm->fc, (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc))); + CHECK_MEM_ERROR(&cm->error, cm->fc, + (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc))); CHECK_MEM_ERROR( - cm, cm->frame_contexts, + &cm->error, cm->frame_contexts, (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts))); pbi->need_resync = 1; diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 17c123af6f..ca56d14aa1 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -967,13 +967,13 @@ static void encode_tiles_buffer_alloc(VP9_COMP *const cpi) { int i; const size_t worker_data_size = cpi->num_workers * sizeof(*cpi->vp9_bitstream_worker_data); - CHECK_MEM_ERROR(cm, cpi->vp9_bitstream_worker_data, + CHECK_MEM_ERROR(&cm->error, cpi->vp9_bitstream_worker_data, vpx_memalign(16, worker_data_size)); memset(cpi->vp9_bitstream_worker_data, 0, worker_data_size); for (i = 1; i < cpi->num_workers; ++i) { cpi->vp9_bitstream_worker_data[i].dest_size = cpi->oxcf.width * 
cpi->oxcf.height; - CHECK_MEM_ERROR(cm, cpi->vp9_bitstream_worker_data[i].dest, + CHECK_MEM_ERROR(&cm->error, cpi->vp9_bitstream_worker_data[i].dest, vpx_malloc(cpi->vp9_bitstream_worker_data[i].dest_size)); } } diff --git a/vp9/encoder/vp9_context_tree.c b/vp9/encoder/vp9_context_tree.c index b74b9027ca..42073f756c 100644 --- a/vp9/encoder/vp9_context_tree.c +++ b/vp9/encoder/vp9_context_tree.c @@ -25,16 +25,17 @@ static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk, int i, k; ctx->num_4x4_blk = num_blk; - CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, vpx_calloc(num_blk, sizeof(uint8_t))); + CHECK_MEM_ERROR(&cm->error, ctx->zcoeff_blk, + vpx_calloc(num_blk, sizeof(uint8_t))); for (i = 0; i < MAX_MB_PLANE; ++i) { for (k = 0; k < 3; ++k) { - CHECK_MEM_ERROR(cm, ctx->coeff[i][k], + CHECK_MEM_ERROR(&cm->error, ctx->coeff[i][k], vpx_memalign(32, num_pix * sizeof(*ctx->coeff[i][k]))); - CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k], + CHECK_MEM_ERROR(&cm->error, ctx->qcoeff[i][k], vpx_memalign(32, num_pix * sizeof(*ctx->qcoeff[i][k]))); - CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k], + CHECK_MEM_ERROR(&cm->error, ctx->dqcoeff[i][k], vpx_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i][k]))); - CHECK_MEM_ERROR(cm, ctx->eobs[i][k], + CHECK_MEM_ERROR(&cm->error, ctx->eobs[i][k], vpx_memalign(32, num_blk * sizeof(*ctx->eobs[i][k]))); ctx->coeff_pbuf[i][k] = ctx->coeff[i][k]; ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k]; @@ -100,10 +101,10 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) { int nodes; vpx_free(td->leaf_tree); - CHECK_MEM_ERROR(cm, td->leaf_tree, + CHECK_MEM_ERROR(&cm->error, td->leaf_tree, vpx_calloc(leaf_nodes, sizeof(*td->leaf_tree))); vpx_free(td->pc_tree); - CHECK_MEM_ERROR(cm, td->pc_tree, + CHECK_MEM_ERROR(&cm->error, td->pc_tree, vpx_calloc(tree_nodes, sizeof(*td->pc_tree))); this_pc = &td->pc_tree[0]; diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c index 77d72396ae..baea8ebb3c 100644 --- a/vp9/encoder/vp9_denoiser.c +++ b/vp9/encoder/vp9_denoiser.c @@ -634,11 +634,11 @@ int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser, denoiser->num_ref_frames = use_svc ? SVC_REF_FRAMES : NONSVC_REF_FRAMES; init_num_ref_frames = use_svc ? MAX_REF_FRAMES : NONSVC_REF_FRAMES; denoiser->num_layers = num_layers; - CHECK_MEM_ERROR(cm, denoiser->running_avg_y, + CHECK_MEM_ERROR(&cm->error, denoiser->running_avg_y, vpx_calloc(denoiser->num_ref_frames * num_layers, sizeof(denoiser->running_avg_y[0]))); CHECK_MEM_ERROR( - cm, denoiser->mc_running_avg_y, + &cm->error, denoiser->mc_running_avg_y, vpx_calloc(num_layers, sizeof(denoiser->mc_running_avg_y[0]))); for (layer = 0; layer < num_layers; ++layer) { diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 3a042399cb..a979ae1c93 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -1545,7 +1545,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } if (low_res && threshold_4x4avg < INT64_MAX) - CHECK_MEM_ERROR(cm, vt2, vpx_calloc(16, sizeof(*vt2))); + CHECK_MEM_ERROR(&cm->error, vt2, vpx_calloc(16, sizeof(*vt2))); // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances // for splits. 
for (i = 0; i < 4; i++) { @@ -5783,7 +5783,7 @@ static void source_var_based_partition_search_method(VP9_COMP *cpi) { if (cm->last_width != cm->width || cm->last_height != cm->height) { if (cpi->source_diff_var) vpx_free(cpi->source_diff_var); - CHECK_MEM_ERROR(cm, cpi->source_diff_var, + CHECK_MEM_ERROR(&cm->error, cpi->source_diff_var, vpx_calloc(cm->MBs, sizeof(cpi->source_diff_var))); } @@ -5823,7 +5823,7 @@ void vp9_init_tile_data(VP9_COMP *cpi) { if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) { if (cpi->tile_data != NULL) vpx_free(cpi->tile_data); CHECK_MEM_ERROR( - cm, cpi->tile_data, + &cm->error, cpi->tile_data, vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data))); cpi->allocated_tiles = tile_cols * tile_rows; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index b54d0d5f73..db21509ce9 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1376,7 +1376,7 @@ static void alloc_context_buffers_ext(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; int mi_size = cm->mi_cols * cm->mi_rows; - CHECK_MEM_ERROR(cm, cpi->mbmi_ext_base, + CHECK_MEM_ERROR(&cm->error, cpi->mbmi_ext_base, vpx_calloc(mi_size, sizeof(*cpi->mbmi_ext_base))); } @@ -1395,14 +1395,14 @@ static void alloc_compressor_data(VP9_COMP *cpi) { { unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols); - CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0], + CHECK_MEM_ERROR(&cm->error, cpi->tile_tok[0][0], vpx_calloc(tokens, sizeof(*cpi->tile_tok[0][0]))); } sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; vpx_free(cpi->tplist[0][0]); CHECK_MEM_ERROR( - cm, cpi->tplist[0][0], + &cm->error, cpi->tplist[0][0], vpx_calloc(sb_rows * 4 * (1 << 6), sizeof(*cpi->tplist[0][0]))); vp9_setup_pc_tree(&cpi->common, &cpi->td); @@ -1998,48 +1998,48 @@ static void realloc_segmentation_maps(VP9_COMP *cpi) { // Create the encoder segmentation map and set all entries to 0 vpx_free(cpi->segmentation_map); - CHECK_MEM_ERROR(cm, cpi->segmentation_map, + CHECK_MEM_ERROR(&cm->error, cpi->segmentation_map, vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); // Create a map used for cyclic background refresh. if (cpi->cyclic_refresh) vp9_cyclic_refresh_free(cpi->cyclic_refresh); - CHECK_MEM_ERROR(cm, cpi->cyclic_refresh, + CHECK_MEM_ERROR(&cm->error, cpi->cyclic_refresh, vp9_cyclic_refresh_alloc(cm->mi_rows, cm->mi_cols)); // Create a map used to mark inactive areas. 
vpx_free(cpi->active_map.map); - CHECK_MEM_ERROR(cm, cpi->active_map.map, + CHECK_MEM_ERROR(&cm->error, cpi->active_map.map, vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); // And a place holder structure is the coding context // for use if we want to save and restore it vpx_free(cpi->coding_context.last_frame_seg_map_copy); - CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy, + CHECK_MEM_ERROR(&cm->error, cpi->coding_context.last_frame_seg_map_copy, vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); } static void alloc_copy_partition_data(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; if (cpi->prev_partition == NULL) { - CHECK_MEM_ERROR(cm, cpi->prev_partition, + CHECK_MEM_ERROR(&cm->error, cpi->prev_partition, (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows, sizeof(*cpi->prev_partition))); } if (cpi->prev_segment_id == NULL) { CHECK_MEM_ERROR( - cm, cpi->prev_segment_id, + &cm->error, cpi->prev_segment_id, (int8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(*cpi->prev_segment_id))); } if (cpi->prev_variance_low == NULL) { - CHECK_MEM_ERROR(cm, cpi->prev_variance_low, + CHECK_MEM_ERROR(&cm->error, cpi->prev_variance_low, (uint8_t *)vpx_calloc( (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1) * 25, sizeof(*cpi->prev_variance_low))); } if (cpi->copied_frame_cnt == NULL) { CHECK_MEM_ERROR( - cm, cpi->copied_frame_cnt, + &cm->error, cpi->copied_frame_cnt, (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(*cpi->copied_frame_cnt))); } @@ -2372,9 +2372,10 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, cm->free_mi = vp9_enc_free_mi; cm->setup_mi = vp9_enc_setup_mi; - CHECK_MEM_ERROR(cm, cm->fc, (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc))); + CHECK_MEM_ERROR(&cm->error, cm->fc, + (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc))); CHECK_MEM_ERROR( - cm, cm->frame_contexts, + &cm->error, cm->frame_contexts, (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts))); cpi->compute_frame_low_motion_onepass = 1; @@ -2401,38 +2402,38 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, realloc_segmentation_maps(cpi); CHECK_MEM_ERROR( - cm, cpi->skin_map, + &cm->error, cpi->skin_map, vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0]))); #if !CONFIG_REALTIME_ONLY - CHECK_MEM_ERROR(cm, cpi->alt_ref_aq, vp9_alt_ref_aq_create()); + CHECK_MEM_ERROR(&cm->error, cpi->alt_ref_aq, vp9_alt_ref_aq_create()); #endif CHECK_MEM_ERROR( - cm, cpi->consec_zero_mv, + &cm->error, cpi->consec_zero_mv, vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->consec_zero_mv))); - CHECK_MEM_ERROR(cm, cpi->nmvcosts[0], + CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts[0], vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0]))); - CHECK_MEM_ERROR(cm, cpi->nmvcosts[1], + CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts[1], vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[1]))); - CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[0], + CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts_hp[0], vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[0]))); - CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[1], + CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts_hp[1], vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[1]))); - CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[0], + CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts[0], vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[0]))); - CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[1], + CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts[1], vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[1]))); - CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[0], + CHECK_MEM_ERROR(&cm->error, 
cpi->nmvsadcosts_hp[0], vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[0]))); - CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[1], + CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts_hp[1], vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[1]))); for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0])); i++) { CHECK_MEM_ERROR( - cm, cpi->mbgraph_stats[i].mb_stats, + &cm->error, cpi->mbgraph_stats[i].mb_stats, vpx_calloc(cm->MBs * sizeof(*cpi->mbgraph_stats[i].mb_stats), 1)); } @@ -2476,7 +2477,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, } if (cpi->b_calculate_consistency) { - CHECK_MEM_ERROR(cm, cpi->ssim_vars, + CHECK_MEM_ERROR(&cm->error, cpi->ssim_vars, vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, sizeof(*cpi->ssim_vars) * 4)); cpi->worst_consistency = 100.0; @@ -2561,7 +2562,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, vpx_free(lc->rc_twopass_stats_in.buf); lc->rc_twopass_stats_in.sz = packets_in_layer * packet_sz; - CHECK_MEM_ERROR(cm, lc->rc_twopass_stats_in.buf, + CHECK_MEM_ERROR(&cm->error, lc->rc_twopass_stats_in.buf, vpx_malloc(lc->rc_twopass_stats_in.sz)); lc->twopass.stats_in_start = lc->rc_twopass_stats_in.buf; lc->twopass.stats_in = lc->twopass.stats_in_start; @@ -2616,7 +2617,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, const int h = num_8x8_blocks_high_lookup[bsize]; const int num_cols = (cm->mi_cols + w - 1) / w; const int num_rows = (cm->mi_rows + h - 1) / h; - CHECK_MEM_ERROR(cm, cpi->mi_ssim_rdmult_scaling_factors, + CHECK_MEM_ERROR(&cm->error, cpi->mi_ssim_rdmult_scaling_factors, vpx_calloc(num_rows * num_cols, sizeof(*cpi->mi_ssim_rdmult_scaling_factors))); } @@ -2631,7 +2632,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, } // Allocate memory to store variances for a frame. 
- CHECK_MEM_ERROR(cm, cpi->source_diff_var, + CHECK_MEM_ERROR(&cm->error, cpi->source_diff_var, vpx_calloc(cm->MBs, sizeof(cpi->source_diff_var))); cpi->source_var_thresh = 0; cpi->frames_till_next_var_check = 0; @@ -3754,7 +3755,7 @@ static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index, case 6: l = 150; break; } if (!cpi->common.postproc_state.limits) { - CHECK_MEM_ERROR(cm, cpi->common.postproc_state.limits, + CHECK_MEM_ERROR(&cm->error, cpi->common.postproc_state.limits, vpx_calloc(cpi->un_scaled_source->y_width, sizeof(*cpi->common.postproc_state.limits))); } @@ -4098,7 +4099,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, svc->spatial_layer_id == svc->number_spatial_layers - 2) { if (svc->prev_partition_svc == NULL) { CHECK_MEM_ERROR( - cm, svc->prev_partition_svc, + &cm->error, svc->prev_partition_svc, (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows, sizeof(*svc->prev_partition_svc))); } @@ -5297,7 +5298,7 @@ static void init_mb_wiener_var_buffer(VP9_COMP *cpi) { cpi->mb_wiener_variance = NULL; CHECK_MEM_ERROR( - cm, cpi->mb_wiener_variance, + &cm->error, cpi->mb_wiener_variance, vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->mb_wiener_variance))); cpi->mb_wiener_var_rows = cm->mb_rows; cpi->mb_wiener_var_cols = cm->mb_cols; @@ -6544,7 +6545,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, pthread_mutex_init(&cpi->kmeans_mutex, NULL); #endif CHECK_MEM_ERROR( - cm, cpi->kmeans_data_arr, + &cm->error, cpi->kmeans_data_arr, vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->kmeans_data_arr))); cpi->kmeans_data_stride = mi_cols; cpi->kmeans_data_arr_alloc = 1; diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 8effe8741e..230a8315bc 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1060,7 +1060,7 @@ static INLINE void partition_info_init(struct VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; const int unit_width = get_num_unit_4x4(cpi->frame_info.frame_width); const int unit_height = get_num_unit_4x4(cpi->frame_info.frame_height); - CHECK_MEM_ERROR(cm, cpi->partition_info, + CHECK_MEM_ERROR(&cm->error, cpi->partition_info, (PARTITION_INFO *)vpx_calloc(unit_width * unit_height, sizeof(PARTITION_INFO))); memset(cpi->partition_info, 0, @@ -1088,7 +1088,7 @@ static INLINE void motion_vector_info_init(struct VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; const int unit_width = get_num_unit_4x4(cpi->frame_info.frame_width); const int unit_height = get_num_unit_4x4(cpi->frame_info.frame_height); - CHECK_MEM_ERROR(cm, cpi->motion_vector_info, + CHECK_MEM_ERROR(&cm->error, cpi->motion_vector_info, (MOTION_VECTOR_INFO *)vpx_calloc(unit_width * unit_height, sizeof(MOTION_VECTOR_INFO))); memset(cpi->motion_vector_info, 0, @@ -1107,7 +1107,7 @@ static INLINE void free_motion_vector_info(struct VP9_COMP *cpi) { static INLINE void tpl_stats_info_init(struct VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; CHECK_MEM_ERROR( - cm, cpi->tpl_stats_info, + &cm->error, cpi->tpl_stats_info, (TplDepStats *)vpx_calloc(MAX_LAG_BUFFERS, sizeof(TplDepStats))); memset(cpi->tpl_stats_info, 0, MAX_LAG_BUFFERS * sizeof(TplDepStats)); } @@ -1126,7 +1126,7 @@ static INLINE void fp_motion_vector_info_init(struct VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; const int unit_width = get_num_unit_16x16(cpi->frame_info.frame_width); const int unit_height = get_num_unit_16x16(cpi->frame_info.frame_height); - CHECK_MEM_ERROR(cm, cpi->fp_motion_vector_info, + 
CHECK_MEM_ERROR(&cm->error, cpi->fp_motion_vector_info, (MOTION_VECTOR_INFO *)vpx_calloc(unit_width * unit_height, sizeof(MOTION_VECTOR_INFO))); } @@ -1475,7 +1475,7 @@ static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) { if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows || new_fb_ptr->mi_cols < cm->mi_cols) { vpx_free(new_fb_ptr->mvs); - CHECK_MEM_ERROR(cm, new_fb_ptr->mvs, + CHECK_MEM_ERROR(&cm->error, new_fb_ptr->mvs, (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*new_fb_ptr->mvs))); new_fb_ptr->mi_rows = cm->mi_rows; diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c index 453fe2e0df..fadd233899 100644 --- a/vp9/encoder/vp9_ethread.c +++ b/vp9/encoder/vp9_ethread.c @@ -94,10 +94,10 @@ static void create_enc_workers(VP9_COMP *cpi, int num_workers) { vp9_bitstream_encode_tiles_buffer_dealloc(cpi); vp9_encode_free_mt_data(cpi); - CHECK_MEM_ERROR(cm, cpi->workers, + CHECK_MEM_ERROR(&cm->error, cpi->workers, vpx_malloc(num_workers * sizeof(*cpi->workers))); - CHECK_MEM_ERROR(cm, cpi->tile_thr_data, + CHECK_MEM_ERROR(&cm->error, cpi->tile_thr_data, vpx_calloc(num_workers, sizeof(*cpi->tile_thr_data))); for (i = 0; i < num_workers; i++) { @@ -111,7 +111,7 @@ static void create_enc_workers(VP9_COMP *cpi, int num_workers) { thread_data->cpi = cpi; // Allocate thread data. - CHECK_MEM_ERROR(cm, thread_data->td, + CHECK_MEM_ERROR(&cm->error, thread_data->td, vpx_memalign(32, sizeof(*thread_data->td))); vp9_zero(*thread_data->td); @@ -121,7 +121,7 @@ static void create_enc_workers(VP9_COMP *cpi, int num_workers) { vp9_setup_pc_tree(cm, thread_data->td); // Allocate frame counters in thread data. - CHECK_MEM_ERROR(cm, thread_data->td->counts, + CHECK_MEM_ERROR(&cm->error, thread_data->td->counts, vpx_calloc(1, sizeof(*thread_data->td->counts))); // Create threads @@ -292,7 +292,7 @@ void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm, { int i; - CHECK_MEM_ERROR(cm, row_mt_sync->mutex, + CHECK_MEM_ERROR(&cm->error, row_mt_sync->mutex, vpx_malloc(sizeof(*row_mt_sync->mutex) * rows)); if (row_mt_sync->mutex) { for (i = 0; i < rows; ++i) { @@ -300,7 +300,7 @@ void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm, } } - CHECK_MEM_ERROR(cm, row_mt_sync->cond, + CHECK_MEM_ERROR(&cm->error, row_mt_sync->cond, vpx_malloc(sizeof(*row_mt_sync->cond) * rows)); if (row_mt_sync->cond) { for (i = 0; i < rows; ++i) { @@ -310,7 +310,7 @@ void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm, } #endif // CONFIG_MULTITHREAD - CHECK_MEM_ERROR(cm, row_mt_sync->cur_col, + CHECK_MEM_ERROR(&cm->error, row_mt_sync->cur_col, vpx_malloc(sizeof(*row_mt_sync->cur_col) * rows)); // Set up nsync. 
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 71d8775ea5..8fdd976816 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -1422,7 +1422,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { if (cpi->row_mt_bit_exact && cpi->twopass.fp_mb_float_stats == NULL) CHECK_MEM_ERROR( - cm, cpi->twopass.fp_mb_float_stats, + &cm->error, cpi->twopass.fp_mb_float_stats, vpx_calloc(cm->MBs * sizeof(*cpi->twopass.fp_mb_float_stats), 1)); { diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c index 9487fc5fae..fafc673aca 100644 --- a/vp9/encoder/vp9_mbgraph.c +++ b/vp9/encoder/vp9_mbgraph.c @@ -288,7 +288,7 @@ static void separate_arf_mbs(VP9_COMP *cpi) { int *arf_not_zz; CHECK_MEM_ERROR( - cm, arf_not_zz, + &cm->error, arf_not_zz, vpx_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), 1)); // We are not interested in results beyond the alt ref itself. diff --git a/vp9/encoder/vp9_multi_thread.c b/vp9/encoder/vp9_multi_thread.c index 45659f2a9a..0843cd97e4 100644 --- a/vp9/encoder/vp9_multi_thread.c +++ b/vp9/encoder/vp9_multi_thread.c @@ -59,7 +59,7 @@ void vp9_row_mt_alloc_rd_thresh(VP9_COMP *const cpi, int i; CHECK_MEM_ERROR( - cm, this_tile->row_base_thresh_freq_fact, + &cm->error, this_tile->row_base_thresh_freq_fact, (int *)vpx_calloc(sb_rows * BLOCK_SIZES * MAX_MODES, sizeof(*(this_tile->row_base_thresh_freq_fact)))); for (i = 0; i < sb_rows * BLOCK_SIZES * MAX_MODES; i++) @@ -85,7 +85,7 @@ void vp9_row_mt_mem_alloc(VP9_COMP *cpi) { multi_thread_ctxt->allocated_tile_rows = tile_rows; multi_thread_ctxt->allocated_vert_unit_rows = jobs_per_tile_col; - CHECK_MEM_ERROR(cm, multi_thread_ctxt->job_queue, + CHECK_MEM_ERROR(&cm->error, multi_thread_ctxt->job_queue, (JobQueue *)vpx_memalign(32, total_jobs * sizeof(JobQueue))); #if CONFIG_MULTITHREAD diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 60720e3ea6..48c21c581e 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -676,7 +676,7 @@ static void set_rt_speed_feature_framesize_independent( if (cpi->content_state_sb_fd == NULL && (!cpi->use_svc || svc->spatial_layer_id == svc->number_spatial_layers - 1)) { - CHECK_MEM_ERROR(cm, cpi->content_state_sb_fd, + CHECK_MEM_ERROR(&cm->error, cpi->content_state_sb_fd, (uint8_t *)vpx_calloc( (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(uint8_t))); @@ -832,13 +832,13 @@ static void set_rt_speed_feature_framesize_independent( } if (cpi->count_arf_frame_usage == NULL) { CHECK_MEM_ERROR( - cm, cpi->count_arf_frame_usage, + &cm->error, cpi->count_arf_frame_usage, (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(*cpi->count_arf_frame_usage))); } if (cpi->count_lastgolden_frame_usage == NULL) CHECK_MEM_ERROR( - cm, cpi->count_lastgolden_frame_usage, + &cm->error, cpi->count_lastgolden_frame_usage, (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(*cpi->count_lastgolden_frame_usage))); } diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index f08d668203..e4721271d9 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -163,17 +163,17 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { lc->actual_num_seg1_blocks = 0; lc->actual_num_seg2_blocks = 0; lc->counter_encode_maxq_scene_change = 0; - CHECK_MEM_ERROR(cm, lc->map, + CHECK_MEM_ERROR(&cm->error, lc->map, vpx_malloc(mi_rows * mi_cols * sizeof(*lc->map))); 
memset(lc->map, 0, mi_rows * mi_cols); last_coded_q_map_size = mi_rows * mi_cols * sizeof(*lc->last_coded_q_map); - CHECK_MEM_ERROR(cm, lc->last_coded_q_map, + CHECK_MEM_ERROR(&cm->error, lc->last_coded_q_map, vpx_malloc(last_coded_q_map_size)); assert(MAXQ <= 255); memset(lc->last_coded_q_map, MAXQ, last_coded_q_map_size); consec_zero_mv_size = mi_rows * mi_cols * sizeof(*lc->consec_zero_mv); - CHECK_MEM_ERROR(cm, lc->consec_zero_mv, + CHECK_MEM_ERROR(&cm->error, lc->consec_zero_mv, vpx_malloc(consec_zero_mv_size)); memset(lc->consec_zero_mv, 0, consec_zero_mv_size); } diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index ed771dcb4b..de3783f9a5 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -1320,7 +1320,7 @@ void vp9_init_tpl_buffer(VP9_COMP *cpi) { vpx_free(cpi->select_mv_arr); CHECK_MEM_ERROR( - cm, cpi->select_mv_arr, + &cm->error, cpi->select_mv_arr, vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->select_mv_arr))); #endif @@ -1335,23 +1335,23 @@ void vp9_init_tpl_buffer(VP9_COMP *cpi) { for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]); CHECK_MEM_ERROR( - cm, cpi->tpl_stats[frame].mv_mode_arr[rf_idx], + &cm->error, cpi->tpl_stats[frame].mv_mode_arr[rf_idx], vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->tpl_stats[frame].mv_mode_arr[rf_idx]))); vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]); CHECK_MEM_ERROR( - cm, cpi->tpl_stats[frame].rd_diff_arr[rf_idx], + &cm->error, cpi->tpl_stats[frame].rd_diff_arr[rf_idx], vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->tpl_stats[frame].rd_diff_arr[rf_idx]))); } #endif vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); - CHECK_MEM_ERROR(cm, cpi->tpl_stats[frame].tpl_stats_ptr, + CHECK_MEM_ERROR(&cm->error, cpi->tpl_stats[frame].tpl_stats_ptr, vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr))); vpx_free(cpi->tpl_frame_stats[frame].block_stats_list); CHECK_MEM_ERROR( - cm, cpi->tpl_frame_stats[frame].block_stats_list, + &cm->error, cpi->tpl_frame_stats[frame].block_stats_list, vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->tpl_frame_stats[frame].block_stats_list))); cpi->tpl_frame_stats[frame].num_blocks = mi_rows * mi_cols; diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h index 670fe380ed..aae3218738 100644 --- a/vpx/internal/vpx_codec_internal.h +++ b/vpx/internal/vpx_codec_internal.h @@ -48,6 +48,8 @@ #include "../vpx_encoder.h" #include +#include "vpx_config.h" + #ifdef __cplusplus extern "C" { #endif @@ -427,6 +429,27 @@ struct vpx_internal_error_info { jmp_buf jmp; }; +#if CONFIG_DEBUG +#define CHECK_MEM_ERROR(error, lval, expr) \ + do { \ + assert((error)->setjmp); \ + (lval) = (expr); \ + if (!(lval)) \ + vpx_internal_error(error, VPX_CODEC_MEM_ERROR, \ + "Failed to allocate " #lval " at %s:%d", __FILE__, \ + __LINE__); \ + } while (0) +#else +#define CHECK_MEM_ERROR(error, lval, expr) \ + do { \ + assert((error)->setjmp); \ + (lval) = (expr); \ + if (!(lval)) \ + vpx_internal_error(error, VPX_CODEC_MEM_ERROR, \ + "Failed to allocate " #lval); \ + } while (0) +#endif + #define CLANG_ANALYZER_NORETURN #if defined(__has_feature) #if __has_feature(attribute_analyzer_noreturn) From 745c6392f795f43138cfca164b9e54ef895f87b9 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 4 May 2023 14:28:29 -0400 Subject: [PATCH 719/926] Add VpxTplGopStats Contains the size of GOP - also the size of the list of TPL stats for each frame in this GOP. 
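A minimal caller-side sketch of the new VP9E_GET_TPL_STATS control described
below (hypothetical; it mirrors the updated encode_api_test in this change,
uses the generic vpx_codec_control() entry point, assumes an initialized
encoder context, and omits error handling):

    VpxTplGopStats gop_stats;
    /* Upper-bound allocation; the control fills in the actual GOP size. */
    gop_stats.size = 50; /* MAX_ARF_GOP_SIZE */
    gop_stats.frame_stats_list =
        (VpxTplFrameStats *)calloc(50, sizeof(VpxTplFrameStats));
    if (vpx_codec_control(&encoder, VP9E_GET_TPL_STATS, &gop_stats) ==
        VPX_CODEC_OK) {
      /* gop_stats.size frames of per-block TPL stats are now available. */
    }
    free(gop_stats.frame_stats_list);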
VpxTplGopStats will be the unit for VP9E_GET_TPL_STATS control to return TPL stats from the encoder. Bug: b/273736974 Change-Id: I1682242fc6db4aafcd6314af023aa0d704976585 --- test/encode_api_test.cc | 34 +++++++++++++++------------ test/encode_test_driver.h | 2 +- vp9/encoder/vp9_encoder.c | 1 - vp9/encoder/vp9_encoder.h | 2 +- vp9/encoder/vp9_tpl_model.c | 47 +++++++++++++++++++++++++++---------- vp9/vp9_cx_iface.c | 12 +++++----- vpx/vpx_encoder.h | 6 +++++ 7 files changed, 68 insertions(+), 36 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index e435ed872f..2b0aa1fdfe 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -384,12 +384,15 @@ class EncodeApiGetTplStatsTest } } - vpx_codec_err_t AllocateTplList(VpxTplFrameStats **data) { - // Allocate MAX_ARF_GOP_SIZE * sizeof(VpxTplFrameStats) that will be filled - // by VP9E_GET_TPL_STATS - *data = + vpx_codec_err_t AllocateTplList(VpxTplGopStats *data) { + // Allocate MAX_ARF_GOP_SIZE (50) * sizeof(VpxTplFrameStats) that will be + // filled by VP9E_GET_TPL_STATS. + // MAX_ARF_GOP_SIZE is used here because the test doesn't know the size of + // each GOP before getting TPL stats from the encoder. + data->size = 50; + data->frame_stats_list = static_cast(calloc(50, sizeof(VpxTplFrameStats))); - if (*data == nullptr) return VPX_CODEC_MEM_ERROR; + if (data->frame_stats_list == nullptr) return VPX_CODEC_MEM_ERROR; return VPX_CODEC_OK; } @@ -398,22 +401,23 @@ class EncodeApiGetTplStatsTest while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { switch (pkt->kind) { case VPX_CODEC_CX_FRAME_PKT: { - VpxTplFrameStats *tpl_stats = NULL; + VpxTplGopStats tpl_stats; EXPECT_EQ(AllocateTplList(&tpl_stats), VPX_CODEC_OK); - encoder->Control(VP9E_GET_TPL_STATS, tpl_stats); + encoder->Control(VP9E_GET_TPL_STATS, &tpl_stats); bool stats_not_all_zero = false; - for (unsigned int i = 0; i < cfg_.g_lag_in_frames; i++) { - if (tpl_stats[i].frame_width != 0) { - ASSERT_EQ(tpl_stats[i].frame_width, width_); - ASSERT_EQ(tpl_stats[i].frame_height, height_); - ASSERT_GT(tpl_stats[i].num_blocks, 0); - ASSERT_NE(tpl_stats[i].block_stats_list, nullptr); + for (int i = 0; i < tpl_stats.size; i++) { + VpxTplFrameStats *frame_stats_list = tpl_stats.frame_stats_list; + if (frame_stats_list[i].frame_width != 0) { + ASSERT_EQ(frame_stats_list[i].frame_width, width_); + ASSERT_EQ(frame_stats_list[i].frame_height, height_); + ASSERT_GT(frame_stats_list[i].num_blocks, 0); + ASSERT_NE(frame_stats_list[i].block_stats_list, nullptr); stats_not_all_zero = true; } } ASSERT_TRUE(stats_not_all_zero); // Free the memory right away now as this is only a test. 
- free(tpl_stats); + free(tpl_stats.frame_stats_list); break; } default: break; @@ -430,7 +434,7 @@ TEST_P(EncodeApiGetTplStatsTest, GetTplStats) { width_ = 352; height_ = 288; ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", width_, - height_, 30, 1, 0, 150); + height_, 30, 1, 0, 50); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index a5cd8306ef..922c49f420 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -154,7 +154,7 @@ class Encoder { ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); } - void Control(int ctrl_id, VpxTplFrameStats *arg) { + void Control(int ctrl_id, VpxTplGopStats *arg) { const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); } diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index db21509ce9..5126a971a1 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2628,7 +2628,6 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, #endif // CONFIG_NON_GREEDY_MV for (i = 0; i < MAX_ARF_GOP_SIZE; ++i) { cpi->tpl_stats[i].tpl_stats_ptr = NULL; - cpi->tpl_frame_stats[i].block_stats_list = NULL; } // Allocate memory to store variances for a frame. diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 230a8315bc..2528bc2316 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -745,7 +745,7 @@ typedef struct VP9_COMP { BLOCK_SIZE tpl_bsize; TplDepFrame tpl_stats[MAX_ARF_GOP_SIZE]; // Used to store TPL stats before propagation - VpxTplFrameStats tpl_frame_stats[MAX_ARF_GOP_SIZE]; + VpxTplGopStats tpl_gop_stats; YV12_BUFFER_CONFIG *tpl_recon_frames[REF_FRAMES]; EncFrameBuf enc_frame_buf[REF_FRAMES]; #if CONFIG_MULTITHREAD diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index de3783f9a5..9f4bafdf83 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -154,17 +154,43 @@ static void init_tpl_stats(VP9_COMP *cpi) { int frame_idx; for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - VpxTplFrameStats *tpl_frame_stats = &cpi->tpl_frame_stats[frame_idx]; memset(tpl_frame->tpl_stats_ptr, 0, tpl_frame->height * tpl_frame->width * sizeof(*tpl_frame->tpl_stats_ptr)); - memset(tpl_frame_stats->block_stats_list, 0, - tpl_frame->height * tpl_frame->width * - sizeof(*tpl_frame_stats->block_stats_list)); tpl_frame->is_valid = 0; } } +static void free_tpl_frame_stats_list(VpxTplGopStats *tpl_gop_stats) { + int frame_idx; + for (frame_idx = 0; frame_idx < tpl_gop_stats->size; ++frame_idx) { + vpx_free(tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list); + } + vpx_free(tpl_gop_stats->frame_stats_list); +} + +static void init_tpl_stats_before_propagation( + struct vpx_internal_error_info *error_info, VpxTplGopStats *tpl_gop_stats, + TplDepFrame *tpl_stats, int tpl_gop_frames) { + int frame_idx; + free_tpl_frame_stats_list(tpl_gop_stats); + CHECK_MEM_ERROR( + error_info, tpl_gop_stats->frame_stats_list, + vpx_calloc(tpl_gop_frames, sizeof(*tpl_gop_stats->frame_stats_list))); + tpl_gop_stats->size = tpl_gop_frames; + for (frame_idx = 0; frame_idx < tpl_gop_frames; ++frame_idx) { + const int mi_rows = tpl_stats[frame_idx].height; + const int mi_cols = tpl_stats[frame_idx].width; + CHECK_MEM_ERROR( + error_info, tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list, + vpx_calloc( + mi_rows * mi_cols, + sizeof( + 
*tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list))); + tpl_gop_stats->frame_stats_list[frame_idx].num_blocks = mi_rows * mi_cols; + } +} + #if CONFIG_NON_GREEDY_MV static uint32_t full_pixel_motion_search(VP9_COMP *cpi, ThreadData *td, MotionField *motion_field, @@ -1106,7 +1132,7 @@ static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, int frame_idx, BLOCK_SIZE bsize) { TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; VpxTplFrameStats *tpl_frame_stats_before_propagation = - &cpi->tpl_frame_stats[frame_idx]; + &cpi->tpl_gop_stats.frame_stats_list[frame_idx]; YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame; YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES] = { NULL, NULL, NULL }; @@ -1349,12 +1375,6 @@ void vp9_init_tpl_buffer(VP9_COMP *cpi) { CHECK_MEM_ERROR(&cm->error, cpi->tpl_stats[frame].tpl_stats_ptr, vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr))); - vpx_free(cpi->tpl_frame_stats[frame].block_stats_list); - CHECK_MEM_ERROR( - &cm->error, cpi->tpl_frame_stats[frame].block_stats_list, - vpx_calloc(mi_rows * mi_cols, - sizeof(*cpi->tpl_frame_stats[frame].block_stats_list))); - cpi->tpl_frame_stats[frame].num_blocks = mi_rows * mi_cols; cpi->tpl_stats[frame].is_valid = 0; cpi->tpl_stats[frame].width = mi_cols; cpi->tpl_stats[frame].height = mi_rows; @@ -1385,8 +1405,8 @@ void vp9_free_tpl_buffer(VP9_COMP *cpi) { #endif vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); cpi->tpl_stats[frame].is_valid = 0; - vpx_free(cpi->tpl_frame_stats[frame].block_stats_list); } + free_tpl_frame_stats_list(&cpi->tpl_gop_stats); } #if CONFIG_RATE_CTRL @@ -1442,6 +1462,9 @@ void vp9_setup_tpl_stats(VP9_COMP *cpi) { init_tpl_stats(cpi); + init_tpl_stats_before_propagation(&cpi->common.error, &cpi->tpl_gop_stats, + cpi->tpl_stats, tpl_group_frames); + // Backward propagation from tpl_group_frames to 1. for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx) { if (gf_picture[frame_idx].update_type == USE_BUF_FRAME) continue; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 7150f74284..62128ff28a 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1789,16 +1789,16 @@ static vpx_codec_err_t ctrl_get_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t ctrl_get_tpl_stats(vpx_codec_alg_priv_t *ctx, va_list args) { VP9_COMP *const cpi = ctx->cpi; - VpxTplFrameStats *data = va_arg(args, VpxTplFrameStats *); + VpxTplGopStats *data = va_arg(args, VpxTplGopStats *); + VpxTplFrameStats *frame_stats_list = cpi->tpl_gop_stats.frame_stats_list; int i; if (data == NULL) { return VPX_CODEC_INVALID_PARAM; } - for (i = 0; i < MAX_ARF_GOP_SIZE; i++) { - data[i].frame_width = cpi->tpl_frame_stats[i].frame_width; - data[i].frame_height = cpi->tpl_frame_stats[i].frame_height; - data[i].num_blocks = cpi->tpl_frame_stats[i].num_blocks; - data[i].block_stats_list = cpi->tpl_frame_stats[i].block_stats_list; + data->size = cpi->tpl_gop_stats.size; + + for (i = 0; i < data->size; i++) { + data->frame_stats_list[i] = frame_stats_list[i]; } return VPX_CODEC_OK; diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index 2de8089736..fb95723dd3 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -271,6 +271,12 @@ typedef struct VpxTplFrameStats { VpxTplBlockStats *block_stats_list; /**< List of tpl stats for each block */ } VpxTplFrameStats; +/*!\brief Temporal dependency model stats for each GOP before propagation */ +typedef struct VpxTplGopStats { + int size; /**< GOP size, also the size of frame_stats_list. 
*/ + VpxTplFrameStats *frame_stats_list; /**< List of tpl stats for each frame */ +} VpxTplGopStats; + /*!\brief Encoded Frame Flags * * This type indicates a bitfield to be passed to vpx_codec_encode(), defining From 3fe13658846564f37399035146132ee2af2b1ba6 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 5 May 2023 16:56:51 -0700 Subject: [PATCH 720/926] configure: add clang-cl vs1[67] arm64 targets x86 and armv7 are skipped for now as the intrinsics will need different flags than cl.exe (/arch:... -> -m...). Bug: webm:1788 Change-Id: I8ca8660a8644cdd84c51cb1f75005e371ba8207d --- README | 4 ++++ build/make/gen_msvs_vcxproj.sh | 38 +++++++++++++++++++++++----------- configure | 2 ++ 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/README b/README index 87e46f99d4..9fa50038fb 100644 --- a/README +++ b/README @@ -69,6 +69,10 @@ COMPILING THE APPLICATIONS/LIBRARIES: arm64-linux-gcc arm64-win64-gcc arm64-win64-vs15 + arm64-win64-vs16 + arm64-win64-vs16-clangcl + arm64-win64-vs17 + arm64-win64-vs17-clangcl armv7-android-gcc armv7-darwin-gcc armv7-linux-rvct diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh index 58bb66b9e3..482d88f49b 100755 --- a/build/make/gen_msvs_vcxproj.sh +++ b/build/make/gen_msvs_vcxproj.sh @@ -141,7 +141,17 @@ for opt in "$@"; do case "$opt" in --help|-h) show_help ;; - --target=*) target="${optval}" + --target=*) + target="${optval}" + platform_toolset=$(echo ${target} | awk 'BEGIN{FS="-"}{print $4}') + case "$platform_toolset" in + clangcl) platform_toolset="ClangCl" + ;; + "") + ;; + *) die Unrecognized Visual Studio Platform Toolset in $opt + ;; + esac ;; --out=*) outfile="$optval" ;; @@ -335,17 +345,21 @@ generate_vcxproj() { else tag_content ConfigurationType StaticLibrary fi - if [ "$vs_ver" = "14" ]; then - tag_content PlatformToolset v140 - fi - if [ "$vs_ver" = "15" ]; then - tag_content PlatformToolset v141 - fi - if [ "$vs_ver" = "16" ]; then - tag_content PlatformToolset v142 - fi - if [ "$vs_ver" = "17" ]; then - tag_content PlatformToolset v143 + if [ -n "$platform_toolset" ]; then + tag_content PlatformToolset "$platform_toolset" + else + if [ "$vs_ver" = "14" ]; then + tag_content PlatformToolset v140 + fi + if [ "$vs_ver" = "15" ]; then + tag_content PlatformToolset v141 + fi + if [ "$vs_ver" = "16" ]; then + tag_content PlatformToolset v142 + fi + if [ "$vs_ver" = "17" ]; then + tag_content PlatformToolset v143 + fi fi tag_content CharacterSet Unicode if [ "$config" = "Release" ]; then diff --git a/configure b/configure index 20707727ef..e4e6acd107 100755 --- a/configure +++ b/configure @@ -106,7 +106,9 @@ all_platforms="${all_platforms} arm64-linux-gcc" all_platforms="${all_platforms} arm64-win64-gcc" all_platforms="${all_platforms} arm64-win64-vs15" all_platforms="${all_platforms} arm64-win64-vs16" +all_platforms="${all_platforms} arm64-win64-vs16-clangcl" all_platforms="${all_platforms} arm64-win64-vs17" +all_platforms="${all_platforms} arm64-win64-vs17-clangcl" all_platforms="${all_platforms} armv7-android-gcc" #neon Cortex-A8 all_platforms="${all_platforms} armv7-darwin-gcc" #neon Cortex-A8 all_platforms="${all_platforms} armv7-linux-rvct" #neon Cortex-A8 From 3916e0e1308acd8511269ca4ed6394601f72cc10 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 5 May 2023 19:00:08 -0700 Subject: [PATCH 721/926] gen_msvs_vcxproj: add ARM64EC w/VS >= 2022 rather than define new targets, add a platform to the arm64 list as they share the same configuration. 
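For instance, a hypothetical invocation that exercises this path (the target
name is one of those listed in README; driving the generated project files
with msbuild afterwards is out of scope here):

    ./configure --target=arm64-win64-vs17
    make

With vs17 (Visual Studio 2022) and the default, non-ClangCl platform toolset,
the generated vcxproj files then list ARM64EC alongside ARM64.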
Bug: webm:1788 Change-Id: Iac020280b1103fb12b559f21439aeff26568fba4 --- build/make/gen_msvs_vcxproj.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh index 482d88f49b..1e1db05bb2 100755 --- a/build/make/gen_msvs_vcxproj.sh +++ b/build/make/gen_msvs_vcxproj.sh @@ -269,6 +269,10 @@ case "$target" in ;; arm64*) platforms[0]="ARM64" + # As of Visual Studio 2022 17.5.5, clang-cl does not support ARM64EC. + if [ "$vs_ver" -ge 17 -a "$platform_toolset" != "ClangCl" ]; then + platforms[1]="ARM64EC" + fi asm_Debug_cmdline="armasm64 -nologo -oldit "%(FullPath)"" asm_Release_cmdline="armasm64 -nologo -oldit "%(FullPath)"" ;; From fbbe1d0115efc41a3c7001cc161aa1ec64a9f711 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 8 May 2023 11:48:15 -0700 Subject: [PATCH 722/926] vp8_macros_msa.h: clear -Wshadow warnings Bug: webm:1793 Change-Id: Ia940b06bd23a915a050432e03bb630567e891d8d --- vp8/common/mips/msa/vp8_macros_msa.h | 258 +++++++++++++-------------- 1 file changed, 129 insertions(+), 129 deletions(-) diff --git a/vp8/common/mips/msa/vp8_macros_msa.h b/vp8/common/mips/msa/vp8_macros_msa.h index 7cb3c98690..cc85b9a1f7 100644 --- a/vp8/common/mips/msa/vp8_macros_msa.h +++ b/vp8/common/mips/msa/vp8_macros_msa.h @@ -40,160 +40,160 @@ #define ST_SW(...) ST_W(v4i32, __VA_ARGS__) #if (__mips_isa_rev >= 6) -#define LW(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - asm volatile("lw %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LW(psrc) \ + ({ \ + const uint8_t *lw_psrc_m = (const uint8_t *)(psrc); \ + uint32_t lw_val_m; \ + \ + asm volatile("lw %[lw_val_m], %[lw_psrc_m] \n\t" \ + \ + : [lw_val_m] "=r"(lw_val_m) \ + : [lw_psrc_m] "m"(*lw_psrc_m)); \ + \ + lw_val_m; \ }) #if (__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - asm volatile("ld %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \ + uint64_t ld_val_m = 0; \ + \ + asm volatile("ld %[ld_val_m], %[ld_psrc_m] \n\t" \ + \ + : [ld_val_m] "=r"(ld_val_m) \ + : [ld_psrc_m] "m"(*ld_psrc_m)); \ + \ + ld_val_m; \ }) #else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_ld = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ - \ - val0_m = LW(psrc_ld); \ - val1_m = LW(psrc_ld + 4); \ - \ - val_m = (uint64_t)(val1_m); \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \ + uint32_t ld_val0_m, ld_val1_m; \ + uint64_t ld_val_m = 0; \ + \ + ld_val0_m = LW(ld_psrc_m); \ + ld_val1_m = LW(ld_psrc_m + 4); \ + \ + ld_val_m = (uint64_t)(ld_val1_m); \ + ld_val_m = (uint64_t)((ld_val_m << 32) & 0xFFFFFFFF00000000); \ + ld_val_m = (uint64_t)(ld_val_m | (uint64_t)ld_val0_m); \ + \ + ld_val_m; \ }) #endif // (__mips == 64) -#define SH(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - asm volatile("sh %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SH(val, pdst) \ + { \ + uint8_t *sh_pdst_m = (uint8_t *)(pdst); \ + const uint16_t sh_val_m = (val); \ + \ + asm volatile("sh %[sh_val_m], 
%[sh_pdst_m] \n\t" \ + \ + : [sh_pdst_m] "=m"(*sh_pdst_m) \ + : [sh_val_m] "r"(sh_val_m)); \ } -#define SW(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - asm volatile("sw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SW(val, pdst) \ + { \ + uint8_t *sw_pdst_m = (uint8_t *)(pdst); \ + const uint32_t sw_val_m = (val); \ + \ + asm volatile("sw %[sw_val_m], %[sw_pdst_m] \n\t" \ + \ + : [sw_pdst_m] "=m"(*sw_pdst_m) \ + : [sw_val_m] "r"(sw_val_m)); \ } -#define SD(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint64_t val_m = (val); \ - \ - asm volatile("sd %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SD(val, pdst) \ + { \ + uint8_t *sd_pdst_m = (uint8_t *)(pdst); \ + const uint64_t sd_val_m = (val); \ + \ + asm volatile("sd %[sd_val_m], %[sd_pdst_m] \n\t" \ + \ + : [sd_pdst_m] "=m"(*sd_pdst_m) \ + : [sd_val_m] "r"(sd_val_m)); \ } #else // !(__mips_isa_rev >= 6) -#define LW(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - asm volatile( \ - "lwr %[val_m], 0(%[psrc_m]) \n\t" \ - "lwl %[val_m], 3(%[psrc_m]) \n\t" \ - : [val_m] "=&r"(val_m) \ - : [psrc_m] "r"(psrc_m)); \ - \ - val_m; \ +#define LW(psrc) \ + ({ \ + const uint8_t *lw_psrc_m = (const uint8_t *)(psrc); \ + uint32_t lw_val_m; \ + \ + asm volatile( \ + "lwr %[lw_val_m], 0(%[lw_psrc_m]) \n\t" \ + "lwl %[lw_val_m], 3(%[lw_psrc_m]) \n\t" \ + : [lw_val_m] "=&r"(lw_val_m) \ + : [lw_psrc_m] "r"(lw_psrc_m)); \ + \ + lw_val_m; \ }) #if (__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - asm volatile( \ - "ldr %[val_m], 0(%[psrc_m]) \n\t" \ - "ldl %[val_m], 7(%[psrc_m]) \n\t" \ - : [val_m] "=&r"(val_m) \ - : [psrc_m] "r"(psrc_m)); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \ + uint64_t ld_val_m = 0; \ + \ + asm volatile( \ + "ldr %[ld_val_m], 0(%[ld_psrc_m]) \n\t" \ + "ldl %[ld_val_m], 7(%[ld_psrc_m]) \n\t" \ + : [ld_val_m] "=&r"(ld_val_m) \ + : [ld_psrc_m] "r"(ld_psrc_m)); \ + \ + ld_val_m; \ }) #else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ - \ - val0_m = LW(psrc_m1); \ - val1_m = LW(psrc_m1 + 4); \ - \ - val_m = (uint64_t)(val1_m); \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *ld_psrc_m1 = (const uint8_t *)(psrc); \ + uint32_t ld_val0_m, ld_val1_m; \ + uint64_t ld_val_m = 0; \ + \ + ld_val0_m = LW(ld_psrc_m1); \ + ld_val1_m = LW(ld_psrc_m1 + 4); \ + \ + ld_val_m = (uint64_t)(ld_val1_m); \ + ld_val_m = (uint64_t)((ld_val_m << 32) & 0xFFFFFFFF00000000); \ + ld_val_m = (uint64_t)(ld_val_m | (uint64_t)ld_val0_m); \ + \ + ld_val_m; \ }) #endif // (__mips == 64) -#define SH(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - asm volatile("ush %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SH(val, pdst) \ + { \ + uint8_t *sh_pdst_m = (uint8_t *)(pdst); \ + const uint16_t sh_val_m = (val); \ + \ + asm volatile("ush %[sh_val_m], %[sh_pdst_m] \n\t" \ + \ + : [sh_pdst_m] "=m"(*sh_pdst_m) \ + : [sh_val_m] "r"(sh_val_m)); \ } -#define SW(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t 
*)(pdst); \ - const uint32_t val_m = (val); \ - \ - asm volatile("usw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SW(val, pdst) \ + { \ + uint8_t *sw_pdst_m = (uint8_t *)(pdst); \ + const uint32_t sw_val_m = (val); \ + \ + asm volatile("usw %[sw_val_m], %[sw_pdst_m] \n\t" \ + \ + : [sw_pdst_m] "=m"(*sw_pdst_m) \ + : [sw_val_m] "r"(sw_val_m)); \ } -#define SD(val, pdst) \ - { \ - uint8_t *pdst_m1 = (uint8_t *)(pdst); \ - uint32_t val0_m, val1_m; \ - \ - val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ - val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ - \ - SW(val0_m, pdst_m1); \ - SW(val1_m, pdst_m1 + 4); \ +#define SD(val, pdst) \ + { \ + uint8_t *sd_pdst_m1 = (uint8_t *)(pdst); \ + uint32_t sd_val0_m, sd_val1_m; \ + \ + sd_val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ + sd_val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + \ + SW(sd_val0_m, sd_pdst_m1); \ + SW(sd_val1_m, sd_pdst_m1 + 4); \ } #endif // (__mips_isa_rev >= 6) From 457b7f59860955415a23c20c535fc13fde51936f Mon Sep 17 00:00:00 2001 From: Anupam Pandey Date: Mon, 8 May 2023 12:10:09 +0530 Subject: [PATCH 723/926] Add AVX2 intrinsic for vpx_comp_avg_pred() function The module level scaling w.r.t C function (timer based) for existing (SSE2) and new AVX2 intrinsics: If ref_padding = 0 Block Scaling size SSE2 AVX2 8x4 3.24x 3.24x 8x8 4.22x 4.90x 8x16 5.91x 5.93x 16x8 1.63x 3.52x 16x16 1.53x 4.19x 16x32 1.38x 4.82x 32x16 1.28x 3.08x 32x32 1.45x 3.13x 32x64 1.38x 3.04x 64x32 1.39x 2.12x 64x64 1.46x 2.24x If ref_padding = 8 Block Scaling size SSE2 AVX2 8x4 3.20x 3.21x 8x8 4.61x 4.83x 8x16 5.50x 6.45x 16x8 1.56x 3.35x 16x16 1.53x 4.19x 16x32 1.37x 4.83x 32x16 1.28x 3.07x 32x32 1.46x 3.29x 32x64 1.38x 3.22x 64x32 1.38x 2.14x 64x64 1.38x 2.12x This is a bit-exact change. Change-Id: I72c5d155f64d0c630bc8c3aef21dc8bbd045d9e6 --- test/comp_avg_pred_test.cc | 21 ++++--- vp9/encoder/vp9_mcomp.c | 8 +-- vp9/encoder/vp9_rdopt.c | 4 +- vpx_dsp/sad.c | 2 +- vpx_dsp/variance.c | 2 +- vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- vpx_dsp/x86/avg_pred_avx2.c | 111 +++++++++++++++++++++++++++++++++++ 8 files changed, 134 insertions(+), 17 deletions(-) create mode 100644 vpx_dsp/x86/avg_pred_avx2.c diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc index f747c3524e..d8fabd5bef 100644 --- a/test/comp_avg_pred_test.cc +++ b/test/comp_avg_pred_test.cc @@ -81,11 +81,11 @@ void AvgPredTest::TestSizeCombinations() { // Only the reference buffer may have a stride not equal to width. Buffer ref = Buffer(width, height, ref_padding ? 
8 : 0); ASSERT_TRUE(ref.Init()); - Buffer pred = Buffer(width, height, 0, 16); + Buffer pred = Buffer(width, height, 0, 32); ASSERT_TRUE(pred.Init()); - Buffer avg_ref = Buffer(width, height, 0, 16); + Buffer avg_ref = Buffer(width, height, 0, 32); ASSERT_TRUE(avg_ref.Init()); - Buffer avg_chk = Buffer(width, height, 0, 16); + Buffer avg_chk = Buffer(width, height, 0, 32); ASSERT_TRUE(avg_chk.Init()); const int bitdepth_mask = (1 << bitdepth) - 1; for (int h = 0; h < height; ++h) { @@ -121,11 +121,11 @@ void AvgPredTest::TestCompareReferenceRandom() { const int height = 32; Buffer ref = Buffer(width, height, 8); ASSERT_TRUE(ref.Init()); - Buffer pred = Buffer(width, height, 0, 16); + Buffer pred = Buffer(width, height, 0, 32); ASSERT_TRUE(pred.Init()); - Buffer avg_ref = Buffer(width, height, 0, 16); + Buffer avg_ref = Buffer(width, height, 0, 32); ASSERT_TRUE(avg_ref.Init()); - Buffer avg_chk = Buffer(width, height, 0, 16); + Buffer avg_chk = Buffer(width, height, 0, 32); ASSERT_TRUE(avg_chk.Init()); for (int i = 0; i < 500; ++i) { @@ -167,9 +167,9 @@ void AvgPredTest::TestSpeed() { const int height = 1 << height_pow; Buffer ref = Buffer(width, height, ref_padding ? 8 : 0); ASSERT_TRUE(ref.Init()); - Buffer pred = Buffer(width, height, 0, 16); + Buffer pred = Buffer(width, height, 0, 32); ASSERT_TRUE(pred.Init()); - Buffer avg = Buffer(width, height, 0, 16); + Buffer avg = Buffer(width, height, 0, 32); ASSERT_TRUE(avg.Init()); const int bitdepth_mask = (1 << bitdepth) - 1; for (int h = 0; h < height; ++h) { @@ -217,6 +217,11 @@ INSTANTIATE_TEST_SUITE_P(SSE2, AvgPredTestLBD, ::testing::Values(&vpx_comp_avg_pred_sse2)); #endif // HAVE_SSE2 +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, AvgPredTestLBD, + ::testing::Values(&vpx_comp_avg_pred_avx2)); +#endif // HAVE_AVX2 + #if HAVE_NEON INSTANTIATE_TEST_SUITE_P(NEON, AvgPredTestLBD, ::testing::Values(&vpx_comp_avg_pred_neon)); diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 64e9ef0f91..0ea0f85e42 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -297,7 +297,7 @@ static unsigned int setup_center_error( besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1); } else { - DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); + DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]); vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); } @@ -312,7 +312,7 @@ static unsigned int setup_center_error( uint32_t besterr; (void)xd; if (second_pred != NULL) { - DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); + DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]); vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); } else { @@ -635,7 +635,7 @@ static int accurate_sub_pel_search( vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h, 0, kernel, MV_PRECISION_Q3, 0, 0); if (second_pred != NULL) { - DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); + DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]); vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w); besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse); } else { @@ -654,7 +654,7 @@ static int accurate_sub_pel_search( vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h, 0, kernel, MV_PRECISION_Q3, 0, 0); if (second_pred != NULL) { - DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); + DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]); 
vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w); besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse); } else { diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index f051c62791..464705a678 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -1937,10 +1937,10 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, // Prediction buffer from second frame. #if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[64 * 64]); + DECLARE_ALIGNED(32, uint16_t, second_pred_alloc_16[64 * 64]); uint8_t *second_pred; #else - DECLARE_ALIGNED(16, uint8_t, second_pred[64 * 64]); + DECLARE_ALIGNED(32, uint8_t, second_pred[64 * 64]); #endif // CONFIG_VP9_HIGHBITDEPTH // Check number of iterations do not exceed the max diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c index 619d7aa956..2a4c81d588 100644 --- a/vpx_dsp/sad.c +++ b/vpx_dsp/sad.c @@ -40,7 +40,7 @@ static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride, unsigned int vpx_sad##m##x##n##_avg_c( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, const uint8_t *second_pred) { \ - DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]); \ + DECLARE_ALIGNED(32, uint8_t, comp_pred[m * n]); \ vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \ return sad(src_ptr, src_stride, comp_pred, m, m, n); \ } \ diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c index ce1e8382b9..a6793efb68 100644 --- a/vpx_dsp/variance.c +++ b/vpx_dsp/variance.c @@ -156,7 +156,7 @@ static void var_filter_block2d_bil_second_pass( const uint8_t *second_pred) { \ uint16_t fdata3[(H + 1) * W]; \ uint8_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + DECLARE_ALIGNED(32, uint8_t, temp3[H * W]); \ \ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, H + 1, \ W, bilinear_filters[x_offset]); \ diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 67d3fb0e29..04969f37e1 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -424,6 +424,7 @@ DSP_SRCS-$(HAVE_LSX) += loongarch/avg_pred_lsx.c DSP_SRCS-$(HAVE_MMI) += mips/variance_mmi.c DSP_SRCS-$(HAVE_SSE2) += x86/avg_pred_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/avg_pred_avx2.c DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3 DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c DSP_SRCS-$(HAVE_VSX) += ppc/variance_vsx.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index cae4ca8116..f20f4e0454 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1321,7 +1321,7 @@ () specialize qw/vpx_get4x4sse_cs neon msa vsx/; add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; - specialize qw/vpx_comp_avg_pred neon sse2 vsx lsx/; + specialize qw/vpx_comp_avg_pred neon sse2 avx2 vsx lsx/; # # Subpixel Variance diff --git a/vpx_dsp/x86/avg_pred_avx2.c b/vpx_dsp/x86/avg_pred_avx2.c new file mode 100644 index 0000000000..f4357998c9 --- /dev/null +++ b/vpx_dsp/x86/avg_pred_avx2.c @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" + +void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + int row = 0; + // comp_pred and pred must be 32 byte aligned. + assert(((intptr_t)comp_pred % 32) == 0); + assert(((intptr_t)pred % 32) == 0); + + if (width == 8) { + assert(height % 4 == 0); + do { + const __m256i p = _mm256_load_si256((const __m256i *)pred); + const __m128i r_0 = _mm_loadl_epi64((const __m128i *)ref); + const __m128i r_1 = + _mm_loadl_epi64((const __m128i *)(ref + 2 * ref_stride)); + + const __m128i r1 = _mm_castps_si128(_mm_loadh_pi( + _mm_castsi128_ps(r_0), (const __m64 *)(ref + ref_stride))); + const __m128i r2 = _mm_castps_si128(_mm_loadh_pi( + _mm_castsi128_ps(r_1), (const __m64 *)(ref + 3 * ref_stride))); + + const __m256i ref_0123 = + _mm256_inserti128_si256(_mm256_castsi128_si256(r1), r2, 1); + const __m256i avg = _mm256_avg_epu8(p, ref_0123); + + _mm256_store_si256((__m256i *)comp_pred, avg); + + row += 4; + pred += 32; + comp_pred += 32; + ref += 4 * ref_stride; + } while (row < height); + } else if (width == 16) { + assert(height % 4 == 0); + do { + const __m256i pred_0 = _mm256_load_si256((const __m256i *)pred); + const __m256i pred_1 = _mm256_load_si256((const __m256i *)(pred + 32)); + const __m256i tmp0 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)ref)); + const __m256i ref_0 = _mm256_inserti128_si256( + tmp0, _mm_loadu_si128((const __m128i *)(ref + ref_stride)), 1); + const __m256i tmp1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(ref + 2 * ref_stride))); + const __m256i ref_1 = _mm256_inserti128_si256( + tmp1, _mm_loadu_si128((const __m128i *)(ref + 3 * ref_stride)), 1); + const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0); + const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1); + _mm256_store_si256((__m256i *)comp_pred, average_0); + _mm256_store_si256((__m256i *)(comp_pred + 32), average_1); + + row += 4; + pred += 64; + comp_pred += 64; + ref += 4 * ref_stride; + } while (row < height); + } else if (width == 32) { + assert(height % 2 == 0); + do { + const __m256i pred_0 = _mm256_load_si256((const __m256i *)pred); + const __m256i pred_1 = _mm256_load_si256((const __m256i *)(pred + 32)); + const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i ref_1 = + _mm256_loadu_si256((const __m256i *)(ref + ref_stride)); + const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0); + const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1); + _mm256_store_si256((__m256i *)comp_pred, average_0); + _mm256_store_si256((__m256i *)(comp_pred + 32), average_1); + + row += 2; + pred += 64; + comp_pred += 64; + ref += 2 * ref_stride; + } while (row < height); + } else if (width % 64 == 0) { + do { + int x; + for (x = 0; x < width; x += 64) { + const __m256i pred_0 = _mm256_load_si256((const __m256i *)(pred + x)); + const __m256i pred_1 = + _mm256_load_si256((const __m256i *)(pred + x + 32)); + const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)(ref + x)); + const __m256i ref_1 = + _mm256_loadu_si256((const __m256i *)(ref + x + 32)); + const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0); + const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1); + _mm256_store_si256((__m256i *)(comp_pred + x), average_0); + _mm256_store_si256((__m256i *)(comp_pred + x + 32), average_1); + } + row++; + pred += width; + comp_pred += width; + ref += ref_stride; + } while (row < height); + } else { + 
+    vpx_comp_avg_pred_sse2(comp_pred, pred, width, height, ref, ref_stride);
+  }
+}

From 9e0fc37f6f68685066f3e71e1cd0605d6ee2205e Mon Sep 17 00:00:00 2001
From: James Zern
Date: Mon, 17 Apr 2023 13:42:11 -0700
Subject: [PATCH 724/926] configure: add -Wshadow

libraries under third_party/ are out of scope for this change.

Bug: webm:1793
Change-Id: I562065a3c0ea9fdfc9615d1a6b1ae47da79b8ce0
---
 build/make/configure.sh | 1 +
 configure               | 6 +++++-
 examples.mk             | 1 +
 test/test.mk            | 1 +
 4 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/build/make/configure.sh b/build/make/configure.sh
index ec9af5e63d..6fd67f1623 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -521,6 +521,7 @@ AS_SFX = ${AS_SFX:-.asm}
 EXE_SFX = ${EXE_SFX}
 VCPROJ_SFX = ${VCPROJ_SFX}
 RTCD_OPTIONS = ${RTCD_OPTIONS}
+LIBWEBM_CXXFLAGS = ${LIBWEBM_CXXFLAGS}
 LIBYUV_CXXFLAGS = ${LIBYUV_CXXFLAGS}
 EOF

diff --git a/configure b/configure
index e4e6acd107..b73436b47e 100755
--- a/configure
+++ b/configure
@@ -649,6 +649,7 @@ process_toolchain() {
         check_add_cflags -Wimplicit-function-declaration
         check_add_cflags -Wmissing-declarations
         check_add_cflags -Wmissing-prototypes
+        check_add_cflags -Wshadow
         check_add_cflags -Wuninitialized
         check_add_cflags -Wunreachable-code-loop-increment
         check_add_cflags -Wunused
@@ -679,13 +680,16 @@ process_toolchain() {
         check_add_cxxflags -Wc++17-extensions
         check_add_cxxflags -Wc++20-extensions

-        # disable some warnings specific to libyuv.
+        # disable some warnings specific to libyuv / libwebm.
         check_cxxflags -Wno-missing-declarations \
           && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-declarations"
         check_cxxflags -Wno-missing-prototypes \
           && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-prototypes"
         check_cxxflags -Wno-pass-failed \
           && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-pass-failed"
+        check_cxxflags -Wno-shadow \
+          && LIBWEBM_CXXFLAGS="${LIBWEBM_CXXFLAGS} -Wno-shadow" \
+          && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-shadow"
         check_cxxflags -Wno-unused-parameter \
           && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-unused-parameter"
       fi

diff --git a/examples.mk b/examples.mk
index 42886f1e15..9e506dcd47 100644
--- a/examples.mk
+++ b/examples.mk
@@ -57,6 +57,7 @@ LIBWEBM_PARSER_SRCS = third_party/libwebm/mkvparser/mkvparser.cc \
 # Add compile flags and include path for libwebm sources.
 ifeq ($(CONFIG_WEBM_IO),yes)
   CXXFLAGS += -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS
+  $(BUILD_PFX)third_party/libwebm/%.cc.o: CXXFLAGS += $(LIBWEBM_CXXFLAGS)
   INC_PATH-yes += $(SRC_PATH_BARE)/third_party/libwebm
 endif

diff --git a/test/test.mk b/test/test.mk
index bbcdd0c6e4..b64e89bb43 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -85,6 +85,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../webmdec.cc
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../webmdec.h
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += webm_video_source.h
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_skip_loopfilter_test.cc
+$(BUILD_PFX)third_party/libwebm/%.cc.o: CXXFLAGS += $(LIBWEBM_CXXFLAGS)
 endif

 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += decode_api_test.cc

From 8ecf58432118b672fe3f4a54725bc63caac262aa Mon Sep 17 00:00:00 2001
From: Jonathan Wright
Date: Mon, 8 May 2023 17:41:26 +0100
Subject: [PATCH 725/926] Refactor standard bitdepth Neon convolution functions

1) Use #define constant instead of magic numbers for right shifts.
2) Move saturating narrow into helper functions that return 4-element
   result vectors.
3) Use mem_neon.h helpers for load/store sequences in Armv8.0 paths.
4) Tidy up: assert conditions and some longer variable names.
5) Prefer != 0 to > 0 where possible for loop termination conditions. Change-Id: Idfcac43ca38faf729dca07b8cc8f7f45ad264d24 --- vpx_dsp/arm/vpx_convolve8_neon.c | 786 +++++++++++++------------------ vpx_dsp/arm/vpx_convolve8_neon.h | 35 +- 2 files changed, 349 insertions(+), 472 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index b312cc747c..f217a3f35d 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -17,6 +17,7 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_filter.h" #include "vpx_ports/mem.h" // Note: @@ -64,8 +65,8 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); uint8x16_t s0, s1, s2, s3; - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(x_step_q4 == 16); (void)x_step_q4; @@ -75,22 +76,19 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3; if (w == 4) { - const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); do { - int32x4_t t0, t1, t2, t3; - int16x8_t t01, t23; + int16x4_t t0, t1, t2, t3; uint8x8_t d01, d23; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_usdot(s0, filters, permute_tbl); - t1 = convolve8_4_usdot(s1, filters, permute_tbl); - t2 = convolve8_4_usdot(s2, filters, permute_tbl); - t3 = convolve8_4_usdot(s3, filters, permute_tbl); - t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); - t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); - d01 = vqrshrun_n_s16(t01, 7); - d23 = vqrshrun_n_s16(t23, 7); + t0 = convolve8_4_usdot(s0, filters, perm_tbl); + t1 = convolve8_4_usdot(s1, filters, perm_tbl); + t2 = convolve8_4_usdot(s2, filters, perm_tbl); + t3 = convolve8_4_usdot(s3, filters, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); store_u8(dst + 0 * dst_stride, dst_stride, d01); store_u8(dst + 2 * dst_stride, dst_stride, d23); @@ -100,7 +98,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h > 0); } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); const uint8_t *s; uint8_t *d; int width; @@ -113,17 +111,17 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, do { load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_usdot(s0, filters, permute_tbl); - d1 = convolve8_8_usdot(s1, filters, permute_tbl); - d2 = convolve8_8_usdot(s2, filters, permute_tbl); - d3 = convolve8_8_usdot(s3, filters, permute_tbl); + d0 = convolve8_8_usdot(s0, filters, perm_tbl); + d1 = convolve8_8_usdot(s1, filters, perm_tbl); + d2 = convolve8_8_usdot(s2, filters, perm_tbl); + d3 = convolve8_8_usdot(s3, filters, perm_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; - } while (width > 0); + } while (width != 0); src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; @@ -139,8 +137,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); uint8x16_t s0, s1, s2, s3; - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + 
assert(dst_stride % 4 == 0); assert(x_step_q4 == 16); (void)x_step_q4; @@ -150,24 +148,19 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3; if (w == 4) { - const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); do { - int32x4_t t0, t1, t2, t3; - int16x8_t t01, t23; + int16x4_t t0, t1, t2, t3; uint8x8_t d01, d23, dd01, dd23; - dd01 = vdup_n_u8(0); - dd23 = vdup_n_u8(0); load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_usdot(s0, filters, permute_tbl); - t1 = convolve8_4_usdot(s1, filters, permute_tbl); - t2 = convolve8_4_usdot(s2, filters, permute_tbl); - t3 = convolve8_4_usdot(s3, filters, permute_tbl); - t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); - t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); - d01 = vqrshrun_n_s16(t01, 7); - d23 = vqrshrun_n_s16(t23, 7); + t0 = convolve8_4_usdot(s0, filters, perm_tbl); + t1 = convolve8_4_usdot(s1, filters, perm_tbl); + t2 = convolve8_4_usdot(s2, filters, perm_tbl); + t3 = convolve8_4_usdot(s3, filters, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); dd01 = load_u8(dst + 0 * dst_stride, dst_stride); dd23 = load_u8(dst + 2 * dst_stride, dst_stride); @@ -181,9 +174,9 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; - } while (h > 0); + } while (h != 0); } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); const uint8_t *s; uint8_t *d; int width; @@ -196,10 +189,10 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, do { load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_usdot(s0, filters, permute_tbl); - d1 = convolve8_8_usdot(s1, filters, permute_tbl); - d2 = convolve8_8_usdot(s2, filters, permute_tbl); - d3 = convolve8_8_usdot(s3, filters, permute_tbl); + d0 = convolve8_8_usdot(s0, filters, perm_tbl); + d1 = convolve8_8_usdot(s1, filters, perm_tbl); + d2 = convolve8_8_usdot(s2, filters, perm_tbl); + d3 = convolve8_8_usdot(s3, filters, perm_tbl); load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); @@ -213,11 +206,11 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s += 8; d += 8; width -= 8; - } while (width > 0); + } while (width != 0); src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; - } while (h > 0); + } while (h != 0); } } @@ -275,8 +268,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; uint8x16x2_t samples_LUT; - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(y_step_q4 == 16); (void)x0_q4; @@ -288,7 +281,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, if (w == 4) { const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int32x4_t d0, d1, d2, d3; + int16x4_t d0, d1, d2, d3; uint8x8_t d01, d23; load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); @@ -325,8 +318,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, d1 = convolve8_4_usdot_partial(s1234, s5678, filters); d2 = convolve8_4_usdot_partial(s2345, s6789, filters); d3 = convolve8_4_usdot_partial(s3456, 
s78910, filters); - d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); - d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8(dst + 0 * dst_stride, dst_stride, d01); store_u8(dst + 2 * dst_stride, dst_stride, d23); @@ -341,7 +334,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; - } while (h > 0); + } while (h != 0); } else { const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, @@ -426,11 +419,11 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s += 4 * src_stride; d += 4 * dst_stride; height -= 4; - } while (height > 0); + } while (height != 0); src += 8; dst += 8; w -= 8; - } while (w > 0); + } while (w != 0); } } @@ -444,8 +437,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; uint8x16x2_t samples_LUT; - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(y_step_q4 == 16); (void)x0_q4; @@ -457,7 +450,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, if (w == 4) { const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int32x4_t d0, d1, d2, d3; + int16x4_t d0, d1, d2, d3; uint8x8_t d01, d23, dd01, dd23; load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); @@ -494,8 +487,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, d1 = convolve8_4_usdot_partial(s1234, s5678, filters); d2 = convolve8_4_usdot_partial(s2345, s6789, filters); d3 = convolve8_4_usdot_partial(s3456, s78910, filters); - d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); - d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); dd01 = load_u8(dst + 0 * dst_stride, dst_stride); dd23 = load_u8(dst + 2 * dst_stride, dst_stride); @@ -516,7 +509,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; - } while (h > 0); + } while (h != 0); } else { const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, @@ -608,11 +601,11 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s += 4 * src_stride; d += 4 * dst_stride; height -= 4; - } while (height > 0); + } while (height != 0); src += 8; dst += 8; w -= 8; - } while (w > 0); + } while (w != 0); } } @@ -629,8 +622,8 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, const uint8x16_t range_limit = vdupq_n_u8(128); uint8x16_t s0, s1, s2, s3; - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(x_step_q4 == 16); (void)x_step_q4; @@ -640,22 +633,19 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3; if (w == 4) { - const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + const uint8x16x2_t perm_tbl = 
vld1q_u8_x2(dot_prod_permute_tbl); do { - int32x4_t t0, t1, t2, t3; - int16x8_t t01, t23; + int16x4_t t0, t1, t2, t3; uint8x8_t d01, d23; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl); - t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl); - t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl); - t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl); - t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); - t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); - d01 = vqrshrun_n_s16(t01, 7); - d23 = vqrshrun_n_s16(t23, 7); + t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); + t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); + t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); + t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); store_u8(dst + 0 * dst_stride, dst_stride, d01); store_u8(dst + 2 * dst_stride, dst_stride, d23); @@ -665,7 +655,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h > 0); } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); const uint8_t *s; uint8_t *d; int width; @@ -678,21 +668,17 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, do { load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = - convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl); - d1 = - convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl); - d2 = - convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl); - d3 = - convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl); + d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); + d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; - } while (width > 0); + } while (width != 0); src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; @@ -711,8 +697,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, const uint8x16_t range_limit = vdupq_n_u8(128); uint8x16_t s0, s1, s2, s3; - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(x_step_q4 == 16); (void)x_step_q4; @@ -722,24 +708,19 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3; if (w == 4) { - const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); do { - int32x4_t t0, t1, t2, t3; - int16x8_t t01, t23; + int16x4_t t0, t1, t2, t3; uint8x8_t d01, d23, dd01, dd23; - dd01 = vdup_n_u8(0); - dd23 = vdup_n_u8(0); load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl); - t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl); - t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl); - t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl); - t01 = vcombine_s16(vqmovn_s32(t0), 
vqmovn_s32(t1)); - t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); - d01 = vqrshrun_n_s16(t01, 7); - d23 = vqrshrun_n_s16(t23, 7); + t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); + t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); + t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); + t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); dd01 = load_u8(dst + 0 * dst_stride, dst_stride); dd23 = load_u8(dst + 2 * dst_stride, dst_stride); @@ -753,9 +734,9 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; - } while (h > 0); + } while (h != 0); } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); const uint8_t *s; uint8_t *d; int width; @@ -768,14 +749,10 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, do { load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = - convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl); - d1 = - convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl); - d2 = - convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl); - d3 = - convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl); + d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); + d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); @@ -789,11 +766,11 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s += 8; d += 8; width -= 8; - } while (width > 0); + } while (width != 0); src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; - } while (h > 0); + } while (h != 0); } } @@ -854,8 +831,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; int8x16x2_t samples_LUT; - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(y_step_q4 == 16); (void)x0_q4; @@ -867,7 +844,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, if (w == 4) { const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int32x4_t d0, d1, d2, d3; + int16x4_t d0, d1, d2, d3; uint8x8_t d01, d23; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); @@ -919,8 +896,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); - d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); - d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8(dst + 0 * dst_stride, dst_stride, d01); store_u8(dst + 2 * dst_stride, dst_stride, d23); @@ -935,7 +912,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t 
src_stride, src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; - } while (h > 0); + } while (h != 0); } else { const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, @@ -1035,11 +1012,11 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s += 4 * src_stride; d += 4 * dst_stride; height -= 4; - } while (height > 0); + } while (height != 0); src += 8; dst += 8; w -= 8; - } while (w > 0); + } while (w != 0); } } @@ -1057,8 +1034,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; int8x16x2_t samples_LUT; - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(y_step_q4 == 16); (void)x0_q4; @@ -1070,7 +1047,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, if (w == 4) { const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int32x4_t d0, d1, d2, d3; + int16x4_t d0, d1, d2, d3; uint8x8_t d01, d23, dd01, dd23; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); @@ -1122,8 +1099,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); - d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); - d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); dd01 = load_u8(dst + 0 * dst_stride, dst_stride); dd23 = load_u8(dst + 2 * dst_stride, dst_stride); @@ -1144,7 +1121,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; - } while (h > 0); + } while (h != 0); } else { const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, @@ -1251,11 +1228,11 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s += 4 * src_stride; d += 4 * dst_stride; height -= 4; - } while (height > 0); + } while (height != 0); src += 8; dst += 8; w -= 8; - } while (w > 0); + } while (w != 0); } } @@ -1273,8 +1250,8 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, const int16x8_t filters = vld1q_s16(filter[x0_q4]); uint8x8_t t0, t1, t2, t3; - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(x_step_q4 == 16); (void)x_step_q4; @@ -1286,25 +1263,22 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, if (h == 4) { uint8x8_t d01, d23; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - int16x8_t tt0, tt1, tt2, tt3; __builtin_prefetch(src + 0 * src_stride); __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); - tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - tt3 = 
vreinterpretq_s16_u16(vmovl_u8(t3)); - s0 = vget_low_s16(tt0); - s1 = vget_low_s16(tt1); - s2 = vget_low_s16(tt2); - s3 = vget_low_s16(tt3); - s4 = vget_high_s16(tt0); - s5 = vget_high_s16(tt1); - s6 = vget_high_s16(tt2); + s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + __builtin_prefetch(dst + 0 * dst_stride); __builtin_prefetch(dst + 1 * dst_stride); __builtin_prefetch(dst + 2 * dst_stride); @@ -1314,32 +1288,22 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, do { load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); - tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s7 = vget_low_s16(tt0); - s8 = vget_low_s16(tt1); - s9 = vget_low_s16(tt2); - s10 = vget_low_s16(tt3); + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); transpose_u8_4x4(&d01, &d23); - vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride), - vreinterpret_u32_u8(d01), 0); - vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride), - vreinterpret_u32_u8(d23), 0); - vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride), - vreinterpret_u32_u8(d01), 1); - vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride), - vreinterpret_u32_u8(d23), 1); + store_u8(dst + 0 * dst_stride, 2 * dst_stride, d01); + store_u8(dst + 1 * dst_stride, 2 * dst_stride, d23); s0 = s4; s1 = s5; @@ -1355,7 +1319,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } else { int width; const uint8_t *s; - uint8x8_t t4, t5, t6, t7; + uint8x8_t t4, t5, t6, t7, d04, d15, d26, d37; int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; if (w == 4) { @@ -1395,32 +1359,24 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); __builtin_prefetch(src + 7 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); - - transpose_u8_8x4(&t0, &t1, &t2, &t3); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 0); - dst += dst_stride; - 
vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 1); - dst += dst_stride; + d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + transpose_u8_8x4(&d04, &d15, &d26, &d37); + + store_u8(dst + 0 * dst_stride, 4 * dst_stride, d04); + store_u8(dst + 1 * dst_stride, 4 * dst_stride, d15); + store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26); + store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37); + + dst += 8 * dst_stride; h -= 8; } while (h > 0); } else { uint8_t *d; + uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; int16x8_t s11, s12, s13, s14; do { @@ -1466,17 +1422,18 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); - t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); - t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); - t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); - t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); + d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); + d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); + d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); - transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7); + transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); s0 = s8; s1 = s9; @@ -1505,8 +1462,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, const int16x8_t filters = vld1q_s16(filter[x0_q4]); uint8x8_t t0, t1, t2, t3; - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(x_step_q4 == 16); (void)x_step_q4; @@ -1516,10 +1473,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3; if (h == 4) { - uint8x8_t d01, d23; + uint8x8_t d01, d23, dd01, dd23; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - int16x8_t tt0, tt1, tt2, tt3; - uint32x4_t d0123 = vdupq_n_u32(0); __builtin_prefetch(src + 0 * src_stride); __builtin_prefetch(src + 1 * src_stride); @@ -1527,17 +1482,14 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 3 * src_stride); load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); - tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - tt2 = 
vreinterpretq_s16_u16(vmovl_u8(t2)); - tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s0 = vget_low_s16(tt0); - s1 = vget_low_s16(tt1); - s2 = vget_low_s16(tt2); - s3 = vget_low_s16(tt3); - s4 = vget_high_s16(tt0); - s5 = vget_high_s16(tt1); - s6 = vget_high_s16(tt2); + s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + __builtin_prefetch(dst + 0 * dst_stride); __builtin_prefetch(dst + 1 * dst_stride); __builtin_prefetch(dst + 2 * dst_stride); @@ -1547,35 +1499,28 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, do { load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); - tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s7 = vget_low_s16(tt0); - s8 = vget_low_s16(tt1); - s9 = vget_low_s16(tt2); - s10 = vget_low_s16(tt3); + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); transpose_u8_4x4(&d01, &d23); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3); - d0123 = vreinterpretq_u32_u8( - vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23))); + dd01 = load_u8(dst + 0 * dst_stride, 2 * dst_stride); + dd23 = load_u8(dst + 1 * dst_stride, 2 * dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); - vst1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0); - vst1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2); - vst1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1); - vst1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3); + store_u8(dst + 0 * dst_stride, 2 * dst_stride, d01); + store_u8(dst + 1 * dst_stride, 2 * dst_stride, d23); s0 = s4; s1 = s5; @@ -1595,8 +1540,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; if (w == 4) { - uint32x4_t d0415 = vdupq_n_u32(0); - uint32x4_t d2637 = vdupq_n_u32(0); + uint8x8_t d04, d15, d26, d37, dd04, dd15, dd26, dd37; + do { load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); @@ -1633,48 +1578,35 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * 
src_stride); __builtin_prefetch(src + 7 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); - - transpose_u8_8x4(&t0, &t1, &t2, &t3); - - d0415 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0415, 0); - d0415 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0415, 2); - d2637 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d2637, 0); - d2637 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d2637, 2); - d0415 = vld1q_lane_u32((uint32_t *)(dst + 4 * dst_stride), d0415, 1); - d0415 = vld1q_lane_u32((uint32_t *)(dst + 5 * dst_stride), d0415, 3); - d2637 = vld1q_lane_u32((uint32_t *)(dst + 6 * dst_stride), d2637, 1); - d2637 = vld1q_lane_u32((uint32_t *)(dst + 7 * dst_stride), d2637, 3); - d0415 = vreinterpretq_u32_u8( - vrhaddq_u8(vreinterpretq_u8_u32(d0415), vcombine_u8(t0, t1))); - d2637 = vreinterpretq_u32_u8( - vrhaddq_u8(vreinterpretq_u8_u32(d2637), vcombine_u8(t2, t3))); - - vst1q_lane_u32((uint32_t *)dst, d0415, 0); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d0415, 2); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d2637, 0); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d2637, 2); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d0415, 1); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d0415, 3); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d2637, 1); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d2637, 3); - dst += dst_stride; + d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + transpose_u8_8x4(&d04, &d15, &d26, &d37); + + dd04 = load_u8(dst + 0 * dst_stride, 4 * dst_stride); + dd15 = load_u8(dst + 1 * dst_stride, 4 * dst_stride); + dd26 = load_u8(dst + 2 * dst_stride, 4 * dst_stride); + dd37 = load_u8(dst + 3 * dst_stride, 4 * dst_stride); + + d04 = vrhadd_u8(d04, dd04); + d15 = vrhadd_u8(d15, dd15); + d26 = vrhadd_u8(d26, dd26); + d37 = vrhadd_u8(d37, dd37); + + store_u8(dst + 0 * dst_stride, 4 * dst_stride, d04); + store_u8(dst + 1 * dst_stride, 4 * dst_stride, d15); + store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26); + store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37); + + dst += 8 * dst_stride; h -= 8; - } while (h > 0); + } while (h != 0); } else { uint8_t *d; + uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; int16x8_t s11, s12, s13, s14; - uint8x16_t d01, d23, d45, d67; do { __builtin_prefetch(src + 0 * src_stride); @@ -1719,33 +1651,27 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); - t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); - t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); - t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); - t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, 
s8, filters); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); + d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); + d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); + d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); - transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); - d01 = vcombine_u8(vld1_u8(d + 0 * dst_stride), - vld1_u8(d + 1 * dst_stride)); - d23 = vcombine_u8(vld1_u8(d + 2 * dst_stride), - vld1_u8(d + 3 * dst_stride)); - d45 = vcombine_u8(vld1_u8(d + 4 * dst_stride), - vld1_u8(d + 5 * dst_stride)); - d67 = vcombine_u8(vld1_u8(d + 6 * dst_stride), - vld1_u8(d + 7 * dst_stride)); - d01 = vrhaddq_u8(d01, vcombine_u8(t0, t1)); - d23 = vrhaddq_u8(d23, vcombine_u8(t2, t3)); - d45 = vrhaddq_u8(d45, vcombine_u8(t4, t5)); - d67 = vrhaddq_u8(d67, vcombine_u8(t6, t7)); - - store_u8_8x8(d, dst_stride, vget_low_u8(d01), vget_high_u8(d01), - vget_low_u8(d23), vget_high_u8(d23), vget_low_u8(d45), - vget_high_u8(d45), vget_low_u8(d67), vget_high_u8(d67)); + d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride)); + d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride)); + d2 = vrhadd_u8(d2, vld1_u8(d + 2 * dst_stride)); + d3 = vrhadd_u8(d3, vld1_u8(d + 3 * dst_stride)); + d4 = vrhadd_u8(d4, vld1_u8(d + 4 * dst_stride)); + d5 = vrhadd_u8(d5, vld1_u8(d + 5 * dst_stride)); + d6 = vrhadd_u8(d6, vld1_u8(d + 6 * dst_stride)); + d7 = vrhadd_u8(d7, vld1_u8(d + 7 * dst_stride)); + + store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); s0 = s8; s1 = s9; @@ -1761,7 +1687,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src += 8 * src_stride; dst += 8 * dst_stride; h -= 8; - } while (h > 0); + } while (h != 0); } } } @@ -1773,8 +1699,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, int h) { const int16x8_t filters = vld1q_s16(filter[y0_q4]); - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(y_step_q4 == 16); (void)x0_q4; @@ -1784,33 +1710,26 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3 * src_stride; if (w == 4) { - uint8x8_t d01, d23; + uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); + s6 = 
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); + + src += 7 * src_stride; do { - s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); __builtin_prefetch(dst + 0 * dst_stride); __builtin_prefetch(dst + 1 * dst_stride); @@ -1820,21 +1739,16 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); - dst += dst_stride; + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; @@ -1843,13 +1757,15 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s4 = s8; s5 = s9; s6 = s10; + src += 4 * src_stride; + dst += 4 * dst_stride; h -= 4; } while (h != 0); } else { int height; const uint8_t *s; uint8_t *d; - uint8x8_t t0, t1, t2, t3; + uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3; int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; do { @@ -1860,33 +1776,26 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 4 * src_stride); __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); - s = src; - s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; + + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + s = src + 7 * src_stride; d = dst; height = h; do { - s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += 
src_stride; - s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); __builtin_prefetch(d + 0 * dst_stride); __builtin_prefetch(d + 1 * dst_stride); @@ -1896,19 +1805,13 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); - - vst1_u8(d, t0); - d += dst_stride; - vst1_u8(d, t1); - d += dst_stride; - vst1_u8(d, t2); - d += dst_stride; - vst1_u8(d, t3); - d += dst_stride; + + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; @@ -1917,6 +1820,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s4 = s8; s5 = s9; s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; height -= 4; } while (height != 0); src += 8; @@ -1933,8 +1838,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, int h) { const int16x8_t filters = vld1q_s16(filter[y0_q4]); - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(y_step_q4 == 16); (void)x0_q4; @@ -1944,34 +1849,26 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3 * src_stride; if (w == 4) { - uint8x8_t d01, d23; + uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23, dd01, dd23; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - uint32x4_t d0123 = vdupq_n_u32(0); - - s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; + + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); + s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); + + src += 7 * src_stride; do { - s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s8 = 
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); __builtin_prefetch(dst + 0 * dst_stride); __builtin_prefetch(dst + 1 * dst_stride); @@ -1981,29 +1878,22 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); - - d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 1); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 2); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3); - d0123 = vreinterpretq_u32_u8( - vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23))); - - vst1q_lane_u32((uint32_t *)dst, d0123, 0); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d0123, 1); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d0123, 2); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d0123, 3); - dst += dst_stride; + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; @@ -2012,14 +1902,15 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s4 = s8; s5 = s9; s6 = s10; + src += 4 * src_stride; + dst += 4 * dst_stride; h -= 4; } while (h != 0); } else { int height; const uint8_t *s; uint8_t *d; - uint8x8_t t0, t1, t2, t3; - uint8x16_t d01, d23, dd01, dd23; + uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3; int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; do { @@ -2030,33 +1921,26 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 4 * src_stride); __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); - s = src; - s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; + + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = 
vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + s = src + 7 * src_stride; d = dst; height = h; do { - s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); __builtin_prefetch(d + 0 * dst_stride); __builtin_prefetch(d + 1 * dst_stride); @@ -2066,28 +1950,18 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); - - d01 = vcombine_u8(t0, t1); - d23 = vcombine_u8(t2, t3); - dd01 = vcombine_u8(vld1_u8(d + 0 * dst_stride), - vld1_u8(d + 1 * dst_stride)); - dd23 = vcombine_u8(vld1_u8(d + 2 * dst_stride), - vld1_u8(d + 3 * dst_stride)); - dd01 = vrhaddq_u8(dd01, d01); - dd23 = vrhaddq_u8(dd23, d23); - - vst1_u8(d, vget_low_u8(dd01)); - d += dst_stride; - vst1_u8(d, vget_high_u8(dd01)); - d += dst_stride; - vst1_u8(d, vget_low_u8(dd23)); - d += dst_stride; - vst1_u8(d, vget_high_u8(dd23)); - d += dst_stride; + + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride)); + d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride)); + d2 = vrhadd_u8(d2, vld1_u8(d + 2 * dst_stride)); + d3 = vrhadd_u8(d3, vld1_u8(d + 3 * dst_stride)); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; @@ -2097,6 +1971,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s5 = s9; s6 = s10; height -= 4; + s += 4 * src_stride; + d += 4 * dst_stride; } while (height != 0); src += 8; dst += 8; diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index 07cf8242d3..c838d40470 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -15,10 +15,11 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/vpx_filter.h" #if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD) -static INLINE int32x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, +static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, const int8x16_t samples_hi, const int32x4_t correction, const int8x8_t filters) { @@ -29,11 +30,11 @@ static INLINE int32x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, sum = vdotq_lane_s32(correction, samples_lo, filters, 0); sum = vdotq_lane_s32(sum, samples_hi, filters, 1); - /* Narrowing and packing is performed by the caller. */ - return sum; + /* Further narrowing and packing is performed by the caller. 
*/ + return vqmovn_s32(sum); } -static INLINE int32x4_t convolve8_4_sdot(uint8x16_t samples, +static INLINE int16x4_t convolve8_4_sdot(uint8x16_t samples, const int8x8_t filters, const int32x4_t correction, const uint8x16_t range_limit, @@ -54,8 +55,8 @@ static INLINE int32x4_t convolve8_4_sdot(uint8x16_t samples, sum = vdotq_lane_s32(correction, permuted_samples[0], filters, 0); sum = vdotq_lane_s32(sum, permuted_samples[1], filters, 1); - /* Narrowing and packing is performed by the caller. */ - return sum; + /* Further narrowing and packing is performed by the caller. */ + return vqmovn_s32(sum); } static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo, @@ -78,7 +79,7 @@ static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo, /* Narrow and re-pack. */ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); - return vqrshrun_n_s16(sum, 7); + return vqrshrun_n_s16(sum, FILTER_BITS); } static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples, @@ -111,14 +112,14 @@ static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples, /* Narrow and re-pack. */ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); - return vqrshrun_n_s16(sum, 7); + return vqrshrun_n_s16(sum, FILTER_BITS); } #endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD) #if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8) -static INLINE int32x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, +static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, const uint8x16_t samples_hi, const int8x8_t filters) { /* Sample permutation is performed by the caller. */ @@ -127,11 +128,11 @@ static INLINE int32x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0); sum = vusdotq_lane_s32(sum, samples_hi, filters, 1); - /* Narrowing and packing is performed by the caller. */ - return sum; + /* Further narrowing and packing is performed by the caller. */ + return vqmovn_s32(sum); } -static INLINE int32x4_t convolve8_4_usdot(uint8x16_t samples, +static INLINE int16x4_t convolve8_4_usdot(uint8x16_t samples, const int8x8_t filters, const uint8x16x2_t permute_tbl) { uint8x16_t permuted_samples[2]; @@ -147,8 +148,8 @@ static INLINE int32x4_t convolve8_4_usdot(uint8x16_t samples, sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1); - /* Narrowing and packing is performed by the caller. */ - return sum; + /* Further narrowing and packing is performed by the caller. */ + return vqmovn_s32(sum); } static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo, @@ -169,7 +170,7 @@ static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo, /* Narrow and re-pack. */ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); - return vqrshrun_n_s16(sum, 7); + return vqrshrun_n_s16(sum, FILTER_BITS); } static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples, @@ -196,7 +197,7 @@ static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples, /* Narrow and re-pack. 
*/ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); - return vqrshrun_n_s16(sum, 7); + return vqrshrun_n_s16(sum, FILTER_BITS); } #endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8) @@ -238,7 +239,7 @@ static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, sum = vmlaq_lane_s16(sum, s7, filters_hi, 3); sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filters_lo, 3)); sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filters_hi, 0)); - return vqrshrun_n_s16(sum, 7); + return vqrshrun_n_s16(sum, FILTER_BITS); } static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, From 2a9b810d3df62ff3c527ce3895f6b80d9d6f6296 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 8 May 2023 16:58:59 -0700 Subject: [PATCH 726/926] Don't use -Wl,-z,defs with Clang's sanitizers This avoids link errors related to the sanitizers: https://clang.llvm.org/docs/AddressSanitizer.html#usage "When linking shared libraries, the AddressSanitizer run-time is not linked, so -Wl,-z,defs may cause link errors ..." See also: https://crbug.com/aomedia/3438 Bug: webm:1801 Fixed: webm:1801 Change-Id: Ie212318005a5f7222e5486775175534025306367 --- build/make/Makefile | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/build/make/Makefile b/build/make/Makefile index 5c38c18e57..65ac2290c7 100644 --- a/build/make/Makefile +++ b/build/make/Makefile @@ -304,6 +304,19 @@ $(1): $(qexec)$$(AR) $$(ARFLAGS) $$@ $$^ endef +# Don't use -Wl,-z,defs with Clang's sanitizers. +# +# Clang's AddressSanitizer documentation says "When linking shared libraries, +# the AddressSanitizer run-time is not linked, so -Wl,-z,defs may cause link +# errors (don't use it with AddressSanitizer)." See +# https://clang.llvm.org/docs/AddressSanitizer.html#usage. +NO_UNDEFINED := -Wl,-z,defs +ifeq ($(findstring clang,$(CC)),clang) + ifneq ($(filter -fsanitize=%,$(LDFLAGS)),) + NO_UNDEFINED := + endif +endif + define so_template # Not using a pattern rule here because we don't want to generate empty # archives when they are listed as a dependency in files not responsible @@ -313,7 +326,8 @@ define so_template $(1): $(if $(quiet),@echo " [LD] $$@") $(qexec)$$(LD) -shared $$(LDFLAGS) \ - -Wl,--no-undefined -Wl,-soname,$$(SONAME) \ + $(NO_UNDEFINED) \ + -Wl,-soname,$$(SONAME) \ -Wl,--version-script,$$(EXPORTS_FILE) -o $$@ \ $$(filter %.o,$$^) $$(extralibs) endef From 3e1e38d1176c34f71a87f8402c07cdcc2e20083e Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 4 May 2023 16:33:38 +0100 Subject: [PATCH 727/926] Add 2D-specific Neon horizontal convolution functions 2D 8-tap convolution filtering is performed in two passes - horizontal and vertical. The horizontal pass must produce enough input data for the subsequent vertical pass - 3 rows above and 4 rows below, in addition to the actual block height. At present, all Neon horizontal convolution algorithms process 4 rows at a time, but this means we end up doing at least 1 row too much work in the 2D first pass case where we need h + 7, not h + 8 rows of output. This patch adds additional dot-product (SDOT and USDOT) Neon paths that process h + 7 rows of data exactly, saving the work of the unnecessary extra row. It is impractical to take a similar approach for the Armv8.0 MLA paths since we have to transpose the data block both before and after calling the convolution helper functions. vpx_convolve_neon performance impact: we observe a speedup of ~9% for smaller (and wider) blocks, and a speedup of 0-3% for larger blocks. 
This is to be expected since the proportion of redundant work decreases as the block height increases. Change-Id: Ie77ad1848707d2d48bb8851345a469aae9d097e1 --- vpx_dsp/arm/mem_neon.h | 20 +++ vpx_dsp/arm/vpx_convolve8_neon.c | 221 ++++++++++++++++++++++++++++++- vpx_dsp/arm/vpx_convolve8_neon.h | 9 ++ vpx_dsp/arm/vpx_convolve_neon.c | 55 ++++++++ 4 files changed, 301 insertions(+), 4 deletions(-) diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index 1a20da70ef..586bfb85af 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -263,6 +263,16 @@ static INLINE void store_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) { vst1_lane_u32((uint32_t *)buf, a_u32, 1); } +static INLINE void store_u8_8x3(uint8_t *s, const ptrdiff_t p, + const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); +} + static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p, uint8x8_t *const s0, uint8x8_t *const s1, uint8x8_t *const s2, uint8x8_t *const s3) { @@ -287,6 +297,16 @@ static INLINE void store_u8_8x4(uint8_t *s, const ptrdiff_t p, vst1_u8(s, s3); } +static INLINE void load_u8_16x3(const uint8_t *s, const ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); +} + static INLINE void load_u8_16x4(const uint8_t *s, const ptrdiff_t p, uint8x16_t *const s0, uint8x16_t *const s1, uint8x16_t *const s2, uint8x16_t *const s3) { diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index f217a3f35d..505d0672f0 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -57,6 +57,111 @@ DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { #if defined(__ARM_FEATURE_MATMUL_INT8) +void vpx_convolve8_2d_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + assert(h % 4 == 3); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + do { + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_4_usdot(s0, filters, perm_tbl); + d1 = convolve8_4_usdot(s1, filters, perm_tbl); + d2 = convolve8_4_usdot(s2, filters, perm_tbl); + d3 = convolve8_4_usdot(s3, filters, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. 
*/ + load_u8_16x3(src, src_stride, &s0, &s1, &s2); + + d0 = convolve8_4_usdot(s0, filters, perm_tbl); + d1 = convolve8_4_usdot(s1, filters, perm_tbl); + d2 = convolve8_4_usdot(s2, filters, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8_4x1(dst + 2 * dst_stride, d23); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_usdot(s0, filters, perm_tbl); + d1 = convolve8_8_usdot(s1, filters, perm_tbl); + d2 = convolve8_8_usdot(s2, filters, perm_tbl); + d3 = convolve8_8_usdot(s3, filters, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + width = w; + s = src; + d = dst; + do { + load_u8_16x3(s, src_stride, &s0, &s1, &s2); + + d0 = convolve8_8_usdot(s0, filters, perm_tbl); + d1 = convolve8_8_usdot(s1, filters, perm_tbl); + d2 = convolve8_8_usdot(s2, filters, perm_tbl); + + store_u8_8x3(d, dst_stride, d0, d1, d2); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + } +} + void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -96,7 +201,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; - } while (h > 0); + } while (h != 0); } else { const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); const uint8_t *s; @@ -125,7 +230,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; - } while (h > 0); + } while (h != 0); } } @@ -611,6 +716,114 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, #else // !defined(__ARM_FEATURE_MATMUL_INT8) +void vpx_convolve8_2d_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x16_t range_limit = vdupq_n_u8(128); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + assert(h % 4 == 3); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + do { + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); + d3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + 
d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + load_u8_16x3(src, src_stride, &s0, &s1, &s2); + + d0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8_4x1(dst + 2 * dst_stride, d23); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); + d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + width = w; + s = src; + d = dst; + do { + load_u8_16x3(s, src_stride, &s0, &s1, &s2); + + d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); + + store_u8_8x3(d, dst_stride, d0, d1, d2); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + } +} + void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -653,7 +866,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; - } while (h > 0); + } while (h != 0); } else { const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); const uint8_t *s; @@ -682,7 +895,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; - } while (h > 0); + } while (h != 0); } } diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index c838d40470..2f78583af3 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -17,6 +17,15 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/vpx_filter.h" +#if VPX_ARCH_AARCH64 && \ + (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)) +void vpx_convolve8_2d_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h); +#endif + #if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD) static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, diff --git a/vpx_dsp/arm/vpx_convolve_neon.c 
b/vpx_dsp/arm/vpx_convolve_neon.c index 830f3176d7..f7db3e6a9c 100644
--- a/vpx_dsp/arm/vpx_convolve_neon.c
+++ b/vpx_dsp/arm/vpx_convolve_neon.c
@@ -14,6 +14,57 @@
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_ports/mem.h"
 
+#if VPX_ARCH_AARCH64 && \
+    (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8))
+#include "vpx_dsp/arm/vpx_convolve8_neon.h"
+
+void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                        ptrdiff_t dst_stride, const InterpKernel *filter,
+                        int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+                        int w, int h) {
+  /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
+   * maximum buffer size to 64 * (64 + 7). */
+  uint8_t temp[64 * 71];
+
+  /* Account for the vertical phase needing 3 lines prior and 4 lines post. */
+  const int intermediate_height = h + 7;
+
+  assert(y_step_q4 == 16);
+  assert(x_step_q4 == 16);
+
+  /* Filter starting 3 lines back. */
+  vpx_convolve8_2d_horiz_neon(src - src_stride * 3, src_stride, temp, w,
+                              filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
+                              intermediate_height);
+
+  /* Step into the temp buffer 3 lines to get the actual frame data */
+  vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4,
+                          x_step_q4, y0_q4, y_step_q4, w, h);
+}
+
+void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const InterpKernel *filter, int x0_q4,
+                            int x_step_q4, int y0_q4, int y_step_q4, int w,
+                            int h) {
+  uint8_t temp[64 * 71];
+  const int intermediate_height = h + 7;
+
+  assert(y_step_q4 == 16);
+  assert(x_step_q4 == 16);
+
+  vpx_convolve8_2d_horiz_neon(src - src_stride * 3, src_stride, temp, w,
+                              filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
+                              intermediate_height);
+
+  vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4,
+                              x_step_q4, y0_q4, y_step_q4, w, h);
+}
+
+#else  // !(VPX_ARCH_AARCH64 &&
+       //   (defined(__ARM_FEATURE_DOTPROD) ||
+       //    defined(__ARM_FEATURE_MATMUL_INT8)))
+
 void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const InterpKernel *filter,
                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
@@ -63,3 +114,7 @@ void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
   vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4,
                               x_step_q4, y0_q4, y_step_q4, w, h);
 }
+
+#endif  // #if VPX_ARCH_AARCH64 &&
+        // (defined(__ARM_FEATURE_DOTPROD) ||
+        // defined(__ARM_FEATURE_MATMUL_INT8))

From e6b9a8d667bb43c58437bb1d6204ffc8047252ac Mon Sep 17 00:00:00 2001
From: Anupam Pandey
Date: Fri, 12 May 2023 10:56:45 +0530
Subject: [PATCH 728/926] Improve convolve AVX2 intrinsic for speed

This CL refactors the code related to the convolve functions.
Furthermore, it improves the AVX2 intrinsics that compute the vertical
convolution for the w = 4 case and the horizontal convolution for the
w = 16 case.

Please note the module-level scaling w.r.t. the C functions (timer
based) for the existing and new AVX2 intrinsics:

    Block        Scaling
     size    AVX2 (existing)  AVX2 (new)
     4x4         5.34x          5.91x
     4x8         7.10x          7.79x
     16x8        23.52x         25.63x
     16x16       29.47x         30.22x
     16x32       33.42x         33.44x

This is a bit-exact change.
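For illustration, the two-row load that the refactor funnels through the
mm256_loadu2_si128 helper (used throughout the diff below) reduces to the
following standalone sketch, built on plain AVX2 intrinsics from
immintrin.h. The names load2_rows and avg2_rows are illustrative only,
not libvpx API.

#include <immintrin.h>
#include <stdint.h>

/* Pack two unaligned 128-bit rows into one 256-bit register:
 * low lane <- row0, high lane <- row1. */
static __m256i load2_rows(const uint8_t *row0, const uint8_t *row1) {
  __m256i v = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)row0));
  return _mm256_inserti128_si256(v, _mm_loadu_si128((const __m128i *)row1), 1);
}

/* Round-to-nearest average against two destination rows at once, the
 * pattern the avg path uses after convolving. */
static __m256i avg2_rows(__m256i a, const uint8_t *row0, const uint8_t *row1) {
  return _mm256_avg_epu8(a, load2_rows(row0, row1));
}

Keeping both rows in one 256-bit register is what lets the refactor drop
the separate 128-bit loads, stores, and averages for each row.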
Change-Id: If130183bc12faab9ca2bcec0ceeaa8d0af05e413 --- vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 226 +++++++--------------- 1 file changed, 71 insertions(+), 155 deletions(-) diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 2498bba173..526c283823 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -46,7 +46,7 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[64]) = { }; #define CALC_CONVOLVE8_HORZ_ROW \ - srcReg = xx_loadu2_mi128(src_ptr - 3 + src_pitch, src_ptr - 3); \ + srcReg = mm256_loadu2_si128(src_ptr - 3, src_ptr - 3 + src_pitch); \ s1[0] = _mm256_shuffle_epi8(srcReg, filt[0]); \ s1[1] = _mm256_shuffle_epi8(srcReg, filt[1]); \ s1[2] = _mm256_shuffle_epi8(srcReg, filt[2]); \ @@ -60,16 +60,6 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[64]) = { _mm256_extractf128_si256(s1[0], 1)); \ output_ptr += output_pitch; -// 0 0 0 0 hi3 hi2 hi1 hi0 | 0 0 0 0 lo3 lo2 lo1 lo0 -static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) { - // 0 0 0 0 0 0 0 0 | 0 0 0 0 lo3 lo2 lo1 lo0 - __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo))); - - // 0 0 0 0 hi3 hi2 hi1 hi0 | 0 0 0 0 lo3 lo2 lo1 lo0 - a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1); - return a; -} - static INLINE void vpx_filter_block1d16_h8_x_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter, @@ -93,12 +83,7 @@ static INLINE void vpx_filter_block1d16_h8_x_avx2( __m256i srcReg; // load the 2 strides of source - srcReg = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3))); - srcReg = _mm256_inserti128_si256( - srcReg, - _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)), - 1); + srcReg = mm256_loadu2_si128(src_ptr - 3, src_ptr + src_pixels_per_line - 3); // filter the source buffer s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); @@ -109,12 +94,7 @@ static INLINE void vpx_filter_block1d16_h8_x_avx2( // reading 2 strides of the next 16 bytes // (part of it was being read by earlier read) - srcReg = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5))); - srcReg = _mm256_inserti128_si256( - srcReg, - _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)), - 1); + srcReg = mm256_loadu2_si128(src_ptr + 5, src_ptr + src_pixels_per_line + 5); // filter the source buffer s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); @@ -129,60 +109,37 @@ static INLINE void vpx_filter_block1d16_h8_x_avx2( src_ptr += src_stride; - // average if necessary - outReg1 = _mm256_castsi256_si128(outReg32b1); - outReg2 = _mm256_extractf128_si256(outReg32b1, 1); if (avg) { - outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); - outReg2 = _mm_avg_epu8( - outReg2, _mm_load_si128((__m128i *)(output_ptr + output_pitch))); + const __m256i outReg = mm256_loadu2_si128( + (__m128i *)output_ptr, (__m128i *)(output_ptr + output_pitch)); + outReg32b1 = _mm256_avg_epu8(outReg32b1, outReg); } - - // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, outReg1); - - // save the next 16 bits - _mm_store_si128((__m128i *)(output_ptr + output_pitch), outReg2); - + mm256_store2_si128((__m128i *)output_ptr, + (__m128i *)(output_ptr + output_pitch), &outReg32b1); output_ptr += dst_stride; } // if the number of strides is odd. 
// process only 16 bytes if (i > 0) { - __m128i srcReg; - - // load the first 16 bytes of the last row - srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + const __m128i srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + const __m128i srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); + const __m256i srcReg = + _mm256_inserti128_si256(_mm256_castsi128_si256(srcReg1), srcReg2, 1); // filter the source buffer - s[0] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0]))); - s[1] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1]))); - s[2] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2]))); - s[3] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3]))); - outReg1 = convolve8_8_avx2(s, f); - - // reading the next 16 bytes - // (part of it was being read by earlier read) - srcReg = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); + s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); + s[1] = _mm256_shuffle_epi8(srcReg, filt[1]); + s[2] = _mm256_shuffle_epi8(srcReg, filt[2]); + s[3] = _mm256_shuffle_epi8(srcReg, filt[3]); - // filter the source buffer - s[0] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0]))); - s[1] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1]))); - s[2] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2]))); - s[3] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3]))); - outReg2 = convolve8_8_avx2(s, f); + // The low and high 128-bits of each lane contain the first and second + // convolve result respectively + outReg32b1 = convolve8_16_avx2(s, f); + outReg1 = _mm256_castsi256_si128(outReg32b1); + outReg2 = _mm256_extractf128_si256(outReg32b1, 1); - // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane - // contain the first and second convolve result respectively + // shrink to 8 bit each 16 bits outReg1 = _mm_packus_epi16(outReg1, outReg2); // average if necessary @@ -266,7 +223,6 @@ static INLINE void vpx_filter_block1d16_v8_x_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter, const int avg) { - __m128i outReg1, outReg2; __m256i srcRegHead1; unsigned int i; ptrdiff_t src_stride, dst_stride; @@ -345,19 +301,14 @@ static INLINE void vpx_filter_block1d16_v8_x_avx2( src_ptr += src_stride; // average if necessary - outReg1 = _mm256_castsi256_si128(s1[0]); - outReg2 = _mm256_extractf128_si256(s1[0], 1); if (avg) { - outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); - outReg2 = _mm_avg_epu8( - outReg2, _mm_load_si128((__m128i *)(output_ptr + out_pitch))); + const __m256i outReg = mm256_loadu2_si128( + (__m128i *)output_ptr, (__m128i *)(output_ptr + out_pitch)); + s1[0] = _mm256_avg_epu8(s1[0], outReg); } - // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, outReg1); - - // save the next 16 bits - _mm_store_si128((__m128i *)(output_ptr + out_pitch), outReg2); + mm256_store2_si128((__m128i *)output_ptr, + (__m128i *)(output_ptr + out_pitch), s1); output_ptr += dst_stride; @@ -1094,7 +1045,7 @@ static void vpx_filter_block1d4_h8_avx2( // load the 2 strides of source // r115 r114 ...... r15 r14 r13 r12 r11 r10 | r015 r014 r013 ...... 
r07 // r06 r05 r04 r03 r02 r01 r00 - srcReg32b1 = xx_loadu2_mi128(src_ptr - 3 + src_pitch, src_ptr - 3); + srcReg32b1 = mm256_loadu2_si128(src_ptr - 3, src_ptr - 3 + src_pitch); // filter the source buffer // r16 r15 r14 r13 r15 r14 r13 r12 r14 r13 r12 r11 r13 r12 r11 r10 | r06 @@ -1188,8 +1139,7 @@ static void vpx_filter_block1d4_v8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m256i f[4], ss[4]; - __m256i r[8]; - __m128i r1[10]; + __m256i r[9], rr[2]; __m128i s[11]; unsigned int y = output_height; @@ -1210,48 +1160,35 @@ static void vpx_filter_block1d4_v8_avx2( s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch)); s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch)); - // R1-0 xxxx .. . . x| r13 r12 r11 r10 r03 r02 r01 r00 - r1[0] = _mm_unpacklo_epi32(s[0], s[1]); - - // R2-1 xxxx .. . . x| r23 r22 r21 r20 r13 r12 r11 r10 - r1[1] = _mm_unpacklo_epi32(s[1], s[2]); - - // R3-2 xxxx .. . . x| r33 r32 r31 r30 r23 r22 r21 r20 - r1[2] = _mm_unpacklo_epi32(s[2], s[3]); - - // R4-3 xxxx .. . . x| r43 r42 r41 r40 r33 r32 r31 r30 - r1[3] = _mm_unpacklo_epi32(s[3], s[4]); - - // R5-4 xxxx .. . . x| r53 r52 r51 r50 r43 r42 r41 r40 - r1[4] = _mm_unpacklo_epi32(s[4], s[5]); - - // R6-5 xxxx .. . . x| r63 r62 r61 r60 r53 r52 r51 r50 - r1[5] = _mm_unpacklo_epi32(s[5], s[6]); + r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[2], 1); + r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[3], 1); + r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[4], 1); + r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[5], 1); + r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[6], 1); - // 00000000 r33 r32 r31 r30|r23 r22 r21 r20||00000000|r13 r12 r11 r10|r03 r02 - // r01 r00 - r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[0]), r1[2], 1); + // r37.....r24..r33..r31 r30 r23 r22 r21 r20|r17....r14 r07..r05 r04 r13 r12 + // r11 r10 r03 r02 r01 r00 + rr[0] = _mm256_unpacklo_epi32(r[0], r[1]); - // 00000000 r43 r42 r41 r40|r33 r32 r31 r30||00000000|r23 r22 r21 r20|r13 r12 - // r11 r10 - r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[1]), r1[3], 1); - - // 00000000 r53 r52 r51 r50|r43 r42 r41 r40||00000000|r33 r32 r31 r30|r23 r22 - // r21 r20 - r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[2]), r1[4], 1); - - // 00000000 r63 r62 r61 r60|r53 r52 r51 r50||00000000|r43 r42 r41 r40|r33 r32 - // r31 r30 - r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[3]), r1[5], 1); + // r47.....r34..r43..r41 r40 r33 r32 r31 r30|r27....r24 r17..r15 r14 r23 r22 + // r21 r20 r13 r12 r11 r10 + rr[1] = _mm256_unpacklo_epi32(r[1], r[2]); // r43 r33....r40 r30|r33 r23....r30 r20||r23 r13....r20 r10|r13 r03....r10 // r00| - ss[0] = _mm256_unpacklo_epi8(r[0], r[1]); + ss[0] = _mm256_unpacklo_epi8(rr[0], rr[1]); + + // r37.....r24..r33..r31 r30 r23 r22 r21 r20||r17....r14 r07..r05 r04 r13 r12 + // r11 r10 r03 r02 r01 r00 + rr[0] = _mm256_unpacklo_epi32(r[2], r[3]); + + // r47.....r34..r43..r41 r40 r33 r32 r31 r30|r27....r24 r17..r15 r14 r23 r22 + // r21 r20 r13 r12 r11 r10 + rr[1] = _mm256_unpacklo_epi32(r[3], r[4]); // r63 r53....r60 r50|r53 r43....r50 r40||r43 r33....r40 r30|r33 r23....r30 // r20| - ss[1] = _mm256_unpacklo_epi8(r[2], r[3]); - + ss[1] = _mm256_unpacklo_epi8(rr[0], rr[1]); // Process 4 rows at a time while (y >= 4) { s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); @@ -1259,41 +1196,17 @@ 
static void vpx_filter_block1d4_v8_avx2( s[9] = _mm_loadl_epi64((const __m128i *)(src_ptr + 9 * src_pitch)); s[10] = _mm_loadl_epi64((const __m128i *)(src_ptr + 10 * src_pitch)); - // R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60 - r1[6] = _mm_unpacklo_epi32(s[6], s[7]); - - // R8-7 xxxx .. . . x| r83 r82 r81 r80 r73 r72 r71 r70 - r1[7] = _mm_unpacklo_epi32(s[7], s[8]); - - // R9-8 xxxx .. . . x| r93 r92 r91 r90 r83 r82 r81 r80 - r1[8] = _mm_unpacklo_epi32(s[8], s[9]); - - // R10-9 xxxx .. . . x| r10-3 r10-2 r10-1 r10-0 r93 r92 r91 r90 - r1[9] = _mm_unpacklo_epi32(s[9], s[10]); - - // 00000000 r73 r72 r71 r70|r63 r62 r61 r60||00000000|r53 r52 r51 r50|r43 - // r42 r41 r40 - r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[4]), r1[6], 1); - - // 00000000 r83 r82 r81 r80|r73 r72 r71 r70||00000000|r63 r62 r61 r60|r53 - // r52 r51 r50 - r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[5]), r1[7], 1); + r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), s[7], 1); + r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[6]), s[8], 1); + rr[0] = _mm256_unpacklo_epi32(r[4], r[5]); + rr[1] = _mm256_unpacklo_epi32(r[5], r[6]); + ss[2] = _mm256_unpacklo_epi8(rr[0], rr[1]); - // 00000000 r93 r92 r91 r90|r83 r82 r81 r80||00000000|r73 r72 r71 r70|r63 - // r62 r61 r60 - r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[6]), r1[8], 1); - - // 00000000 r10-3 r10-2 r10-1 r10-0|r93 r92 r91 r90||00000000|r83 r82 r81 - // r80|r73 r72 r71 r70 - r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[7]), r1[9], 1); - - // r83 r73....r80 r70|r73 r63....r70 r60||r63 r53....r60 r50|r53 r43....r50 - // r40| - ss[2] = _mm256_unpacklo_epi8(r[4], r[5]); - - // r10-3 r10-3....r10-0 r10-0|r93 r83....r90 r80||r83 r73....r80 r70|r73 - // r63....r70 r60| - ss[3] = _mm256_unpacklo_epi8(r[6], r[7]); + r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[7]), s[9], 1); + r[8] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[8]), s[10], 1); + rr[0] = _mm256_unpacklo_epi32(r[6], r[7]); + rr[1] = _mm256_unpacklo_epi32(r[7], r[8]); + ss[3] = _mm256_unpacklo_epi8(rr[0], rr[1]); ss[0] = convolve8_16_avx2(ss, f); @@ -1315,17 +1228,17 @@ static void vpx_filter_block1d4_v8_avx2( ss[1] = ss[3]; s[6] = s[10]; + s[5] = s[9]; - r1[4] = r1[8]; - r1[5] = r1[9]; - + r[4] = r[8]; y -= 4; } // Process 2 rows if (y == 2) { - __m128i ss1[4], f1[4]; + __m128i ss1[4], f1[4], r1[4]; + s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch)); @@ -1334,11 +1247,14 @@ static void vpx_filter_block1d4_v8_avx2( f1[2] = _mm256_castsi256_si128(f[2]); f1[3] = _mm256_castsi256_si128(f[3]); + r1[0] = _mm_unpacklo_epi32(s[4], s[5]); + r1[1] = _mm_unpacklo_epi32(s[5], s[6]); + // R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60 - r1[6] = _mm_unpacklo_epi32(s[6], s[7]); + r1[2] = _mm_unpacklo_epi32(s[6], s[7]); // R8-7 xxxx .. . . 
x| r83 r82 r81 r80 r73 r72 r71 r70 - r1[7] = _mm_unpacklo_epi32(s[7], s[8]); + r1[3] = _mm_unpacklo_epi32(s[7], s[8]); // r23 r13....r20 r10|r13 r03....r10 r00 ss1[0] = _mm256_castsi256_si128(ss[0]); @@ -1347,10 +1263,10 @@ static void vpx_filter_block1d4_v8_avx2( ss1[1] = _mm256_castsi256_si128(ss[1]); // r63 r53....r60 r50|r53 r43....r50 r40 - ss1[2] = _mm_unpacklo_epi8(r1[4], r1[5]); + ss1[2] = _mm_unpacklo_epi8(r1[0], r1[1]); // r83 r73....r80 r70|r73 r63....r70 r60 - ss1[3] = _mm_unpacklo_epi8(r1[6], r1[7]); + ss1[3] = _mm_unpacklo_epi8(r1[2], r1[3]); ss1[0] = convolve8_8_ssse3(ss1, f1); From 7e7a1706e3dadcfbb3d92d93ea735420990584da Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 16 May 2023 14:57:05 -0400 Subject: [PATCH 729/926] Add new vpx_tpl.h API file New file (vpx_tpl.c) in the following CLs will add new APIs dealing with TPL stats from VP9 encoder. Change-Id: I5102ef64214cba1ca6ecea9582a19049666c6ca4 --- libs.mk | 1 + test/encode_api_test.cc | 2 +- test/encode_test_driver.h | 2 +- vp9/encoder/vp9_encoder.h | 1 + vp9/vp9_cx_iface.c | 2 ++ vpx/vpx_codec.mk | 2 ++ vpx/vpx_encoder.h | 32 +++----------------- vpx/vpx_tpl.h | 63 +++++++++++++++++++++++++++++++++++++++ 8 files changed, 75 insertions(+), 30 deletions(-) create mode 100644 vpx/vpx_tpl.h diff --git a/libs.mk b/libs.mk index 1411fee9a1..ea5cc15a17 100644 --- a/libs.mk +++ b/libs.mk @@ -178,6 +178,7 @@ INSTALL-LIBS-yes += include/vpx/vpx_image.h INSTALL-LIBS-yes += include/vpx/vpx_integer.h INSTALL-LIBS-$(CONFIG_DECODERS) += include/vpx/vpx_decoder.h INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_encoder.h +INSTALL-LIBS-$(CONFIG_VP9_ENCODER) += include/vpx/vpx_tpl.h ifeq ($(CONFIG_EXTERNAL_BUILD),yes) ifeq ($(CONFIG_MSVS),yes) INSTALL-LIBS-yes += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/$(CODEC_LIB).lib) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 2b0aa1fdfe..af98ad5ddd 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -20,7 +20,7 @@ #include "./vpx_config.h" #include "vpx/vp8cx.h" -#include "vpx/vpx_encoder.h" +#include "vpx/vpx_tpl.h" namespace { diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index 922c49f420..165fcfabf6 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -19,7 +19,7 @@ #if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER #include "vpx/vp8cx.h" #endif -#include "vpx/vpx_encoder.h" +#include "vpx/vpx_tpl.h" namespace libvpx_test { diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 2528bc2316..2e0c4db9ed 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -18,6 +18,7 @@ #include "vpx/internal/vpx_codec_internal.h" #include "vpx/vpx_ext_ratectrl.h" #include "vpx/vp8cx.h" +#include "vpx/vpx_tpl.h" #if CONFIG_INTERNAL_STATS #include "vpx_dsp/ssim.h" #endif diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 8f157274fb..409069b4ed 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -29,6 +29,8 @@ #include "vp9/vp9_cx_iface.h" #include "vp9/vp9_iface_common.h" +#include "vpx/vpx_tpl.h" + typedef struct vp9_extracfg { int cpu_used; // available cpu percentage in 1/16 unsigned int enable_auto_alt_ref; diff --git a/vpx/vpx_codec.mk b/vpx/vpx_codec.mk index de86579d58..4aec88b300 100644 --- a/vpx/vpx_codec.mk +++ b/vpx/vpx_codec.mk @@ -27,6 +27,7 @@ API_DOC_SRCS-yes += vpx_encoder.h API_DOC_SRCS-yes += vpx_ext_ratectrl.h API_DOC_SRCS-yes += vpx_frame_buffer.h API_DOC_SRCS-yes += vpx_image.h +API_DOC_SRCS-yes += vpx_tpl.h API_SRCS-yes += src/vpx_decoder.c 
API_SRCS-yes += vpx_decoder.h @@ -42,3 +43,4 @@ API_SRCS-yes += vpx_frame_buffer.h API_SRCS-yes += vpx_image.h API_SRCS-yes += vpx_integer.h API_SRCS-yes += vpx_ext_ratectrl.h +API_SRCS-yes += vpx_tpl.h diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index fb95723dd3..c45d1a2ba5 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -31,6 +31,7 @@ extern "C" { #include "./vpx_codec.h" #include "./vpx_ext_ratectrl.h" +#include "./vpx_tpl.h" /*! Temporal Scalability: Maximum length of the sequence defining frame * layer membership @@ -57,9 +58,9 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ -#define VPX_ENCODER_ABI_VERSION \ - (16 + VPX_CODEC_ABI_VERSION + \ - VPX_EXT_RATECTRL_ABI_VERSION) /**<\hideinitializer*/ +#define VPX_ENCODER_ABI_VERSION \ + (16 + VPX_CODEC_ABI_VERSION + VPX_EXT_RATECTRL_ABI_VERSION + \ + VPX_TPL_ABI_VERSION) /**<\hideinitializer*/ /*! \brief Encoder capabilities bitfield * @@ -252,31 +253,6 @@ enum vpx_kf_mode { VPX_KF_DISABLED = 0 /**< Encoder does not place keyframes. */ }; -/*!\brief Temporal dependency model stats for each block before propagation */ -typedef struct VpxTplBlockStats { - int64_t intra_cost; /**< Intra cost */ - int64_t inter_cost; /**< Inter cost */ - int16_t mv_r; /**< Motion vector row */ - int16_t mv_c; /**< Motion vector col */ - int64_t recrf_rate; /**< Rate from reconstructed ref frame */ - int64_t recrf_dist; /**< Distortion from reconstructed ref frame */ - int ref_frame_index; /**< Ref frame index */ -} VpxTplBlockStats; - -/*!\brief Temporal dependency model stats for each frame before propagation */ -typedef struct VpxTplFrameStats { - int frame_width; /**< Frame width */ - int frame_height; /**< Frame height */ - int num_blocks; /**< Number of blocks. Size of block_stats_list */ - VpxTplBlockStats *block_stats_list; /**< List of tpl stats for each block */ -} VpxTplFrameStats; - -/*!\brief Temporal dependency model stats for each GOP before propagation */ -typedef struct VpxTplGopStats { - int size; /**< GOP size, also the size of frame_stats_list. */ - VpxTplFrameStats *frame_stats_list; /**< List of tpl stats for each frame */ -} VpxTplGopStats; - /*!\brief Encoded Frame Flags * * This type indicates a bitfield to be passed to vpx_codec_encode(), defining diff --git a/vpx/vpx_tpl.h b/vpx/vpx_tpl.h new file mode 100644 index 0000000000..689fa96920 --- /dev/null +++ b/vpx/vpx_tpl.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/*!\file + * \brief Describes the TPL stats descriptor and associated operations + * + */ +#ifndef VPX_VPX_VPX_TPL_H_ +#define VPX_VPX_VPX_TPL_H_ + +#include "./vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. 
Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define VPX_TPL_ABI_VERSION (0) /**<\hideinitializer*/ + +/*!\brief Temporal dependency model stats for each block before propagation */ +typedef struct VpxTplBlockStats { + int64_t intra_cost; /**< Intra cost */ + int64_t inter_cost; /**< Inter cost */ + int16_t mv_r; /**< Motion vector row */ + int16_t mv_c; /**< Motion vector col */ + int64_t recrf_rate; /**< Rate from reconstructed ref frame */ + int64_t recrf_dist; /**< Distortion from reconstructed ref frame */ + int ref_frame_index; /**< Ref frame index */ +} VpxTplBlockStats; + +/*!\brief Temporal dependency model stats for each frame before propagation */ +typedef struct VpxTplFrameStats { + int frame_width; /**< Frame width */ + int frame_height; /**< Frame height */ + int num_blocks; /**< Number of blocks. Size of block_stats_list */ + VpxTplBlockStats *block_stats_list; /**< List of tpl stats for each block */ +} VpxTplFrameStats; + +/*!\brief Temporal dependency model stats for each GOP before propagation */ +typedef struct VpxTplGopStats { + int size; /**< GOP size, also the size of frame_stats_list. */ + VpxTplFrameStats *frame_stats_list; /**< List of tpl stats for each frame */ +} VpxTplGopStats; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_VPX_TPL_H_ From d45cc8edda12306c8449242344c63992f63e7a0b Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 4 May 2023 10:48:25 -0400 Subject: [PATCH 730/926] Add IO for TPL stats Overload TempOutFile constructor to allow IO mode. Bug: b/281563704 Change-Id: I1f4f5b29db0e331941b6795e478eeeab51f625ad --- libs.mk | 2 +- test/encode_api_test.cc | 58 +++++++++++++++++++++- test/video_source.h | 11 +++-- vpx/exports_com | 3 ++ vpx/src/vpx_tpl.c | 107 ++++++++++++++++++++++++++++++++++++++++ vpx/vpx_codec.mk | 1 + vpx/vpx_tpl.h | 38 +++++++++++++- 7 files changed, 212 insertions(+), 8 deletions(-) create mode 100644 vpx/src/vpx_tpl.c diff --git a/libs.mk b/libs.mk index ea5cc15a17..f6f6cc94c3 100644 --- a/libs.mk +++ b/libs.mk @@ -178,7 +178,7 @@ INSTALL-LIBS-yes += include/vpx/vpx_image.h INSTALL-LIBS-yes += include/vpx/vpx_integer.h INSTALL-LIBS-$(CONFIG_DECODERS) += include/vpx/vpx_decoder.h INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_encoder.h -INSTALL-LIBS-$(CONFIG_VP9_ENCODER) += include/vpx/vpx_tpl.h +INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_tpl.h ifeq ($(CONFIG_EXTERNAL_BUILD),yes) ifeq ($(CONFIG_MSVS),yes) INSTALL-LIBS-yes += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/$(CODEC_LIB).lib) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index af98ad5ddd..e8a044ae17 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include "third_party/googletest/src/include/gtest/gtest.h" #include "test/codec_factory.h" @@ -368,7 +369,7 @@ class EncodeApiGetTplStatsTest : public ::libvpx_test::EncoderTest, public ::testing::TestWithParam { public: - EncodeApiGetTplStatsTest() : EncoderTest(GetParam()) {} + EncodeApiGetTplStatsTest() : EncoderTest(GetParam()), test_io_(false) {} ~EncodeApiGetTplStatsTest() override {} protected: @@ -396,6 +397,34 @@ class EncodeApiGetTplStatsTest return VPX_CODEC_OK; } + void CompareTplGopStats(const VpxTplGopStats &ref_gop_stats, + const VpxTplGopStats &test_gop_stats) { + ASSERT_EQ(ref_gop_stats.size, test_gop_stats.size); + for (int frame = 0; frame < ref_gop_stats.size; frame++) { + 
const VpxTplFrameStats &ref_frame_stats = + ref_gop_stats.frame_stats_list[frame]; + const VpxTplFrameStats &test_frame_stats = + test_gop_stats.frame_stats_list[frame]; + ASSERT_EQ(ref_frame_stats.num_blocks, test_frame_stats.num_blocks); + ASSERT_EQ(ref_frame_stats.frame_width, test_frame_stats.frame_width); + ASSERT_EQ(ref_frame_stats.frame_height, test_frame_stats.frame_height); + for (int block = 0; block < ref_frame_stats.num_blocks; block++) { + const VpxTplBlockStats &ref_block_stats = + ref_frame_stats.block_stats_list[block]; + const VpxTplBlockStats &test_block_stats = + test_frame_stats.block_stats_list[block]; + ASSERT_EQ(ref_block_stats.inter_cost, test_block_stats.inter_cost); + ASSERT_EQ(ref_block_stats.intra_cost, test_block_stats.intra_cost); + ASSERT_EQ(ref_block_stats.mv_c, test_block_stats.mv_c); + ASSERT_EQ(ref_block_stats.mv_r, test_block_stats.mv_r); + ASSERT_EQ(ref_block_stats.recrf_dist, test_block_stats.recrf_dist); + ASSERT_EQ(ref_block_stats.recrf_rate, test_block_stats.recrf_rate); + ASSERT_EQ(ref_block_stats.ref_frame_index, + test_block_stats.ref_frame_index); + } + } + } + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { ::libvpx_test::CxDataIterator iter = encoder->GetCxData(); while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { @@ -416,7 +445,21 @@ class EncodeApiGetTplStatsTest } } ASSERT_TRUE(stats_not_all_zero); - // Free the memory right away now as this is only a test. + if (test_io_ && tpl_stats.size > 0) { + libvpx_test::TempOutFile *temp_out_file = + new (std::nothrow) libvpx_test::TempOutFile("w+"); + ASSERT_NE(temp_out_file, nullptr); + ASSERT_NE(temp_out_file->file(), nullptr); + vpx_write_tpl_gop_stats(temp_out_file->file(), &tpl_stats); + rewind(temp_out_file->file()); + VpxTplGopStats gop_stats_io; + ASSERT_EQ( + vpx_read_tpl_gop_stats(temp_out_file->file(), &gop_stats_io), + VPX_CODEC_OK); + CompareTplGopStats(gop_stats_io, tpl_stats); + vpx_free_tpl_gop_stats(&gop_stats_io); + delete temp_out_file; + } free(tpl_stats.frame_stats_list); break; } @@ -427,6 +470,7 @@ class EncodeApiGetTplStatsTest int width_; int height_; + bool test_io_; }; TEST_P(EncodeApiGetTplStatsTest, GetTplStats) { @@ -438,6 +482,16 @@ TEST_P(EncodeApiGetTplStatsTest, GetTplStats) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } +TEST_P(EncodeApiGetTplStatsTest, GetTplStatsIO) { + cfg_.g_lag_in_frames = 25; + width_ = 352; + height_ = 288; + test_io_ = true; + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", width_, + height_, 30, 1, 0, 50); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + INSTANTIATE_TEST_SUITE_P( VP9, EncodeApiGetTplStatsTest, ::testing::Values( diff --git a/test/video_source.h b/test/video_source.h index a10ff6fb09..5ed99d0639 100644 --- a/test/video_source.h +++ b/test/video_source.h @@ -64,7 +64,7 @@ inline FILE *OpenTestDataFile(const std::string &file_name) { return fopen(path_to_source.c_str(), "rb"); } -static FILE *GetTempOutFile(std::string *file_name) { +static FILE *GetTempOutFile(std::string *file_name, const char *io_mode) { file_name->clear(); #if defined(_WIN32) char fname[MAX_PATH]; @@ -73,7 +73,7 @@ static FILE *GetTempOutFile(std::string *file_name) { // Assume for now that the filename generated is unique per process if (GetTempFileNameA(tmppath, "lvx", 0, fname)) { file_name->assign(fname); - return fopen(fname, "wb+"); + return fopen(fname, io_mode); } } return nullptr; @@ -94,13 +94,16 @@ static FILE *GetTempOutFile(std::string *file_name) { const int fd = mkstemp(temp_file_name.get()); if 
(fd == -1) return nullptr; *file_name = temp_file_name.get(); - return fdopen(fd, "wb+"); + return fdopen(fd, io_mode); #endif } class TempOutFile { public: - TempOutFile() { file_ = GetTempOutFile(&file_name_); } + TempOutFile() { file_ = GetTempOutFile(&file_name_, "wb+"); } + TempOutFile(const char *io_mode) { + file_ = GetTempOutFile(&file_name_, io_mode); + } ~TempOutFile() { CloseFile(); if (!file_name_.empty()) { diff --git a/vpx/exports_com b/vpx/exports_com index 2ab05099f8..f0b46aa175 100644 --- a/vpx/exports_com +++ b/vpx/exports_com @@ -14,3 +14,6 @@ text vpx_img_flip text vpx_img_free text vpx_img_set_rect text vpx_img_wrap +text vpx_free_tpl_gop_stats +text vpx_read_tpl_gop_stats +text vpx_write_tpl_gop_stats diff --git a/vpx/src/vpx_tpl.c b/vpx/src/vpx_tpl.c new file mode 100644 index 0000000000..9cdb4a0a06 --- /dev/null +++ b/vpx/src/vpx_tpl.c @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vpx/vpx_codec.h" +#include "vpx/vpx_tpl.h" +#include "vpx_mem/vpx_mem.h" + +#define CHECK_FPRINTF_ERROR(expr) \ + do { \ + if (expr < 0) { \ + return VPX_CODEC_ERROR; \ + } \ + } while (0) + +#define CHECK_FSCANF_ERROR(expr, expected_value) \ + do { \ + if (expr != expected_value) { \ + return VPX_CODEC_ERROR; \ + } \ + } while (0) + +vpx_codec_err_t vpx_write_tpl_gop_stats(FILE *tpl_file, + const VpxTplGopStats *tpl_gop_stats) { + int i; + if (tpl_file == NULL || tpl_gop_stats == NULL) return VPX_CODEC_INVALID_PARAM; + CHECK_FPRINTF_ERROR(fprintf(tpl_file, "%d\n", tpl_gop_stats->size)); + + for (i = 0; i < tpl_gop_stats->size; i++) { + VpxTplFrameStats frame_stats = tpl_gop_stats->frame_stats_list[i]; + const int num_blocks = frame_stats.num_blocks; + int block; + CHECK_FPRINTF_ERROR(fprintf(tpl_file, "%d %d %d\n", frame_stats.frame_width, + frame_stats.frame_height, num_blocks)); + for (block = 0; block < num_blocks; block++) { + VpxTplBlockStats block_stats = frame_stats.block_stats_list[block]; + CHECK_FPRINTF_ERROR( + fprintf(tpl_file, + "%" PRId64 " %" PRId64 " %" PRId16 " %" PRId16 " %" PRId64 + " %" PRId64 " %d\n", + block_stats.inter_cost, block_stats.intra_cost, + block_stats.mv_c, block_stats.mv_r, block_stats.recrf_dist, + block_stats.recrf_rate, block_stats.ref_frame_index)); + } + } + + return VPX_CODEC_OK; +} + +vpx_codec_err_t vpx_read_tpl_gop_stats(FILE *tpl_file, + VpxTplGopStats *tpl_gop_stats) { + int i, frame_list_size; + if (tpl_file == NULL || tpl_gop_stats == NULL) return VPX_CODEC_INVALID_PARAM; + CHECK_FSCANF_ERROR(fscanf(tpl_file, "%d\n", &frame_list_size), 1); + tpl_gop_stats->size = frame_list_size; + tpl_gop_stats->frame_stats_list = (VpxTplFrameStats *)vpx_calloc( + frame_list_size, sizeof(tpl_gop_stats->frame_stats_list[0])); + if (tpl_gop_stats->frame_stats_list == NULL) { + return VPX_CODEC_MEM_ERROR; + } + for (i = 0; i < frame_list_size; i++) { + VpxTplFrameStats *frame_stats = &tpl_gop_stats->frame_stats_list[i]; + int num_blocks, width, height, block; + CHECK_FSCANF_ERROR( + fscanf(tpl_file, "%d %d %d\n", &width, &height, &num_blocks), 3); + frame_stats->num_blocks = num_blocks; + frame_stats->frame_width = width; + frame_stats->frame_height = 
height; + frame_stats->block_stats_list = (VpxTplBlockStats *)vpx_calloc( + num_blocks, sizeof(frame_stats->block_stats_list[0])); + if (frame_stats->block_stats_list == NULL) { + vpx_free_tpl_gop_stats(tpl_gop_stats); + return VPX_CODEC_MEM_ERROR; + } + for (block = 0; block < num_blocks; block++) { + VpxTplBlockStats *block_stats = &frame_stats->block_stats_list[block]; + CHECK_FSCANF_ERROR( + fscanf(tpl_file, + "%" SCNd64 " %" SCNd64 " %" SCNd16 " %" SCNd16 " %" SCNd64 + " %" SCNd64 " %d\n", + &block_stats->inter_cost, &block_stats->intra_cost, + &block_stats->mv_c, &block_stats->mv_r, + &block_stats->recrf_dist, &block_stats->recrf_rate, + &block_stats->ref_frame_index), + 7); + } + } + + return VPX_CODEC_OK; +} + +void vpx_free_tpl_gop_stats(VpxTplGopStats *data) { + int frame; + if (data == NULL) return; + for (frame = 0; frame < data->size; frame++) { + vpx_free(data->frame_stats_list[frame].block_stats_list); + } + vpx_free(data->frame_stats_list); +} diff --git a/vpx/vpx_codec.mk b/vpx/vpx_codec.mk index 4aec88b300..25c815ef51 100644 --- a/vpx/vpx_codec.mk +++ b/vpx/vpx_codec.mk @@ -37,6 +37,7 @@ API_SRCS-yes += internal/vpx_codec_internal.h API_SRCS-yes += internal/vpx_ratectrl_rtc.h API_SRCS-yes += src/vpx_codec.c API_SRCS-yes += src/vpx_image.c +API_SRCS-yes += src/vpx_tpl.c API_SRCS-yes += vpx_codec.h API_SRCS-yes += vpx_codec.mk API_SRCS-yes += vpx_frame_buffer.h diff --git a/vpx/vpx_tpl.h b/vpx/vpx_tpl.h index 689fa96920..50aec49eb6 100644 --- a/vpx/vpx_tpl.h +++ b/vpx/vpx_tpl.h @@ -15,6 +15,8 @@ #ifndef VPX_VPX_VPX_TPL_H_ #define VPX_VPX_VPX_TPL_H_ +#include + #include "./vpx_integer.h" #ifdef __cplusplus @@ -29,7 +31,7 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ -#define VPX_TPL_ABI_VERSION (0) /**<\hideinitializer*/ +#define VPX_TPL_ABI_VERSION (1) /**<\hideinitializer*/ /*!\brief Temporal dependency model stats for each block before propagation */ typedef struct VpxTplBlockStats { @@ -56,6 +58,40 @@ typedef struct VpxTplGopStats { VpxTplFrameStats *frame_stats_list; /**< List of tpl stats for each frame */ } VpxTplGopStats; +/*!\brief Write VpxTplGopStats to file + * + * Accepts an opened file handle and writes \p tpl_gop_stats. + * + * \param[in] tpl_file A FILE pointer that's already been opened. + * \param[in] tpl_gop_stats VpxTplGopStats that contains TPL stats for the + * whole GOP. + * + * \return VPX_CODEC_OK if TPL stats are successfully written. + */ +vpx_codec_err_t vpx_write_tpl_gop_stats(FILE *tpl_file, + const VpxTplGopStats *tpl_gop_stats); + +/*!\brief Read VpxTplGopStats from file + * + * Accepts an opened file handle and reads TPL stats and stores them into + * \p tpl_gop_stats. Allocates memory for TPL stats. + * + * \param[in] tpl_file A FILE pointer that's already been opened. + * \param[out] tpl_gop_stats VpxTplGopStats that contains TPL stats for the + * whole GOP. + * + * \return VPX_CODEC_OK if TPL stats are successfully read from file. + */ +vpx_codec_err_t vpx_read_tpl_gop_stats(FILE *tpl_file, + VpxTplGopStats *tpl_gop_stats); + +/*!\brief Free the memory allocated for VpxTplGopStats + * + * \param[in] tpl_gop_stats VpxTplGopStats that contains TPL stats for the + * whole GOP. 
+ */ +void vpx_free_tpl_gop_stats(VpxTplGopStats *tpl_gop_stats); + #ifdef __cplusplus } // extern "C" #endif From 62d09a3e94ef2ce0091b6a8e3a298851657c6891 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 23 May 2023 15:48:10 -0700 Subject: [PATCH 731/926] fdct8x8_test.cc: work around VS2022 Arm64 issue cl.exe targeting AArch64 with optimizations enabled produces invalid code in RunExtremalCheck() and RunInvAccuracyCheck(). See: https://developercommunity.visualstudio.com/t/1770-preview-1:-Misoptimization-for-AR/10369786 Bug: b/277255076 Bug: webm:1788 Change-Id: Id2c60f3948d8f788c78602aea1b5232133415dea --- test/fdct8x8_test.cc | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index fcc84690a0..21f8dcffa0 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -132,6 +132,15 @@ void idct8x8_64_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { #endif // HAVE_SSE2 #endif // CONFIG_VP9_HIGHBITDEPTH +// Visual Studio 2022 (cl.exe) targeting AArch64 with optimizations enabled +// produces invalid code in RunExtremalCheck() and RunInvAccuracyCheck(). +// See: +// https://developercommunity.visualstudio.com/t/1770-preview-1:-Misoptimization-for-AR/10369786 +// TODO(jzern): check the compiler version after a fix for the issue is +// released. +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +#pragma optimize("", off) +#endif class FwdTrans8x8TestBase { public: virtual ~FwdTrans8x8TestBase() {} @@ -523,6 +532,9 @@ class FwdTrans8x8TestBase { vpx_bit_depth_t bit_depth_; int mask_; }; +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +#pragma optimize("", on) +#endif class FwdTrans8x8DCT : public FwdTrans8x8TestBase, public ::testing::TestWithParam { From 95b56ab7df669ec5dd29c283fa5bf6d38c2df5d1 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 23 May 2023 15:49:29 -0700 Subject: [PATCH 732/926] fdct_partial_neon.c: work around VS2022 Arm64 issue cl.exe targeting AArch64 with optimizations enabled will fail with an internal compiler error. See: https://developercommunity.visualstudio.com/t/Compiler-crash-C1001-when-building-a-for/10346110 Bug: b/277255076 Bug: webm:1788 Change-Id: I55caf34e910dab47a7775f07280677cdfe606f5b --- vpx_dsp/arm/fdct_partial_neon.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/vpx_dsp/arm/fdct_partial_neon.c b/vpx_dsp/arm/fdct_partial_neon.c index 718dba0d91..df0da543ce 100644 --- a/vpx_dsp/arm/fdct_partial_neon.c +++ b/vpx_dsp/arm/fdct_partial_neon.c @@ -37,6 +37,15 @@ void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) { output[1] = 0; } +// Visual Studio 2022 (cl.exe) targeting AArch64 with optimizations enabled +// will fail with an internal compiler error. +// See: +// https://developercommunity.visualstudio.com/t/Compiler-crash-C1001-when-building-a-for/10346110 +// TODO(jzern): check the compiler version after a fix for the issue is +// released. 
+#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +#pragma optimize("", off) +#endif void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) { int r; int16x8_t sum = vld1q_s16(&input[0]); @@ -49,6 +58,9 @@ void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) { output[0] = (tran_low_t)horizontal_add_int16x8(sum); output[1] = 0; } +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +#pragma optimize("", on) +#endif void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, int stride) { From 25f2e1ef255e89d5e7357aa2427926776327765a Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 23 May 2023 15:50:10 -0700 Subject: [PATCH 733/926] vpx_dsp_common.h,clip_pixel: work around VS2022 Arm64 issue cl.exe targeting AArch64 with optimizations enabled produces invalid code for clip_pixel() when the return type is uint8_t. See: https://developercommunity.visualstudio.com/t/Misoptimization-for-ARM64-in-VS-2022-17/10363361 Bug: b/277255076 Bug: webm:1788 Change-Id: Ia3647698effd34f1cf196cd33fa4a8cab9fa53d6 --- vpx_dsp/vpx_dsp_common.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/vpx_dsp/vpx_dsp_common.h b/vpx_dsp/vpx_dsp_common.h index 2de4495465..4b946d7560 100644 --- a/vpx_dsp/vpx_dsp_common.h +++ b/vpx_dsp/vpx_dsp_common.h @@ -45,9 +45,21 @@ typedef int16_t tran_low_t; typedef int16_t tran_coef_t; +// Visual Studio 2022 (cl.exe) targeting AArch64 with optimizations enabled +// produces invalid code for clip_pixel() when the return type is uint8_t. +// See: +// https://developercommunity.visualstudio.com/t/Misoptimization-for-ARM64-in-VS-2022-17/10363361 +// TODO(jzern): check the compiler version after a fix for the issue is +// released. +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +static INLINE int clip_pixel(int val) { + return (val > 255) ? 255 : (val < 0) ? 0 : val; +} +#else static INLINE uint8_t clip_pixel(int val) { return (val > 255) ? 255 : (val < 0) ? 0 : val; } +#endif static INLINE int clamp(int value, int low, int high) { return value < low ? low : (value > high ? high : value); From c738e87f27ef8e12dd28b9052f446a5f69abf3c9 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Tue, 30 May 2023 15:22:04 +0100 Subject: [PATCH 734/926] Optimize Neon implementation of vpx_int_pro_col Use widening pairwise addition instructions to halve the number of additions required. 
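Reduced to a standalone sketch, the idea looks like the code below
(plain C with arm_neon.h). The name sum_columns is illustrative, and the
final vaddlvq_u16 reduction assumes AArch64; the library itself goes
through its horizontal_add_uint16x8 helper.

#include <arm_neon.h>
#include <stdint.h>

/* Sum `width` bytes, width a multiple of 16. vpaddlq_u8 widens and adds
 * adjacent u8 lanes (16 u8 -> 8 u16) in a single instruction, and
 * vpadalq_u8 does the same while accumulating, so each 16-byte vector
 * costs one addition instead of the two vaddw_u8 used before. */
static uint32_t sum_columns(const uint8_t *ref, int width) {
  uint16x8_t sum = vpaddlq_u8(vld1q_u8(ref));
  int i;
  for (i = 16; i < width; i += 16) {
    sum = vpadalq_u8(sum, vld1q_u8(ref + i));
  }
  return vaddlvq_u16(sum); /* AArch64-only horizontal reduction. */
}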
Change-Id: I0307a3b65e50d2b1ae582938bc5df9c2b21df734 --- vpx_dsp/arm/avg_neon.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c index 8c61fc26f4..2fe65d1129 100644 --- a/vpx_dsp/arm/avg_neon.c +++ b/vpx_dsp/arm/avg_neon.c @@ -121,17 +121,17 @@ void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, } int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) { + uint16x8_t sum; int i; - uint16x8_t vec_sum = vdupq_n_u16(0); - for (i = 0; i < width; i += 16) { - const uint8x16_t vec_row = vld1q_u8(ref); - vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row)); - vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row)); - ref += 16; + assert(width >= 16 && width % 16 == 0); + + sum = vpaddlq_u8(vld1q_u8(ref)); + for (i = 16; i < width; i += 16) { + sum = vpadalq_u8(sum, vld1q_u8(ref + i)); } - return (int16_t)horizontal_add_uint16x8(vec_sum); + return (int16_t)horizontal_add_uint16x8(sum); } // ref, src = [0, 510] - max diff = 16-bits From c36aa2e9c4a610dd7f5467126c894ac4dcbded02 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Tue, 30 May 2023 17:31:18 +0100 Subject: [PATCH 735/926] Optimize Neon implementation of vpx_int_pro_row Double the number of accumulator registers to remove the bottleneck. Also peel the first loop iteration. Change-Id: I6a90680369f9c33cdfe14ea547ac1569ec3f50de --- vpx_dsp/arm/avg_neon.c | 89 +++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 45 deletions(-) diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c index 2fe65d1129..22164242c5 100644 --- a/vpx_dsp/arm/avg_neon.c +++ b/vpx_dsp/arm/avg_neon.c @@ -70,54 +70,53 @@ int vpx_satd_neon(const tran_low_t *coeff, int length) { void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, const int ref_stride, const int height) { int i; - uint16x8_t vec_sum_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_hi = vdupq_n_u16(0); - const int shift_factor = ((height >> 5) + 3) * -1; - const int16x8_t vec_shift = vdupq_n_s16(shift_factor); - - for (i = 0; i < height; i += 8) { - const uint8x16_t vec_row1 = vld1q_u8(ref); - const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride); - const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2); - const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3); - const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4); - const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5); - const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6); - const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8)); 
- - ref += ref_stride * 8; + uint8x16_t r0, r1, r2, r3; + uint16x8_t sum_lo[2], sum_hi[2]; + uint16x8_t tmp_lo[2], tmp_hi[2]; + int16x8_t avg_lo, avg_hi; + + const int norm_factor = (height >> 5) + 3; + const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor); + + assert(height >= 4 && height % 4 == 0); + + r0 = vld1q_u8(ref + 0 * ref_stride); + r1 = vld1q_u8(ref + 1 * ref_stride); + r2 = vld1q_u8(ref + 2 * ref_stride); + r3 = vld1q_u8(ref + 3 * ref_stride); + + sum_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); + sum_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); + sum_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); + sum_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); + + ref += 4 * ref_stride; + + for (i = 4; i < height; i += 4) { + r0 = vld1q_u8(ref + 0 * ref_stride); + r1 = vld1q_u8(ref + 1 * ref_stride); + r2 = vld1q_u8(ref + 2 * ref_stride); + r3 = vld1q_u8(ref + 3 * ref_stride); + + tmp_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); + tmp_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); + tmp_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); + tmp_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); + + sum_lo[0] = vaddq_u16(sum_lo[0], tmp_lo[0]); + sum_hi[0] = vaddq_u16(sum_hi[0], tmp_hi[0]); + sum_lo[1] = vaddq_u16(sum_lo[1], tmp_lo[1]); + sum_hi[1] = vaddq_u16(sum_hi[1], tmp_hi[1]); + + ref += 4 * ref_stride; } - vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift); - vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift); + sum_lo[0] = vaddq_u16(sum_lo[0], sum_lo[1]); + sum_hi[0] = vaddq_u16(sum_hi[0], sum_hi[1]); + + avg_lo = vshlq_s16(vreinterpretq_s16_u16(sum_lo[0]), neg_norm_factor); + avg_hi = vshlq_s16(vreinterpretq_s16_u16(sum_hi[0]), neg_norm_factor); - vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo)); - hbuf += 8; - vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi)); + vst1q_s16(hbuf, avg_lo); + vst1q_s16(hbuf + 8, avg_hi); } int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) { From 1aff4a5655855c20b6766a91b9dacc8d78279c92 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 26 May 2023 12:02:36 -0400 Subject: [PATCH 736/926] Trim tpl stats by 2 extra frames Not applicable to the last GOP.
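The effect on the stats list, as a minimal sketch (illustrative only: the real trim_tpl_stats() in the diff below reallocates and deep-copies the surviving per-block lists rather than truncating in place, and raises an internal error if fewer frames than expected are present):

/* Hedged sketch: drop the trailing `extra` frame entries of a GOP's TPL
 * stats, freeing only the per-block lists that go away. `extra` counts the
 * lookahead frames borrowed from the next GOP, so it is 0 for the last GOP
 * and the list is left untouched there. */
static void trim_gop_stats_sketch(VpxTplGopStats *stats, int extra) {
  int i;
  if (extra <= 0 || stats->size <= extra) return;
  for (i = stats->size - extra; i < stats->size; ++i) {
    vpx_free(stats->frame_stats_list[i].block_stats_list);
    stats->frame_stats_list[i].block_stats_list = NULL;
  }
  stats->size -= extra;
}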
Bug: b/284162396 Change-Id: I55b7e04e9fc4b68a08ce3e00b10743823c828954 --- vp9/encoder/vp9_tpl_model.c | 44 ++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index 9f4bafdf83..8d203bbf4f 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -20,8 +20,8 @@ #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_tpl_model.h" -static void init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture, - const GF_GROUP *gf_group, int *tpl_group_frames) { +static int init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture, + const GF_GROUP *gf_group, int *tpl_group_frames) { VP9_COMMON *cm = &cpi->common; int frame_idx = 0; int i; @@ -148,6 +148,8 @@ static void init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture, ++extend_frame_count; ++frame_gop_offset; } + + return extend_frame_count; } static void init_tpl_stats(VP9_COMP *cpi) { @@ -1245,6 +1247,35 @@ static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, } } +static void trim_tpl_stats(struct vpx_internal_error_info *error_info, + VpxTplGopStats *tpl_gop_stats, int extra_frames) { + int i; + VpxTplFrameStats *new_frame_stats; + const int new_size = tpl_gop_stats->size - extra_frames; + if (tpl_gop_stats->size <= extra_frames) + vpx_internal_error( + error_info, VPX_CODEC_ERROR, + "The number of frames in VpxTplGopStats is fewer than expected."); + CHECK_MEM_ERROR(error_info, new_frame_stats, + vpx_calloc(new_size, sizeof(*new_frame_stats))); + for (i = 0; i < new_size; i++) { + VpxTplFrameStats *frame_stats = &tpl_gop_stats->frame_stats_list[i]; + const int num_blocks = frame_stats->num_blocks; + new_frame_stats[i].num_blocks = frame_stats->num_blocks; + new_frame_stats[i].frame_width = frame_stats->frame_width; + new_frame_stats[i].frame_height = frame_stats->frame_height; + new_frame_stats[i].num_blocks = num_blocks; + CHECK_MEM_ERROR( + error_info, new_frame_stats[i].block_stats_list, + vpx_calloc(num_blocks, sizeof(*new_frame_stats[i].block_stats_list))); + memcpy(new_frame_stats[i].block_stats_list, frame_stats->block_stats_list, + num_blocks * sizeof(*new_frame_stats[i].block_stats_list)); + } + free_tpl_frame_stats_list(tpl_gop_stats); + tpl_gop_stats->size = new_size; + tpl_gop_stats->frame_stats_list = new_frame_stats; +} + #if CONFIG_NON_GREEDY_MV #define DUMP_TPL_STATS 0 #if DUMP_TPL_STATS @@ -1456,9 +1487,11 @@ void vp9_setup_tpl_stats(VP9_COMP *cpi) { const GF_GROUP *gf_group = &cpi->twopass.gf_group; int tpl_group_frames = 0; int frame_idx; + int extended_frame_count; cpi->tpl_bsize = BLOCK_32X32; - init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames); + extended_frame_count = + init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames); init_tpl_stats(cpi); @@ -1470,6 +1503,11 @@ void vp9_setup_tpl_stats(VP9_COMP *cpi) { if (gf_picture[frame_idx].update_type == USE_BUF_FRAME) continue; mc_flow_dispenser(cpi, gf_picture, frame_idx, cpi->tpl_bsize); } + + // TPL stats has extra frames from next GOP. Trim those extra frames for + // Qmode. + trim_tpl_stats(&cpi->common.error, &cpi->tpl_gop_stats, extended_frame_count); + #if CONFIG_NON_GREEDY_MV cpi->tpl_ready = 1; #if DUMP_TPL_STATS From 7b66c730a2edd3e232dce5e8ef2522ff83928a90 Mon Sep 17 00:00:00 2001 From: Deepa K G Date: Mon, 24 Apr 2023 15:56:18 +0530 Subject: [PATCH 737/926] Fix c vs avx mismatch of diamond_search_sad() In the function vp9_diamond_search_sad_avx(), arranged the cost vector in a specific order. 
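With the natural lane order {0, 1, 2, 3}, the two-round pairwise minimum can report lane 2 when lanes 1 and 2 tie for the smallest SAD, whereas the C path reports lane 1. Interleaving the lanes as {0, 2, 1, 3} before the reduction makes every strict less-than comparison favor the smaller original index. A scalar model of the fixed reduction (an illustrative sketch, not the shipped intrinsics):

/* sad[] holds the four candidate costs; returns the winning lane index. */
static int min_sad_lane_sketch(const int sad[4]) {
  /* Same reorder as _mm_shuffle_epi32(v_sad_d, 0xd8) plus the new index
   * vector _mm_set_epi32(3, 1, 2, 0). */
  int val[4] = { sad[0], sad[2], sad[1], sad[3] };
  int idx[4] = { 0, 2, 1, 3 };
  /* Round 1: strict '<' keeps the lower index on ties (0 beats 1, 2 beats
   * 3), mirroring _mm_cmplt_epi32 + _mm_blendv_epi8 on the shifted halves. */
  if (val[2] < val[0]) { val[0] = val[2]; idx[0] = idx[2]; }
  if (val[3] < val[1]) { val[1] = val[3]; idx[1] = idx[3]; }
  /* Round 2: min(sad[2], sad[3]) challenges min(sad[0], sad[1]); again a
   * tie keeps the incumbent, preserving the lower index. */
  if (val[1] < val[0]) { val[0] = val[1]; idx[0] = idx[1]; }
  return idx[0];
}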
This ensures that the motion vector with the least index is selected, when there exists more than one candidate motion vector with the minimum cost, thus resolving the c vs avx mismatch. STATS_CHANGED Change-Id: I4f8864f464f9ea2aae6250db3d8ad91cb08b26e2 --- vp9/encoder/x86/vp9_diamond_search_sad_avx.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c index 80442e3594..c00579edc0 100644 --- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c +++ b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c @@ -233,8 +233,9 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, if (UNLIKELY(local_best_sad == 0xffff)) { __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d; - v_loval_d = v_sad_d; - v_loidx_d = _mm_set_epi32(3, 2, 1, 0); + v_loval_d = _mm_shuffle_epi32(v_sad_d, 0xd8); + v_loidx_d = _mm_set_epi32(3, 1, 2, 0); + v_hival_d = _mm_srli_si128(v_loval_d, 8); v_hiidx_d = _mm_srli_si128(v_loidx_d, 8); From e510716d7e9a0a34592eb8ff1f8a65b951fe2eeb Mon Sep 17 00:00:00 2001 From: Deepa K G Date: Tue, 6 Jun 2023 11:38:09 +0530 Subject: [PATCH 738/926] Add comments in vp9_diamond_search_sad_avx() Added comments related to re-arranging the elements of the SAD vector to find the minimum. Change-Id: I58b702d304a6cdd32f04775fba603e39c19a8947 --- vp9/encoder/x86/vp9_diamond_search_sad_avx.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c index c00579edc0..63c35df09e 100644 --- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c +++ b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c @@ -233,12 +233,19 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, if (UNLIKELY(local_best_sad == 0xffff)) { __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d; + // Re-arrange the values in v_sad_d as follows: + // v_loval_d[0] = v_sad_d[0], v_loval_d[1] = v_sad_d[2] + // v_loval_d[2] = v_sad_d[1], v_loval_d[3] = v_sad_d[3] + // v_loidx_d stores the corresponding indices 0, 2, 1, 3 + // This re-arrangement is required to ensure that when there exists + // more than one minimum, the one with the least index is selected v_loval_d = _mm_shuffle_epi32(v_sad_d, 0xd8); v_loidx_d = _mm_set_epi32(3, 1, 2, 0); v_hival_d = _mm_srli_si128(v_loval_d, 8); v_hiidx_d = _mm_srli_si128(v_loidx_d, 8); + // Compare if v_sad_d[1] < v_sad_d[0], v_sad_d[3] < v_sad_d[2] v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d); v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d); @@ -246,6 +253,7 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, v_hival_d = _mm_srli_si128(v_loval_d, 4); v_hiidx_d = _mm_srli_si128(v_loidx_d, 4); + // min(v_sad_d[2], v_sad_d[3]) < min(v_sad_d[0], v_sad_d[1]) v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d); v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d); From bcd491a6be2b163c7293674dd91d8ca1f4cb56f0 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 7 Jun 2023 12:21:38 -0400 Subject: [PATCH 739/926] Fix a few typos segement -> segment dont -> don't useage -> usage devide -> divide Bug: webm:1803 Change-Id: I0153380b0003825c4b62cf323d4f2bc837c8a264 --- test/set_roi.cc | 2 +- vp8/common/blockd.h | 2 +- vp8/common/vp8_loopfilter.c | 2 +- vp8/decoder/decodeframe.c | 6 +++--- vp8/decoder/threading.c | 2 +- vp8/encoder/bitstream.c | 2 +- vp8/encoder/ethreading.c | 2 +- vp8/encoder/firstpass.c | 6 +++--- vp8/encoder/onyx_if.c | 14 ++++++------ vp8/encoder/ratectrl.c | 38 
++++++++++++++++----------------- vp8/encoder/rdopt.c | 2 +- vp8/encoder/segmentation.c | 4 ++-- vp8/encoder/segmentation.h | 4 ++-- vp8/encoder/vp8_quantize.c | 2 +- vp9/decoder/vp9_decodemv.c | 2 +- vp9/encoder/vp9_aq_complexity.c | 2 +- vp9/encoder/vp9_firstpass.c | 16 +++++++------- vp9/encoder/vp9_ratectrl.c | 4 ++-- vp9/encoder/vp9_rdopt.c | 2 +- 19 files changed, 57 insertions(+), 57 deletions(-) diff --git a/test/set_roi.cc b/test/set_roi.cc index 167cf908fd..693410e391 100644 --- a/test/set_roi.cc +++ b/test/set_roi.cc @@ -40,7 +40,7 @@ TEST(VP8RoiMapTest, ParameterCheck) { // Initialize elements of cpi with valid defaults. VP8_COMP cpi; - cpi.mb.e_mbd.mb_segement_abs_delta = SEGMENT_DELTADATA; + cpi.mb.e_mbd.mb_segment_abs_delta = SEGMENT_DELTADATA; cpi.cyclic_refresh_mode_enabled = 0; cpi.mb.e_mbd.segmentation_enabled = 0; cpi.mb.e_mbd.update_mb_segmentation_map = 0; diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h index 405443449d..8300aad941 100644 --- a/vp8/common/blockd.h +++ b/vp8/common/blockd.h @@ -251,7 +251,7 @@ typedef struct macroblockd { unsigned char update_mb_segmentation_data; /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */ - unsigned char mb_segement_abs_delta; + unsigned char mb_segment_abs_delta; /* Per frame flags that define which MB level features (such as quantizer or * loop filter level) */ diff --git a/vp8/common/vp8_loopfilter.c b/vp8/common/vp8_loopfilter.c index 9c9e5f351b..4576c18537 100644 --- a/vp8/common/vp8_loopfilter.c +++ b/vp8/common/vp8_loopfilter.c @@ -111,7 +111,7 @@ void vp8_loop_filter_frame_init(VP8_COMMON *cm, MACROBLOCKD *mbd, /* Note the baseline filter values for each segment */ if (mbd->segmentation_enabled) { - if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) { + if (mbd->mb_segment_abs_delta == SEGMENT_ABSDATA) { lvl_seg = mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; } else { /* Delta Value */ lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; diff --git a/vp8/decoder/decodeframe.c b/vp8/decoder/decodeframe.c index 1c1566766b..d014cf9667 100644 --- a/vp8/decoder/decodeframe.c +++ b/vp8/decoder/decodeframe.c @@ -63,7 +63,7 @@ void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd) { /* Decide whether to use the default or alternate baseline Q value. */ if (xd->segmentation_enabled) { /* Abs Value */ - if (xd->mb_segement_abs_delta == SEGMENT_ABSDATA) { + if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) { QIndex = xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id]; /* Delta Value */ @@ -829,7 +829,7 @@ static void init_frame(VP8D_COMP *pbi) { /* reset the segment feature data to 0 with delta coding (Default state). 
*/ memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data)); - xd->mb_segement_abs_delta = SEGMENT_DELTADATA; + xd->mb_segment_abs_delta = SEGMENT_DELTADATA; /* reset the mode ref deltasa for loop filter */ memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas)); @@ -995,7 +995,7 @@ int vp8_decode_frame(VP8D_COMP *pbi) { xd->update_mb_segmentation_data = (unsigned char)vp8_read_bit(bc); if (xd->update_mb_segmentation_data) { - xd->mb_segement_abs_delta = (unsigned char)vp8_read_bit(bc); + xd->mb_segment_abs_delta = (unsigned char)vp8_read_bit(bc); memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data)); diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c index 9ea6a4f34a..6ccb080cf9 100644 --- a/vp8/decoder/threading.c +++ b/vp8/decoder/threading.c @@ -56,7 +56,7 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, mbd->dst = xd->dst; mbd->segmentation_enabled = xd->segmentation_enabled; - mbd->mb_segement_abs_delta = xd->mb_segement_abs_delta; + mbd->mb_segment_abs_delta = xd->mb_segment_abs_delta; memcpy(mbd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data)); diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c index 190b013afd..03691fc9d1 100644 --- a/vp8/encoder/bitstream.c +++ b/vp8/encoder/bitstream.c @@ -1080,7 +1080,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, if (xd->update_mb_segmentation_data) { signed char Data; - vp8_write_bit(bc, xd->mb_segement_abs_delta); + vp8_write_bit(bc, xd->mb_segment_abs_delta); /* For each segmentation feature (Quant and loop filter level) */ for (i = 0; i < MB_LVL_MAX; ++i) { diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c index 2583cb0ac3..b7f1932c58 100644 --- a/vp8/encoder/ethreading.c +++ b/vp8/encoder/ethreading.c @@ -402,7 +402,7 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc) { zd->subpixel_predict8x8 = xd->subpixel_predict8x8; zd->subpixel_predict16x16 = xd->subpixel_predict16x16; zd->segmentation_enabled = xd->segmentation_enabled; - zd->mb_segement_abs_delta = xd->mb_segement_abs_delta; + zd->mb_segment_abs_delta = xd->mb_segment_abs_delta; memcpy(zd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data)); diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index 4149fb4bf8..0abd178a61 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -1788,7 +1788,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { /* Should we use the alternate refernce frame */ if (allow_alt_ref && (i >= MIN_GF_INTERVAL) && - /* dont use ARF very near next kf */ + /* don't use ARF very near next kf */ (i <= (cpi->twopass.frames_to_key - MIN_GF_INTERVAL)) && #if NEW_BOOST ((next_frame.pcnt_inter > 0.75) || (next_frame.pcnt_second_ref > 0.5)) && @@ -2123,7 +2123,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { if (cpi->twopass.gf_group_bits < 0) cpi->twopass.gf_group_bits = 0; /* This condition could fail if there are two kfs very close together - * despite (MIN_GF_INTERVAL) and would cause a devide by 0 in the + * despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the * calculation of cpi->twopass.alt_extra_bits. 
*/ if (cpi->baseline_gf_interval >= 3) { @@ -2393,7 +2393,7 @@ void vp8_second_pass(VP8_COMP *cpi) { } /* The last few frames of a clip almost always have to few or too many - * bits and for the sake of over exact rate control we dont want to make + * bits and for the sake of over exact rate control we don't want to make * radical adjustments to the allowed quantizer range just to use up a * few surplus bits or get beneath the target rate. */ diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 8941329419..e78743e496 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -488,7 +488,7 @@ static void set_segmentation_map(VP8_COMP *cpi, */ static void set_segment_data(VP8_COMP *cpi, signed char *feature_data, unsigned char abs_delta) { - cpi->mb.e_mbd.mb_segement_abs_delta = abs_delta; + cpi->mb.e_mbd.mb_segment_abs_delta = abs_delta; memcpy(cpi->segment_feature_data, feature_data, sizeof(cpi->segment_feature_data)); } @@ -2751,7 +2751,7 @@ static int decide_key_frame(VP8_COMP *cpi) { } /* in addition if the following are true and this is not a golden frame * then code a key frame Note that on golden frames there often seems - * to be a pop in intra useage anyway hence this restriction is + * to be a pop in intra usage anyway hence this restriction is * designed to prevent spurious key frames. The Intra pop needs to be * investigated. */ @@ -3637,7 +3637,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, Q = cpi->avg_frame_qindex; } - /* For constrained quality dont allow Q less than the cq level */ + /* For constrained quality don't allow Q less than the cq level */ if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) && (Q < cpi->cq_target_quality)) { Q = cpi->cq_target_quality; @@ -3664,7 +3664,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } else { cpi->active_best_quality = inter_minq[Q]; - /* For the constant/constrained quality mode we dont want + /* For the constant/constrained quality mode we don't want * q to fall below the cq level. */ if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) && @@ -3685,7 +3685,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, * higher quality on the frames to prevent bits just going to waste. */ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { - /* Note that the use of >= here elliminates the risk of a devide + /* Note that the use of >= here elliminates the risk of a divide * by 0 error in the else if clause */ if (cpi->buffer_level >= cpi->oxcf.maximum_buffer_size) { @@ -4322,12 +4322,12 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, vp8_cal_dissimilarity(cpi); #endif - /* Update the GF useage maps. + /* Update the GF usage maps. * This is done after completing the compression of a frame when all * modes etc. are finalized but before loop filter */ if (cpi->oxcf.number_of_layers == 1) { - vp8_update_gf_useage_maps(cpi, cm, &cpi->mb); + vp8_update_gf_usage_maps(cpi, cm, &cpi->mb); } if (cm->frame_type == KEY_FRAME) cm->refresh_last_frame = 1; diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c index 9cd3963e22..49ab4aa238 100644 --- a/vp8/encoder/ratectrl.c +++ b/vp8/encoder/ratectrl.c @@ -388,7 +388,7 @@ static void calc_gf_params(VP8_COMP *cpi) { (cpi->oxcf.fixed_q < 0) ? 
cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q; int Boost = 0; - int gf_frame_useage = 0; /* Golden frame useage since last GF */ + int gf_frame_usage = 0; /* Golden frame usage since last GF */ int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] + cpi->recent_ref_frame_usage[LAST_FRAME] + cpi->recent_ref_frame_usage[GOLDEN_FRAME] + @@ -398,12 +398,12 @@ static void calc_gf_params(VP8_COMP *cpi) { (cpi->common.mb_rows * cpi->common.mb_cols); if (tot_mbs) { - gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + - cpi->recent_ref_frame_usage[ALTREF_FRAME]) * - 100 / tot_mbs; + gf_frame_usage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + + cpi->recent_ref_frame_usage[ALTREF_FRAME]) * + 100 / tot_mbs; } - if (pct_gf_active > gf_frame_useage) gf_frame_useage = pct_gf_active; + if (pct_gf_active > gf_frame_usage) gf_frame_usage = pct_gf_active; /* Not two pass */ if (cpi->pass != 2) { @@ -467,7 +467,7 @@ static void calc_gf_params(VP8_COMP *cpi) { /* Adjust boost based upon ambient Q */ Boost = GFQ_ADJUSTMENT; - /* Adjust based upon most recently measure intra useage */ + /* Adjust based upon most recently measure intra usage */ Boost = Boost * gf_intra_usage_adjustment[(cpi->this_frame_percent_intra < 15) ? cpi->this_frame_percent_intra @@ -475,7 +475,7 @@ static void calc_gf_params(VP8_COMP *cpi) { 100; /* Adjust gf boost based upon GF usage since last GF */ - Boost = Boost * gf_adjust_table[gf_frame_useage] / 100; + Boost = Boost * gf_adjust_table[gf_frame_usage] / 100; #endif } @@ -516,8 +516,8 @@ static void calc_gf_params(VP8_COMP *cpi) { if (cpi->last_boost >= 1500) cpi->frames_till_gf_update_due++; - if (gf_interval_table[gf_frame_useage] > cpi->frames_till_gf_update_due) { - cpi->frames_till_gf_update_due = gf_interval_table[gf_frame_useage]; + if (gf_interval_table[gf_frame_usage] > cpi->frames_till_gf_update_due) { + cpi->frames_till_gf_update_due = gf_interval_table[gf_frame_usage]; } if (cpi->frames_till_gf_update_due > cpi->max_gf_interval) { @@ -895,7 +895,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { int Q = (cpi->oxcf.fixed_q < 0) ? 
cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q; - int gf_frame_useage = 0; /* Golden frame useage since last GF */ + int gf_frame_usage = 0; /* Golden frame usage since last GF */ int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] + cpi->recent_ref_frame_usage[LAST_FRAME] + cpi->recent_ref_frame_usage[GOLDEN_FRAME] + @@ -905,20 +905,20 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { (cpi->common.mb_rows * cpi->common.mb_cols); if (tot_mbs) { - gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + - cpi->recent_ref_frame_usage[ALTREF_FRAME]) * - 100 / tot_mbs; + gf_frame_usage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + + cpi->recent_ref_frame_usage[ALTREF_FRAME]) * + 100 / tot_mbs; } - if (pct_gf_active > gf_frame_useage) gf_frame_useage = pct_gf_active; + if (pct_gf_active > gf_frame_usage) gf_frame_usage = pct_gf_active; /* Is a fixed manual GF frequency being used */ if (cpi->auto_gold) { - /* For one pass throw a GF if recent frame intra useage is - * low or the GF useage is high + /* For one pass throw a GF if recent frame intra usage is + * low or the GF usage is high */ if ((cpi->pass == 0) && - (cpi->this_frame_percent_intra < 15 || gf_frame_useage >= 5)) { + (cpi->this_frame_percent_intra < 15 || gf_frame_usage >= 5)) { cpi->common.refresh_golden_frame = 1; /* Two pass GF descision */ @@ -933,10 +933,10 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { if (0) { FILE *f; - f = fopen("gf_useaget.stt", "a"); + f = fopen("gf_usaget.stt", "a"); fprintf(f, " %8ld %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->gfu_boost, - GFQ_ADJUSTMENT, cpi->gfu_boost, gf_frame_useage); + GFQ_ADJUSTMENT, cpi->gfu_boost, gf_frame_usage); fclose(f); } diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index bbddacf8f0..7cd42d107c 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -1979,7 +1979,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, rd.distortion2 += distortion; /* If even the 'Y' rd value of split is higher than best so far - * then dont bother looking at UV + * then don't bother looking at UV */ if (tmp_rd < best_mode.yrd) { /* Now work out UV cost and add it in */ diff --git a/vp8/encoder/segmentation.c b/vp8/encoder/segmentation.c index dcb68119e1..2127258111 100644 --- a/vp8/encoder/segmentation.c +++ b/vp8/encoder/segmentation.c @@ -11,7 +11,7 @@ #include "segmentation.h" #include "vpx_mem/vpx_mem.h" -void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x) { +void vp8_update_gf_usage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x) { int mb_row, mb_col; MODE_INFO *this_mb_mode_info = cm->mi; @@ -19,7 +19,7 @@ void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x) { x->gf_active_ptr = (signed char *)cpi->gf_active_flags; if ((cm->frame_type == KEY_FRAME) || (cm->refresh_golden_frame)) { - /* Reset Gf useage monitors */ + /* Reset Gf usage monitors */ memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols)); cpi->gf_active_count = cm->mb_rows * cm->mb_cols; } else { diff --git a/vp8/encoder/segmentation.h b/vp8/encoder/segmentation.h index 4ddbdbbd26..0fecfc2212 100644 --- a/vp8/encoder/segmentation.h +++ b/vp8/encoder/segmentation.h @@ -19,8 +19,8 @@ extern "C" { #endif -extern void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, - MACROBLOCK *x); +extern void vp8_update_gf_usage_maps(VP8_COMP *cpi, VP8_COMMON *cm, + MACROBLOCK *x); #ifdef __cplusplus } // extern "C" diff --git a/vp8/encoder/vp8_quantize.c b/vp8/encoder/vp8_quantize.c index 
5b89555108..8e5e318241 100644 --- a/vp8/encoder/vp8_quantize.c +++ b/vp8/encoder/vp8_quantize.c @@ -294,7 +294,7 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip) { /* Select the baseline MB Q index. */ if (xd->segmentation_enabled) { /* Abs Value */ - if (xd->mb_segement_abs_delta == SEGMENT_ABSDATA) { + if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) { QIndex = xd->segment_feature_data[MB_LVL_ALT_Q] [xd->mode_info_context->mbmi.segment_id]; /* Delta Value */ diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index db3e746639..22b62e6a2d 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -708,7 +708,7 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, mi->mode = ZEROMV; if (bsize < BLOCK_8X8) { vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM, - "Invalid usage of segement feature on small blocks"); + "Invalid usage of segment feature on small blocks"); return; } } else { diff --git a/vp9/encoder/vp9_aq_complexity.c b/vp9/encoder/vp9_aq_complexity.c index bd3812036c..ef3423f8eb 100644 --- a/vp9/encoder/vp9_aq_complexity.c +++ b/vp9/encoder/vp9_aq_complexity.c @@ -87,7 +87,7 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) { &cpi->rc, cm->frame_type, cm->base_qindex, aq_c_q_adj_factor[aq_strength][segment], cm->bit_depth); - // For AQ complexity mode, we dont allow Q0 in a segment if the base + // For AQ complexity mode, we don't allow Q0 in a segment if the base // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment // Q delta is sometimes applied without going back around the rd loop. // This could lead to an illegal combination of partition size and q. diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 8fdd976816..d97bf2a1c9 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -1700,7 +1700,7 @@ void vp9_init_second_pass(VP9_COMP *cpi) { } // Second scan using clamps based on the previous cycle average. - // This may modify the total and average somewhat but we dont bother with + // This may modify the total and average somewhat but we don't bother with // further itterations. modified_score_total = 0.0; s = twopass->stats_in; @@ -1858,7 +1858,7 @@ static int detect_flash_from_frame_stats(const FIRSTPASS_STATS *frame_stats) { // brief break in prediction (such as a flash) but subsequent frames // are reasonably well predicted by an earlier (pre flash) frame. // The recovery after a flash is indicated by a high pcnt_second_ref - // useage or a second ref coded error notabley lower than the last + // usage or a second ref coded error notabley lower than the last // frame coded error. if (frame_stats == NULL) { return 0; @@ -2169,7 +2169,7 @@ static double calculate_group_score(VP9_COMP *cpi, double av_score, double score_total = 0.0; int i = 0; - // We dont ever want to return a 0 score here. + // We don't ever want to return a 0 score here. if (frame_count == 0) return 1.0; while ((i < frame_count) && (s < twopass->stats_in_end)) { @@ -2597,7 +2597,7 @@ static int get_gop_coding_frame_num( if ( // Don't break out with a very short interval. 
(gop_coding_frames >= active_gf_interval->min) && - // If possible dont break very close to a kf + // If possible don't break very close to a kf ((rc->frames_to_key - gop_coding_frames) >= rc->min_gf_interval) && (gop_coding_frames & 0x01) && (!flash_detected) && ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) || @@ -3031,7 +3031,7 @@ static int intra_step_transition(const FIRSTPASS_STATS *this_frame, next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error); // Return true the intra/inter ratio for the current frame is - // low but better in the next and previous frame and the relative useage of + // low but better in the next and previous frame and the relative usage of // intra in the current frame is markedly higher than the last and next frame. if ((this_ii_ratio < 2.0) && (last_ii_ratio > 2.25) && (next_ii_ratio > 2.25) && (this_pcnt_intra > (3 * last_pcnt_intra)) && @@ -3052,7 +3052,7 @@ static int intra_step_transition(const FIRSTPASS_STATS *this_frame, // Minimum % intra coding observed in first pass (1.0 = 100%) #define MIN_INTRA_LEVEL 0.25 // Threshold for use of the lagging second reference frame. Scene cuts do not -// usually have a high second ref useage. +// usually have a high second ref usage. #define SECOND_REF_USEAGE_THRESH 0.2 // Hard threshold where the first pass chooses intra for almost all blocks. // In such a case even if the frame is not a scene cut coding a key frame @@ -3391,7 +3391,7 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) { twopass->key_frame_section_intra_rating = calculate_section_intra_ratio( start_position, twopass->stats_in_end, rc->frames_to_key); - // Special case for static / slide show content but dont apply + // Special case for static / slide show content but don't apply // if the kf group is very short. if ((zero_motion_accumulator > 0.99) && (rc->frames_to_key > 8)) { rc->kf_boost = (int)(twopass->kf_max_total_boost); @@ -3523,7 +3523,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { vp9_init_vizier_params(twopass, screen_area); } - // If this is an arf frame then we dont want to read the stats file or + // If this is an arf frame then we don't want to read the stats file or // advance the input pointer as we already have what we need. 
if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { int target_rate; diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 13b43aa63a..4e5fdc6d90 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -1196,7 +1196,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, } else { q = rc->avg_frame_qindex[KEY_FRAME]; } - // For constrained quality dont allow Q less than the cq level + // For constrained quality don't allow Q less than the cq level if (oxcf->rc_mode == VPX_CQ) { if (q < cq_level) q = cq_level; @@ -1457,7 +1457,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, } else { q = active_worst_quality; } - // For constrained quality dont allow Q less than the cq level + // For constrained quality don't allow Q less than the cq level if (oxcf->rc_mode == VPX_CQ) { if (q < cq_level) q = cq_level; } diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 464705a678..c0d8b505fe 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -4671,7 +4671,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, if (tmp_best_rdu > 0) { // If even the 'Y' rd value of split is higher than best so far - // then dont bother looking at UV + // then don't bother looking at UV vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col, BLOCK_8X8); memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm)); if (!super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable, From ad14a32b33ac94e56fdd84ba06c1c9c9c032d004 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 7 Jun 2023 12:26:55 -0400 Subject: [PATCH 740/926] Fix more typos (1/n) Dont -> Don't setings -> settings thresold -> thresh thresold -> threshold becasue -> because itterations -> iterations its a -> it's a an constant -> a constant Bug: webm:1803 Change-Id: I1e019393939ed25c59c898c88d4941ec360b026d --- vp8/encoder/firstpass.c | 12 ++++++------ vp8/encoder/onyx_if.c | 6 +++--- vp8/encoder/onyx_int.h | 2 +- vp8/encoder/pickinter.c | 2 +- vp8/vp8_cx_iface.c | 4 ++-- vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c | 2 +- vp9/encoder/vp9_block.h | 2 +- vp9/encoder/vp9_firstpass.c | 4 ++-- vp9/encoder/vp9_ratectrl.c | 4 ++-- vp9/encoder/vp9_speed_features.h | 2 +- vp9/vp9_cx_iface.c | 4 ++-- 11 files changed, 22 insertions(+), 22 deletions(-) diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index 0abd178a61..f141e7d057 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -412,7 +412,7 @@ static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, int_mv ref_mv_full; int tmp_err; - int step_param = 3; /* Dont search over full range for first pass */ + int step_param = 3; /* Don't search over full range for first pass */ int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; int n; vp8_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16]; @@ -1717,9 +1717,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { /* Break at cpi->max_gf_interval unless almost totally static */ (i >= cpi->max_gf_interval && (decay_accumulator < 0.995)) || ( - /* Dont break out with a very short interval */ + /* Don't break out with a very short interval */ (i > MIN_GF_INTERVAL) && - /* Dont break out very close to a key frame */ + /* Don't break out very close to a key frame */ ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) && ((boost_score > 20.0) || (next_frame.pcnt_inter < 0.75)) && (!flash_detected) && @@ -1765,7 +1765,7 @@ static void 
define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { if (boost_score > max_boost) boost_score = max_boost; } - /* Dont allow conventional gf too near the next kf */ + /* Don't allow conventional gf too near the next kf */ if ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL) { while (i < cpi->twopass.frames_to_key) { i++; @@ -2082,7 +2082,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { } } - /* Dont allow a negative value for gf_bits */ + /* Don't allow a negative value for gf_bits */ if (gf_bits < 0) gf_bits = 0; /* Add in minimum for a frame */ @@ -3011,7 +3011,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { bits_per_frame = (double)(cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key); - /* Dont turn to resampling in easy sections just because they + /* Don't turn to resampling in easy sections just because they * have been assigned a small number of bits */ if (bits_per_frame < av_bits_per_frame) { diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index e78743e496..b189632757 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -4484,7 +4484,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, * size within range) then use the last frame value - 1. The -1 * is designed to stop Q and hence the data rate, from * progressively falling away during difficult sections, but at - * the same time reduce the number of itterations around the + * the same time reduce the number of iterations around the * recode loop. */ if (Q > cpi->ni_av_qi) cpi->ni_av_qi = Q - 1; @@ -4731,7 +4731,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, cpi->mb.e_mbd.update_mb_segmentation_data = 0; cpi->mb.e_mbd.mode_ref_lf_delta_update = 0; - /* Dont increment frame counters if this was an altref buffer update + /* Don't increment frame counters if this was an altref buffer update * not a real frame */ if (cm->show_frame) { @@ -5109,7 +5109,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, if (cm->refresh_last_frame) memcpy(&cpi->lfc_n, &cm->fc, sizeof(cm->fc)); - /* if its a dropped frame honor the requests on subsequent frames */ + /* if it's a dropped frame honor the requests on subsequent frames */ if (*size > 0) { cpi->droppable = !frame_is_reference(cpi); diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index bde5c2f69b..4304f054ca 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -360,7 +360,7 @@ typedef struct VP8_COMP { /* GF interval chosen when we coded the last GF */ int current_gf_interval; - /* Total bits overspent becasue of GF boost (cumulative) */ + /* Total bits overspent because of GF boost (cumulative) */ int gf_overspend_bits; /* Used in the few frames following a GF to recover the extra bits diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c index 04f68c3245..1af8a2f9b2 100644 --- a/vp8/encoder/pickinter.c +++ b/vp8/encoder/pickinter.c @@ -1103,7 +1103,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, #if CONFIG_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity) { /* Store for later use by denoiser. */ - // Dont' denoise with GOLDEN OR ALTREF is they are old reference + // Don't denoise with GOLDEN OR ALTREF is they are old reference // frames (greater than MAX_GF_ARF_DENOISE_RANGE frames in past). 
int skip_old_reference = ((this_ref_frame != LAST_FRAME) && (cpi->common.current_video_frame - diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 0821eef026..8950de0d8a 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -1292,8 +1292,8 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = { 0, /* rc_resize_allowed */ 1, /* rc_scaled_width */ 1, /* rc_scaled_height */ - 60, /* rc_resize_down_thresold */ - 30, /* rc_resize_up_thresold */ + 60, /* rc_resize_down_thresh */ + 30, /* rc_resize_up_thresh */ VPX_VBR, /* rc_end_usage */ { NULL, 0 }, /* rc_twopass_stats_in */ diff --git a/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c b/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c index aeb7e49c10..b43d7fa4f9 100644 --- a/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c +++ b/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c @@ -18,7 +18,7 @@ #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/inv_txfm.h" -// Use macros to make sure argument lane is passed in as an constant integer. +// Use macros to make sure argument lane is passed in as a constant integer. #define vmull_lane_s32_dual(in, c, lane, out) \ do { \ diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 4d336f2a42..f4653a82fb 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -34,7 +34,7 @@ struct macroblock_plane { uint16_t *eobs; struct buf_2d src; - // Quantizer setings + // Quantizer settings int16_t *round_fp; int16_t *quant_fp; int16_t *quant; diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index d97bf2a1c9..e4c8a0e4a1 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -1448,7 +1448,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { first_pass_stat_calc(cpi, &fps, &(first_tile_col->fp_data)); } - // Dont allow a value of 0 for duration. + // Don't allow a value of 0 for duration. // (Section duration is also defaulted to minimum of 1.0). fps.duration = VPXMAX(1.0, (double)(source->ts_end - source->ts_start)); @@ -1701,7 +1701,7 @@ void vp9_init_second_pass(VP9_COMP *cpi) { // Second scan using clamps based on the previous cycle average. // This may modify the total and average somewhat but we don't bother with - // further itterations. + // further iterations. modified_score_total = 0.0; s = twopass->stats_in; while (s < twopass->stats_in_end) { diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 4e5fdc6d90..c32745b4f8 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -1359,7 +1359,7 @@ static void pick_kf_q_bound_two_pass(const VP9_COMP *cpi, int *bottom_index, active_best_quality /= 4; } - // Dont allow the active min to be lossless (q0) unlesss the max q + // Don't allow the active min to be lossless (q0) unlesss the max q // already indicates lossless. active_best_quality = VPXMIN(active_worst_quality, VPXMAX(1, active_best_quality)); @@ -2693,7 +2693,7 @@ static void vbr_rate_correction(VP9_COMP *cpi, int *this_frame_target) { } // Fast redistribution of bits arising from massive local undershoot. - // Dont do it for kf,arf,gf or overlay frames. + // Don't do it for kf,arf,gf or overlay frames. 
if (!frame_is_kf_gf_arf(cpi) && !rc->is_src_frame_alt_ref && rc->vbr_bits_off_target_fast) { int one_frame_bits = VPXMAX(rc->avg_frame_bandwidth, *this_frame_target); diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index 70c61fe00d..941de639ac 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -643,7 +643,7 @@ typedef struct SPEED_FEATURES { // Use machine learning based partition search. int nonrd_use_ml_partition; - // Multiplier for base thresold for variance partitioning. + // Multiplier for base threshold for variance partitioning. int variance_part_thresh_mult; // Force subpel motion filter to always use SMOOTH_FILTER. diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 409069b4ed..5873f30e89 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -2084,8 +2084,8 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { 0, // rc_resize_allowed 0, // rc_scaled_width 0, // rc_scaled_height - 60, // rc_resize_down_thresold - 30, // rc_resize_up_thresold + 60, // rc_resize_down_thresh + 30, // rc_resize_up_thresh VPX_VBR, // rc_end_usage { NULL, 0 }, // rc_twopass_stats_in From ffb93451095809b710576194c374c84c1d36d4cd Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 7 Jun 2023 12:31:38 -0400 Subject: [PATCH 741/926] Fix more typos (2/n) kernal -> kernel e.g -> e.g. paritioning -> partitioning partioning -> partitioning coefficents -> coefficients i.e, -> i.e., equivalend -> equivalent recive -> receive resoultions -> resolutions Bug: webm:1803 Change-Id: I1d6176202ee5daee7a64bf59114e8b304aeb4db7 --- vp8/encoder/onyx_if.c | 2 +- vp9/encoder/vp9_block.h | 6 ++-- vp9/encoder/vp9_denoiser.c | 2 +- vp9/encoder/vp9_encodeframe.c | 26 ++++++++--------- vp9/encoder/vp9_encoder.c | 48 ++++++++++++++++---------------- vp9/encoder/vp9_firstpass.c | 28 +++++++++---------- vp9/encoder/vp9_noise_estimate.c | 2 +- vp9/simple_encode.h | 2 +- vpx/vp8cx.h | 4 +-- 9 files changed, 60 insertions(+), 60 deletions(-) diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index b189632757..c65afc643b 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -2251,7 +2251,7 @@ void vp8_remove_compressor(VP8_COMP **comp) { #if 0 { printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000); - printf("\n_frames recive_data encod_mb_row compress_frame Total\n"); + printf("\n_frames receive_data encod_mb_row compress_frame Total\n"); printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->time_receive_data / 1000, cpi->time_encode_mb_row / 1000, cpi->time_compress_data / 1000, (cpi->time_receive_data + cpi->time_compress_data) / 1000); } #endif diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index f4653a82fb..7fa00cd194 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -85,10 +85,10 @@ struct macroblock { // The equivalent error at the current rdmult of one whole bit (not one // bitcost unit). int errorperbit; - // The equivalend SAD error of one (whole) bit at the current quantizer + // The equivalent SAD error of one (whole) bit at the current quantizer // for large blocks. int sadperbit16; - // The equivalend SAD error of one (whole) bit at the current quantizer + // The equivalent SAD error of one (whole) bit at the current quantizer // for sub-8x8 blocks. int sadperbit4; int rddiv; @@ -128,7 +128,7 @@ struct macroblock { // from extending outside the UMV borders MvLimits mv_limits; - // Notes transform blocks where no coefficents are coded. 
+ // Notes transform blocks where no coefficients are coded. // Set during mode selection. Read during block encoding. uint8_t zcoeff_blk[TX_SIZES][256]; diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c index baea8ebb3c..b40d5c6154 100644 --- a/vp9/encoder/vp9_denoiser.c +++ b/vp9/encoder/vp9_denoiser.c @@ -387,7 +387,7 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index], consec_zeromv); // No need to keep checking 8x8 blocks if any of the sub-blocks // has small consec_zeromv (since threshold for no_skin based on - // zero/small motion in skin detection is high, i.e, > 4). + // zero/small motion in skin detection is high, i.e., > 4). if (consec_zeromv < 4) { i = ymis; break; diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index a979ae1c93..2381c73687 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -1706,7 +1706,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, const int y16_idx = ((j >> 1) << 1); // For inter frames: if variance4x4downsample[] == 1 for this 16x16 // block, then the variance is based on 4x4 down-sampling, so use vt2 - // in set_vt_partioning(), otherwise use vt. + // in set_vt_partitioning(), otherwise use vt. v16x16 *vtemp = (!is_key_frame && variance4x4downsample[i2 + j] == 1) ? &vt2[i2 + j] : &vt.split[i].split[j]; @@ -3470,11 +3470,11 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x, // Features used: QP; spatial block size contexts; variance of prediction // residue after simple_motion_search. #define FEATURES 12 -static void ml_predict_var_rd_paritioning(const VP9_COMP *const cpi, - MACROBLOCK *const x, - PC_TREE *const pc_tree, - BLOCK_SIZE bsize, int mi_row, - int mi_col, int *none, int *split) { +static void ml_predict_var_rd_partitioning(const VP9_COMP *const cpi, + MACROBLOCK *const x, + PC_TREE *const pc_tree, + BLOCK_SIZE bsize, int mi_row, + int mi_col, int *none, int *split) { const VP9_COMMON *const cm = &cpi->common; const NN_CONFIG *nn_config = NULL; const MACROBLOCKD *const xd = &x->e_mbd; @@ -4092,8 +4092,8 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, mi_row + num_8x8_blocks_high_lookup[bsize] <= cm->mi_rows && mi_col + num_8x8_blocks_wide_lookup[bsize] <= cm->mi_cols; if (do_rd_ml_partition_var_pruning) { - ml_predict_var_rd_paritioning(cpi, x, pc_tree, bsize, mi_row, mi_col, - &partition_none_allowed, &do_split); + ml_predict_var_rd_partitioning(cpi, x, pc_tree, bsize, mi_row, mi_col, + &partition_none_allowed, &do_split); } else { vp9_zero(pc_tree->mv); } @@ -4820,9 +4820,9 @@ static void pred_pixel_ready_reset(PC_TREE *pc_tree, BLOCK_SIZE bsize) { #define FEATURES 6 #define LABELS 2 -static int ml_predict_var_paritioning(VP9_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE bsize, int mi_row, - int mi_col) { +static int ml_predict_var_partitioning(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, + int mi_col) { VP9_COMMON *const cm = &cpi->common; const NN_CONFIG *nn_config = NULL; @@ -4954,7 +4954,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, if (partition_none_allowed || do_split) do_rect = 0; if (partition_none_allowed && do_split) { const int ml_predicted_partition = - ml_predict_var_paritioning(cpi, x, bsize, mi_row, mi_col); + ml_predict_var_partitioning(cpi, x, bsize, mi_row, mi_col); if (ml_predicted_partition == PARTITION_NONE) do_split = 0; if (ml_predicted_partition == 
PARTITION_SPLIT) partition_none_allowed = 0; } @@ -5633,7 +5633,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, if ((cpi->oxcf.rc_mode == VPX_VBR && cpi->rc.high_source_sad && cpi->oxcf.speed < 6 && !frame_is_intra_only(cm) && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) { - // Use lower max_partition_size for low resoultions. + // Use lower max_partition_size for low resolutions. if (cm->width <= 352 && cm->height <= 288) x->max_partition_size = BLOCK_32X32; else diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 9d5c0030a2..d03d87a8a1 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -139,7 +139,7 @@ static int compute_context_model_thresh(const VP9_COMP *const cpi) { // frame context probability model is less than a certain threshold. // The first component is the most critical part to guarantee adaptivity. // Other parameters are estimated based on normal setting of hd resolution - // parameters. e.g frame_size = 1920x1080, bitrate = 8000, qindex_factor < 50 + // parameters. e.g. frame_size = 1920x1080, bitrate = 8000, qindex_factor < 50 const int thresh = ((FRAME_SIZE_FACTOR * frame_size - FRAME_RATE_FACTOR * bitrate) * qindex_factor) >> @@ -2836,7 +2836,7 @@ void vp9_remove_compressor(VP9_COMP *cpi) { #if 0 { printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000); - printf("\n_frames recive_data encod_mb_row compress_frame Total\n"); + printf("\n_frames receive_data encod_mb_row compress_frame Total\n"); printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->time_receive_data / 1000, cpi->time_encode_sb_row / 1000, cpi->time_compress_data / 1000, @@ -5020,8 +5020,8 @@ static int setup_interp_filter_search_mask(VP9_COMP *cpi) { #ifdef ENABLE_KF_DENOISE // Baseline kernel weights for denoise -static uint8_t dn_kernal_3[9] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 }; -static uint8_t dn_kernal_5[25] = { 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 4, +static uint8_t dn_kernel_3[9] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 }; +static uint8_t dn_kernel_5[25] = { 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 4, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1 }; static INLINE void add_denoise_point(int centre_val, int data_val, int thresh, @@ -5038,17 +5038,17 @@ static void spatial_denoise_point(uint8_t *src_ptr, const int stride, int sum_weight = 0; int sum_val = 0; int thresh = strength; - int kernal_size = 5; + int kernel_size = 5; int half_k_size = 2; int i, j; int max_diff = 0; uint8_t *tmp_ptr; - uint8_t *kernal_ptr; + uint8_t *kernel_ptr; // Find the maximum deviation from the source point in the locale. tmp_ptr = src_ptr - (stride * (half_k_size + 1)) - (half_k_size + 1); - for (i = 0; i < kernal_size + 2; ++i) { - for (j = 0; j < kernal_size + 2; ++j) { + for (i = 0; i < kernel_size + 2; ++i) { + for (j = 0; j < kernel_size + 2; ++j) { max_diff = VPXMAX(max_diff, abs((int)*src_ptr - (int)tmp_ptr[j])); } tmp_ptr += stride; @@ -5056,19 +5056,19 @@ static void spatial_denoise_point(uint8_t *src_ptr, const int stride, // Select the kernel size. if (max_diff > (strength + (strength >> 1))) { - kernal_size = 3; + kernel_size = 3; half_k_size = 1; thresh = thresh >> 1; } - kernal_ptr = (kernal_size == 3) ? dn_kernal_3 : dn_kernal_5; + kernel_ptr = (kernel_size == 3) ? 
dn_kernel_3 : dn_kernel_5; // Apply the kernel tmp_ptr = src_ptr - (stride * half_k_size) - half_k_size; - for (i = 0; i < kernal_size; ++i) { - for (j = 0; j < kernal_size; ++j) { - add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernal_ptr, + for (i = 0; i < kernel_size; ++i) { + for (j = 0; j < kernel_size; ++j) { + add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernel_ptr, &sum_val, &sum_weight); - ++kernal_ptr; + ++kernel_ptr; } tmp_ptr += stride; } @@ -5083,17 +5083,17 @@ static void highbd_spatial_denoise_point(uint16_t *src_ptr, const int stride, int sum_weight = 0; int sum_val = 0; int thresh = strength; - int kernal_size = 5; + int kernel_size = 5; int half_k_size = 2; int i, j; int max_diff = 0; uint16_t *tmp_ptr; - uint8_t *kernal_ptr; + uint8_t *kernel_ptr; // Find the maximum deviation from the source point in the locale. tmp_ptr = src_ptr - (stride * (half_k_size + 1)) - (half_k_size + 1); - for (i = 0; i < kernal_size + 2; ++i) { - for (j = 0; j < kernal_size + 2; ++j) { + for (i = 0; i < kernel_size + 2; ++i) { + for (j = 0; j < kernel_size + 2; ++j) { max_diff = VPXMAX(max_diff, abs((int)src_ptr - (int)tmp_ptr[j])); } tmp_ptr += stride; @@ -5101,19 +5101,19 @@ static void highbd_spatial_denoise_point(uint16_t *src_ptr, const int stride, // Select the kernel size. if (max_diff > (strength + (strength >> 1))) { - kernal_size = 3; + kernel_size = 3; half_k_size = 1; thresh = thresh >> 1; } - kernal_ptr = (kernal_size == 3) ? dn_kernal_3 : dn_kernal_5; + kernel_ptr = (kernel_size == 3) ? dn_kernel_3 : dn_kernel_5; // Apply the kernel tmp_ptr = src_ptr - (stride * half_k_size) - half_k_size; - for (i = 0; i < kernal_size; ++i) { - for (j = 0; j < kernal_size; ++j) { - add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernal_ptr, + for (i = 0; i < kernel_size; ++i) { + for (j = 0; j < kernel_size; ++j) { + add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernel_ptr, &sum_val, &sum_weight); - ++kernal_ptr; + ++kernel_ptr; } tmp_ptr += stride; } diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index e4c8a0e4a1..567080a2dd 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -607,10 +607,10 @@ static int get_smooth_intra_threshold(VP9_COMMON *cm) { #define KERNEL_SIZE 3 // Baseline Kernal weights for first pass noise metric -static uint8_t fp_dn_kernal_3[KERNEL_SIZE * KERNEL_SIZE] = { 1, 2, 1, 2, 4, +static uint8_t fp_dn_kernel_3[KERNEL_SIZE * KERNEL_SIZE] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 }; -// Estimate noise at a single point based on the impace of a spatial kernal +// Estimate noise at a single point based on the impace of a spatial kernel // on the point value static int fp_estimate_point_noise(uint8_t *src_ptr, const int stride) { int sum_weight = 0; @@ -620,23 +620,23 @@ static int fp_estimate_point_noise(uint8_t *src_ptr, const int stride) { int diff; int dn_diff; uint8_t *tmp_ptr; - uint8_t *kernal_ptr; + uint8_t *kernel_ptr; uint8_t dn_val; uint8_t centre_val = *src_ptr; - kernal_ptr = fp_dn_kernal_3; + kernel_ptr = fp_dn_kernel_3; - // Apply the kernal + // Apply the kernel tmp_ptr = src_ptr - stride - 1; for (i = 0; i < KERNEL_SIZE; ++i) { for (j = 0; j < KERNEL_SIZE; ++j) { diff = abs((int)centre_val - (int)tmp_ptr[j]); max_diff = VPXMAX(max_diff, diff); if (diff <= FP_DN_THRESH) { - sum_weight += *kernal_ptr; - sum_val += (int)tmp_ptr[j] * (int)*kernal_ptr; + sum_weight += *kernel_ptr; + sum_val += (int)tmp_ptr[j] * (int)*kernel_ptr; } - ++kernal_ptr; + ++kernel_ptr; } tmp_ptr += 
stride; } @@ -662,13 +662,13 @@ static int fp_highbd_estimate_point_noise(uint8_t *src_ptr, const int stride) { int dn_diff; uint8_t *tmp_ptr; uint16_t *tmp_ptr16; - uint8_t *kernal_ptr; + uint8_t *kernel_ptr; uint16_t dn_val; uint16_t centre_val = *CONVERT_TO_SHORTPTR(src_ptr); - kernal_ptr = fp_dn_kernal_3; + kernel_ptr = fp_dn_kernel_3; - // Apply the kernal + // Apply the kernel tmp_ptr = src_ptr - stride - 1; for (i = 0; i < KERNEL_SIZE; ++i) { tmp_ptr16 = CONVERT_TO_SHORTPTR(tmp_ptr); @@ -676,10 +676,10 @@ static int fp_highbd_estimate_point_noise(uint8_t *src_ptr, const int stride) { diff = abs((int)centre_val - (int)tmp_ptr16[j]); max_diff = VPXMAX(max_diff, diff); if (diff <= FP_DN_THRESH) { - sum_weight += *kernal_ptr; - sum_val += (int)tmp_ptr16[j] * (int)*kernal_ptr; + sum_weight += *kernel_ptr; + sum_val += (int)tmp_ptr16[j] * (int)*kernel_ptr; } - ++kernal_ptr; + ++kernel_ptr; } tmp_ptr += stride; } diff --git a/vp9/encoder/vp9_noise_estimate.c b/vp9/encoder/vp9_noise_estimate.c index 9696529c50..4ee6e51ba8 100644 --- a/vp9/encoder/vp9_noise_estimate.c +++ b/vp9/encoder/vp9_noise_estimate.c @@ -202,7 +202,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { VPXMIN(cpi->consec_zero_mv[bl_index1], VPXMIN(cpi->consec_zero_mv[bl_index2], cpi->consec_zero_mv[bl_index3]))); - // Only consider blocks that are likely steady background. i.e, have + // Only consider blocks that are likely steady background. i.e., have // been encoded as zero/low motion x (= thresh_consec_zeromv) frames // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all // 4 sub-blocks for 16x16 block. And exclude this frame if diff --git a/vp9/simple_encode.h b/vp9/simple_encode.h index 7920e95ee9..d610a5e159 100644 --- a/vp9/simple_encode.h +++ b/vp9/simple_encode.h @@ -309,7 +309,7 @@ struct EncodeFrameResult { // The tpl stats stored in the vector is according to the encoding order. // For example, suppose there are N show frames for the current GOP. // Then tpl_stats_info[0] stores the information of the first frame to be - // encoded for this GOP, i.e, the AltRef frame. + // encoded for this GOP, i.e., the AltRef frame. std::vector tpl_stats_info; ImageBuffer coded_frame; diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index c4e04084c8..3c0278c848 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -302,7 +302,7 @@ enum vp8e_enc_control_id { * the feature is off, i.e., no golden frame boost in CBR mode and * average bitrate target is used. * - * For example, to allow 100% more bits, i.e, 2X, in a golden frame + * For example, to allow 100% more bits, i.e., 2X, in a golden frame * than average frame, set this to 100. * * Supported in codecs: VP9 @@ -598,7 +598,7 @@ enum vp8e_enc_control_id { * the feature is off, i.e., no golden frame boost in CBR mode and * average bitrate target is used. * - * For example, to allow 100% more bits, i.e, 2X, in a golden frame + * For example, to allow 100% more bits, i.e., 2X, in a golden frame * than average frame, set this to 100. 
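 * An illustrative call, assuming this block documents the VP8 variant of
 * the control (VP8E_SET_GF_CBR_BOOST_PCT) and that `encoder` is an
 * already-initialized vpx_codec_ctx_t:
 *
 *   vpx_codec_control(&encoder, VP8E_SET_GF_CBR_BOOST_PCT, 100);
 *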
* * Supported in codecs: VP8 From 6a8eb04feccc05dba9b42c0f95405055183a798c Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 7 Jun 2023 12:36:31 -0400 Subject: [PATCH 742/926] Fix more typos (3/n) Propogation -> Propagation propogate -> propagate cant -> can't upto -> up to canddiates -> candidates refernce -> reference USEAGE -> USAGE Change-Id: Iadaf2dffd86b54e04411910f667e8c2dfc6c4c77 --- vp8/encoder/denoising.c | 2 +- vp8/encoder/firstpass.c | 2 +- vp8/encoder/rdopt.c | 2 +- vp9/decoder/vp9_decodemv.c | 2 +- vp9/encoder/vp9_encoder.h | 2 +- vp9/encoder/vp9_firstpass.c | 12 ++++++------ vp9/encoder/vp9_mcomp.c | 6 +++--- vp9/encoder/vp9_rdopt.c | 2 +- vp9/encoder/vp9_speed_features.c | 2 +- 9 files changed, 16 insertions(+), 16 deletions(-) diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c index e54d1e9f4b..a666bca4d2 100644 --- a/vp8/encoder/denoising.c +++ b/vp8/encoder/denoising.c @@ -135,7 +135,7 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, // When adopting aggressive denoiser, the adj_val for each pixel // could be at most 8 (this is current max adjustment of the map). // In SSE code, we calculate the sum of adj_val for - // the columns, so the sum could be upto 128(16 rows). However, + // the columns, so the sum could be up to 128(16 rows). However, // the range of the value is -128 ~ 127 in SSE code, that's why // we do this change in C code. // We don't do this for UV denoiser, since there are only 8 rows, diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index f141e7d057..2b88a88c80 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -1786,7 +1786,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { alt_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost); #endif - /* Should we use the alternate refernce frame */ + /* Should we use the alternate reference frame */ if (allow_alt_ref && (i >= MIN_GF_INTERVAL) && /* don't use ARF very near next kf */ (i <= (cpi->twopass.frames_to_key - MIN_GF_INTERVAL)) && diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index 7cd42d107c..5d539ef30c 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -1021,7 +1021,7 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, BLOCK *c; BLOCKD *e; - /* Is the best so far sufficiently good that we cant justify + /* Is the best so far sufficiently good that we can't justify * doing a new motion search. 
*/ if (best_label_rd < label_mv_thresh) break; diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index 22b62e6a2d..7b524fa2a8 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -299,7 +299,7 @@ static REFERENCE_MODE read_block_reference_mode(VP9_COMMON *cm, } } -// Read the referncence frame +// Read the reference frame static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, vpx_reader *r, int segment_id, MV_REFERENCE_FRAME ref_frame[2]) { diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 2e0c4db9ed..f8a27872df 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -846,7 +846,7 @@ typedef struct VP9_COMP { uint8_t *skin_map; - // segment threashold for encode breakout + // segment threshold for encode breakout int segment_encode_breakout[MAX_SEGMENTS]; CYCLIC_REFRESH *cyclic_refresh; diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 567080a2dd..d726bc15f4 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -606,7 +606,7 @@ static int get_smooth_intra_threshold(VP9_COMMON *cm) { #define FP_MAX_DN_THRESH 24 #define KERNEL_SIZE 3 -// Baseline Kernal weights for first pass noise metric +// Baseline Kernel weights for first pass noise metric static uint8_t fp_dn_kernel_3[KERNEL_SIZE * KERNEL_SIZE] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 }; @@ -1458,7 +1458,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { accumulate_stats(&twopass->total_stats, &fps); } - // Copy the previous Last Frame back into gf and and arf buffers if + // Copy the previous Last Frame back into gf and arf buffers if // the prediction is good enough... but also don't allow it to lag too far. if ((twopass->sr_update_lag > 3) || ((cm->current_video_frame > 0) && @@ -1675,7 +1675,7 @@ void vp9_init_second_pass(VP9_COMP *cpi) { // Scan the first pass file and calculate a modified score for each // frame that is used to distribute bits. The modified score is assumed - // to provide a linear basis for bit allocation. I.e a frame A with a score + // to provide a linear basis for bit allocation. I.e., a frame A with a score // that is double that of frame B will be allocated 2x as many bits. { double modified_score_total = 0.0; @@ -3053,7 +3053,7 @@ static int intra_step_transition(const FIRSTPASS_STATS *this_frame, #define MIN_INTRA_LEVEL 0.25 // Threshold for use of the lagging second reference frame. Scene cuts do not // usually have a high second ref usage. -#define SECOND_REF_USEAGE_THRESH 0.2 +#define SECOND_REF_USAGE_THRESH 0.2 // Hard threshold where the first pass chooses intra for almost all blocks. // In such a case even if the frame is not a scene cut coding a key frame // may be a good option. 
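[Editor's note] The kernal -> kernel renames above touch libvpx's first-pass point-noise metric. For readers skimming the patch, here is a minimal self-contained sketch of the idea, reconstructed from the hunks shown here; the closing average/return step and the FP_DN_THRESH value are assumptions, not quoted source:

#include <stdlib.h> /* abs() */
#define KERNEL_SIZE 3
#define FP_DN_THRESH 8      /* assumed; only the "diff <= FP_DN_THRESH" test is shown above */
#define FP_MAX_DN_THRESH 24 /* as defined in the diff above */

/* Baseline kernel weights, as in fp_dn_kernel_3 above. */
static const unsigned char kKernel3[KERNEL_SIZE * KERNEL_SIZE] = { 1, 2, 1,
                                                                   2, 4, 2,
                                                                   1, 2, 1 };

/* src must point at an interior pixel so the 3x3 window stays in bounds. */
static int fp_estimate_point_noise_sketch(const unsigned char *src,
                                          int stride) {
  const int centre = src[0];
  const unsigned char *row = src - stride - 1; /* top-left of the 3x3 window */
  int sum_weight = 0, sum_val = 0, max_diff = 0;
  int i, j, k = 0, dn_val;
  for (i = 0; i < KERNEL_SIZE; ++i, row += stride) {
    for (j = 0; j < KERNEL_SIZE; ++j, ++k) {
      const int diff = abs(centre - row[j]);
      if (diff > max_diff) max_diff = diff;
      if (diff <= FP_DN_THRESH) { /* only blend neighbours close to the centre */
        sum_weight += kKernel3[k];
        sum_val += row[j] * kKernel3[k];
      }
    }
  }
  /* If the window is too busy, treat the point as edge detail, not noise. */
  if (max_diff >= FP_MAX_DN_THRESH || sum_weight == 0) return 0;
  dn_val = (sum_val + (sum_weight >> 1)) / sum_weight; /* weighted average */
  return abs(centre - dn_val); /* deviation from the denoised value */
}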
@@ -3083,7 +3083,7 @@ static int test_candidate_kf(const FIRST_PASS_INFO *first_pass_info, detect_flash_from_frame_stats(next_frame); if (!detect_flash_from_frame_stats(this_frame) && !detect_flash_from_frame_stats(next_frame) && - (this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) && + (this_frame->pcnt_second_ref < SECOND_REF_USAGE_THRESH) && ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) || (slide_transition(this_frame, last_frame, next_frame)) || (intra_step_transition(this_frame, last_frame, next_frame)) || @@ -3361,7 +3361,7 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) { // The second (lagging) ref error is not valid immediately after // a key frame because either the lag has not built up (in the case of - // the first key frame or it points to a refernce before the new key + // the first key frame or it points to a reference before the new key // frame. if (i < 2) sr_accumulator = 0.0; frame_boost = diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 0ea0f85e42..cbe1c40290 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -953,7 +953,7 @@ static INLINE int is_mv_in(const MvLimits *mv_limits, const MV *mv) { } #define MAX_PATTERN_SCALES 11 -#define MAX_PATTERN_CANDIDATES 8 // max number of canddiates per scale +#define MAX_PATTERN_CANDIDATES 8 // max number of candidates per scale #define PATTERN_CANDIDATES_REF 3 // number of refinement candidates // Calculate and return a sad+mvcost list around an integer best pel. @@ -1034,7 +1034,7 @@ static int vp9_pattern_search( in_what->stride) + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - // Search all possible scales upto the search param around the center point + // Search all possible scales up to the search param around the center point // pick the scale of the point that is best as the starting scale of // further steps around it. if (do_init_search) { @@ -1208,7 +1208,7 @@ static int vp9_pattern_search_sad( in_what->stride) + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - // Search all possible scales upto the search param around the center point + // Search all possible scales up to the search param around the center point // pick the scale of the point that is best as the starting scale of // further steps around it. if (do_init_search) { diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index c0d8b505fe..9454802a5e 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -2183,7 +2183,7 @@ static int64_t rd_pick_best_sub8x8_mode( int cost_list[5]; const MvLimits tmp_mv_limits = x->mv_limits; - /* Is the best so far sufficiently good that we cant justify doing + /* Is the best so far sufficiently good that we can't justify doing * and new motion search. */ if (best_rd < label_mv_thresh) break; diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 48c21c581e..4a7172118c 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -42,7 +42,7 @@ static int frame_is_boosted(const VP9_COMP *cpi) { // Sets a partition size down to which the auto partition code will always // search (can go lower), based on the image dimensions. The logic here // is that the extent to which ringing artefacts are offensive, depends -// partly on the screen area that over which they propogate. Propogation is +// partly on the screen area that over which they propagate. 
Propagation is // limited by transform block size but the screen area take up by a given block // size will be larger for a small image format stretched to full screen. static BLOCK_SIZE set_partition_min_limit(VP9_COMMON *const cm) { From d42b7fd66162be7a94ded287c09461acd7875c8d Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 7 Jun 2023 16:35:19 -0400 Subject: [PATCH 743/926] Fix more typos (n/n) impace -> impact taget -> target prediciton -> prediction addtion -> addition the the -> the Bug: webm:1803 Change-Id: I759c9d930a037ca69662164fcd6be160ed707d77 --- vp8/encoder/firstpass.c | 4 ++-- vp9/encoder/vp9_firstpass.c | 4 ++-- vp9/encoder/vp9_svc_layercontext.c | 2 +- vp9/vp9_cx_iface.c | 2 +- vpx_dsp/ppc/variance_vsx.c | 2 +- vpx_dsp/variance.c | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index 2b88a88c80..ff088aa969 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -1537,7 +1537,7 @@ static int calc_arf_boost(VP8_COMP *cpi, int offset, int f_frames, int b_frames, /* Calculate the baseline boost number for this frame */ r = calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out); - /* We want to discount the the flash frame itself and the recovery + /* We want to discount the flash frame itself and the recovery * frame that follows as both will have poor scores. */ flash_detected = @@ -1581,7 +1581,7 @@ static int calc_arf_boost(VP8_COMP *cpi, int offset, int f_frames, int b_frames, /* Calculate the baseline boost number for this frame */ r = calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out); - /* We want to discount the the flash frame itself and the recovery + /* We want to discount the flash frame itself and the recovery * frame that follows as both will have poor scores. */ flash_detected = diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index d726bc15f4..27cfc805d7 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -610,7 +610,7 @@ static int get_smooth_intra_threshold(VP9_COMMON *cm) { static uint8_t fp_dn_kernel_3[KERNEL_SIZE * KERNEL_SIZE] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 }; -// Estimate noise at a single point based on the impace of a spatial kernel +// Estimate noise at a single point based on the impact of a spatial kernel // on the point value static int fp_estimate_point_noise(uint8_t *src_ptr, const int stride) { int sum_weight = 0; @@ -2038,7 +2038,7 @@ static int compute_arf_boost(const FRAME_INFO *frame_info, this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, &abs_mv_in_out_accumulator, &mv_ratio_accumulator); - // We want to discount the the flash frame itself and the recovery + // We want to discount the flash frame itself and the recovery // frame that follows as both will have poor scores. flash_detected = detect_flash_from_frame_stats(this_frame) || detect_flash_from_frame_stats(next_frame); diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index e4721271d9..24fd818133 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -1145,7 +1145,7 @@ void vp9_svc_constrain_inter_layer_pred(VP9_COMP *const cpi) { void vp9_svc_assert_constraints_pattern(VP9_COMP *const cpi) { SVC *const svc = &cpi->svc; // For fixed/non-flexible mode, the following constraint are expected, - // when inter-layer prediciton is on (default). + // when inter-layer prediction is on (default). 
if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && svc->disable_inter_layer_pred == INTER_LAYER_PRED_ON && svc->framedrop_mode != LAYER_DROP) { diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 5873f30e89..cc2ae20d27 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -2118,7 +2118,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { { 0 }, // ts_rate_decimator 0, // ts_periodicity { 0 }, // ts_layer_id - { 0 }, // layer_taget_bitrate + { 0 }, // layer_target_bitrate 0, // temporal_layering_mode 0, // use_vizier_rc_params { 1, 1 }, // active_wq_factor diff --git a/vpx_dsp/ppc/variance_vsx.c b/vpx_dsp/ppc/variance_vsx.c index be9614a358..6c6bc9a301 100644 --- a/vpx_dsp/ppc/variance_vsx.c +++ b/vpx_dsp/ppc/variance_vsx.c @@ -225,7 +225,7 @@ static INLINE void variance(const uint8_t *src_ptr, int src_stride, } /* Identical to the variance call except it does not calculate the - * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in + * sse - sum^2 / w*h and returns sse in addition to modifying the passed in * variable. */ #define MSE(W, H) \ diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c index a6793efb68..1c476542fa 100644 --- a/vpx_dsp/variance.c +++ b/vpx_dsp/variance.c @@ -180,7 +180,7 @@ static void var_filter_block2d_bil_second_pass( } /* Identical to the variance call except it does not calculate the - * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in + * sse - sum^2 / w*h and returns sse in addition to modifying the passed in * variable. */ #define MSE(W, H) \ From 2245df50a6d360d33fccd51479c48f2210ed607a Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 8 Jun 2023 10:52:45 -0400 Subject: [PATCH 744/926] Replace NONE with NO_REF_FRAME NONE is a common name and it has conflicts with symbols defined in Chromium. Bug: b/286163500 Change-Id: I3d935a786f771a4d90b258fabc6fd6c2ecbf1c59 --- vp9/common/vp9_blockd.h | 2 +- vp9/decoder/vp9_decodemv.c | 8 ++--- vp9/encoder/vp9_denoiser.c | 2 +- vp9/encoder/vp9_encodeframe.c | 10 +++--- vp9/encoder/vp9_encoder.c | 5 +-- vp9/encoder/vp9_encoder.h | 4 +-- vp9/encoder/vp9_firstpass.c | 2 +- vp9/encoder/vp9_mbgraph.c | 2 +- vp9/encoder/vp9_pickmode.c | 22 ++++++------ vp9/encoder/vp9_rdopt.c | 68 +++++++++++++++++------------------ vp9/simple_encode.cc | 9 ++--- 11 files changed, 68 insertions(+), 66 deletions(-) diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index d7de46cf4f..aa13d8a0d5 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -54,7 +54,7 @@ typedef struct { // decoder implementation modules critically rely on the defined entry values // specified herein. They should be refactored concurrently. 
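/* [Editor's note] Why the rename below matters: NONE is an object-like macro,
   so the preprocessor substitutes it into any later token spelled NONE in a
   translation unit that includes this header. A hypothetical downstream file
   (e.g. in Chromium, per the commit message) breaks like this:

     #include "vp9/common/vp9_blockd.h"
     enum WidgetState { NONE, ACTIVE };
     // preprocesses to: enum WidgetState { (-1), ACTIVE };  -> syntax error

   Prefixing it as NO_REF_FRAME keeps the macro out of the common-identifier
   namespace. */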
-#define NONE (-1) +#define NO_REF_FRAME (-1) #define INTRA_FRAME 0 #define LAST_FRAME 1 #define GOLDEN_FRAME 2 diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index 7b524fa2a8..0989cde58d 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -204,7 +204,7 @@ static void read_intra_frame_mode_info(VP9_COMMON *const cm, mi->skip = read_skip(cm, xd, mi->segment_id, r); mi->tx_size = read_tx_size(cm, xd, 1, r); mi->ref_frame[0] = INTRA_FRAME; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; switch (bsize) { case BLOCK_4X4: @@ -309,7 +309,7 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { ref_frame[0] = (MV_REFERENCE_FRAME)get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME); - ref_frame[1] = NONE; + ref_frame[1] = NO_REF_FRAME; } else { const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r); // FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding @@ -333,7 +333,7 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, ref_frame[0] = LAST_FRAME; } - ref_frame[1] = NONE; + ref_frame[1] = NO_REF_FRAME; } else { assert(0 && "Invalid prediction mode."); } @@ -383,7 +383,7 @@ static void read_intra_block_mode_info(VP9_COMMON *const cm, mi->interp_filter = SWITCHABLE_FILTERS; mi->ref_frame[0] = INTRA_FRAME; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; } static INLINE int is_mv_valid(const MV *mv) { diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c index b40d5c6154..e5dffa90a8 100644 --- a/vp9/encoder/vp9_denoiser.c +++ b/vp9/encoder/vp9_denoiser.c @@ -319,7 +319,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation( filter_mbd->plane[2].dst.stride = denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride; - set_ref_ptrs(cm, filter_mbd, saved_frame, NONE); + set_ref_ptrs(cm, filter_mbd, saved_frame, NO_REF_FRAME); vp9_build_inter_predictors_sby(filter_mbd, mi_row, mi_col, bs); // Restore everything to its original state diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 2381c73687..7280e0da8b 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -1437,7 +1437,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, &cm->frame_refs[LAST_FRAME - 1].sf); mi->ref_frame[0] = LAST_FRAME; } - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; mi->sb_type = BLOCK_64X64; mi->mv[0].as_int = 0; mi->interp_filter = BILINEAR; @@ -1924,7 +1924,7 @@ static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, mi->skip = 1; mi->uv_mode = DC_PRED; mi->ref_frame[0] = LAST_FRAME; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; mi->mv[0].as_int = 0; mi->interp_filter = filter_ref; @@ -3449,7 +3449,7 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x, vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, &cm->frame_refs[ref - 1].sf); mi->ref_frame[0] = ref; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; mi->sb_type = bsize; vp9_set_mv_search_range(&x->mv_limits, &ref_mv); vp9_full_pixel_search(cpi, x, bsize, &ref_mv_full, step_param, search_method, @@ -3789,7 +3789,7 @@ static void assign_motion_vector_info(const int block_width_4x4, const int col_4x4 = col_start_4x4 + j; const int unit_index = row_4x4 * num_unit_cols + col_4x4; if (row_4x4 >= num_unit_rows || col_4x4 >= num_unit_cols) continue; - if (source_ref_frame[1] == NONE) { + 
if (source_ref_frame[1] == NO_REF_FRAME) { assert(source_mv[1]->row == 0 && source_mv[1]->col == 0); } motion_vector_info[unit_index].ref_frame[0] = source_ref_frame[0]; @@ -5443,7 +5443,7 @@ static void get_estimated_pred(VP9_COMP *cpi, const TileInfo *const tile, &cm->frame_refs[LAST_FRAME - 1].sf); mi->ref_frame[0] = LAST_FRAME; } - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; mi->sb_type = BLOCK_64X64; mi->mv[0].as_int = 0; mi->interp_filter = BILINEAR; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index d03d87a8a1..4d7f475a01 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2949,7 +2949,7 @@ void vp9_update_reference(VP9_COMP *cpi, int ref_frame_flags) { static YV12_BUFFER_CONFIG *get_vp9_ref_frame_buffer( VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag) { - MV_REFERENCE_FRAME ref_frame = NONE; + MV_REFERENCE_FRAME ref_frame = NO_REF_FRAME; if (ref_frame_flag == VP9_LAST_FLAG) ref_frame = LAST_FRAME; else if (ref_frame_flag == VP9_GOLD_FLAG) @@ -2957,7 +2957,8 @@ static YV12_BUFFER_CONFIG *get_vp9_ref_frame_buffer( else if (ref_frame_flag == VP9_ALT_FLAG) ref_frame = ALTREF_FRAME; - return ref_frame == NONE ? NULL : get_ref_frame_buffer(cpi, ref_frame); + return ref_frame == NO_REF_FRAME ? NULL + : get_ref_frame_buffer(cpi, ref_frame); } int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index f8a27872df..333ff0b36a 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1076,8 +1076,8 @@ static INLINE void free_partition_info(struct VP9_COMP *cpi) { } static INLINE void reset_mv_info(MOTION_VECTOR_INFO *mv_info) { - mv_info->ref_frame[0] = NONE; - mv_info->ref_frame[1] = NONE; + mv_info->ref_frame[0] = NO_REF_FRAME; + mv_info->ref_frame[1] = NO_REF_FRAME; mv_info->mv[0].as_int = INVALID_MV; mv_info->mv[1].as_int = INVALID_MV; } diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 27cfc805d7..22669ab847 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -1264,7 +1264,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, xd->mi[0]->mv[0].as_mv = mv; xd->mi[0]->tx_size = TX_4X4; xd->mi[0]->ref_frame[0] = LAST_FRAME; - xd->mi[0]->ref_frame[1] = NONE; + xd->mi[0]->ref_frame[1] = NO_REF_FRAME; vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize); vp9_encode_sby_pass1(x, bsize); fp_acc_data->sum_mvr += mv.row; diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c index fafc673aca..8b6521d915 100644 --- a/vp9/encoder/vp9_mbgraph.c +++ b/vp9/encoder/vp9_mbgraph.c @@ -237,7 +237,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, xd->mi[0] = &mi_local; mi_local.sb_type = BLOCK_16X16; mi_local.ref_frame[0] = LAST_FRAME; - mi_local.ref_frame[1] = NONE; + mi_local.ref_frame[1] = NO_REF_FRAME; for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { MV gld_left_mv = gld_top_mv; diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index fa88cd79da..4a92802dcc 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -1398,8 +1398,8 @@ static void recheck_zeromv_after_denoising( RD_COST this_rdc; mi->mode = ZEROMV; mi->ref_frame[0] = LAST_FRAME; - mi->ref_frame[1] = NONE; - set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE); + mi->ref_frame[1] = NO_REF_FRAME; + set_ref_ptrs(cm, xd, mi->ref_frame[0], NO_REF_FRAME); mi->mv[0].as_int = 0; mi->interp_filter = EIGHTTAP; if (cpi->sf.default_interp_filter == BILINEAR) 
mi->interp_filter = BILINEAR; @@ -1417,7 +1417,7 @@ static void recheck_zeromv_after_denoising( this_rdc = *best_rdc; mi->mode = ctx_den->best_mode; mi->ref_frame[0] = ctx_den->best_ref_frame; - set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE); + set_ref_ptrs(cm, xd, mi->ref_frame[0], NO_REF_FRAME); mi->interp_filter = ctx_den->best_pred_filter; if (ctx_den->best_ref_frame == INTRA_FRAME) { mi->mv[0].as_int = INVALID_MV; @@ -1681,7 +1681,7 @@ static INLINE void init_best_pickmode(BEST_PICKMODE *bp) { bp->best_intra_tx_size = TX_SIZES; bp->best_pred_filter = EIGHTTAP; bp->best_mode_skip_txfm = SKIP_TXFM_NONE; - bp->best_second_ref_frame = NONE; + bp->best_second_ref_frame = NO_REF_FRAME; bp->best_pred = NULL; } @@ -1875,8 +1875,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, vp9_rd_cost_reset(&best_rdc); vp9_rd_cost_reset(rd_cost); mi->sb_type = bsize; - mi->ref_frame[0] = NONE; - mi->ref_frame[1] = NONE; + mi->ref_frame[0] = NO_REF_FRAME; + mi->ref_frame[1] = NO_REF_FRAME; mi->tx_size = VPXMIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[cm->tx_mode]); @@ -2054,7 +2054,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int comp_pred = 0; int force_mv_inter_layer = 0; PREDICTION_MODE this_mode; - second_ref_frame = NONE; + second_ref_frame = NO_REF_FRAME; if (idx < num_inter_modes) { this_mode = ref_mode_set[idx].pred_mode; @@ -2631,7 +2631,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, best_pickmode.best_mode = this_mode; best_pickmode.best_intra_tx_size = mi->tx_size; best_pickmode.best_ref_frame = INTRA_FRAME; - best_pickmode.best_second_ref_frame = NONE; + best_pickmode.best_second_ref_frame = NO_REF_FRAME; mi->uv_mode = this_mode; mi->mv[0].as_int = INVALID_MV; mi->mv[1].as_int = INVALID_MV; @@ -2753,8 +2753,8 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, MODE_INFO *const mi = xd->mi[0]; MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; const struct segmentation *const seg = &cm->seg; - MV_REFERENCE_FRAME ref_frame, second_ref_frame = NONE; - MV_REFERENCE_FRAME best_ref_frame = NONE; + MV_REFERENCE_FRAME ref_frame, second_ref_frame = NO_REF_FRAME; + MV_REFERENCE_FRAME best_ref_frame = NO_REF_FRAME; unsigned char segment_id = mi->segment_id; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; int64_t best_rd = INT64_MAX; @@ -2793,7 +2793,7 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, mi->tx_size = TX_4X4; mi->uv_mode = DC_PRED; mi->ref_frame[0] = LAST_FRAME; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; mi->interp_filter = cm->interp_filter == SWITCHABLE ? 
EIGHTTAP : cm->interp_filter; diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 9454802a5e..b7fb26de27 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -86,28 +86,28 @@ struct rdcost_block_args { #if !CONFIG_REALTIME_ONLY static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { - { NEARESTMV, { LAST_FRAME, NONE } }, - { NEARESTMV, { ALTREF_FRAME, NONE } }, - { NEARESTMV, { GOLDEN_FRAME, NONE } }, + { NEARESTMV, { LAST_FRAME, NO_REF_FRAME } }, + { NEARESTMV, { ALTREF_FRAME, NO_REF_FRAME } }, + { NEARESTMV, { GOLDEN_FRAME, NO_REF_FRAME } }, - { DC_PRED, { INTRA_FRAME, NONE } }, + { DC_PRED, { INTRA_FRAME, NO_REF_FRAME } }, - { NEWMV, { LAST_FRAME, NONE } }, - { NEWMV, { ALTREF_FRAME, NONE } }, - { NEWMV, { GOLDEN_FRAME, NONE } }, + { NEWMV, { LAST_FRAME, NO_REF_FRAME } }, + { NEWMV, { ALTREF_FRAME, NO_REF_FRAME } }, + { NEWMV, { GOLDEN_FRAME, NO_REF_FRAME } }, - { NEARMV, { LAST_FRAME, NONE } }, - { NEARMV, { ALTREF_FRAME, NONE } }, - { NEARMV, { GOLDEN_FRAME, NONE } }, + { NEARMV, { LAST_FRAME, NO_REF_FRAME } }, + { NEARMV, { ALTREF_FRAME, NO_REF_FRAME } }, + { NEARMV, { GOLDEN_FRAME, NO_REF_FRAME } }, - { ZEROMV, { LAST_FRAME, NONE } }, - { ZEROMV, { GOLDEN_FRAME, NONE } }, - { ZEROMV, { ALTREF_FRAME, NONE } }, + { ZEROMV, { LAST_FRAME, NO_REF_FRAME } }, + { ZEROMV, { GOLDEN_FRAME, NO_REF_FRAME } }, + { ZEROMV, { ALTREF_FRAME, NO_REF_FRAME } }, { NEARESTMV, { LAST_FRAME, ALTREF_FRAME } }, { NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } }, - { TM_PRED, { INTRA_FRAME, NONE } }, + { TM_PRED, { INTRA_FRAME, NO_REF_FRAME } }, { NEARMV, { LAST_FRAME, ALTREF_FRAME } }, { NEWMV, { LAST_FRAME, ALTREF_FRAME } }, @@ -117,20 +117,20 @@ static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { { ZEROMV, { LAST_FRAME, ALTREF_FRAME } }, { ZEROMV, { GOLDEN_FRAME, ALTREF_FRAME } }, - { H_PRED, { INTRA_FRAME, NONE } }, - { V_PRED, { INTRA_FRAME, NONE } }, - { D135_PRED, { INTRA_FRAME, NONE } }, - { D207_PRED, { INTRA_FRAME, NONE } }, - { D153_PRED, { INTRA_FRAME, NONE } }, - { D63_PRED, { INTRA_FRAME, NONE } }, - { D117_PRED, { INTRA_FRAME, NONE } }, - { D45_PRED, { INTRA_FRAME, NONE } }, + { H_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { V_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D135_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D207_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D153_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D63_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D117_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D45_PRED, { INTRA_FRAME, NO_REF_FRAME } }, }; static const REF_DEFINITION vp9_ref_order[MAX_REFS] = { - { { LAST_FRAME, NONE } }, { { GOLDEN_FRAME, NONE } }, - { { ALTREF_FRAME, NONE } }, { { LAST_FRAME, ALTREF_FRAME } }, - { { GOLDEN_FRAME, ALTREF_FRAME } }, { { INTRA_FRAME, NONE } }, + { { LAST_FRAME, NO_REF_FRAME } }, { { GOLDEN_FRAME, NO_REF_FRAME } }, + { { ALTREF_FRAME, NO_REF_FRAME } }, { { LAST_FRAME, ALTREF_FRAME } }, + { { GOLDEN_FRAME, ALTREF_FRAME } }, { { INTRA_FRAME, NO_REF_FRAME } }, }; #endif // !CONFIG_REALTIME_ONLY @@ -1811,7 +1811,7 @@ static int check_best_zero_mv(const VP9_COMP *cpi, const MV_REFERENCE_FRAME ref_frames[2]) { if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) && frame_mv[this_mode][ref_frames[0]].as_int == 0 && - (ref_frames[1] == NONE || + (ref_frames[1] == NO_REF_FRAME || frame_mv[this_mode][ref_frames[1]].as_int == 0)) { int rfc = mode_context[ref_frames[0]]; int c1 = cost_mv_ref(cpi, NEARMV, rfc); @@ -1824,7 +1824,7 @@ static int check_best_zero_mv(const VP9_COMP *cpi, if (c2 > c3) return 0; } 
else { assert(this_mode == ZEROMV); - if (ref_frames[1] == NONE) { + if (ref_frames[1] == NO_REF_FRAME) { if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0) || (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0)) return 0; @@ -3241,7 +3241,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost, x->skip_encode = 0; ctx->skip = 0; xd->mi[0]->ref_frame[0] = INTRA_FRAME; - xd->mi[0]->ref_frame[1] = NONE; + xd->mi[0]->ref_frame[1] = NO_REF_FRAME; // Initialize interp_filter here so we do not have to check for inter block // modes in get_pred_context_switchable_interp() xd->mi[0]->interp_filter = SWITCHABLE_FILTERS; @@ -3686,7 +3686,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK; break; case ALTREF_FRAME: ref_frame_skip_mask[0] |= ALT_REF_MODE_MASK; break; - case NONE: + case NO_REF_FRAME: case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); break; } } @@ -3719,7 +3719,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, MODE_INFO *ref_mi; int const_motion = 1; int skip_ref_frame = !cb_partition_search_ctrl; - MV_REFERENCE_FRAME rf = NONE; + MV_REFERENCE_FRAME rf = NO_REF_FRAME; int_mv ref_mv; ref_mv.as_int = INVALID_MV; @@ -3736,7 +3736,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, if ((mi_col - 1) >= tile_info->mi_col_start) { if (ref_mv.as_int == INVALID_MV) ref_mv = xd->mi[-1]->mv[0]; - if (rf == NONE) rf = xd->mi[-1]->ref_frame[0]; + if (rf == NO_REF_FRAME) rf = xd->mi[-1]->ref_frame[0]; for (i = 0; i < mi_height; ++i) { ref_mi = xd->mi[i * xd->mi_stride - 1]; const_motion &= (ref_mv.as_int == ref_mi->mv[0].as_int) && @@ -4230,7 +4230,7 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, TileDataEnc *tile_data, mi->mode = ZEROMV; mi->uv_mode = DC_PRED; mi->ref_frame[0] = LAST_FRAME; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; mi->mv[0].as_int = 0; x->skip = 1; @@ -4412,7 +4412,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, case ALTREF_FRAME: ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) | (1 << LAST_FRAME); break; - case NONE: + case NO_REF_FRAME: case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); break; } } @@ -4899,7 +4899,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, mi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int; } // If the second reference does not exist, set the corresponding mv to zero. - if (mi->ref_frame[1] == NONE) { + if (mi->ref_frame[1] == NO_REF_FRAME) { mi->mv[1].as_int = 0; for (i = 0; i < 4; ++i) { mi->bmi[i].as_mv[1].as_int = 0; diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index f42912d35b..2e2a3746e7 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -183,10 +183,11 @@ static void update_motion_vector_info( const MV_REFERENCE_FRAME *in_ref_frame = input_motion_vector_info[i].ref_frame; output_motion_vector_info[i].mv_count = - (in_ref_frame[0] == INTRA_FRAME) ? 0 - : ((in_ref_frame[1] == NONE) ? 1 : 2); - if (in_ref_frame[0] == NONE) { - fprintf(stderr, "in_ref_frame[0] shouldn't be NONE\n"); + (in_ref_frame[0] == INTRA_FRAME) + ? 0 + : ((in_ref_frame[1] == NO_REF_FRAME) ? 
1 : 2); + if (in_ref_frame[0] == NO_REF_FRAME) { + fprintf(stderr, "in_ref_frame[0] shouldn't be NO_REF_FRAME\n"); abort(); } output_motion_vector_info[i].ref_frame[0] = From 8c308aefea7c58a1a979b81f4aa6d68908e379ee Mon Sep 17 00:00:00 2001 From: Anupam Pandey Date: Tue, 6 Jun 2023 12:27:34 +0530 Subject: [PATCH 745/926] Fix c vs intrinsic mismatch of vpx_hadamard_32x32() function This CL resolves the mismatch between C and intrinsic implementation of vpx_hadamard_32x32 function. The mismatch was due to integer overflow during the addition operation in the intrinsic functions. Specifically, the addition in the intrinsic function was performed at the 16-bit level, while the calculation of a0 + a1 resulted in a 17-bit value. This code change addresses the problem by performing the addition at the 32-bit level (with sign extension) in both SSE2 and AVX2, and then converting the results back to the 16-bit level after a right shift. STATS_CHANGED Change-Id: I576ca64e3b9ebb31d143fcd2da64322790bc5853 --- test/hadamard_test.cc | 27 ++++++++++++++++++ vpx_dsp/avg.c | 8 +++--- vpx_dsp/x86/avg_intrin_avx2.c | 53 +++++++++++++++++++++++++++++------ vpx_dsp/x86/avg_intrin_sse2.c | 53 +++++++++++++++++++++++++++++------ 4 files changed, 121 insertions(+), 20 deletions(-) diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc index 9f6c99f3c4..0de6622e20 100644 --- a/test/hadamard_test.cc +++ b/test/hadamard_test.cc @@ -170,6 +170,31 @@ class HadamardTestBase : public ::testing::TestWithParam { EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b))); } + void ExtremeValuesTest() { + const int kMaxBlockSize = 32 * 32; + DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxBlockSize]); + DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]); + memset(b, 0, sizeof(b)); + + tran_low_t b_ref[kMaxBlockSize]; + memset(b_ref, 0, sizeof(b_ref)); + + for (int i = 0; i < 2; ++i) { + // Initialize a test block with input range [-mask_, mask_]. + const int sign = (i == 0) ? 1 : -1; + for (int j = 0; j < kMaxBlockSize; ++j) + input_extreme_block[j] = sign * 255; + + ReferenceHadamard(input_extreme_block, bwh_, b_ref, bwh_); + ASM_REGISTER_STATE_CHECK(h_func_(input_extreme_block, bwh_, b)); + + // The order of the output is not important. Sort before checking. 
+ std::sort(b, b + block_size_); + std::sort(b_ref, b_ref + block_size_); + EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b))); + } + } + void VaryStride() { const int kMaxBlockSize = 32 * 32; DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize * 8]); @@ -225,6 +250,8 @@ class HadamardLowbdTest : public HadamardTestBase { TEST_P(HadamardLowbdTest, CompareReferenceRandom) { CompareReferenceRandom(); } +TEST_P(HadamardLowbdTest, ExtremeValuesTest) { ExtremeValuesTest(); } + TEST_P(HadamardLowbdTest, VaryStride) { VaryStride(); } TEST_P(HadamardLowbdTest, DISABLED_Speed) { diff --git a/vpx_dsp/avg.c b/vpx_dsp/avg.c index 391e9eb144..a8dcab7dae 100644 --- a/vpx_dsp/avg.c +++ b/vpx_dsp/avg.c @@ -295,19 +295,19 @@ void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, vpx_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256); } - // coeff: 15 bit, dynamic range [-16320, 16320] + // coeff: 16 bit, dynamic range [-32768, 32767] for (idx = 0; idx < 256; ++idx) { tran_low_t a0 = coeff[0]; tran_low_t a1 = coeff[256]; tran_low_t a2 = coeff[512]; tran_low_t a3 = coeff[768]; - tran_low_t b0 = (a0 + a1) >> 2; // (a0 + a1): 16 bit, [-32640, 32640] + tran_low_t b0 = (a0 + a1) >> 2; // (a0 + a1): 17 bit, [-65536, 65535] tran_low_t b1 = (a0 - a1) >> 2; // b0-b3: 15 bit, dynamic range - tran_low_t b2 = (a2 + a3) >> 2; // [-16320, 16320] + tran_low_t b2 = (a2 + a3) >> 2; // [-16384, 16383] tran_low_t b3 = (a2 - a3) >> 2; - coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] + coeff[0] = b0 + b2; // 16 bit, [-32768, 32767] coeff[256] = b1 + b3; coeff[512] = b0 - b2; coeff[768] = b1 - b3; diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c index b2e01319d3..61e4e73c5b 100644 --- a/vpx_dsp/x86/avg_intrin_avx2.c +++ b/vpx_dsp/x86/avg_intrin_avx2.c @@ -218,6 +218,14 @@ void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff, } #endif // CONFIG_VP9_HIGHBITDEPTH +static INLINE void sign_extend_16bit_to_32bit_avx2(__m256i in, __m256i zero, + __m256i *out_lo, + __m256i *out_hi) { + const __m256i sign_bits = _mm256_cmpgt_epi16(zero, in); + *out_lo = _mm256_unpacklo_epi16(in, sign_bits); + *out_hi = _mm256_unpackhi_epi16(in, sign_bits); +} + static void hadamard_col8x2_avx2(__m256i *in, int iter) { __m256i a0 = in[0]; __m256i a1 = in[1]; @@ -400,6 +408,12 @@ void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *t_coeff = coeff; #endif int idx; + __m256i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo, + b3_lo; + __m256i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi, + b3_hi; + __m256i b0, b1, b2, b3; + const __m256i zero = _mm256_setzero_si256(); for (idx = 0; idx < 4; ++idx) { // src_diff: 9 bit, dynamic range [-255, 255] const int16_t *src_ptr = @@ -414,15 +428,38 @@ void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512)); const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768)); - __m256i b0 = _mm256_add_epi16(coeff0, coeff1); - __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); - __m256i b2 = _mm256_add_epi16(coeff2, coeff3); - __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); + // Sign extend 16 bit to 32 bit. 
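// [Editor's note] A worked instance of the overflow described in the commit
// message: after the 16x16 stages, coefficients can occupy the full int16
// range, so take coeff0 = coeff1 = 20000 in some lane. The true sum is
// 40000, a 17-bit value; a 16-bit lane add (_mm256_add_epi16) wraps it to
// 40000 - 65536 = -25536, and (-25536) >> 2 = -6384, whereas the C code
// computes 40000 >> 2 = 10000. Widening each lane to 32 bits first (below),
// shifting, then packing back with _mm256_packs_epi32 is lossless because
// the shifted values fit in 15 bits again.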
+ sign_extend_16bit_to_32bit_avx2(coeff0, zero, &coeff0_lo, &coeff0_hi); + sign_extend_16bit_to_32bit_avx2(coeff1, zero, &coeff1_lo, &coeff1_hi); + sign_extend_16bit_to_32bit_avx2(coeff2, zero, &coeff2_lo, &coeff2_hi); + sign_extend_16bit_to_32bit_avx2(coeff3, zero, &coeff3_lo, &coeff3_hi); + + b0_lo = _mm256_add_epi32(coeff0_lo, coeff1_lo); + b0_hi = _mm256_add_epi32(coeff0_hi, coeff1_hi); + + b1_lo = _mm256_sub_epi32(coeff0_lo, coeff1_lo); + b1_hi = _mm256_sub_epi32(coeff0_hi, coeff1_hi); + + b2_lo = _mm256_add_epi32(coeff2_lo, coeff3_lo); + b2_hi = _mm256_add_epi32(coeff2_hi, coeff3_hi); + + b3_lo = _mm256_sub_epi32(coeff2_lo, coeff3_lo); + b3_hi = _mm256_sub_epi32(coeff2_hi, coeff3_hi); + + b0_lo = _mm256_srai_epi32(b0_lo, 2); + b1_lo = _mm256_srai_epi32(b1_lo, 2); + b2_lo = _mm256_srai_epi32(b2_lo, 2); + b3_lo = _mm256_srai_epi32(b3_lo, 2); + + b0_hi = _mm256_srai_epi32(b0_hi, 2); + b1_hi = _mm256_srai_epi32(b1_hi, 2); + b2_hi = _mm256_srai_epi32(b2_hi, 2); + b3_hi = _mm256_srai_epi32(b3_hi, 2); - b0 = _mm256_srai_epi16(b0, 2); - b1 = _mm256_srai_epi16(b1, 2); - b2 = _mm256_srai_epi16(b2, 2); - b3 = _mm256_srai_epi16(b3, 2); + b0 = _mm256_packs_epi32(b0_lo, b0_hi); + b1 = _mm256_packs_epi32(b1_lo, b1_hi); + b2 = _mm256_packs_epi32(b2_lo, b2_hi); + b3 = _mm256_packs_epi32(b3_lo, b3_hi); store_tran_low(_mm256_add_epi16(b0, b2), coeff); store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256); diff --git a/vpx_dsp/x86/avg_intrin_sse2.c b/vpx_dsp/x86/avg_intrin_sse2.c index 015c11a1f3..4447dfab7c 100644 --- a/vpx_dsp/x86/avg_intrin_sse2.c +++ b/vpx_dsp/x86/avg_intrin_sse2.c @@ -15,6 +15,14 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_ports/mem.h" +static INLINE void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero, + __m128i *out_lo, + __m128i *out_hi) { + const __m128i sign_bits = _mm_cmplt_epi16(in, zero); + *out_lo = _mm_unpacklo_epi16(in, sign_bits); + *out_hi = _mm_unpackhi_epi16(in, sign_bits); +} + void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max) { __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff; @@ -400,6 +408,12 @@ void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *t_coeff = coeff; #endif int idx; + __m128i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo, + b3_lo; + __m128i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi, + b3_hi; + __m128i b0, b1, b2, b3; + const __m128i zero = _mm_setzero_si128(); for (idx = 0; idx < 4; ++idx) { const int16_t *src_ptr = src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; @@ -413,15 +427,38 @@ void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512)); __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768)); - __m128i b0 = _mm_add_epi16(coeff0, coeff1); - __m128i b1 = _mm_sub_epi16(coeff0, coeff1); - __m128i b2 = _mm_add_epi16(coeff2, coeff3); - __m128i b3 = _mm_sub_epi16(coeff2, coeff3); + // Sign extend 16 bit to 32 bit. 
+ sign_extend_16bit_to_32bit_sse2(coeff0, zero, &coeff0_lo, &coeff0_hi); + sign_extend_16bit_to_32bit_sse2(coeff1, zero, &coeff1_lo, &coeff1_hi); + sign_extend_16bit_to_32bit_sse2(coeff2, zero, &coeff2_lo, &coeff2_hi); + sign_extend_16bit_to_32bit_sse2(coeff3, zero, &coeff3_lo, &coeff3_hi); + + b0_lo = _mm_add_epi32(coeff0_lo, coeff1_lo); + b0_hi = _mm_add_epi32(coeff0_hi, coeff1_hi); + + b1_lo = _mm_sub_epi32(coeff0_lo, coeff1_lo); + b1_hi = _mm_sub_epi32(coeff0_hi, coeff1_hi); + + b2_lo = _mm_add_epi32(coeff2_lo, coeff3_lo); + b2_hi = _mm_add_epi32(coeff2_hi, coeff3_hi); + + b3_lo = _mm_sub_epi32(coeff2_lo, coeff3_lo); + b3_hi = _mm_sub_epi32(coeff2_hi, coeff3_hi); + + b0_lo = _mm_srai_epi32(b0_lo, 2); + b1_lo = _mm_srai_epi32(b1_lo, 2); + b2_lo = _mm_srai_epi32(b2_lo, 2); + b3_lo = _mm_srai_epi32(b3_lo, 2); + + b0_hi = _mm_srai_epi32(b0_hi, 2); + b1_hi = _mm_srai_epi32(b1_hi, 2); + b2_hi = _mm_srai_epi32(b2_hi, 2); + b3_hi = _mm_srai_epi32(b3_hi, 2); - b0 = _mm_srai_epi16(b0, 2); - b1 = _mm_srai_epi16(b1, 2); - b2 = _mm_srai_epi16(b2, 2); - b3 = _mm_srai_epi16(b3, 2); + b0 = _mm_packs_epi32(b0_lo, b0_hi); + b1 = _mm_packs_epi32(b1_lo, b1_hi); + b2 = _mm_packs_epi32(b2_lo, b2_hi); + b3 = _mm_packs_epi32(b3_lo, b3_hi); coeff0 = _mm_add_epi16(b0, b2); coeff1 = _mm_add_epi16(b1, b3); From bdb8ccc0af49a87c9f4ee08f1d363ceec347ab6e Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 9 Jun 2023 15:33:39 -0400 Subject: [PATCH 746/926] RTC RC: clean up unnecessary headers Change-Id: I77c407be59f4eb0c70a89a5fffd88c648e634123 --- vp9/ratectrl_rtc.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index 7f3c900459..d3876de875 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -14,12 +14,9 @@ #include #include -#include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_enums.h" -#include "vp9/common/vp9_onyxc_int.h" #include "vp9/vp9_iface_common.h" #include "vp9/encoder/vp9_aq_cyclicrefresh.h" -#include "vp9/encoder/vp9_firstpass.h" #include "vp9/vp9_cx_iface.h" #include "vpx/internal/vpx_ratectrl_rtc.h" #include "vpx_mem/vpx_mem.h" From 8cee267d3d056ea006e0b5bb380742e3da0a5480 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 14 Jun 2023 16:20:30 -0400 Subject: [PATCH 747/926] Add new_mv_count to firstpass stats Mostly follows the logic of how it's calculated in libaom. Bug: b/287283080 Change-Id: I9ee67d844ef9db7cca63339b5304459eaa28d324 --- test/vp9_ethread_test.cc | 8 ++++---- vp9/encoder/vp9_encodeframe.c | 2 ++ vp9/encoder/vp9_encoder.h | 1 + vp9/encoder/vp9_ethread.c | 1 + vp9/encoder/vp9_firstpass.c | 22 +++++++++++++++++++++- vp9/encoder/vp9_firstpass.h | 2 ++ 6 files changed, 31 insertions(+), 5 deletions(-) diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc index 238366cb60..54fa6c48e2 100644 --- a/test/vp9_ethread_test.cc +++ b/test/vp9_ethread_test.cc @@ -21,12 +21,12 @@ namespace { // FIRSTPASS_STATS struct: // { -// 25 double members; +// 26 double members; // 1 int64_t member; // } // Whenever FIRSTPASS_STATS struct is modified, the following constants need to // be revisited. -const int kDbl = 25; +const int kDbl = 26; const int kInt = 1; const size_t kFirstPassStatsSz = kDbl * sizeof(double) + kInt * sizeof(int64_t); @@ -185,7 +185,7 @@ TEST_P(VPxFirstPassEncoderThreadTest, FirstPassStatsTest) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); // Compare to check if using or not using row-mt generates close stats. 
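// [Editor's note] Context for the kDbl bump above (25 -> 26): this test
// sizes FIRSTPASS_STATS as kDbl doubles plus one int64_t (spatial_layer_id),
// so the new_mv_count field this patch adds must be counted here or the
// packet-size check breaks. Like the other first-pass motion fields,
// new_mv_count is stored normalized (accumulated count divided by the number
// of macroblocks), which is why it is a double rather than an int.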
- ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 1000.0)); + ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 400.0)); // Test single thread vs multiple threads row_mt_mode_ = 1; @@ -199,7 +199,7 @@ TEST_P(VPxFirstPassEncoderThreadTest, FirstPassStatsTest) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); // Compare to check if single-thread and multi-thread stats are close enough. - ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 1000.0)); + ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 400.0)); // Bit exact test in row_mt mode. // When row_mt_mode_=1 and using >1 threads, the encoder generates bit exact diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 7280e0da8b..0d03d01c80 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -5832,6 +5832,7 @@ void vp9_init_tile_data(VP9_COMP *cpi) { TileDataEnc *tile_data = &cpi->tile_data[tile_row * tile_cols + tile_col]; int i, j; + const MV zero_mv = { 0, 0 }; for (i = 0; i < BLOCK_SIZES; ++i) { for (j = 0; j < MAX_MODES; ++j) { tile_data->thresh_freq_fact[i][j] = RD_THRESH_INIT_FACT; @@ -5839,6 +5840,7 @@ void vp9_init_tile_data(VP9_COMP *cpi) { tile_data->mode_map[i][j] = j; } } + tile_data->firstpass_top_mv = zero_mv; #if CONFIG_MULTITHREAD tile_data->row_base_thresh_freq_fact = NULL; #endif diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 333ff0b36a..7b02fe7f6b 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -338,6 +338,7 @@ typedef struct TileDataEnc { // Used for adaptive_rd_thresh with row multithreading int *row_base_thresh_freq_fact; + MV firstpass_top_mv; } TileDataEnc; typedef struct RowMTInfo { diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c index fadd233899..681996d334 100644 --- a/vp9/encoder/vp9_ethread.c +++ b/vp9/encoder/vp9_ethread.c @@ -265,6 +265,7 @@ static void accumulate_fp_tile_stat(TileDataEnc *tile_data, tile_data->fp_data.intra_count_high += tile_data_t->fp_data.intra_count_high; tile_data->fp_data.intra_skip_count += tile_data_t->fp_data.intra_skip_count; tile_data->fp_data.mvcount += tile_data_t->fp_data.mvcount; + tile_data->fp_data.new_mv_count += tile_data_t->fp_data.new_mv_count; tile_data->fp_data.sum_mvr += tile_data_t->fp_data.sum_mvr; tile_data->fp_data.sum_mvr_abs += tile_data_t->fp_data.sum_mvr_abs; tile_data->fp_data.sum_mvc += tile_data_t->fp_data.sum_mvc; diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 22669ab847..42e935740a 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -152,6 +152,7 @@ static void zero_stats(FIRSTPASS_STATS *section) { section->pcnt_intra_high = 0.0; section->inactive_zone_rows = 0.0; section->inactive_zone_cols = 0.0; + section->new_mv_count = 0.0; section->MVr = 0.0; section->mvr_abs = 0.0; section->MVc = 0.0; @@ -183,6 +184,7 @@ static void accumulate_stats(FIRSTPASS_STATS *section, section->pcnt_intra_high += frame->pcnt_intra_high; section->inactive_zone_rows += frame->inactive_zone_rows; section->inactive_zone_cols += frame->inactive_zone_cols; + section->new_mv_count += frame->new_mv_count; section->MVr += frame->MVr; section->mvr_abs += frame->mvr_abs; section->MVc += frame->MVc; @@ -212,6 +214,7 @@ static void subtract_stats(FIRSTPASS_STATS *section, section->pcnt_intra_high -= frame->pcnt_intra_high; section->inactive_zone_rows -= frame->inactive_zone_rows; section->inactive_zone_cols -= frame->inactive_zone_cols; + section->new_mv_count -= 
frame->new_mv_count; section->MVr -= frame->MVr; section->mvr_abs -= frame->mvr_abs; section->MVc -= frame->MVc; @@ -804,6 +807,7 @@ static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps, fps->inactive_zone_cols = (double)0; if (fp_acc_data->mvcount > 0) { + fps->new_mv_count = (double)(fp_acc_data->new_mv_count) / num_mbs; fps->MVr = (double)(fp_acc_data->sum_mvr) / fp_acc_data->mvcount; fps->mvr_abs = (double)(fp_acc_data->sum_mvr_abs) / fp_acc_data->mvcount; fps->MVc = (double)(fp_acc_data->sum_mvc) / fp_acc_data->mvcount; @@ -820,6 +824,7 @@ static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps, (double)(fp_acc_data->sum_in_vectors) / (fp_acc_data->mvcount * 2); fps->pcnt_motion = (double)(fp_acc_data->mvcount) / num_mbs; } else { + fps->new_mv_count = 0.0; fps->MVr = 0.0; fps->mvr_abs = 0.0; fps->MVc = 0.0; @@ -845,6 +850,7 @@ static void accumulate_fp_mb_row_stat(TileDataEnc *this_tile, this_tile->fp_data.intra_count_low += fp_acc_data->intra_count_low; this_tile->fp_data.intra_count_high += fp_acc_data->intra_count_high; this_tile->fp_data.intra_skip_count += fp_acc_data->intra_skip_count; + this_tile->fp_data.new_mv_count += fp_acc_data->new_mv_count; this_tile->fp_data.mvcount += fp_acc_data->mvcount; this_tile->fp_data.sum_mvr += fp_acc_data->sum_mvr; this_tile->fp_data.sum_mvr_abs += fp_acc_data->sum_mvr_abs; @@ -915,6 +921,9 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, double mb_neutral_count; int scaled_low_intra_thresh = scale_sse_threshold(cm, LOW_I_THRESH); + MV *first_top_mv = &tile_data->firstpass_top_mv; + MV last_nonzero_mv = { 0, 0 }; + // First pass code requires valid last and new frame buffers. assert(new_yv12 != NULL); assert(frame_is_intra_only(cm) || (lst_yv12 != NULL)); @@ -955,6 +964,10 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, mb_row, c); + if (mb_col == mb_col_start) { + last_nonzero_mv = *first_top_mv; + } + // Adjust to the next column of MBs. x->plane[0].src.buf = cpi->Source->y_buffer + mb_row * 16 * x->plane[0].src.stride + mb_col * 16; @@ -1279,6 +1292,10 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, if (!is_zero_mv(&mv)) { ++(fp_acc_data->mvcount); + if (!is_equal_mv(&mv, &last_nonzero_mv)) { + ++(fp_acc_data->new_mv_count); + } + last_nonzero_mv = mv; // Does the row vector point inwards or outwards? if (mb_row < cm->mb_rows / 2) { @@ -1334,6 +1351,9 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, } fp_acc_data->coded_error += (int64_t)this_error; + if (mb_col == mb_col_start) { + *first_top_mv = last_nonzero_mv; + } recon_yoffset += 16; recon_uvoffset += uv_mb_height; @@ -1356,7 +1376,7 @@ static void first_pass_encode(VP9_COMP *cpi, FIRSTPASS_DATA *fp_acc_data) { MV best_ref_mv; // Tiling is ignored in the first pass. 
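// [Editor's note, an editorial reading of the diff rather than author text]
// The firstpass_top_mv plumbing keeps the new counter consistent under row
// multithreading: each macroblock row seeds last_nonzero_mv from
// tile_data->firstpass_top_mv (published by the first column of the row
// above, under the existing row_mt_sync ordering) and writes its own
// first-column value back at mb_col == mb_col_start. A "new" MV is counted
// only when a nonzero MV differs from the previous nonzero one, so the
// result depends on scan order but not on thread scheduling.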
vp9_tile_init(tile, cm, 0, 0); -
+ tile_data.firstpass_top_mv = zero_mv;
 #if CONFIG_RATE_CTRL if (cpi->oxcf.use_simple_encode_api) { fp_motion_vector_info_reset(cpi->frame_info.frame_width,
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index cdcf568723..3ba336b34f 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -55,6 +55,7 @@ typedef struct { int64_t sum_mvcs; int sum_in_vectors; int intra_smooth_count;
+ int new_mv_count;
 } FIRSTPASS_DATA; typedef struct {
@@ -83,6 +84,7 @@ typedef struct { double mv_in_out_count; double duration; double count;
+ double new_mv_count;
 int64_t spatial_layer_id; } FIRSTPASS_STATS;

From af40910197bb4cd24fe2f5870c386843cced70c2 Mon Sep 17 00:00:00 2001
From: Chen Wang
Date: Fri, 16 Jun 2023 16:19:02 +0800
Subject: [PATCH 748/926] configure.sh: Improve a comment.

The corresponding case block is not only for ARM, so the original
comment text was misleading.

Test: N/A, just comment text changes.
Change-Id: I3154d18d3b3d237c1eecfe07dc7ec237c98194cf
Signed-off-by: Chen Wang
---
 build/make/configure.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build/make/configure.sh b/build/make/configure.sh
index 6fd67f1623..7b2da3c1a1 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -970,7 +970,7 @@ process_common_toolchain() { ;; esac
- # Process ARM architecture variants
+ # Process architecture variants
 case ${toolchain} in arm*) # on arm, isa versions are supersets

From 80d4172f0705334db3c51113355ea4704d9a4240 Mon Sep 17 00:00:00 2001
From: Anupam Pandey
Date: Wed, 14 Jun 2023 10:27:49 +0530
Subject: [PATCH 749/926] Remove vp9_diamond_search_sad_avx function

This CL removes the AVX version of the vp9_diamond_search_sad function,
as it shows no speedup over the C implementation.

Change-Id: Ife6005d8e444ea2c8d07ac0f686c840344b9e0ea
---
 vp9/common/vp9_rtcd_defs.pl | 2 +-
 vp9/encoder/vp9_encoder.c | 4 +-
 vp9/encoder/x86/vp9_diamond_search_sad_avx.c | 300 -------------------
 vp9/vp9cx.mk | 1 -
 4 files changed, 3 insertions(+), 304 deletions(-)
 delete mode 100644 vp9/encoder/x86/vp9_diamond_search_sad_avx.c

diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 4b94c31f15..1a4140b38b 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -172,7 +172,7 @@ () # Motion search # add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv";
-specialize qw/vp9_diamond_search_sad avx neon/;
+specialize qw/vp9_diamond_search_sad neon/;
 # # Apply temporal filter
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 4d7f475a01..aaf42a2a3f 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -2187,7 +2187,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { * The following 2 functions ('cal_nmvjointsadcost' and * * 'cal_nmvsadcosts') are used to calculate cost lookup tables * * used by 'vp9_diamond_search_sad'.
The C implementation of the * - * function is generic, but the AVX intrinsics optimised version * + * function is generic, but the NEON intrinsics optimised version * * relies on the following properties of the computed tables: * * For cal_nmvjointsadcost: * * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] * @@ -2196,7 +2196,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { * (Equal costs for both components) * * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] * * (Cost function is even) * - * If these do not hold, then the AVX optimised version of the * + * If these do not hold, then the NEON optimised version of the * * 'vp9_diamond_search_sad' function cannot be used as it is, in which * * case you can revert to using the C function instead. * ***********************************************************************/ diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c deleted file mode 100644 index 63c35df09e..0000000000 --- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c +++ /dev/null @@ -1,300 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#if defined(_MSC_VER) -#include -#endif -#include -#include - -#include "vpx_dsp/vpx_dsp_common.h" -#include "vp9/encoder/vp9_encoder.h" -#include "vpx_ports/mem.h" - -#ifdef __GNUC__ -#define LIKELY(v) __builtin_expect(v, 1) -#define UNLIKELY(v) __builtin_expect(v, 0) -#else -#define LIKELY(v) (v) -#define UNLIKELY(v) (v) -#endif - -static INLINE int_mv pack_int_mv(int16_t row, int16_t col) { - int_mv result; - result.as_mv.row = row; - result.as_mv.col = col; - return result; -} -/***************************************************************************** - * This function utilizes 3 properties of the cost function lookup tables, * - * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in * - * vp9_encoder.c. * - * For the joint cost: * - * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] * - * For the component costs: * - * - For all i: mvsadcost[0][i] == mvsadcost[1][i] * - * (Equal costs for both components) * - * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] * - * (Cost function is even) * - * If these do not hold, then this function cannot be used without * - * modification, in which case you can revert to using the C implementation, * - * which does not rely on these properties. 
* - *****************************************************************************/ -int vp9_diamond_search_sad_avx(const MACROBLOCK *x, - const search_site_config *cfg, MV *ref_mv, - uint32_t start_mv_sad, MV *best_mv, - int search_param, int sad_per_bit, int *num00, - const vp9_sad_fn_ptr_t *sad_fn_ptr, - const MV *center_mv) { - const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max); - const __m128i v_max_mv_w = _mm_set1_epi32((int)maxmv.as_int); - const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min); - const __m128i v_min_mv_w = _mm_set1_epi32((int)minmv.as_int); - - const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit); - - const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]); - const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]); - - // search_param determines the length of the initial step and hence the number - // of iterations. - // 0 = initial step (MAX_FIRST_STEP) pel - // 1 = (MAX_FIRST_STEP/2) pel, - // 2 = (MAX_FIRST_STEP/4) pel... - const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param]; - const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param]; - const int tot_steps = cfg->total_steps - search_param; - - const int_mv fcenter_mv = - pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); - const __m128i vfcmv = _mm_set1_epi32((int)fcenter_mv.as_int); - - const int ref_row = ref_mv->row; - const int ref_col = ref_mv->col; - - int_mv bmv = pack_int_mv(ref_row, ref_col); - int_mv new_bmv = bmv; - __m128i v_bmv_w = _mm_set1_epi32((int)bmv.as_int); - - const int what_stride = x->plane[0].src.stride; - const int in_what_stride = x->e_mbd.plane[0].pre[0].stride; - const uint8_t *const what = x->plane[0].src.buf; - const uint8_t *const in_what = - x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; - - // Work out the start point for the search - const uint8_t *best_address = in_what; - const uint8_t *new_best_address = best_address; -#if VPX_ARCH_X86_64 - __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address); -#else - __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address); -#endif - // Starting position - unsigned int best_sad = start_mv_sad; - int i, j, step; - - // Check the prerequisite cost function properties that are easy to check - // in an assert. See the function-level documentation for details on all - // prerequisites. 
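/* [Editor's note] What the deleted SIMD block below evaluated, four
   candidates at a time, is (given the three table properties above) the
   scalar cost sketched here, an illustrative reconstruction assuming vp9's
   mvsad_err_cost() semantics rather than quoted code:

     static unsigned mv_sad_cost(const MACROBLOCK *x, MV mv, MV fcenter,
                                 int sad_per_bit) {
       const int dr = abs(mv.row - fcenter.row);  // evenness: index by |d|,
       const int dc = abs(mv.col - fcenter.col);  // touching half the table
       const int joint = (dr == 0 && dc == 0) ? x->nmvjointsadcost[0]
                                              : x->nmvjointsadcost[1];  // [1]==[2]==[3]
       const unsigned cost =
           joint + x->nmvsadcost[0][dr] + x->nmvsadcost[0][dc];  // row/col tables equal
       return ROUND_POWER_OF_TWO(cost * sad_per_bit, VP9_PROB_COST_SHIFT);
     }

   This cost is added to each candidate's SAD before the minimum search. */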
- assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); - assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); - - *num00 = 0; - - for (i = 0, step = 0; step < tot_steps; step++) { - for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) { - __m128i v_sad_d, v_cost_d, v_outside_d, v_inside_d, v_diff_mv_w; -#if VPX_ARCH_X86_64 - __m128i v_blocka[2]; -#else - __m128i v_blocka[1]; -#endif - - // Compute the candidate motion vectors - const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i *)&ss_mv[i]); - const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w); - // Clamp them to the search bounds - __m128i v_these_mv_clamp_w = v_these_mv_w; - v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w); - v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w); - // The ones that did not change are inside the search area - v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w); - - // If none of them are inside, then move on - if (LIKELY(_mm_test_all_zeros(v_inside_d, v_inside_d))) { - continue; - } - - // The inverse mask indicates which of the MVs are outside - v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8((int8_t)0xff)); - // Shift right to keep the sign bit clear, we will use this later - // to set the cost to the maximum value. - v_outside_d = _mm_srli_epi32(v_outside_d, 1); - - // Compute the difference MV - v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv); - // We utilise the fact that the cost function is even, and use the - // absolute difference. This allows us to use unsigned indexes later - // and reduces cache pressure somewhat as only a half of the table - // is ever referenced. - v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w); - - // Compute the SIMD pointer offsets. - { -#if VPX_ARCH_X86_64 // sizeof(intptr_t) == 8 - // Load the offsets - __m128i v_bo10_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 0]); - __m128i v_bo32_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 2]); - // Set the ones falling outside to zero - v_bo10_q = _mm_and_si128(v_bo10_q, _mm_cvtepi32_epi64(v_inside_d)); - v_bo32_q = - _mm_and_si128(v_bo32_q, _mm_unpackhi_epi32(v_inside_d, v_inside_d)); - // Compute the candidate addresses - v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q); - v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q); -#else // VPX_ARCH_X86 // sizeof(intptr_t) == 4 - __m128i v_bo_d = _mm_loadu_si128((const __m128i *)&ss_os[i]); - v_bo_d = _mm_and_si128(v_bo_d, v_inside_d); - v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d); -#endif - } - - sad_fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], - in_what_stride, (uint32_t *)&v_sad_d); - - // Look up the component cost of the residual motion vector - { - const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0); - const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1); - const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2); - const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3); - const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4); - const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5); - const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6); - const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7); - - // Note: This is a use case for vpgather in AVX2 - const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0]; - const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1]; - const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2]; - const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3]; - - __m128i v_cost_10_d, v_cost_32_d; - 
v_cost_10_d = _mm_cvtsi32_si128(cost0); - v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1); - v_cost_32_d = _mm_cvtsi32_si128(cost2); - v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1); - v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d); - } - - // Now add in the joint cost - { - const __m128i v_sel_d = - _mm_cmpeq_epi32(v_diff_mv_w, _mm_setzero_si128()); - const __m128i v_joint_cost_d = - _mm_blendv_epi8(v_joint_cost_1_d, v_joint_cost_0_d, v_sel_d); - v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d); - } - - // Multiply by sad_per_bit - v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d); - // ROUND_POWER_OF_TWO(v_cost_d, VP9_PROB_COST_SHIFT) - v_cost_d = _mm_add_epi32(v_cost_d, - _mm_set1_epi32(1 << (VP9_PROB_COST_SHIFT - 1))); - v_cost_d = _mm_srai_epi32(v_cost_d, VP9_PROB_COST_SHIFT); - // Add the cost to the sad - v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d); - - // Make the motion vectors outside the search area have max cost - // by or'ing in the comparison mask, this way the minimum search won't - // pick them. - v_sad_d = _mm_or_si128(v_sad_d, v_outside_d); - - // Find the minimum value and index horizontally in v_sad_d - { - // Try speculatively on 16 bits, so we can use the minpos intrinsic - const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d); - const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w); - - uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0); - uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1); - - // If the local best value is not saturated, just use it, otherwise - // find the horizontal minimum again the hard way on 32 bits. - // This is executed rarely. - if (UNLIKELY(local_best_sad == 0xffff)) { - __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d; - - // Re-arrange the values in v_sad_d as follows: - // v_loval_d[0] = v_sad_d[0], v_loval_d[1] = v_sad_d[2] - // v_loval_d[2] = v_sad_d[1], v_loval_d[3] = v_sad_d[3] - // v_loidx_d stores the corresponding indices 0, 2, 1, 3 - // This re-arrangement is required to ensure that when there exists - // more than one minimum, the one with the least index is selected - v_loval_d = _mm_shuffle_epi32(v_sad_d, 0xd8); - v_loidx_d = _mm_set_epi32(3, 1, 2, 0); - - v_hival_d = _mm_srli_si128(v_loval_d, 8); - v_hiidx_d = _mm_srli_si128(v_loidx_d, 8); - - // Compare if v_sad_d[1] < v_sad_d[0], v_sad_d[3] < v_sad_d[2] - v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d); - - v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d); - v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d); - v_hival_d = _mm_srli_si128(v_loval_d, 4); - v_hiidx_d = _mm_srli_si128(v_loidx_d, 4); - - // min(v_sad_d[2], v_sad_d[3]) < min(v_sad_d[0], v_sad_d[1]) - v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d); - - v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d); - v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d); - - local_best_sad = _mm_extract_epi32(v_loval_d, 0); - local_best_idx = _mm_extract_epi32(v_loidx_d, 0); - } - - // Update the global minimum if the local minimum is smaller - if (LIKELY(local_best_sad < best_sad)) { -#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#endif - new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx]; -#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) -#pragma GCC diagnostic pop -#endif - new_best_address = ((const uint8_t **)v_blocka)[local_best_idx]; - - best_sad = local_best_sad; - } - } - } - - bmv = new_bmv; - best_address = 
new_best_address;
-
-    v_bmv_w = _mm_set1_epi32((int)bmv.as_int);
-#if VPX_ARCH_X86_64
-    v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
-#else
-    v_ba_d = _mm_set1_epi32((intptr_t)best_address);
-#endif
-
-    if (UNLIKELY(best_address == in_what)) {
-      (*num00)++;
-    }
-  }
-
-  *best_mv = bmv.as_mv;
-  return best_sad;
-}
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 3e5c0e1a7f..dd9475bcb0 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -117,7 +117,6 @@ VP9_CX_SRCS-$(HAVE_NEON) += encoder/vp9_temporal_filter_constants.h
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.c
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_quantize_avx2.c
-VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_diamond_search_sad_neon.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c

From 19f3a754d62dcd21e400a3c715f2ed4235d1c4ec Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang
Date: Mon, 26 Jun 2023 14:57:53 -0700
Subject: [PATCH 750/926] Fix a bug in vpx_hadamard_32x32_neon()

A right shift by 2 is equivalent to two halving operations if there is
no addition or subtraction between the two halving operations.

Note: Since vhaddq_s16() and vhsubq_s16() have 17-bit intermediate
precision, the Neon code doesn't need to go to int32_t as was done in
https://chromium-review.googlesource.com/c/webm/libvpx/+/4604169.

Change-Id: Ibe0691cde0fd3b94ee7c497845ba459d30d503b0
---
 vpx_dsp/arm/hadamard_neon.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/vpx_dsp/arm/hadamard_neon.c b/vpx_dsp/arm/hadamard_neon.c
index f6b6d7e3ce..f5a044be4d 100644
--- a/vpx_dsp/arm/hadamard_neon.c
+++ b/vpx_dsp/arm/hadamard_neon.c
@@ -138,15 +138,15 @@ void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride,
   const int16x8_t a2 = load_tran_low_to_s16q(coeff + 512);
   const int16x8_t a3 = load_tran_low_to_s16q(coeff + 768);

-  const int16x8_t b0 = vhaddq_s16(a0, a1);
-  const int16x8_t b1 = vhsubq_s16(a0, a1);
-  const int16x8_t b2 = vhaddq_s16(a2, a3);
-  const int16x8_t b3 = vhsubq_s16(a2, a3);
+  const int16x8_t b0 = vshrq_n_s16(vhaddq_s16(a0, a1), 1);
+  const int16x8_t b1 = vshrq_n_s16(vhsubq_s16(a0, a1), 1);
+  const int16x8_t b2 = vshrq_n_s16(vhaddq_s16(a2, a3), 1);
+  const int16x8_t b3 = vshrq_n_s16(vhsubq_s16(a2, a3), 1);

-  const int16x8_t c0 = vhaddq_s16(b0, b2);
-  const int16x8_t c1 = vhaddq_s16(b1, b3);
-  const int16x8_t c2 = vhsubq_s16(b0, b2);
-  const int16x8_t c3 = vhsubq_s16(b1, b3);
+  const int16x8_t c0 = vaddq_s16(b0, b2);
+  const int16x8_t c1 = vaddq_s16(b1, b3);
+  const int16x8_t c2 = vsubq_s16(b0, b2);
+  const int16x8_t c3 = vsubq_s16(b1, b3);

   store_s16q_to_tran_low(coeff + 0, c0);
   store_s16q_to_tran_low(coeff + 256, c1);

From 885ecc7c667eac3521d4558b2be554d96c95da41 Mon Sep 17 00:00:00 2001
From: James Zern
Date: Fri, 23 Jun 2023 19:27:26 -0700
Subject: [PATCH 751/926] vp9_dx_iface: fix leaks on init_decoder() failure

If any allocations fail in init_decoder() and the application continues
to call vpx_codec_decode(), some of the allocations would be orphaned or
the decoder would be left in a partially initialized state.

Found with vpx_dec_fuzzer_vp9 & Nallocfuzz
(https://github.com/catenacyber/nallocfuzz).
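The change follows the usual unwind-on-partial-failure idiom: each
allocation made before the failing step is released and its pointer
cleared before the error is returned, so a retry starts from a
consistent, fully-uninitialized state. A minimal sketch of the idiom
(hypothetical names, not the literal change below):

    #include <stdlib.h>

    struct dec_ctx { void *buffer_pool; void *pbi; };

    /* Returns 0 on success; on failure nothing stays allocated. */
    static int init_ctx(struct dec_ctx *ctx) {
      ctx->buffer_pool = malloc(64);
      if (ctx->buffer_pool == NULL) return -1; /* nothing to unwind yet */
      ctx->pbi = malloc(64);
      if (ctx->pbi == NULL) {
        free(ctx->buffer_pool);  /* unwind the earlier success */
        ctx->buffer_pool = NULL; /* no dangling pointer for a retry */
        return -1;
      }
      return 0;
    }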
Bug: webm:1807 Change-Id: I44f662526d715ecaeac6180070af40672cd42611 --- vp9/vp9_dx_iface.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index 20e71cc227..a242c776cd 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -256,6 +256,7 @@ static void set_ppflags(const vpx_codec_alg_priv_t *ctx, vp9_ppflags_t *flags) { } while (0) static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { + vpx_codec_err_t res; ctx->last_show_frame = -1; ctx->need_resync = 1; ctx->flushed = 0; @@ -265,6 +266,8 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { ctx->pbi = vp9_decoder_create(ctx->buffer_pool); if (ctx->pbi == NULL) { + vpx_free(ctx->buffer_pool); + ctx->buffer_pool = NULL; set_error_detail(ctx, "Failed to allocate decoder"); return VPX_CODEC_MEM_ERROR; } @@ -282,7 +285,14 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { if (!ctx->postproc_cfg_set && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) set_default_ppflags(&ctx->postproc_cfg); - return init_buffer_callbacks(ctx); + res = init_buffer_callbacks(ctx); + if (res != VPX_CODEC_OK) { + vpx_free(ctx->buffer_pool); + ctx->buffer_pool = NULL; + vp9_decoder_remove(ctx->pbi); + ctx->pbi = NULL; + } + return res; } static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx, From 02ab555e992c191e5c509ed87b3cc48ed915b447 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 26 Jun 2023 19:06:51 -0700 Subject: [PATCH 752/926] vp9_alloccommon: clear allocation sizes on free This fixes reallocations (and avoids potential crashes) if any allocations fails and the application continues to call vpx_codec_decode(). Found with vpx_dec_fuzzer_vp9 & Nallocfuzz (https://github.com/catenacyber/nallocfuzz). Bug: webm:1807 Change-Id: If5dc96b73c02efc94ec84c25eb50d10ad6b645a6 --- vp9/common/vp9_alloccommon.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index faad657a08..e53883f621 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -65,6 +65,7 @@ static void free_seg_map(VP9_COMMON *cm) { vpx_free(cm->seg_map_array[i]); cm->seg_map_array[i] = NULL; } + cm->seg_map_alloc_size = 0; cm->current_frame_seg_map = NULL; cm->last_frame_seg_map = NULL; @@ -108,6 +109,7 @@ void vp9_free_context_buffers(VP9_COMMON *cm) { cm->above_context = NULL; vpx_free(cm->above_seg_context); cm->above_seg_context = NULL; + cm->above_context_alloc_cols = 0; vpx_free(cm->lf.lfm); cm->lf.lfm = NULL; } From a31e818ef8ae3b1b791187bca4fbca1a5c191736 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 26 Jun 2023 19:09:24 -0700 Subject: [PATCH 753/926] vp9_decodeframe,init_mt: free tile_workers on alloc failure This avoids a crash if any of the thread allocations fail and the application continues to call vpx_codec_decode(). Previously num_tile_workers would be non-zero, but not equal to num_threads, which would cause a crash during later thread management. Found with vpx_dec_fuzzer_vp9 & Nallocfuzz (https://github.com/catenacyber/nallocfuzz). 
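This change and the previous one guard the same invariant: any counter
or size field that records how much was successfully allocated must be
reset together with the storage it describes, or a later call will
trust stale bookkeeping. A minimal sketch of the failure mode
(hypothetical names, not the libvpx code):

    #include <stdlib.h>

    static int *buf;
    static size_t buf_size; /* must stay in sync with buf */

    static int ensure_capacity(size_t n) {
      if (n <= buf_size) return 0; /* trusts buf_size */
      free(buf);
      buf = malloc(n * sizeof(*buf));
      if (buf == NULL) {
        /* Without this reset, a later call with a smaller n would
         * return 0 and the caller would dereference a NULL buf. */
        buf_size = 0;
        return -1;
      }
      buf_size = n;
      return 0;
    }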
Bug: webm:1807 Change-Id: Ie3faf7b36764aebedac0924acb6e4cb7545aec7d --- vp9/decoder/vp9_decodeframe.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 6ec1d9f668..c5892156f4 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -2293,6 +2293,11 @@ static INLINE void init_mt(VP9Decoder *pbi) { winterface->init(worker); if (n < num_threads - 1 && !winterface->reset(worker)) { + do { + winterface->end(&pbi->tile_workers[pbi->num_tile_workers - 1]); + } while (--pbi->num_tile_workers != 0); + vpx_free(pbi->tile_workers); + pbi->tile_workers = NULL; vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Tile decoder thread creation failed"); } From 263ddc9e384fc747714210df1866b1200227dee1 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 26 Jun 2023 19:18:55 -0700 Subject: [PATCH 754/926] vp8_decode: clear stream info on decoder create failure This fixes a crash if the application continues to call vpx_codec_decode(). Found with vpx_dec_fuzzer_vp8 & Nallocfuzz (https://github.com/catenacyber/nallocfuzz). Bug: webm:1807 Change-Id: I9867f5fc3d1163026f521a9609d3cbbc00568d1d --- vp8/vp8_dx_iface.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index fdc0b35dd4..8f73d9f83f 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -350,7 +350,14 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, } res = vp8_create_decoder_instances(&ctx->yv12_frame_buffers, &oxcf); - if (res == VPX_CODEC_OK) ctx->decoder_init = 1; + if (res == VPX_CODEC_OK) { + ctx->decoder_init = 1; + } else { + /* on failure clear the cached resolution to ensure a full + * reallocation is attempted on resync. */ + ctx->si.w = 0; + ctx->si.h = 0; + } } /* Set these even if already initialized. The caller may have changed the From a166c52d3a2e72d0fe4dbc8909523c6ad8fbdfb2 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 26 Jun 2023 19:22:00 -0700 Subject: [PATCH 755/926] vp8_decode: only remove threads on thread create failure This fixes a crash if the application continues to call vpx_codec_decode(). Previously the decoder instance would be freed, causing a crash when attempting to access it with restart_threads=1. Found with vpx_dec_fuzzer_vp8 & Nallocfuzz (https://github.com/catenacyber/nallocfuzz). Bug: webm:1807 Change-Id: Ic084894b776729bb1572f747082cef002f0832a8 --- vp8/vp8_dx_iface.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index 8f73d9f83f..ed53a8625b 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -306,13 +306,11 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, #if CONFIG_MULTITHREAD if (!res && ctx->restart_threads) { - struct frame_buffers *fb = &ctx->yv12_frame_buffers; VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0]; VP8_COMMON *const pc = &pbi->common; if (setjmp(pbi->common.error.jmp)) { pbi->common.error.setjmp = 0; - vp8_remove_decoder_instances(fb); - vp8_zero(fb->pbi); + vp8_decoder_remove_threads(pbi); vpx_clear_system_state(); return VPX_CODEC_ERROR; } From 44a5eaa3ba890849fa7db14b8b8cfb1bac876c80 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 26 Jun 2023 19:25:56 -0700 Subject: [PATCH 756/926] vp8_decode: fix keyframe resync after decode error This fixes a crash if the application continues to call vpx_codec_decode(). 
Previously a non-keyframe could cause a crash if the decoder failed before fully initializing due to an allocation failure. The stream info and frame resolution would be 0, skipping an allocation. Found with vpx_dec_fuzzer_vp8 & Nallocfuzz (https://github.com/catenacyber/nallocfuzz). Bug: webm:1807 Change-Id: I1c17302f4d3a488ba3b4eefe0bf53853dc558bc1 --- vp8/vp8_dx_iface.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index ed53a8625b..9e622e3b97 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -162,7 +162,10 @@ static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data, si->h = (clear[8] | (clear[9] << 8)) & 0x3fff; /*printf("w=%d, h=%d\n", si->w, si->h);*/ - if (!(si->h && si->w)) res = VPX_CODEC_CORRUPT_FRAME; + if (!(si->h && si->w)) { + si->w = si->h = 0; + res = VPX_CODEC_CORRUPT_FRAME; + } } else { res = VPX_CODEC_UNSUP_BITSTREAM; } @@ -301,6 +304,16 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, } if (!ctx->decoder_init && !ctx->si.is_kf) res = VPX_CODEC_UNSUP_BITSTREAM; + if (!res && ctx->decoder_init && w == 0 && h == 0 && ctx->si.h == 0 && + ctx->si.w == 0) { + VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0]; + assert(pbi != NULL); + assert(!pbi->common.error.setjmp); + res = VPX_CODEC_CORRUPT_FRAME; + vpx_internal_error(&pbi->common.error, res, + "Keyframe / intra-only frame required to reset decoder" + " state"); + } if ((ctx->si.h != h) || (ctx->si.w != w)) resolution_change = 1; From 3bd65ac7769f68e9319ad1ab5fe7c664121d373b Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 28 Jun 2023 10:04:21 -0400 Subject: [PATCH 757/926] vp9 firstpass stats in a separate header Change-Id: If91c5c74c71affc48eb858beb314a6c194b14023 --- vp9/encoder/vp9_firstpass.h | 31 +----------------- vp9/encoder/vp9_firstpass_stats.h | 54 +++++++++++++++++++++++++++++++ vp9/vp9cx.mk | 1 + 3 files changed, 56 insertions(+), 30 deletions(-) create mode 100644 vp9/encoder/vp9_firstpass_stats.h diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index 3ba336b34f..a19b04db74 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -14,6 +14,7 @@ #include #include "vp9/common/vp9_onyxc_int.h" +#include "vp9/encoder/vp9_firstpass_stats.h" #include "vp9/encoder/vp9_lookahead.h" #include "vp9/encoder/vp9_ratectrl.h" @@ -58,36 +59,6 @@ typedef struct { int new_mv_count; } FIRSTPASS_DATA; -typedef struct { - double frame; - double weight; - double intra_error; - double coded_error; - double sr_coded_error; - double frame_noise_energy; - double pcnt_inter; - double pcnt_motion; - double pcnt_second_ref; - double pcnt_neutral; - double pcnt_intra_low; // Coded intra but low variance - double pcnt_intra_high; // Coded intra high variance - double intra_skip_pct; - double intra_smooth_pct; // % of blocks that are smooth - double inactive_zone_rows; // Image mask rows top and bottom. - double inactive_zone_cols; // Image mask columns at left and right edges. 
- double MVr; - double mvr_abs; - double MVc; - double mvc_abs; - double MVrv; - double MVcv; - double mv_in_out_count; - double duration; - double count; - double new_mv_count; - int64_t spatial_layer_id; -} FIRSTPASS_STATS; - typedef enum { KF_UPDATE = 0, LF_UPDATE = 1, diff --git a/vp9/encoder/vp9_firstpass_stats.h b/vp9/encoder/vp9_firstpass_stats.h new file mode 100644 index 0000000000..01928e7816 --- /dev/null +++ b/vp9/encoder/vp9_firstpass_stats.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_FIRSTPASS_STATS_H_ +#define VPX_VP9_ENCODER_VP9_FIRSTPASS_STATS_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + double frame; + double weight; + double intra_error; + double coded_error; + double sr_coded_error; + double frame_noise_energy; + double pcnt_inter; + double pcnt_motion; + double pcnt_second_ref; + double pcnt_neutral; + double pcnt_intra_low; // Coded intra but low variance + double pcnt_intra_high; // Coded intra high variance + double intra_skip_pct; + double intra_smooth_pct; // % of blocks that are smooth + double inactive_zone_rows; // Image mask rows top and bottom. + double inactive_zone_cols; // Image mask columns at left and right edges. + double MVr; + double mvr_abs; + double MVc; + double mvc_abs; + double MVrv; + double MVcv; + double mv_in_out_count; + double duration; + double count; + double new_mv_count; + int64_t spatial_layer_id; +} FIRSTPASS_STATS; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_FIRSTPASS_STATS_H_ diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index dd9475bcb0..44790ef6a4 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -40,6 +40,7 @@ VP9_CX_SRCS-yes += encoder/vp9_encodemb.h VP9_CX_SRCS-yes += encoder/vp9_encodemv.h VP9_CX_SRCS-yes += encoder/vp9_extend.h VP9_CX_SRCS-yes += encoder/vp9_firstpass.h +VP9_CX_SRCS-yes += encoder/vp9_firstpass_stats.h VP9_CX_SRCS-yes += encoder/vp9_frame_scale.c VP9_CX_SRCS-yes += encoder/vp9_job_queue.h VP9_CX_SRCS-yes += encoder/vp9_lookahead.c From 3ecba398023dea731520cc1489159bfd0ad0a200 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 2 Jun 2023 18:49:00 -0700 Subject: [PATCH 758/926] Fix Clang -Wunreachable-code-aggressive warnings Based on the change in libaom: fe36011455 Fix Clang -Wunreachable-code-aggressive warnings Clang's -Wunreachable-code-aggressive flag enables several warning flags such as -Wunreachable-code-break and -Wunreachable-code-return. Chrome's build system enables -Wunreachable-code-aggressive (in build/config/compiler/BUILD.gn), so it would be good if libvpx could be compiled without -Wunreachable-code-aggressive warnings. This requires the VPX_NO_RETURN macro be defined correctly for all the compilers we support, otherwise some compilers may warn about missing return statements after a die() or fatal() call (which does not return). 
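The interaction is easiest to see at a call site. A minimal sketch
(generic attribute macro standing in for the project's VPX_NO_RETURN;
not the libvpx sources verbatim):

    #include <stdio.h>
    #include <stdlib.h>

    #if defined(__GNUC__)
    #define NO_RETURN __attribute__((noreturn))
    #else
    #define NO_RETURN /* without a real definition here, some
                         compilers warn about a missing return below */
    #endif

    NO_RETURN void die(const char *msg) {
      fprintf(stderr, "%s\n", msg);
      exit(EXIT_FAILURE);
    }

    int parse_positive(int v) {
      if (v > 0) return v;
      die("invalid value");
      /* With the attribute, the compiler knows control cannot reach
       * this point, so no missing-return warning is issued; adding a
       * "return 0;" here would instead trip
       * -Wunreachable-code-return. */
    }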
Change-Id: I0c069133af45a7a61759538b6d74c681ea087dcd --- args.c | 3 --- configure | 2 +- examples/vp9_spatial_svc_encoder.c | 1 - test/partial_idct_test.cc | 2 +- test/vp9_ratectrl_rtc_test.cc | 2 +- tools_common.c | 10 +++++----- vp8/decoder/decodeframe.c | 2 +- vp8/encoder/firstpass.c | 10 +++++----- vp9/encoder/vp9_ext_ratectrl.c | 1 - vp9/encoder/vp9_firstpass.c | 5 +---- vp9/encoder/vp9_mbgraph.c | 25 +++++++++---------------- vp9/simple_encode.cc | 1 - vpxdec.c | 2 +- 13 files changed, 25 insertions(+), 41 deletions(-) diff --git a/args.c b/args.c index 4afb9c021a..0a9631e1f4 100644 --- a/args.c +++ b/args.c @@ -135,7 +135,6 @@ unsigned int arg_parse_uint(const struct arg *arg) { } die("Option %s: Invalid character '%c'\n", arg->name, *endptr); - return 0; } int arg_parse_int(const struct arg *arg) { @@ -152,7 +151,6 @@ int arg_parse_int(const struct arg *arg) { } die("Option %s: Invalid character '%c'\n", arg->name, *endptr); - return 0; } struct vpx_rational { @@ -209,7 +207,6 @@ int arg_parse_enum(const struct arg *arg) { if (!strcmp(arg->val, listptr->name)) return listptr->val; die("Option %s: Invalid value '%s'\n", arg->name, arg->val); - return 0; } int arg_parse_enum_or_int(const struct arg *arg) { diff --git a/configure b/configure index b73436b47e..67bba946f6 100755 --- a/configure +++ b/configure @@ -651,7 +651,7 @@ process_toolchain() { check_add_cflags -Wmissing-prototypes check_add_cflags -Wshadow check_add_cflags -Wuninitialized - check_add_cflags -Wunreachable-code-loop-increment + check_add_cflags -Wunreachable-code-aggressive check_add_cflags -Wunused check_add_cflags -Wextra # check_add_cflags also adds to cxxflags. gtest does not do well with diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c index 9d37ed0244..998e4fb20d 100644 --- a/examples/vp9_spatial_svc_encoder.c +++ b/examples/vp9_spatial_svc_encoder.c @@ -316,7 +316,6 @@ static void parse_command_line(int argc, const char **argv_, break; default: die("Error: Invalid bit depth selected (%d)\n", enc_cfg->g_bit_depth); - break; } #endif // CONFIG_VP9_HIGHBITDEPTH } else if (arg_match(&arg, &dropframe_thresh_arg, argi)) { diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc index 7eb888a586..b7c0c050af 100644 --- a/test/partial_idct_test.cc +++ b/test/partial_idct_test.cc @@ -76,7 +76,7 @@ class PartialIDctTest : public ::testing::TestWithParam { case TX_8X8: size_ = 8; break; case TX_16X16: size_ = 16; break; case TX_32X32: size_ = 32; break; - default: FAIL() << "Wrong Size!"; break; + default: FAIL() << "Wrong Size!"; } // Randomize stride_ to a value less than or equal to 1024 diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc index 5abda1290a..8422df074b 100644 --- a/test/vp9_ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -227,7 +227,7 @@ class RcInterfaceSvcTest rc_cfg_.layer_target_bitrate[4] = 0; rc_cfg_.layer_target_bitrate[5] = 0; ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); - } else if (0 && video->frame() == 280) { + } else if (/*DISABLES CODE*/ (0) && video->frame() == 280) { // TODO(marpan): Re-enable this going back up when issue is fixed. // Go back up to 3 spatial layers. // Update the encoder config: use the original bitrates. 
diff --git a/tools_common.c b/tools_common.c index cbecfbb419..0de15558dd 100644 --- a/tools_common.c +++ b/tools_common.c @@ -375,7 +375,7 @@ static void highbd_img_upshift(vpx_image_t *dst, vpx_image_t *src, case VPX_IMG_FMT_I42216: case VPX_IMG_FMT_I44416: case VPX_IMG_FMT_I44016: break; - default: fatal("Unsupported image conversion"); break; + default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -411,7 +411,7 @@ static void lowbd_img_upshift(vpx_image_t *dst, vpx_image_t *src, case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I444: case VPX_IMG_FMT_I440: break; - default: fatal("Unsupported image conversion"); break; + default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -452,7 +452,7 @@ void vpx_img_truncate_16_to_8(vpx_image_t *dst, vpx_image_t *src) { case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I444: case VPX_IMG_FMT_I440: break; - default: fatal("Unsupported image conversion"); break; + default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -487,7 +487,7 @@ static void highbd_img_downshift(vpx_image_t *dst, vpx_image_t *src, case VPX_IMG_FMT_I42216: case VPX_IMG_FMT_I44416: case VPX_IMG_FMT_I44016: break; - default: fatal("Unsupported image conversion"); break; + default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -521,7 +521,7 @@ static void lowbd_img_downshift(vpx_image_t *dst, vpx_image_t *src, case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I444: case VPX_IMG_FMT_I440: break; - default: fatal("Unsupported image conversion"); break; + default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) { int w = src->d_w; diff --git a/vp8/decoder/decodeframe.c b/vp8/decoder/decodeframe.c index d014cf9667..729cd9980f 100644 --- a/vp8/decoder/decodeframe.c +++ b/vp8/decoder/decodeframe.c @@ -1167,7 +1167,7 @@ int vp8_decode_frame(VP8D_COMP *pbi) { if (pbi->ec_active && xd->corrupted) pc->refresh_last_frame = 1; #endif - if (0) { + if (/*DISABLES CODE*/ (0)) { FILE *z = fopen("decodestats.stt", "a"); fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n", pc->current_video_frame, pc->frame_type, pc->refresh_golden_frame, pc->refresh_alt_ref_frame, diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index ff088aa969..5f372912fe 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -822,7 +822,7 @@ void vp8_first_pass(VP8_COMP *cpi) { } /* use this to see what the first pass reconstruction looks like */ - if (0) { + if (/*DISABLES CODE*/ (0)) { char filename[512]; FILE *recon_file; sprintf(filename, "enc%04d.yuv", (int)cm->current_video_frame); @@ -1038,7 +1038,7 @@ static int estimate_cq(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats, double clip_iifactor; int overhead_bits_per_mb; - if (0) { + if (/*DISABLES CODE*/ (0)) { FILE *f = fopen("epmp.stt", "a"); fprintf(f, "%10.2f\n", err_per_mb); fclose(f); @@ -1230,7 +1230,7 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, Q++; } - if (0) { + if (/*DISABLES CODE*/ (0)) { FILE *f = fopen("estkf_q.stt", "a"); fprintf(f, "%8d %8d %8d %8.2f %8.3f %8.2f %8.3f %8.3f %8.3f %8d\n", cpi->common.current_video_frame, bits_per_mb_at_this_q, @@ -3047,7 +3047,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { (int)((projected_bits_perframe - av_bits_per_frame) * cpi->twopass.frames_to_key)); - if (0) { + if (/*DISABLES CODE*/ (0)) { FILE *f = fopen("Subsamle.stt", "a"); 
fprintf(f, " %8d %8d %8d %8d %12.0f %8d %8d %8d\n", cpi->common.current_video_frame, kf_q, cpi->common.horiz_scale, @@ -3121,7 +3121,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio, (int)bits_per_frame, group_iiratio); - if (0) { + if (/*DISABLES CODE*/ (0)) { FILE *f = fopen("Subsamle.stt", "a"); fprintf( f, "******** %8d %8d %8d %12.0f %8d %8d %8d\n", kf_q, diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index 1d440442b5..b08fd63c3c 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -131,7 +131,6 @@ static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) { default: fprintf(stderr, "Unsupported update_type %d\n", update_type); abort(); - return 1; } } diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 42e935740a..de7a3829ab 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -364,7 +364,6 @@ static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, case BLOCK_8X16: return vpx_highbd_8_mse8x16; default: return vpx_highbd_8_mse16x16; } - break; case 10: switch (bsize) { case BLOCK_8X8: return vpx_highbd_10_mse8x8; @@ -372,7 +371,6 @@ static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, case BLOCK_8X16: return vpx_highbd_10_mse8x16; default: return vpx_highbd_10_mse16x16; } - break; case 12: switch (bsize) { case BLOCK_8X8: return vpx_highbd_12_mse8x8; @@ -380,7 +378,6 @@ static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, case BLOCK_8X16: return vpx_highbd_12_mse8x16; default: return vpx_highbd_12_mse16x16; } - break; } } @@ -1508,7 +1505,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { } // Use this to see what the first pass reconstruction looks like. - if (0) { + if (/*DISABLES CODE*/ (0)) { char filename[512]; FILE *recon_file; snprintf(filename, sizeof(filename), "enc%04d.yuv", diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c index 8b6521d915..2f20a8fe6d 100644 --- a/vp9/encoder/vp9_mbgraph.c +++ b/vp9/encoder/vp9_mbgraph.c @@ -333,23 +333,16 @@ static void separate_arf_mbs(VP9_COMP *cpi) { } } - // Only bother with segmentation if over 10% of the MBs in static segment - // if ( ncnt[1] && (ncnt[0] / ncnt[1] < 10) ) - if (1) { - // Note % of blocks that are marked as static - if (cm->MBs) - cpi->static_mb_pct = (ncnt[1] * 100) / (cm->mi_rows * cm->mi_cols); - - // This error case should not be reachable as this function should - // never be called with the common data structure uninitialized. - else - cpi->static_mb_pct = 0; - - vp9_enable_segmentation(&cm->seg); - } else { + // Note % of blocks that are marked as static + if (cm->MBs) + cpi->static_mb_pct = (ncnt[1] * 100) / (cm->mi_rows * cm->mi_cols); + + // This error case should not be reachable as this function should + // never be called with the common data structure uninitialized. 
+ else cpi->static_mb_pct = 0; - vp9_disable_segmentation(&cm->seg); - } + + vp9_enable_segmentation(&cm->seg); // Free localy allocated storage vpx_free(arf_not_zz); diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index 2e2a3746e7..2e6f9a4513 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -143,7 +143,6 @@ get_frame_type_from_update_type(FRAME_UPDATE_TYPE update_type) { default: fprintf(stderr, "Unsupported update_type %d\n", update_type); abort(); - return kFrameTypeInter; } } diff --git a/vpxdec.c b/vpxdec.c index 54a41f0799..bfe6c1d6ba 100644 --- a/vpxdec.c +++ b/vpxdec.c @@ -446,7 +446,7 @@ static void generate_filename(const char *pattern, char *out, size_t q_len, case '7': snprintf(q, q_len - 1, "%07d", frame_in); break; case '8': snprintf(q, q_len - 1, "%08d", frame_in); break; case '9': snprintf(q, q_len - 1, "%09d", frame_in); break; - default: die("Unrecognized pattern %%%c\n", p[1]); break; + default: die("Unrecognized pattern %%%c\n", p[1]); } pat_len = strlen(q); From dc26707f80686031905e5f013daf37062e61f6d2 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 28 Jun 2023 12:26:32 -0700 Subject: [PATCH 759/926] delete some dead code follow-up to: 3ecba3980 Fix Clang -Wunreachable-code-aggressive warnings Change-Id: I364312987bc838c69c010cce024bd3d62a918417 --- vp8/decoder/decodeframe.c | 8 ------ vp8/encoder/firstpass.c | 54 ------------------------------------- vp9/encoder/vp9_firstpass.c | 16 ----------- 3 files changed, 78 deletions(-) diff --git a/vp8/decoder/decodeframe.c b/vp8/decoder/decodeframe.c index 729cd9980f..af9a98c1de 100644 --- a/vp8/decoder/decodeframe.c +++ b/vp8/decoder/decodeframe.c @@ -1167,14 +1167,6 @@ int vp8_decode_frame(VP8D_COMP *pbi) { if (pbi->ec_active && xd->corrupted) pc->refresh_last_frame = 1; #endif - if (/*DISABLES CODE*/ (0)) { - FILE *z = fopen("decodestats.stt", "a"); - fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n", pc->current_video_frame, - pc->frame_type, pc->refresh_golden_frame, pc->refresh_alt_ref_frame, - pc->refresh_last_frame, pc->base_qindex); - fclose(z); - } - { pbi->independent_partitions = 1; diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index 5f372912fe..4443f5e7cd 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -821,22 +821,6 @@ void vp8_first_pass(VP8_COMP *cpi) { vp8_yv12_copy_frame(lst_yv12, gld_yv12); } - /* use this to see what the first pass reconstruction looks like */ - if (/*DISABLES CODE*/ (0)) { - char filename[512]; - FILE *recon_file; - sprintf(filename, "enc%04d.yuv", (int)cm->current_video_frame); - - if (cm->current_video_frame == 0) { - recon_file = fopen(filename, "wb"); - } else { - recon_file = fopen(filename, "ab"); - } - - (void)fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file); - fclose(recon_file); - } - cm->current_video_frame++; } extern const int vp8_bits_per_mb[2][QINDEX_RANGE]; @@ -1038,12 +1022,6 @@ static int estimate_cq(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats, double clip_iifactor; int overhead_bits_per_mb; - if (/*DISABLES CODE*/ (0)) { - FILE *f = fopen("epmp.stt", "a"); - fprintf(f, "%10.2f\n", err_per_mb); - fclose(f); - } - target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20)) ? 
(512 * section_target_bandwitdh) / num_mbs : 512 * (section_target_bandwitdh / num_mbs); @@ -1230,17 +1208,6 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, Q++; } - if (/*DISABLES CODE*/ (0)) { - FILE *f = fopen("estkf_q.stt", "a"); - fprintf(f, "%8d %8d %8d %8.2f %8.3f %8.2f %8.3f %8.3f %8.3f %8d\n", - cpi->common.current_video_frame, bits_per_mb_at_this_q, - target_norm_bits_per_mb, err_per_mb, err_correction_factor, - current_spend_ratio, group_iiratio, iiratio_correction_factor, - (double)cpi->buffer_level / (double)cpi->oxcf.optimal_buffer_level, - Q); - fclose(f); - } - return Q; } @@ -3047,16 +3014,6 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { (int)((projected_bits_perframe - av_bits_per_frame) * cpi->twopass.frames_to_key)); - if (/*DISABLES CODE*/ (0)) { - FILE *f = fopen("Subsamle.stt", "a"); - fprintf(f, " %8d %8d %8d %8d %12.0f %8d %8d %8d\n", - cpi->common.current_video_frame, kf_q, cpi->common.horiz_scale, - cpi->common.vert_scale, kf_group_err / cpi->twopass.frames_to_key, - (int)(cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key), - new_height, new_width); - fclose(f); - } - /* The trigger for spatial resampling depends on the various * parameters such as whether we are streaming (CBR) or VBR. */ @@ -3120,17 +3077,6 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { */ kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio, (int)bits_per_frame, group_iiratio); - - if (/*DISABLES CODE*/ (0)) { - FILE *f = fopen("Subsamle.stt", "a"); - fprintf( - f, "******** %8d %8d %8d %12.0f %8d %8d %8d\n", kf_q, - cpi->common.horiz_scale, cpi->common.vert_scale, - kf_group_err / cpi->twopass.frames_to_key, - (int)(cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key), - new_height, new_width); - fclose(f); - } } } diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index de7a3829ab..bd203f1e21 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -1504,22 +1504,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { cm->ref_frame_map[cpi->lst_fb_idx]); } - // Use this to see what the first pass reconstruction looks like. - if (/*DISABLES CODE*/ (0)) { - char filename[512]; - FILE *recon_file; - snprintf(filename, sizeof(filename), "enc%04d.yuv", - (int)cm->current_video_frame); - - if (cm->current_video_frame == 0) - recon_file = fopen(filename, "wb"); - else - recon_file = fopen(filename, "ab"); - - (void)fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file); - fclose(recon_file); - } - // In the first pass, every frame is considered as a show frame. update_frame_indexes(cm, /*show_frame=*/1); if (cpi->use_svc) vp9_inc_frame_in_layer(cpi); From 3ef9934789c0b1de47c72abd7362072ba89c0d8b Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Wed, 28 Jun 2023 16:09:36 -0700 Subject: [PATCH 760/926] Fix a bug in vpx_highbd_hadamard_32x32_neon(). This CL is the highbd version of https://chromium-review.googlesource.com/c/webm/libvpx/+/4646573. The bug is caused by the incorrect assumption that (a / 2) + (b / 2) == (a + b) / 2 and (a / 2) - (b / 2) == (a - b) / 2. Also fix the Rand() inputs to Hadamard functions in unit tests. 
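A two-line example shows why the assumption fails under truncating
integer division (a sketch, not from the sources):

    #include <stdio.h>

    int main(void) {
      int a = 3, b = 5;
      /* halve-then-add vs. add-then-halve disagree: prints "3 4" */
      printf("%d %d\n", a / 2 + b / 2, (a + b) / 2);
      return 0;
    }

This is why the fix performs the full-precision additions and
subtractions first and applies a single explicit shift where the C
reference does, instead of halving at each butterfly stage.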
This CL ports the following libaom CLs to libvpx: https://aomedia-review.googlesource.com/c/aom/+/177101 https://aomedia-review.googlesource.com/c/aom/+/177241 Change-Id: Ic20e7684eab5d6507417fa2b75e572064d37ad2c --- test/acm_random.h | 15 +++++---------- test/hadamard_test.cc | 20 ++++++++++++++++++-- vpx_dsp/arm/highbd_hadamard_neon.c | 16 ++++++++-------- 3 files changed, 31 insertions(+), 20 deletions(-) diff --git a/test/acm_random.h b/test/acm_random.h index c7122b9338..e3520c47de 100644 --- a/test/acm_random.h +++ b/test/acm_random.h @@ -45,16 +45,11 @@ class ACMRandom { return static_cast(random_.Generate(65536)); } - int16_t Rand13Signed() { - // Use 13 bits: values between 4095 and -4096. - const uint32_t value = random_.Generate(8192); - return static_cast(value) - 4096; - } - - int16_t Rand9Signed() { - // Use 9 bits: values between 255 (0x0FF) and -256 (0x100). - const uint32_t value = random_.Generate(512); - return static_cast(value) - 256; + uint16_t Rand12() { + const uint32_t value = + random_.Generate(testing::internal::Random::kMaxRange); + // There's a bit more entropy in the upper bits of this implementation. + return (value >> 19) & 0xfff; } uint8_t Rand8() { diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc index 0de6622e20..9ba898b519 100644 --- a/test/hadamard_test.cc +++ b/test/hadamard_test.cc @@ -137,6 +137,12 @@ class HadamardTestBase : public ::testing::TestWithParam { rnd_.Reset(ACMRandom::DeterministicSeed()); } + // The Rand() function generates values in the range [-((1 << BitDepth) - 1), + // (1 << BitDepth) - 1]. This is because the input to the Hadamard transform + // is the residual pixel, which is defined as 'source pixel - predicted + // pixel'. Source pixel and predicted pixel take values in the range + // [0, (1 << BitDepth) - 1] and thus the residual pixel ranges from + // -((1 << BitDepth) - 1) to ((1 << BitDepth) - 1). 
virtual int16_t Rand() = 0; void ReferenceHadamard(const int16_t *a, int a_stride, tran_low_t *b, @@ -245,7 +251,12 @@ class HadamardTestBase : public ::testing::TestWithParam { class HadamardLowbdTest : public HadamardTestBase { protected: - virtual int16_t Rand() { return rnd_.Rand9Signed(); } + // Use values between -255 (0xFF01) and 255 (0x00FF) + virtual int16_t Rand() { + int16_t src = rnd_.Rand8(); + int16_t pred = rnd_.Rand8(); + return src - pred; + } }; TEST_P(HadamardLowbdTest, CompareReferenceRandom) { CompareReferenceRandom(); } @@ -323,7 +334,12 @@ INSTANTIATE_TEST_SUITE_P( #if CONFIG_VP9_HIGHBITDEPTH class HadamardHighbdTest : public HadamardTestBase { protected: - virtual int16_t Rand() { return rnd_.Rand13Signed(); } + // Use values between -4095 (0xF001) and 4095 (0x0FFF) + virtual int16_t Rand() { + int16_t src = rnd_.Rand12(); + int16_t pred = rnd_.Rand12(); + return src - pred; + } }; TEST_P(HadamardHighbdTest, CompareReferenceRandom) { CompareReferenceRandom(); } diff --git a/vpx_dsp/arm/highbd_hadamard_neon.c b/vpx_dsp/arm/highbd_hadamard_neon.c index 499eb65462..7be88f6bcb 100644 --- a/vpx_dsp/arm/highbd_hadamard_neon.c +++ b/vpx_dsp/arm/highbd_hadamard_neon.c @@ -197,15 +197,15 @@ void vpx_highbd_hadamard_32x32_neon(const int16_t *src_diff, int32x4_t a2 = load_tran_low_to_s32q(coeff + 4 * i + 512); int32x4_t a3 = load_tran_low_to_s32q(coeff + 4 * i + 768); - int32x4_t b0 = vhaddq_s32(a0, a1); - int32x4_t b1 = vhsubq_s32(a0, a1); - int32x4_t b2 = vhaddq_s32(a2, a3); - int32x4_t b3 = vhsubq_s32(a2, a3); + int32x4_t b0 = vshrq_n_s32(vaddq_s32(a0, a1), 2); + int32x4_t b1 = vshrq_n_s32(vsubq_s32(a0, a1), 2); + int32x4_t b2 = vshrq_n_s32(vaddq_s32(a2, a3), 2); + int32x4_t b3 = vshrq_n_s32(vsubq_s32(a2, a3), 2); - int32x4_t c0 = vhaddq_s32(b0, b2); - int32x4_t c1 = vhaddq_s32(b1, b3); - int32x4_t c2 = vhsubq_s32(b0, b2); - int32x4_t c3 = vhsubq_s32(b1, b3); + int32x4_t c0 = vaddq_s32(b0, b2); + int32x4_t c1 = vaddq_s32(b1, b3); + int32x4_t c2 = vsubq_s32(b0, b2); + int32x4_t c3 = vsubq_s32(b1, b3); store_s32q_to_tran_low(coeff + 4 * i, c0); store_s32q_to_tran_low(coeff + 4 * i + 256, c1); From dcb91aa3ddc21586a9bc9d3ee9c270ad1b1a5fe1 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 29 Jun 2023 09:52:26 -0700 Subject: [PATCH 761/926] mfqe_partition: fix -Wunreachable-code vp9/common/vp9_mfqe.c|240 col 16| warning: code will never be executed [-Wunreachable-code] BLOCK_SIZE mfqe_bs, bs_tmp; ^~~~~~~ Change-Id: I566b20d8c294e19bc4b90b57b730f933048e71a5 --- vp9/common/vp9_mfqe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/common/vp9_mfqe.c b/vp9/common/vp9_mfqe.c index e76d771b8d..cf60fa40fd 100644 --- a/vp9/common/vp9_mfqe.c +++ b/vp9/common/vp9_mfqe.c @@ -217,6 +217,7 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs, const int bsl = b_width_log2_lookup[bs]; PARTITION_TYPE partition = partition_lookup[bsl][cur_bs]; const BLOCK_SIZE subsize = get_subsize(bs, partition); + BLOCK_SIZE mfqe_bs, bs_tmp; if (cur_bs < BLOCK_8X8) { // If there are blocks smaller than 8x8, it must be on the boundary. 
@@ -236,7 +237,6 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs, uv_offset = 8; } switch (partition) { - BLOCK_SIZE mfqe_bs, bs_tmp; case PARTITION_HORZ: if (bs == BLOCK_64X64) { mfqe_bs = BLOCK_64X32; From f30532a6d9d7f86b1a3431aa0d4dc658d1e3269a Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 10 Jul 2023 10:06:13 -0700 Subject: [PATCH 762/926] vpx_free_tpl_gop_stats: normalize param name this fixes a clang-tidy warning Change-Id: I13f4750c15b7d6a395494c8dbcb896bde125b3c4 --- vpx/src/vpx_tpl.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vpx/src/vpx_tpl.c b/vpx/src/vpx_tpl.c index 9cdb4a0a06..62c2a9c857 100644 --- a/vpx/src/vpx_tpl.c +++ b/vpx/src/vpx_tpl.c @@ -97,11 +97,11 @@ vpx_codec_err_t vpx_read_tpl_gop_stats(FILE *tpl_file, return VPX_CODEC_OK; } -void vpx_free_tpl_gop_stats(VpxTplGopStats *data) { +void vpx_free_tpl_gop_stats(VpxTplGopStats *tpl_gop_stats) { int frame; - if (data == NULL) return; - for (frame = 0; frame < data->size; frame++) { - vpx_free(data->frame_stats_list[frame].block_stats_list); + if (tpl_gop_stats == NULL) return; + for (frame = 0; frame < tpl_gop_stats->size; frame++) { + vpx_free(tpl_gop_stats->frame_stats_list[frame].block_stats_list); } - vpx_free(data->frame_stats_list); + vpx_free(tpl_gop_stats->frame_stats_list); } From e9b9972ca41115b74d26b8a012606c53c837ee95 Mon Sep 17 00:00:00 2001 From: "L. E. Segovia" Date: Sat, 8 Jul 2023 20:30:49 -0300 Subject: [PATCH 763/926] vp8: remove missing prototypes from the rtcd header These were removed in If7a49e920e12f7fca0541190b87e6dae510df05c but the leftovers can cause a build to fail if the code isn't optimized out. I just found this out in the Meson port of libvpx for GStreamer. BUG=webm:1584 Change-Id: I1c953720a2cbec3796200d4ec4020dca0b672bfb --- vp8/common/rtcd_defs.pl | 6 ------ 1 file changed, 6 deletions(-) diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 739a612847..12b474d939 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -127,12 +127,6 @@ () # if (vpx_config("CONFIG_POSTPROC") eq "yes") { - add_proto qw/void vp8_blend_mb_inner/, "unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride"; - - add_proto qw/void vp8_blend_mb_outer/, "unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride"; - - add_proto qw/void vp8_blend_b/, "unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride"; - add_proto qw/void vp8_filter_by_weight16x16/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"; specialize qw/vp8_filter_by_weight16x16 sse2 msa/; From 1d1ee888d39d288017c360abb523c2686ff56da8 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 13 Jul 2023 09:49:30 -0700 Subject: [PATCH 764/926] vp9_rdopt,handle_inter_mode: fix -Wmaybe-uninitialized warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With gcc 13.1.1 In function ‘handle_inter_mode’, inlined from ‘vp9_rd_pick_inter_mode_sb’ at ../vp9/encoder/vp9_rdopt.c:3872:17: ../vp9/encoder/vp9_rdopt.c:3142:8: warning: ‘tmp_rd’ may be used uninitialized [-Wmaybe-uninitialized] 3142 | rd = tmp_rd + RDCOST(x->rdmult, x->rddiv, rs, 0); ../vp9/encoder/vp9_rdopt.c: In function ‘vp9_rd_pick_inter_mode_sb’: ../vp9/encoder/vp9_rdopt.c:2846:15: note: ‘tmp_rd’ was declared here 2846 | int64_t rd, tmp_rd, best_rd = INT64_MAX; Change-Id: 
I8608957cc8bbeb1ae525f3c3dad6fe9785b2a9b4 --- vp9/encoder/vp9_rdopt.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index b7fb26de27..7b607b643a 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -2841,9 +2841,8 @@ static int64_t handle_inter_mode( #else DECLARE_ALIGNED(16, uint8_t, tmp_buf[MAX_MB_PLANE * 64 * 64]); #endif // CONFIG_VP9_HIGHBITDEPTH - int pred_exists = 0; int intpel_mv; - int64_t rd, tmp_rd, best_rd = INT64_MAX; + int64_t rd, tmp_rd = INT64_MAX, best_rd = INT64_MAX; int best_needs_copy = 0; uint8_t *orig_dst[MAX_MB_PLANE]; int orig_dst_stride[MAX_MB_PLANE]; @@ -3003,7 +3002,6 @@ static int64_t handle_inter_mode( mi->mode != NEARESTMV) return INT64_MAX; - pred_exists = 0; // Are all MVs integer pel for Y and UV intpel_mv = !mv_has_subpel(&mi->mv[0].as_mv); if (is_comp_pred) intpel_mv &= !mv_has_subpel(&mi->mv[1].as_mv); @@ -3111,7 +3109,6 @@ static int64_t handle_inter_mode( if ((cm->interp_filter == SWITCHABLE && newbest) || (cm->interp_filter != SWITCHABLE && cm->interp_filter == mi->interp_filter)) { - pred_exists = 1; tmp_rd = best_rd; skip_txfm_sb = tmp_skip_sb; @@ -3131,7 +3128,7 @@ static int64_t handle_inter_mode( cm->interp_filter != SWITCHABLE ? cm->interp_filter : best_filter; rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi, xd) : 0; - if (pred_exists) { + if (tmp_rd != INT64_MAX) { if (best_needs_copy) { // again temporarily set the buffers to local memory to prevent a memcpy for (i = 0; i < MAX_MB_PLANE; i++) { From 37200b6abb9cb4989a3c955a2a8a9b60df7e6245 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 24 Jul 2023 13:08:05 -0400 Subject: [PATCH 765/926] cleanup: _pt -> _ptr in vp9 external RC interface Change-Id: Ic483488f8f6273e8977cfc324466bda41f1e47a7 --- vpx/vpx_ext_ratectrl.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index 3c5fc8cfc3..43816459ef 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -385,13 +385,13 @@ typedef struct vpx_rc_gop_decision { * This callback is invoked by the encoder to create an external rate control * model. 
* - * \param[in] priv Callback's private data - * \param[in] ratectrl_config Pointer to vpx_rc_config_t - * \param[out] rate_ctrl_model_pt Pointer to vpx_rc_model_t + * \param[in] priv Callback's private data + * \param[in] ratectrl_config Pointer to vpx_rc_config_t + * \param[out] rate_ctrl_model_ptr Pointer to vpx_rc_model_t */ typedef vpx_rc_status_t (*vpx_rc_create_model_cb_fn_t)( void *priv, const vpx_rc_config_t *ratectrl_config, - vpx_rc_model_t *rate_ctrl_model_pt); + vpx_rc_model_t *rate_ctrl_model_ptr); /*!\brief Send first pass stats to the external rate control model callback * prototype From e1c124f8965f166d3e9ca26c9215ebc3ec3a1d72 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 24 Jul 2023 18:04:58 -0400 Subject: [PATCH 766/926] Add new_mv_count to ext rate control interface Bug: b/290385227 Change-Id: Ia87c4bf1e9315bf1134c998f88e9d5548c497777 --- vp9/encoder/vp9_ext_ratectrl.c | 1 + vpx/vpx_ext_ratectrl.h | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index b08fd63c3c..09253403b8 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -92,6 +92,7 @@ static void gen_rc_firstpass_stats(const FIRSTPASS_STATS *stats, rc_frame_stats->mv_in_out_count = stats->mv_in_out_count; rc_frame_stats->duration = stats->duration; rc_frame_stats->count = stats->count; + rc_frame_stats->new_mv_count = stats->new_mv_count; } vpx_codec_err_t vp9_extrc_send_firstpass_stats( diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index 43816459ef..2c312858b8 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -271,6 +271,10 @@ typedef struct vpx_rc_frame_stats { * number of frames whose stats are accumulated. */ double count; + /*! + * Number of new mv in a frame. + */ + double new_mv_count; } vpx_rc_frame_stats_t; /*!\brief Collection of first pass frame stats From 5fb280ebb9b023575829876bede174f2098e6f9e Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 7 Jul 2023 19:14:59 -0700 Subject: [PATCH 767/926] test,AbstractBench: fix -Wnon-virtual-dtor In file included from ../test/bench.cc:14: ../test/bench.h:17:7: warning: 'AbstractBench' has virtual functions but non-virtual destructor [-Wnon-virtual-dtor] class AbstractBench { Change-Id: Ibbfb949b63c8dff936c7ed4f2d056dea0343377b --- configure | 1 + test/bench.h | 2 ++ 2 files changed, 3 insertions(+) diff --git a/configure b/configure index 67bba946f6..aef65a8505 100755 --- a/configure +++ b/configure @@ -679,6 +679,7 @@ process_toolchain() { check_add_cxxflags -Wc++14-extensions check_add_cxxflags -Wc++17-extensions check_add_cxxflags -Wc++20-extensions + check_add_cxxflags -Wnon-virtual-dtor # disable some warnings specific to libyuv / libwebm. 
check_cxxflags -Wno-missing-declarations \ diff --git a/test/bench.h b/test/bench.h index 57ca9118ba..203e4d247e 100644 --- a/test/bench.h +++ b/test/bench.h @@ -16,6 +16,8 @@ class AbstractBench { public: + virtual ~AbstractBench() = default; + void RunNTimes(int n); void PrintMedian(const char *title); From 84e6b7ab02771fbf27a623944b511367bf5aacbd Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 24 Jul 2023 17:23:23 -0700 Subject: [PATCH 768/926] test/*.cc: prefer 'override' to 'virtual' created with clang-tidy --fix --checks=-*,modernize-use-override Change-Id: I71e1b4423c143b3e47fe90929ee110b307cdb565 --- test/active_map_refresh_test.cc | 8 ++-- test/active_map_test.cc | 8 ++-- test/add_noise_test.cc | 4 +- test/alt_ref_aq_segment_test.cc | 8 ++-- test/altref_test.cc | 22 ++++----- test/aq_segment_test.cc | 8 ++-- test/avg_test.cc | 20 ++++---- test/blockiness_test.cc | 4 +- test/borders_test.cc | 10 ++-- test/byte_alignment_test.cc | 4 +- test/comp_avg_pred_test.cc | 2 +- test/config_test.cc | 10 ++-- test/consistency_test.cc | 4 +- test/convolve_test.cc | 4 +- test/cpu_speed_test.cc | 12 ++--- test/cq_test.cc | 14 +++--- test/dct16x16_test.cc | 31 +++++++------ test/dct32x32_test.cc | 14 +++--- test/dct_partial_test.cc | 2 +- test/dct_test.cc | 4 +- test/decode_corrupted.cc | 22 ++++----- test/decode_perf_test.cc | 16 +++---- test/decode_svc_test.cc | 11 ++--- test/encode_perf_test.cc | 14 +++--- test/error_resilience_test.cc | 26 +++++------ test/external_frame_buffer_test.cc | 17 ++++--- test/fdct8x8_test.cc | 31 +++++++------ test/frame_size_tests.cc | 16 +++---- test/hadamard_test.cc | 6 +-- test/idct_test.cc | 4 +- test/invalid_file_test.cc | 15 +++--- test/keyframe_test.cc | 10 ++-- test/level_test.cc | 8 ++-- test/lpf_test.cc | 12 ++--- test/minmax_test.cc | 2 +- test/partial_idct_test.cc | 6 +-- test/pp_filter_test.cc | 12 ++--- test/predict_test.cc | 6 +-- test/quantize_test.cc | 4 +- test/resize_test.cc | 58 ++++++++++++------------ test/sad_test.cc | 4 +- test/sum_squares_test.cc | 6 +-- test/superframe_test.cc | 14 +++--- test/svc_datarate_test.cc | 44 +++++++++--------- test/svc_end_to_end_test.cc | 54 +++++++++++----------- test/test_vector_test.cc | 11 ++--- test/tile_independence_test.cc | 10 ++-- test/timestamp_test.cc | 10 ++-- test/variance_test.cc | 10 ++-- test/vp8_datarate_test.cc | 14 +++--- test/vp8_denoiser_sse2_test.cc | 6 +-- test/vp8_fdct4x4_test.cc | 2 +- test/vp8_fragments_test.cc | 4 +- test/vp8_ratectrl_rtc_test.cc | 12 ++--- test/vp9_arf_freq_test.cc | 12 ++--- test/vp9_block_error_test.cc | 6 +-- test/vp9_datarate_test.cc | 40 ++++++++-------- test/vp9_denoiser_test.cc | 6 +-- test/vp9_encoder_parms_get_to_decoder.cc | 14 +++--- test/vp9_end_to_end_test.cc | 32 ++++++------- test/vp9_ethread_test.cc | 36 +++++++-------- test/vp9_intrapred_test.cc | 2 +- test/vp9_lossless_test.cc | 12 ++--- test/vp9_motion_vector_test.cc | 8 ++-- test/vp9_quantize_test.cc | 4 +- test/vp9_ratectrl_rtc_test.cc | 26 +++++------ test/vp9_roi_test.cc | 8 ++-- test/vp9_scale_test.cc | 4 +- test/vp9_subtract_test.cc | 8 ++-- test/vp9_thread_test.cc | 6 +-- test/vpx_scale_test.cc | 8 ++-- test/y4m_test.cc | 6 +-- test/yuv_temporal_filter_test.cc | 2 +- 73 files changed, 464 insertions(+), 466 deletions(-) diff --git a/test/active_map_refresh_test.cc b/test/active_map_refresh_test.cc index 68d8856eaa..8b35ca81ba 100644 --- a/test/active_map_refresh_test.cc +++ b/test/active_map_refresh_test.cc @@ -62,16 +62,16 @@ class ActiveMapRefreshTest public 
           ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
  protected:
   ActiveMapRefreshTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~ActiveMapRefreshTest() {}
+  ~ActiveMapRefreshTest() override {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
     cpu_used_ = GET_PARAM(2);
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     ::libvpx_test::Y4mVideoSource *y4m_video =
         static_cast<libvpx_test::Y4mVideoSource *>(video);
     if (video->frame() == 0) {
diff --git a/test/active_map_test.cc b/test/active_map_test.cc
index 543ec0d358..1f661b559c 100644
--- a/test/active_map_test.cc
+++ b/test/active_map_test.cc
@@ -26,16 +26,16 @@ class ActiveMapTest
   static const int kHeight = 144;
 
   ActiveMapTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~ActiveMapTest() {}
+  ~ActiveMapTest() override {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
     cpu_used_ = GET_PARAM(2);
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
       encoder->Control(VP9E_SET_AQ_MODE, GET_PARAM(3));
diff --git a/test/add_noise_test.cc b/test/add_noise_test.cc
index 7dc86e3eb6..6e787dd6ba 100644
--- a/test/add_noise_test.cc
+++ b/test/add_noise_test.cc
@@ -32,8 +32,8 @@ typedef std::tuple<double, AddNoiseFunc> AddNoiseTestFPParam;
 class AddNoiseTest : public ::testing::Test,
                      public ::testing::WithParamInterface<AddNoiseTestFPParam> {
  public:
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
-  virtual ~AddNoiseTest() {}
+  void TearDown() override { libvpx_test::ClearSystemState(); }
+  ~AddNoiseTest() override {}
 };
 
 double stddev6(char a, char b, char c, char d, char e, char f) {
diff --git a/test/alt_ref_aq_segment_test.cc b/test/alt_ref_aq_segment_test.cc
index 00a00e27c5..b64fc3cd0b 100644
--- a/test/alt_ref_aq_segment_test.cc
+++ b/test/alt_ref_aq_segment_test.cc
@@ -20,9 +20,9 @@ class AltRefAqSegmentTest
       public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
  protected:
   AltRefAqSegmentTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~AltRefAqSegmentTest() {}
+  ~AltRefAqSegmentTest() override {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
     set_cpu_used_ = GET_PARAM(2);
@@ -30,8 +30,8 @@ class AltRefAqSegmentTest
     alt_ref_aq_mode_ = 0;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
       encoder->Control(VP9E_SET_ALT_REF_AQ, alt_ref_aq_mode_);
diff --git a/test/altref_test.cc b/test/altref_test.cc
index 69bcef774e..69b2b87a2a 100644
--- a/test/altref_test.cc
+++ b/test/altref_test.cc
@@ -24,24 +24,24 @@ class AltRefTest : public ::libvpx_test::EncoderTest,
                    public ::libvpx_test::CodecTestWithParam<int> {
  protected:
   AltRefTest() : EncoderTest(GET_PARAM(0)), altref_count_(0) {}
-  virtual ~AltRefTest() {}
+  ~AltRefTest() override {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(libvpx_test::kTwoPassGood);
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) { altref_count_ = 0; }
+  void BeginPassHook(unsigned int /*pass*/) override { altref_count_ = 0; }
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
       encoder->Control(VP8E_SET_CPUUSED, 3);
     }
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (pkt->data.frame.flags & VPX_FRAME_IS_INVISIBLE) ++altref_count_;
   }
 
@@ -75,17 +75,17 @@ class AltRefForcedKeyTestLarge
   AltRefForcedKeyTestLarge()
       : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
         cpu_used_(GET_PARAM(2)), forced_kf_frame_num_(1), frame_num_(0) {}
-  virtual ~AltRefForcedKeyTestLarge() {}
+  ~AltRefForcedKeyTestLarge() override {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
     cfg_.rc_end_usage = VPX_VBR;
     cfg_.g_threads = 0;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
       encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
@@ -100,7 +100,7 @@ class AltRefForcedKeyTestLarge
         (video->frame() == forced_kf_frame_num_) ? VPX_EFLAG_FORCE_KF : 0;
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (frame_num_ == forced_kf_frame_num_) {
       ASSERT_TRUE(!!(pkt->data.frame.flags & VPX_FRAME_IS_KEY))
           << "Frame #" << frame_num_ << " isn't a keyframe!";
diff --git a/test/aq_segment_test.cc b/test/aq_segment_test.cc
index 2cbc991d0c..c98b8de094 100644
--- a/test/aq_segment_test.cc
+++ b/test/aq_segment_test.cc
@@ -20,17 +20,17 @@ class AqSegmentTest
       public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
  protected:
   AqSegmentTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~AqSegmentTest() {}
+  ~AqSegmentTest() override {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
     set_cpu_used_ = GET_PARAM(2);
     aq_mode_ = 0;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
       encoder->Control(VP9E_SET_AQ_MODE, aq_mode_);
diff --git a/test/avg_test.cc b/test/avg_test.cc
index a0428304a2..dbd3309ee4 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -38,7 +38,7 @@ class AverageTestBase : public ::testing::Test {
       : width_(width), height_(height), source_data_(nullptr),
         source_stride_(0), bit_depth_(8) {}
 
-  virtual void TearDown() {
+  void TearDown() override {
     vpx_free(source_data_);
     source_data_ = nullptr;
     libvpx_test::ClearSystemState();
@@ -49,7 +49,7 @@ class AverageTestBase : public ::testing::Test {
   static const int kDataAlignment = 16;
   static const int kDataBlockSize = 64 * 128;
 
-  virtual void SetUp() {
+  void SetUp() override {
     source_data_ = reinterpret_cast<Pixel *>(
         vpx_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
     ASSERT_NE(source_data_, nullptr);
@@ -169,7 +169,7 @@ class IntProRowTest : public AverageTestBase,
   }
 
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     source_data_ = reinterpret_cast<uint8_t *>(
         vpx_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
     ASSERT_NE(source_data_, nullptr);
@@ -180,7 +180,7 @@ class IntProRowTest : public AverageTestBase,
         vpx_memalign(kDataAlignment, sizeof(*hbuf_c_) * 16));
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     vpx_free(source_data_);
     source_data_ = nullptr;
     vpx_free(hbuf_c_);
@@ -238,7 +238,7 @@ typedef std::tuple<int, SatdFunc> SatdTestParam;
 class SatdTest : public ::testing::Test,
                  public ::testing::WithParamInterface<SatdTestParam> {
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     satd_size_ = GET_PARAM(0);
     satd_func_ = GET_PARAM(1);
     rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -247,7 +247,7 @@ class SatdTest : public ::testing::Test,
     ASSERT_NE(src_, nullptr);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     libvpx_test::ClearSystemState();
     vpx_free(src_);
   }
@@ -276,7 +276,7 @@ class SatdTest : public ::testing::Test,
 
 class SatdLowbdTest : public SatdTest {
  protected:
-  virtual void FillRandom() {
+  void FillRandom() override {
     for (int i = 0; i < satd_size_; ++i) {
       const int16_t tmp = rnd_.Rand16Signed();
       src_[i] = (tran_low_t)tmp;
@@ -292,7 +292,7 @@ class BlockErrorTestFP
     : public ::testing::Test,
       public ::testing::WithParamInterface<BlockErrorParam> {
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     txfm_size_ = GET_PARAM(0);
     block_error_func_ = GET_PARAM(1);
     rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -304,7 +304,7 @@ class BlockErrorTestFP
     ASSERT_NE(dqcoeff_, nullptr);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     libvpx_test::ClearSystemState();
     vpx_free(coeff_);
     vpx_free(dqcoeff_);
@@ -463,7 +463,7 @@ TEST_P(SatdLowbdTest, DISABLED_Speed) { RunSpeedTest(); }
 #if CONFIG_VP9_HIGHBITDEPTH
 class SatdHighbdTest : public SatdTest {
  protected:
-  virtual void FillRandom() {
+  void FillRandom() override {
     for (int i = 0; i < satd_size_; ++i) {
       src_[i] = rnd_.Rand20Signed();
     }
diff --git a/test/blockiness_test.cc b/test/blockiness_test.cc
index 11b2a3f61f..5a45bc0b7f 100644
--- a/test/blockiness_test.cc
+++ b/test/blockiness_test.cc
@@ -49,14 +49,14 @@ class BlockinessTestBase : public ::testing::Test {
     reference_data_ = nullptr;
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
   // Handle frames up to 640x480
   static const int kDataAlignment = 16;
   static const int kDataBufferSize = 640 * 480;
 
-  virtual void SetUp() {
+  void SetUp() override {
     source_stride_ = (width_ + 31) & ~31;
     reference_stride_ = width_ * 2;
     rnd_.Reset(ACMRandom::DeterministicSeed());
diff --git a/test/borders_test.cc b/test/borders_test.cc
index 3c1f69a923..009121bf22 100644
--- a/test/borders_test.cc
+++ b/test/borders_test.cc
@@ -22,15 +22,15 @@ class BordersTest
       public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
  protected:
   BordersTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~BordersTest() {}
+  ~BordersTest() override {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, 1);
       encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
@@ -40,7 +40,7 @@ class BordersTest
     }
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) {
     }
   }
diff --git a/test/byte_alignment_test.cc b/test/byte_alignment_test.cc
index 1e0ffceb8d..ba6fffc524 100644
--- a/test/byte_alignment_test.cc
+++ b/test/byte_alignment_test.cc
@@ -58,7 +58,7 @@ class ByteAlignmentTest
   ByteAlignmentTest()
       : video_(nullptr), decoder_(nullptr), md5_file_(nullptr) {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     video_ = new libvpx_test::WebMVideoSource(kVP9TestFile);
     ASSERT_NE(video_, nullptr);
     video_->Init();
@@ -71,7 +71,7 @@ class ByteAlignmentTest
     OpenMd5File(kVP9Md5File);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     if (md5_file_ != nullptr) fclose(md5_file_);
 
     delete decoder_;
diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc
index d8fabd5bef..3234cc9a25 100644
--- a/test/comp_avg_pred_test.cc
+++ b/test/comp_avg_pred_test.cc
@@ -49,7 +49,7 @@ using AvgPredFunc = void (*)(uint8_t *a, const uint8_t *b, int w, int h,
 template <int bitdepth, typename Pixel>
 class AvgPredTest : public ::testing::TestWithParam<AvgPredFunc> {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     avg_pred_func_ = GetParam();
     rnd_.Reset(ACMRandom::DeterministicSeed());
   }
diff --git a/test/config_test.cc b/test/config_test.cc
index 8f4c60e113..a476d580a5 100644
--- a/test/config_test.cc
+++ b/test/config_test.cc
@@ -22,24 +22,24 @@ class ConfigTest
   ConfigTest()
       : EncoderTest(GET_PARAM(0)), frame_count_in_(0), frame_count_out_(0),
         frame_count_max_(0) {}
-  virtual ~ConfigTest() {}
+  ~ConfigTest() override {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     frame_count_in_ = 0;
     frame_count_out_ = 0;
   }
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource * /*video*/) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource * /*video*/) override {
     ++frame_count_in_;
     abort_ |= (frame_count_in_ >= frame_count_max_);
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {
+  void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) override {
     ++frame_count_out_;
   }
 
diff --git a/test/consistency_test.cc b/test/consistency_test.cc
index f0e2cb297e..5e872e70a8 100644
--- a/test/consistency_test.cc
+++ b/test/consistency_test.cc
@@ -65,14 +65,14 @@ class ConsistencyTestBase : public ::testing::Test {
     delete[] ssim_array_;
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
   // Handle frames up to 640x480
   static const int kDataAlignment = 16;
   static const int kDataBufferSize = 640 * 480;
 
-  virtual void SetUp() {
+  void SetUp() override {
     source_stride_ = (width_ + 31) & ~31;
     reference_stride_ = width_ * 2;
     rnd_.Reset(ACMRandom::DeterministicSeed());
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 5a17d80894..4d27c5ffcf 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -361,7 +361,7 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
 #endif
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
   static void TearDownTestSuite() {
     vpx_free(input_ - 1);
@@ -403,7 +403,7 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
            i % kOuterBlockSize >= (BorderLeft() + Width()));
   }
 
-  virtual void SetUp() {
+  void SetUp() override {
     UUT_ = GET_PARAM(2);
 #if CONFIG_VP9_HIGHBITDEPTH
     if (UUT_->use_highbd_ != 0) {
diff --git a/test/cpu_speed_test.cc b/test/cpu_speed_test.cc
index a7623f09ac..78999ce658 100644
--- a/test/cpu_speed_test.cc
+++ b/test/cpu_speed_test.cc
@@ -26,9 +26,9 @@ class CpuSpeedTest
       : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
         set_cpu_used_(GET_PARAM(2)), min_psnr_(kMaxPSNR),
         tune_content_(VP9E_CONTENT_DEFAULT) {}
-  virtual ~CpuSpeedTest() {}
+  ~CpuSpeedTest() override {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
     if (encoding_mode_ != ::libvpx_test::kRealTime) {
@@ -40,10 +40,10 @@ class CpuSpeedTest
     }
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) { min_psnr_ = kMaxPSNR; }
+  void BeginPassHook(unsigned int /*pass*/) override { min_psnr_ = kMaxPSNR; }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
       encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_);
@@ -56,7 +56,7 @@ class CpuSpeedTest
     }
   }
 
-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (pkt->data.psnr.psnr[0] < min_psnr_) min_psnr_ = pkt->data.psnr.psnr[0];
   }
 
diff --git a/test/cq_test.cc b/test/cq_test.cc
index 292adb0d04..a9a16aae13 100644
--- a/test/cq_test.cc
+++ b/test/cq_test.cc
@@ -50,21 +50,21 @@ class CQTest : public ::libvpx_test::EncoderTest,
     init_flags_ = VPX_CODEC_USE_PSNR;
   }
 
-  virtual ~CQTest() {}
+  ~CQTest() override {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(libvpx_test::kTwoPassGood);
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     file_size_ = 0;
     psnr_ = 0.0;
     n_frames_ = 0;
   }
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       if (cfg_.rc_end_usage == VPX_CQ) {
         encoder->Control(VP8E_SET_CQ_LEVEL, cq_level_);
@@ -73,12 +73,12 @@ class CQTest : public ::libvpx_test::EncoderTest,
     }
   }
 
-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
     psnr_ += pow(10.0, pkt->data.psnr.psnr[0] / 10.0);
     n_frames_++;
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     file_size_ += pkt->data.frame.sz;
   }
 
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 4ad2263cfc..de98d99731 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -728,9 +728,9 @@ class Trans16x16TestBase {
 class Trans16x16DCT : public Trans16x16TestBase,
                       public ::testing::TestWithParam<Dct16x16Param> {
  public:
-  virtual ~Trans16x16DCT() {}
+  ~Trans16x16DCT() override {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     fwd_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     tx_type_ = GET_PARAM(2);
@@ -749,13 +749,13 @@ class Trans16x16DCT : public Trans16x16TestBase,
     inv_txfm_ref = idct16x16_ref;
 #endif
   }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override {
     fwd_txfm_(in, out, stride);
   }
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override {
     inv_txfm_(out, dst, stride);
   }
 
@@ -782,9 +782,9 @@ TEST_P(Trans16x16DCT, DISABLED_Speed) { RunSpeedTest(); }
 class Trans16x16HT : public Trans16x16TestBase,
                      public ::testing::TestWithParam<Ht16x16Param> {
  public:
-  virtual ~Trans16x16HT() {}
+  ~Trans16x16HT() override {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     fwd_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     tx_type_ = GET_PARAM(2);
@@ -803,13 +803,13 @@ class Trans16x16HT : public Trans16x16TestBase,
     inv_txfm_ref = iht16x16_ref;
 #endif
   }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override {
     fwd_txfm_(in, out, stride, tx_type_);
   }
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override {
     inv_txfm_(out, dst, stride, tx_type_);
   }
 
@@ -832,9 +832,9 @@ TEST_P(Trans16x16HT, QuantCheck) {
 class InvTrans16x16DCT : public Trans16x16TestBase,
                          public ::testing::TestWithParam<Idct16x16Param> {
  public:
-  virtual ~InvTrans16x16DCT() {}
+  ~InvTrans16x16DCT() override {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     ref_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     thresh_ = GET_PARAM(2);
@@ -842,11 +842,12 @@ class InvTrans16x16DCT : public Trans16x16TestBase,
     pitch_ = 16;
     mask_ = (1 << bit_depth_) - 1;
   }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(int16_t * /*in*/, tran_low_t * /*out*/, int /*stride*/) {}
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+  void RunFwdTxfm(int16_t * /*in*/, tran_low_t * /*out*/,
+                  int /*stride*/) override {}
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override {
     inv_txfm_(out, dst, stride);
   }
 
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index 1167038b5f..62547ac537 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -89,8 +89,8 @@ void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) {
 class Trans32x32Test : public AbstractBench,
                        public ::testing::TestWithParam<Trans32x32Param> {
  public:
-  virtual ~Trans32x32Test() {}
-  virtual void SetUp() {
+  ~Trans32x32Test() override {}
+  void SetUp() override {
     fwd_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     version_ = GET_PARAM(2);  // 0: high precision forward transform
@@ -99,7 +99,7 @@ class Trans32x32Test : public AbstractBench,
     mask_ = (1 << bit_depth_) - 1;
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
   int version_;
@@ -110,7 +110,7 @@ class Trans32x32Test : public AbstractBench,
   int16_t *bench_in_;
   tran_low_t *bench_out_;
 
-  virtual void Run();
+  void Run() override;
 };
 
 void Trans32x32Test::Run() { fwd_txfm_(bench_in_, bench_out_, 32); }
@@ -321,8 +321,8 @@ TEST_P(Trans32x32Test, InverseAccuracy) {
 
 class InvTrans32x32Test : public ::testing::TestWithParam<InvTrans32x32Param> {
  public:
-  virtual ~InvTrans32x32Test() {}
-  virtual void SetUp() {
+  ~InvTrans32x32Test() override {}
+  void SetUp() override {
     ref_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     version_ = GET_PARAM(2);  // 0: high precision forward transform
@@ -334,7 +334,7 @@ class InvTrans32x32Test : public ::testing::TestWithParam<InvTrans32x32Param> {
     pitch_ = 32;
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
   void RunRefTxfm(tran_low_t *out, uint8_t *dst, int stride) {
diff --git a/test/dct_partial_test.cc b/test/dct_partial_test.cc
index e57fa0f48b..ec6f543f71 100644
--- a/test/dct_partial_test.cc
+++ b/test/dct_partial_test.cc
@@ -67,7 +67,7 @@ class PartialFdctTest : public ::testing::TestWithParam<PartialFdctParam> {
     bit_depth_ = GET_PARAM(2);
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
   void RunTest() {
diff --git a/test/dct_test.cc b/test/dct_test.cc
index 235c407237..c3d3081c42 100644
--- a/test/dct_test.cc
+++ b/test/dct_test.cc
@@ -134,7 +134,7 @@ void fwht_ref(const Buffer<int16_t> &in, Buffer<tran_low_t> *out, int size,
 
 class TransTestBase : public ::testing::TestWithParam<DctParam> {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     rnd_.Reset(ACMRandom::DeterministicSeed());
     const int idx = GET_PARAM(0);
     const FuncInfo *func_info = &(GET_PARAM(1)[idx]);
@@ -166,7 +166,7 @@ class TransTestBase : public ::testing::TestWithParam<DctParam> {
     ASSERT_NE(dst_, nullptr);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     vpx_free(src_);
     src_ = nullptr;
     vpx_free(dst_);
diff --git a/test/decode_corrupted.cc b/test/decode_corrupted.cc
index 31e1da69cc..a9a2cc6e70 100644
--- a/test/decode_corrupted.cc
+++ b/test/decode_corrupted.cc
@@ -28,9 +28,9 @@ class DecodeCorruptedFrameTest
   DecodeCorruptedFrameTest() : EncoderTest(GET_PARAM(0)) {}
 
  protected:
-  virtual ~DecodeCorruptedFrameTest() {}
+  ~DecodeCorruptedFrameTest() override {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(::libvpx_test::kRealTime);
     cfg_.g_lag_in_frames = 0;
@@ -44,16 +44,16 @@ class DecodeCorruptedFrameTest
     dec_cfg_.threads = 1;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) encoder->Control(VP8E_SET_CPUUSED, 7);
   }
 
-  virtual void MismatchHook(const vpx_image_t * /*img1*/,
-                            const vpx_image_t * /*img2*/) {}
+  void MismatchHook(const vpx_image_t * /*img1*/,
+                    const vpx_image_t * /*img2*/) override {}
 
-  virtual const vpx_codec_cx_pkt_t *MutateEncoderOutputHook(
-      const vpx_codec_cx_pkt_t *pkt) {
+  const vpx_codec_cx_pkt_t *MutateEncoderOutputHook(
+      const vpx_codec_cx_pkt_t *pkt) override {
     // Don't edit frame packet on key frame.
     if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) return pkt;
     if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return pkt;
@@ -66,9 +66,9 @@ class DecodeCorruptedFrameTest
     return &modified_pkt_;
   }
 
-  virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
-                                  const libvpx_test::VideoSource & /*video*/,
-                                  libvpx_test::Decoder *decoder) {
+  bool HandleDecodeResult(const vpx_codec_err_t res_dec,
+                          const libvpx_test::VideoSource & /*video*/,
+                          libvpx_test::Decoder *decoder) override {
     EXPECT_NE(res_dec, VPX_CODEC_MEM_ERROR) << decoder->DecodeError();
     return VPX_CODEC_MEM_ERROR != res_dec;
   }
diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc
index e07a667440..7533778e82 100644
--- a/test/decode_perf_test.cc
+++ b/test/decode_perf_test.cc
@@ -118,9 +118,9 @@ class VP9NewEncodeDecodePerfTest
       : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), speed_(0),
         outfile_(0), out_frames_(0) {}
 
-  virtual ~VP9NewEncodeDecodePerfTest() {}
+  ~VP9NewEncodeDecodePerfTest() override {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
 
@@ -137,8 +137,8 @@ class VP9NewEncodeDecodePerfTest
     cfg_.rc_end_usage = VPX_VBR;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, speed_);
       encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1);
@@ -146,14 +146,14 @@ class VP9NewEncodeDecodePerfTest
     }
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     const std::string data_path = getenv("LIBVPX_TEST_DATA_PATH");
     const std::string path_to_source = data_path + "/" + kNewEncodeOutputFile;
     outfile_ = fopen(path_to_source.c_str(), "wb");
     ASSERT_NE(outfile_, nullptr);
   }
 
-  virtual void EndPassHook() {
+  void EndPassHook() override {
     if (outfile_ != nullptr) {
       if (!fseek(outfile_, 0, SEEK_SET)) {
         ivf_write_file_header(outfile_, &cfg_, VP9_FOURCC, out_frames_);
@@ -163,7 +163,7 @@ class VP9NewEncodeDecodePerfTest
     }
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     ++out_frames_;
 
     // Write initial file header if first frame.
@@ -177,7 +177,7 @@ class VP9NewEncodeDecodePerfTest
                pkt->data.frame.sz);
   }
 
-  virtual bool DoDecode() const { return false; }
+  bool DoDecode() const override { return false; }
 
   void set_speed(unsigned int speed) { speed_ = speed; }
 
diff --git a/test/decode_svc_test.cc b/test/decode_svc_test.cc
index ec9935da79..29e9bd06f5 100644
--- a/test/decode_svc_test.cc
+++ b/test/decode_svc_test.cc
@@ -25,17 +25,16 @@ class DecodeSvcTest : public ::libvpx_test::DecoderTest,
                       public ::libvpx_test::CodecTestWithParam<const char *> {
  protected:
   DecodeSvcTest() : DecoderTest(GET_PARAM(::libvpx_test::kCodecFactoryParam)) {}
-  virtual ~DecodeSvcTest() {}
+  ~DecodeSvcTest() override {}
 
-  virtual void PreDecodeFrameHook(
-      const libvpx_test::CompressedVideoSource &video,
-      libvpx_test::Decoder *decoder) {
+  void PreDecodeFrameHook(const libvpx_test::CompressedVideoSource &video,
+                          libvpx_test::Decoder *decoder) override {
     if (video.frame_number() == 0)
       decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER, spatial_layer_);
   }
 
-  virtual void DecompressedFrameHook(const vpx_image_t &img,
-                                     const unsigned int frame_number) {
+  void DecompressedFrameHook(const vpx_image_t &img,
+                             const unsigned int frame_number) override {
     ASSERT_EQ(img.d_w, width_);
     ASSERT_EQ(img.d_h, height_);
     total_frames_ = frame_number;
diff --git a/test/encode_perf_test.cc b/test/encode_perf_test.cc
index 142a55952b..5f9c58dc94 100644
--- a/test/encode_perf_test.cc
+++ b/test/encode_perf_test.cc
@@ -61,9 +61,9 @@ class VP9EncodePerfTest
       : EncoderTest(GET_PARAM(0)), min_psnr_(kMaxPsnr), nframes_(0),
         encoding_mode_(GET_PARAM(1)), speed_(0), threads_(1) {}
 
-  virtual ~VP9EncodePerfTest() {}
+  ~VP9EncodePerfTest() override {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
 
@@ -82,8 +82,8 @@ class VP9EncodePerfTest
     cfg_.g_threads = threads_;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       const int log2_tile_columns = 3;
       encoder->Control(VP8E_SET_CPUUSED, speed_);
@@ -93,19 +93,19 @@ class VP9EncodePerfTest
     }
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     min_psnr_ = kMaxPsnr;
     nframes_ = 0;
   }
 
-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (pkt->data.psnr.psnr[0] < min_psnr_) {
       min_psnr_ = pkt->data.psnr.psnr[0];
     }
   }
 
   // for performance reasons don't decode
-  virtual bool DoDecode() const { return false; }
+  bool DoDecode() const override { return false; }
 
   double min_psnr() const { return min_psnr_; }
 
diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc
index 45138f14b9..622c3c4461 100644
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc
@@ -30,7 +30,7 @@ class ErrorResilienceTestLarge
     Reset();
   }
 
-  virtual ~ErrorResilienceTestLarge() {}
+  ~ErrorResilienceTestLarge() override {}
 
   void Reset() {
     error_nframes_ = 0;
@@ -38,19 +38,19 @@ class ErrorResilienceTestLarge
     pattern_switch_ = 0;
   }
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     psnr_ = 0.0;
     nframes_ = 0;
     mismatch_psnr_ = 0.0;
     mismatch_nframes_ = 0;
   }
 
-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
     psnr_ += pkt->data.psnr.psnr[0];
     nframes_++;
   }
@@ -90,7 +90,7 @@ class ErrorResilienceTestLarge
     return frame_flags;
   }
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video) override {
     frame_flags_ &=
         ~(VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF);
     // For temporal layer case.
@@ -129,7 +129,7 @@ class ErrorResilienceTestLarge
     return 0.0;
   }
 
-  virtual bool DoDecode() const {
+  bool DoDecode() const override {
     if (error_nframes_ > 0 &&
         (cfg_.g_pass == VPX_RC_LAST_PASS || cfg_.g_pass == VPX_RC_ONE_PASS)) {
       for (unsigned int i = 0; i < error_nframes_; ++i) {
@@ -143,7 +143,7 @@ class ErrorResilienceTestLarge
     return 1;
   }
 
-  virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) {
+  void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) override {
     double mismatch_psnr = compute_psnr(img1, img2);
     mismatch_psnr_ += mismatch_psnr;
     ++mismatch_nframes_;
@@ -381,7 +381,7 @@ class ErrorResilienceTestLargeCodecControls
     Reset();
   }
 
-  virtual ~ErrorResilienceTestLargeCodecControls() {}
+  ~ErrorResilienceTestLargeCodecControls() override {}
 
   void Reset() {
     last_pts_ = 0;
@@ -393,7 +393,7 @@ class ErrorResilienceTestLargeCodecControls
     duration_ = 0.0;
   }
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
   }
@@ -460,8 +460,8 @@ class ErrorResilienceTestLargeCodecControls
     return layer_id;
   }
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
     if (cfg_.ts_number_layers > 1) {
       int layer_id = SetLayerId(video->frame(), cfg_.ts_number_layers);
       int frame_flags = SetFrameFlags(video->frame(), cfg_.ts_number_layers);
@@ -476,7 +476,7 @@ class ErrorResilienceTestLargeCodecControls
     }
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     // Time since last timestamp = duration.
     vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
     if (duration > 1) {
@@ -496,7 +496,7 @@ class ErrorResilienceTestLargeCodecControls
     ++tot_frame_number_;
   }
 
-  virtual void EndPassHook() {
+  void EndPassHook() override {
     duration_ = (last_pts_ + 1) * timebase_;
     if (cfg_.ts_number_layers > 1) {
       for (int layer = 0; layer < static_cast<int>(cfg_.ts_number_layers);
diff --git a/test/external_frame_buffer_test.cc b/test/external_frame_buffer_test.cc
index 3bd4a1c473..7b9a836fbc 100644
--- a/test/external_frame_buffer_test.cc
+++ b/test/external_frame_buffer_test.cc
@@ -210,13 +210,12 @@ class ExternalFrameBufferMD5Test
       : DecoderTest(GET_PARAM(::libvpx_test::kCodecFactoryParam)),
         md5_file_(nullptr), num_buffers_(0) {}
 
-  virtual ~ExternalFrameBufferMD5Test() {
+  ~ExternalFrameBufferMD5Test() override {
     if (md5_file_ != nullptr) fclose(md5_file_);
   }
 
-  virtual void PreDecodeFrameHook(
-      const libvpx_test::CompressedVideoSource &video,
-      libvpx_test::Decoder *decoder) {
+  void PreDecodeFrameHook(const libvpx_test::CompressedVideoSource &video,
+                          libvpx_test::Decoder *decoder) override {
     if (num_buffers_ > 0 && video.frame_number() == 0) {
       // Have libvpx use frame buffers we create.
       ASSERT_TRUE(fb_list_.CreateBufferList(num_buffers_));
@@ -232,8 +231,8 @@ class ExternalFrameBufferMD5Test
         << "Md5 file open failed. Filename: " << md5_file_name_;
   }
 
-  virtual void DecompressedFrameHook(const vpx_image_t &img,
-                                     const unsigned int frame_number) {
+  void DecompressedFrameHook(const vpx_image_t &img,
+                             const unsigned int frame_number) override {
     ASSERT_NE(md5_file_, nullptr);
     char expected_md5[33];
     char junk[128];
@@ -289,7 +288,7 @@ class ExternalFrameBufferTest : public ::testing::Test {
   ExternalFrameBufferTest()
       : video_(nullptr), decoder_(nullptr), num_buffers_(0) {}
 
-  virtual void SetUp() {
+  void SetUp() override {
    video_ = new libvpx_test::WebMVideoSource(kVP9TestFile);
     ASSERT_NE(video_, nullptr);
     video_->Init();
@@ -300,7 +299,7 @@ class ExternalFrameBufferTest : public ::testing::Test {
     ASSERT_NE(decoder_, nullptr);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     delete decoder_;
     decoder_ = nullptr;
     delete video_;
@@ -355,7 +354,7 @@ class ExternalFrameBufferTest : public ::testing::Test {
 
 class ExternalFrameBufferNonRefTest : public ExternalFrameBufferTest {
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     video_ = new libvpx_test::WebMVideoSource(kVP9NonRefTestFile);
     ASSERT_NE(video_, nullptr);
     video_->Init();
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 21f8dcffa0..ba9db00120 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -539,9 +539,9 @@ class FwdTrans8x8TestBase {
 class FwdTrans8x8DCT : public FwdTrans8x8TestBase,
                        public ::testing::TestWithParam<Dct8x8Param> {
  public:
-  virtual ~FwdTrans8x8DCT() {}
+  ~FwdTrans8x8DCT() override {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     fwd_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     tx_type_ = GET_PARAM(2);
@@ -551,13 +551,13 @@ class FwdTrans8x8DCT : public FwdTrans8x8TestBase,
     mask_ = (1 << bit_depth_) - 1;
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
 protected:
-  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override {
     fwd_txfm_(in, out, stride);
   }
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override {
     inv_txfm_(out, dst, stride);
   }
 
@@ -578,9 +578,9 @@ TEST_P(FwdTrans8x8DCT, InvAccuracyCheck) { RunInvAccuracyCheck(); }
 class FwdTrans8x8HT : public FwdTrans8x8TestBase,
                       public ::testing::TestWithParam<Ht8x8Param> {
  public:
-  virtual ~FwdTrans8x8HT() {}
+  ~FwdTrans8x8HT() override {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     fwd_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     tx_type_ = GET_PARAM(2);
@@ -590,13 +590,13 @@ class FwdTrans8x8HT : public FwdTrans8x8TestBase,
     mask_ = (1 << bit_depth_) - 1;
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
 protected:
-  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override {
     fwd_txfm_(in, out, stride, tx_type_);
  }
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override {
     inv_txfm_(out, dst, stride, tx_type_);
   }
 
@@ -614,9 +614,9 @@ TEST_P(FwdTrans8x8HT, ExtremalCheck) { RunExtremalCheck(); }
 class InvTrans8x8DCT : public FwdTrans8x8TestBase,
                        public ::testing::TestWithParam<Idct8x8Param> {
  public:
-  virtual ~InvTrans8x8DCT() {}
+  ~InvTrans8x8DCT() override {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     ref_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     thresh_ = GET_PARAM(2);
@@ -625,13 +625,14 @@ class InvTrans8x8DCT : public FwdTrans8x8TestBase,
     mask_ = (1 << bit_depth_) - 1;
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
 protected:
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override {
     inv_txfm_(out, dst, stride);
   }
-  void RunFwdTxfm(int16_t * /*out*/, tran_low_t * /*dst*/, int /*stride*/) {}
+  void RunFwdTxfm(int16_t * /*out*/, tran_low_t * /*dst*/,
+                  int /*stride*/) override {}
 
   IdctFunc ref_txfm_;
   IdctFunc inv_txfm_;
diff --git a/test/frame_size_tests.cc b/test/frame_size_tests.cc
index 8a0eb71ba0..266858eebb 100644
--- a/test/frame_size_tests.cc
+++ b/test/frame_size_tests.cc
@@ -65,7 +65,7 @@ class EncoderWithExpectedError : public ::libvpx_test::Encoder {
     ASSERT_EQ(expected_err, res) << EncoderError();
   }
 
-  virtual vpx_codec_iface_t *CodecInterface() const {
+  vpx_codec_iface_t *CodecInterface() const override {
 #if CONFIG_VP9_ENCODER
     return &vpx_codec_vp9_cx_algo;
 #else
@@ -79,22 +79,22 @@ class VP9FrameSizeTestsLarge : public ::libvpx_test::EncoderTest,
 protected:
   VP9FrameSizeTestsLarge()
       : EncoderTest(&::libvpx_test::kVP9), expected_res_(VPX_CODEC_OK) {}
-  virtual ~VP9FrameSizeTestsLarge() {}
+  ~VP9FrameSizeTestsLarge() override {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(::libvpx_test::kRealTime);
   }
 
-  virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
-                                  const libvpx_test::VideoSource & /*video*/,
-                                  libvpx_test::Decoder *decoder) {
+  bool HandleDecodeResult(const vpx_codec_err_t res_dec,
+                          const libvpx_test::VideoSource & /*video*/,
+                          libvpx_test::Decoder *decoder) override {
     EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError();
     return !::testing::Test::HasFailure();
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, 7);
       encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
index 9ba898b519..b22bae87cc 100644
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -130,7 +130,7 @@ std::ostream &operator<<(std::ostream &os, const HadamardFuncWithSize &hfs) {
 
 class HadamardTestBase : public ::testing::TestWithParam<HadamardFuncWithSize> {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     h_func_ = GetParam().func;
     bwh_ = GetParam().block_size;
     block_size_ = bwh_ * bwh_;
@@ -252,7 +252,7 @@ class HadamardTestBase : public ::testing::TestWithParam<HadamardFuncWithSize> {
 class HadamardLowbdTest : public HadamardTestBase {
  protected:
   // Use values between -255 (0xFF01) and 255 (0x00FF)
-  virtual int16_t Rand() {
+  int16_t Rand() override {
     int16_t src = rnd_.Rand8();
     int16_t pred = rnd_.Rand8();
     return src - pred;
@@ -335,7 +335,7 @@ INSTANTIATE_TEST_SUITE_P(
 class HadamardHighbdTest : public HadamardTestBase {
  protected:
   // Use values between -4095 (0xF001) and 4095 (0x0FFF)
-  virtual int16_t Rand() {
+  int16_t Rand() override {
    int16_t src = rnd_.Rand12();
     int16_t pred = rnd_.Rand12();
     return src - pred;
diff --git a/test/idct_test.cc b/test/idct_test.cc
index 1b9532e1c1..279e58e2aa 100644
--- a/test/idct_test.cc
+++ b/test/idct_test.cc
@@ -27,7 +27,7 @@ using libvpx_test::Buffer;
 
 class IDCTTest : public ::testing::TestWithParam<IdctFunc> {
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     UUT = GetParam();
 
     input = new Buffer<int16_t>(4, 4, 0);
@@ -41,7 +41,7 @@ class IDCTTest : public ::testing::TestWithParam<IdctFunc> {
     ASSERT_TRUE(output->Init());
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     delete input;
     delete predict;
     delete output;
diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc
index 762d585f59..c37dc0d486 100644
--- a/test/invalid_file_test.cc
+++ b/test/invalid_file_test.cc
@@ -40,7 +40,7 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest,
 protected:
   InvalidFileTest() : DecoderTest(GET_PARAM(0)), res_file_(nullptr) {}
 
-  virtual ~InvalidFileTest() {
+  ~InvalidFileTest() override {
     if (res_file_ != nullptr) fclose(res_file_);
   }
 
@@ -50,10 +50,9 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest,
         << "Result file open failed. Filename: " << res_file_name_;
   }
 
-  virtual bool HandleDecodeResult(
-      const vpx_codec_err_t res_dec,
-      const libvpx_test::CompressedVideoSource &video,
-      libvpx_test::Decoder *decoder) {
+  bool HandleDecodeResult(const vpx_codec_err_t res_dec,
+                          const libvpx_test::CompressedVideoSource &video,
+                          libvpx_test::Decoder *decoder) override {
     EXPECT_NE(res_file_, nullptr);
     int expected_res_dec;
 
@@ -172,9 +171,9 @@ VP9_INSTANTIATE_TEST_SUITE(InvalidFileTest,
 class InvalidFileInvalidPeekTest : public InvalidFileTest {
  protected:
   InvalidFileInvalidPeekTest() : InvalidFileTest() {}
-  virtual void HandlePeekResult(libvpx_test::Decoder *const /*decoder*/,
-                                libvpx_test::CompressedVideoSource * /*video*/,
-                                const vpx_codec_err_t /*res_peek*/) {}
+  void HandlePeekResult(libvpx_test::Decoder *const /*decoder*/,
+                        libvpx_test::CompressedVideoSource * /*video*/,
+                        const vpx_codec_err_t /*res_peek*/) override {}
 };
 
 TEST_P(InvalidFileInvalidPeekTest, ReturnCode) { RunTest(); }
diff --git a/test/keyframe_test.cc b/test/keyframe_test.cc
index a13dec9ce2..d624cb19d6 100644
--- a/test/keyframe_test.cc
+++ b/test/keyframe_test.cc
@@ -22,9 +22,9 @@ class KeyframeTest
       public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
 protected:
   KeyframeTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~KeyframeTest() {}
+  ~KeyframeTest() override {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
     kf_count_ = 0;
@@ -33,8 +33,8 @@ class KeyframeTest
     set_cpu_used_ = 0;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (kf_do_force_kf_) {
       frame_flags_ = (video->frame() % 3) ? 0 : VPX_EFLAG_FORCE_KF;
     }
@@ -43,7 +43,7 @@ class KeyframeTest
     }
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) {
       kf_pts_list_.push_back(pkt->data.frame.pts);
       kf_count_++;
diff --git a/test/level_test.cc b/test/level_test.cc
index 038d75f44f..8e653d93e1 100644
--- a/test/level_test.cc
+++ b/test/level_test.cc
@@ -22,9 +22,9 @@ class LevelTest
       : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
         cpu_used_(GET_PARAM(2)), min_gf_internal_(24), target_level_(0),
         level_(0) {}
-  virtual ~LevelTest() {}
+  ~LevelTest() override {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
     if (encoding_mode_ != ::libvpx_test::kRealTime) {
@@ -41,8 +41,8 @@ class LevelTest
     cfg_.rc_min_quantizer = 0;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
       encoder->Control(VP9E_SET_TARGET_LEVEL, target_level_);
diff --git a/test/lpf_test.cc b/test/lpf_test.cc
index 4cc99a6db4..2f04194dce 100644
--- a/test/lpf_test.cc
+++ b/test/lpf_test.cc
@@ -129,15 +129,15 @@ uint8_t GetHevThresh(ACMRandom *rnd) {
 
 class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
  public:
-  virtual ~Loop8Test6Param() {}
-  virtual void SetUp() {
+  ~Loop8Test6Param() override {}
+  void SetUp() override {
     loopfilter_op_ = GET_PARAM(0);
     ref_loopfilter_op_ = GET_PARAM(1);
     bit_depth_ = GET_PARAM(2);
     mask_ = (1 << bit_depth_) - 1;
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
   int bit_depth_;
@@ -151,15 +151,15 @@ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test6Param);
     (HAVE_DSPR2 || HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH)
 class Loop8Test9Param : public ::testing::TestWithParam<dualloop8_param_t> {
  public:
-  virtual ~Loop8Test9Param() {}
-  virtual void SetUp() {
+  ~Loop8Test9Param() override {}
+  void SetUp() override {
     loopfilter_op_ = GET_PARAM(0);
     ref_loopfilter_op_ = GET_PARAM(1);
     bit_depth_ = GET_PARAM(2);
     mask_ = (1 << bit_depth_) - 1;
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
   int bit_depth_;
diff --git a/test/minmax_test.cc b/test/minmax_test.cc
index e710af6991..b495709063 100644
--- a/test/minmax_test.cc
+++ b/test/minmax_test.cc
@@ -29,7 +29,7 @@ typedef void (*MinMaxFunc)(const uint8_t *a, int a_stride, const uint8_t *b,
 
 class MinMaxTest : public ::testing::TestWithParam<MinMaxFunc> {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     mm_func_ = GetParam();
     rnd_.Reset(ACMRandom::DeterministicSeed());
   }
diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc
index b7c0c050af..6593ac68e9 100644
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -59,8 +59,8 @@ const int kCountTestBlock = 1000;
 
 class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {
  public:
-  virtual ~PartialIDctTest() {}
-  virtual void SetUp() {
+  ~PartialIDctTest() override {}
+  void SetUp() override {
     rnd_.Reset(ACMRandom::DeterministicSeed());
     fwd_txfm_ = GET_PARAM(0);
     full_inv_txfm_ = GET_PARAM(1);
@@ -100,7 +100,7 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {
         vpx_memalign(16, pixel_size_ * output_block_size_));
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     vpx_free(input_block_);
     input_block_ = nullptr;
     vpx_free(output_block_);
diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc
index 27d5ffa907..d2db8a7c7d 100644
--- a/test/pp_filter_test.cc
+++ b/test/pp_filter_test.cc
@@ -51,10 +51,10 @@ class VpxPostProcDownAndAcrossMbRowTest
  public:
   VpxPostProcDownAndAcrossMbRowTest()
       : mb_post_proc_down_and_across_(GetParam()) {}
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
 protected:
-  virtual void Run();
+  void Run() override;
 
   const VpxPostProcDownAndAcrossMbRowFunc mb_post_proc_down_and_across_;
   // Size of the underlying data block that will be filtered.
@@ -227,10 +227,10 @@ class VpxMbPostProcAcrossIpTest
   VpxMbPostProcAcrossIpTest()
       : rows_(16), cols_(16), mb_post_proc_across_ip_(GetParam()),
         src_(Buffer<uint8_t>(rows_, cols_, 8, 8, 17, 8)) {}
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
 protected:
-  virtual void Run();
+  void Run() override;
 
   void SetCols(unsigned char *s, int rows, int cols, int src_width) {
     for (int r = 0; r < rows; r++) {
@@ -356,10 +356,10 @@ class VpxMbPostProcDownTest
       : rows_(16), cols_(16), mb_post_proc_down_(GetParam()),
         src_c_(Buffer<uint8_t>(rows_, cols_, 8, 8, 8, 17)) {}
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
 protected:
-  virtual void Run();
+  void Run() override;
 
   void SetRows(unsigned char *src_c, int rows, int cols, int src_width) {
     for (int r = 0; r < rows; r++) {
diff --git a/test/predict_test.cc b/test/predict_test.cc
index 7472970576..fbf42077b3 100644
--- a/test/predict_test.cc
+++ b/test/predict_test.cc
@@ -43,7 +43,7 @@ class PredictTestBase : public AbstractBench,
       : width_(GET_PARAM(0)), height_(GET_PARAM(1)), predict_(GET_PARAM(2)),
         src_(nullptr), padded_dst_(nullptr), dst_(nullptr), dst_c_(nullptr) {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     src_ = new uint8_t[kSrcSize];
     ASSERT_NE(src_, nullptr);
 
@@ -64,7 +64,7 @@ class PredictTestBase : public AbstractBench,
     memset(dst_c_, 0, 16 * 16);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     delete[] src_;
     src_ = nullptr;
     vpx_free(padded_dst_);
@@ -209,7 +209,7 @@ class PredictTestBase : public AbstractBench,
     }
   }
 
-  void Run() {
+  void Run() override {
     for (int xoffset = 0; xoffset < 8; ++xoffset) {
       for (int yoffset = 0; yoffset < 8; ++yoffset) {
         if (xoffset == 0 && yoffset == 0) {
diff --git a/test/quantize_test.cc b/test/quantize_test.cc
index 57309e8102..ab38f5c1b0 100644
--- a/test/quantize_test.cc
+++ b/test/quantize_test.cc
@@ -121,13 +121,13 @@ class QuantizeTest : public QuantizeTestBase,
                      public ::testing::TestWithParam<VP8QuantizeParam>,
                      public AbstractBench {
 protected:
-  virtual void SetUp() {
+  void SetUp() override {
     SetupCompressor();
     asm_quant_ = GET_PARAM(0);
     c_quant_ = GET_PARAM(1);
   }
 
-  virtual void Run() {
+  void Run() override {
     asm_quant_(&vp8_comp_->mb.block[0], &macroblockd_dst_->block[0]);
   }
 
diff --git a/test/resize_test.cc b/test/resize_test.cc
index 715bb9d70f..3eb842f549 100644
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@@ -247,10 +247,10 @@ class ResizingVideoSource : public ::libvpx_test::DummyVideoSource {
   }
   bool flag_codec_;
   bool smaller_width_larger_size_;
-  virtual ~ResizingVideoSource() {}
+  ~ResizingVideoSource() override {}
 
 protected:
-  virtual void Next() {
+  void Next() override {
     ++frame_;
     unsigned int width = 0;
     unsigned int height = 0;
@@ -267,14 +267,14 @@ class ResizeTest
 protected:
   ResizeTest() : EncoderTest(GET_PARAM(0)) {}
 
-  virtual ~ResizeTest() {}
+  ~ResizeTest() override {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     ASSERT_NE(static_cast<unsigned int>(pkt->data.frame.width[0]), 0);
     ASSERT_NE(static_cast<unsigned int>(pkt->data.frame.height[0]), 0);
     encode_frame_width_.push_back(pkt->data.frame.width[0]);
@@ -289,8 +289,8 @@ class ResizeTest
     return encode_frame_height_[idx];
   }
 
-  virtual void DecompressedFrameHook(const vpx_image_t &img,
-                                     vpx_codec_pts_t pts) {
+  void DecompressedFrameHook(const vpx_image_t &img,
+                             vpx_codec_pts_t pts) override {
     frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
   }
 
@@ -336,15 +336,15 @@ class ResizeInternalTest : public ResizeTest {
   ResizeInternalTest() : ResizeTest(), frame0_psnr_(0.0) {}
 #endif
 
-  virtual ~ResizeInternalTest() {}
+  ~ResizeInternalTest() override {}
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
#if WRITE_COMPRESSED_STREAM
     outfile_ = fopen("vp90-2-05-resize.ivf", "wb");
 #endif
   }
 
-  virtual void EndPassHook() {
+  void EndPassHook() override {
 #if WRITE_COMPRESSED_STREAM
     if (outfile_) {
       if (!fseek(outfile_, 0, SEEK_SET))
@@ -355,8 +355,8 @@ class ResizeInternalTest : public ResizeTest {
 #endif
   }
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
     if (change_config_) {
       int new_q = 60;
       if (video->frame() == 0) {
@@ -381,7 +381,7 @@ class ResizeInternalTest : public ResizeTest {
     }
   }
 
-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0];
     EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0);
   }
@@ -450,10 +450,10 @@ class ResizeRealtimeTest
       public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
 protected:
   ResizeRealtimeTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~ResizeRealtimeTest() {}
+  ~ResizeRealtimeTest() override {}
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP9E_SET_AQ_MODE, 3);
       encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
@@ -466,24 +466,24 @@ class ResizeRealtimeTest
     }
   }
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
     set_cpu_used_ = GET_PARAM(2);
   }
 
-  virtual void DecompressedFrameHook(const vpx_image_t &img,
-                                     vpx_codec_pts_t pts) {
+  void DecompressedFrameHook(const vpx_image_t &img,
+                             vpx_codec_pts_t pts) override {
     frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
   }
 
-  virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) {
+  void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) override {
     double mismatch_psnr = compute_psnr(img1, img2);
     mismatch_psnr_ += mismatch_psnr;
     ++mismatch_nframes_;
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     ASSERT_NE(static_cast<unsigned int>(pkt->data.frame.width[0]), 0);
     ASSERT_NE(static_cast<unsigned int>(pkt->data.frame.height[0]), 0);
     encode_frame_width_.push_back(pkt->data.frame.width[0]);
@@ -693,15 +693,15 @@ class ResizeCspTest : public ResizeTest {
   ResizeCspTest() : ResizeTest(), frame0_psnr_(0.0) {}
 #endif
 
-  virtual ~ResizeCspTest() {}
+  ~ResizeCspTest() override {}
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
 #if WRITE_COMPRESSED_STREAM
     outfile_ = fopen("vp91-2-05-cspchape.ivf", "wb");
 #endif
   }
 
-  virtual void EndPassHook() {
+  void EndPassHook() override {
 #if WRITE_COMPRESSED_STREAM
     if (outfile_) {
       if (!fseek(outfile_, 0, SEEK_SET))
@@ -712,8 +712,8 @@ class ResizeCspTest : public ResizeTest {
 #endif
   }
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
     if (CspForFrameNumber(video->frame()) != VPX_IMG_FMT_I420 &&
         cfg_.g_profile != 1) {
       cfg_.g_profile = 1;
@@ -726,7 +726,7 @@ class ResizeCspTest : public ResizeTest {
     }
   }
 
-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0];
     EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0);
   }
@@ -758,10 +758,10 @@ class ResizingCspVideoSource : public ::libvpx_test::DummyVideoSource {
     limit_ = 30;
   }
 
-  virtual ~ResizingCspVideoSource() {}
+  ~ResizingCspVideoSource() override {}
 
 protected:
-  virtual void Next() {
+  void Next() override {
     ++frame_;
     SetImageFormat(CspForFrameNumber(frame_));
     FillFrame();
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 92b3a14d68..83c4fe0c36 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -73,7 +73,7 @@ class SADTestBase : public ::testing::TestWithParam<ParamType> {
 public:
   explicit SADTestBase(const ParamType &params) : params_(params) {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     source_data8_ = reinterpret_cast<uint8_t *>(
         vpx_memalign(kDataAlignment, kDataBlockSize));
     reference_data8_ = reinterpret_cast<uint8_t *>(
@@ -108,7 +108,7 @@ class SADTestBase : public ::testing::TestWithParam<ParamType> {
     rnd_.Reset(ACMRandom::DeterministicSeed());
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     vpx_free(source_data8_);
     source_data8_ = nullptr;
     vpx_free(reference_data8_);
diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc
index df6da84037..1943acf8b3 100644
--- a/test/sum_squares_test.cc
+++ b/test/sum_squares_test.cc
@@ -33,13 +33,13 @@ typedef std::tuple<SSI16Func, SSI16Func> SumSquaresParam;
 
 class SumSquaresTest : public ::testing::TestWithParam<SumSquaresParam> {
  public:
-  virtual ~SumSquaresTest() {}
-  virtual void SetUp() {
+  ~SumSquaresTest() override {}
+  void SetUp() override {
     ref_func_ = GET_PARAM(0);
     tst_func_ = GET_PARAM(1);
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
 protected:
   SSI16Func ref_func_;
diff --git a/test/superframe_test.cc b/test/superframe_test.cc
index a5c92e9147..4a522dd496 100644
--- a/test/superframe_test.cc
+++ b/test/superframe_test.cc
@@ -28,9 +28,9 @@ class SuperframeTest
 protected:
   SuperframeTest()
       : EncoderTest(GET_PARAM(0)), modified_buf_(nullptr), last_sf_pts_(0) {}
-  virtual ~SuperframeTest() {}
+  ~SuperframeTest() override {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     const SuperframeTestParam input = GET_PARAM(1);
     const libvpx_test::TestMode mode = std::get<kTestMode>(input);
@@ -39,17 +39,17 @@ class SuperframeTest
     sf_count_max_ = INT_MAX;
   }
 
-  virtual void TearDown() { delete[] modified_buf_; }
+  void TearDown() override { delete[] modified_buf_; }
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
     }
   }
 
-  virtual const vpx_codec_cx_pkt_t *MutateEncoderOutputHook(
-      const vpx_codec_cx_pkt_t *pkt) {
+  const vpx_codec_cx_pkt_t *MutateEncoderOutputHook(
+      const vpx_codec_cx_pkt_t *pkt) override {
     if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return pkt;
 
     const uint8_t *buffer = reinterpret_cast<const uint8_t *>(pkt->data.frame.buf);
diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc
index d571f50860..723538368f 100644
--- a/test/svc_datarate_test.cc
+++ b/test/svc_datarate_test.cc
@@ -43,7 +43,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc {
   }
 
 protected:
-  virtual ~DatarateOnePassCbrSvc() {}
+  ~DatarateOnePassCbrSvc() override {}
 
   virtual void ResetModel() {
     last_pts_ = 0;
@@ -86,7 +86,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc {
     }
     ksvc_flex_noupd_tlenh_ = false;
   }
-  virtual void BeginPassHook(unsigned int /*pass*/) {}
+ void BeginPassHook(unsigned int /*pass*/) override {} // Example pattern for spatial layers and 2 temporal layers used in the // bypass/flexible mode. The pattern corresponds to the pattern @@ -179,8 +179,8 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { } } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { PreEncodeFrameHookSetup(video, encoder); if (video->frame() == 0) { @@ -468,7 +468,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { return VPX_CODEC_OK; } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { uint32_t sizes[8] = { 0 }; uint32_t sizes_parsed[8] = { 0 }; int count = 0; @@ -571,7 +571,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { } } - virtual void EndPassHook() { + void EndPassHook() override { if (change_bitrate_) last_pts_ = last_pts_ - last_pts_ref_; duration_ = (last_pts_ + 1) * timebase_; for (int sl = 0; sl < number_spatial_layers_; ++sl) { @@ -583,7 +583,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { } } - virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) { + void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) override { // TODO(marpan): Look into why an assert is triggered in compute_psnr // for mismatch frames for the special test case: ksvc_flex_noupd_tlenh. // Has to do with dropped frames in bypass/flexible svc mode. @@ -639,7 +639,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { bool ksvc_flex_noupd_tlenh_; private: - virtual void SetConfig(const int num_temporal_layer) { + void SetConfig(const int num_temporal_layer) override { cfg_.rc_end_usage = VPX_CBR; cfg_.g_lag_in_frames = 0; cfg_.g_error_resilient = 1; @@ -670,10 +670,10 @@ class DatarateOnePassCbrSvcSingleBR DatarateOnePassCbrSvcSingleBR() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - virtual ~DatarateOnePassCbrSvcSingleBR() {} + ~DatarateOnePassCbrSvcSingleBR() override {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); speed_setting_ = GET_PARAM(1); @@ -1160,10 +1160,10 @@ class DatarateOnePassCbrSvcMultiBR DatarateOnePassCbrSvcMultiBR() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - virtual ~DatarateOnePassCbrSvcMultiBR() {} + ~DatarateOnePassCbrSvcMultiBR() override {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); speed_setting_ = GET_PARAM(1); @@ -1243,10 +1243,10 @@ class DatarateOnePassCbrSvcFrameDropMultiBR : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - virtual ~DatarateOnePassCbrSvcFrameDropMultiBR() {} + ~DatarateOnePassCbrSvcFrameDropMultiBR() override {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); speed_setting_ = GET_PARAM(1); @@ -1355,10 +1355,10 @@ class DatarateOnePassCbrSvcInterLayerPredSingleBR : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - virtual ~DatarateOnePassCbrSvcInterLayerPredSingleBR() {} + ~DatarateOnePassCbrSvcInterLayerPredSingleBR() override {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); 
speed_setting_ = GET_PARAM(1); @@ -1441,10 +1441,10 @@ class DatarateOnePassCbrSvcDenoiser DatarateOnePassCbrSvcDenoiser() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - virtual ~DatarateOnePassCbrSvcDenoiser() {} + ~DatarateOnePassCbrSvcDenoiser() override {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); speed_setting_ = GET_PARAM(1); @@ -1499,10 +1499,10 @@ class DatarateOnePassCbrSvcSmallKF DatarateOnePassCbrSvcSmallKF() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - virtual ~DatarateOnePassCbrSvcSmallKF() {} + ~DatarateOnePassCbrSvcSmallKF() override {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); speed_setting_ = GET_PARAM(1); @@ -1702,10 +1702,10 @@ class DatarateOnePassCbrSvcPostencodeDrop DatarateOnePassCbrSvcPostencodeDrop() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - virtual ~DatarateOnePassCbrSvcPostencodeDrop() {} + ~DatarateOnePassCbrSvcPostencodeDrop() override {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); speed_setting_ = GET_PARAM(1); diff --git a/test/svc_end_to_end_test.cc b/test/svc_end_to_end_test.cc index 7300ce6679..c9ef35bbe6 100644 --- a/test/svc_end_to_end_test.cc +++ b/test/svc_end_to_end_test.cc @@ -45,19 +45,19 @@ class ScalePartitionOnePassCbrSvc } protected: - virtual ~ScalePartitionOnePassCbrSvc() {} + ~ScalePartitionOnePassCbrSvc() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); speed_setting_ = 7; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { PreEncodeFrameHookSetup(video, encoder); } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { // Keep track of number of non-reference frames, needed for mismatch check. // Non-reference frames are top spatial and temporal layer frames, // for TL > 0. 
@@ -67,12 +67,12 @@ class ScalePartitionOnePassCbrSvc num_nonref_frames_++; } - virtual void MismatchHook(const vpx_image_t * /*img1*/, - const vpx_image_t * /*img2*/) { + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override { ++mismatch_nframes_; } - virtual void SetConfig(const int /*num_temporal_layer*/) {} + void SetConfig(const int /*num_temporal_layer*/) override {} unsigned int GetMismatchFrames() const { return mismatch_nframes_; } unsigned int GetNonRefFrames() const { return num_nonref_frames_; } @@ -129,14 +129,14 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, } protected: - virtual ~SyncFrameOnePassCbrSvc() {} + ~SyncFrameOnePassCbrSvc() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); speed_setting_ = 7; } - virtual bool DoDecode() const { + bool DoDecode() const override { return current_video_frame_ >= frame_to_start_decode_; } @@ -225,8 +225,8 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, } } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { current_video_frame_ = video->frame(); PreEncodeFrameHookSetup(video, encoder); if (video->frame() == 0) { @@ -265,8 +265,8 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, } #if CONFIG_VP9_DECODER - virtual void PreDecodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Decoder *decoder) { + void PreDecodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Decoder *decoder) override { if (video->frame() < frame_to_sync_) { if (decode_to_layer_before_sync_ >= 0) decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER, @@ -284,7 +284,7 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, } #endif - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { // Keep track of number of non-reference frames, needed for mismatch check. // Non-reference frames are top spatial and temporal layer frames, // for TL > 0. 
@@ -307,8 +307,8 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, } } - virtual void MismatchHook(const vpx_image_t * /*img1*/, - const vpx_image_t * /*img2*/) { + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override { if (current_video_frame_ >= frame_to_sync_) ++mismatch_nframes_; } @@ -331,7 +331,7 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, vpx_svc_ref_frame_config_t ref_frame_config_; private: - virtual void SetConfig(const int num_temporal_layer) { + void SetConfig(const int num_temporal_layer) override { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; cfg_.rc_buf_sz = 1000; @@ -657,15 +657,15 @@ class LoopfilterOnePassCbrSvc : public OnePassCbrSvc, } protected: - virtual ~LoopfilterOnePassCbrSvc() {} + ~LoopfilterOnePassCbrSvc() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); speed_setting_ = 7; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { PreEncodeFrameHookSetup(video, encoder); if (number_temporal_layers_ > 1 || number_spatial_layers_ > 1) { // Consider 3 cases: @@ -694,7 +694,7 @@ class LoopfilterOnePassCbrSvc : public OnePassCbrSvc, } } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { // Keep track of number of non-reference frames, needed for mismatch check. // Non-reference frames are top spatial and temporal layer frames, // for TL > 0. @@ -704,12 +704,12 @@ class LoopfilterOnePassCbrSvc : public OnePassCbrSvc, num_nonref_frames_++; } - virtual void MismatchHook(const vpx_image_t * /*img1*/, - const vpx_image_t * /*img2*/) { + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override { ++mismatch_nframes_; } - virtual void SetConfig(const int /*num_temporal_layer*/) {} + void SetConfig(const int /*num_temporal_layer*/) override {} int GetMismatchFrames() const { return mismatch_nframes_; } int GetNonRefFrames() const { return num_nonref_frames_; } diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc index ca990f4dd4..ee552113ce 100644 --- a/test/test_vector_test.cc +++ b/test/test_vector_test.cc @@ -48,7 +48,7 @@ class TestVectorTest : public ::libvpx_test::DecoderTest, #endif } - virtual ~TestVectorTest() { + ~TestVectorTest() override { if (md5_file_) fclose(md5_file_); } @@ -59,9 +59,8 @@ class TestVectorTest : public ::libvpx_test::DecoderTest, } #if CONFIG_VP9_DECODER - virtual void PreDecodeFrameHook( - const libvpx_test::CompressedVideoSource &video, - libvpx_test::Decoder *decoder) { + void PreDecodeFrameHook(const libvpx_test::CompressedVideoSource &video, + libvpx_test::Decoder *decoder) override { if (video.frame_number() == 0 && mt_mode_ >= 0) { if (mt_mode_ == 1) { decoder->Control(VP9D_SET_LOOP_FILTER_OPT, 1); @@ -77,8 +76,8 @@ class TestVectorTest : public ::libvpx_test::DecoderTest, } #endif - virtual void DecompressedFrameHook(const vpx_image_t &img, - const unsigned int frame_number) { + void DecompressedFrameHook(const vpx_image_t &img, + const unsigned int frame_number) override { ASSERT_NE(md5_file_, nullptr); char expected_md5[33]; char junk[128]; diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc index d92c13f88e..dab6e531b7 100644 --- a/test/tile_independence_test.cc +++ b/test/tile_independence_test.cc @@ -36,18 +36,18 @@ class 
TileIndependenceTest : public ::libvpx_test::EncoderTest,
     inv_dec_->Control(VP9_INVERT_TILE_DECODE_ORDER, 1);
   }
 
-  virtual ~TileIndependenceTest() {
+  ~TileIndependenceTest() override {
     delete fw_dec_;
     delete inv_dec_;
   }
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(libvpx_test::kTwoPassGood);
   }
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP9E_SET_TILE_COLUMNS, n_tiles_);
     }
@@ -65,7 +65,7 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest,
     md5->Add(img);
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     UpdateMD5(fw_dec_, pkt, &md5_fw_order_);
     UpdateMD5(inv_dec_, pkt, &md5_inv_order_);
   }
diff --git a/test/timestamp_test.cc b/test/timestamp_test.cc
index 645a9f2ff8..d6f0b3bda2 100644
--- a/test/timestamp_test.cc
+++ b/test/timestamp_test.cc
@@ -42,16 +42,16 @@ class DummyTimebaseVideoSource
            (static_cast<double>(framerate_numerator_) / framerate_denominator_);
   }
 
-  virtual vpx_codec_pts_t pts() const {
+  vpx_codec_pts_t pts() const override {
     return static_cast<vpx_codec_pts_t>(frame_ * FrameDuration() +
                                         starting_pts_ + 0.5);
   }
 
-  virtual unsigned long duration() const {
+  unsigned long duration() const override {
     return static_cast<unsigned long>(FrameDuration() + 0.5);
   }
 
-  virtual vpx_rational_t timebase() const { return timebase_; }
+  vpx_rational_t timebase() const override { return timebase_; }
 
   void set_starting_pts(int64_t starting_pts) { starting_pts_ = starting_pts; }
 
@@ -67,9 +67,9 @@ class TimestampTest
       public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
  protected:
   TimestampTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~TimestampTest() {}
+  ~TimestampTest() override {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
   }
diff --git a/test/variance_test.cc b/test/variance_test.cc
index df9a1c56f6..6885252b82 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -210,7 +210,7 @@ class SumOfSquaresTest : public ::testing::TestWithParam<SumOfSquaresFunction> {
  public:
   SumOfSquaresTest() : func_(GetParam()) {}
 
-  virtual ~SumOfSquaresTest() { libvpx_test::ClearSystemState(); }
+  ~SumOfSquaresTest() override { libvpx_test::ClearSystemState(); }
 
  protected:
   void ConstTest();
@@ -289,7 +289,7 @@ template <typename FunctionType>
 class MainTestClass
     : public ::testing::TestWithParam<TestParams<FunctionType> > {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     params_ = this->GetParam();
 
     rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -308,7 +308,7 @@ class MainTestClass
 #endif
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
 #if CONFIG_VP9_HIGHBITDEPTH
     if (use_high_bit_depth()) {
       // TODO(skal): remove!
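// Most fixtures touched here are googletest value-parameterized tests: the
// class derives from ::testing::TestWithParam<T> and reads its variant with
// GetParam() (or the GET_PARAM(n) wrapper) inside SetUp(). A self-contained
// sketch of that shape; FooFunc, FooTest, and Identity are hypothetical
// stand-ins for the real function tables:

#include "third_party/googletest/src/include/gtest/gtest.h"

typedef int (*FooFunc)(int x);

static int Identity(int x) { return x; }

class FooTest : public ::testing::TestWithParam<FooFunc> {
 public:
  void SetUp() override { func_ = GetParam(); }  // runs once per parameter

 protected:
  FooFunc func_ = nullptr;
};

TEST_P(FooTest, ReturnsInput) { EXPECT_EQ(3, func_(3)); }

// Each entry handed to Values() instantiates the whole suite once more.
INSTANTIATE_TEST_SUITE_P(C, FooTest, ::testing::Values(&Identity));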
@@ -568,7 +568,7 @@ template class SubpelVarianceTest : public ::testing::TestWithParam > { public: - virtual void SetUp() { + void SetUp() override { params_ = this->GetParam(); rnd_.Reset(ACMRandom::DeterministicSeed()); @@ -592,7 +592,7 @@ class SubpelVarianceTest ASSERT_NE(ref_, nullptr); } - virtual void TearDown() { + void TearDown() override { if (!use_high_bit_depth()) { vpx_free(src_); vpx_free(sec_); diff --git a/test/vp8_datarate_test.cc b/test/vp8_datarate_test.cc index 64a861d15e..c91c2a0d26 100644 --- a/test/vp8_datarate_test.cc +++ b/test/vp8_datarate_test.cc @@ -24,10 +24,10 @@ class DatarateTestLarge public: DatarateTestLarge() : EncoderTest(GET_PARAM(0)) {} - virtual ~DatarateTestLarge() {} + ~DatarateTestLarge() override {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); set_cpu_used_ = GET_PARAM(2); @@ -47,8 +47,8 @@ class DatarateTestLarge use_roi_ = false; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_); encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); @@ -74,7 +74,7 @@ class DatarateTestLarge duration_ = 0; } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { // Time since last timestamp = duration. vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; @@ -121,7 +121,7 @@ class DatarateTestLarge ++frame_number_; } - virtual void EndPassHook() { + void EndPassHook() override { if (bits_total_) { const double file_size_in_kb = bits_total_ / 1000.; // bits per kilobit @@ -301,7 +301,7 @@ TEST_P(DatarateTestLarge, DropFramesMultiThreads) { class DatarateTestRealTime : public DatarateTestLarge { public: - virtual ~DatarateTestRealTime() {} + ~DatarateTestRealTime() override {} }; #if CONFIG_TEMPORAL_DENOISING diff --git a/test/vp8_denoiser_sse2_test.cc b/test/vp8_denoiser_sse2_test.cc index 8cb84ddd8e..a6d414e508 100644 --- a/test/vp8_denoiser_sse2_test.cc +++ b/test/vp8_denoiser_sse2_test.cc @@ -30,11 +30,11 @@ namespace { const int kNumPixels = 16 * 16; class VP8DenoiserTest : public ::testing::TestWithParam { public: - virtual ~VP8DenoiserTest() {} + ~VP8DenoiserTest() override {} - virtual void SetUp() { increase_denoising_ = GetParam(); } + void SetUp() override { increase_denoising_ = GetParam(); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: int increase_denoising_; diff --git a/test/vp8_fdct4x4_test.cc b/test/vp8_fdct4x4_test.cc index 1b73a72a01..66d5c151c5 100644 --- a/test/vp8_fdct4x4_test.cc +++ b/test/vp8_fdct4x4_test.cc @@ -74,7 +74,7 @@ using libvpx_test::ACMRandom; class FdctTest : public ::testing::TestWithParam { public: - virtual void SetUp() { + void SetUp() override { fdct_func_ = GetParam(); rnd_.Reset(ACMRandom::DeterministicSeed()); } diff --git a/test/vp8_fragments_test.cc b/test/vp8_fragments_test.cc index 6e5baf229d..bb527966c5 100644 --- a/test/vp8_fragments_test.cc +++ b/test/vp8_fragments_test.cc @@ -17,9 +17,9 @@ class VP8FragmentsTest : public ::libvpx_test::EncoderTest, public ::testing::Test { protected: VP8FragmentsTest() : EncoderTest(&::libvpx_test::kVP8) {} - virtual ~VP8FragmentsTest() {} + ~VP8FragmentsTest() override {} - virtual void SetUp() { + void SetUp() 
override { const unsigned long init_flags = // NOLINT(runtime/int) VPX_CODEC_USE_OUTPUT_PARTITION; InitializeConfig(); diff --git a/test/vp8_ratectrl_rtc_test.cc b/test/vp8_ratectrl_rtc_test.cc index b76bcae11d..70d73c52d6 100644 --- a/test/vp8_ratectrl_rtc_test.cc +++ b/test/vp8_ratectrl_rtc_test.cc @@ -53,10 +53,10 @@ class Vp8RcInterfaceTest public: Vp8RcInterfaceTest() : EncoderTest(GET_PARAM(0)), key_interval_(3000), encoder_exit_(false) {} - virtual ~Vp8RcInterfaceTest() {} + ~Vp8RcInterfaceTest() override {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); } @@ -111,8 +111,8 @@ class Vp8RcInterfaceTest return layer_id; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (rc_cfg_.ts_number_layers > 1) { const int layer_id = SetLayerId(video->frame(), cfg_.ts_number_layers); const int frame_flags = @@ -139,7 +139,7 @@ class Vp8RcInterfaceTest encoder_exit_ = video->frame() == test_video_.frames; } - virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { if (encoder_exit_) { return; } @@ -149,7 +149,7 @@ class Vp8RcInterfaceTest ASSERT_EQ(rc_api_->GetQP(), qp); } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { rc_api_->PostEncodeUpdate(pkt->data.frame.sz); } diff --git a/test/vp9_arf_freq_test.cc b/test/vp9_arf_freq_test.cc index c7e6f1af02..7cc9a28396 100644 --- a/test/vp9_arf_freq_test.cc +++ b/test/vp9_arf_freq_test.cc @@ -86,9 +86,9 @@ class ArfFreqTest : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)), test_encode_param_(GET_PARAM(2)), min_arf_requested_(GET_PARAM(3)) {} - virtual ~ArfFreqTest() {} + ~ArfFreqTest() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(test_encode_param_.mode); if (test_encode_param_.mode != ::libvpx_test::kRealTime) { @@ -104,7 +104,7 @@ class ArfFreqTest dec_cfg_.threads = 4; } - virtual void BeginPassHook(unsigned int) { + void BeginPassHook(unsigned int) override { min_run_ = ARF_NOT_SEEN; run_of_visible_frames_ = 0; } @@ -126,7 +126,7 @@ class ArfFreqTest return frames; } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return; const int frames = GetNumFramesInPkt(pkt); if (frames == 1) { @@ -145,8 +145,8 @@ class ArfFreqTest } } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1); encoder->Control(VP9E_SET_TILE_COLUMNS, 4); diff --git a/test/vp9_block_error_test.cc b/test/vp9_block_error_test.cc index 9e9595ebe9..4ff1838c34 100644 --- a/test/vp9_block_error_test.cc +++ b/test/vp9_block_error_test.cc @@ -53,14 +53,14 @@ int64_t BlockError8BitWrapper(const tran_low_t *coeff, class BlockErrorTest : public ::testing::TestWithParam { public: - virtual ~BlockErrorTest() {} - virtual void SetUp() { + ~BlockErrorTest() override {} + void SetUp() override { error_block_op_ = GET_PARAM(0); ref_error_block_op_ = GET_PARAM(1); bit_depth_ = GET_PARAM(2); } - virtual void 
TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: vpx_bit_depth_t bit_depth_; diff --git a/test/vp9_datarate_test.cc b/test/vp9_datarate_test.cc index 7e91807492..48480d6fa1 100644 --- a/test/vp9_datarate_test.cc +++ b/test/vp9_datarate_test.cc @@ -28,7 +28,7 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest { } protected: - virtual ~DatarateTestVP9() {} + ~DatarateTestVP9() override {} virtual void ResetModel() { last_pts_ = 0; @@ -113,8 +113,8 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest { return layer_id; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); @@ -164,7 +164,7 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest { duration_ = 0; } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { // Time since last timestamp = duration. vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; @@ -202,7 +202,7 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest { ++tot_frame_number_; } - virtual void EndPassHook() { + void EndPassHook() override { for (int layer = 0; layer < static_cast(cfg_.ts_number_layers); ++layer) { duration_ = (last_pts_ + 1) * timebase_; @@ -243,7 +243,7 @@ class DatarateTestVP9RealTimeMultiBR DatarateTestVP9RealTimeMultiBR() : DatarateTestVP9(GET_PARAM(0)) {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); set_cpu_used_ = GET_PARAM(1); @@ -259,7 +259,7 @@ class DatarateTestVP9LargeVBR DatarateTestVP9LargeVBR() : DatarateTestVP9(GET_PARAM(0)) {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); set_cpu_used_ = GET_PARAM(1); @@ -579,10 +579,10 @@ class DatarateTestVP9RealTime : public DatarateTestVP9, public ::libvpx_test::CodecTestWithParam { public: DatarateTestVP9RealTime() : DatarateTestVP9(GET_PARAM(0)) {} - virtual ~DatarateTestVP9RealTime() {} + ~DatarateTestVP9RealTime() override {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); set_cpu_used_ = GET_PARAM(1); @@ -731,10 +731,10 @@ class DatarateTestVP9RealTimeDeltaQUV public ::libvpx_test::CodecTestWith2Params { public: DatarateTestVP9RealTimeDeltaQUV() : DatarateTestVP9(GET_PARAM(0)) {} - virtual ~DatarateTestVP9RealTimeDeltaQUV() {} + ~DatarateTestVP9RealTimeDeltaQUV() override {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); set_cpu_used_ = GET_PARAM(1); @@ -779,7 +779,7 @@ class DatarateTestVP9PostEncodeDrop DatarateTestVP9PostEncodeDrop() : DatarateTestVP9(GET_PARAM(0)) {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); set_cpu_used_ = GET_PARAM(1); @@ -819,17 +819,17 @@ class DatarateTestVP9FrameQp public ::testing::TestWithParam { public: DatarateTestVP9FrameQp() : DatarateTestVP9(GetParam()), frame_(0) {} - virtual ~DatarateTestVP9FrameQp() {} + ~DatarateTestVP9FrameQp() override {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); 
SetMode(::libvpx_test::kRealTime); ResetModel(); } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { set_cpu_used_ = 7; DatarateTestVP9::PreEncodeFrameHook(video, encoder); frame_qp_ = static_cast(rnd_.RandRange(64)); @@ -837,7 +837,7 @@ class DatarateTestVP9FrameQp frame_++; } - virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { int qp = 0; vpx_svc_layer_id_t layer_id; if (frame_ >= total_frame_) return; @@ -847,8 +847,8 @@ class DatarateTestVP9FrameQp temporal_layer_id_ = layer_id.temporal_layer_id; } - virtual void MismatchHook(const vpx_image_t * /*img1*/, - const vpx_image_t * /*img2*/) { + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override { if (frame_ >= total_frame_) return; ASSERT_TRUE(cfg_.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212 && @@ -945,7 +945,7 @@ TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayersFixedMode) { // Params: speed setting. class DatarateTestVP9RealTimeDenoiser : public DatarateTestVP9RealTime { public: - virtual ~DatarateTestVP9RealTimeDenoiser() {} + ~DatarateTestVP9RealTimeDenoiser() override {} }; // Check basic datarate targeting, for a single bitrate, when denoiser is on. diff --git a/test/vp9_denoiser_test.cc b/test/vp9_denoiser_test.cc index d884b7eb92..1f679d5bd8 100644 --- a/test/vp9_denoiser_test.cc +++ b/test/vp9_denoiser_test.cc @@ -42,11 +42,11 @@ class VP9DenoiserTest : public ::testing::Test, public ::testing::WithParamInterface { public: - virtual ~VP9DenoiserTest() {} + ~VP9DenoiserTest() override {} - virtual void SetUp() { bs_ = GET_PARAM(1); } + void SetUp() override { bs_ = GET_PARAM(1); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: BLOCK_SIZE bs_; diff --git a/test/vp9_encoder_parms_get_to_decoder.cc b/test/vp9_encoder_parms_get_to_decoder.cc index ce2198c592..1f9929f880 100644 --- a/test/vp9_encoder_parms_get_to_decoder.cc +++ b/test/vp9_encoder_parms_get_to_decoder.cc @@ -62,9 +62,9 @@ class VpxEncoderParmsGetToDecoder VpxEncoderParmsGetToDecoder() : EncoderTest(GET_PARAM(0)), encode_parms(GET_PARAM(1)) {} - virtual ~VpxEncoderParmsGetToDecoder() {} + ~VpxEncoderParmsGetToDecoder() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kTwoPassGood); cfg_.g_lag_in_frames = 25; @@ -74,8 +74,8 @@ class VpxEncoderParmsGetToDecoder cfg_.rc_target_bitrate = test_video_.bitrate; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP9E_SET_COLOR_SPACE, encode_parms.cs); encoder->Control(VP9E_SET_COLOR_RANGE, encode_parms.color_range); @@ -95,9 +95,9 @@ class VpxEncoderParmsGetToDecoder } } - virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec, - const libvpx_test::VideoSource & /*video*/, - libvpx_test::Decoder *decoder) { + bool HandleDecodeResult(const vpx_codec_err_t res_dec, + const libvpx_test::VideoSource & /*video*/, + libvpx_test::Decoder *decoder) override { vpx_codec_ctx_t *const vp9_decoder = decoder->GetDecoder(); vpx_codec_alg_priv_t *const priv = 
reinterpret_cast(vp9_decoder->priv); diff --git a/test/vp9_end_to_end_test.cc b/test/vp9_end_to_end_test.cc index 7a85db26a4..d4c0b0dd11 100644 --- a/test/vp9_end_to_end_test.cc +++ b/test/vp9_end_to_end_test.cc @@ -89,9 +89,9 @@ class EndToEndTestAdaptiveRDThresh : EncoderTest(GET_PARAM(0)), cpu_used_start_(GET_PARAM(1)), cpu_used_end_(GET_PARAM(2)) {} - virtual ~EndToEndTestAdaptiveRDThresh() {} + ~EndToEndTestAdaptiveRDThresh() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); cfg_.g_lag_in_frames = 0; @@ -102,8 +102,8 @@ class EndToEndTestAdaptiveRDThresh dec_cfg_.threads = 4; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, cpu_used_start_); encoder->Control(VP9E_SET_ROW_MT, 1); @@ -131,9 +131,9 @@ class EndToEndTestLarge denoiser_on_ = 0; } - virtual ~EndToEndTestLarge() {} + ~EndToEndTestLarge() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); if (encoding_mode_ != ::libvpx_test::kRealTime) { @@ -149,18 +149,18 @@ class EndToEndTestLarge dec_cfg_.threads = 4; } - virtual void BeginPassHook(unsigned int) { + void BeginPassHook(unsigned int) override { psnr_ = 0.0; nframes_ = 0; } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { psnr_ += pkt->data.psnr.psnr[0]; nframes_++; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1); encoder->Control(VP9E_SET_TILE_COLUMNS, 4); @@ -207,9 +207,9 @@ class EndToEndTestLoopFilterThreading EndToEndTestLoopFilterThreading() : EncoderTest(GET_PARAM(0)), use_loop_filter_opt_(GET_PARAM(1)) {} - virtual ~EndToEndTestLoopFilterThreading() {} + ~EndToEndTestLoopFilterThreading() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); cfg_.g_threads = 2; @@ -221,16 +221,16 @@ class EndToEndTestLoopFilterThreading dec_cfg_.threads = GET_PARAM(2); } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, 8); } encoder->Control(VP9E_SET_TILE_COLUMNS, 4 - video->frame() % 5); } - virtual void PreDecodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Decoder *decoder) { + void PreDecodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Decoder *decoder) override { if (video->frame() == 0) { decoder->Control(VP9D_SET_LOOP_FILTER_OPT, use_loop_filter_opt_ ? 
1 : 0); } diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc index 54fa6c48e2..27a73b2aa5 100644 --- a/test/vp9_ethread_test.cc +++ b/test/vp9_ethread_test.cc @@ -44,9 +44,9 @@ class VPxFirstPassEncoderThreadTest firstpass_stats_.buf = nullptr; firstpass_stats_.sz = 0; } - virtual ~VPxFirstPassEncoderThreadTest() { free(firstpass_stats_.buf); } + ~VPxFirstPassEncoderThreadTest() override { free(firstpass_stats_.buf); } - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); @@ -57,19 +57,19 @@ class VPxFirstPassEncoderThreadTest cfg_.rc_min_quantizer = 0; } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { encoder_initialized_ = false; abort_ = false; } - virtual void EndPassHook() { + void EndPassHook() override { // For first pass stats test, only run first pass encoder. if (first_pass_only_ && cfg_.g_pass == VPX_RC_FIRST_PASS) abort_ |= first_pass_only_; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/, + ::libvpx_test::Encoder *encoder) override { if (!encoder_initialized_) { // Encode in 2-pass mode. encoder->Control(VP9E_SET_TILE_COLUMNS, tiles_); @@ -87,7 +87,7 @@ class VPxFirstPassEncoderThreadTest } } - virtual void StatsPktHook(const vpx_codec_cx_pkt_t *pkt) { + void StatsPktHook(const vpx_codec_cx_pkt_t *pkt) override { const uint8_t *const pkt_buf = reinterpret_cast(pkt->data.twopass_stats.buf); const size_t pkt_size = pkt->data.twopass_stats.sz; @@ -233,9 +233,9 @@ class VPxEncoderThreadTest psnr_ = 0.0; nframes_ = 0; } - virtual ~VPxEncoderThreadTest() {} + ~VPxEncoderThreadTest() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); @@ -252,14 +252,14 @@ class VPxEncoderThreadTest cfg_.rc_min_quantizer = 0; } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { encoder_initialized_ = false; psnr_ = 0.0; nframes_ = 0; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/, + ::libvpx_test::Encoder *encoder) override { if (!encoder_initialized_) { // Encode 4 column tiles. 
encoder->Control(VP9E_SET_TILE_COLUMNS, tiles_); @@ -280,21 +280,21 @@ class VPxEncoderThreadTest } } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { psnr_ += pkt->data.psnr.psnr[0]; nframes_++; } - virtual void DecompressedFrameHook(const vpx_image_t &img, - vpx_codec_pts_t /*pts*/) { + void DecompressedFrameHook(const vpx_image_t &img, + vpx_codec_pts_t /*pts*/) override { ::libvpx_test::MD5 md5_res; md5_res.Add(&img); md5_.push_back(md5_res.Get()); } - virtual bool HandleDecodeResult(const vpx_codec_err_t res, - const libvpx_test::VideoSource & /*video*/, - libvpx_test::Decoder * /*decoder*/) { + bool HandleDecodeResult(const vpx_codec_err_t res, + const libvpx_test::VideoSource & /*video*/, + libvpx_test::Decoder * /*decoder*/) override { if (res != VPX_CODEC_OK) { EXPECT_EQ(VPX_CODEC_OK, res); return false; diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index 6de7cf8d0f..daaf768699 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -96,7 +96,7 @@ class IntraPredTest : public ::testing::TestWithParam { } protected: - virtual void SetUp() { + void SetUp() override { params_ = this->GetParam(); stride_ = params_.block_size * 3; mask_ = (1 << params_.bit_depth) - 1; diff --git a/test/vp9_lossless_test.cc b/test/vp9_lossless_test.cc index 931ac30a36..01dae08f54 100644 --- a/test/vp9_lossless_test.cc +++ b/test/vp9_lossless_test.cc @@ -29,15 +29,15 @@ class LosslessTest : EncoderTest(GET_PARAM(0)), psnr_(kMaxPsnr), nframes_(0), encoding_mode_(GET_PARAM(1)) {} - virtual ~LosslessTest() {} + ~LosslessTest() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { // Only call Control if quantizer > 0 to verify that using quantizer // alone will activate lossless @@ -47,12 +47,12 @@ class LosslessTest } } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { psnr_ = kMaxPsnr; nframes_ = 0; } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { if (pkt->data.psnr.psnr[0] < psnr_) psnr_ = pkt->data.psnr.psnr[0]; } diff --git a/test/vp9_motion_vector_test.cc b/test/vp9_motion_vector_test.cc index 6b1082a106..033c5fcd83 100644 --- a/test/vp9_motion_vector_test.cc +++ b/test/vp9_motion_vector_test.cc @@ -42,9 +42,9 @@ class MotionVectorTestLarge : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), cpu_used_(GET_PARAM(2)), mv_test_mode_(GET_PARAM(3)) {} - virtual ~MotionVectorTestLarge() {} + ~MotionVectorTestLarge() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); if (encoding_mode_ != ::libvpx_test::kRealTime) { @@ -59,8 +59,8 @@ class MotionVectorTestLarge } } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, cpu_used_); encoder->Control(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, mv_test_mode_); diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 5e3a7c2701..5ba90a21bc 
100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -174,7 +174,7 @@ class VP9QuantizeBase : public AbstractBench { q_ptr_ = (is_fp_) ? quant_fp_ptr_ : quant_ptr_; } - ~VP9QuantizeBase() { + ~VP9QuantizeBase() override { vpx_free(mb_plane_); vpx_free(zbin_ptr_); vpx_free(round_fp_ptr_); @@ -225,7 +225,7 @@ class VP9QuantizeTest : public VP9QuantizeBase, quantize_op_(GET_PARAM(0)), ref_quantize_op_(GET_PARAM(1)) {} protected: - virtual void Run(); + void Run() override; void Speed(bool is_median); const QuantizeFunc quantize_op_; const QuantizeFunc ref_quantize_op_; diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc index 8422df074b..0680ac7df3 100644 --- a/test/vp9_ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -40,16 +40,16 @@ class RcInterfaceTest : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000), encoder_exit_(false) {} - virtual ~RcInterfaceTest() {} + ~RcInterfaceTest() override {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, 7); encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); @@ -69,7 +69,7 @@ class RcInterfaceTest encoder_exit_ = video->frame() == kNumFrames; } - virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { if (encoder_exit_) { return; } @@ -81,7 +81,7 @@ class RcInterfaceTest ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level); } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { rc_api_->PostEncodeUpdate(pkt->data.frame.sz, frame_params_); } @@ -170,16 +170,16 @@ class RcInterfaceSvcTest : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000), dynamic_spatial_layers_(0), inter_layer_pred_off_(GET_PARAM(2)), parallel_spatial_layers_(false) {} - virtual ~RcInterfaceSvcTest() {} + ~RcInterfaceSvcTest() override {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, 7); encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); @@ -256,7 +256,7 @@ class RcInterfaceSvcTest : libvpx::RcFrameType::kInterFrame; } - virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { ::libvpx_test::CxDataIterator iter = encoder->GetCxData(); for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) sizes_[sl] = 0; std::vector rc_qp; @@ -301,8 +301,8 @@ class RcInterfaceSvcTest // This method needs to be overridden because non-reference frames are // expected to be mismatched frames as the encoder will avoid loopfilter on // these frames. 
- virtual void MismatchHook(const vpx_image_t * /*img1*/, - const vpx_image_t * /*img2*/) {} + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override {} void RunSvc() { SetRCConfigSvc(3, 3); diff --git a/test/vp9_roi_test.cc b/test/vp9_roi_test.cc index e8373c4c0b..a9347fb365 100644 --- a/test/vp9_roi_test.cc +++ b/test/vp9_roi_test.cc @@ -84,9 +84,9 @@ class RoiMaskBackgroundSkip : public ::libvpx_test::EncoderTest, public ::testing::Test { protected: RoiMaskBackgroundSkip() : EncoderTest(&::libvpx_test::kVP9) {} - virtual ~RoiMaskBackgroundSkip() { free(roi_.roi_map); } + ~RoiMaskBackgroundSkip() override { free(roi_.roi_map); } - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); SetRoi(); @@ -114,8 +114,8 @@ class RoiMaskBackgroundSkip : public ::libvpx_test::EncoderTest, } } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, 7); encoder->Control(VP9E_SET_AQ_MODE, 3); diff --git a/test/vp9_scale_test.cc b/test/vp9_scale_test.cc index 2d1203fb89..bd45c557ee 100644 --- a/test/vp9_scale_test.cc +++ b/test/vp9_scale_test.cc @@ -33,10 +33,10 @@ typedef void (*ScaleFrameFunc)(const YV12_BUFFER_CONFIG *src, class ScaleTest : public VpxScaleBase, public ::testing::TestWithParam { public: - virtual ~ScaleTest() {} + ~ScaleTest() override {} protected: - virtual void SetUp() { scale_fn_ = GetParam(); } + void SetUp() override { scale_fn_ = GetParam(); } void ReferenceScaleFrame(INTERP_FILTER filter_type, int phase_scaler) { vp9_scale_and_extend_frame_c(&img_, &ref_img_, filter_type, phase_scaler); diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc index a57082f1eb..78deb51909 100644 --- a/test/vp9_subtract_test.cc +++ b/test/vp9_subtract_test.cc @@ -34,10 +34,10 @@ namespace vp9 { class VP9SubtractBlockTest : public AbstractBench, public ::testing::TestWithParam { public: - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - virtual void Run() { + void Run() override { GetParam()(block_height_, block_width_, diff_, block_width_, src_, block_width_, pred_, block_width_); } @@ -176,7 +176,7 @@ using Params = std::tuple; class VPXHBDSubtractBlockTest : public ::testing::TestWithParam { public: - virtual void SetUp() { + void SetUp() override { block_width_ = 4 * num_4x4_blocks_wide_lookup[GET_PARAM(0)]; block_height_ = 4 * num_4x4_blocks_high_lookup[GET_PARAM(0)]; bit_depth_ = static_cast(GET_PARAM(1)); @@ -198,7 +198,7 @@ class VPXHBDSubtractBlockTest : public ::testing::TestWithParam { ASSERT_NE(diff_, nullptr); } - virtual void TearDown() { + void TearDown() override { vpx_free(CONVERT_TO_SHORTPTR(src_)); vpx_free(CONVERT_TO_SHORTPTR(pred_)); vpx_free(diff_); diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc index 1ceef8185c..3409e72dfd 100644 --- a/test/vp9_thread_test.cc +++ b/test/vp9_thread_test.cc @@ -26,10 +26,10 @@ using std::string; class VPxWorkerThreadTest : public ::testing::TestWithParam { protected: - virtual ~VPxWorkerThreadTest() {} - virtual void SetUp() { vpx_get_worker_interface()->init(&worker_); } + ~VPxWorkerThreadTest() override {} + void SetUp() override { vpx_get_worker_interface()->init(&worker_); } - virtual void TearDown() { 
vpx_get_worker_interface()->end(&worker_); } + void TearDown() override { vpx_get_worker_interface()->end(&worker_); } void Run(VPxWorker *worker) { const bool synchronous = GetParam(); diff --git a/test/vpx_scale_test.cc b/test/vpx_scale_test.cc index 7eea437fc8..3a238b7a8d 100644 --- a/test/vpx_scale_test.cc +++ b/test/vpx_scale_test.cc @@ -38,10 +38,10 @@ class ExtendBorderTest : public VpxScaleBase, public ::testing::TestWithParam { public: - virtual ~ExtendBorderTest() {} + ~ExtendBorderTest() override {} protected: - virtual void SetUp() { extend_fn_ = GetParam(); } + void SetUp() override { extend_fn_ = GetParam(); } void ExtendBorder() { ASM_REGISTER_STATE_CHECK(extend_fn_(&img_)); } @@ -68,10 +68,10 @@ INSTANTIATE_TEST_SUITE_P(C, ExtendBorderTest, class CopyFrameTest : public VpxScaleBase, public ::testing::TestWithParam { public: - virtual ~CopyFrameTest() {} + ~CopyFrameTest() override {} protected: - virtual void SetUp() { copy_frame_fn_ = GetParam(); } + void SetUp() override { copy_frame_fn_ = GetParam(); } void CopyFrame() { ASM_REGISTER_STATE_CHECK(copy_frame_fn_(&img_, &dst_img_)); diff --git a/test/y4m_test.cc b/test/y4m_test.cc index 32f2cd51d3..78a944fd08 100644 --- a/test/y4m_test.cc +++ b/test/y4m_test.cc @@ -78,7 +78,7 @@ class Y4mVideoSourceTest : public ::testing::TestWithParam, protected: Y4mVideoSourceTest() : Y4mVideoSource("", 0, 0) {} - virtual ~Y4mVideoSourceTest() { CloseSource(); } + ~Y4mVideoSourceTest() override { CloseSource(); } virtual void Init(const std::string &file_name, int limit) { file_name_ = file_name; @@ -140,7 +140,7 @@ class Y4mVideoWriteTest : public Y4mVideoSourceTest { protected: Y4mVideoWriteTest() : tmpfile_(nullptr) {} - virtual ~Y4mVideoWriteTest() { + ~Y4mVideoWriteTest() override { delete tmpfile_; input_file_ = nullptr; } @@ -172,7 +172,7 @@ class Y4mVideoWriteTest : public Y4mVideoSourceTest { ReplaceInputFile(tmpfile_->file()); } - virtual void Init(const std::string &file_name, int limit) { + void Init(const std::string &file_name, int limit) override { Y4mVideoSourceTest::Init(file_name, limit); WriteY4mAndReadBack(); } diff --git a/test/yuv_temporal_filter_test.cc b/test/yuv_temporal_filter_test.cc index 91b4e804b3..0677d55688 100644 --- a/test/yuv_temporal_filter_test.cc +++ b/test/yuv_temporal_filter_test.cc @@ -290,7 +290,7 @@ void ApplyReferenceFilter( class YUVTemporalFilterTest : public ::testing::TestWithParam { public: - virtual void SetUp() { + void SetUp() override { filter_func_ = GetParam().temporal_filter; bd_ = GetParam().bd; use_highbd_ = (bd_ != 8); From 1c2297b2bc693a8d3d6a11abaff801c045f11b54 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 25 Jul 2023 12:04:57 -0700 Subject: [PATCH 769/926] test/*.cc: use '= default' created with clang-tidy --fix --checks=-*,modernize-use-equals-default Change-Id: Ie373fb5501491fce53479d20f3a6d908c4b7c535 --- test/active_map_refresh_test.cc | 2 +- test/active_map_test.cc | 2 +- test/add_noise_test.cc | 2 +- test/alt_ref_aq_segment_test.cc | 2 +- test/altref_test.cc | 4 ++-- test/aq_segment_test.cc | 2 +- test/borders_test.cc | 2 +- test/config_test.cc | 2 +- test/cpu_speed_test.cc | 2 +- test/cq_test.cc | 2 +- test/dct16x16_test.cc | 8 ++++---- test/dct32x32_test.cc | 4 ++-- test/decode_corrupted.cc | 2 +- test/decode_perf_test.cc | 2 +- test/decode_svc_test.cc | 2 +- test/encode_api_test.cc | 2 +- test/encode_perf_test.cc | 2 +- test/error_resilience_test.cc | 4 ++-- test/fdct8x8_test.cc | 8 ++++---- test/frame_size_tests.cc | 2 +- test/keyframe_test.cc | 2 +- 
test/level_test.cc | 2 +- test/lpf_test.cc | 4 ++-- test/partial_idct_test.cc | 2 +- test/realtime_test.cc | 2 +- test/resize_test.cc | 12 ++++++------ test/sum_squares_test.cc | 2 +- test/superframe_test.cc | 2 +- test/svc_datarate_test.cc | 16 ++++++++-------- test/svc_end_to_end_test.cc | 6 +++--- test/timestamp_test.cc | 2 +- test/vp8_datarate_test.cc | 4 ++-- test/vp8_denoiser_sse2_test.cc | 2 +- test/vp8_fragments_test.cc | 2 +- test/vp8_ratectrl_rtc_test.cc | 4 ++-- test/vp9_arf_freq_test.cc | 2 +- test/vp9_block_error_test.cc | 2 +- test/vp9_datarate_test.cc | 10 +++++----- test/vp9_denoiser_test.cc | 2 +- test/vp9_encoder_parms_get_to_decoder.cc | 2 +- test/vp9_end_to_end_test.cc | 6 +++--- test/vp9_ethread_test.cc | 2 +- test/vp9_lossless_test.cc | 2 +- test/vp9_motion_vector_test.cc | 2 +- test/vp9_ratectrl_rtc_test.cc | 4 ++-- test/vp9_scale_test.cc | 2 +- test/vp9_thread_test.cc | 2 +- test/vpx_scale_test.cc | 4 ++-- 48 files changed, 82 insertions(+), 82 deletions(-) diff --git a/test/active_map_refresh_test.cc b/test/active_map_refresh_test.cc index 8b35ca81ba..ad067346a7 100644 --- a/test/active_map_refresh_test.cc +++ b/test/active_map_refresh_test.cc @@ -62,7 +62,7 @@ class ActiveMapRefreshTest public ::libvpx_test::CodecTestWith2Params { protected: ActiveMapRefreshTest() : EncoderTest(GET_PARAM(0)) {} - ~ActiveMapRefreshTest() override {} + ~ActiveMapRefreshTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/active_map_test.cc b/test/active_map_test.cc index 1f661b559c..d222c00b74 100644 --- a/test/active_map_test.cc +++ b/test/active_map_test.cc @@ -26,7 +26,7 @@ class ActiveMapTest static const int kHeight = 144; ActiveMapTest() : EncoderTest(GET_PARAM(0)) {} - ~ActiveMapTest() override {} + ~ActiveMapTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/add_noise_test.cc b/test/add_noise_test.cc index 6e787dd6ba..4fc4e81e63 100644 --- a/test/add_noise_test.cc +++ b/test/add_noise_test.cc @@ -33,7 +33,7 @@ class AddNoiseTest : public ::testing::Test, public ::testing::WithParamInterface { public: void TearDown() override { libvpx_test::ClearSystemState(); } - ~AddNoiseTest() override {} + ~AddNoiseTest() override = default; }; double stddev6(char a, char b, char c, char d, char e, char f) { diff --git a/test/alt_ref_aq_segment_test.cc b/test/alt_ref_aq_segment_test.cc index b64fc3cd0b..3b1a26ed16 100644 --- a/test/alt_ref_aq_segment_test.cc +++ b/test/alt_ref_aq_segment_test.cc @@ -20,7 +20,7 @@ class AltRefAqSegmentTest public ::libvpx_test::CodecTestWith2Params { protected: AltRefAqSegmentTest() : EncoderTest(GET_PARAM(0)) {} - ~AltRefAqSegmentTest() override {} + ~AltRefAqSegmentTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/altref_test.cc b/test/altref_test.cc index 69b2b87a2a..903230fde9 100644 --- a/test/altref_test.cc +++ b/test/altref_test.cc @@ -24,7 +24,7 @@ class AltRefTest : public ::libvpx_test::EncoderTest, public ::libvpx_test::CodecTestWithParam { protected: AltRefTest() : EncoderTest(GET_PARAM(0)), altref_count_(0) {} - ~AltRefTest() override {} + ~AltRefTest() override = default; void SetUp() override { InitializeConfig(); @@ -75,7 +75,7 @@ class AltRefForcedKeyTestLarge AltRefForcedKeyTestLarge() : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), cpu_used_(GET_PARAM(2)), forced_kf_frame_num_(1), frame_num_(0) {} - ~AltRefForcedKeyTestLarge() override {} + ~AltRefForcedKeyTestLarge() override = default; void SetUp() override { 
InitializeConfig(); diff --git a/test/aq_segment_test.cc b/test/aq_segment_test.cc index c98b8de094..955e1dafc0 100644 --- a/test/aq_segment_test.cc +++ b/test/aq_segment_test.cc @@ -20,7 +20,7 @@ class AqSegmentTest public ::libvpx_test::CodecTestWith2Params { protected: AqSegmentTest() : EncoderTest(GET_PARAM(0)) {} - ~AqSegmentTest() override {} + ~AqSegmentTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/borders_test.cc b/test/borders_test.cc index 009121bf22..2726bd557d 100644 --- a/test/borders_test.cc +++ b/test/borders_test.cc @@ -22,7 +22,7 @@ class BordersTest public ::libvpx_test::CodecTestWithParam { protected: BordersTest() : EncoderTest(GET_PARAM(0)) {} - ~BordersTest() override {} + ~BordersTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/config_test.cc b/test/config_test.cc index a476d580a5..729b01151b 100644 --- a/test/config_test.cc +++ b/test/config_test.cc @@ -22,7 +22,7 @@ class ConfigTest ConfigTest() : EncoderTest(GET_PARAM(0)), frame_count_in_(0), frame_count_out_(0), frame_count_max_(0) {} - ~ConfigTest() override {} + ~ConfigTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/cpu_speed_test.cc b/test/cpu_speed_test.cc index 78999ce658..22f4552963 100644 --- a/test/cpu_speed_test.cc +++ b/test/cpu_speed_test.cc @@ -26,7 +26,7 @@ class CpuSpeedTest : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)), min_psnr_(kMaxPSNR), tune_content_(VP9E_CONTENT_DEFAULT) {} - ~CpuSpeedTest() override {} + ~CpuSpeedTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/cq_test.cc b/test/cq_test.cc index a9a16aae13..b74915a336 100644 --- a/test/cq_test.cc +++ b/test/cq_test.cc @@ -50,7 +50,7 @@ class CQTest : public ::libvpx_test::EncoderTest, init_flags_ = VPX_CODEC_USE_PSNR; } - ~CQTest() override {} + ~CQTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc index de98d99731..8c4213ee16 100644 --- a/test/dct16x16_test.cc +++ b/test/dct16x16_test.cc @@ -310,7 +310,7 @@ void idct16x16_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { class Trans16x16TestBase { public: - virtual ~Trans16x16TestBase() {} + virtual ~Trans16x16TestBase() = default; protected: virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0; @@ -728,7 +728,7 @@ class Trans16x16TestBase { class Trans16x16DCT : public Trans16x16TestBase, public ::testing::TestWithParam { public: - ~Trans16x16DCT() override {} + ~Trans16x16DCT() override = default; void SetUp() override { fwd_txfm_ = GET_PARAM(0); @@ -782,7 +782,7 @@ TEST_P(Trans16x16DCT, DISABLED_Speed) { RunSpeedTest(); } class Trans16x16HT : public Trans16x16TestBase, public ::testing::TestWithParam { public: - ~Trans16x16HT() override {} + ~Trans16x16HT() override = default; void SetUp() override { fwd_txfm_ = GET_PARAM(0); @@ -832,7 +832,7 @@ TEST_P(Trans16x16HT, QuantCheck) { class InvTrans16x16DCT : public Trans16x16TestBase, public ::testing::TestWithParam { public: - ~InvTrans16x16DCT() override {} + ~InvTrans16x16DCT() override = default; void SetUp() override { ref_txfm_ = GET_PARAM(0); diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc index 62547ac537..6233b17a43 100644 --- a/test/dct32x32_test.cc +++ b/test/dct32x32_test.cc @@ -89,7 +89,7 @@ void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) { class Trans32x32Test : public AbstractBench, public 
::testing::TestWithParam { public: - ~Trans32x32Test() override {} + ~Trans32x32Test() override = default; void SetUp() override { fwd_txfm_ = GET_PARAM(0); inv_txfm_ = GET_PARAM(1); @@ -321,7 +321,7 @@ TEST_P(Trans32x32Test, InverseAccuracy) { class InvTrans32x32Test : public ::testing::TestWithParam { public: - ~InvTrans32x32Test() override {} + ~InvTrans32x32Test() override = default; void SetUp() override { ref_txfm_ = GET_PARAM(0); inv_txfm_ = GET_PARAM(1); diff --git a/test/decode_corrupted.cc b/test/decode_corrupted.cc index a9a2cc6e70..58773d7b86 100644 --- a/test/decode_corrupted.cc +++ b/test/decode_corrupted.cc @@ -28,7 +28,7 @@ class DecodeCorruptedFrameTest DecodeCorruptedFrameTest() : EncoderTest(GET_PARAM(0)) {} protected: - ~DecodeCorruptedFrameTest() override {} + ~DecodeCorruptedFrameTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc index 7533778e82..ed23cc2e7e 100644 --- a/test/decode_perf_test.cc +++ b/test/decode_perf_test.cc @@ -118,7 +118,7 @@ class VP9NewEncodeDecodePerfTest : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), speed_(0), outfile_(0), out_frames_(0) {} - ~VP9NewEncodeDecodePerfTest() override {} + ~VP9NewEncodeDecodePerfTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/decode_svc_test.cc b/test/decode_svc_test.cc index 29e9bd06f5..7098e7b270 100644 --- a/test/decode_svc_test.cc +++ b/test/decode_svc_test.cc @@ -25,7 +25,7 @@ class DecodeSvcTest : public ::libvpx_test::DecoderTest, public ::libvpx_test::CodecTestWithParam { protected: DecodeSvcTest() : DecoderTest(GET_PARAM(::libvpx_test::kCodecFactoryParam)) {} - ~DecodeSvcTest() override {} + ~DecodeSvcTest() override = default; void PreDecodeFrameHook(const libvpx_test::CompressedVideoSource &video, libvpx_test::Decoder *decoder) override { diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index e8a044ae17..c8bd7daa4a 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -370,7 +370,7 @@ class EncodeApiGetTplStatsTest public ::testing::TestWithParam { public: EncodeApiGetTplStatsTest() : EncoderTest(GetParam()), test_io_(false) {} - ~EncodeApiGetTplStatsTest() override {} + ~EncodeApiGetTplStatsTest() override = default; protected: void SetUp() override { diff --git a/test/encode_perf_test.cc b/test/encode_perf_test.cc index 5f9c58dc94..171ff8eeca 100644 --- a/test/encode_perf_test.cc +++ b/test/encode_perf_test.cc @@ -61,7 +61,7 @@ class VP9EncodePerfTest : EncoderTest(GET_PARAM(0)), min_psnr_(kMaxPsnr), nframes_(0), encoding_mode_(GET_PARAM(1)), speed_(0), threads_(1) {} - ~VP9EncodePerfTest() override {} + ~VP9EncodePerfTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc index 622c3c4461..8db4685257 100644 --- a/test/error_resilience_test.cc +++ b/test/error_resilience_test.cc @@ -30,7 +30,7 @@ class ErrorResilienceTestLarge Reset(); } - ~ErrorResilienceTestLarge() override {} + ~ErrorResilienceTestLarge() override = default; void Reset() { error_nframes_ = 0; @@ -381,7 +381,7 @@ class ErrorResilienceTestLargeCodecControls Reset(); } - ~ErrorResilienceTestLargeCodecControls() override {} + ~ErrorResilienceTestLargeCodecControls() override = default; void Reset() { last_pts_ = 0; diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index ba9db00120..3cdf909d46 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -143,7 +143,7 @@ 
void idct8x8_64_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { #endif class FwdTrans8x8TestBase { public: - virtual ~FwdTrans8x8TestBase() {} + virtual ~FwdTrans8x8TestBase() = default; protected: virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0; @@ -539,7 +539,7 @@ class FwdTrans8x8TestBase { class FwdTrans8x8DCT : public FwdTrans8x8TestBase, public ::testing::TestWithParam { public: - ~FwdTrans8x8DCT() override {} + ~FwdTrans8x8DCT() override = default; void SetUp() override { fwd_txfm_ = GET_PARAM(0); @@ -578,7 +578,7 @@ TEST_P(FwdTrans8x8DCT, InvAccuracyCheck) { RunInvAccuracyCheck(); } class FwdTrans8x8HT : public FwdTrans8x8TestBase, public ::testing::TestWithParam { public: - ~FwdTrans8x8HT() override {} + ~FwdTrans8x8HT() override = default; void SetUp() override { fwd_txfm_ = GET_PARAM(0); @@ -614,7 +614,7 @@ TEST_P(FwdTrans8x8HT, ExtremalCheck) { RunExtremalCheck(); } class InvTrans8x8DCT : public FwdTrans8x8TestBase, public ::testing::TestWithParam { public: - ~InvTrans8x8DCT() override {} + ~InvTrans8x8DCT() override = default; void SetUp() override { ref_txfm_ = GET_PARAM(0); diff --git a/test/frame_size_tests.cc b/test/frame_size_tests.cc index 266858eebb..7b6c29a88f 100644 --- a/test/frame_size_tests.cc +++ b/test/frame_size_tests.cc @@ -79,7 +79,7 @@ class VP9FrameSizeTestsLarge : public ::libvpx_test::EncoderTest, protected: VP9FrameSizeTestsLarge() : EncoderTest(&::libvpx_test::kVP9), expected_res_(VPX_CODEC_OK) {} - ~VP9FrameSizeTestsLarge() override {} + ~VP9FrameSizeTestsLarge() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/keyframe_test.cc b/test/keyframe_test.cc index d624cb19d6..dabf88e415 100644 --- a/test/keyframe_test.cc +++ b/test/keyframe_test.cc @@ -22,7 +22,7 @@ class KeyframeTest public ::libvpx_test::CodecTestWithParam { protected: KeyframeTest() : EncoderTest(GET_PARAM(0)) {} - ~KeyframeTest() override {} + ~KeyframeTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/level_test.cc b/test/level_test.cc index 8e653d93e1..3f1cf9f1c5 100644 --- a/test/level_test.cc +++ b/test/level_test.cc @@ -22,7 +22,7 @@ class LevelTest : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), cpu_used_(GET_PARAM(2)), min_gf_internal_(24), target_level_(0), level_(0) {} - ~LevelTest() override {} + ~LevelTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/lpf_test.cc b/test/lpf_test.cc index 2f04194dce..ce0ddeae18 100644 --- a/test/lpf_test.cc +++ b/test/lpf_test.cc @@ -129,7 +129,7 @@ uint8_t GetHevThresh(ACMRandom *rnd) { class Loop8Test6Param : public ::testing::TestWithParam { public: - ~Loop8Test6Param() override {} + ~Loop8Test6Param() override = default; void SetUp() override { loopfilter_op_ = GET_PARAM(0); ref_loopfilter_op_ = GET_PARAM(1); @@ -151,7 +151,7 @@ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test6Param); (HAVE_DSPR2 || HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH) class Loop8Test9Param : public ::testing::TestWithParam { public: - ~Loop8Test9Param() override {} + ~Loop8Test9Param() override = default; void SetUp() override { loopfilter_op_ = GET_PARAM(0); ref_loopfilter_op_ = GET_PARAM(1); diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc index 6593ac68e9..01e63eb691 100644 --- a/test/partial_idct_test.cc +++ b/test/partial_idct_test.cc @@ -59,7 +59,7 @@ const int kCountTestBlock = 1000; class PartialIDctTest : public ::testing::TestWithParam { public: - ~PartialIDctTest() override {} + 
~PartialIDctTest() override = default; void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); fwd_txfm_ = GET_PARAM(0); diff --git a/test/realtime_test.cc b/test/realtime_test.cc index c5de2dcb35..88e510fd0d 100644 --- a/test/realtime_test.cc +++ b/test/realtime_test.cc @@ -26,7 +26,7 @@ class RealtimeTest public ::libvpx_test::CodecTestWithParam { protected: RealtimeTest() : EncoderTest(GET_PARAM(0)), frame_packets_(0) {} - ~RealtimeTest() override {} + ~RealtimeTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/resize_test.cc b/test/resize_test.cc index 3eb842f549..7d01bbd3d5 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -247,7 +247,7 @@ class ResizingVideoSource : public ::libvpx_test::DummyVideoSource { } bool flag_codec_; bool smaller_width_larger_size_; - ~ResizingVideoSource() override {} + ~ResizingVideoSource() override = default; protected: void Next() override { @@ -267,7 +267,7 @@ class ResizeTest protected: ResizeTest() : EncoderTest(GET_PARAM(0)) {} - ~ResizeTest() override {} + ~ResizeTest() override = default; void SetUp() override { InitializeConfig(); @@ -336,7 +336,7 @@ class ResizeInternalTest : public ResizeTest { ResizeInternalTest() : ResizeTest(), frame0_psnr_(0.0) {} #endif - ~ResizeInternalTest() override {} + ~ResizeInternalTest() override = default; void BeginPassHook(unsigned int /*pass*/) override { #if WRITE_COMPRESSED_STREAM @@ -450,7 +450,7 @@ class ResizeRealtimeTest public ::libvpx_test::CodecTestWith2Params { protected: ResizeRealtimeTest() : EncoderTest(GET_PARAM(0)) {} - ~ResizeRealtimeTest() override {} + ~ResizeRealtimeTest() override = default; void PreEncodeFrameHook(libvpx_test::VideoSource *video, libvpx_test::Encoder *encoder) override { @@ -693,7 +693,7 @@ class ResizeCspTest : public ResizeTest { ResizeCspTest() : ResizeTest(), frame0_psnr_(0.0) {} #endif - ~ResizeCspTest() override {} + ~ResizeCspTest() override = default; void BeginPassHook(unsigned int /*pass*/) override { #if WRITE_COMPRESSED_STREAM @@ -758,7 +758,7 @@ class ResizingCspVideoSource : public ::libvpx_test::DummyVideoSource { limit_ = 30; } - ~ResizingCspVideoSource() override {} + ~ResizingCspVideoSource() override = default; protected: void Next() override { diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc index 1943acf8b3..5abb464dc0 100644 --- a/test/sum_squares_test.cc +++ b/test/sum_squares_test.cc @@ -33,7 +33,7 @@ typedef std::tuple SumSquaresParam; class SumSquaresTest : public ::testing::TestWithParam { public: - ~SumSquaresTest() override {} + ~SumSquaresTest() override = default; void SetUp() override { ref_func_ = GET_PARAM(0); tst_func_ = GET_PARAM(1); diff --git a/test/superframe_test.cc b/test/superframe_test.cc index 4a522dd496..4c3aa1625a 100644 --- a/test/superframe_test.cc +++ b/test/superframe_test.cc @@ -28,7 +28,7 @@ class SuperframeTest protected: SuperframeTest() : EncoderTest(GET_PARAM(0)), modified_buf_(nullptr), last_sf_pts_(0) {} - ~SuperframeTest() override {} + ~SuperframeTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc index 723538368f..aff4ace843 100644 --- a/test/svc_datarate_test.cc +++ b/test/svc_datarate_test.cc @@ -43,7 +43,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { } protected: - ~DatarateOnePassCbrSvc() override {} + ~DatarateOnePassCbrSvc() override = default; virtual void ResetModel() { last_pts_ = 0; @@ -670,7 +670,7 @@ class 
DatarateOnePassCbrSvcSingleBR DatarateOnePassCbrSvcSingleBR() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - ~DatarateOnePassCbrSvcSingleBR() override {} + ~DatarateOnePassCbrSvcSingleBR() override = default; protected: void SetUp() override { @@ -1160,7 +1160,7 @@ class DatarateOnePassCbrSvcMultiBR DatarateOnePassCbrSvcMultiBR() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - ~DatarateOnePassCbrSvcMultiBR() override {} + ~DatarateOnePassCbrSvcMultiBR() override = default; protected: void SetUp() override { @@ -1243,7 +1243,7 @@ class DatarateOnePassCbrSvcFrameDropMultiBR : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - ~DatarateOnePassCbrSvcFrameDropMultiBR() override {} + ~DatarateOnePassCbrSvcFrameDropMultiBR() override = default; protected: void SetUp() override { @@ -1355,7 +1355,7 @@ class DatarateOnePassCbrSvcInterLayerPredSingleBR : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - ~DatarateOnePassCbrSvcInterLayerPredSingleBR() override {} + ~DatarateOnePassCbrSvcInterLayerPredSingleBR() override = default; protected: void SetUp() override { @@ -1441,7 +1441,7 @@ class DatarateOnePassCbrSvcDenoiser DatarateOnePassCbrSvcDenoiser() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - ~DatarateOnePassCbrSvcDenoiser() override {} + ~DatarateOnePassCbrSvcDenoiser() override = default; protected: void SetUp() override { @@ -1499,7 +1499,7 @@ class DatarateOnePassCbrSvcSmallKF DatarateOnePassCbrSvcSmallKF() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - ~DatarateOnePassCbrSvcSmallKF() override {} + ~DatarateOnePassCbrSvcSmallKF() override = default; protected: void SetUp() override { @@ -1702,7 +1702,7 @@ class DatarateOnePassCbrSvcPostencodeDrop DatarateOnePassCbrSvcPostencodeDrop() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - ~DatarateOnePassCbrSvcPostencodeDrop() override {} + ~DatarateOnePassCbrSvcPostencodeDrop() override = default; protected: void SetUp() override { diff --git a/test/svc_end_to_end_test.cc b/test/svc_end_to_end_test.cc index c9ef35bbe6..b4337ae754 100644 --- a/test/svc_end_to_end_test.cc +++ b/test/svc_end_to_end_test.cc @@ -45,7 +45,7 @@ class ScalePartitionOnePassCbrSvc } protected: - ~ScalePartitionOnePassCbrSvc() override {} + ~ScalePartitionOnePassCbrSvc() override = default; void SetUp() override { InitializeConfig(); @@ -129,7 +129,7 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, } protected: - ~SyncFrameOnePassCbrSvc() override {} + ~SyncFrameOnePassCbrSvc() override = default; void SetUp() override { InitializeConfig(); @@ -657,7 +657,7 @@ class LoopfilterOnePassCbrSvc : public OnePassCbrSvc, } protected: - ~LoopfilterOnePassCbrSvc() override {} + ~LoopfilterOnePassCbrSvc() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/timestamp_test.cc b/test/timestamp_test.cc index d6f0b3bda2..00abf8f31c 100644 --- a/test/timestamp_test.cc +++ b/test/timestamp_test.cc @@ -67,7 +67,7 @@ class TimestampTest public ::libvpx_test::CodecTestWithParam { protected: TimestampTest() : EncoderTest(GET_PARAM(0)) {} - ~TimestampTest() override {} + ~TimestampTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/vp8_datarate_test.cc b/test/vp8_datarate_test.cc index c91c2a0d26..aee27af66e 100644 --- 
a/test/vp8_datarate_test.cc +++ b/test/vp8_datarate_test.cc @@ -24,7 +24,7 @@ class DatarateTestLarge public: DatarateTestLarge() : EncoderTest(GET_PARAM(0)) {} - ~DatarateTestLarge() override {} + ~DatarateTestLarge() override = default; protected: void SetUp() override { @@ -301,7 +301,7 @@ TEST_P(DatarateTestLarge, DropFramesMultiThreads) { class DatarateTestRealTime : public DatarateTestLarge { public: - ~DatarateTestRealTime() override {} + ~DatarateTestRealTime() override = default; }; #if CONFIG_TEMPORAL_DENOISING diff --git a/test/vp8_denoiser_sse2_test.cc b/test/vp8_denoiser_sse2_test.cc index a6d414e508..7fa867d8bb 100644 --- a/test/vp8_denoiser_sse2_test.cc +++ b/test/vp8_denoiser_sse2_test.cc @@ -30,7 +30,7 @@ namespace { const int kNumPixels = 16 * 16; class VP8DenoiserTest : public ::testing::TestWithParam { public: - ~VP8DenoiserTest() override {} + ~VP8DenoiserTest() override = default; void SetUp() override { increase_denoising_ = GetParam(); } diff --git a/test/vp8_fragments_test.cc b/test/vp8_fragments_test.cc index bb527966c5..01b4c2120e 100644 --- a/test/vp8_fragments_test.cc +++ b/test/vp8_fragments_test.cc @@ -17,7 +17,7 @@ class VP8FragmentsTest : public ::libvpx_test::EncoderTest, public ::testing::Test { protected: VP8FragmentsTest() : EncoderTest(&::libvpx_test::kVP8) {} - ~VP8FragmentsTest() override {} + ~VP8FragmentsTest() override = default; void SetUp() override { const unsigned long init_flags = // NOLINT(runtime/int) diff --git a/test/vp8_ratectrl_rtc_test.cc b/test/vp8_ratectrl_rtc_test.cc index 70d73c52d6..81f06d90ad 100644 --- a/test/vp8_ratectrl_rtc_test.cc +++ b/test/vp8_ratectrl_rtc_test.cc @@ -25,7 +25,7 @@ namespace { struct Vp8RCTestVideo { - Vp8RCTestVideo() {} + Vp8RCTestVideo() = default; Vp8RCTestVideo(const char *name_, int width_, int height_, unsigned int frames_) : name(name_), width(width_), height(height_), frames(frames_) {} @@ -53,7 +53,7 @@ class Vp8RcInterfaceTest public: Vp8RcInterfaceTest() : EncoderTest(GET_PARAM(0)), key_interval_(3000), encoder_exit_(false) {} - ~Vp8RcInterfaceTest() override {} + ~Vp8RcInterfaceTest() override = default; protected: void SetUp() override { diff --git a/test/vp9_arf_freq_test.cc b/test/vp9_arf_freq_test.cc index 7cc9a28396..3882326d2f 100644 --- a/test/vp9_arf_freq_test.cc +++ b/test/vp9_arf_freq_test.cc @@ -86,7 +86,7 @@ class ArfFreqTest : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)), test_encode_param_(GET_PARAM(2)), min_arf_requested_(GET_PARAM(3)) {} - ~ArfFreqTest() override {} + ~ArfFreqTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/vp9_block_error_test.cc b/test/vp9_block_error_test.cc index 4ff1838c34..0645341ac1 100644 --- a/test/vp9_block_error_test.cc +++ b/test/vp9_block_error_test.cc @@ -53,7 +53,7 @@ int64_t BlockError8BitWrapper(const tran_low_t *coeff, class BlockErrorTest : public ::testing::TestWithParam { public: - ~BlockErrorTest() override {} + ~BlockErrorTest() override = default; void SetUp() override { error_block_op_ = GET_PARAM(0); ref_error_block_op_ = GET_PARAM(1); diff --git a/test/vp9_datarate_test.cc b/test/vp9_datarate_test.cc index 48480d6fa1..4bc9099206 100644 --- a/test/vp9_datarate_test.cc +++ b/test/vp9_datarate_test.cc @@ -28,7 +28,7 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest { } protected: - ~DatarateTestVP9() override {} + ~DatarateTestVP9() override = default; virtual void ResetModel() { last_pts_ = 0; @@ -579,7 +579,7 @@ class DatarateTestVP9RealTime : public DatarateTestVP9, 
public ::libvpx_test::CodecTestWithParam { public: DatarateTestVP9RealTime() : DatarateTestVP9(GET_PARAM(0)) {} - ~DatarateTestVP9RealTime() override {} + ~DatarateTestVP9RealTime() override = default; protected: void SetUp() override { @@ -731,7 +731,7 @@ class DatarateTestVP9RealTimeDeltaQUV public ::libvpx_test::CodecTestWith2Params { public: DatarateTestVP9RealTimeDeltaQUV() : DatarateTestVP9(GET_PARAM(0)) {} - ~DatarateTestVP9RealTimeDeltaQUV() override {} + ~DatarateTestVP9RealTimeDeltaQUV() override = default; protected: void SetUp() override { @@ -819,7 +819,7 @@ class DatarateTestVP9FrameQp public ::testing::TestWithParam { public: DatarateTestVP9FrameQp() : DatarateTestVP9(GetParam()), frame_(0) {} - ~DatarateTestVP9FrameQp() override {} + ~DatarateTestVP9FrameQp() override = default; protected: void SetUp() override { @@ -945,7 +945,7 @@ TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayersFixedMode) { // Params: speed setting. class DatarateTestVP9RealTimeDenoiser : public DatarateTestVP9RealTime { public: - ~DatarateTestVP9RealTimeDenoiser() override {} + ~DatarateTestVP9RealTimeDenoiser() override = default; }; // Check basic datarate targeting, for a single bitrate, when denoiser is on. diff --git a/test/vp9_denoiser_test.cc b/test/vp9_denoiser_test.cc index 1f679d5bd8..831f83305c 100644 --- a/test/vp9_denoiser_test.cc +++ b/test/vp9_denoiser_test.cc @@ -42,7 +42,7 @@ class VP9DenoiserTest : public ::testing::Test, public ::testing::WithParamInterface { public: - ~VP9DenoiserTest() override {} + ~VP9DenoiserTest() override = default; void SetUp() override { bs_ = GET_PARAM(1); } diff --git a/test/vp9_encoder_parms_get_to_decoder.cc b/test/vp9_encoder_parms_get_to_decoder.cc index 1f9929f880..0e182c76db 100644 --- a/test/vp9_encoder_parms_get_to_decoder.cc +++ b/test/vp9_encoder_parms_get_to_decoder.cc @@ -62,7 +62,7 @@ class VpxEncoderParmsGetToDecoder VpxEncoderParmsGetToDecoder() : EncoderTest(GET_PARAM(0)), encode_parms(GET_PARAM(1)) {} - ~VpxEncoderParmsGetToDecoder() override {} + ~VpxEncoderParmsGetToDecoder() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/vp9_end_to_end_test.cc b/test/vp9_end_to_end_test.cc index d4c0b0dd11..79be4ee146 100644 --- a/test/vp9_end_to_end_test.cc +++ b/test/vp9_end_to_end_test.cc @@ -89,7 +89,7 @@ class EndToEndTestAdaptiveRDThresh : EncoderTest(GET_PARAM(0)), cpu_used_start_(GET_PARAM(1)), cpu_used_end_(GET_PARAM(2)) {} - ~EndToEndTestAdaptiveRDThresh() override {} + ~EndToEndTestAdaptiveRDThresh() override = default; void SetUp() override { InitializeConfig(); @@ -131,7 +131,7 @@ class EndToEndTestLarge denoiser_on_ = 0; } - ~EndToEndTestLarge() override {} + ~EndToEndTestLarge() override = default; void SetUp() override { InitializeConfig(); @@ -207,7 +207,7 @@ class EndToEndTestLoopFilterThreading EndToEndTestLoopFilterThreading() : EncoderTest(GET_PARAM(0)), use_loop_filter_opt_(GET_PARAM(1)) {} - ~EndToEndTestLoopFilterThreading() override {} + ~EndToEndTestLoopFilterThreading() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc index 27a73b2aa5..c8d3cba7fb 100644 --- a/test/vp9_ethread_test.cc +++ b/test/vp9_ethread_test.cc @@ -233,7 +233,7 @@ class VPxEncoderThreadTest psnr_ = 0.0; nframes_ = 0; } - ~VPxEncoderThreadTest() override {} + ~VPxEncoderThreadTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/vp9_lossless_test.cc b/test/vp9_lossless_test.cc index 
01dae08f54..fe3cd1aba4 100644 --- a/test/vp9_lossless_test.cc +++ b/test/vp9_lossless_test.cc @@ -29,7 +29,7 @@ class LosslessTest : EncoderTest(GET_PARAM(0)), psnr_(kMaxPsnr), nframes_(0), encoding_mode_(GET_PARAM(1)) {} - ~LosslessTest() override {} + ~LosslessTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/vp9_motion_vector_test.cc b/test/vp9_motion_vector_test.cc index 033c5fcd83..495ea11fce 100644 --- a/test/vp9_motion_vector_test.cc +++ b/test/vp9_motion_vector_test.cc @@ -42,7 +42,7 @@ class MotionVectorTestLarge : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), cpu_used_(GET_PARAM(2)), mv_test_mode_(GET_PARAM(3)) {} - ~MotionVectorTestLarge() override {} + ~MotionVectorTestLarge() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc index 0680ac7df3..313b68eda3 100644 --- a/test/vp9_ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -40,7 +40,7 @@ class RcInterfaceTest : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000), encoder_exit_(false) {} - ~RcInterfaceTest() override {} + ~RcInterfaceTest() override = default; protected: void SetUp() override { @@ -170,7 +170,7 @@ class RcInterfaceSvcTest : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000), dynamic_spatial_layers_(0), inter_layer_pred_off_(GET_PARAM(2)), parallel_spatial_layers_(false) {} - ~RcInterfaceSvcTest() override {} + ~RcInterfaceSvcTest() override = default; protected: void SetUp() override { diff --git a/test/vp9_scale_test.cc b/test/vp9_scale_test.cc index bd45c557ee..049a10a617 100644 --- a/test/vp9_scale_test.cc +++ b/test/vp9_scale_test.cc @@ -33,7 +33,7 @@ typedef void (*ScaleFrameFunc)(const YV12_BUFFER_CONFIG *src, class ScaleTest : public VpxScaleBase, public ::testing::TestWithParam { public: - ~ScaleTest() override {} + ~ScaleTest() override = default; protected: void SetUp() override { scale_fn_ = GetParam(); } diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc index 3409e72dfd..c0cea681d7 100644 --- a/test/vp9_thread_test.cc +++ b/test/vp9_thread_test.cc @@ -26,7 +26,7 @@ using std::string; class VPxWorkerThreadTest : public ::testing::TestWithParam { protected: - ~VPxWorkerThreadTest() override {} + ~VPxWorkerThreadTest() override = default; void SetUp() override { vpx_get_worker_interface()->init(&worker_); } void TearDown() override { vpx_get_worker_interface()->end(&worker_); } diff --git a/test/vpx_scale_test.cc b/test/vpx_scale_test.cc index 3a238b7a8d..3897a6088d 100644 --- a/test/vpx_scale_test.cc +++ b/test/vpx_scale_test.cc @@ -38,7 +38,7 @@ class ExtendBorderTest : public VpxScaleBase, public ::testing::TestWithParam { public: - ~ExtendBorderTest() override {} + ~ExtendBorderTest() override = default; protected: void SetUp() override { extend_fn_ = GetParam(); } @@ -68,7 +68,7 @@ INSTANTIATE_TEST_SUITE_P(C, ExtendBorderTest, class CopyFrameTest : public VpxScaleBase, public ::testing::TestWithParam { public: - ~CopyFrameTest() override {} + ~CopyFrameTest() override = default; protected: void SetUp() override { copy_frame_fn_ = GetParam(); } From 5740cb39296ca23e7218e27c2114c972a3f3fa65 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 25 Jul 2023 12:10:21 -0700 Subject: [PATCH 770/926] test/decode_perf_test.cc: use nullptr created with clang-tidy --fix --checks=-*,modernize-use-nullptr Change-Id: Ibf4a80fa00e9b59d471c92788ec4c7c72e4662e5 --- test/decode_perf_test.cc | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc index ed23cc2e7e..383fd2d896 100644 --- a/test/decode_perf_test.cc +++ b/test/decode_perf_test.cc @@ -116,7 +116,7 @@ class VP9NewEncodeDecodePerfTest protected: VP9NewEncodeDecodePerfTest() : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), speed_(0), - outfile_(0), out_frames_(0) {} + outfile_(nullptr), out_frames_(0) {} ~VP9NewEncodeDecodePerfTest() override = default; From d62edaf41f8ffb3f1bc3e7f7b449c63258666d9c Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 25 Jul 2023 12:18:03 -0700 Subject: [PATCH 771/926] test/**.cc: use bool literals created with clang-tidy --fix --checks=-*,modernize-use-bool-literals Change-Id: Ifaed8ca824676555acaf1053b2a5a52c51a70638 --- test/error_resilience_test.cc | 4 ++-- test/vp9_ratectrl_rtc_test.cc | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc index 8db4685257..6b019b2bfb 100644 --- a/test/error_resilience_test.cc +++ b/test/error_resilience_test.cc @@ -136,11 +136,11 @@ class ErrorResilienceTestLarge if (error_frames_[i] == nframes_ - 1) { std::cout << " Skipping decoding frame: " << error_frames_[i] << "\n"; - return 0; + return false; } } } - return 1; + return true; } void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) override { diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc index 313b68eda3..b76fd3624c 100644 --- a/test/vp9_ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -227,7 +227,7 @@ class RcInterfaceSvcTest rc_cfg_.layer_target_bitrate[4] = 0; rc_cfg_.layer_target_bitrate[5] = 0; ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); - } else if (/*DISABLES CODE*/ (0) && video->frame() == 280) { + } else if (/*DISABLES CODE*/ (false) && video->frame() == 280) { // TODO(marpan): Re-enable this going back up when issue is fixed. // Go back up to 3 spatial layers. // Update the encoder config: use the original bitrates. 
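Illustration, not part of the series: the clang-tidy patches above swap integer literals for bool literals in bool-returning test hooks. Below is a minimal standalone sketch of the pattern the check rewrites; the function name and values are hypothetical, not taken from the patches.

#include <iostream>

// Returning 0/1 from a bool function compiles (the literals are
// implicitly converted) but hides intent; running
//   clang-tidy --fix --checks=-*,modernize-use-bool-literals
// rewrites the literals in place, as in the diffs above.
bool ShouldDecodeFrame(int frame, int error_frame) {
  if (frame == error_frame) return 0;  // after --fix: return false;
  return 1;                            // after --fix: return true;
}

int main() {
  std::cout << std::boolalpha << ShouldDecodeFrame(3, 5) << "\n";  // true
  return 0;
}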
From d899b979450c47c9ce4defad5508ed03af85d3cd Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 26 Jul 2023 15:38:36 -0700 Subject: [PATCH 772/926] encode_test_driver.h: use bool literal Change-Id: If47be9ca0daa18d92cb849484f9e139e65e3560e --- test/encode_test_driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index 165fcfabf6..c7974894c7 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -264,7 +264,7 @@ class EncoderTest { const CodecFactory *codec_; // Hook to determine whether to decode frame after encoding - virtual bool DoDecode() const { return 1; } + virtual bool DoDecode() const { return true; } // Hook to handle encode/decode mismatch virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2); From 626e37e77717673004b983b8d5fe0836636247b0 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 26 Jul 2023 15:29:40 -0700 Subject: [PATCH 773/926] test/*.h: prefer 'override' to 'virtual' created with clang-tidy --fix --checks=-*,modernize-use-override Change-Id: I53412f35590799574edb573ae417a4a004cccd1e --- test/codec_factory.h | 42 +++++++++++++++++++--------------------- test/ivf_video_source.h | 14 +++++++------- test/svc_test.h | 6 +++--- test/video_source.h | 22 ++++++++++----------- test/webm_video_source.h | 14 +++++++------- test/y4m_video_source.h | 18 ++++++++--------- test/yuv_video_source.h | 18 ++++++++--------- 7 files changed, 66 insertions(+), 68 deletions(-) diff --git a/test/codec_factory.h b/test/codec_factory.h index 96092610c6..d00563df1c 100644 --- a/test/codec_factory.h +++ b/test/codec_factory.h @@ -84,7 +84,7 @@ class VP8Decoder : public Decoder { : Decoder(cfg, flag) {} protected: - virtual vpx_codec_iface_t *CodecInterface() const { + vpx_codec_iface_t *CodecInterface() const override { #if CONFIG_VP8_DECODER return &vpx_codec_vp8_dx_algo; #else @@ -100,7 +100,7 @@ class VP8Encoder : public Encoder { : Encoder(cfg, deadline, init_flags, stats) {} protected: - virtual vpx_codec_iface_t *CodecInterface() const { + vpx_codec_iface_t *CodecInterface() const override { #if CONFIG_VP8_ENCODER return &vpx_codec_vp8_cx_algo; #else @@ -113,12 +113,12 @@ class VP8CodecFactory : public CodecFactory { public: VP8CodecFactory() : CodecFactory() {} - virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const { + Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const override { return CreateDecoder(cfg, 0); } - virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg, - const vpx_codec_flags_t flags) const { + Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg, + const vpx_codec_flags_t flags) const override { #if CONFIG_VP8_DECODER return new VP8Decoder(cfg, flags); #else @@ -128,10 +128,9 @@ class VP8CodecFactory : public CodecFactory { #endif } - virtual Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, - unsigned long deadline, - const unsigned long init_flags, - TwopassStatsStore *stats) const { + Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline, + const unsigned long init_flags, + TwopassStatsStore *stats) const override { #if CONFIG_VP8_ENCODER return new VP8Encoder(cfg, deadline, init_flags, stats); #else @@ -143,8 +142,8 @@ class VP8CodecFactory : public CodecFactory { #endif } - virtual vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg, - int usage) const { + vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg, + int usage) const override { #if CONFIG_VP8_ENCODER return 
vpx_codec_enc_config_default(&vpx_codec_vp8_cx_algo, cfg, usage); #else @@ -180,7 +179,7 @@ class VP9Decoder : public Decoder { : Decoder(cfg, flag) {} protected: - virtual vpx_codec_iface_t *CodecInterface() const { + vpx_codec_iface_t *CodecInterface() const override { #if CONFIG_VP9_DECODER return &vpx_codec_vp9_dx_algo; #else @@ -196,7 +195,7 @@ class VP9Encoder : public Encoder { : Encoder(cfg, deadline, init_flags, stats) {} protected: - virtual vpx_codec_iface_t *CodecInterface() const { + vpx_codec_iface_t *CodecInterface() const override { #if CONFIG_VP9_ENCODER return &vpx_codec_vp9_cx_algo; #else @@ -209,12 +208,12 @@ class VP9CodecFactory : public CodecFactory { public: VP9CodecFactory() : CodecFactory() {} - virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const { + Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const override { return CreateDecoder(cfg, 0); } - virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg, - const vpx_codec_flags_t flags) const { + Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg, + const vpx_codec_flags_t flags) const override { #if CONFIG_VP9_DECODER return new VP9Decoder(cfg, flags); #else @@ -224,10 +223,9 @@ class VP9CodecFactory : public CodecFactory { #endif } - virtual Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, - unsigned long deadline, - const unsigned long init_flags, - TwopassStatsStore *stats) const { + Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline, + const unsigned long init_flags, + TwopassStatsStore *stats) const override { #if CONFIG_VP9_ENCODER return new VP9Encoder(cfg, deadline, init_flags, stats); #else @@ -239,8 +237,8 @@ class VP9CodecFactory : public CodecFactory { #endif } - virtual vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg, - int usage) const { + vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg, + int usage) const override { #if CONFIG_VP9_ENCODER return vpx_codec_enc_config_default(&vpx_codec_vp9_cx_algo, cfg, usage); #else diff --git a/test/ivf_video_source.h b/test/ivf_video_source.h index a8ac4f154c..3ccac62b51 100644 --- a/test/ivf_video_source.h +++ b/test/ivf_video_source.h @@ -33,19 +33,19 @@ class IVFVideoSource : public CompressedVideoSource { compressed_frame_buf_(nullptr), frame_sz_(0), frame_(0), end_of_file_(false) {} - virtual ~IVFVideoSource() { + ~IVFVideoSource() override { delete[] compressed_frame_buf_; if (input_file_) fclose(input_file_); } - virtual void Init() { + void Init() override { // Allocate a buffer for read in the compressed video frame. compressed_frame_buf_ = new uint8_t[libvpx_test::kCodeBufferSize]; ASSERT_NE(compressed_frame_buf_, nullptr) << "Allocate frame buffer failed"; } - virtual void Begin() { + void Begin() override { input_file_ = OpenTestDataFile(file_name_); ASSERT_NE(input_file_, nullptr) << "Input file open failed. Filename: " << file_name_; @@ -62,7 +62,7 @@ class IVFVideoSource : public CompressedVideoSource { FillFrame(); } - virtual void Next() { + void Next() override { ++frame_; FillFrame(); } @@ -86,11 +86,11 @@ class IVFVideoSource : public CompressedVideoSource { } } - virtual const uint8_t *cxdata() const { + const uint8_t *cxdata() const override { return end_of_file_ ? 
nullptr : compressed_frame_buf_; } - virtual size_t frame_size() const { return frame_sz_; } - virtual unsigned int frame_number() const { return frame_; } + size_t frame_size() const override { return frame_sz_; } + unsigned int frame_number() const override { return frame_; } protected: std::string file_name_; diff --git a/test/svc_test.h b/test/svc_test.h index f1d727fd9d..0026372de5 100644 --- a/test/svc_test.h +++ b/test/svc_test.h @@ -36,7 +36,7 @@ class OnePassCbrSvc : public ::libvpx_test::EncoderTest { } protected: - virtual ~OnePassCbrSvc() {} + ~OnePassCbrSvc() override {} virtual void SetConfig(const int num_temporal_layer) = 0; @@ -46,11 +46,11 @@ class OnePassCbrSvc : public ::libvpx_test::EncoderTest { virtual void PreEncodeFrameHookSetup(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder); - virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder); + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override; virtual void AssignLayerBitrates(); - virtual void MismatchHook(const vpx_image_t *, const vpx_image_t *) {} + void MismatchHook(const vpx_image_t *, const vpx_image_t *) override {} vpx_svc_extra_cfg_t svc_params_; int64_t bits_in_buffer_model_[VPX_MAX_LAYERS]; diff --git a/test/video_source.h b/test/video_source.h index 5ed99d0639..2194126f1f 100644 --- a/test/video_source.h +++ b/test/video_source.h @@ -163,35 +163,35 @@ class DummyVideoSource : public VideoSource { ReallocImage(); } - virtual ~DummyVideoSource() { vpx_img_free(img_); } + ~DummyVideoSource() override { vpx_img_free(img_); } - virtual void Begin() { + void Begin() override { frame_ = 0; FillFrame(); } - virtual void Next() { + void Next() override { ++frame_; FillFrame(); } - virtual vpx_image_t *img() const { + vpx_image_t *img() const override { return (frame_ < limit_) ? img_ : nullptr; } // Models a stream where Timebase = 1/FPS, so pts == frame. - virtual vpx_codec_pts_t pts() const { return frame_; } + vpx_codec_pts_t pts() const override { return frame_; } - virtual unsigned long duration() const { return 1; } + unsigned long duration() const override { return 1; } - virtual vpx_rational_t timebase() const { + vpx_rational_t timebase() const override { const vpx_rational_t t = { 1, 30 }; return t; } - virtual unsigned int frame() const { return frame_; } + unsigned int frame() const override { return frame_; } - virtual unsigned int limit() const { return limit_; } + unsigned int limit() const override { return limit_; } void set_limit(unsigned int limit) { limit_ = limit; } @@ -238,7 +238,7 @@ class RandomVideoSource : public DummyVideoSource { protected: // Reset the RNG to get a matching stream for the second pass - virtual void Begin() { + void Begin() override { frame_ = 0; rnd_.Reset(seed_); FillFrame(); @@ -246,7 +246,7 @@ class RandomVideoSource : public DummyVideoSource { // 15 frames of noise, followed by 15 static frames. Reset to 0 rather // than holding previous frames to encourage keyframes to be thrown. 
- virtual void FillFrame() { + void FillFrame() override { if (img_) { if (frame_ % 30 < 15) { for (size_t i = 0; i < raw_sz_; ++i) img_->img_data[i] = rnd_.Rand8(); diff --git a/test/webm_video_source.h b/test/webm_video_source.h index d245926298..6ab50c849f 100644 --- a/test/webm_video_source.h +++ b/test/webm_video_source.h @@ -29,16 +29,16 @@ class WebMVideoSource : public CompressedVideoSource { webm_ctx_(new WebmInputContext()), buf_(nullptr), buf_sz_(0), frame_(0), end_of_file_(false) {} - virtual ~WebMVideoSource() { + ~WebMVideoSource() override { if (vpx_ctx_->file != nullptr) fclose(vpx_ctx_->file); webm_free(webm_ctx_); delete vpx_ctx_; delete webm_ctx_; } - virtual void Init() {} + void Init() override {} - virtual void Begin() { + void Begin() override { vpx_ctx_->file = OpenTestDataFile(file_name_); ASSERT_NE(vpx_ctx_->file, nullptr) << "Input file open failed. Filename: " << file_name_; @@ -48,7 +48,7 @@ class WebMVideoSource : public CompressedVideoSource { FillFrame(); } - virtual void Next() { + void Next() override { ++frame_; FillFrame(); } @@ -74,11 +74,11 @@ class WebMVideoSource : public CompressedVideoSource { } while (!webm_ctx_->is_key_frame && !end_of_file_); } - virtual const uint8_t *cxdata() const { + const uint8_t *cxdata() const override { return end_of_file_ ? nullptr : buf_; } - virtual size_t frame_size() const { return buf_sz_; } - virtual unsigned int frame_number() const { return frame_; } + size_t frame_size() const override { return buf_sz_; } + unsigned int frame_number() const override { return frame_; } protected: std::string file_name_; diff --git a/test/y4m_video_source.h b/test/y4m_video_source.h index 71fbf31931..e43e37d9e4 100644 --- a/test/y4m_video_source.h +++ b/test/y4m_video_source.h @@ -27,7 +27,7 @@ class Y4mVideoSource : public VideoSource { start_(start), limit_(limit), frame_(0), framerate_numerator_(0), framerate_denominator_(0), y4m_() {} - virtual ~Y4mVideoSource() { + ~Y4mVideoSource() override { vpx_img_free(img_.get()); CloseSource(); } @@ -51,33 +51,33 @@ class Y4mVideoSource : public VideoSource { FillFrame(); } - virtual void Begin() { + void Begin() override { OpenSource(); ReadSourceToStart(); } - virtual void Next() { + void Next() override { ++frame_; FillFrame(); } - virtual vpx_image_t *img() const { + vpx_image_t *img() const override { return (frame_ < limit_) ? img_.get() : nullptr; } // Models a stream where Timebase = 1/FPS, so pts == frame. 
- virtual vpx_codec_pts_t pts() const { return frame_; } + vpx_codec_pts_t pts() const override { return frame_; } - virtual unsigned long duration() const { return 1; } + unsigned long duration() const override { return 1; } - virtual vpx_rational_t timebase() const { + vpx_rational_t timebase() const override { const vpx_rational_t t = { framerate_denominator_, framerate_numerator_ }; return t; } - virtual unsigned int frame() const { return frame_; } + unsigned int frame() const override { return frame_; } - virtual unsigned int limit() const { return limit_; } + unsigned int limit() const override { return limit_; } virtual void FillFrame() { ASSERT_NE(input_file_, nullptr); diff --git a/test/yuv_video_source.h b/test/yuv_video_source.h index 51948c0efb..bb5eec5bb8 100644 --- a/test/yuv_video_source.h +++ b/test/yuv_video_source.h @@ -35,12 +35,12 @@ class YUVVideoSource : public VideoSource { SetSize(width, height, format); } - virtual ~YUVVideoSource() { + ~YUVVideoSource() override { vpx_img_free(img_); if (input_file_) fclose(input_file_); } - virtual void Begin() { + void Begin() override { if (input_file_) fclose(input_file_); input_file_ = OpenTestDataFile(file_name_); ASSERT_NE(input_file_, nullptr) @@ -53,28 +53,28 @@ class YUVVideoSource : public VideoSource { FillFrame(); } - virtual void Next() { + void Next() override { ++frame_; FillFrame(); } - virtual vpx_image_t *img() const { + vpx_image_t *img() const override { return (frame_ < limit_) ? img_ : nullptr; } // Models a stream where Timebase = 1/FPS, so pts == frame. - virtual vpx_codec_pts_t pts() const { return frame_; } + vpx_codec_pts_t pts() const override { return frame_; } - virtual unsigned long duration() const { return 1; } + unsigned long duration() const override { return 1; } - virtual vpx_rational_t timebase() const { + vpx_rational_t timebase() const override { const vpx_rational_t t = { framerate_denominator_, framerate_numerator_ }; return t; } - virtual unsigned int frame() const { return frame_; } + unsigned int frame() const override { return frame_; } - virtual unsigned int limit() const { return limit_; } + unsigned int limit() const override { return limit_; } virtual void SetSize(unsigned int width, unsigned int height, vpx_img_fmt format) { From 4f19de38265dd04559059a030cc8692c2e19685d Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 26 Jul 2023 19:03:04 -0700 Subject: [PATCH 774/926] resize_test: prefer 'override' to 'virtual' Update functions in WRITE_COMPRESSED_STREAM blocks, which are disabled by default. This caused them to be missed in: 84e6b7ab0 test/*.cc: prefer 'override' to 'virtual' Change-Id: I0e462263f19c15eb0a30d0c0f4e145062f789489 --- test/resize_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/resize_test.cc b/test/resize_test.cc index 7d01bbd3d5..fd1c2a92de 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -387,7 +387,7 @@ class ResizeInternalTest : public ResizeTest { } #if WRITE_COMPRESSED_STREAM - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { ++out_frames_; // Write initial file header if first frame. @@ -732,7 +732,7 @@ class ResizeCspTest : public ResizeTest { } #if WRITE_COMPRESSED_STREAM - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { ++out_frames_; // Write initial file header if first frame. 
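Illustration, not part of the series: the resize_test patch above was needed because clang-tidy analyzes only the preprocessed translation unit, so member functions guarded by a macro that is 0 by default never receive the automated virtual-to-override fix. Below is a minimal sketch of that blind spot; the class names are hypothetical.

// WRITE_COMPRESSED_STREAM defaults to 0 here, matching the commit
// message's "disabled by default"; build with -DWRITE_COMPRESSED_STREAM=1
// to make the guarded code visible to the compiler and to clang-tidy.
#ifndef WRITE_COMPRESSED_STREAM
#define WRITE_COMPRESSED_STREAM 0
#endif

struct EncoderTestBase {
  virtual ~EncoderTestBase() = default;
  virtual void FramePktHook() {}
};

struct ResizeLikeTest : EncoderTestBase {
#if WRITE_COMPRESSED_STREAM
  // Invisible to tooling when the guard is 0: an automated --fix pass
  // leaves this as 'virtual', so the patch above converts it by hand.
  void FramePktHook() override {}
#endif
};

int main() {
  ResizeLikeTest t;
  t.FramePktHook();  // dispatches to the base implementation when the guard is 0
  return 0;
}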
From 70fc7563830e9b516db903617797d7537701e8c5 Mon Sep 17 00:00:00 2001 From: Johann Date: Mon, 14 Nov 2022 16:47:33 +0900 Subject: [PATCH 775/926] quantize: reduce parameters Pass macroblock_plane and ScanOrder instead of looking up the values beforehand. Avoids pushing arguments to the stack. Change-Id: I22df6f645eb1a1d89ba5a4d9bc58acb77af51aa9 --- test/vp9_quantize_test.cc | 102 ++++++++-------------- vp9/encoder/vp9_encodemb.c | 60 +++++-------- vp9/encoder/vp9_rdopt.c | 28 +++--- vpx_dsp/arm/highbd_quantize_neon.c | 18 ++-- vpx_dsp/arm/quantize_neon.c | 24 +++-- vpx_dsp/quantize.c | 27 +++--- vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 +- vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 32 +++---- vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 16 ++-- vpx_dsp/x86/quantize_avx.c | 18 ++-- vpx_dsp/x86/quantize_avx2.c | 41 +++++---- vpx_dsp/x86/quantize_sse2.c | 17 ++-- vpx_dsp/x86/quantize_sse2.h | 15 ++-- vpx_dsp/x86/quantize_ssse3.c | 14 ++- 14 files changed, 172 insertions(+), 248 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 5ba90a21bc..26ea9af159 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -42,30 +42,11 @@ typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count, const macroblock_plane *mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const struct ScanOrder *scan_order); + const struct ScanOrder *const scan_order); typedef std::tuple QuantizeParam; -// Wrapper which takes a macroblock_plane. -typedef void (*QuantizeBaseFunc)(const tran_low_t *coeff, intptr_t count, - const int16_t *zbin, const int16_t *round, - const int16_t *quant, - const int16_t *quant_shift, tran_low_t *qcoeff, - tran_low_t *dqcoeff, const int16_t *dequant, - uint16_t *eob, const int16_t *scan, - const int16_t *iscan); - -template -void QuantWrapper(const tran_low_t *coeff, intptr_t count, - const macroblock_plane *const mb_plane, tran_low_t *qcoeff, - tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const struct ScanOrder *const scan_order) { - fn(coeff, count, mb_plane->zbin, mb_plane->round, mb_plane->quant, - mb_plane->quant_shift, qcoeff, dqcoeff, dequant, eob, scan_order->scan, - scan_order->iscan); -} - // Wrapper for 32x32 version which does not use count typedef void (*Quantize32x32Func)(const tran_low_t *coeff, const macroblock_plane *const mb_plane, @@ -542,19 +523,16 @@ using std::make_tuple; INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, ::testing::Values( - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 16, false), + make_tuple(vpx_quantize_b_sse2, vpx_quantize_b_c, VPX_BITS_8, 16, + false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 16, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, 16, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, 16, - false), + make_tuple(vpx_highbd_quantize_b_sse2, vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(vpx_highbd_quantize_b_sse2, vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(vpx_highbd_quantize_b_sse2, vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), make_tuple(&Quant32x32Wrapper, &Quant32x32Wrapper, VPX_BITS_8, 32, false), @@ -568,9 +546,8 @@ INSTANTIATE_TEST_SUITE_P( #else INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, - ::testing::Values(make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, - 16, false), + ::testing::Values(make_tuple(vpx_quantize_b_sse2, vpx_quantize_b_c, + 
VPX_BITS_8, 16, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true))); @@ -580,9 +557,8 @@ INSTANTIATE_TEST_SUITE_P( #if HAVE_SSSE3 INSTANTIATE_TEST_SUITE_P( SSSE3, VP9QuantizeTest, - ::testing::Values(make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, - 16, false), + ::testing::Values(make_tuple(vpx_quantize_b_ssse3, vpx_quantize_b_c, + VPX_BITS_8, 16, false), make_tuple(&Quant32x32Wrapper, &Quant32x32Wrapper, VPX_BITS_8, 32, false), @@ -597,9 +573,8 @@ INSTANTIATE_TEST_SUITE_P( #if HAVE_AVX INSTANTIATE_TEST_SUITE_P( AVX, VP9QuantizeTest, - ::testing::Values(make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, - 16, false), + ::testing::Values(make_tuple(vpx_quantize_b_avx, vpx_quantize_b_c, + VPX_BITS_8, 16, false), make_tuple(&Quant32x32Wrapper, &Quant32x32Wrapper, VPX_BITS_8, 32, false))); @@ -618,17 +593,14 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_12, 32, true), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 16, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 16, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, 16, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, 16, + make_tuple(vpx_quantize_b_avx2, vpx_quantize_b_c, VPX_BITS_8, 16, false), + make_tuple(vpx_highbd_quantize_b_avx2, vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(vpx_highbd_quantize_b_avx2, vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(vpx_highbd_quantize_b_avx2, vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), make_tuple(&Quant32x32Wrapper, &Quant32x32Wrapper, VPX_BITS_8, 32, false), @@ -650,9 +622,8 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 32, true), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, - 16, false), + make_tuple(vpx_quantize_b_avx2, vpx_quantize_b_c, + VPX_BITS_8, 16, false), make_tuple(&Quant32x32Wrapper, &Quant32x32Wrapper, VPX_BITS_8, 32, false))); @@ -664,17 +635,14 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, ::testing::Values( - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 16, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 16, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, 16, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, 16, + make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16, false), + make_tuple(vpx_highbd_quantize_b_neon, vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(vpx_highbd_quantize_b_neon, vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(vpx_highbd_quantize_b_neon, vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), make_tuple(&Quant32x32Wrapper, &Quant32x32Wrapper, VPX_BITS_8, 32, false), @@ -695,9 +663,8 @@ INSTANTIATE_TEST_SUITE_P( #else INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, - ::testing::Values(make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, - 16, false), + ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), make_tuple(&Quant32x32Wrapper, &Quant32x32Wrapper, VPX_BITS_8, 32, false), @@ -740,8 +707,7 @@ INSTANTIATE_TEST_SUITE_P(LSX, VP9QuantizeTest, INSTANTIATE_TEST_SUITE_P( DISABLED_C, VP9QuantizeTest, ::testing::Values( - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16, false), make_tuple(&Quant32x32Wrapper, &Quant32x32Wrapper, VPX_BITS_8, 32, false), diff --git 
a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 946a1c3ee8..0ddf8d3c9f 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -517,22 +517,19 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 256, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_8X8: vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 64, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 16, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; } return; @@ -547,22 +544,19 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, break; case TX_16X16: vpx_fdct16x16(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_8X8: vpx_fdct8x8(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 16, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; } } @@ -904,9 +898,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); else vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type); - vpx_highbd_quantize_b(coeff, 256, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, + eob, scan_order); } if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -922,9 +915,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); else vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type); - vpx_highbd_quantize_b(coeff, 64, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); } if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -941,9 +933,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type); else x->fwd_txfm4x4(src_diff, coeff, 
diff_stride); - vpx_highbd_quantize_b(coeff, 16, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); } if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -982,9 +973,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, case TX_16X16: if (!x->skip_recode) { vp9_fht16x16(src_diff, coeff, diff_stride, tx_type); - vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); } if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -995,9 +985,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, case TX_8X8: if (!x->skip_recode) { vp9_fht8x8(src_diff, coeff, diff_stride, tx_type); - vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); } if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -1012,9 +1001,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vp9_fht4x4(src_diff, coeff, diff_stride, tx_type); else x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 16, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); } if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 7b607b643a..fc06967105 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -1132,9 +1132,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_highbd_fwht4x4(src_diff, coeff, 8); - vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, so->scan, so->iscan); + vpx_highbd_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, + eob, so); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0 ? 
1 : 0); @@ -1152,9 +1151,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, vpx_highbd_fdct4x4(src_diff, coeff, 8); else vp9_highbd_fht4x4(src_diff, coeff, 8, tx_type); - vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, so->scan, so->iscan); + vpx_highbd_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, + eob, so); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); distortion += vp9_highbd_block_error_dispatch( @@ -1239,9 +1237,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_fwht4x4(src_diff, coeff, 8); - vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - so->scan, so->iscan); + vpx_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob, + so); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0; @@ -1256,9 +1253,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_fht4x4(src_diff, coeff, 8, tx_type); - vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - so->scan, so->iscan); + vpx_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob, + so); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0; @@ -1710,14 +1706,12 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, x->fwd_txfm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff), coeff, 8); #if CONFIG_VP9_HIGHBITDEPTH - vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - so->scan, so->iscan); + vpx_highbd_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob, + so); thisdistortion += vp9_highbd_block_error_dispatch( coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz, bd); #else - vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, so->scan, so->iscan); + vpx_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob, so); thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz); #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c index d2a7add60d..0a0ab5e58a 100644 --- a/vpx_dsp/arm/highbd_quantize_neon.c +++ b/vpx_dsp/arm/highbd_quantize_neon.c @@ -96,26 +96,25 @@ highbd_quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, } void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; + const int16_t *iscan = scan_order->iscan; // Only the first element of each vector is DC. 
// High half has identical elements, but we can reconstruct it from the low // half by duplicating the 2nd element. So we only need to pass a 4x32-bit // vector - int32x4_t zbin = vmovl_s16(vld1_s16(zbin_ptr)); - int32x4_t round = vmovl_s16(vld1_s16(round_ptr)); + int32x4_t zbin = vmovl_s16(vld1_s16(mb_plane->zbin)); + int32x4_t round = vmovl_s16(vld1_s16(mb_plane->round)); // Extend the quant, quant_shift vectors to ones of 32-bit elements // scale to high-half, so we can use vqdmulhq_s32 - int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15); - int32x4_t quant_shift = vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 15); + int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant)), 15); + int32x4_t quant_shift = + vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant_shift)), 15); int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr)); // Process first 8 values which include a dc component. @@ -180,7 +179,6 @@ void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, // Need these here, else the compiler complains about mixing declarations and // code in C90 (void)n_coeffs; - (void)scan; } static VPX_FORCE_INLINE int32x4_t extract_sign_bit(int32x4_t a) { diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index 5a76065549..e2351fa2cc 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -71,20 +71,19 @@ quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, } void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; + int16_t const *iscan = scan_order->iscan; // Only the first element of each vector is DC. - int16x8_t zbin = vld1q_s16(zbin_ptr); - int16x8_t round = vld1q_s16(round_ptr); - int16x8_t quant = vld1q_s16(quant_ptr); - int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + int16x8_t zbin = vld1q_s16(mb_plane->zbin); + int16x8_t round = vld1q_s16(mb_plane->round); + int16x8_t quant = vld1q_s16(mb_plane->quant); + int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift); int16x8_t dequant = vld1q_s16(dequant_ptr); // Process first 8 values which include a dc component. 
@@ -145,9 +144,6 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // VPX_ARCH_AARCH64 - // Need these here, else the compiler complains about mixing declarations and - // code in C90 - (void)scan; } static INLINE int32x4_t extract_sign_bit(int32x4_t a) { @@ -219,7 +215,7 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct ScanOrder *scan_order) { + const struct ScanOrder *const scan_order) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index 7dff8c7a87..dee12bae76 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -116,15 +116,17 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, #endif void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, const int16_t *quant_shift_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int i, non_zero_count = (int)n_coeffs, eob = -1; - const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; + const int zbins[2] = { mb_plane->zbin[0], mb_plane->zbin[1] }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - (void)iscan; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; + const int16_t *scan = scan_order->scan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -164,16 +166,17 @@ void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int i, non_zero_count = (int)n_coeffs, eob = -1; - const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; + const int zbins[2] = { mb_plane->zbin[0], mb_plane->zbin[1] }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - (void)iscan; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; + const int16_t *scan = scan_order->scan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -214,7 +217,7 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct ScanOrder *scan_order) { + const struct ScanOrder *const scan_order) { const int n_coeffs = 32 * 32; const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index f20f4e0454..21ea1c8d5b 100644 --- 
a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -722,17 +722,17 @@ () # Quantization # if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { - add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order"; specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/; - add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *scan_order"; + add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order"; specialize qw/vpx_highbd_quantize_b neon sse2 avx2/; - add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *scan_order"; + add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order"; specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c index fbebd7db1c..35ca554049 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -28,17 +28,15 @@ static VPX_FORCE_INLINE void update_qp(__m256i *qp) { } } -static VPX_FORCE_INLINE void init_qp(const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *dequant_ptr, - const int16_t *quant_shift_ptr, - __m256i *qp, int log_scale) { - const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr); - const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr); - const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr); +static VPX_FORCE_INLINE void init_qp( + const struct 
macroblock_plane *const mb_plane, const int16_t *dequant_ptr, + __m256i *qp, int log_scale) { + const __m128i zbin = _mm_loadu_si128((const __m128i *)mb_plane->zbin); + const __m128i round = _mm_loadu_si128((const __m128i *)mb_plane->round); + const __m128i quant = _mm_loadu_si128((const __m128i *)mb_plane->quant); const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr); - const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr); + const __m128i quant_shift = + _mm_loadu_si128((const __m128i *)mb_plane->quant_shift); init_one_qp(&zbin, &qp[0]); init_one_qp(&round, &qp[1]); init_one_qp(&quant, &qp[2]); @@ -136,19 +134,16 @@ static VPX_FORCE_INLINE void quantize(const __m256i *qp, } void vpx_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const int step = 8; __m256i eob = _mm256_setzero_si256(); __m256i qp[5]; - (void)scan; + const int16_t *iscan = scan_order->iscan; - init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 0); + init_qp(mb_plane, dequant_ptr, qp, 0); quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); @@ -233,8 +228,7 @@ void vpx_highbd_quantize_b_32x32_avx2( __m256i eob = _mm256_setzero_si256(); __m256i qp[5]; - init_qp(mb_plane->zbin, mb_plane->round, mb_plane->quant, dequant_ptr, - mb_plane->quant_shift, qp, 1); + init_qp(mb_plane, dequant_ptr, qp, 1); quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index a5d874f3bc..adae60756d 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -18,18 +18,19 @@ #include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" -#if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int i, j, non_zero_regs = (int)count / 4, eob_i = 0; __m128i zbins[2]; __m128i nzbins[2]; + const int16_t *iscan = scan_order->iscan; + const int16_t *zbin_ptr = mb_plane->zbin; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[0]); @@ -40,8 +41,6 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); - (void)scan; - memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); @@ -152,4 +151,3 @@ void vpx_highbd_quantize_b_32x32_sse2( } *eob_ptr = eob; } -#endif diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index 6837a5cf28..98bf1686cb 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ 
b/vpx_dsp/x86/quantize_avx.c @@ -23,15 +23,14 @@ #include "vp9/encoder/vp9_block.h" void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const struct macroblock_plane *mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); const __m256i big_zero = _mm256_setzero_si256(); int index; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1; @@ -40,12 +39,9 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i all_zero; __m128i eob = zero, eob0; - (void)scan; - *eob_ptr = 0; - load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, - dequant_ptr, &dequant, quant_shift_ptr, &shift); + load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); @@ -146,7 +142,7 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct ScanOrder *scan_order) { + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); const __m256i big_zero = _mm256_setzero_si256(); int index; diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c index 3d97b3fdae..189b083f68 100644 --- a/vpx_dsp/x86/quantize_avx2.c +++ b/vpx_dsp/x86/quantize_avx2.c @@ -17,11 +17,11 @@ #include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void load_b_values_avx2( - const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr, - __m256i *round, const int16_t *quant_ptr, __m256i *quant, - const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr, + const struct macroblock_plane *mb_plane, __m256i *zbin, __m256i *round, + __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant, __m256i *shift, int log_scale) { - *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr)); + *zbin = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->zbin)); *zbin = _mm256_permute4x64_epi64(*zbin, 0x54); if (log_scale > 0) { const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1))); @@ -32,7 +32,8 @@ static VPX_FORCE_INLINE void load_b_values_avx2( // calculating the zbin mask. 
(See quantize_b_logscale{0,1,2}_16) *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1)); - *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)); + *round = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->round)); *round = _mm256_permute4x64_epi64(*round, 0x54); if (log_scale > 0) { const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1))); @@ -40,12 +41,14 @@ static VPX_FORCE_INLINE void load_b_values_avx2( *round = _mm256_srai_epi16(*round, log_scale); } - *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)); + *quant = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->quant)); *quant = _mm256_permute4x64_epi64(*quant, 0x54); *dequant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); *dequant = _mm256_permute4x64_epi64(*dequant, 0x54); - *shift = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr)); + *shift = _mm256_castsi128_si256( + _mm_load_si128((const __m128i *)mb_plane->quant_shift)); *shift = _mm256_permute4x64_epi64(*shift, 0x54); } @@ -153,20 +156,17 @@ static VPX_FORCE_INLINE int16_t accumulate_eob256(__m256i eob256) { } void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const struct macroblock_plane *mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift, v_nz_mask; __m256i v_eobmax = _mm256_setzero_si256(); intptr_t count; - (void)scan; + const int16_t *iscan = scan_order->iscan; - load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr, - &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr, - &v_quant_shift, 0); + load_b_values_avx2(mb_plane, &v_zbin, &v_round, &v_quant, dequant_ptr, + &v_dequant, &v_quant_shift, 0); // Do DC and first 15 AC. v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, &v_dequant, &v_round, &v_zbin, &v_quant_shift); @@ -256,15 +256,14 @@ void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct ScanOrder *scan_order) { + const struct ScanOrder *const scan_order) { __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; __m256i v_eobmax = _mm256_setzero_si256(); intptr_t count; const int16_t *iscan = scan_order->iscan; - load_b_values_avx2(mb_plane->zbin, &v_zbin, mb_plane->round, &v_round, - mb_plane->quant, &v_quant, dequant_ptr, &v_dequant, - mb_plane->quant_shift, &v_quant_shift, 1); + load_b_values_avx2(mb_plane, &v_zbin, &v_round, &v_quant, dequant_ptr, + &v_dequant, &v_quant_shift, 1); // Do DC and first 15 AC. 
v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan, diff --git a/vpx_dsp/x86/quantize_sse2.c b/vpx_dsp/x86/quantize_sse2.c index 9533e7916d..64838eaa7d 100644 --- a/vpx_dsp/x86/quantize_sse2.c +++ b/vpx_dsp/x86/quantize_sse2.c @@ -16,16 +16,16 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" +#include "vp9/common/vp9_scan.h" void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); int index = 16; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; @@ -33,11 +33,8 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i cmp_mask0, cmp_mask1; __m128i eob, eob0; - (void)scan; - // Setup global values. - load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, - dequant_ptr, &dequant, quant_shift_ptr, &shift); + load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h index fe42fee018..6de75e0568 100644 --- a/vpx_dsp/x86/quantize_sse2.h +++ b/vpx_dsp/x86/quantize_sse2.h @@ -17,17 +17,16 @@ #include "vpx/vpx_integer.h" #include "vp9/encoder/vp9_block.h" -static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, - const int16_t *round_ptr, __m128i *round, - const int16_t *quant_ptr, __m128i *quant, +static INLINE void load_b_values(const struct macroblock_plane *const mb_plane, + __m128i *zbin, __m128i *round, __m128i *quant, const int16_t *dequant_ptr, __m128i *dequant, - const int16_t *shift_ptr, __m128i *shift) { - *zbin = _mm_load_si128((const __m128i *)zbin_ptr); - *round = _mm_load_si128((const __m128i *)round_ptr); - *quant = _mm_load_si128((const __m128i *)quant_ptr); + __m128i *shift) { + *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin); + *round = _mm_load_si128((const __m128i *)mb_plane->round); + *quant = _mm_load_si128((const __m128i *)mb_plane->quant); *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1)); *dequant = _mm_load_si128((const __m128i *)dequant_ptr); - *shift = _mm_load_si128((const __m128i *)shift_ptr); + *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift); } static INLINE void load_b_values32x32( diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 641f23298b..7f085566dd 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -20,14 +20,13 @@ #include "vp9/encoder/vp9_block.h" void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const __m128i zero = 
_mm_setzero_si128();
   int index = 16;
+  const int16_t *iscan = scan_order->iscan;
 
   __m128i zbin, round, quant, dequant, shift;
   __m128i coeff0, coeff1;
@@ -35,10 +34,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   __m128i cmp_mask0, cmp_mask1;
   __m128i eob, eob0;
 
-  (void)scan;
-
-  load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
-                dequant_ptr, &dequant, quant_shift_ptr, &shift);
+  load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift);
 
   // Do DC and first 15 AC.
   coeff0 = load_tran_low(coeff_ptr);
@@ -113,7 +109,7 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr,
                                 const struct macroblock_plane *mb_plane,
                                 tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                 const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                                const struct ScanOrder *scan_order) {
+                                const struct ScanOrder *const scan_order) {
   const __m128i zero = _mm_setzero_si128();
   int index;
   const int16_t *iscan = scan_order->iscan;

From 7c7ab9165a9b0163a6cab72f538ab227b62a65fd Mon Sep 17 00:00:00 2001
From: Johann
Date: Fri, 28 Jul 2023 19:37:48 +0900
Subject: [PATCH 776/926] quantize_fp: reduce parameters

apply steps similar to those in the other quantize functions
to switch to macroblock_plane and ScanOrder

Change-Id: I486d653326aaf52ffd3beafd2e891ba6a5d57ef3
---
 test/vp9_quantize_test.cc                | 31 ++++++------
 vp9/common/vp9_rtcd_defs.pl              | 10 ++--
 vp9/encoder/arm/neon/vp9_quantize_neon.c | 60 +++++++++++------------
 vp9/encoder/vp9_encodemb.c               | 37 ++++++--------
 vp9/encoder/vp9_pickmode.c               | 15 +++---
 vp9/encoder/vp9_quantize.c               | 42 +++++++++-------
 vp9/encoder/vp9_tpl_model.c              | 15 +++---
 vp9/encoder/x86/vp9_quantize_avx2.c      | 62 ++++++++++++------------
 vp9/encoder/x86/vp9_quantize_sse2.c      | 11 +++--
 vp9/encoder/x86/vp9_quantize_ssse3.c     | 21 ++++----
 vpx_dsp/x86/quantize_sse2.h              |  8 +--
 11 files changed, 151 insertions(+), 161 deletions(-)

diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc
index 26ea9af159..f6984bd6fa 100644
--- a/test/vp9_quantize_test.cc
+++ b/test/vp9_quantize_test.cc
@@ -66,18 +66,17 @@ void Quant32x32Wrapper(const tran_low_t *coeff, intptr_t count,
 
 // Wrapper for FP version which does not use zbin or quant_shift.
 typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count,
-                               const int16_t *round, const int16_t *quant,
+                               const macroblock_plane *const mb_plane,
                                tran_low_t *qcoeff, tran_low_t *dqcoeff,
                                const int16_t *dequant, uint16_t *eob,
-                               const int16_t *scan, const int16_t *iscan);
+                               const struct ScanOrder *const scan_order);
 
 template <QuantizeFPFunc fn>
 void QuantFPWrapper(const tran_low_t *coeff, intptr_t count,
                     const macroblock_plane *const mb_plane, tran_low_t *qcoeff,
                     tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob,
                     const struct ScanOrder *const scan_order) {
-  fn(coeff, count, mb_plane->round_fp, mb_plane->quant_fp, qcoeff, dqcoeff,
-     dequant, eob, scan_order->scan, scan_order->iscan);
+  fn(coeff, count, mb_plane, qcoeff, dqcoeff, dequant, eob, scan_order);
 }
 
 void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round,
@@ -315,14 +314,16 @@ void VP9QuantizeTest::Speed(bool is_median) {
 // determine if further multiplication operations are needed.
 // Based on vp9_quantize_fp_sse2().
inline void quant_fp_nz(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan, + const struct ScanOrder *const scan_order, int is_32x32) { int i, eob = -1; const int thr = dequant_ptr[1] >> (1 + is_32x32); - (void)iscan; + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. @@ -389,21 +390,21 @@ inline void quant_fp_nz(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } void quantize_fp_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - quant_fp_nz(coeff_ptr, n_coeffs, round_ptr, quant_ptr, qcoeff_ptr, - dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 0); + const struct ScanOrder *const scan_order) { + quant_fp_nz(coeff_ptr, n_coeffs, mb_plane, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan_order, 0); } void quantize_fp_32x32_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - quant_fp_nz(coeff_ptr, n_coeffs, round_ptr, quant_ptr, qcoeff_ptr, - dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 1); + const struct ScanOrder *const scan_order) { + quant_fp_nz(coeff_ptr, n_coeffs, mb_plane, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan_order, 1); } TEST_P(VP9QuantizeTest, OperationCheck) { diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 1a4140b38b..8e00c4581d 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -23,7 +23,9 @@ () /* Encoder forward decls */ struct macroblock; +struct macroblock_plane; struct vp9_sad_table; +struct ScanOrder; struct search_site_config; struct mv; union int_mv; @@ -129,10 +131,10 @@ () add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size"; specialize qw/vp9_block_error_fp neon avx2 sse2/; -add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; +add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order"; specialize qw/vp9_quantize_fp neon sse2 ssse3 avx2 vsx/; -add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; +add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, 
intptr_t n_coeffs, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order"; specialize qw/vp9_quantize_fp_32x32 neon ssse3 avx2 vsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { @@ -192,10 +194,10 @@ () # ENCODEMB INVOKE - add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order"; specialize qw/vp9_highbd_quantize_fp avx2 neon/; - add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" ; + add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order" ; specialize qw/vp9_highbd_quantize_fp_32x32 avx2 neon/; # fdct functions diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c index 97ab13628e..e8cb78dbfc 100644 --- a/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -16,6 +16,7 @@ #include "vpx_mem/vpx_mem.h" #include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_scan.h" #include "vp9/common/vp9_seg_common.h" #include "vp9/encoder/vp9_encoder.h" @@ -68,13 +69,11 @@ static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) { #endif // VPX_ARCH_AARCH64 } -static VPX_FORCE_INLINE void load_fp_values(const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *dequant_ptr, - int16x8_t *round, int16x8_t *quant, - int16x8_t *dequant) { - *round = vld1q_s16(round_ptr); - *quant = vld1q_s16(quant_ptr); +static VPX_FORCE_INLINE void load_fp_values( + const struct macroblock_plane *mb_plane, const int16_t *dequant_ptr, + int16x8_t *round, int16x8_t *quant, int16x8_t *dequant) { + *round = vld1q_s16(mb_plane->round_fp); + *quant = vld1q_s16(mb_plane->quant_fp); *dequant = vld1q_s16(dequant_ptr); } @@ -118,19 +117,18 @@ static VPX_FORCE_INLINE void quantize_fp_8( } void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. 
int i; int16x8_t v_eobmax = vdupq_n_s16(-1); int16x8_t v_round, v_quant, v_dequant; - (void)scan; + const int16_t *iscan = scan_order->iscan; - load_fp_values(round_ptr, quant_ptr, dequant_ptr, &v_round, &v_quant, - &v_dequant); + load_fp_values(mb_plane, dequant_ptr, &v_round, &v_quant, &v_dequant); // process dc and the first seven ac coeffs quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &v_eobmax); @@ -187,21 +185,20 @@ static VPX_FORCE_INLINE void quantize_fp_32x32_8( } void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, - const int16_t *round_ptr, - const int16_t *quant_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int16x8_t eob_max = vdupq_n_s16(-1); // ROUND_POWER_OF_TWO(round_ptr[], 1) - int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); - int16x8_t quant = vld1q_s16(quant_ptr); + int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round_fp), 1); + int16x8_t quant = vld1q_s16(mb_plane->quant_fp); int16x8_t dequant = vld1q_s16(dequant_ptr); // dequant >> 2 is used similar to zbin as a threshold. int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2); int i; + const int16_t *iscan = scan_order->iscan; - (void)scan; (void)count; // Process dc and the first seven ac coeffs. @@ -258,23 +255,21 @@ highbd_quantize_fp_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, } void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, - const int16_t *quant_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const int16x4_t v_zero = vdup_n_s16(0); - const int16x4_t v_quant = vld1_s16(quant_ptr); + const int16x4_t v_quant = vld1_s16(mb_plane->quant_fp); const int16x4_t v_dequant = vld1_s16(dequant_ptr); - const int16x4_t v_round = vld1_s16(round_ptr); + const int16x4_t v_round = vld1_s16(mb_plane->round_fp); int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero); int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15); int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero); uint16x4_t v_mask_lo, v_mask_hi; int16x8_t v_eobmax = vdupq_n_s16(-1); - - (void)scan; + const int16_t *iscan = scan_order->iscan; // DC and first 3 AC v_mask_lo = highbd_quantize_fp_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, @@ -349,22 +344,21 @@ highbd_quantize_fp_32x32_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, } void vp9_highbd_quantize_fp_32x32_neon( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { - const int16x4_t v_quant = vld1_s16(quant_ptr); + const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const int16x4_t v_quant = vld1_s16(mb_plane->quant_fp); const int16x4_t v_dequant = vld1_s16(dequant_ptr); const int16x4_t v_zero = vdup_n_s16(0); const int16x4_t v_round = - vqrdmulh_n_s16(vld1_s16(round_ptr), (int16_t)(1 << 14)); 
+ vqrdmulh_n_s16(vld1_s16(mb_plane->round_fp), (int16_t)(1 << 14)); int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero); int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15); int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero); uint16x4_t v_mask_lo, v_mask_hi; int16x8_t v_eobmax = vdupq_n_s16(-1); - - (void)scan; + const int16_t *iscan = scan_order->iscan; // DC and first 3 AC v_mask_lo = diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 0ddf8d3c9f..eded9f5c42 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -367,28 +367,24 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp_32x32(coeff, 1024, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vp9_highbd_quantize_fp_32x32(coeff, 1024, p, qcoeff, dqcoeff, + pd->dequant, eob, scan_order); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp(coeff, 256, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_highbd_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_8X8: vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp(coeff, 64, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_highbd_quantize_fp(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp(coeff, 16, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_highbd_quantize_fp(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; } return; @@ -398,26 +394,25 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vp9_quantize_fp_32x32(coeff, 1024, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp_32x32(coeff, 1024, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_16X16: vpx_fdct16x16(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 256, p->round_fp, p->quant_fp, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, scan_order->iscan); + vp9_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_8X8: vpx_fdct8x8(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 64, p->round_fp, p->quant_fp, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, scan_order->iscan); + vp9_quantize_fp(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 16, p->round_fp, p->quant_fp, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, scan_order->iscan); + vp9_quantize_fp(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; } } diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 4a92802dcc..6f2524b36e 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -786,22 +786,19 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, switch (tx_size) { case TX_16X16: 
vpx_hadamard_16x16(src_diff, diff_stride, coeff); - vp9_quantize_fp(coeff, 256, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_8X8: vpx_hadamard_8x8(src_diff, diff_stride, coeff); - vp9_quantize_fp(coeff, 64, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 16, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; } *skippable &= (*eob == 0); diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index 115c66723d..19edf166d3 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -15,6 +15,7 @@ #include "vpx_ports/mem.h" #include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_scan.h" #include "vp9/common/vp9_seg_common.h" #include "vp9/encoder/vp9_encoder.h" @@ -22,12 +23,14 @@ #include "vp9/encoder/vp9_rd.h" void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int i, eob = -1; - (void)iscan; + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -53,15 +56,15 @@ void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int i; int eob = -1; - - (void)iscan; + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -86,12 +89,14 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, // TODO(jingning) Refactor this file and combine functions with similar // operations. 
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int i, eob = -1; - (void)iscan; + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -118,13 +123,14 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_quantize_fp_32x32_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { int i, eob = -1; - - (void)iscan; + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index 8d203bbf4f..0a81175f73 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -497,18 +497,15 @@ static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vp9_highbd_quantize_fp_32x32(coeff, pix_num, p, qcoeff, dqcoeff, + pd->dequant, eob, scan_order); } else { - vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp_32x32(coeff, pix_num, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); } #else - vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp_32x32(coeff, pix_num, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); #endif // CONFIG_VP9_HIGHBITDEPTH *recon_error = vp9_block_error(coeff, dqcoeff, pix_num, sse) >> shift; diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c index e6aa71d58a..62af3a9212 100644 --- a/vp9/encoder/x86/vp9_quantize_avx2.c +++ b/vp9/encoder/x86/vp9_quantize_avx2.c @@ -16,6 +16,8 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/x86/bitdepth_conversion_avx2.h" #include "vpx_dsp/x86/quantize_sse2.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" // Zero fill 8 positions in the output buffer. 
static VPX_FORCE_INLINE void store_zero_tran_low(tran_low_t *a) { @@ -29,11 +31,13 @@ static VPX_FORCE_INLINE void store_zero_tran_low(tran_low_t *a) { } static VPX_FORCE_INLINE void load_fp_values_avx2( - const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr, - __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant) { - *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)); + const struct macroblock_plane *mb_plane, __m256i *round, __m256i *quant, + const int16_t *dequant_ptr, __m256i *dequant) { + *round = _mm256_castsi128_si256( + _mm_load_si128((const __m128i *)mb_plane->round_fp)); *round = _mm256_permute4x64_epi64(*round, 0x54); - *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)); + *quant = _mm256_castsi128_si256( + _mm_load_si128((const __m128i *)mb_plane->quant_fp)); *quant = _mm256_permute4x64_epi64(*quant, 0x54); *dequant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); @@ -98,13 +102,13 @@ static VPX_FORCE_INLINE void quantize_fp_16( } void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { __m256i round, quant, dequant, thr; __m256i eob_max = _mm256_setzero_si256(); - (void)scan; + const int16_t *iscan = scan_order->iscan; coeff_ptr += n_coeffs; iscan += n_coeffs; @@ -113,8 +117,7 @@ void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, n_coeffs = -n_coeffs; // Setup global values - load_fp_values_avx2(round_ptr, &round, quant_ptr, &quant, dequant_ptr, - &dequant); + load_fp_values_avx2(mb_plane, &round, &quant, dequant_ptr, &dequant); thr = _mm256_setzero_si256(); quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, @@ -203,14 +206,13 @@ static VPX_FORCE_INLINE void quantize_fp_32x32_16( } void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, - const int16_t *quant_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { __m256i round, quant, dequant, thr; __m256i eob_max = _mm256_setzero_si256(); - (void)scan; + const int16_t *iscan = scan_order->iscan; coeff_ptr += n_coeffs; iscan += n_coeffs; @@ -219,8 +221,7 @@ void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, n_coeffs = -n_coeffs; // Setup global values - load_fp_values_avx2(round_ptr, &round, quant_ptr, &quant, dequant_ptr, - &dequant); + load_fp_values_avx2(mb_plane, &round, &quant, dequant_ptr, &dequant); thr = _mm256_srli_epi16(dequant, 2); quant = _mm256_slli_epi16(quant, 1); { @@ -286,10 +287,10 @@ static VPX_FORCE_INLINE __m256i highbd_init_256(const int16_t *val_ptr) { } static VPX_FORCE_INLINE void highbd_load_fp_values( - const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr, - __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant) { - *round = highbd_init_256(round_ptr); - *quant = highbd_init_256(quant_ptr); + const struct macroblock_plane *mb_plane, __m256i *round, __m256i *quant, + const int16_t *dequant_ptr, __m256i *dequant) { + *round = highbd_init_256(mb_plane->round_fp); + *quant = 
highbd_init_256(mb_plane->quant_fp); *dequant = highbd_init_256(dequant_ptr); } @@ -325,16 +326,15 @@ static VPX_FORCE_INLINE void highbd_quantize_fp( } void vp9_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, - const int16_t *quant_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const int step = 8; __m256i round, quant, dequant; __m256i eob_max = _mm256_setzero_si256(); - (void)scan; + const int16_t *iscan = scan_order->iscan; coeff_ptr += n_coeffs; iscan += n_coeffs; @@ -343,8 +343,7 @@ void vp9_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, n_coeffs = -n_coeffs; // Setup global values - highbd_load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, - &dequant); + highbd_load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs, iscan + n_coeffs, qcoeff_ptr + n_coeffs, @@ -391,14 +390,14 @@ static VPX_FORCE_INLINE void highbd_quantize_fp_32x32( } void vp9_highbd_quantize_fp_32x32_avx2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { const int step = 8; __m256i round, quant, dequant, thr; __m256i eob_max = _mm256_setzero_si256(); - (void)scan; + const int16_t *iscan = scan_order->iscan; coeff_ptr += n_coeffs; iscan += n_coeffs; @@ -407,8 +406,7 @@ void vp9_highbd_quantize_fp_32x32_avx2( n_coeffs = -n_coeffs; // Setup global values - highbd_load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, - &dequant); + highbd_load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); thr = _mm256_srli_epi32(dequant, 2); // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when // calculating the zbin mask. diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c index c877234436..67f03eb310 100644 --- a/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/vp9/encoder/x86/vp9_quantize_sse2.c @@ -17,12 +17,14 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); __m128i thr; int nzflag; @@ -31,11 +33,10 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; __m128i qcoeff0, qcoeff1; __m128i eob; - - (void)scan; + const int16_t *iscan = scan_order->iscan; // Setup global values. 
- load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant); + load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.c b/vp9/encoder/x86/vp9_quantize_ssse3.c index d35004e370..c94c8dbb4c 100644 --- a/vp9/encoder/x86/vp9_quantize_ssse3.c +++ b/vp9/encoder/x86/vp9_quantize_ssse3.c @@ -17,12 +17,14 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" #include "vpx_dsp/x86/quantize_ssse3.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); __m128i thr; int nzflag; @@ -31,11 +33,10 @@ void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i coeff0, coeff1; __m128i qcoeff0, qcoeff1; __m128i eob; - - (void)scan; + const int16_t *iscan = scan_order->iscan; // Setup global values. - load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant); + load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); @@ -119,12 +120,11 @@ void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, - const int16_t *quant_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); const __m128i one_s16 = _mm_set1_epi16(1); __m128i thr; @@ -134,11 +134,10 @@ void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i coeff0, coeff1; __m128i qcoeff0, qcoeff1; __m128i eob; - - (void)scan; + const int16_t *iscan = scan_order->iscan; // Setup global values. - load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant); + load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); // The 32x32 halves round. 
round = _mm_add_epi16(round, one_s16); round = _mm_srli_epi16(round, 1); diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h index 6de75e0568..82c755a0cf 100644 --- a/vpx_dsp/x86/quantize_sse2.h +++ b/vpx_dsp/x86/quantize_sse2.h @@ -56,12 +56,12 @@ static INLINE void load_b_values32x32( *shift = _mm_slli_epi16(*shift, 1); } -static INLINE void load_fp_values(const int16_t *round_ptr, __m128i *round, - const int16_t *quant_ptr, __m128i *quant, +static INLINE void load_fp_values(const struct macroblock_plane *mb_plane, + __m128i *round, __m128i *quant, const int16_t *dequant_ptr, __m128i *dequant) { - *round = _mm_load_si128((const __m128i *)round_ptr); - *quant = _mm_load_si128((const __m128i *)quant_ptr); + *round = _mm_load_si128((const __m128i *)mb_plane->round_fp); + *quant = _mm_load_si128((const __m128i *)mb_plane->quant_fp); *dequant = _mm_load_si128((const __m128i *)dequant_ptr); } From e7a4730fcc8f895bd08f8522fb6a72312e937c87 Mon Sep 17 00:00:00 2001 From: Johann Date: Fri, 28 Jul 2023 20:21:31 +0900 Subject: [PATCH 777/926] remove incorrect (void) n_coeffs is used in this function Change-Id: I5f5d2933304bb636a33e0fa294b4526edb65a08d --- vpx_dsp/arm/highbd_quantize_neon.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c index 0a0ab5e58a..c2ad34a695 100644 --- a/vpx_dsp/arm/highbd_quantize_neon.c +++ b/vpx_dsp/arm/highbd_quantize_neon.c @@ -176,9 +176,6 @@ void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // VPX_ARCH_AARCH64 - // Need these here, else the compiler complains about mixing declarations and - // code in C90 - (void)n_coeffs; } static VPX_FORCE_INLINE int32x4_t extract_sign_bit(int32x4_t a) { From 22818907d2597069ffc3400e80a6d5ad4df0097d Mon Sep 17 00:00:00 2001 From: Johann Date: Sat, 29 Jul 2023 05:44:56 +0900 Subject: [PATCH 778/926] normalize *const in rtcd Change-Id: Iece50143b43263c0c8f90299bedd7d2a5b9aa56b --- vp9/common/vp9_rtcd_defs.pl | 8 +-- vpx_dsp/vpx_dsp_rtcd_defs.pl | 112 +++++++++++++++++------------------ 2 files changed, 60 insertions(+), 60 deletions(-) diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 8e00c4581d..980827b15a 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -131,10 +131,10 @@ () add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size"; specialize qw/vp9_block_error_fp neon avx2 sse2/; -add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order"; +add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vp9_quantize_fp neon sse2 ssse3 avx2 vsx/; -add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order"; +add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct 
macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vp9_quantize_fp_32x32 neon ssse3 avx2 vsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { @@ -194,10 +194,10 @@ () # ENCODEMB INVOKE - add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order"; + add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vp9_highbd_quantize_fp avx2 neon/; - add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order" ; + add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order" ; specialize qw/vp9_highbd_quantize_fp_32x32 avx2 neon/; # fdct functions diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 21ea1c8d5b..0f577398cf 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -722,17 +722,17 @@ () # Quantization # if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { - add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order"; + add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/; - add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order"; + add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order"; + add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, 
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vpx_highbd_quantize_b neon sse2 avx2/; - add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order"; + add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER @@ -928,82 +928,82 @@ () # # Multi-block SAD, comparing a reference to N independent blocks # -add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi lsx/; -add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi lsx/; -add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi lsx/; -add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi lsx/; -add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad32x16x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad16x32x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad16x16x4d neon msa sse2 vsx mmi lsx/; -add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const 
uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad16x8x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad8x16x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad8x8x4d neon msa sse2 mmi lsx/; -add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad8x4x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad4x8x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad_skip_64x64x4d neon avx2 sse2/; -add_proto qw/void vpx_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad_skip_64x32x4d neon avx2 sse2/; -add_proto qw/void vpx_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad_skip_32x64x4d neon avx2 sse2/; -add_proto qw/void vpx_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad_skip_32x32x4d neon avx2 sse2/; -add_proto qw/void 
vpx_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad_skip_32x16x4d neon avx2 sse2/; -add_proto qw/void vpx_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad_skip_16x32x4d neon sse2/; -add_proto qw/void vpx_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad_skip_16x16x4d neon sse2/; -add_proto qw/void vpx_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad_skip_16x8x4d neon sse2/; -add_proto qw/void vpx_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad_skip_8x16x4d neon sse2/; -add_proto qw/void vpx_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad_skip_8x8x4d neon sse2/; -add_proto qw/void vpx_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad_skip_8x4x4d neon/; -add_proto qw/void vpx_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad_skip_4x8x4d neon sse2/; -add_proto qw/void vpx_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad_skip_4x4x4d neon/; add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size"; @@ -1162,82 +1162,82 @@ () # # Multi-block SAD, comparing a reference to N independent blocks # - add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const 
ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad64x64x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad64x32x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad32x64x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad32x32x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad32x16x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad16x32x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad16x16x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad16x8x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad8x16x4d sse2 neon/; - add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int 
ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad8x8x4d sse2 neon/; - add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad8x4x4d sse2 neon/; - add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad4x8x4d sse2 neon/; - add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad4x4x4d sse2 neon/; - add_proto qw/void vpx_highbd_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad_skip_64x64x4d neon sse2 avx2/; - add_proto qw/void vpx_highbd_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad_skip_64x32x4d neon sse2 avx2/; - add_proto qw/void vpx_highbd_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad_skip_32x64x4d neon sse2 avx2/; - add_proto qw/void vpx_highbd_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad_skip_32x32x4d neon sse2 avx2/; - add_proto qw/void vpx_highbd_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad_skip_32x16x4d neon sse2 avx2/; - add_proto qw/void vpx_highbd_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad_skip_16x32x4d neon sse2 avx2/; - add_proto 
qw/void vpx_highbd_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad_skip_16x16x4d neon sse2 avx2/; - add_proto qw/void vpx_highbd_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad_skip_16x8x4d neon sse2 avx2/; - add_proto qw/void vpx_highbd_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad_skip_8x16x4d neon sse2/; - add_proto qw/void vpx_highbd_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad_skip_8x8x4d neon sse2/; - add_proto qw/void vpx_highbd_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad_skip_8x4x4d neon/; - add_proto qw/void vpx_highbd_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad_skip_4x8x4d neon sse2/; - add_proto qw/void vpx_highbd_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad_skip_4x4x4d neon/; # From 2b82efa769d9cfa70b856e2015bf82a1dc4cd8dc Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Tue, 1 Aug 2023 20:58:18 -0700 Subject: [PATCH 779/926] Add a 10-bit test file Added a 10-bit test file for the VP9 end-to-end C vs SIMD bit-exactness test.
BUG=webm:1800 Change-Id: I4a864f1a740abee27049d68231adf2ec308f9a96 --- test/test-data.mk | 1 + test/test-data.sha1 | 1 + 2 files changed, 2 insertions(+) diff --git a/test/test-data.mk b/test/test-data.mk index 62a9d6ef14..9eabffae3e 100644 --- a/test/test-data.mk +++ b/test/test-data.mk @@ -29,6 +29,7 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += bus_352x288_420_f20_b8.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += crowd_run_360p_10_150f.y4m # Test vectors LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf diff --git a/test/test-data.sha1 b/test/test-data.sha1 index 55f92a25df..a9decc6b6b 100644 --- a/test/test-data.sha1 +++ b/test/test-data.sha1 @@ -870,3 +870,4 @@ bac455906360b45338a16dd626ac5f19bc36a307 *desktop_office1.1280_720-020.yuv d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-bug-148271109.ivf.res ad18ca16f0a249fb3b7c38de0d9b327fed273f96 *hantro_collage_w352h288_nv12.yuv 8a0b2c350539859463d3546a67876c83ff6ff0ac *desktopqvga.320_240.yuv +ad9942a073e245585c93f764ea299382a65939a7 *crowd_run_360p_10_150f.y4m From 6075b1a36f8090b309a2620075ab832f63793106 Mon Sep 17 00:00:00 2001 From: Anupam Pandey Date: Tue, 13 Jun 2023 16:02:58 +0530 Subject: [PATCH 780/926] Add test to check bit exactness of C and SIMD in VP9 encoder This CL adds a shell script to test the bit exactness of the C and SIMD VP9 encoders on the x86 platform. As C vs NEON encoding outputs are not bit-exact (BUG=webm:1809), ARM tests are currently disabled. BUG=webm:1800 Change-Id: Iffcc70863e8cf83ccb5bc5be73e8866165697358 --- test/examples.sh | 2 +- test/vp9_c_vs_simd_encode.sh | 411 +++++++++++++++++++++++++++++++++++ 2 files changed, 412 insertions(+), 1 deletion(-) create mode 100644 test/vp9_c_vs_simd_encode.sh diff --git a/test/examples.sh b/test/examples.sh index 629f04239c..c15a367f3c 100755 --- a/test/examples.sh +++ b/test/examples.sh @@ -15,7 +15,7 @@ example_tests=$(ls $(dirname $0)/*.sh) # List of script names to exclude. -exclude_list="examples stress tools_common" +exclude_list="examples stress tools_common vp9_c_vs_simd_encode" # Filter out the scripts in $exclude_list. for word in ${exclude_list}; do diff --git a/test/vp9_c_vs_simd_encode.sh b/test/vp9_c_vs_simd_encode.sh new file mode 100644 index 0000000000..76df049d9a --- /dev/null +++ b/test/vp9_c_vs_simd_encode.sh @@ -0,0 +1,411 @@ +#!/bin/sh +## +## Copyright (c) 2023 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## +## This script checks the bit exactness between C and SIMD +## implementations of VP9 encoder. + +TEST_BITRATES="1600 6400" +PRESETS="good rt" +TEST_CLIPS="yuv_raw_input y4m_360p_10bit_input yuv_480p_raw_input y4m_720p_input" +OUT_FILE_SUFFIX=".ivf" +SCRIPT_DIR=$(dirname "$0") +LIBVPX_SOURCE_DIR=$(cd ${SCRIPT_DIR}/..; pwd) +devnull='> /dev/null 2>&1' + +# Clips used in test.
+YUV_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/hantro_collage_w352h288.yuv" +YUV_480P_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/niklas_640_480_30.yuv" +Y4M_360P_10BIT_INPUT="${LIBVPX_TEST_DATA_PATH}/crowd_run_360p_10_150f.y4m" +Y4M_720P_INPUT="${LIBVPX_TEST_DATA_PATH}/niklas_1280_720_30.y4m" + +# Number of frames to test. +VP9_ENCODE_C_VS_SIMD_TEST_FRAME_LIMIT=20 + +# Create a temporary directory for output files. +if [ -n "${TMPDIR}" ]; then + VPX_TEST_TEMP_ROOT="${TMPDIR}" +elif [ -n "${TEMPDIR}" ]; then + VPX_TEST_TEMP_ROOT="${TEMPDIR}" +else + VPX_TEST_TEMP_ROOT=/tmp +fi + +VPX_TEST_OUTPUT_DIR="${VPX_TEST_TEMP_ROOT}/vpx_test_$$" + +if ! mkdir -p "${VPX_TEST_OUTPUT_DIR}" || \ + [ ! -d "${VPX_TEST_OUTPUT_DIR}" ]; then + echo "${0##*/}: Cannot create output directory, giving up." + echo "${0##*/}: VPX_TEST_OUTPUT_DIR=${VPX_TEST_OUTPUT_DIR}" + exit 1 +fi + +elog() { + echo "$@" 1>&2 +} + +# Echoes path to $1 when it's executable and exists in ${VPX_TEST_OUTPUT_DIR}, +# or an empty string. Caller is responsible for testing the string once the +# function returns. +vp9_enc_tool_path() { + local target="$1" + local tool_path="${VPX_TEST_OUTPUT_DIR}/build_target_${target}/vpxenc" + + if [ ! -x "${tool_path}" ]; then + tool_path="" + fi + echo "${tool_path}" +} + +# Environment check: Make sure input and source directories are available. +vp9_c_vs_simd_enc_verify_environment () { + if [ ! -e "${YUV_RAW_INPUT}" ]; then + elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ ! -e "${YUV_480P_RAW_INPUT}" ]; then + elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ ! -e "${Y4M_720P_INPUT}" ]; then + elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ ! -e "${Y4M_360P_10BIT_INPUT}" ]; then + elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ ! -d "$LIBVPX_SOURCE_DIR" ]; then + elog "LIBVPX_SOURCE_DIR does not exist." + return 1 + fi +} + +cleanup() { + rm -rf ${VPX_TEST_OUTPUT_DIR} +} + +# Echo VPX_SIMD_CAPS_MASK for different instruction set architecture. +avx512f() { + echo "0x1FF" +} + +avx2() { + echo "0x0FF" +} + +sse4_1() { + echo "0x03F" +} + +ssse3() { + echo "0x01F" +} + +sse2() { + echo "0x007" +} + +# Echo clip details to be used as input to vpxenc. +yuv_raw_input() { + echo ""${YUV_RAW_INPUT}" + --width=352 + --height=288 + --bit-depth=8 + --profile=0" +} + +yuv_480p_raw_input() { + echo ""${YUV_480P_RAW_INPUT}" + --width=640 + --height=480 + --bit-depth=8 + --profile=0" +} + +y4m_720p_input() { + echo ""${Y4M_720P_INPUT}" + --bit-depth=8 + --profile=0" +} + +y4m_360p_10bit_input() { + echo ""${Y4M_360P_10BIT_INPUT}" + --bit-depth=10 + --profile=2" +} + +has_x86_isa_extn() { + instruction_set=$1 + grep -q "$instruction_set" /proc/cpuinfo + if [ $? -eq 1 ]; then + return 1 + fi +} + +# Echo good encode params for use with VP9 encoder. +vp9_encode_good_params() { + echo "--codec=vp9 \ + --good \ + --test-decode=fatal \ + --ivf \ + --threads=1 \ + --static-thresh=0 \ + --tile-columns=0 \ + --end-usage=vbr \ + --kf-max-dist=160 \ + --kf-min-dist=0 \ + --lag-in-frames=19 \ + --max-q=63 \ + --min-q=0 \ + --passes=2 \ + --undershoot-pct=100 \ + --overshoot-pct=100 \ + --verbose \ + --auto-alt-ref=1 \ + --drop-frame=0 \ + --bias-pct=50 \ + --minsection-pct=0 \ + --maxsection-pct=2000 \ + --arnr-maxframes=7 \ + --arnr-strength=5 \ + --sharpness=0 \ + --frame-parallel=0" +} + +# Echo realtime encode params for use with VP9 encoder. 
+vp9_encode_rt_params() { + echo "--codec=vp9 \ + --rt \ + --test-decode=fatal \ + --ivf \ + --threads=1 \ + --static-thresh=0 \ + --tile-columns=0 \ + --tile-rows=0 \ + --end-usage=cbr \ + --kf-max-dist=90000 \ + --lag-in-frames=0 \ + --max-q=58 \ + --min-q=2 \ + --passes=1 \ + --undershoot-pct=50 \ + --overshoot-pct=50 \ + --verbose \ + --row-mt=0 \ + --buf-sz=1000 \ + --buf-initial-sz=500 \ + --buf-optimal-sz=600 \ + --max-intra-rate=300 \ + --resize-allowed=0 \ + --noise-sensitivity=0 \ + --aq-mode=3 \ + --error-resilient=0" +} + +# Configures for the given target in VPX_TEST_OUTPUT_DIR/build_target_${target} +# directory. +vp9_enc_build() { + local target=$1 + local configure="$2" + local tmp_build_dir=${VPX_TEST_OUTPUT_DIR}/build_target_${target} + mkdir -p $tmp_build_dir + cd $tmp_build_dir + + echo "Building target: ${target}" + local config_args="--disable-install-docs \ + --enable-unit-tests \ + --enable-debug \ + --enable-postproc \ + --enable-vp9-postproc \ + --enable-vp9-temporal-denoising \ + --enable-vp9-highbitdepth" + + eval "$configure" --target="${target}" "${config_args}" ${devnull} + eval make -j$(nproc) ${devnull} + echo "Done building target: ${target}" +} + +compare_enc_output() { + local target=$1 + local cpu=$2 + local clip=$3 + local bitrate=$4 + local preset=$5 + diff ${VPX_TEST_OUTPUT_DIR}/Out-generic-gnu-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \ + ${VPX_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} > /dev/null + if [ $? -eq 1 ]; then + elog "C vs ${target} encode mismatches for ${clip}, at ${bitrate} kbps, speed ${cpu}, ${preset} preset" + return 1 + fi +} + +vp9_enc_test() { + local encoder="$1" + local target=$2 + if [ -z "$(vp9_enc_tool_path "${target}")" ]; then + elog "vpxenc not found. It must exist in ${VPX_TEST_OUTPUT_DIR}/build_target_${target} path" + return 1 + fi + + for preset in ${PRESETS}; do + if [ "${preset}" = "good" ]; then + local max_cpu_used=5 + local test_params=vp9_encode_good_params + elif [ "${preset}" = "rt" ]; then + local max_cpu_used=9 + local test_params=vp9_encode_rt_params + else + elog "Invalid preset" + return 1 + fi + + for cpu in $(seq 0 $max_cpu_used); do + for clip in ${TEST_CLIPS}; do + for bitrate in ${TEST_BITRATES}; do + eval "${encoder}" $($clip) $($test_params) \ + "--limit=${VP9_ENCODE_C_VS_SIMD_TEST_FRAME_LIMIT}" \ + "--cpu-used=${cpu}" "--target-bitrate=${bitrate}" "-o" \ + ${VPX_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \ + ${devnull} + + if [ "${target}" != "generic-gnu" ]; then + compare_enc_output ${target} $cpu ${clip} $bitrate ${preset} + if [ $? -eq 1 ]; then + return 1 + fi + fi + done + done + done + done +} + +vp9_test_generic() { + local configure="$LIBVPX_SOURCE_DIR/configure" + local target="generic-gnu" + + echo "Build for: ${target}" + vp9_enc_build ${target} ${configure} + local encoder="$(vp9_enc_tool_path "${target}")" + vp9_enc_test $encoder "${target}" +} + +# This function encodes VP9 bitstream by enabling SSE2, SSSE3, SSE4_1, AVX2, AVX512f as there are +# no functions with MMX, SSE, SSE3 and AVX specialization. +# The value of environment variable 'VPX_SIMD_CAPS' controls enabling of different instruction +# set extension optimizations. 
The value of the flag 'VPX_SIMD_CAPS' and the corresponding +# instruction set extension optimization enabled are as follows: +# AVX512 AVX2 AVX SSE4_1 SSSE3 SSE3 SSE2 SSE MMX +# 1 1 1 1 1 1 1 1 1 -> 0x1FF -> Enable AVX512 and lower variants +# 0 1 1 1 1 1 1 1 1 -> 0x0FF -> Enable AVX2 and lower variants +# 0 0 1 1 1 1 1 1 1 -> 0x07F -> Enable AVX and lower variants +# 0 0 0 1 1 1 1 1 1 -> 0x03F -> Enable SSE4_1 and lower variants +# 0 0 0 0 1 1 1 1 1 -> 0x01F -> Enable SSSE3 and lower variants +# 0 0 0 0 0 1 1 1 1 -> 0x00F -> Enable SSE3 and lower variants +# 0 0 0 0 0 0 1 1 1 -> 0x007 -> Enable SSE2 and lower variants +# 0 0 0 0 0 0 0 1 1 -> 0x003 -> Enable SSE and lower variants +# 0 0 0 0 0 0 0 0 1 -> 0x001 -> Enable MMX +## NOTE: In x86_64 platform, it is not possible to enable sse/mmx/c using "VPX_SIMD_CAPS_MASK" as +# all x86_64 platforms implement sse2. +vp9_test_x86() { + local arch=$1 + + uname -m | grep -q "x86" + if [ $? -eq 1 ]; then + elog "Machine architecture is not x86 or x86_64" + return 0 + fi + + if [ $arch = "x86" ]; then + local target="x86-linux-gcc" + elif [ $arch = "x86_64" ]; then + local target="x86_64-linux-gcc" + fi + + local x86_isa_variants="avx512f avx2 sse4_1 ssse3 sse2" + local configure="$LIBVPX_SOURCE_DIR/configure" + + echo "Build for x86: ${target}" + vp9_enc_build ${target} ${configure} + local encoder="$(vp9_enc_tool_path "${target}")" + for isa in $x86_isa_variants; do + has_x86_isa_extn $isa + if [ $? -eq 1 ]; then + echo "${isa} is not supported in this machine" + continue + fi + export VPX_SIMD_CAPS_MASK=$($isa) + vp9_enc_test $encoder ${target} + if [ $? -eq 1 ]; then + return 1 + fi + unset VPX_SIMD_CAPS_MASK + done +} + +vp9_test_arm() { + local target="armv8-linux-gcc" + local configure="CROSS=aarch64-linux-gnu- $LIBVPX_SOURCE_DIR/configure --extra-cflags=-march=armv8.4-a \ + --extra-cxxflags=-march=armv8.4-a" + echo "Build for arm64: ${target}" + vp9_enc_build ${target} "${configure}" + + local encoder="$(vp9_enc_tool_path "${target}")" + vp9_enc_test "qemu-aarch64 -L /usr/aarch64-linux-gnu ${encoder}" ${target} + if [ $? -eq 1 ]; then + return 1 + fi +} + +vp9_c_vs_simd_enc_test () { + # Test Generic + vp9_test_generic + + # Test x86 (32 bit) + echo "vp9 test for x86 (32 bit): Started." + vp9_test_x86 "x86" + if [ $? -eq 1 ]; then + echo "vp9 test for x86 (32 bit): Done, test failed." + else + echo "vp9 test for x86 (32 bit): Done, all tests passed." + fi + + # Test x86_64 (64 bit) + if [ "$(eval uname -m)" = "x86_64" ]; then + echo "vp9 test for x86_64 (64 bit): Started." + vp9_test_x86 "x86_64" + if [ $? -eq 1 ]; then + echo "vp9 test for x86_64 (64 bit): Done, test failed." + else + echo "vp9 test for x86_64 (64 bit): Done, all tests passed." + fi + fi + + ##TODO(BUG=webm:1809): Enable testing for ARM after issues with NEON intrinsic + # are resolved. + # Test ARM + # echo "vp9_test_arm: Started." + # vp9_test_arm + # if [ $? -eq 1 ]; then + # echo "vp9 test for arm: Done, test failed." + # else + # echo "vp9 test for arm: Done, all tests passed." + # fi +} + +# Setup a trap function to clean up build, and output files after tests complete. +trap cleanup EXIT + +vp9_c_vs_simd_enc_verify_environment +if [ $? -eq 1 ]; then + echo "Environment check failed." 
+ exit 1 +fi +vp9_c_vs_simd_enc_test From 2f2761c261a242ba696321d18976f6453a0ad822 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 1 Aug 2023 11:00:20 -0400 Subject: [PATCH 781/926] vp9 ext rc: Add callback for tpl stats Added test Bug: b/294049605 Change-Id: I3967a0f915e1a6e7a0d34d04732c33e1ca6f35e7 --- test/vp9_ext_ratectrl_test.cc | 21 +++++++++++++++++++++ vp9/encoder/vp9_ext_ratectrl.c | 15 +++++++++++++++ vp9/encoder/vp9_ext_ratectrl.h | 4 ++++ vp9/encoder/vp9_tpl_model.c | 9 +++++++++ vpx/vpx_ext_ratectrl.h | 19 ++++++++++++++++++- vpx/vpx_tpl.h | 1 + 6 files changed, 68 insertions(+), 1 deletion(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index 739c0b7f8e..a7248bcec4 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -18,6 +18,7 @@ #include "third_party/googletest/src/include/gtest/gtest.h" #include "vp9/simple_encode.h" #include "vpx/vpx_ext_ratectrl.h" +#include "vpx/vpx_tpl.h" #include "vpx_dsp/vpx_dsp_common.h" namespace { @@ -151,6 +152,19 @@ vpx_rc_status_t rc_send_firstpass_stats_gop_short( return VPX_RC_OK; } +vpx_rc_status_t rc_send_tpl_gop_stats(vpx_rc_model_t rate_ctrl_model, + const VpxTplGopStats *tpl_gop_stats) { + const ToyRateCtrl *toy_rate_ctrl = + static_cast<const ToyRateCtrl *>(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_GT(tpl_gop_stats->size, 0); + + for (int i = 0; i < tpl_gop_stats->size; ++i) { + EXPECT_GT(tpl_gop_stats->frame_stats_list[i].num_blocks, 0); + } + return VPX_RC_OK; +} + vpx_rc_status_t rc_get_encodeframe_decision( vpx_rc_model_t rate_ctrl_model, const vpx_rc_encodeframe_info_t *encode_frame_info, @@ -679,6 +693,7 @@ class ExtRateCtrlTest : public ::libvpx_test::EncoderTest, ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { vpx_rc_funcs_t rc_funcs; + memset(&rc_funcs, 0, sizeof(rc_funcs)); rc_funcs.rc_type = VPX_RC_QP; rc_funcs.create_model = rc_create_model; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats; @@ -722,9 +737,11 @@ class ExtRateCtrlTestGOP : public ::libvpx_test::EncoderTest, encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); vpx_rc_funcs_t rc_funcs; + memset(&rc_funcs, 0, sizeof(rc_funcs)); rc_funcs.rc_type = VPX_RC_GOP_QP; rc_funcs.create_model = rc_create_model_gop; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop; + rc_funcs.send_tpl_gop_stats = rc_send_tpl_gop_stats; rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop; rc_funcs.get_gop_decision = rc_get_gop_decision; rc_funcs.update_encodeframe_result = rc_update_encodeframe_result_gop; @@ -769,6 +786,7 @@ class ExtRateCtrlTestGOPShort : public ::libvpx_test::EncoderTest, encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); vpx_rc_funcs_t rc_funcs; + memset(&rc_funcs, 0, sizeof(rc_funcs)); rc_funcs.rc_type = VPX_RC_GOP_QP; rc_funcs.create_model = rc_create_model_gop_short; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; @@ -817,6 +835,7 @@ class ExtRateCtrlTestGOPShortOverlay encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); vpx_rc_funcs_t rc_funcs; + memset(&rc_funcs, 0, sizeof(rc_funcs)); rc_funcs.rc_type = VPX_RC_GOP_QP; rc_funcs.create_model = rc_create_model_gop_short; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; @@ -866,6 +885,7 @@ class ExtRateCtrlTestGOPShortNoARF encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); vpx_rc_funcs_t rc_funcs; + memset(&rc_funcs, 0, sizeof(rc_funcs)); rc_funcs.rc_type = VPX_RC_GOP_QP; rc_funcs.create_model =
rc_create_model_gop_short; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; @@ -920,6 +940,7 @@ class ExtRateCtrlTestRdmult : public ::libvpx_test::EncoderTest, ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { vpx_rc_funcs_t rc_funcs; + memset(&rc_funcs, 0, sizeof(rc_funcs)); rc_funcs.rc_type = VPX_RC_GOP_QP_RDMULT; rc_funcs.create_model = rc_create_model_gop_short; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index 09253403b8..6d8cf566d8 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -119,6 +119,21 @@ vpx_codec_err_t vp9_extrc_send_firstpass_stats( return VPX_CODEC_OK; } +vpx_codec_err_t vp9_extrc_send_tpl_stats(EXT_RATECTRL *ext_ratectrl, + const VpxTplGopStats *tpl_gop_stats) { + if (ext_ratectrl == NULL) { + return VPX_CODEC_INVALID_PARAM; + } + if (ext_ratectrl->ready && ext_ratectrl->funcs.send_tpl_gop_stats != NULL) { + vpx_rc_status_t rc_status = ext_ratectrl->funcs.send_tpl_gop_stats( + ext_ratectrl->model, tpl_gop_stats); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } + } + return VPX_CODEC_OK; +} + static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) { // TODO(angiebird): Add unit test to make sure this function behaves like // get_frame_type_from_update_type() diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index 7c38758833..b04580c1d4 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -12,6 +12,7 @@ #define VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_ #include "vpx/vpx_ext_ratectrl.h" +#include "vpx/vpx_tpl.h" #include "vp9/encoder/vp9_firstpass.h" typedef struct EXT_RATECTRL { @@ -34,6 +35,9 @@ vpx_codec_err_t vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl); vpx_codec_err_t vp9_extrc_send_firstpass_stats( EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info); +vpx_codec_err_t vp9_extrc_send_tpl_stats(EXT_RATECTRL *ext_ratectrl, + const VpxTplGopStats *tpl_gop_stats); + vpx_codec_err_t vp9_extrc_get_encodeframe_decision( EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref, diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index 0a81175f73..c45404c256 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -1505,6 +1505,15 @@ void vp9_setup_tpl_stats(VP9_COMP *cpi) { // Qmode. trim_tpl_stats(&cpi->common.error, &cpi->tpl_gop_stats, extended_frame_count); + if (cpi->ext_ratectrl.ready) { + const vpx_codec_err_t codec_status = + vp9_extrc_send_tpl_stats(&cpi->ext_ratectrl, &cpi->tpl_gop_stats); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cpi->common.error, codec_status, + "vp9_extrc_send_tpl_stats() failed"); + } + } + #if CONFIG_NON_GREEDY_MV cpi->tpl_ready = 1; #if DUMP_TPL_STATS diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index 2c312858b8..1c67c8deb4 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -16,6 +16,7 @@ extern "C" { #endif #include "./vpx_integer.h" +#include "vpx/vpx_tpl.h" /*!\brief Current ABI version number * @@ -25,7 +26,7 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures. */ -#define VPX_EXT_RATECTRL_ABI_VERSION (6) +#define VPX_EXT_RATECTRL_ABI_VERSION (7) /*!\brief The control type of the inference API. 
* In VPX_RC_QP mode, the external rate control model determines the @@ -410,6 +411,18 @@ typedef vpx_rc_status_t (*vpx_rc_send_firstpass_stats_cb_fn_t)( vpx_rc_model_t rate_ctrl_model, const vpx_rc_firstpass_stats_t *first_pass_stats); +/*!\brief Send TPL stats for the current GOP to the external rate control model + * callback prototype + * + * This callback is invoked by the encoder to send TPL stats for the GOP to the + * external rate control model. + * + * \param[in] rate_ctrl_model rate control model + * \param[in] tpl_gop_stats TPL stats for current GOP + */ +typedef vpx_rc_status_t (*vpx_rc_send_tpl_gop_stats_cb_fn_t)( + vpx_rc_model_t rate_ctrl_model, const VpxTplGopStats *tpl_gop_stats); + /*!\brief Receive encode frame decision callback prototype * * This callback is invoked by the encoder to receive encode frame decision from @@ -491,6 +504,10 @@ typedef struct vpx_rc_funcs { * Send first pass stats to the external rate control model. */ vpx_rc_send_firstpass_stats_cb_fn_t send_firstpass_stats; + /*! + * Send TPL stats for current GOP to the external rate control model. + */ + vpx_rc_send_tpl_gop_stats_cb_fn_t send_tpl_gop_stats; /*! * Get encodeframe decision from the external rate control model. */ diff --git a/vpx/vpx_tpl.h b/vpx/vpx_tpl.h index 50aec49eb6..61473168f4 100644 --- a/vpx/vpx_tpl.h +++ b/vpx/vpx_tpl.h @@ -18,6 +18,7 @@ #include #include "./vpx_integer.h" +#include "vpx/vpx_codec.h" #ifdef __cplusplus extern "C" { From 44f2819298fdffd50bc877ac55e575fbff9bd541 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 3 Aug 2023 14:07:55 -0400 Subject: [PATCH 782/926] vp9_quantize_fp_neon: Same params name as in decl Clear some clang-tidy warnings Change-Id: Iea4c4e77b3d515ec6384bd34875a0002ab13c14c --- vp9/common/vp9_rtcd_defs.pl | 2 +- vp9/encoder/arm/neon/vp9_quantize_neon.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 980827b15a..3ecbd5417f 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -197,7 +197,7 @@ () add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vp9_highbd_quantize_fp avx2 neon/; - add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order" ; + add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vp9_highbd_quantize_fp_32x32 avx2 neon/; # fdct functions diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c index e8cb78dbfc..968cdc6d11 100644 --- a/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -116,7 +116,7 @@ static VPX_FORCE_INLINE void quantize_fp_8( *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask); } -void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, +void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, 
intptr_t n_coeffs, const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, @@ -135,7 +135,7 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, // now process the rest of the ac coeffs update_fp_values(&v_round, &v_quant, &v_dequant); - for (i = 8; i < count; i += 8) { + for (i = 8; i < n_coeffs; i += 8) { quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr + i, iscan + i, qcoeff_ptr + i, dqcoeff_ptr + i, &v_eobmax); } @@ -184,7 +184,7 @@ static VPX_FORCE_INLINE void quantize_fp_32x32_8( *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask); } -void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, +void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, @@ -199,7 +199,7 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, int i; const int16_t *iscan = scan_order->iscan; - (void)count; + (void)n_coeffs; // Process dc and the first seven ac coeffs. quantize_fp_32x32_8(&round, &quant, &dequant, &dequant_thresh, coeff_ptr, From f6aaad370d0d02ddea7d5361dce17c69d679c202 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 4 Aug 2023 14:18:33 -0400 Subject: [PATCH 783/926] Fix include path for vpx_tpl.h, vpx_ext_ratectrl.h Bug: b/294049605 Change-Id: I6422fc4250c2192f985cce2e296a19a05934969b --- vpx/vpx_ext_ratectrl.h | 2 +- vpx/vpx_tpl.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index 1c67c8deb4..b93df11cd5 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -16,7 +16,7 @@ extern "C" { #endif #include "./vpx_integer.h" -#include "vpx/vpx_tpl.h" +#include "./vpx_tpl.h" /*!\brief Current ABI version number * diff --git a/vpx/vpx_tpl.h b/vpx/vpx_tpl.h index 61473168f4..30d28de50f 100644 --- a/vpx/vpx_tpl.h +++ b/vpx/vpx_tpl.h @@ -18,7 +18,7 @@ #include #include "./vpx_integer.h" -#include "vpx/vpx_codec.h" +#include "./vpx_codec.h" #ifdef __cplusplus extern "C" { From fc29b8533e7c678d78dc1eb87c26076dcaef15ef Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 4 Aug 2023 16:12:29 -0400 Subject: [PATCH 784/926] Fix some clang-tidy warnings - Use zero initializer instead of memset to avoid including <string.h> - Include vpx_codec.h for vpx_codec_err_t and error codes - Include vpx_tpl.h for VpxTplGopStats Change-Id: Iac5837ce2173bd945bfe8eeb401ff4dfd04fd2e1 --- test/vp9_ext_ratectrl_test.cc | 18 ++++++------------ vp9/encoder/vp9_ext_ratectrl.c | 2 ++ vp9/encoder/vp9_tpl_model.c | 1 + 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index a7248bcec4..e0107b2d26 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -692,8 +692,7 @@ class ExtRateCtrlTest : public ::libvpx_test::EncoderTest, void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { - vpx_rc_funcs_t rc_funcs; - memset(&rc_funcs, 0, sizeof(rc_funcs)); + vpx_rc_funcs_t rc_funcs = {}; rc_funcs.rc_type = VPX_RC_QP; rc_funcs.create_model = rc_create_model; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats; @@ -736,8 +735,7 @@ class ExtRateCtrlTestGOP : public ::libvpx_test::EncoderTest, encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); - vpx_rc_funcs_t rc_funcs; - memset(&rc_funcs, 0, sizeof(rc_funcs)); + vpx_rc_funcs_t rc_funcs = {}; rc_funcs.rc_type = VPX_RC_GOP_QP; rc_funcs.create_model = rc_create_model_gop; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop; @@ -785,8 +783,7 @@ class ExtRateCtrlTestGOPShort : public ::libvpx_test::EncoderTest, encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); - vpx_rc_funcs_t rc_funcs; - memset(&rc_funcs, 0, sizeof(rc_funcs)); + vpx_rc_funcs_t rc_funcs = {}; rc_funcs.rc_type = VPX_RC_GOP_QP; rc_funcs.create_model = rc_create_model_gop_short; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; @@ -834,8 +831,7 @@ class ExtRateCtrlTestGOPShortOverlay encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); - vpx_rc_funcs_t rc_funcs; - memset(&rc_funcs, 0, sizeof(rc_funcs)); + vpx_rc_funcs_t rc_funcs = {}; rc_funcs.rc_type = VPX_RC_GOP_QP; rc_funcs.create_model = rc_create_model_gop_short; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; @@ -884,8 +880,7 @@ class ExtRateCtrlTestGOPShortNoARF encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); - vpx_rc_funcs_t rc_funcs; - memset(&rc_funcs, 0, sizeof(rc_funcs)); + vpx_rc_funcs_t rc_funcs = {}; rc_funcs.rc_type = VPX_RC_GOP_QP; rc_funcs.create_model = rc_create_model_gop_short; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; @@ -939,8 +934,7 @@ class ExtRateCtrlTestRdmult : public ::libvpx_test::EncoderTest, void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { - vpx_rc_funcs_t rc_funcs; - memset(&rc_funcs, 0, sizeof(rc_funcs)); + vpx_rc_funcs_t rc_funcs = {}; rc_funcs.rc_type = VPX_RC_GOP_QP_RDMULT; rc_funcs.create_model = rc_create_model_gop_short; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index aa248f43f6..4664e8c5e2 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -12,6 +12,8 @@ #include "vp9/encoder/vp9_encoder.h" #include "vp9/common/vp9_common.h" #include "vpx_dsp/psnr.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_tpl.h" vpx_codec_err_t vp9_extrc_init(EXT_RATECTRL *ext_ratectrl) { if (ext_ratectrl == NULL) { diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index c45404c256..7b75815571 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -19,6 +19,7 @@ #include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_tpl_model.h" +#include "vpx/vpx_codec.h" static int init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture, const GF_GROUP *gf_group, int *tpl_group_frames) { From d4b6132d2b9f65ba887a1a40029b1b5d61881470 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 7 Aug 2023 10:28:46 -0400 Subject: [PATCH 785/926] Fix more clang-tidy warnings - Include vpx/vpx_ext_ratectrl.h in vp9_ext_ratectrl.c - Include vpx/internal/vpx_codec_internal.h - Include <stddef.h> for NULL Bug: b/294049605 Change-Id: Iedd8b3864da27fde1678bfa6606e6fc5630a7a09 --- vp9/encoder/vp9_ext_ratectrl.c | 3 +++ vp9/encoder/vp9_tpl_model.c | 1 + 2 files changed, 4 insertions(+) diff --git a/vp9/encoder/vp9_ext_ratectrl.c
b/vp9/encoder/vp9_ext_ratectrl.c index aa248f43f6..4664e8c5e2 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -8,11 +8,14 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <stddef.h> + #include "vp9/encoder/vp9_ext_ratectrl.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/common/vp9_common.h" #include "vpx_dsp/psnr.h" #include "vpx/vpx_codec.h" +#include "vpx/vpx_ext_ratectrl.h" #include "vpx/vpx_tpl.h" vpx_codec_err_t vp9_extrc_init(EXT_RATECTRL *ext_ratectrl) { diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index 7b75815571..903ea8d753 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -19,6 +19,7 @@ #include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_tpl_model.h" +#include "vpx/internal/vpx_codec_internal.h" #include "vpx/vpx_codec.h" static int init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture, From 242c7431700b75a640f2e94e876d4d7c02de8e16 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 7 Aug 2023 16:42:43 -0400 Subject: [PATCH 786/926] VP9 RC: Add pixel row/col of a TPL block Bug: b/294049605 Change-Id: I383a88a037a2a48a5fc1b9def6f991278c3665a8 --- vp9/encoder/vp9_tpl_model.c | 2 ++ vpx/vpx_tpl.h | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index 903ea8d753..909b05292b 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -400,6 +400,8 @@ static void tpl_store_before_propagation(VpxTplBlockStats *tpl_block_stats, for (idx = 0; idx < mi_width; ++idx) { VpxTplBlockStats *tpl_block_stats_ptr = &tpl_block_stats[(mi_row + idy) * stride + mi_col + idx]; + tpl_block_stats_ptr->row = mi_row * 8; + tpl_block_stats_ptr->col = mi_col * 8; tpl_block_stats_ptr->inter_cost = src_stats->inter_cost; tpl_block_stats_ptr->intra_cost = src_stats->intra_cost; tpl_block_stats_ptr->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; diff --git a/vpx/vpx_tpl.h b/vpx/vpx_tpl.h index 30d28de50f..3828eb8f29 100644 --- a/vpx/vpx_tpl.h +++ b/vpx/vpx_tpl.h @@ -32,10 +32,12 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ -#define VPX_TPL_ABI_VERSION (1) /**<\hideinitializer*/ +#define VPX_TPL_ABI_VERSION (2) /**<\hideinitializer*/ /*!\brief Temporal dependency model stats for each block before propagation */ typedef struct VpxTplBlockStats { + int16_t row; /**< Pixel row of the top left corner */ + int16_t col; /**< Pixel col of the top left corner */ int64_t intra_cost; /**< Intra cost */ int64_t inter_cost; /**< Inter cost */ int16_t mv_r; /**< Motion vector row */ From 6e5fc000017f919de705d103fc4e6c1a41629c67 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Mon, 7 Aug 2023 13:10:41 -0700 Subject: [PATCH 787/926] Disable vpx_int_pro_row/col neon SIMD functions The vpx_int_pro_row/col neon SIMD versions caused a mismatch between neon and C encoding. Disabled them for now to ensure the correctness of VP9 encoding on the arm platform. Since these two functions were not used much, this wouldn't affect the overall encoder speed much.
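For context, these kernels feed the encoder's coarse, projection-based motion estimation: each of the 16 columns of a strip is summed over the block height and normalized, and candidate positions are compared via these projections rather than full blocks, so any C/SIMD disagreement here cascades into different encode decisions and a different bitstream. A minimal C sketch of the reference behavior (an illustration only, not the upstream source; the height/2 normalization is inferred from the (height >> 5) + 3 shift in the Neon code below):

    static void int_pro_row_sketch(int16_t hbuf[16], const uint8_t *ref,
                                   int ref_stride, int height) {
      /* Sum each of the 16 columns over `height` rows, then divide by
       * height / 2, a power of two for the supported heights 16/32/64. */
      const int norm_factor = height >> 1;
      int col, row;
      for (col = 0; col < 16; ++col) {
        int sum = 0;
        for (row = 0; row < height; ++row) sum += ref[row * ref_stride + col];
        hbuf[col] = (int16_t)(sum / norm_factor);
      }
    }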
BUG=webm:1800 BUG=webm:1809 Change-Id: Id1a7d542fc03d4cf9fa1039a49832abf35fb722f --- test/avg_test.cc | 32 +++++---- vpx_dsp/arm/avg_neon.c | 130 +++++++++++++++++------------------ vpx_dsp/vpx_dsp_rtcd_defs.pl | 7 +- 3 files changed, 88 insertions(+), 81 deletions(-) diff --git a/test/avg_test.cc b/test/avg_test.cc index dbd3309ee4..b885c4de4f 100644 --- a/test/avg_test.cc +++ b/test/avg_test.cc @@ -681,19 +681,25 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(16, 16, 5, 4, &vpx_avg_4x4_neon), make_tuple(32, 32, 15, 4, &vpx_avg_4x4_neon))); -INSTANTIATE_TEST_SUITE_P( - NEON, IntProRowTest, - ::testing::Values(make_tuple(16, &vpx_int_pro_row_neon, &vpx_int_pro_row_c), - make_tuple(32, &vpx_int_pro_row_neon, &vpx_int_pro_row_c), - make_tuple(64, &vpx_int_pro_row_neon, - &vpx_int_pro_row_c))); - -INSTANTIATE_TEST_SUITE_P( - NEON, IntProColTest, - ::testing::Values(make_tuple(16, &vpx_int_pro_col_neon, &vpx_int_pro_col_c), - make_tuple(32, &vpx_int_pro_col_neon, &vpx_int_pro_col_c), - make_tuple(64, &vpx_int_pro_col_neon, - &vpx_int_pro_col_c))); +// Disabled neon optimization since it caused mismatch. See details in: +// https://bugs.chromium.org/p/webm/issues/detail?id=1809 +// INSTANTIATE_TEST_SUITE_P( +// NEON, IntProRowTest, +// ::testing::Values(make_tuple(16, &vpx_int_pro_row_neon, +// &vpx_int_pro_row_c), +// make_tuple(32, &vpx_int_pro_row_neon, +// &vpx_int_pro_row_c), make_tuple(64, +// &vpx_int_pro_row_neon, +// &vpx_int_pro_row_c))); +// +// INSTANTIATE_TEST_SUITE_P( +// NEON, IntProColTest, +// ::testing::Values(make_tuple(16, &vpx_int_pro_col_neon, +// &vpx_int_pro_col_c), +// make_tuple(32, &vpx_int_pro_col_neon, +// &vpx_int_pro_col_c), make_tuple(64, +// &vpx_int_pro_col_neon, +// &vpx_int_pro_col_c))); INSTANTIATE_TEST_SUITE_P(NEON, SatdLowbdTest, ::testing::Values(make_tuple(16, &vpx_satd_neon), diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c index 22164242c5..0cb102fdf6 100644 --- a/vpx_dsp/arm/avg_neon.c +++ b/vpx_dsp/arm/avg_neon.c @@ -67,71 +67,71 @@ int vpx_satd_neon(const tran_low_t *coeff, int length) { return horizontal_add_int32x4(vaddq_s32(sum_s32[0], sum_s32[1])); } -void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, - const int ref_stride, const int height) { - int i; - uint8x16_t r0, r1, r2, r3; - uint16x8_t sum_lo[2], sum_hi[2]; - uint16x8_t tmp_lo[2], tmp_hi[2]; - int16x8_t avg_lo, avg_hi; - - const int norm_factor = (height >> 5) + 3; - const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor); - - assert(height >= 4 && height % 4 == 0); - - r0 = vld1q_u8(ref + 0 * ref_stride); - r1 = vld1q_u8(ref + 1 * ref_stride); - r2 = vld1q_u8(ref + 2 * ref_stride); - r3 = vld1q_u8(ref + 3 * ref_stride); - - sum_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); - sum_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); - sum_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); - sum_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); - - for (i = 4; i < height; i += 4) { - r0 = vld1q_u8(ref + 0 * ref_stride); - r1 = vld1q_u8(ref + 1 * ref_stride); - r2 = vld1q_u8(ref + 2 * ref_stride); - r3 = vld1q_u8(ref + 3 * ref_stride); - - tmp_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); - tmp_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); - tmp_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); - tmp_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); - - sum_lo[0] = vaddq_u16(sum_lo[0], tmp_lo[0]); - sum_hi[0] = vaddq_u16(sum_hi[0], tmp_hi[0]); - sum_lo[1] = vaddq_u16(sum_lo[1], tmp_lo[1]); - sum_hi[1] = vaddq_u16(sum_hi[1], tmp_hi[1]); 
- - ref += 4 * ref_stride; - } - - sum_lo[0] = vaddq_u16(sum_lo[0], sum_lo[1]); - sum_hi[0] = vaddq_u16(sum_hi[0], sum_hi[1]); - - avg_lo = vshlq_s16(vreinterpretq_s16_u16(sum_lo[0]), neg_norm_factor); - avg_hi = vshlq_s16(vreinterpretq_s16_u16(sum_hi[0]), neg_norm_factor); - - vst1q_s16(hbuf, avg_lo); - vst1q_s16(hbuf + 8, avg_hi); -} - -int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) { - uint16x8_t sum; - int i; - - assert(width >= 16 && width % 16 == 0); - - sum = vpaddlq_u8(vld1q_u8(ref)); - for (i = 16; i < width; i += 16) { - sum = vpadalq_u8(sum, vld1q_u8(ref + i)); - } - - return (int16_t)horizontal_add_uint16x8(sum); -} +// void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, +// const int ref_stride, const int height) { +// int i; +// uint8x16_t r0, r1, r2, r3; +// uint16x8_t sum_lo[2], sum_hi[2]; +// uint16x8_t tmp_lo[2], tmp_hi[2]; +// int16x8_t avg_lo, avg_hi; +// +// const int norm_factor = (height >> 5) + 3; +// const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor); +// +// assert(height >= 4 && height % 4 == 0); +// +// r0 = vld1q_u8(ref + 0 * ref_stride); +// r1 = vld1q_u8(ref + 1 * ref_stride); +// r2 = vld1q_u8(ref + 2 * ref_stride); +// r3 = vld1q_u8(ref + 3 * ref_stride); +// +// sum_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); +// sum_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); +// sum_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); +// sum_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); +// +// for (i = 4; i < height; i += 4) { +// r0 = vld1q_u8(ref + 0 * ref_stride); +// r1 = vld1q_u8(ref + 1 * ref_stride); +// r2 = vld1q_u8(ref + 2 * ref_stride); +// r3 = vld1q_u8(ref + 3 * ref_stride); +// +// tmp_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); +// tmp_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); +// tmp_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); +// tmp_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); +// +// sum_lo[0] = vaddq_u16(sum_lo[0], tmp_lo[0]); +// sum_hi[0] = vaddq_u16(sum_hi[0], tmp_hi[0]); +// sum_lo[1] = vaddq_u16(sum_lo[1], tmp_lo[1]); +// sum_hi[1] = vaddq_u16(sum_hi[1], tmp_hi[1]); +// +// ref += 4 * ref_stride; +// } +// +// sum_lo[0] = vaddq_u16(sum_lo[0], sum_lo[1]); +// sum_hi[0] = vaddq_u16(sum_hi[0], sum_hi[1]); +// +// avg_lo = vshlq_s16(vreinterpretq_s16_u16(sum_lo[0]), neg_norm_factor); +// avg_hi = vshlq_s16(vreinterpretq_s16_u16(sum_hi[0]), neg_norm_factor); +// +// vst1q_s16(hbuf, avg_lo); +// vst1q_s16(hbuf + 8, avg_hi); +// } + +// int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) { +// uint16x8_t sum; +// int i; +// +// assert(width >= 16 && width % 16 == 0); +// +// sum = vpaddlq_u8(vld1q_u8(ref)); +// for (i = 16; i < width; i += 16) { +// sum = vpadalq_u8(sum, vld1q_u8(ref + i)); +// } +// +// return (int16_t)horizontal_add_uint16x8(sum); +// } // ref, src = [0, 510] - max diff = 16-bits // bwl = {2, 3, 4}, width = {16, 32, 64} diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 0f577398cf..798bd93890 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -876,11 +876,12 @@ () specialize qw/vpx_satd avx2 sse2 neon msa/; } + # Disabled neon optimization since it caused mismatch. 
See details in: + # https://bugs.chromium.org/p/webm/issues/detail?id=1809 add_proto qw/void vpx_int_pro_row/, "int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height"; - specialize qw/vpx_int_pro_row sse2 neon msa/; - + specialize qw/vpx_int_pro_row sse2 msa/; add_proto qw/int16_t vpx_int_pro_col/, "const uint8_t *ref, const int width"; - specialize qw/vpx_int_pro_col sse2 neon msa/; + specialize qw/vpx_int_pro_col sse2 msa/; add_proto qw/int vpx_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl"; specialize qw/vpx_vector_var neon sse2 msa/; From 685715b698e635f1152646fa48711f418a4082f7 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Mon, 7 Aug 2023 14:54:30 -0700 Subject: [PATCH 788/926] Enable arm test in c vs SIMD bit-exactness test Arm SIMD testing was enabled in c vs SIMD bit-exactness test after arm SIMD mismatch was resolved. BUG=webm:1800 Change-Id: Id60127313a0955f4a5c8468281fd5a441668fddb --- test/vp9_c_vs_simd_encode.sh | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) mode change 100644 => 100755 test/vp9_c_vs_simd_encode.sh diff --git a/test/vp9_c_vs_simd_encode.sh b/test/vp9_c_vs_simd_encode.sh old mode 100644 new mode 100755 index 76df049d9a..e3d3624ed4 --- a/test/vp9_c_vs_simd_encode.sh +++ b/test/vp9_c_vs_simd_encode.sh @@ -266,6 +266,11 @@ vp9_enc_test() { return 1 fi + # Enable armv8 test for real-time only + if [ "${preset}" = "good" ] && [ "${target}" = "armv8-linux-gcc" ]; then + continue + fi + for cpu in $(seq 0 $max_cpu_used); do for clip in ${TEST_CLIPS}; do for bitrate in ${TEST_BITRATES}; do @@ -388,16 +393,14 @@ vp9_c_vs_simd_enc_test () { fi fi - ##TODO(BUG=webm:1809): Enable testing for ARM after issues with NEON intrinsic - # are resolved. # Test ARM - # echo "vp9_test_arm: Started." - # vp9_test_arm - # if [ $? -eq 1 ]; then - # echo "vp9 test for arm: Done, test failed." - # else - # echo "vp9 test for arm: Done, all tests passed." - # fi + echo "vp9_test_arm: Started." + vp9_test_arm + if [ $? -eq 1 ]; then + echo "vp9 test for arm: Done, test failed." + else + echo "vp9 test for arm: Done, all tests passed." + fi } # Setup a trap function to clean up build, and output files after tests complete. From c8610c266c7f1a098b304709936f3b0782d36fc6 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 10 Aug 2023 15:33:54 +0100 Subject: [PATCH 789/926] Fix bug and re-enable vpx_int_pro_row/col_neon Fix a bug in vpx_int_pro_row_neon (increment pointer after peeled first loop iteration) and re-enable both vpx_int_pro_row/col_neon paths. Also fix IntProRowTest to use width_ (instead of 0) as the src_stride for the input data block. The test's use of 0 for src_stride is the reason the tests passed with the buggy Neon implementation noted in the listed bugs. (The old buggy Neon implementation fails the adjusted unit tests.) 
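The fix is easiest to see in reduced form: when the first group of rows is peeled out of the accumulation loop, the source pointer must be advanced before the remaining groups are read, otherwise the first rows are re-read and the last rows are never touched. A scalar C sketch of the pattern (hypothetical helper, not the Neon kernel itself):

    /* Accumulate `height` rows in groups of 4, with the first group peeled. */
    static int sum_rows_by_4(const uint8_t *ref, int ref_stride, int height) {
      int i, r, sum = 0;
      for (r = 0; r < 4; ++r) sum += ref[r * ref_stride];
      ref += 4 * ref_stride; /* the increment the buggy kernel was missing */
      for (i = 4; i < height; i += 4) {
        for (r = 0; r < 4; ++r) sum += ref[r * ref_stride];
        ref += 4 * ref_stride;
      }
      return sum;
    }

With a stride of 0 every load aliases the same row, so the missing increment produced identical sums and went undetected; passing width_ as the stride makes the rows distinct and lets the unit test expose the bug.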
BUG=webm:1800 BUG=webm:1809 Change-Id: I1f4572ee155653a7596fe2c10b5938ea7a3f63ae --- test/avg_test.cc | 37 +++++----- vpx_dsp/arm/avg_neon.c | 132 ++++++++++++++++++----------------- vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +- 3 files changed, 85 insertions(+), 90 deletions(-) diff --git a/test/avg_test.cc b/test/avg_test.cc index b885c4de4f..ede9c0ba8c 100644 --- a/test/avg_test.cc +++ b/test/avg_test.cc @@ -190,8 +190,9 @@ class IntProRowTest : public AverageTestBase, } void RunComparison() { - ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_)); - ASM_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_)); + ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, width_, height_)); + ASM_REGISTER_STATE_CHECK( + asm_func_(hbuf_asm_, source_data_, width_, height_)); EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16)) << "Output mismatch"; } @@ -681,25 +682,19 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(16, 16, 5, 4, &vpx_avg_4x4_neon), make_tuple(32, 32, 15, 4, &vpx_avg_4x4_neon))); -// Disabled neon optimization since it caused mismatch. See details in: -// https://bugs.chromium.org/p/webm/issues/detail?id=1809 -// INSTANTIATE_TEST_SUITE_P( -// NEON, IntProRowTest, -// ::testing::Values(make_tuple(16, &vpx_int_pro_row_neon, -// &vpx_int_pro_row_c), -// make_tuple(32, &vpx_int_pro_row_neon, -// &vpx_int_pro_row_c), make_tuple(64, -// &vpx_int_pro_row_neon, -// &vpx_int_pro_row_c))); -// -// INSTANTIATE_TEST_SUITE_P( -// NEON, IntProColTest, -// ::testing::Values(make_tuple(16, &vpx_int_pro_col_neon, -// &vpx_int_pro_col_c), -// make_tuple(32, &vpx_int_pro_col_neon, -// &vpx_int_pro_col_c), make_tuple(64, -// &vpx_int_pro_col_neon, -// &vpx_int_pro_col_c))); +INSTANTIATE_TEST_SUITE_P( + NEON, IntProRowTest, + ::testing::Values(make_tuple(16, &vpx_int_pro_row_neon, &vpx_int_pro_row_c), + make_tuple(32, &vpx_int_pro_row_neon, &vpx_int_pro_row_c), + make_tuple(64, &vpx_int_pro_row_neon, + &vpx_int_pro_row_c))); + +INSTANTIATE_TEST_SUITE_P( + NEON, IntProColTest, + ::testing::Values(make_tuple(16, &vpx_int_pro_col_neon, &vpx_int_pro_col_c), + make_tuple(32, &vpx_int_pro_col_neon, &vpx_int_pro_col_c), + make_tuple(64, &vpx_int_pro_col_neon, + &vpx_int_pro_col_c))); INSTANTIATE_TEST_SUITE_P(NEON, SatdLowbdTest, ::testing::Values(make_tuple(16, &vpx_satd_neon), diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c index 0cb102fdf6..1b17a326b4 100644 --- a/vpx_dsp/arm/avg_neon.c +++ b/vpx_dsp/arm/avg_neon.c @@ -67,71 +67,73 @@ int vpx_satd_neon(const tran_low_t *coeff, int length) { return horizontal_add_int32x4(vaddq_s32(sum_s32[0], sum_s32[1])); } -// void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, -// const int ref_stride, const int height) { -// int i; -// uint8x16_t r0, r1, r2, r3; -// uint16x8_t sum_lo[2], sum_hi[2]; -// uint16x8_t tmp_lo[2], tmp_hi[2]; -// int16x8_t avg_lo, avg_hi; -// -// const int norm_factor = (height >> 5) + 3; -// const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor); -// -// assert(height >= 4 && height % 4 == 0); -// -// r0 = vld1q_u8(ref + 0 * ref_stride); -// r1 = vld1q_u8(ref + 1 * ref_stride); -// r2 = vld1q_u8(ref + 2 * ref_stride); -// r3 = vld1q_u8(ref + 3 * ref_stride); -// -// sum_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); -// sum_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); -// sum_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); -// sum_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); -// -// for (i = 4; i < height; i += 4) { -// r0 = vld1q_u8(ref + 0 * ref_stride); -// 
r1 = vld1q_u8(ref + 1 * ref_stride); -// r2 = vld1q_u8(ref + 2 * ref_stride); -// r3 = vld1q_u8(ref + 3 * ref_stride); -// -// tmp_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); -// tmp_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); -// tmp_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); -// tmp_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); -// -// sum_lo[0] = vaddq_u16(sum_lo[0], tmp_lo[0]); -// sum_hi[0] = vaddq_u16(sum_hi[0], tmp_hi[0]); -// sum_lo[1] = vaddq_u16(sum_lo[1], tmp_lo[1]); -// sum_hi[1] = vaddq_u16(sum_hi[1], tmp_hi[1]); -// -// ref += 4 * ref_stride; -// } -// -// sum_lo[0] = vaddq_u16(sum_lo[0], sum_lo[1]); -// sum_hi[0] = vaddq_u16(sum_hi[0], sum_hi[1]); -// -// avg_lo = vshlq_s16(vreinterpretq_s16_u16(sum_lo[0]), neg_norm_factor); -// avg_hi = vshlq_s16(vreinterpretq_s16_u16(sum_hi[0]), neg_norm_factor); -// -// vst1q_s16(hbuf, avg_lo); -// vst1q_s16(hbuf + 8, avg_hi); -// } - -// int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) { -// uint16x8_t sum; -// int i; -// -// assert(width >= 16 && width % 16 == 0); -// -// sum = vpaddlq_u8(vld1q_u8(ref)); -// for (i = 16; i < width; i += 16) { -// sum = vpadalq_u8(sum, vld1q_u8(ref + i)); -// } -// -// return (int16_t)horizontal_add_uint16x8(sum); -// } +void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, + const int ref_stride, const int height) { + int i; + uint8x16_t r0, r1, r2, r3; + uint16x8_t sum_lo[2], sum_hi[2]; + uint16x8_t tmp_lo[2], tmp_hi[2]; + int16x8_t avg_lo, avg_hi; + + const int norm_factor = (height >> 5) + 3; + const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor); + + assert(height >= 4 && height % 4 == 0); + + r0 = vld1q_u8(ref + 0 * ref_stride); + r1 = vld1q_u8(ref + 1 * ref_stride); + r2 = vld1q_u8(ref + 2 * ref_stride); + r3 = vld1q_u8(ref + 3 * ref_stride); + + sum_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); + sum_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); + sum_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); + sum_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); + + ref += 4 * ref_stride; + + for (i = 4; i < height; i += 4) { + r0 = vld1q_u8(ref + 0 * ref_stride); + r1 = vld1q_u8(ref + 1 * ref_stride); + r2 = vld1q_u8(ref + 2 * ref_stride); + r3 = vld1q_u8(ref + 3 * ref_stride); + + tmp_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); + tmp_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); + tmp_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); + tmp_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); + + sum_lo[0] = vaddq_u16(sum_lo[0], tmp_lo[0]); + sum_hi[0] = vaddq_u16(sum_hi[0], tmp_hi[0]); + sum_lo[1] = vaddq_u16(sum_lo[1], tmp_lo[1]); + sum_hi[1] = vaddq_u16(sum_hi[1], tmp_hi[1]); + + ref += 4 * ref_stride; + } + + sum_lo[0] = vaddq_u16(sum_lo[0], sum_lo[1]); + sum_hi[0] = vaddq_u16(sum_hi[0], sum_hi[1]); + + avg_lo = vshlq_s16(vreinterpretq_s16_u16(sum_lo[0]), neg_norm_factor); + avg_hi = vshlq_s16(vreinterpretq_s16_u16(sum_hi[0]), neg_norm_factor); + + vst1q_s16(hbuf, avg_lo); + vst1q_s16(hbuf + 8, avg_hi); +} + +int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) { + uint16x8_t sum; + int i; + + assert(width >= 16 && width % 16 == 0); + + sum = vpaddlq_u8(vld1q_u8(ref)); + for (i = 16; i < width; i += 16) { + sum = vpadalq_u8(sum, vld1q_u8(ref + i)); + } + + return (int16_t)horizontal_add_uint16x8(sum); +} // ref, src = [0, 510] - max diff = 16-bits // bwl = {2, 3, 4}, width = {16, 32, 64} diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 
798bd93890..8033b4a81a 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -876,12 +876,10 @@ () specialize qw/vpx_satd avx2 sse2 neon msa/; } - # Disabled neon optimization since it caused mismatch. See details in: - # https://bugs.chromium.org/p/webm/issues/detail?id=1809 add_proto qw/void vpx_int_pro_row/, "int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height"; - specialize qw/vpx_int_pro_row sse2 msa/; + specialize qw/vpx_int_pro_row neon sse2 msa/; add_proto qw/int16_t vpx_int_pro_col/, "const uint8_t *ref, const int width"; - specialize qw/vpx_int_pro_col sse2 msa/; + specialize qw/vpx_int_pro_col neon sse2 msa/; add_proto qw/int vpx_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl"; specialize qw/vpx_vector_var neon sse2 msa/; From 335728c987b3164ff25c58c06d29eb49e19e21d4 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 11 Aug 2023 13:40:07 -0700 Subject: [PATCH 790/926] *quantize*.c: fix visual studio warnings after: 22818907d normalize *const in rtcd fixes warnings of the form: vpx_dsp\x86\quantize_avx.c(145): warning C4028: formal parameter 2 different from declaration Change-Id: I4dc423f11ec4a9171e18bdb6be2fa8dfb65ee61a --- vp9/encoder/x86/vp9_quantize_avx2.c | 8 ++++---- vp9/encoder/x86/vp9_quantize_sse2.c | 2 +- vp9/encoder/x86/vp9_quantize_ssse3.c | 4 ++-- vpx_dsp/quantize.c | 4 ++-- vpx_dsp/x86/quantize_avx.c | 4 ++-- vpx_dsp/x86/quantize_avx2.c | 4 ++-- vpx_dsp/x86/quantize_ssse3.c | 4 ++-- 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c index 62af3a9212..bf44b08674 100644 --- a/vp9/encoder/x86/vp9_quantize_avx2.c +++ b/vp9/encoder/x86/vp9_quantize_avx2.c @@ -102,7 +102,7 @@ static VPX_FORCE_INLINE void quantize_fp_16( } void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const struct macroblock_plane *mb_plane, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { @@ -206,7 +206,7 @@ static VPX_FORCE_INLINE void quantize_fp_32x32_16( } void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const struct macroblock_plane *mb_plane, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { @@ -326,7 +326,7 @@ static VPX_FORCE_INLINE void highbd_quantize_fp( } void vp9_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const struct macroblock_plane *mb_plane, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, @@ -391,7 +391,7 @@ static VPX_FORCE_INLINE void highbd_quantize_fp_32x32( void vp9_highbd_quantize_fp_32x32_avx2( const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { const int step = 8; diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c index 67f03eb310..2481eb366e 100644 --- a/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/vp9/encoder/x86/vp9_quantize_sse2.c @@ -21,7 +21,7 @@ #include 
"vp9/encoder/vp9_block.h" void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const struct macroblock_plane *mb_plane, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.c b/vp9/encoder/x86/vp9_quantize_ssse3.c index c94c8dbb4c..98decae749 100644 --- a/vp9/encoder/x86/vp9_quantize_ssse3.c +++ b/vp9/encoder/x86/vp9_quantize_ssse3.c @@ -21,7 +21,7 @@ #include "vp9/encoder/vp9_block.h" void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const struct macroblock_plane *mb_plane, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { @@ -120,7 +120,7 @@ void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const struct macroblock_plane *mb_plane, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index dee12bae76..fac9136f8c 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -166,7 +166,7 @@ void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const struct macroblock_plane *mb_plane, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { @@ -214,7 +214,7 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #endif void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, - const struct macroblock_plane *mb_plane, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index 98bf1686cb..5ff5abc110 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -23,7 +23,7 @@ #include "vp9/encoder/vp9_block.h" void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const struct macroblock_plane *mb_plane, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { @@ -139,7 +139,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, - const struct macroblock_plane *mb_plane, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c index 189b083f68..d4872f6bca 100644 --- a/vpx_dsp/x86/quantize_avx2.c +++ b/vpx_dsp/x86/quantize_avx2.c @@ -156,7 +156,7 @@ static VPX_FORCE_INLINE int16_t accumulate_eob256(__m256i eob256) { } void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const struct macroblock_plane *mb_plane, + const struct macroblock_plane 
*const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { @@ -253,7 +253,7 @@ static VPX_FORCE_INLINE __m256i quantize_b_32x32_16( } void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, - const struct macroblock_plane *mb_plane, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 7f085566dd..2c6d851a16 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -20,7 +20,7 @@ #include "vp9/encoder/vp9_block.h" void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const struct macroblock_plane *mb_plane, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { @@ -106,7 +106,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, - const struct macroblock_plane *mb_plane, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { From 8d2c357eab390923657113ead4567f70a026daf0 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 11 Aug 2023 15:48:19 -0700 Subject: [PATCH 791/926] fdct4x4_neon: fix compile w/cl Use an array for constant initialization rather than array syntax which assumes the underlying type is a vector. Fixes compile error with cl targeting Windows Arm64: vpx_dsp\arm\fdct4x4_neon.c(55,52): error C2078: too many initializers No change in assembly with gcc 12.2.0 & clang 14. Bug: b/277255390 Bug: webm:1810 Fixed: webm:1810 Change-Id: Ia30edcdbb45067dfe865b9958a5eecf1fd9ddfc8 --- vpx_dsp/arm/fdct4x4_neon.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vpx_dsp/arm/fdct4x4_neon.c b/vpx_dsp/arm/fdct4x4_neon.c index 3b9196fae9..4bc968ecba 100644 --- a/vpx_dsp/arm/fdct4x4_neon.c +++ b/vpx_dsp/arm/fdct4x4_neon.c @@ -52,7 +52,6 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, int stride) { - static const int32x4_t const_1000 = { 1, 0, 0, 0 }; const int32x4_t const_one = vdupq_n_s32(1); // input[M * stride] * 16 @@ -64,7 +63,8 @@ void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, // If the very first value != 0, then add 1. 
if (input[0] != 0) { - in[0] = vaddq_s32(in[0], const_1000); + static const int32_t k1000[4] = { 1, 0, 0, 0 }; + in[0] = vaddq_s32(in[0], vld1q_s32(k1000)); } vpx_highbd_fdct4x4_pass1_neon(in); From 6e2c3b9b3c529f7a6b8f3092cd39cea83f4220f0 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 15 Aug 2023 14:39:16 -0400 Subject: [PATCH 792/926] Add RC mode to vpx external RC interface Bug: b/295507002 Change-Id: Id2dd21482828ec64eef9abdf6a1cca83100d21ba --- vp9/vp9_cx_iface.c | 11 ++++++++++- vpx/vpx_ext_ratectrl.h | 14 ++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index cc2ae20d27..77d3fb7684 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1949,7 +1949,7 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, // TODO(angiebird): Check the possibility of this flag being set at pass == 1 if (oxcf->pass == 2) { const FRAME_INFO *frame_info = &cpi->frame_info; - vpx_rc_config_t ratectrl_config; + vpx_rc_config_t ratectrl_config = {}; vpx_codec_err_t codec_status; ratectrl_config.frame_width = frame_info->frame_width; @@ -1962,6 +1962,15 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, ratectrl_config.frame_rate_num = oxcf->g_timebase.den; ratectrl_config.frame_rate_den = oxcf->g_timebase.num; + if (oxcf->rc_mode == VPX_VBR) { + ratectrl_config.rc_mode = VPX_RC_VBR; + ratectrl_config.overshoot_percent = oxcf->over_shoot_pct; + ratectrl_config.undershoot_percent = oxcf->under_shoot_pct; + } else if (oxcf->rc_mode == VPX_Q) { + ratectrl_config.rc_mode = VPX_RC_QMODE; + } else { + return VPX_CODEC_INVALID_PARAM; + } codec_status = vp9_extrc_create(funcs, ratectrl_config, ext_ratectrl); if (codec_status != VPX_CODEC_OK) { return codec_status; diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index b93df11cd5..d755cff47b 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -48,6 +48,13 @@ typedef enum vpx_rc_type { VPX_RC_GOP_QP_RDMULT = VPX_RC_QP | VPX_RC_GOP | VPX_RC_RDMULT } vpx_rc_type_t; +/*!\brief The rate control mode for the external rate control model. + */ +typedef enum vpx_ext_rc_mode { + VPX_RC_QMODE = 0, + VPX_RC_VBR = 1, +} vpx_ext_rc_mode_t; + /*!\brief Abstract rate control model handler * * The encoder will receive the model handler from create_model() defined in @@ -305,6 +312,13 @@ typedef struct vpx_rc_config { int target_bitrate_kbps; int frame_rate_num; /**< numerator of frame rate */ int frame_rate_den; /**< denominator of frame rate */ + /*! + * The following fields are only for external rate control models that support + * different rate control modes. + */ + vpx_ext_rc_mode_t rc_mode; /**< Q mode or VBR mode */ + int overshoot_percent; /**< for VBR mode only */ + int undershoot_percent; /**< for VBR mode only */ } vpx_rc_config_t; /*!\brief Information passed to the external rate control model to From 58eed626d8c36f595de416fa4defa189eec8f831 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 16 Aug 2023 11:15:55 -0700 Subject: [PATCH 793/926] tools_common,die_codec(): output to stderr This function is used to report a failure, messages of this type should go to stderr. Change-Id: I0dee246dddc886a3278b247a770a356446658864 --- tools_common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools_common.c b/tools_common.c index 0de15558dd..0fcab2cf29 100644 --- a/tools_common.c +++ b/tools_common.c @@ -77,8 +77,8 @@ void warn(const char *fmt, ...) 
{ LOG_ERROR("Warning"); } void die_codec(vpx_codec_ctx_t *ctx, const char *s) { const char *detail = vpx_codec_error_detail(ctx); - printf("%s: %s\n", s, vpx_codec_error(ctx)); - if (detail) printf(" %s\n", detail); + fprintf(stderr, "%s: %s\n", s, vpx_codec_error(ctx)); + if (detail) fprintf(stderr, " %s\n", detail); exit(EXIT_FAILURE); } From 4b1ac3c23fba1d0eb38f0f2153017f4d13277039 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 16 Aug 2023 16:05:01 -0400 Subject: [PATCH 794/926] Extend ext RC mode to have CQ mode Also do not return error if it's not specified. Bug: b/295507002 Change-Id: Ib1f83551272bdde1bceff03554abc4c02d95ca09 --- vp9/vp9_cx_iface.c | 5 +++-- vpx/vpx_ext_ratectrl.h | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 77d3fb7684..a9f7431ba6 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1968,9 +1968,10 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, ratectrl_config.undershoot_percent = oxcf->under_shoot_pct; } else if (oxcf->rc_mode == VPX_Q) { ratectrl_config.rc_mode = VPX_RC_QMODE; - } else { - return VPX_CODEC_INVALID_PARAM; + } else if (oxcf->rc_mode == VPX_CQ) { + ratectrl_config.rc_mode = VPX_RC_CQ; } + codec_status = vp9_extrc_create(funcs, ratectrl_config, ext_ratectrl); if (codec_status != VPX_CODEC_OK) { return codec_status; diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index d755cff47b..ef96be6fff 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -53,6 +53,7 @@ typedef enum vpx_rc_type { typedef enum vpx_ext_rc_mode { VPX_RC_QMODE = 0, VPX_RC_VBR = 1, + VPX_RC_CQ = 2, } vpx_ext_rc_mode_t; /*!\brief Abstract rate control model handler From 87a467f35648581109394705032216915d9ed36e Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 17 Aug 2023 12:34:48 -0400 Subject: [PATCH 795/926] vp9 ext rc: Assign over/undershoot % for CQ mode Bug: b/295507002 Change-Id: Ie5b4dabc620f6d17c4039f186e0709d8e9479b47 --- vp9/vp9_cx_iface.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index a9f7431ba6..4c3169f86e 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1970,6 +1970,8 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, ratectrl_config.rc_mode = VPX_RC_QMODE; } else if (oxcf->rc_mode == VPX_CQ) { ratectrl_config.rc_mode = VPX_RC_CQ; + ratectrl_config.overshoot_percent = oxcf->over_shoot_pct; + ratectrl_config.undershoot_percent = oxcf->under_shoot_pct; } codec_status = vp9_extrc_create(funcs, ratectrl_config, ext_ratectrl); From 401d8f36beb3fa395457d01640407ab194d457d6 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 18 Aug 2023 09:10:00 -0700 Subject: [PATCH 796/926] vp9_cx_iface: fix code compatibility Remove '= {}' (C23 [1]) and use memset to clear a vpx_rc_config_t instance. after: 6e2c3b9b3 Add RC mode to vpx external RC interface Fixes compile with -pedantic and Microsoft's cl compiler. 
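A minimal sketch of the underlying language issue (illustrative struct
name only; the real code zeroes a vpx_rc_config_t):

    #include <string.h>

    struct cfg { int w, h; };

    void zero_examples(void) {
      /* struct cfg a = {}; */  /* empty braces only became standard C in
                                 * C23; -pedantic C90/C99/C11 rejects them */
      struct cfg b = { 0 };     /* portable brace form, valid since C89   */
      struct cfg c;
      memset(&c, 0, sizeof(c)); /* the form this change adopts            */
      (void)b;
      (void)c;
    }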
[1] https://en.cppreference.com/w/c/language/initialization Change-Id: I2019cdf0c42103cfc80b1e58c68b7596e497007f --- vp9/vp9_cx_iface.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 4c3169f86e..f5f246406e 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1949,8 +1949,9 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, // TODO(angiebird): Check the possibility of this flag being set at pass == 1 if (oxcf->pass == 2) { const FRAME_INFO *frame_info = &cpi->frame_info; - vpx_rc_config_t ratectrl_config = {}; + vpx_rc_config_t ratectrl_config; vpx_codec_err_t codec_status; + memset(&ratectrl_config, 0, sizeof(ratectrl_config)); ratectrl_config.frame_width = frame_info->frame_width; ratectrl_config.frame_height = frame_info->frame_height; From 80b1b5a7e99cd31d2962ba5b12d3d03d58ffe2ad Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 18 Aug 2023 12:36:45 -0700 Subject: [PATCH 797/926] vp8,ratectrl.c: fix integer overflow in calc_iframe_target_size(): vp8/encoder/ratectrl.c:349:31: runtime error: signed integer overflow: 38 * 343597280 cannot be represented in type 'int' Bug: chromium:1473473 Change-Id: Ie8f7b147efb27c92314df09837b66f7d97046883 --- vp8/encoder/ratectrl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c index 49ab4aa238..6f14322fdc 100644 --- a/vp8/encoder/ratectrl.c +++ b/vp8/encoder/ratectrl.c @@ -346,7 +346,8 @@ static void calc_iframe_target_size(VP8_COMP *cpi) { /* Minimal target size is |2* per_frame_bandwidth|. */ if (kf_boost < 16) kf_boost = 16; - target = ((16 + kf_boost) * cpi->per_frame_bandwidth) >> 4; + target = ((uint64_t)(16 + kf_boost) * cpi->per_frame_bandwidth) >> 4; + target = VPXMIN(INT_MAX, target); } if (cpi->oxcf.rc_max_intra_bitrate_pct) { From c7aa75ac5593cafe98073444cb29ffdc1ecba3e3 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 18 Aug 2023 14:04:33 -0700 Subject: [PATCH 798/926] vp9_calc_pframe_target_size_one_pass_cbr: fix int overflow vp9/encoder/vp9_ratectrl.c:2171:23: runtime error: signed integer overflow: 103079280 * -22 cannot be represented in type 'int' Bug: chromium:1473268 Change-Id: Ic1de7d48e74d94c2a992e53ec4382b5b44dba7af --- vp9/encoder/vp9_ratectrl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index c32745b4f8..16c47525fb 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -2168,12 +2168,12 @@ int vp9_calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { if (diff > 0) { // Lower the target bandwidth for this frame. const int pct_low = (int)VPXMIN(diff / one_pct_bits, oxcf->under_shoot_pct); - target -= (target * pct_low) / 200; + target -= (int)(((int64_t)target * pct_low) / 200); } else if (diff < 0) { // Increase the target bandwidth for this frame. 
const int pct_high = (int)VPXMIN(-diff / one_pct_bits, oxcf->over_shoot_pct); - target += (target * pct_high) / 200; + target += (int)(((int64_t)target * pct_high) / 200); } if (oxcf->rc_max_inter_bitrate_pct) { const int max_rate = From ade6905e3919322d6f324e8f66a85fc53029c545 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 17 Aug 2023 14:56:35 -0400 Subject: [PATCH 799/926] vp9 ext rc: copy under/overshoot% for all RC modes Bug: b/295507002 Change-Id: Ie4b302b82fa2d83e0be450cea60c59907b37f954 --- vp9/vp9_cx_iface.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index f5f246406e..dfc02deb2f 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1962,17 +1962,15 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, ratectrl_config.target_bitrate_kbps = (int)(oxcf->target_bandwidth / 1000); ratectrl_config.frame_rate_num = oxcf->g_timebase.den; ratectrl_config.frame_rate_den = oxcf->g_timebase.num; + ratectrl_config.overshoot_percent = oxcf->over_shoot_pct; + ratectrl_config.undershoot_percent = oxcf->under_shoot_pct; if (oxcf->rc_mode == VPX_VBR) { ratectrl_config.rc_mode = VPX_RC_VBR; - ratectrl_config.overshoot_percent = oxcf->over_shoot_pct; - ratectrl_config.undershoot_percent = oxcf->under_shoot_pct; } else if (oxcf->rc_mode == VPX_Q) { ratectrl_config.rc_mode = VPX_RC_QMODE; } else if (oxcf->rc_mode == VPX_CQ) { ratectrl_config.rc_mode = VPX_RC_CQ; - ratectrl_config.overshoot_percent = oxcf->over_shoot_pct; - ratectrl_config.undershoot_percent = oxcf->under_shoot_pct; } codec_status = vp9_extrc_create(funcs, ratectrl_config, ext_ratectrl); From e052ada7801c458f9fc0c2818f1be814f86e94a4 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 25 Aug 2023 10:56:23 -0400 Subject: [PATCH 800/926] Do not call ext rc functions when they're null Change-Id: Ie78afadd4ad5845e42bd4d5412703369f8d5e0f5 --- vp9/encoder/vp9_encoder.c | 9 ++++++--- vp9/encoder/vp9_firstpass.c | 6 ++++-- vp9/encoder/vp9_tpl_model.c | 3 ++- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index aaf42a2a3f..869d557dd3 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4563,7 +4563,8 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest } #endif // CONFIG_RATE_CTRL if (cpi->ext_ratectrl.ready && !ext_rc_recode && - (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0) { + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 && + cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) { vpx_codec_err_t codec_status; const GF_GROUP *gf_group = &cpi->twopass.gf_group; vpx_rc_encodeframe_decision_t encode_frame_decision; @@ -5575,7 +5576,8 @@ static void encode_frame_to_data_rate( // Backup to ensure consistency between recodes save_encode_params(cpi); if (cpi->ext_ratectrl.ready && - (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0) { + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0 && + cpi->ext_ratectrl.funcs.get_frame_rdmult != NULL) { vpx_codec_err_t codec_status; const GF_GROUP *gf_group = &cpi->twopass.gf_group; FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; @@ -5693,7 +5695,8 @@ static void encode_frame_to_data_rate( end_timing(cpi, vp9_pack_bitstream_time); #endif - if (cpi->ext_ratectrl.ready) { + if (cpi->ext_ratectrl.ready && + cpi->ext_ratectrl.funcs.update_encodeframe_result != NULL) { const RefCntBuffer *coded_frame_buf = get_ref_cnt_buffer(cm, 
cm->new_fb_idx); vpx_codec_err_t codec_status = vp9_extrc_update_encodeframe_result(
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index bd203f1e21..1e6f6f7b3b 100644 --- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c @@ -2769,7 +2769,8 @@ static void
define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { // are overwritten.
Specifically, |gop_coding_frames| and |use_alt_ref| // will be overwritten.
if (cpi->ext_ratectrl.ready && - (cpi->ext_ratectrl.funcs.rc_type &
VPX_RC_GOP) != 0) { + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 &&
+ cpi->ext_ratectrl.funcs.get_gop_decision != NULL) { vpx_codec_err_t
codec_status; vpx_rc_gop_decision_t gop_decision; vpx_rc_gop_info_t gop_info;
@@ -3506,7 +3507,8 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
FIRSTPASS_STATS this_frame; const int show_idx = cm->current_video_frame; -
if (cpi->common.current_frame_coding_index == 0) { + if
(cpi->common.current_frame_coding_index == 0 && +
cpi->ext_ratectrl.funcs.send_firstpass_stats != NULL) { const
vpx_codec_err_t codec_status = vp9_extrc_send_firstpass_stats( &
cpi->ext_ratectrl, &cpi->twopass.first_pass_info); if (codec_status !=
VPX_CODEC_OK) { diff --git a/vp9/encoder/vp9_tpl_model.c
b/vp9/encoder/vp9_tpl_model.c index 909b05292b..02318070c2 100644 ---
a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -1509,7
+1509,8 @@ void vp9_setup_tpl_stats(VP9_COMP *cpi) { // Qmode.
trim_tpl_stats(&cpi->common.error, &cpi->tpl_gop_stats,
extended_frame_count); - if (cpi->ext_ratectrl.ready) { + if
(cpi->ext_ratectrl.ready && + cpi->ext_ratectrl.funcs.send_tpl_gop_stats !=
NULL) { const vpx_codec_err_t codec_status =
vp9_extrc_send_tpl_stats(&cpi->ext_ratectrl, &cpi->tpl_gop_stats); if
(codec_status != VPX_CODEC_OK) { From
6da1bd01d64d3d246b633bf25c766dfe751345b7 Mon Sep 17 00:00:00 2001 From: Marco
Paniconi Date: Mon, 28 Aug 2023 12:09:30 -0700 Subject: [PATCH 801/926] vp9
svc: fix integer overflow

Overflow was happening in two places: one in set_encoder_config(), where
the input layer_target_bitrates are converted from kbps to bps, the other
in vp9_calc_pframe_target_size_one_pass_vbr(), where target is scaled by
kf_ratio.
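(For scale, with a 32-bit int: the kbps-to-bps conversion wraps once a
layer target exceeds INT_MAX / 1000 = 2,147,483 kbps, and the key-frame
scaling wraps once avg_frame_bandwidth exceeds INT_MAX / 25 = 85,899,345;
both sites below therefore compare against INT_MAX / multiplier first and
clamp to INT_MAX instead of multiplying.)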
vp9_ratectrl.c:2039: runtime error: signed integer overflow: -137438983 * 25 cannot be represented in type 'int' Bug: chromium:1475943 Change-Id: I1ab0980862548c8827fae461df9a7a74425209ff --- vp9/encoder/vp9_ratectrl.c | 6 +++++- vp9/vp9_cx_iface.c | 8 ++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 16c47525fb..fe7414687a 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -2036,7 +2036,11 @@ int vp9_calc_pframe_target_size_one_pass_vbr(const VP9_COMP *cpi) { int vp9_calc_iframe_target_size_one_pass_vbr(const VP9_COMP *cpi) { static const int kf_ratio = 25; const RATE_CONTROL *rc = &cpi->rc; - const int target = rc->avg_frame_bandwidth * kf_ratio; + int target = rc->avg_frame_bandwidth; + if (target > INT_MAX / kf_ratio) + target = INT_MAX; + else + target = rc->avg_frame_bandwidth * kf_ratio; return vp9_rc_clamp_iframe_target_size(cpi, target); } diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index dfc02deb2f..06c0ac1bfb 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -639,8 +639,12 @@ static vpx_codec_err_t set_encoder_config( for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { for (tl = 0; tl < oxcf->ts_number_layers; ++tl) { - oxcf->layer_target_bitrate[sl * oxcf->ts_number_layers + tl] = - 1000 * cfg->layer_target_bitrate[sl * oxcf->ts_number_layers + tl]; + const int layer = sl * oxcf->ts_number_layers + tl; + if (cfg->layer_target_bitrate[layer] > INT_MAX / 1000) + oxcf->layer_target_bitrate[layer] = INT_MAX; + else + oxcf->layer_target_bitrate[layer] = + 1000 * cfg->layer_target_bitrate[layer]; } } if (oxcf->ss_number_layers == 1 && oxcf->pass != 0) { From 7ee16bc1786674cfbd8982e43f4ee6932da464d1 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 31 Aug 2023 14:21:01 +0100 Subject: [PATCH 802/926] Simplify Neon MSE helper function params/return values Simplify the parameters and return values of the Neon MSE helper functions for both standard and high bitdepth - avoiding unused return values. 
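The shape of the refactor, sketched with the 8x8 case (this mirrors what
the VPX_MSE_WXH_NEON macro in the diff below expands to; other sizes are
analogous):

    /* Before: helpers took `unsigned int *sse`, wrote through it, and
     * returned the same value, leaving callers with an unused result.
     * After: helpers simply return the sum of squared errors, and only
     * the public entry point writes through the out-pointer. */
    unsigned int vpx_mse8x8_neon(const unsigned char *src_ptr, int src_stride,
                                 const unsigned char *ref_ptr, int ref_stride,
                                 unsigned int *sse) {
      *sse = vpx_mse8xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8);
      return *sse;
    }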
Change-Id: I6f9208f9ce890fbe58346d9c7d9d701f28f2f90f --- vpx_dsp/arm/highbd_variance_neon.c | 86 +++++++++++++----------------- vpx_dsp/arm/variance_neon.c | 36 +++++-------- 2 files changed, 52 insertions(+), 70 deletions(-) diff --git a/vpx_dsp/arm/highbd_variance_neon.c b/vpx_dsp/arm/highbd_variance_neon.c index 75fde676a0..e361f6f6f1 100644 --- a/vpx_dsp/arm/highbd_variance_neon.c +++ b/vpx_dsp/arm/highbd_variance_neon.c @@ -357,8 +357,7 @@ HIGHBD_GET_VAR(16) static INLINE uint32_t highbd_mse_wxh_neon(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, - int ref_stride, int w, int h, - unsigned int *sse) { + int ref_stride, int w, int h) { uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; int i = h; @@ -382,8 +381,7 @@ static INLINE uint32_t highbd_mse_wxh_neon(const uint16_t *src_ptr, ref_ptr += ref_stride; } while (--i != 0); - *sse = horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); - return *sse; + return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); } #if defined(__ARM_FEATURE_DOTPROD) @@ -391,8 +389,7 @@ static INLINE uint32_t highbd_mse_wxh_neon(const uint16_t *src_ptr, static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, - int ref_stride, int h, - unsigned int *sse) { + int ref_stride, int h) { uint32x4_t sse_u32 = vdupq_n_u32(0); int i = h / 2; @@ -416,15 +413,13 @@ static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr, sse_u32 = vdotq_u32(sse_u32, diff, diff); } while (--i != 0); - *sse = horizontal_add_uint32x4(sse_u32); - return *sse; + return horizontal_add_uint32x4(sse_u32); } static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, - int ref_stride, int h, - unsigned int *sse) { + int ref_stride, int h) { uint32x4_t sse_u32 = vdupq_n_u32(0); int i = h; @@ -447,8 +442,7 @@ static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr, ref_ptr += ref_stride; } while (--i != 0); - *sse = horizontal_add_uint32x4(sse_u32); - return *sse; + return horizontal_add_uint32x4(sse_u32); } #else // !defined(__ARM_FEATURE_DOTPROD) @@ -456,51 +450,47 @@ static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr, static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, - int ref_stride, int h, - unsigned int *sse) { - return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, h, - sse); + int ref_stride, int h) { + return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, h); } static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, - int ref_stride, int h, - unsigned int *sse) { - return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, h, - sse); + int ref_stride, int h) { + return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, h); } #endif // defined(__ARM_FEATURE_DOTPROD) -#define HIGHBD_MSE_WXH_NEON(w, h) \ - uint32_t vpx_highbd_8_mse##w##x##h##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - highbd_mse8_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse); \ - return *sse; \ - } \ - \ - uint32_t vpx_highbd_10_mse##w##x##h##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - uint16_t *src = 
CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \ - *sse = ROUND_POWER_OF_TWO(*sse, 4); \ - return *sse; \ - } \ - \ - uint32_t vpx_highbd_12_mse##w##x##h##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \ - *sse = ROUND_POWER_OF_TWO(*sse, 8); \ - return *sse; \ +#define HIGHBD_MSE_WXH_NEON(w, h) \ + uint32_t vpx_highbd_8_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + *sse = highbd_mse8_##w##xh_neon(src, src_stride, ref, ref_stride, h); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_10_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + *sse = highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h); \ + *sse = ROUND_POWER_OF_TWO(*sse, 4); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_12_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + *sse = highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h); \ + *sse = ROUND_POWER_OF_TWO(*sse, 8); \ + return *sse; \ } HIGHBD_MSE_WXH_NEON(16, 16) diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index 69ff1cf153..f41249d4d5 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -374,8 +374,7 @@ VARIANCE_WXH_NEON(64, 64, 12) static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, - int ref_stride, int h, - unsigned int *sse) { + int ref_stride, int h) { uint32x2_t sse_u32[2] = { vdup_n_u32(0), vdup_n_u32(0) }; int i = h / 2; @@ -398,15 +397,13 @@ static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr, sse_u32[1] = vdot_u32(sse_u32[1], diff1, diff1); } while (--i != 0); - *sse = horizontal_add_uint32x2(vadd_u32(sse_u32[0], sse_u32[1])); - return *sse; + return horizontal_add_uint32x2(vadd_u32(sse_u32[0], sse_u32[1])); } static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, - int ref_stride, int h, - unsigned int *sse) { + int ref_stride, int h) { uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; int i = h / 2; @@ -429,8 +426,7 @@ static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr, sse_u32[1] = vdotq_u32(sse_u32[1], diff1, diff1); } while (--i != 0); - *sse = horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); - return *sse; + return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); } unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, @@ -451,8 +447,7 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, - int ref_stride, int h, - unsigned int *sse) { + int 
ref_stride, int h) { uint32x4_t sse_u32[2] = { vdupq_n_u32(0),
vdupq_n_u32(0) }; int i = h / 2; @@ -478,15 +473,13 @@ static INLINE unsigned
int vpx_mse8xh_neon(const unsigned char *src_ptr, sse_u32[1] =
vpadalq_u16(sse_u32[1], sse1); } while (--i != 0); - *sse =
horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); - return *sse; +
return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); } static
INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr, int
src_stride, const unsigned char *ref_ptr, - int ref_stride, int h, -
unsigned int *sse) { + int ref_stride, int h) { uint32x4_t sse_u32[2] = {
vdupq_n_u32(0), vdupq_n_u32(0) }; int i = h; @@ -507,8 +500,7 @@ static
INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr,
sse_u32[1] = vpadalq_u16(sse_u32[1], sse1); } while (--i != 0); - *sse =
horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); - return *sse; +
return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); }
unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int
src_stride, @@ -538,12 +530,12 @@ unsigned int vpx_get4x4sse_cs_neon(const
unsigned char *src_ptr, int src_stride, #endif // defined(__ARM_FEATURE_DOTPROD)
-#define VPX_MSE_WXH_NEON(w, h) \ - unsigned int vpx_mse##w##x##h##_neon( \
- const unsigned char *src_ptr, int src_stride, \ - const unsigned char
*ref_ptr, int ref_stride, unsigned int *sse) { \ - return
vpx_mse##w##xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, h, \ - sse); \
+#define VPX_MSE_WXH_NEON(w, h) \ + unsigned int vpx_mse##w##x##h##_neon( \
+ const unsigned char *src_ptr, int src_stride, \ + const unsigned char
*ref_ptr, int ref_stride, unsigned int *sse) { \ + *sse =
vpx_mse##w##xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, h); \ + return
*sse; \ } VPX_MSE_WXH_NEON(8, 8) From
148d1085f79c6b4dd07d552cb51b53bd2e87a3aa Mon Sep 17 00:00:00 2001 From:
Jonathan Wright Date: Sat, 19 Aug 2023 12:45:36 +0100 Subject: [PATCH
803/926] Refactor and extend run-time CPU feature detection on Arm

1) Overhaul the Arm CPU feature detection code, taking inspiration from
similar recent changes in libaom.

2) Add neon_dotprod and neon_i8mm arch options in the configure, build
and unit test files, adding appropriate conditional options where
necessary.

3) Soft-enable run-time CPU feature detection by default for both 32-bit
and 64-bit Arm platforms.
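A sketch of the intended capability query (illustrative caller; the real
dispatch tables are generated by rtcd.pl):

    #include "vpx_ports/arm.h"

    void pick_kernel(void) {
      const int caps = arm_cpu_caps(); /* honors VPX_SIMD_CAPS{,_MASK} */
      if (caps & HAS_NEON_I8MM) {
        /* Armv8.6 i8mm path; implies dotprod per the flag restriction */
      } else if (caps & HAS_NEON_DOTPROD) {
        /* Armv8.4 dot-product path */
      } else if (caps & HAS_NEON) {
        /* baseline Neon path */
      }
    }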
Change-Id: I3f13317d88324acc5753394351188baa8d18a261 --- build/make/Makefile | 6 ++ build/make/configure.sh | 17 +++- build/make/rtcd.pl | 2 +- configure | 9 +- test/test_libvpx.cc | 24 ++++- vpx_ports/aarch32_cpudetect.c | 89 +++++++++++++++++ vpx_ports/aarch64_cpudetect.c | 173 ++++++++++++++++++++++++++++++++++ vpx_ports/arm.h | 12 +-- vpx_ports/arm_cpudetect.c | 154 ------------------------------ vpx_ports/arm_cpudetect.h | 52 ++++++++++ vpx_ports/vpx_ports.mk | 7 +- 11 files changed, 378 insertions(+), 167 deletions(-) create mode 100644 vpx_ports/aarch32_cpudetect.c create mode 100644 vpx_ports/aarch64_cpudetect.c delete mode 100644 vpx_ports/arm_cpudetect.c create mode 100644 vpx_ports/arm_cpudetect.h diff --git a/build/make/Makefile b/build/make/Makefile index 65ac2290c7..c2dc47ccff 100644 --- a/build/make/Makefile +++ b/build/make/Makefile @@ -143,6 +143,12 @@ $(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2 $(BUILD_PFX)%_avx512.c.d: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl $(BUILD_PFX)%_avx512.c.o: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl +# AARCH64 +$(BUILD_PFX)%_neon_dotprod.c.d: CFLAGS += -march=armv8.2-a+dotprod +$(BUILD_PFX)%_neon_dotprod.c.o: CFLAGS += -march=armv8.2-a+dotprod +$(BUILD_PFX)%_neon_i8mm.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm +$(BUILD_PFX)%_neon_i8mm.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm + # POWER $(BUILD_PFX)%_vsx.c.d: CFLAGS += -maltivec -mvsx $(BUILD_PFX)%_vsx.c.o: CFLAGS += -maltivec -mvsx diff --git a/build/make/configure.sh b/build/make/configure.sh index 7b2da3c1a1..9d3cd80cb3 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -973,10 +973,23 @@ process_common_toolchain() { # Process architecture variants case ${toolchain} in arm*) - # on arm, isa versions are supersets + soft_enable runtime_cpu_detect + # Arm ISA extensions are treated as supersets. case ${tgt_isa} in arm64|armv8) - soft_enable neon + for ext in ${ARCH_EXT_LIST_AARCH64}; do + # Disable higher order extensions to simplify dependencies. + if [ "$disable_exts" = "yes" ]; then + if ! 
disabled $ext; then + RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} " + disable_feature $ext + fi + elif disabled $ext; then + disable_exts="yes" + else + soft_enable $ext + fi + done ;; armv7|armv7s) soft_enable neon diff --git a/build/make/rtcd.pl b/build/make/rtcd.pl index f4edeaad51..1a6b93d5ae 100755 --- a/build/make/rtcd.pl +++ b/build/make/rtcd.pl @@ -487,7 +487,7 @@ () @ALL_ARCHS = filter(qw/neon_asm neon/); arm; } elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) { - @ALL_ARCHS = filter(qw/neon/); + @ALL_ARCHS = filter(qw/neon neon_dotprod neon_i8mm/); @REQUIRES = filter(qw/neon/); &require(@REQUIRES); arm; diff --git a/configure b/configure index aef65a8505..2c638e5e5a 100755 --- a/configure +++ b/configure @@ -252,6 +252,13 @@ ARCH_LIST=" ppc loongarch " + +ARCH_EXT_LIST_AARCH64=" + neon + neon_dotprod + neon_i8mm +" + ARCH_EXT_LIST_X86=" mmx sse @@ -271,8 +278,8 @@ ARCH_EXT_LIST_LOONGSON=" " ARCH_EXT_LIST=" - neon neon_asm + ${ARCH_EXT_LIST_AARCH64} mips32 dspr2 diff --git a/test/test_libvpx.cc b/test/test_libvpx.cc index 222a83f8c7..caab2dbd01 100644 --- a/test/test_libvpx.cc +++ b/test/test_libvpx.cc @@ -12,6 +12,9 @@ #include "third_party/googletest/src/include/gtest/gtest.h" #include "./vpx_config.h" +#if VPX_ARCH_ARM +#include "vpx_ports/arm.h" +#endif #if VPX_ARCH_X86 || VPX_ARCH_X86_64 #include "vpx_ports/x86.h" #endif @@ -26,7 +29,7 @@ extern void vpx_dsp_rtcd(); extern void vpx_scale_rtcd(); } -#if VPX_ARCH_X86 || VPX_ARCH_X86_64 +#if (!CONFIG_SHARED && VPX_ARCH_ARM) || VPX_ARCH_X86 || VPX_ARCH_X86_64 static void append_negative_gtest_filter(const char *str) { std::string filter = ::testing::FLAGS_gtest_filter; // Negative patterns begin with one '-' followed by a ':' separated list. @@ -34,11 +37,28 @@ static void append_negative_gtest_filter(const char *str) { filter += str; ::testing::FLAGS_gtest_filter = filter; } -#endif // VPX_ARCH_X86 || VPX_ARCH_X86_64 +#endif // (!CONFIG_SHARED && VPX_ARCH_ARM) || VPX_ARCH_X86 || VPX_ARCH_X86_64 int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); +#if !CONFIG_SHARED +#if VPX_ARCH_AARCH64 + const int caps = arm_cpu_caps(); + if (!(caps & HAS_NEON_DOTPROD)) { + append_negative_gtest_filter(":NEON_DOTPROD.*:NEON_DOTPROD/*"); + } + if (!(caps & HAS_NEON_I8MM)) { + append_negative_gtest_filter(":NEON_I8MM.*:NEON_I8MM/*"); + } +#elif VPX_ARCH_ARM + const int caps = arm_cpu_caps(); + if (!(caps & HAS_NEON)) { + append_negative_gtest_filter(":NEON.*:NEON/*"); + } +#endif // VPX_ARCH_ARM +#endif // !CONFIG_SHARED + #if VPX_ARCH_X86 || VPX_ARCH_X86_64 const int simd_caps = x86_simd_caps(); if (!(simd_caps & HAS_MMX)) append_negative_gtest_filter(":MMX.*:MMX/*"); diff --git a/vpx_ports/aarch32_cpudetect.c b/vpx_ports/aarch32_cpudetect.c new file mode 100644 index 0000000000..48bdc70f92 --- /dev/null +++ b/vpx_ports/aarch32_cpudetect.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +// Feature detection code for Armv7-A / AArch32. + +#include "arm_cpudetect.h" + +#if !CONFIG_RUNTIME_CPU_DETECT + +static int arm_get_cpu_caps(void) { + // This function should actually be a no-op. 
There is no way to adjust any of + // these because the RTCD tables do not
exist: the functions are called + // statically. + int flags = 0; +#if
HAVE_NEON + flags |= HAS_NEON; +#endif // HAVE_NEON + return flags; +} +
+#elif defined(_MSC_VER) // end !CONFIG_RUNTIME_CPU_DETECT + +static int
arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON || HAVE_NEON_ASM +
// MSVC has no inline __asm support for Arm, but it does let you __emit + //
instructions via their assembled hex code. + // All of these instructions
should be essentially nops. + __try { + // VORR q0,q0,q0 +
__emit(0xF2200150); + flags |= HAS_NEON; + } __except (GetExceptionCode() ==
EXCEPTION_ILLEGAL_INSTRUCTION) { + // Ignore exception. + } +#endif //
HAVE_NEON || HAVE_NEON_ASM + return flags; +} + +#elif
defined(ANDROID_USE_CPU_FEATURES_LIB) + +static int arm_get_cpu_caps(void) {
+ int flags = 0; +#if HAVE_NEON || HAVE_NEON_ASM + uint64_t features =
android_getCpuFeatures(); + if (features & ANDROID_CPU_ARM_FEATURE_NEON) { +
flags |= HAS_NEON; + } +#endif // HAVE_NEON || HAVE_NEON_ASM + return flags;
+} + +#elif defined(__linux__) // end defined(AOM_USE_ANDROID_CPU_FEATURES) +
+#include <sys/auxv.h> + +// Define hwcap values ourselves: building with an
old auxv header where these +// hwcap values are not defined should not
prevent features from being enabled. +#define VPX_AARCH32_HWCAP_NEON (1 <<
12) + +static int arm_get_cpu_caps(void) { + int flags = 0; + unsigned long
hwcap = getauxval(AT_HWCAP); +#if HAVE_NEON || HAVE_NEON_ASM + if (hwcap &
VPX_AARCH32_HWCAP_NEON) { + flags |= HAS_NEON; + } +#endif // HAVE_NEON ||
HAVE_NEON_ASM + return flags; +} +#else // end __linux__ +#error \ +
"Runtime CPU detection selected, but no CPU detection method available" \
+"for your platform. Rerun configure with --disable-runtime-cpu-detect."
+#endif + +int arm_cpu_caps(void) { + int flags = 0; + if
(arm_cpu_env_flags(&flags)) { + return flags; + } + return
arm_get_cpu_caps() & arm_cpu_env_mask(); +} diff --git
a/vpx_ports/aarch64_cpudetect.c b/vpx_ports/aarch64_cpudetect.c new file
mode 100644 index 0000000000..a3054ad717 --- /dev/null +++
b/vpx_ports/aarch64_cpudetect.c @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2023
The WebM project authors. All Rights Reserved. + * + * Use of this source
code is governed by a BSD-style license + * that can be found in the LICENSE
file in the root of the source + * tree. An additional intellectual property
rights grant can be found + * in the file PATENTS. All contributing project
authors may + * be found in the AUTHORS file in the root of the source tree.
+ */ + +#include "arm_cpudetect.h" + +#if defined(__APPLE__) +#include
<sys/sysctl.h> +#endif + +#if !CONFIG_RUNTIME_CPU_DETECT + +static int
arm_get_cpu_caps(void) { + // This function should actually be a no-op.
There is no way to adjust any of + // these because the RTCD tables do not
exist: the functions are called + // statically.
+ int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; +#endif // HAVE_NEON +
return flags; +} + +#elif defined(__APPLE__) // end
!CONFIG_RUNTIME_CPU_DETECT + +// sysctlbyname() parameter documentation for
instruction set characteristics: +//
https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics
+static INLINE int64_t have_feature(const char *feature) { + int64_t
feature_present = 0; + size_t size = sizeof(feature_present); + if
(sysctlbyname(feature, &feature_present, &size, NULL, 0) != 0) { + return 0;
+ } + return feature_present; +} + +static int arm_get_cpu_caps(void) { +
int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; +#endif // HAVE_NEON +#if
HAVE_NEON_DOTPROD + if (have_feature("hw.optional.arm.FEAT_DotProd")) { +
flags |= HAS_NEON_DOTPROD; + } +#endif // HAVE_NEON_DOTPROD +#if
HAVE_NEON_I8MM + if (have_feature("hw.optional.arm.FEAT_I8MM")) { + flags |=
HAS_NEON_I8MM; + } +#endif // HAVE_NEON_I8MM + return flags; +} + +#elif
defined(_MSC_VER) // end __APPLE__ + +static int arm_get_cpu_caps(void) { +
int flags = 0; +// IsProcessorFeaturePresent() parameter documentation: +//
https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-isprocessorfeaturepresent#parameters
+#if HAVE_NEON + flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A.
+#endif // HAVE_NEON +#if HAVE_NEON_DOTPROD +// Support for
PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE was added in Windows SDK +// 20348,
supported by Windows 11 and Windows Server 2022. +#if
defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) + if
(IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) { + flags
|= HAS_NEON_DOTPROD; + } +#endif //
defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) +#endif // HAVE_NEON_DOTPROD +
// No I8MM feature detection available on Windows at time of writing. +
return flags; +} + +#elif defined(ANDROID_USE_CPU_FEATURES_LIB) + +static
int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON + flags |=
HAS_NEON; // Neon is mandatory in Armv8.0-A. +#endif // HAVE_NEON + return
flags; +} + +#elif defined(__linux__) // end
defined(VPX_USE_ANDROID_CPU_FEATURES) + +#include <sys/auxv.h> + +// Define
hwcap values ourselves: building with an old auxv header where these +//
hwcap values are not defined should not prevent features from being enabled.
+#define VPX_AARCH64_HWCAP_ASIMDDP (1 << 20) +#define VPX_AARCH64_HWCAP2_I8MM
(1 << 13) + +static int arm_get_cpu_caps(void) { + int flags = 0; + unsigned
long hwcap = getauxval(AT_HWCAP); + unsigned long hwcap2 =
getauxval(AT_HWCAP2); +#if HAVE_NEON + flags |= HAS_NEON; // Neon is
mandatory in Armv8.0-A. +#endif // HAVE_NEON +#if HAVE_NEON_DOTPROD + if
(hwcap & VPX_AARCH64_HWCAP_ASIMDDP) { + flags |= HAS_NEON_DOTPROD; + }
+#endif // HAVE_NEON_DOTPROD +#if HAVE_NEON_I8MM + if (hwcap2 &
VPX_AARCH64_HWCAP2_I8MM) { + flags |= HAS_NEON_I8MM; + } +#endif //
HAVE_NEON_I8MM + return flags; +} + +#elif defined(__Fuchsia__) // end
__linux__ + +#include <zircon/features.h> +#include <zircon/syscalls.h> +
+// Added in https://fuchsia-review.googlesource.com/c/fuchsia/+/894282.
+#ifndef ZX_ARM64_FEATURE_ISA_I8MM +#define ZX_ARM64_FEATURE_ISA_I8MM
((uint32_t)(1u << 19)) +#endif + +static int arm_get_cpu_caps(void) { + int
flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; // Neon is mandatory in
Armv8.0-A.
+#endif // HAVE_NEON + uint32_t features; + zx_status_t status =
zx_system_get_features(ZX_FEATURE_KIND_CPU, &features); + if (status !=
ZX_OK) { + return flags; + } +#if HAVE_NEON_DOTPROD + if (features &
ZX_ARM64_FEATURE_ISA_DP) { + flags |= HAS_NEON_DOTPROD; + } +#endif //
HAVE_NEON_DOTPROD +#if HAVE_NEON_I8MM + if (features &
ZX_ARM64_FEATURE_ISA_I8MM) { + flags |= HAS_NEON_I8MM; + } +#endif //
HAVE_NEON_I8MM + return flags; +} + +#else // end __Fuchsia__ +#error \ +
"Runtime CPU detection selected, but no CPU detection method available" \
+"for your platform. Rerun configure with --disable-runtime-cpu-detect."
+#endif + +int arm_cpu_caps(void) { + int flags = 0; + if
(!arm_cpu_env_flags(&flags)) { + flags = arm_get_cpu_caps() &
arm_cpu_env_mask(); + } + + // Restrict flags: FEAT_I8MM assumes that
FEAT_DotProd is available. + if (!(flags & HAS_NEON_DOTPROD)) { + flags &=
~HAS_NEON_I8MM; + } + + return flags; +} diff --git a/vpx_ports/arm.h
b/vpx_ports/arm.h index 6458a2c5b0..65909d8260 100644 --- a/vpx_ports/arm.h
+++ b/vpx_ports/arm.h @@ -17,12 +17,12 @@ extern "C" { #endif -/*ARMv5TE
"Enhanced DSP" instructions.*/ -#define HAS_EDSP 0x01 -/*ARMv6 "Parallel" or
"Media" instructions.*/ -#define HAS_MEDIA 0x02 -/*ARMv7 optional NEON
instructions.*/ -#define HAS_NEON 0x04 +// Armv7-A optional Neon
instructions, mandatory from Armv8.0-A. +#define HAS_NEON (1 << 0) +//
Armv8.2-A optional Neon dot-product instructions, mandatory from Armv8.4-A.
+#define HAS_NEON_DOTPROD (1 << 1) +// Armv8.2-A optional Neon i8mm
instructions, mandatory from Armv8.6-A. +#define HAS_NEON_I8MM (1 << 2) int
arm_cpu_caps(void); diff --git a/vpx_ports/arm_cpudetect.c
b/vpx_ports/arm_cpudetect.c deleted file mode 100644 index
4f9d480ade..0000000000 --- a/vpx_ports/arm_cpudetect.c +++ /dev/null @@
-1,154 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All
Rights Reserved. - * - * Use of this source code is governed by a BSD-style
license - * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found - *
in the file PATENTS. All contributing project authors may - * be found in
the AUTHORS file in the root of the source tree. - */ - -#include <stdlib.h>
-#include <string.h> - -#include "./vpx_config.h" -#include
"vpx_ports/arm.h" - -#ifdef WINAPI_FAMILY -#include <winapifamily.h> -#if
!WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) -#define getenv(x) NULL
-#endif -#endif - -static int arm_cpu_env_flags(int *flags) { - char *env;
- env = getenv("VPX_SIMD_CAPS"); - if (env && *env) { - *flags =
(int)strtol(env, NULL, 0); - return 0; - } - *flags = 0; - return -1; -} -
-static int arm_cpu_env_mask(void) { - char *env; - env =
getenv("VPX_SIMD_CAPS_MASK"); - return env && *env ? (int)strtol(env, NULL,
0) : ~0; -} - -#if !CONFIG_RUNTIME_CPU_DETECT - -int arm_cpu_caps(void) { -
/* This function should actually be a no-op.
There is no way to adjust any of - * these because the RTCD tables do not exist: the functions are called - * statically */ - int flags; - int mask; - if (!arm_cpu_env_flags(&flags)) { - return flags; - } - mask = arm_cpu_env_mask(); -#if HAVE_NEON || HAVE_NEON_ASM - flags |= HAS_NEON; -#endif /* HAVE_NEON || HAVE_NEON_ASM */ - return flags & mask; -} - -#elif defined(_MSC_VER) /* end !CONFIG_RUNTIME_CPU_DETECT */ -/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/ -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#ifndef WIN32_EXTRA_LEAN -#define WIN32_EXTRA_LEAN -#endif -#include <windows.h> - -int arm_cpu_caps(void) { - int flags; - int mask; - if (!arm_cpu_env_flags(&flags)) { - return flags; - } - mask = arm_cpu_env_mask(); -/* MSVC has no inline __asm support for ARM, but it does let you __emit - * instructions via their assembled hex code. - * All of these instructions should be essentially nops. - */ -#if HAVE_NEON || HAVE_NEON_ASM - if (mask & HAS_NEON) { - __try { - /*VORR q0,q0,q0*/ - __emit(0xF2200150); - flags |= HAS_NEON; - } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) { - /*Ignore exception.*/ - } - } -#endif /* HAVE_NEON || HAVE_NEON_ASM */ - return flags & mask; -} - -#elif defined(__ANDROID__) /* end _MSC_VER */ -#include <cpu-features.h> - -int arm_cpu_caps(void) { - int flags; - int mask; - uint64_t features; - if (!arm_cpu_env_flags(&flags)) { - return flags; - } - mask = arm_cpu_env_mask(); - features = android_getCpuFeatures(); - -#if HAVE_NEON || HAVE_NEON_ASM - if (features & ANDROID_CPU_ARM_FEATURE_NEON) flags |= HAS_NEON; -#endif /* HAVE_NEON || HAVE_NEON_ASM */ - return flags & mask; -} - -#elif defined(__linux__) /* end __ANDROID__ */ - -#include <stdio.h> - -int arm_cpu_caps(void) { - FILE *fin; - int flags; - int mask; - if (!arm_cpu_env_flags(&flags)) { - return flags; - } - mask = arm_cpu_env_mask(); - /* Reading /proc/self/auxv would be easier, but that doesn't work reliably - * on Android. - * This also means that detection will fail in Scratchbox. - */ - fin = fopen("/proc/cpuinfo", "r"); - if (fin != NULL) { - /* 512 should be enough for anybody (it's even enough for all the flags - * that x86 has accumulated... so far). - */ - char buf[512]; - while (fgets(buf, 511, fin) != NULL) { -#if HAVE_NEON || HAVE_NEON_ASM - if (memcmp(buf, "Features", 8) == 0) { - char *p; - p = strstr(buf, " neon"); - if (p != NULL && (p[5] == ' ' || p[5] == '\n')) { - flags |= HAS_NEON; - } - } -#endif /* HAVE_NEON || HAVE_NEON_ASM */ - } - fclose(fin); - } - return flags & mask; -} -#else /* end __linux__ */ -#error \ - "--enable-runtime-cpu-detect selected, but no CPU detection method " \ -"available for your platform. Reconfigure with --disable-runtime-cpu-detect." -#endif diff --git a/vpx_ports/arm_cpudetect.h b/vpx_ports/arm_cpudetect.h new file mode 100644 index 0000000000..24095d1acf --- /dev/null +++ b/vpx_ports/arm_cpudetect.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree.
+ */ + +#include <stdlib.h> +#include <string.h> + +#include "vpx_config.h" +#include "vpx_ports/arm.h" + +#if defined(_MSC_VER) +#undef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#undef WIN32_EXTRA_LEAN +#define WIN32_EXTRA_LEAN +#include <windows.h> +#endif + +#ifdef WINAPI_FAMILY +#include <winapifamily.h> +#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +#define getenv(x) NULL +#endif +#endif + +#if defined(__ANDROID__) && (__ANDROID_API__ < 18) +#define ANDROID_USE_CPU_FEATURES_LIB 1 +// Use getauxval() when targeting (64-bit) Android with API level >= 18. +// getauxval() is supported since Android API level 18 (Android 4.3.) +// First Android version with 64-bit support was Android 5.x (API level 21). +#include <cpu-features.h> +#endif + +static INLINE int arm_cpu_env_flags(int *flags) { + const char *env = getenv("VPX_SIMD_CAPS"); + if (env && *env) { + *flags = (int)strtol(env, NULL, 0); + return 1; + } + return 0; +} + +static INLINE int arm_cpu_env_mask(void) { + const char *env = getenv("VPX_SIMD_CAPS_MASK"); + return env && *env ? (int)strtol(env, NULL, 0) : ~0; +} diff --git a/vpx_ports/vpx_ports.mk b/vpx_ports/vpx_ports.mk index e30e87cefb..93279dbebc 100644 --- a/vpx_ports/vpx_ports.mk +++ b/vpx_ports/vpx_ports.mk @@ -36,7 +36,12 @@ PORTS_SRCS-yes += x86.h PORTS_SRCS-yes += x86_abi_support.asm endif -PORTS_SRCS-$(VPX_ARCH_ARM) += arm_cpudetect.c +ifeq ($(VPX_ARCH_AARCH64),yes) +PORTS_SRCS-yes += aarch64_cpudetect.c +else +PORTS_SRCS-$(VPX_ARCH_ARM) += aarch32_cpudetect.c +endif +PORTS_SRCS-$(VPX_ARCH_ARM) += arm_cpudetect.h PORTS_SRCS-$(VPX_ARCH_ARM) += arm.h PORTS_SRCS-$(VPX_ARCH_PPC) += ppc_cpudetect.c From 91158c99f7bc241bf70bca597f289c681c71956b Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Sat, 19 Aug 2023 15:24:12 +0100 Subject: [PATCH 804/926] Use run-time CPU feature detection for vpx_convolve8_neon Arm Neon DotProd and I8MM implementations of vpx_convolve8* currently need to be enabled at compile time since they're guarded by ifdef feature macros. Now that run-time feature detection has been enabled for Arm platforms, expose these implementations with distinct *neon_dotprod/*neon_i8mm names in separate files and wire them up to the build system and rtcd.pl. Also add new test cases for the new DotProd and I8MM functions.
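As a rough sketch of what this wiring amounts to (illustrative only: the real dispatch code is generated by rtcd.pl into vpx_dsp_rtcd.h, and choose_convolve8_horiz below is a hypothetical helper, not part of this patch), the generated setup queries arm_cpu_caps() once and installs the most capable variant that both the build and the running CPU support:

#include "./vpx_dsp_rtcd.h"      // prototypes for the vpx_convolve8_*_neon* variants
#include "vpx_dsp/vpx_filter.h"  // InterpKernel
#include "vpx_ports/arm.h"       // arm_cpu_caps(), HAS_NEON_DOTPROD, HAS_NEON_I8MM

typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const InterpKernel *filter, int x0_q4,
                              int x_step_q4, int y0_q4, int y_step_q4, int w,
                              int h);

// Hypothetical helper mirroring the rtcd-style selection logic.
static convolve_fn_t choose_convolve8_horiz(void) {
  const int flags = arm_cpu_caps();
  convolve_fn_t fn = vpx_convolve8_horiz_neon;  // Armv8.0-A Neon baseline.
#if HAVE_NEON_DOTPROD
  if (flags & HAS_NEON_DOTPROD) fn = vpx_convolve8_horiz_neon_dotprod;
#endif
#if HAVE_NEON_I8MM
  if (flags & HAS_NEON_I8MM) fn = vpx_convolve8_horiz_neon_i8mm;
#endif
  return fn;
}

The new suites can be exercised in isolation on capable hardware, e.g. ./test_libvpx --gtest_filter='NEON_DOTPROD/*', and detection can still be narrowed through the environment hooks kept in arm_cpudetect.h (for example VPX_SIMD_CAPS_MASK=0x1 restricts the library to plain Neon).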
Change-Id: I3db3cd62e8596099d9fec7805ca3ee86b2a01c74 --- test/convolve_test.cc | 30 + vpx_dsp/arm/vpx_convolve8_neon.c | 1427 ---------------------- vpx_dsp/arm/vpx_convolve8_neon.h | 22 +- vpx_dsp/arm/vpx_convolve8_neon_dotprod.c | 777 ++++++++++++ vpx_dsp/arm/vpx_convolve8_neon_i8mm.c | 698 +++++++++++ vpx_dsp/arm/vpx_convolve_neon.c | 55 - vpx_dsp/arm/vpx_convolve_neon_dotprod.c | 60 + vpx_dsp/arm/vpx_convolve_neon_i8mm.c | 60 + vpx_dsp/vpx_dsp.mk | 4 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 12 +- 10 files changed, 1648 insertions(+), 1497 deletions(-) create mode 100644 vpx_dsp/arm/vpx_convolve8_neon_dotprod.c create mode 100644 vpx_dsp/arm/vpx_convolve8_neon_i8mm.c create mode 100644 vpx_dsp/arm/vpx_convolve_neon_dotprod.c create mode 100644 vpx_dsp/arm/vpx_convolve_neon_i8mm.c diff --git a/test/convolve_test.cc b/test/convolve_test.cc index 4d27c5ffcf..ffd5c41c63 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -1423,6 +1423,36 @@ INSTANTIATE_TEST_SUITE_P(NEON, ConvolveTest, ::testing::ValuesIn(kArrayConvolve_neon)); #endif // HAVE_NEON +#if HAVE_NEON_DOTPROD +const ConvolveFunctions convolve8_neon_dotprod( + vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_neon_dotprod, + vpx_convolve8_avg_horiz_neon_dotprod, vpx_convolve8_vert_neon_dotprod, + vpx_convolve8_avg_vert_neon_dotprod, vpx_convolve8_neon_dotprod, + vpx_convolve8_avg_neon_dotprod, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, + vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c, + vpx_scaled_avg_2d_c, 0); + +const ConvolveParam kArrayConvolve_neon_dotprod[] = { ALL_SIZES( + convolve8_neon_dotprod) }; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_neon_dotprod)); +#endif // HAVE_NEON_DOTPROD + +#if HAVE_NEON_I8MM +const ConvolveFunctions convolve8_neon_i8mm( + vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_neon_i8mm, + vpx_convolve8_avg_horiz_neon_i8mm, vpx_convolve8_vert_neon_i8mm, + vpx_convolve8_avg_vert_neon_i8mm, vpx_convolve8_neon_i8mm, + vpx_convolve8_avg_neon_i8mm, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, + vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c, + vpx_scaled_avg_2d_c, 0); + +const ConvolveParam kArrayConvolve_neon_i8mm[] = { ALL_SIZES( + convolve8_neon_i8mm) }; +INSTANTIATE_TEST_SUITE_P(NEON_I8MM, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_neon_i8mm)); +#endif // HAVE_NEON_I8MM + #if HAVE_DSPR2 const ConvolveFunctions convolve8_dspr2( vpx_convolve_copy_dspr2, vpx_convolve_avg_dspr2, vpx_convolve8_horiz_dspr2, diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index 505d0672f0..8b89862ba9 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -32,1429 +32,6 @@ // instructions. This optimization is much faster in speed unit test, but slowed // down the whole decoder by 5%. 
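Everything removed below sat behind compile-time guards, so a given build shipped exactly one body (I8MM usdot, DotProd sdot, or baseline Neon) under the shared vpx_convolve8_* names. A condensed view of the guard structure being retired, for orientation only (not part of the diff):

/* Before: one set of symbols, implementation fixed when the library is
 * built. */
#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
/* usdot (I8MM) bodies for the vpx_convolve8_*_neon functions */
#elif VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
/* sdot (DotProd) bodies for the same names */
#else
/* baseline Neon bodies */
#endif

/* After: the *_neon, *_neon_dotprod and *_neon_i8mm variants are compiled
 * side by side (whenever the toolchain supports the extensions) and the
 * rtcd dispatcher picks one at run time based on arm_cpu_caps(). */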
-#if VPX_ARCH_AARCH64 && \ - (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)) - -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { - 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, - 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, - 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 -}; - -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = { - 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, - 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 -}; - -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { - /* Shift left and insert new last column in transposed 4x4 block. */ - 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, - /* Shift left and insert two new columns in transposed 4x4 block. */ - 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, - /* Shift left and insert three new columns in transposed 4x4 block. */ - 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 -}; - -#if defined(__ARM_FEATURE_MATMUL_INT8) - -void vpx_convolve8_2d_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); - uint8x16_t s0, s1, s2, s3; - - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(x_step_q4 == 16); - assert(h % 4 == 3); - - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; - - src -= 3; - - if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - - do { - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve8_4_usdot(s0, filters, perm_tbl); - d1 = convolve8_4_usdot(s1, filters, perm_tbl); - d2 = convolve8_4_usdot(s2, filters, perm_tbl); - d3 = convolve8_4_usdot(s3, filters, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 3); - - /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for - * further details on possible values of block height. */ - load_u8_16x3(src, src_stride, &s0, &s1, &s2); - - d0 = convolve8_4_usdot(s0, filters, perm_tbl); - d1 = convolve8_4_usdot(s1, filters, perm_tbl); - d2 = convolve8_4_usdot(s2, filters, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8_4x1(dst + 2 * dst_stride, d23); - } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; - - do { - width = w; - s = src; - d = dst; - do { - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve8_8_usdot(s0, filters, perm_tbl); - d1 = convolve8_8_usdot(s1, filters, perm_tbl); - d2 = convolve8_8_usdot(s2, filters, perm_tbl); - d3 = convolve8_8_usdot(s3, filters, perm_tbl); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width > 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 3); - - /* Process final three rows (h % 4 == 3). 
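The two-pass - * 2D path hands this helper an intermediate height of h + 7 (the 8-tap - * vertical stage needs three rows above and four rows below the block), and - * block heights are multiples of four, hence the three-row tail.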
See vpx_convolve_neon.c for - * further details on possible values of block height. */ - width = w; - s = src; - d = dst; - do { - load_u8_16x3(s, src_stride, &s0, &s1, &s2); - - d0 = convolve8_8_usdot(s0, filters, perm_tbl); - d1 = convolve8_8_usdot(s1, filters, perm_tbl); - d2 = convolve8_8_usdot(s2, filters, perm_tbl); - - store_u8_8x3(d, dst_stride, d0, d1, d2); - - s += 8; - d += 8; - width -= 8; - } while (width > 0); - } -} - -void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); - uint8x16_t s0, s1, s2, s3; - - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(x_step_q4 == 16); - - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; - - src -= 3; - - if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - do { - int16x4_t t0, t1, t2, t3; - uint8x8_t d01, d23; - - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - t0 = convolve8_4_usdot(s0, filters, perm_tbl); - t1 = convolve8_4_usdot(s1, filters, perm_tbl); - t2 = convolve8_4_usdot(s2, filters, perm_tbl); - t3 = convolve8_4_usdot(s3, filters, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; - - do { - width = w; - s = src; - d = dst; - do { - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve8_8_usdot(s0, filters, perm_tbl); - d1 = convolve8_8_usdot(s1, filters, perm_tbl); - d2 = convolve8_8_usdot(s2, filters, perm_tbl); - d3 = convolve8_8_usdot(s3, filters, perm_tbl); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } -} - -void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, - int w, int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); - uint8x16_t s0, s1, s2, s3; - - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(x_step_q4 == 16); - - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; - - src -= 3; - - if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - do { - int16x4_t t0, t1, t2, t3; - uint8x8_t d01, d23, dd01, dd23; - - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - t0 = convolve8_4_usdot(s0, filters, perm_tbl); - t1 = convolve8_4_usdot(s1, filters, perm_tbl); - t2 = convolve8_4_usdot(s2, filters, perm_tbl); - t3 = convolve8_4_usdot(s3, filters, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); - - dd01 = load_u8(dst + 0 * dst_stride, dst_stride); - dd23 = load_u8(dst + 2 * dst_stride, dst_stride); - - d01 = vrhadd_u8(d01, dd01); - d23 = vrhadd_u8(d23, dd23); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - 
dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; - - do { - width = w; - s = src; - d = dst; - do { - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve8_8_usdot(s0, filters, perm_tbl); - d1 = convolve8_8_usdot(s1, filters, perm_tbl); - d2 = convolve8_8_usdot(s2, filters, perm_tbl); - d3 = convolve8_8_usdot(s3, filters, perm_tbl); - - load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); - - d0 = vrhadd_u8(d0, dd0); - d1 = vrhadd_u8(d1, dd1); - d2 = vrhadd_u8(d2, dd2); - d3 = vrhadd_u8(d3, dd3); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } -} - -static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, - uint8x8_t a2, uint8x8_t a3, - uint8x16_t *b, - const uint8x16_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, XX, XX, XX, XX - * a1: 10, 11, 12, 13, XX, XX, XX, XX - * a2: 20, 21, 22, 23, XX, XX, XX, XX - * a3: 30, 31, 32, 33, XX, XX, XX, XX - * - * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ - - uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; - *b = vqtbl2q_u8(samples, permute_tbl); -} - -static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, - uint8x8_t a2, uint8x8_t a3, - uint8x16_t *b0, uint8x16_t *b1, - const uint8x16x2_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, 04, 05, 06, 07 - * a1: 10, 11, 12, 13, 14, 15, 16, 17 - * a2: 20, 21, 22, 23, 24, 25, 26, 27 - * a3: 30, 31, 32, 33, 34, 35, 36, 37 - * - * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. 
- */ - - uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; - *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]); - *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]); -} - -void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint8x16x2_t samples_LUT; - - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(y_step_q4 == 16); - - (void)x0_q4; - (void)x_step_q4; - (void)y_step_q4; - - src -= 3 * src_stride; - - if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - - load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); - src += 7 * src_stride; - - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); - - do { - load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); - - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_4_usdot_partial(s0123, s4567, filters); - d1 = convolve8_4_usdot_partial(s1234, s5678, filters); - d2 = convolve8_4_usdot_partial(s2345, s6789, filters); - d3 = convolve8_4_usdot_partial(s3456, s78910, filters); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. 
*/ - s0123 = s4567; - s1234 = s5678; - s2345 = s6789; - s3456 = s78910; - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3; - const uint8_t *s; - uint8_t *d; - int height; - - do { - height = h; - s = src; - d = dst; - - load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); - s += 7 * src_stride; - - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); - - do { - load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); - - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - - samples_LUT.val[0] = s3456_hi; - samples_LUT.val[1] = s78910_hi; - s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - filters); - d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - filters); - d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - filters); - d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - filters); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. 
*/ - s0123_lo = s4567_lo; - s0123_hi = s4567_hi; - s1234_lo = s5678_lo; - s1234_hi = s5678_hi; - s2345_lo = s6789_lo; - s2345_hi = s6789_hi; - s3456_lo = s78910_lo; - s3456_hi = s78910_hi; - - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height != 0); - src += 8; - dst += 8; - w -= 8; - } while (w != 0); - } -} - -void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint8x16x2_t samples_LUT; - - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(y_step_q4 == 16); - - (void)x0_q4; - (void)x_step_q4; - (void)y_step_q4; - - src -= 3 * src_stride; - - if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23, dd01, dd23; - - load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); - src += 7 * src_stride; - - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); - - do { - load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); - - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_4_usdot_partial(s0123, s4567, filters); - d1 = convolve8_4_usdot_partial(s1234, s5678, filters); - d2 = convolve8_4_usdot_partial(s2345, s6789, filters); - d3 = convolve8_4_usdot_partial(s3456, s78910, filters); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - - dd01 = load_u8(dst + 0 * dst_stride, dst_stride); - dd23 = load_u8(dst + 2 * dst_stride, dst_stride); - - d01 = vrhadd_u8(d01, dd01); - d23 = vrhadd_u8(d23, dd23); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. 
*/ - s0123 = s4567; - s1234 = s5678; - s2345 = s6789; - s3456 = s78910; - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; - const uint8_t *s; - uint8_t *d; - int height; - - do { - height = h; - s = src; - d = dst; - - load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); - s += 7 * src_stride; - - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); - - do { - load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); - - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - - samples_LUT.val[0] = s3456_hi; - samples_LUT.val[1] = s78910_hi; - s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - filters); - d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - filters); - d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - filters); - d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - filters); - - load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); - - d0 = vrhadd_u8(d0, dd0); - d1 = vrhadd_u8(d1, dd1); - d2 = vrhadd_u8(d2, dd2); - d3 = vrhadd_u8(d3, dd3); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. 
*/ - s0123_lo = s4567_lo; - s0123_hi = s4567_hi; - s1234_lo = s5678_lo; - s1234_hi = s5678_hi; - s2345_lo = s6789_lo; - s2345_hi = s6789_hi; - s3456_lo = s78910_lo; - s3456_hi = s78910_hi; - - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height != 0); - src += 8; - dst += 8; - w -= 8; - } while (w != 0); - } -} - -#else // !defined(__ARM_FEATURE_MATMUL_INT8) - -void vpx_convolve8_2d_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x16_t range_limit = vdupq_n_u8(128); - uint8x16_t s0, s1, s2, s3; - - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(x_step_q4 == 16); - assert(h % 4 == 3); - - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; - - src -= 3; - - if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - - do { - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); - d1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); - d2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); - d3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 3); - - /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for - * further details on possible values of block height. */ - load_u8_16x3(src, src_stride, &s0, &s1, &s2); - - d0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); - d1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); - d2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8_4x1(dst + 2 * dst_stride, d23); - } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; - - do { - width = w; - s = src; - d = dst; - do { - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); - d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); - d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); - d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 3); - - /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for - * further details on possible values of block height. 
*/ - width = w; - s = src; - d = dst; - do { - load_u8_16x3(s, src_stride, &s0, &s1, &s2); - - d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); - d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); - d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); - - store_u8_8x3(d, dst_stride, d0, d1, d2); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - } -} - -void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x16_t range_limit = vdupq_n_u8(128); - uint8x16_t s0, s1, s2, s3; - - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(x_step_q4 == 16); - - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; - - src -= 3; - - if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - do { - int16x4_t t0, t1, t2, t3; - uint8x8_t d01, d23; - - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); - t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); - t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); - t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; - - do { - width = w; - s = src; - d = dst; - do { - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); - d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); - d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); - d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } -} - -void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, - int w, int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x16_t range_limit = vdupq_n_u8(128); - uint8x16_t s0, s1, s2, s3; - - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(x_step_q4 == 16); - - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; - - src -= 3; - - if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - do { - int16x4_t t0, t1, t2, t3; - uint8x8_t d01, d23, dd01, dd23; - - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - t0 = 
convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); - t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); - t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); - t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); - - dd01 = load_u8(dst + 0 * dst_stride, dst_stride); - dd23 = load_u8(dst + 2 * dst_stride, dst_stride); - - d01 = vrhadd_u8(d01, dd01); - d23 = vrhadd_u8(d23, dd23); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; - - do { - width = w; - s = src; - d = dst; - do { - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); - d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); - d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); - d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); - - load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); - - d0 = vrhadd_u8(d0, dd0); - d1 = vrhadd_u8(d1, dd1); - d2 = vrhadd_u8(d2, dd2); - d3 = vrhadd_u8(d3, dd3); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } -} - -static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, - int8x8_t a3, int8x16_t *b, - const uint8x16_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, XX, XX, XX, XX - * a1: 10, 11, 12, 13, XX, XX, XX, XX - * a2: 20, 21, 22, 23, XX, XX, XX, XX - * a3: 30, 31, 32, 33, XX, XX, XX, XX - * - * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ - - int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; - *b = vqtbl2q_s8(samples, permute_tbl); -} - -static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, - int8x8_t a3, int8x16_t *b0, - int8x16_t *b1, - const uint8x16x2_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, 04, 05, 06, 07 - * a1: 10, 11, 12, 13, 14, 15, 16, 17 - * a2: 20, 21, 22, 23, 24, 25, 26, 27 - * a3: 30, 31, 32, 33, 34, 35, 36, 37 - * - * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. 
- */ - - int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; - *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]); - *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); -} - -void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x8_t range_limit = vdup_n_u8(128); - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t t0, t1, t2, t3, t4, t5, t6; - int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int8x16x2_t samples_LUT; - - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(y_step_q4 == 16); - - (void)x0_q4; - (void)x_step_q4; - (void)y_step_q4; - - src -= 3 * src_stride; - - if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - - load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - src += 7 * src_stride; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); - - do { - uint8x8_t t7, t8, t9, t10; - - load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); - - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); - - /* Merge new data into block from previous iteration. 
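The - * vqtbl2q_s8 lookups below use merge_block_tbl to shift the previous - * transposed window left and splice in columns from the four incoming rows, - * so each iteration loads only four new source rows rather than redoing the - * whole transpose.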
*/ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters); - d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); - d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); - d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ - s0123 = s4567; - s1234 = s5678; - s2345 = s6789; - s3456 = s78910; - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3; - const uint8_t *s; - uint8_t *d; - int height; - - do { - height = h; - s = src; - d = dst; - - load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - s += 7 * src_stride; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); - - do { - uint8x8_t t7, t8, t9, t10; - - load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); - - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); - - /* Merge new data into block from previous iteration. 
*/ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - samples_LUT.val[0] = s3456_hi; - samples_LUT.val[1] = s78910_hi; - s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - correction, filters); - d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - correction, filters); - d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - correction, filters); - d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - correction, filters); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ - s0123_lo = s4567_lo; - s0123_hi = s4567_hi; - s1234_lo = s5678_lo; - s1234_hi = s5678_hi; - s2345_lo = s6789_lo; - s2345_hi = s6789_hi; - s3456_lo = s78910_lo; - s3456_hi = s78910_hi; - - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height != 0); - src += 8; - dst += 8; - w -= 8; - } while (w != 0); - } -} - -void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x8_t range_limit = vdup_n_u8(128); - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t t0, t1, t2, t3, t4, t5, t6; - int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int8x16x2_t samples_LUT; - - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(y_step_q4 == 16); - - (void)x0_q4; - (void)x_step_q4; - (void)y_step_q4; - - src -= 3 * src_stride; - - if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23, dd01, dd23; - - load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - src += 7 * src_stride; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. 
- */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); - - do { - uint8x8_t t7, t8, t9, t10; - - load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); - - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters); - d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); - d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); - d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - - dd01 = load_u8(dst + 0 * dst_stride, dst_stride); - dd23 = load_u8(dst + 2 * dst_stride, dst_stride); - - d01 = vrhadd_u8(d01, dd01); - d23 = vrhadd_u8(d23, dd23); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ - s0123 = s4567; - s1234 = s5678; - s2345 = s6789; - s3456 = s78910; - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; - const uint8_t *s; - uint8_t *d; - int height; - - do { - height = h; - s = src; - d = dst; - - load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - s += 7 * src_stride; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. 
- */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); - - do { - uint8x8_t t7, t8, t9, t10; - - load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); - - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - samples_LUT.val[0] = s3456_hi; - samples_LUT.val[1] = s78910_hi; - s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - correction, filters); - d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - correction, filters); - d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - correction, filters); - d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - correction, filters); - - load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); - - d0 = vrhadd_u8(d0, dd0); - d1 = vrhadd_u8(d1, dd1); - d2 = vrhadd_u8(d2, dd2); - d3 = vrhadd_u8(d3, dd3); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. 
*/ - s0123_lo = s4567_lo; - s0123_hi = s4567_hi; - s1234_lo = s5678_lo; - s1234_hi = s5678_hi; - s2345_lo = s6789_lo; - s2345_hi = s6789_hi; - s3456_lo = s78910_lo; - s3456_hi = s78910_hi; - - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height != 0); - src += 8; - dst += 8; - w -= 8; - } while (w != 0); - } -} - -#endif // defined(__ARM_FEATURE_MATMUL_INT8) - -#else // !(VPX_ARCH_AARCH64 && - // (defined(__ARM_FEATURE_DOTPROD) || - // defined(__ARM_FEATURE_MATMUL_INT8))) - void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -2193,7 +770,3 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, } while (w != 0); } } - -#endif // #if VPX_ARCH_AARCH64 && - // (defined(__ARM_FEATURE_DOTPROD) || - // defined(__ARM_FEATURE_MATMUL_INT8)) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index 2f78583af3..025e943cc4 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -17,17 +17,15 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/vpx_filter.h" -#if VPX_ARCH_AARCH64 && \ - (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)) -void vpx_convolve8_2d_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h); -#endif - #if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD) +void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, + int y_step_q4, int w, int h); + static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, const int8x16_t samples_hi, const int32x4_t correction, @@ -128,6 +126,12 @@ static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples, #if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8) +void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h); + static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, const uint8x16_t samples_hi, const int8x8_t filters) { diff --git a/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c b/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c new file mode 100644 index 0000000000..bf01364cf7 --- /dev/null +++ b/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c @@ -0,0 +1,777 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/vpx_convolve8_neon.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+  0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
+  4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
+  0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+  4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
+  /* Shift left and insert new last column in transposed 4x4 block. */
+  1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
+  /* Shift left and insert two new columns in transposed 4x4 block. */
+  2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
+  /* Shift left and insert three new columns in transposed 4x4 block. */
+  3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
+};
+
+void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src,
+                                         ptrdiff_t src_stride, uint8_t *dst,
+                                         ptrdiff_t dst_stride,
+                                         const InterpKernel *filter, int x0_q4,
+                                         int x_step_q4, int y0_q4,
+                                         int y_step_q4, int w, int h) {
+  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
+  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128);
+  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+  const uint8x16_t range_limit = vdupq_n_u8(128);
+  uint8x16_t s0, s1, s2, s3;
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(x_step_q4 == 16);
+  assert(h % 4 == 3);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  src -= 3;
+
+  if (w == 4) {
+    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    int16x4_t d0, d1, d2, d3;
+    uint8x8_t d01, d23;
+
+    do {
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      d0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl);
+      d1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl);
+      d2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl);
+      d3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl);
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 3);
+
+    /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
+     * further details on possible values of block height.
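The range_limit/correction pair above deserves a note: SDOT multiplies signed 8-bit values, but pixels are unsigned, so each sample is biased down by 128 and the constant 128 * sum(taps) is added back once per output. A scalar model of that identity (illustrative only, not part of the patch):

#include <stdint.h>

/* Returns sum(s[k] * f[k]) computed the way the SDOT path does. */
static int32_t convolve8_bias_model(const uint8_t *s, const int16_t *f) {
  int32_t correction = 0, sum = 0;
  int k;
  for (k = 0; k < 8; ++k) correction += 128 * f[k]; /* the "correction" */
  for (k = 0; k < 8; ++k) sum += (s[k] - 128) * f[k]; /* biased samples */
  return sum + correction; /* equals the direct dot product */
}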
*/ + load_u8_16x3(src, src_stride, &s0, &s1, &s2); + + d0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8_4x1(dst + 2 * dst_stride, d23); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); + d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + width = w; + s = src; + d = dst; + do { + load_u8_16x3(s, src_stride, &s0, &s1, &s2); + + d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); + + store_u8_8x3(d, dst_stride, d0, d1, d2); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + } +} + +void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x16_t range_limit = vdupq_n_u8(128); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int16x4_t t0, t1, t2, t3; + uint8x8_t d01, d23; + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); + t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); + t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); + t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_sdot(s0, 
filters, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); + d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, + int y_step_q4, int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x16_t range_limit = vdupq_n_u8(128); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int16x4_t t0, t1, t2, t3; + uint8x8_t d01, d23, dd01, dd23; + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); + t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); + t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); + t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); + d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, + int8x8_t a3, int8x16_t *b, + const uint8x16_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, XX, XX, XX, XX + * a1: 10, 11, 12, 13, XX, XX, XX, XX + * a2: 20, 21, 22, 23, XX, XX, XX, XX + * a3: 30, 31, 32, 33, XX, XX, XX, XX + * + * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * + * The 'permute_tbl' is always 
'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; + *b = vqtbl2q_s8(samples, permute_tbl); +} + +static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, + int8x8_t a3, int8x16_t *b0, + int8x16_t *b1, + const uint8x16x2_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, 04, 05, 06, 07 + * a1: 10, 11, 12, 13, 14, 15, 16, 17 + * a2: 20, 21, 22, 23, 24, 25, 26, 27 + * a3: 30, 31, 32, 33, 34, 35, 36, 37 + * + * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; + *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]); + *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); +} + +void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x8_t range_limit = vdup_n_u8(128); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int8x16x2_t samples_LUT; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + src += 7 * src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. 
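A scalar model of the transpose_concat_4x4 helper defined above may make the layout comment concrete: four 4-sample rows are interleaved so that each group of four consecutive output bytes holds one column of the 4x4 block (sketch only; the real helper does this in a single vqtbl2q_s8):

#include <stdint.h>

/* b[4 * col + row] = r[row][col], matching the layout documented above. */
static void transpose_concat_4x4_model(const int8_t r[4][4], int8_t b[16]) {
  int col, row;
  for (col = 0; col < 4; ++col) {
    for (row = 0; row < 4; ++row) b[4 * col + row] = r[row][col];
  }
}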
+ */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters); + d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); + d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); + d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s += 7 * src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. 
+ */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + correction, filters); + d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + correction, filters); + d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + correction, filters); + d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + correction, filters); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. 
*/ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, + int y_step_q4, int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x8_t range_limit = vdup_n_u8(128); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int8x16x2_t samples_LUT; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23, dd01, dd23; + + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + src += 7 * src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. 
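The merge step used here and in the other vertical kernels relies on vqtbl2q treating its two input vectors as a single 32-byte table: indices 0-15 read from the first vector, 16-31 from the second (out-of-range indices would return 0 on hardware, but these tables stay below 32). A scalar model (illustrative):

#include <stdint.h>

static void vqtbl2_model(const uint8_t prev[16], const uint8_t next[16],
                         const uint8_t idx[16], uint8_t out[16]) {
  int i;
  for (i = 0; i < 16; ++i) {
    out[i] = idx[i] < 16 ? prev[idx[i]] : next[idx[i] - 16];
  }
}

With the first row of dot_prod_merge_block_tbl ({1, 2, 3, 16, ...}) this shifts each transposed 4x4 sub-block left by one column and appends the first column of the new block, which is how s4567 is rebuilt from s3456 and s78910.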
*/ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters); + d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); + d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); + d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s += 7 * src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. 
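As in the horizontal _avg_ kernels, vrhadd_u8 in this kernel blends the filtered result with the pixels already in dst. It is a rounding halving add; per byte it behaves like this scalar restatement (illustrative):

#include <stdint.h>

static uint8_t rounding_avg_model(uint8_t filtered, uint8_t dst_pixel) {
  return (uint8_t)(((unsigned)filtered + dst_pixel + 1) >> 1);
}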
*/
+        samples_LUT.val[0] = s3456_lo;
+        samples_LUT.val[1] = s78910_lo;
+        s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+        samples_LUT.val[0] = s3456_hi;
+        samples_LUT.val[1] = s78910_hi;
+        s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+        d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+                                      correction, filters);
+        d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+                                      correction, filters);
+        d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+                                      correction, filters);
+        d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+                                      correction, filters);
+
+        load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        d0 = vrhadd_u8(d0, dd0);
+        d1 = vrhadd_u8(d1, dd1);
+        d2 = vrhadd_u8(d2, dd2);
+        d3 = vrhadd_u8(d3, dd3);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        /* Prepare block for next iteration - re-using as much as possible. */
+        /* Shuffle everything up four rows. */
+        s0123_lo = s4567_lo;
+        s0123_hi = s4567_hi;
+        s1234_lo = s5678_lo;
+        s1234_hi = s5678_hi;
+        s2345_lo = s6789_lo;
+        s2345_hi = s6789_hi;
+        s3456_lo = s78910_lo;
+        s3456_hi = s78910_hi;
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
diff --git a/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c b/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c
new file mode 100644
index 0000000000..e0e482e3f5
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c
@@ -0,0 +1,698 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/vpx_convolve8_neon.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+  0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
+  4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
+  0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+  4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
+  /* Shift left and insert new last column in transposed 4x4 block. */
+  1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
+  /* Shift left and insert two new columns in transposed 4x4 block. */
+  2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
+  /* Shift left and insert three new columns in transposed 4x4 block.
*/ + 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 +}; + +void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + assert(h % 4 == 3); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + do { + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_4_usdot(s0, filters, perm_tbl); + d1 = convolve8_4_usdot(s1, filters, perm_tbl); + d2 = convolve8_4_usdot(s2, filters, perm_tbl); + d3 = convolve8_4_usdot(s3, filters, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + load_u8_16x3(src, src_stride, &s0, &s1, &s2); + + d0 = convolve8_4_usdot(s0, filters, perm_tbl); + d1 = convolve8_4_usdot(s1, filters, perm_tbl); + d2 = convolve8_4_usdot(s2, filters, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8_4x1(dst + 2 * dst_stride, d23); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_usdot(s0, filters, perm_tbl); + d1 = convolve8_8_usdot(s1, filters, perm_tbl); + d2 = convolve8_8_usdot(s2, filters, perm_tbl); + d3 = convolve8_8_usdot(s3, filters, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. 
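The assert(h % 4 == 3) at the top of this function (and of its DotProd counterpart) follows from the wrappers further down in the series: the vertical pass needs 3 rows above and 4 below each output row, so the horizontal pass is run over h + 7 rows. Assuming the multiple-of-4 block heights used by the encoder, that remainder is fixed; a sketch of the arithmetic:

#include <assert.h>

static int intermediate_height(int h) {
  assert(h % 4 == 0 && h <= 64); /* vpx convolve block heights */
  return h + 7;                  /* (h + 7) % 4 == 3 */
}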
*/ + width = w; + s = src; + d = dst; + do { + load_u8_16x3(s, src_stride, &s0, &s1, &s2); + + d0 = convolve8_8_usdot(s0, filters, perm_tbl); + d1 = convolve8_8_usdot(s1, filters, perm_tbl); + d2 = convolve8_8_usdot(s2, filters, perm_tbl); + + store_u8_8x3(d, dst_stride, d0, d1, d2); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + } +} + +void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int16x4_t t0, t1, t2, t3; + uint8x8_t d01, d23; + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_usdot(s0, filters, perm_tbl); + t1 = convolve8_4_usdot(s1, filters, perm_tbl); + t2 = convolve8_4_usdot(s2, filters, perm_tbl); + t3 = convolve8_4_usdot(s3, filters, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_usdot(s0, filters, perm_tbl); + d1 = convolve8_8_usdot(s1, filters, perm_tbl); + d2 = convolve8_8_usdot(s2, filters, perm_tbl); + d3 = convolve8_8_usdot(s3, filters, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int16x4_t t0, t1, t2, t3; + uint8x8_t d01, d23, dd01, dd23; + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_usdot(s0, filters, perm_tbl); + t1 = convolve8_4_usdot(s1, filters, perm_tbl); + t2 = convolve8_4_usdot(s2, filters, perm_tbl); + t3 = convolve8_4_usdot(s3, filters, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const 
uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_usdot(s0, filters, perm_tbl); + d1 = convolve8_8_usdot(s1, filters, perm_tbl); + d2 = convolve8_8_usdot(s2, filters, perm_tbl); + d3 = convolve8_8_usdot(s3, filters, perm_tbl); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x16_t *b, + const uint8x16_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, XX, XX, XX, XX + * a1: 10, 11, 12, 13, XX, XX, XX, XX + * a2: 20, 21, 22, 23, XX, XX, XX, XX + * a3: 30, 31, 32, 33, XX, XX, XX, XX + * + * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; + *b = vqtbl2q_u8(samples, permute_tbl); +} + +static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x16_t *b0, uint8x16_t *b1, + const uint8x16x2_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, 04, 05, 06, 07 + * a1: 10, 11, 12, 13, 14, 15, 16, 17 + * a2: 20, 21, 22, 23, 24, 25, 26, 27 + * a3: 30, 31, 32, 33, 34, 35, 36, 37 + * + * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. 
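Unlike the DotProd file, the helpers here keep their arguments as plain uint8x8_t: the I8MM USDOT instruction multiplies unsigned samples by signed filter taps directly, so the -128 bias and correction constant of the SDOT path disappear. A scalar model of one 4-element USDOT lane (illustrative):

#include <stdint.h>

/* acc += s[0]*f[0] + ... + s[3]*f[3]; unsigned samples, signed taps. */
static int32_t usdot_lane_model(int32_t acc, const uint8_t s[4],
                                const int8_t f[4]) {
  int k;
  for (k = 0; k < 4; ++k) acc += s[k] * f[k];
  return acc;
}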
+ */ + + uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; + *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]); + *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]); +} + +void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint8x16x2_t samples_LUT; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); + + do { + load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_usdot_partial(s0123, s4567, filters); + d1 = convolve8_4_usdot_partial(s1234, s5678, filters); + d2 = convolve8_4_usdot_partial(s2345, s6789, filters); + d3 = convolve8_4_usdot_partial(s3456, s78910, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. 
*/ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + filters); + d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + filters); + d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + filters); + d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + filters); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. 
*/ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint8x16x2_t samples_LUT; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23, dd01, dd23; + + load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); + + do { + load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_usdot_partial(s0123, s4567, filters); + d1 = convolve8_4_usdot_partial(s1234, s5678, filters); + d2 = convolve8_4_usdot_partial(s2345, s6789, filters); + d3 = convolve8_4_usdot_partial(s3456, s78910, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. 
*/ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + filters); + d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + filters); + d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + filters); + d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + filters); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. 
*/ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} diff --git a/vpx_dsp/arm/vpx_convolve_neon.c b/vpx_dsp/arm/vpx_convolve_neon.c index f7db3e6a9c..830f3176d7 100644 --- a/vpx_dsp/arm/vpx_convolve_neon.c +++ b/vpx_dsp/arm/vpx_convolve_neon.c @@ -14,57 +14,6 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_ports/mem.h" -#if VPX_ARCH_AARCH64 && \ - (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)) -#include "vpx_dsp/arm/vpx_convolve8_neon.h" - -void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const InterpKernel *filter, - int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, - int w, int h) { - /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the - * maximum buffer size to 64 * (64 + 7). */ - uint8_t temp[64 * 71]; - - /* Account for the vertical phase needing 3 lines prior and 4 lines post. */ - const int intermediate_height = h + 7; - - assert(y_step_q4 == 16); - assert(x_step_q4 == 16); - - /* Filter starting 3 lines back. */ - vpx_convolve8_2d_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter, - x0_q4, x_step_q4, y0_q4, y_step_q4, w, - intermediate_height); - - /* Step into the temp buffer 3 lines to get the actual frame data */ - vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4, - x_step_q4, y0_q4, y_step_q4, w, h); -} - -void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - uint8_t temp[64 * 71]; - const int intermediate_height = h + 7; - - assert(y_step_q4 == 16); - assert(x_step_q4 == 16); - - vpx_convolve8_2d_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter, - x0_q4, x_step_q4, y0_q4, y_step_q4, w, - intermediate_height); - - vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4, - x_step_q4, y0_q4, y_step_q4, w, h); -} - -#else // !(VPX_ARCH_AARCH64 && - // (defined(__ARM_FEATURE_DOTPROD) || - // defined(__ARM_FEATURE_MATMUL_INT8))) - void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, @@ -114,7 +63,3 @@ void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); } - -#endif // #if VPX_ARCH_AARCH64 && - // (defined(__ARM_FEATURE_DOTPROD) || - // defined(__ARM_FEATURE_MATMUL_INT8)) diff --git a/vpx_dsp/arm/vpx_convolve_neon_dotprod.c b/vpx_dsp/arm/vpx_convolve_neon_dotprod.c new file mode 100644 index 0000000000..400e26b30a --- /dev/null +++ b/vpx_dsp/arm/vpx_convolve_neon_dotprod.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/vpx_convolve8_neon.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+
+void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+                                uint8_t *dst, ptrdiff_t dst_stride,
+                                const InterpKernel *filter, int x0_q4,
+                                int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                int h) {
+  /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
+   * maximum buffer size to 64 * (64 + 7). */
+  uint8_t temp[64 * 71];
+
+  /* Account for the vertical phase needing 3 lines prior and 4 lines post. */
+  const int intermediate_height = h + 7;
+
+  assert(y_step_q4 == 16);
+  assert(x_step_q4 == 16);
+
+  /* Filter starting 3 lines back. */
+  vpx_convolve8_2d_horiz_neon_dotprod(src - src_stride * 3, src_stride, temp, w,
+                                      filter, x0_q4, x_step_q4, y0_q4,
+                                      y_step_q4, w, intermediate_height);
+
+  /* Step into the temp buffer 3 lines to get the actual frame data. */
+  vpx_convolve8_vert_neon_dotprod(temp + w * 3, w, dst, dst_stride, filter,
+                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+}
+
+void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+                                    uint8_t *dst, ptrdiff_t dst_stride,
+                                    const InterpKernel *filter, int x0_q4,
+                                    int x_step_q4, int y0_q4, int y_step_q4,
+                                    int w, int h) {
+  uint8_t temp[64 * 71];
+  const int intermediate_height = h + 7;
+
+  assert(y_step_q4 == 16);
+  assert(x_step_q4 == 16);
+
+  vpx_convolve8_2d_horiz_neon_dotprod(src - src_stride * 3, src_stride, temp, w,
+                                      filter, x0_q4, x_step_q4, y0_q4,
+                                      y_step_q4, w, intermediate_height);
+
+  vpx_convolve8_avg_vert_neon_dotprod(temp + w * 3, w, dst, dst_stride, filter,
+                                      x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+}
diff --git a/vpx_dsp/arm/vpx_convolve_neon_i8mm.c b/vpx_dsp/arm/vpx_convolve_neon_i8mm.c
new file mode 100644
index 0000000000..4d94bb79b7
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve_neon_i8mm.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/vpx_convolve8_neon.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+
+void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const InterpKernel *filter, int x0_q4,
+                             int x_step_q4, int y0_q4, int y_step_q4, int w,
+                             int h) {
+  /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
+   * maximum buffer size to 64 * (64 + 7). */
+  uint8_t temp[64 * 71];
+
+  /* Account for the vertical phase needing 3 lines prior and 4 lines post. */
+  const int intermediate_height = h + 7;
+
+  assert(y_step_q4 == 16);
+  assert(x_step_q4 == 16);
+
+  /* Filter starting 3 lines back. */
+  vpx_convolve8_2d_horiz_neon_i8mm(src - src_stride * 3, src_stride, temp, w,
+                                   filter, x0_q4, x_step_q4, y0_q4, y_step_q4,
+                                   w, intermediate_height);
+
+  /* Step into the temp buffer 3 lines to get the actual frame data.
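The temp buffer contract shared by all four wrappers, restated as a small check (a sketch; the function name is not from the patch): the horizontal pass writes w * (h + 7) bytes, and with w, h <= 64 and 8 taps that is at most 64 * 71, matching the declarations above.

#include <assert.h>

static int temp_bytes(int w, int h) {
  assert(w <= 64 && h <= 64);
  return w * (h + 8 - 1); /* 8-tap filter: 3 rows before, 4 after */
}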
*/ + vpx_convolve8_vert_neon_i8mm(temp + w * 3, w, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); +} + +void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + uint8_t temp[64 * 71]; + const int intermediate_height = h + 7; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + vpx_convolve8_2d_horiz_neon_i8mm(src - src_stride * 3, src_stride, temp, w, + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, + w, intermediate_height); + + vpx_convolve8_avg_vert_neon_i8mm(temp + w * 3, w, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); +} diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 04969f37e1..8d2422b1de 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -133,6 +133,10 @@ DSP_SRCS-yes += arm/vpx_convolve_copy_neon.c DSP_SRCS-yes += arm/vpx_convolve8_neon.c DSP_SRCS-yes += arm/vpx_convolve_avg_neon.c DSP_SRCS-yes += arm/vpx_convolve_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/vpx_convolve8_neon_dotprod.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/vpx_convolve_neon_dotprod.c +DSP_SRCS-$(HAVE_NEON_I8MM) += arm/vpx_convolve8_neon_i8mm.c +DSP_SRCS-$(HAVE_NEON_I8MM) += arm/vpx_convolve_neon_i8mm.c endif # HAVE_NEON endif # HAVE_NEON_ASM diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 8033b4a81a..0cd21c7997 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -382,22 +382,22 @@ () specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx mmi lsx/; add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; +specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; +specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; +specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; +specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; +specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 
neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; +specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_scaled_2d ssse3 neon msa/; From 02dc617f8cbd7a39ec1125b950a1d6f8ceba70f1 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Sat, 19 Aug 2023 20:21:24 +0100 Subject: [PATCH 805/926] Use run-time CPU feature detection for Neon DotProd SAD Arm Neon DotProd implementations of vpx_sad* currently need to be enabled at compile time since they're guarded by ifdef feature macros. Now that run-time feature detection has been enabled for Arm platforms, expose these implementations with distinct *neon_dotprod names in separate files and wire them up to the build system and rtcd.pl. Also add new test cases for the new DotProd functions. Change-Id: Ic6906c28240276ba89787eadbc9393a232374f95 --- test/sad_test.cc | 45 ++++++ vpx_dsp/arm/sad_neon.c | 183 +----------------------- vpx_dsp/arm/sad_neon_dotprod.c | 247 +++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 48 +++---- 5 files changed, 319 insertions(+), 205 deletions(-) create mode 100644 vpx_dsp/arm/sad_neon_dotprod.c diff --git a/test/sad_test.cc b/test/sad_test.cc index 83c4fe0c36..3f9c020ee8 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1129,6 +1129,21 @@ const SadMxNParam neon_tests[] = { }; INSTANTIATE_TEST_SUITE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests)); +#if HAVE_NEON_DOTPROD +const SadMxNParam neon_dotprod_tests[] = { + SadMxNParam(64, 64, &vpx_sad64x64_neon_dotprod), + SadMxNParam(64, 32, &vpx_sad64x32_neon_dotprod), + SadMxNParam(32, 64, &vpx_sad32x64_neon_dotprod), + SadMxNParam(32, 32, &vpx_sad32x32_neon_dotprod), + SadMxNParam(32, 16, &vpx_sad32x16_neon_dotprod), + SadMxNParam(16, 32, &vpx_sad16x32_neon_dotprod), + SadMxNParam(16, 16, &vpx_sad16x16_neon_dotprod), + SadMxNParam(16, 8, &vpx_sad16x8_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADTest, + ::testing::ValuesIn(neon_dotprod_tests)); +#endif // HAVE_NEON_DOTPROD + const SadSkipMxNParam skip_neon_tests[] = { SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_neon), SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_neon), @@ -1188,6 +1203,21 @@ const SadSkipMxNParam skip_neon_tests[] = { INSTANTIATE_TEST_SUITE_P(NEON, SADSkipTest, ::testing::ValuesIn(skip_neon_tests)); +#if HAVE_NEON_DOTPROD +const SadSkipMxNParam skip_neon_dotprod_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_neon_dotprod), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_neon_dotprod), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_neon_dotprod), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_neon_dotprod), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_neon_dotprod), + SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_neon_dotprod), + SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_neon_dotprod), + SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADSkipTest, + ::testing::ValuesIn(skip_neon_dotprod_tests)); +#endif // 
HAVE_NEON_DOTPROD + const SadMxNAvgParam avg_neon_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_neon), SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_neon), @@ -1246,6 +1276,21 @@ const SadMxNAvgParam avg_neon_tests[] = { }; INSTANTIATE_TEST_SUITE_P(NEON, SADavgTest, ::testing::ValuesIn(avg_neon_tests)); +#if HAVE_NEON_DOTPROD +const SadMxNAvgParam avg_neon_dotprod_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_neon_dotprod), + SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_neon_dotprod), + SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_neon_dotprod), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_neon_dotprod), + SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_neon_dotprod), + SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_neon_dotprod), + SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_neon_dotprod), + SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADavgTest, + ::testing::ValuesIn(avg_neon_dotprod_tests)); +#endif // HAVE_NEON_DOTPROD + const SadMxNx4Param x4d_neon_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_neon), SadMxNx4Param(64, 32, &vpx_sad64x32x4d_neon), diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index 566a1f81db..4dd87ddc0f 100644 --- a/vpx_dsp/arm/sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -17,84 +17,6 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" -#if defined(__ARM_FEATURE_DOTPROD) - -static INLINE unsigned int sadwxh_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - int w, int h) { - // Only two accumulators are required for optimal instruction throughput of - // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes. - uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; - - int i = h; - do { - int j = 0; - do { - uint8x16_t s0, s1, r0, r1, diff0, diff1; - - s0 = vld1q_u8(src_ptr + j); - r0 = vld1q_u8(ref_ptr + j); - diff0 = vabdq_u8(s0, r0); - sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); - - s1 = vld1q_u8(src_ptr + j + 16); - r1 = vld1q_u8(ref_ptr + j + 16); - diff1 = vabdq_u8(s1, r1); - sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); - - j += 32; - } while (j < w); - - src_ptr += src_stride; - ref_ptr += ref_stride; - } while (--i != 0); - - return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); -} - -static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - int h) { - return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h); -} - -static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - int h) { - return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h); -} - -static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - int h) { - uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; - - int i = h / 2; - do { - uint8x16_t s0, s1, r0, r1, diff0, diff1; - - s0 = vld1q_u8(src_ptr); - r0 = vld1q_u8(ref_ptr); - diff0 = vabdq_u8(s0, r0); - sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); - - src_ptr += src_stride; - ref_ptr += ref_stride; - - s1 = vld1q_u8(src_ptr); - r1 = vld1q_u8(ref_ptr); - diff1 = vabdq_u8(s1, r1); - sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); - - src_ptr += src_stride; - ref_ptr += ref_stride; - } while (--i != 0); - - return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); -} - -#else // !defined(__ARM_FEATURE_DOTPROD) - static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride, 
const uint8_t *ref_ptr, int ref_stride, int h) { @@ -186,8 +108,6 @@ static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride, return horizontal_add_uint16x8(sum); } -#endif // defined(__ARM_FEATURE_DOTPROD) - static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { @@ -280,105 +200,6 @@ SAD_SKIP_WXH_NEON(64, 64) #undef SAD_SKIP_WXH_NEON -#if defined(__ARM_FEATURE_DOTPROD) - -static INLINE unsigned int sadwxh_avg_neon(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, int w, int h, - const uint8_t *second_pred) { - // Only two accumulators are required for optimal instruction throughput of - // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes. - uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; - - int i = h; - do { - int j = 0; - do { - uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1; - - s0 = vld1q_u8(src_ptr + j); - r0 = vld1q_u8(ref_ptr + j); - p0 = vld1q_u8(second_pred); - avg0 = vrhaddq_u8(r0, p0); - diff0 = vabdq_u8(s0, avg0); - sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); - - s1 = vld1q_u8(src_ptr + j + 16); - r1 = vld1q_u8(ref_ptr + j + 16); - p1 = vld1q_u8(second_pred + 16); - avg1 = vrhaddq_u8(r1, p1); - diff1 = vabdq_u8(s1, avg1); - sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); - - j += 32; - second_pred += 32; - } while (j < w); - - src_ptr += src_stride; - ref_ptr += ref_stride; - } while (--i != 0); - - return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); -} - -static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, int h, - const uint8_t *second_pred) { - return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h, - second_pred); -} - -static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, int h, - const uint8_t *second_pred) { - return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h, - second_pred); -} - -static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, int h, - const uint8_t *second_pred) { - uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; - - int i = h / 2; - do { - uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1; - - s0 = vld1q_u8(src_ptr); - r0 = vld1q_u8(ref_ptr); - p0 = vld1q_u8(second_pred); - avg0 = vrhaddq_u8(r0, p0); - diff0 = vabdq_u8(s0, avg0); - sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); - - src_ptr += src_stride; - ref_ptr += ref_stride; - second_pred += 16; - - s1 = vld1q_u8(src_ptr); - r1 = vld1q_u8(ref_ptr); - p1 = vld1q_u8(second_pred); - avg1 = vrhaddq_u8(r1, p1); - diff1 = vabdq_u8(s1, avg1); - sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); - - src_ptr += src_stride; - ref_ptr += ref_stride; - second_pred += 16; - } while (--i != 0); - - return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); -} - -#else // !defined(__ARM_FEATURE_DOTPROD) - static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, @@ -493,8 +314,6 @@ static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr, return horizontal_add_uint16x8(sum); } -#endif // defined(__ARM_FEATURE_DOTPROD) - static INLINE unsigned int sad8xh_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, @@ -568,3 +387,5 @@ SAD_WXH_AVG_NEON(32, 64) SAD_WXH_AVG_NEON(64, 32) 
SAD_WXH_AVG_NEON(64, 64)
+
+#undef SAD_WXH_AVG_NEON
diff --git a/vpx_dsp/arm/sad_neon_dotprod.c b/vpx_dsp/arm/sad_neon_dotprod.c
new file mode 100644
index 0000000000..fbc0b8d75f
--- /dev/null
+++ b/vpx_dsp/arm/sad_neon_dotprod.c
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+static INLINE unsigned int sadwxh_neon_dotprod(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *ref_ptr,
+                                               int ref_stride, int w, int h) {
+  // Only two accumulators are required for optimal instruction throughput of
+  // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h;
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+      s0 = vld1q_u8(src_ptr + j);
+      r0 = vld1q_u8(ref_ptr + j);
+      diff0 = vabdq_u8(s0, r0);
+      sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+      s1 = vld1q_u8(src_ptr + j + 16);
+      r1 = vld1q_u8(ref_ptr + j + 16);
+      diff1 = vabdq_u8(s1, r1);
+      sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+      j += 32;
+    } while (j < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static INLINE unsigned int sad64xh_neon_dotprod(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *ref_ptr,
+                                                int ref_stride, int h) {
+  return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64, h);
+}
+
+static INLINE unsigned int sad32xh_neon_dotprod(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *ref_ptr,
+                                                int ref_stride, int h) {
+  return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32, h);
+}
+
+static INLINE unsigned int sad16xh_neon_dotprod(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *ref_ptr,
+                                                int ref_stride, int h) {
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h / 2;
+  do {
+    uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+    s0 = vld1q_u8(src_ptr);
+    r0 = vld1q_u8(ref_ptr);
+    diff0 = vabdq_u8(s0, r0);
+    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+
+    s1 = vld1q_u8(src_ptr);
+    r1 = vld1q_u8(ref_ptr);
+    diff1 = vabdq_u8(s1, r1);
+    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+#define SAD_WXH_NEON_DOTPROD(w, h)                                          \
+  unsigned int vpx_sad##w##x##h##_neon_dotprod(                             \
+      const uint8_t *src, int src_stride, const uint8_t *ref,               \
+      int ref_stride) {                                                     \
+    return sad##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, (h));  \
+  }
+
+SAD_WXH_NEON_DOTPROD(16, 8)
+SAD_WXH_NEON_DOTPROD(16, 16)
+SAD_WXH_NEON_DOTPROD(16, 32)
+
+SAD_WXH_NEON_DOTPROD(32, 16)
+SAD_WXH_NEON_DOTPROD(32, 32)
+SAD_WXH_NEON_DOTPROD(32, 64)
+
+SAD_WXH_NEON_DOTPROD(64, 32)
+SAD_WXH_NEON_DOTPROD(64, 64)
+
+#undef SAD_WXH_NEON_DOTPROD
+
+#define SAD_SKIP_WXH_NEON_DOTPROD(w, h)                                     \
+  unsigned int vpx_sad_skip_##w##x##h##_neon_dotprod(
\ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * sad##w##xh_neon_dotprod(src, 2 * src_stride, ref, \ + 2 * ref_stride, (h) / 2); \ + } + +SAD_SKIP_WXH_NEON_DOTPROD(16, 8) +SAD_SKIP_WXH_NEON_DOTPROD(16, 16) +SAD_SKIP_WXH_NEON_DOTPROD(16, 32) + +SAD_SKIP_WXH_NEON_DOTPROD(32, 16) +SAD_SKIP_WXH_NEON_DOTPROD(32, 32) +SAD_SKIP_WXH_NEON_DOTPROD(32, 64) + +SAD_SKIP_WXH_NEON_DOTPROD(64, 32) +SAD_SKIP_WXH_NEON_DOTPROD(64, 64) + +#undef SAD_SKIP_WXH_NEON_DOTPROD + +static INLINE unsigned int sadwxh_avg_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h, + const uint8_t *second_pred) { + // Only two accumulators are required for optimal instruction throughput of + // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes. + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1; + + s0 = vld1q_u8(src_ptr + j); + r0 = vld1q_u8(ref_ptr + j); + p0 = vld1q_u8(second_pred); + avg0 = vrhaddq_u8(r0, p0); + diff0 = vabdq_u8(s0, avg0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + s1 = vld1q_u8(src_ptr + j + 16); + r1 = vld1q_u8(ref_ptr + j + 16); + p1 = vld1q_u8(second_pred + 16); + avg1 = vrhaddq_u8(r1, p1); + diff1 = vabdq_u8(s1, avg1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + j += 32; + second_pred += 32; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); +} + +static INLINE unsigned int sad64xh_avg_neon_dotprod( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred) { + return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64, + h, second_pred); +} + +static INLINE unsigned int sad32xh_avg_neon_dotprod( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred) { + return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32, + h, second_pred); +} + +static INLINE unsigned int sad16xh_avg_neon_dotprod( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred) { + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h / 2; + do { + uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + p0 = vld1q_u8(second_pred); + avg0 = vrhaddq_u8(r0, p0); + diff0 = vabdq_u8(s0, avg0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 16; + + s1 = vld1q_u8(src_ptr); + r1 = vld1q_u8(ref_ptr); + p1 = vld1q_u8(second_pred); + avg1 = vrhaddq_u8(r1, p1); + diff1 = vabdq_u8(s1, avg1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 16; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); +} + +#define SAD_WXH_AVG_NEON_DOTPROD(w, h) \ + uint32_t vpx_sad##w##x##h##_avg_neon_dotprod( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return sad##w##xh_avg_neon_dotprod(src, src_stride, ref, ref_stride, (h), \ + second_pred); \ + } + +SAD_WXH_AVG_NEON_DOTPROD(16, 8) +SAD_WXH_AVG_NEON_DOTPROD(16, 16) +SAD_WXH_AVG_NEON_DOTPROD(16, 32) + +SAD_WXH_AVG_NEON_DOTPROD(32, 
16) +SAD_WXH_AVG_NEON_DOTPROD(32, 32) +SAD_WXH_AVG_NEON_DOTPROD(32, 64) + +SAD_WXH_AVG_NEON_DOTPROD(64, 32) +SAD_WXH_AVG_NEON_DOTPROD(64, 64) + +#undef SAD_WXH_AVG_NEON_DOTPROD diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 8d2422b1de..d789353a1e 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -374,6 +374,7 @@ DSP_SRCS-$(HAVE_MSA) += mips/sum_squares_msa.c DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/sad_neon_dotprod.c DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 0cd21c7997..8383bdd4ca 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -748,28 +748,28 @@ () # Single block SAD # add_proto qw/unsigned int vpx_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad64x64 neon avx2 msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad64x64 neon neon_dotprod avx2 msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad64x32 neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad64x32 neon neon_dotprod avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad32x64 neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad32x64 neon neon_dotprod avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad32x32 neon avx2 msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad32x32 neon neon_dotprod avx2 msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad32x16 neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad32x16 neon neon_dotprod avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad16x32 neon msa sse2 vsx mmi/; +specialize qw/vpx_sad16x32 neon neon_dotprod msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad16x16 neon msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad16x16 neon neon_dotprod msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad16x8 neon msa sse2 vsx mmi/; +specialize qw/vpx_sad16x8 neon neon_dotprod msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad8x16 neon msa sse2 vsx mmi/; @@ -787,28 +787,28 @@ () specialize qw/vpx_sad4x4 neon msa sse2 mmi/; add_proto qw/unsigned int vpx_sad_skip_64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_64x64 neon avx2 sse2/; +specialize qw/vpx_sad_skip_64x64 neon neon_dotprod avx2 sse2/; add_proto qw/unsigned int vpx_sad_skip_64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_64x32 neon avx2 sse2/; +specialize qw/vpx_sad_skip_64x32 neon neon_dotprod avx2 sse2/; 
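# A rough sketch of what the specialize lines above and below turn into:
# rtcd.pl emits one function pointer per prototype plus a setup routine that
# resolves it once at run time. This sketch assumes the arm_cpu_caps()
# capability query and the HAS_NEON_DOTPROD flag from vpx_ports/arm.h; the
# exact generated code in vpx_dsp_rtcd.h may differ in detail:
#
#   unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride,
#                                const uint8_t *ref_ptr, int ref_stride);
#
#   static void setup_rtcd_internal(void) {
#     int flags = arm_cpu_caps();
#     vpx_sad64x64 = vpx_sad64x64_neon;  // baseline Neon is always present
#     if (flags & HAS_NEON_DOTPROD) vpx_sad64x64 = vpx_sad64x64_neon_dotprod;
#   }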
add_proto qw/unsigned int vpx_sad_skip_32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_32x64 neon avx2 sse2/; +specialize qw/vpx_sad_skip_32x64 neon neon_dotprod avx2 sse2/; add_proto qw/unsigned int vpx_sad_skip_32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_32x32 neon avx2 sse2/; +specialize qw/vpx_sad_skip_32x32 neon neon_dotprod avx2 sse2/; add_proto qw/unsigned int vpx_sad_skip_32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_32x16 neon avx2 sse2/; +specialize qw/vpx_sad_skip_32x16 neon neon_dotprod avx2 sse2/; add_proto qw/unsigned int vpx_sad_skip_16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_16x32 neon sse2/; +specialize qw/vpx_sad_skip_16x32 neon neon_dotprod sse2/; add_proto qw/unsigned int vpx_sad_skip_16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_16x16 neon sse2/; +specialize qw/vpx_sad_skip_16x16 neon neon_dotprod sse2/; add_proto qw/unsigned int vpx_sad_skip_16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_16x8 neon sse2/; +specialize qw/vpx_sad_skip_16x8 neon neon_dotprod sse2/; add_proto qw/unsigned int vpx_sad_skip_8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad_skip_8x16 neon sse2/; @@ -886,28 +886,28 @@ () } # CONFIG_VP9_ENCODER add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad64x64_avg neon avx2 msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad64x64_avg neon neon_dotprod avx2 msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad64x32_avg neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad64x32_avg neon neon_dotprod avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad32x64_avg neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad32x64_avg neon neon_dotprod avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad32x32_avg neon avx2 msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad32x32_avg neon neon_dotprod avx2 msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad32x16_avg neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad32x16_avg neon neon_dotprod avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad16x32_avg neon msa sse2 vsx mmi/; +specialize qw/vpx_sad16x32_avg neon neon_dotprod msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; 
-specialize qw/vpx_sad16x16_avg neon msa sse2 vsx mmi/;
+specialize qw/vpx_sad16x16_avg neon neon_dotprod msa sse2 vsx mmi/;

add_proto qw/unsigned int vpx_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad16x8_avg neon msa sse2 vsx mmi/;
+specialize qw/vpx_sad16x8_avg neon neon_dotprod msa sse2 vsx mmi/;

add_proto qw/unsigned int vpx_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vpx_sad8x16_avg neon msa sse2 mmi/;

From 7009fe55a9a7aed3a3504c09c677de0326c8207b Mon Sep 17 00:00:00 2001
From: Jonathan Wright
Date: Sat, 19 Aug 2023 20:59:40 +0100
Subject: [PATCH 806/926] Use run-time CPU feature detection for Neon DotProd
 SAD4D

Arm Neon DotProd implementations of vpx_sad*4d currently need to be
enabled at compile time since they're guarded by ifdef feature macros.
Now that run-time feature detection has been enabled for Arm
platforms, expose these implementations with distinct *neon_dotprod
names in separate files and wire them up to the build system and
rtcd.pl. Also add new test cases for the new DotProd functions.

Change-Id: Ie99ee0b03ec488626f52c3f13e4111fe26cc5619
---
 test/sad_test.cc                 |  30 ++++++
 vpx_dsp/arm/sad4d_neon.c         | 116 --------------------
 vpx_dsp/arm/sad4d_neon_dotprod.c | 176 +++++++++++++++++++++++++++++++
 vpx_dsp/vpx_dsp.mk               |   1 +
 vpx_dsp/vpx_dsp_rtcd_defs.pl     |  32 +++---
 5 files changed, 223 insertions(+), 132 deletions(-)
 create mode 100644 vpx_dsp/arm/sad4d_neon_dotprod.c

diff --git a/test/sad_test.cc b/test/sad_test.cc
index 3f9c020ee8..3530e66050 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -1346,6 +1346,21 @@ const SadMxNx4Param x4d_neon_tests[] = {
};
INSTANTIATE_TEST_SUITE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests));
+#if HAVE_NEON_DOTPROD
+const SadMxNx4Param x4d_neon_dotprod_tests[] = {
+  SadMxNx4Param(64, 64, &vpx_sad64x64x4d_neon_dotprod),
+  SadMxNx4Param(64, 32, &vpx_sad64x32x4d_neon_dotprod),
+  SadMxNx4Param(32, 64, &vpx_sad32x64x4d_neon_dotprod),
+  SadMxNx4Param(32, 32, &vpx_sad32x32x4d_neon_dotprod),
+  SadMxNx4Param(32, 16, &vpx_sad32x16x4d_neon_dotprod),
+  SadMxNx4Param(16, 32, &vpx_sad16x32x4d_neon_dotprod),
+  SadMxNx4Param(16, 16, &vpx_sad16x16x4d_neon_dotprod),
+  SadMxNx4Param(16, 8, &vpx_sad16x8x4d_neon_dotprod),
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADx4Test,
+                         ::testing::ValuesIn(x4d_neon_dotprod_tests));
+#endif // HAVE_NEON_DOTPROD
+
const SadSkipMxNx4Param skip_x4d_neon_tests[] = {
  SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_neon),
  SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_neon),
@@ -1401,6 +1416,21 @@ const SadSkipMxNx4Param skip_x4d_neon_tests[] = {
};
INSTANTIATE_TEST_SUITE_P(NEON, SADSkipx4Test,
                         ::testing::ValuesIn(skip_x4d_neon_tests));
+
+#if HAVE_NEON_DOTPROD
+const SadSkipMxNx4Param skip_x4d_neon_dotprod_tests[] = {
+  SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_neon_dotprod),
+  SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_neon_dotprod),
+  SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_neon_dotprod),
+  SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_neon_dotprod),
+  SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_neon_dotprod),
+  SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_neon_dotprod),
+  SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_neon_dotprod),
+  SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_neon_dotprod),
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADSkipx4Test,
::testing::ValuesIn(skip_x4d_neon_dotprod_tests)); +#endif // HAVE_NEON_DOTPROD #endif // HAVE_NEON //------------------------------------------------------------------------------ diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 3a548d0f9f..713eec7a92 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -17,120 +17,6 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" -#if defined(__ARM_FEATURE_DOTPROD) - -static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref, - uint32x4_t *const sad_sum) { - uint8x16_t abs_diff = vabdq_u8(src, ref); - *sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1)); -} - -static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4], int h) { - uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0) }; - uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0) }; - uint32x4_t sum[4]; - - int i = 0; - do { - uint8x16_t s0, s1, s2, s3; - - s0 = vld1q_u8(src + i * src_stride); - sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); - sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); - sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); - sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); - - s1 = vld1q_u8(src + i * src_stride + 16); - sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); - sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); - sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); - sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); - - s2 = vld1q_u8(src + i * src_stride + 32); - sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]); - sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]); - sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]); - sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]); - - s3 = vld1q_u8(src + i * src_stride + 48); - sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]); - sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]); - sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]); - sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]); - - i++; - } while (i < h); - - sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); - sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); - sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); - sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); - - vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); -} - -static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4], int h) { - uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0) }; - uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0) }; - uint32x4_t sum[4]; - - int i = 0; - do { - uint8x16_t s0, s1; - - s0 = vld1q_u8(src + i * src_stride); - sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); - sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); - sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); - sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); - - s1 = vld1q_u8(src + i * src_stride + 16); - sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); - sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); - sad16_neon(s1, vld1q_u8(ref[2] + i * 
ref_stride + 16), &sum_hi[2]);
-    sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
-
-    i++;
-  } while (i < h);
-
-  sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
-  sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
-  sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
-  sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
-
-  vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
-}
-
-static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride,
-                                   const uint8_t *const ref[4], int ref_stride,
-                                   uint32_t res[4], int h) {
-  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
-                        vdupq_n_u32(0) };
-
-  int i = 0;
-  do {
-    const uint8x16_t s = vld1q_u8(src + i * src_stride);
-    sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]);
-    sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]);
-    sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]);
-    sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]);
-
-    i++;
-  } while (i < h);
-
-  vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
-}
-
-#else // !defined(__ARM_FEATURE_DOTPROD)
-
static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
                              uint16x8_t *const sad_sum) {
  uint8x16_t abs_diff = vabdq_u8(src, ref);
@@ -229,8 +115,6 @@ static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride,
  vst1q_u32(res, horizontal_add_4d_uint16x8(sum));
}
-#endif // defined(__ARM_FEATURE_DOTPROD)
-
static INLINE void sad8_neon(uint8x8_t src, uint8x8_t ref,
                             uint16x8_t *const sad_sum) {
  uint8x8_t abs_diff = vabd_u8(src, ref);
diff --git a/vpx_dsp/arm/sad4d_neon_dotprod.c b/vpx_dsp/arm/sad4d_neon_dotprod.c
new file mode 100644
index 0000000000..933fc48b8c
--- /dev/null
+++ b/vpx_dsp/arm/sad4d_neon_dotprod.c
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <string.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
+                              uint32x4_t *const sad_sum) {
+  uint8x16_t abs_diff = vabdq_u8(src, ref);
+  *sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1));
+}
+
+static INLINE void sad64xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+                                           const uint8_t *const ref[4],
+                                           int ref_stride, uint32_t res[4],
+                                           int h) {
+  uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum[4];
+
+  int i = 0;
+  do {
+    uint8x16_t s0, s1, s2, s3;
+
+    s0 = vld1q_u8(src + i * src_stride);
+    sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
+    sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
+    sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
+    sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
+
+    s1 = vld1q_u8(src + i * src_stride + 16);
+    sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
+    sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
+    sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
+    sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
+
+    s2 = vld1q_u8(src + i * src_stride + 32);
+    sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]);
+    sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]);
+    sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]);
+    sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]);
+
+    s3 = vld1q_u8(src + i * src_stride + 48);
+    sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]);
+    sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]);
+    sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]);
+    sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]);
+
+  } while (++i < h);
+
+  sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+  sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+  sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+  sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+  vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
+}
+
+static INLINE void sad32xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+                                           const uint8_t *const ref[4],
+                                           int ref_stride, uint32_t res[4],
+                                           int h) {
+  uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum[4];
+
+  int i = 0;
+  do {
+    uint8x16_t s0, s1;
+
+    s0 = vld1q_u8(src + i * src_stride);
+    sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
+    sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
+    sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
+    sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
+
+    s1 = vld1q_u8(src + i * src_stride + 16);
+    sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
+    sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
+    sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
+    sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
+
+  } while (++i < h);
+
+  sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+  sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+  sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+  sum[3] =
vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +static INLINE void sad16xhx4d_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, uint32_t res[4], + int h) { + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = 0; + do { + const uint8x16_t s = vld1q_u8(src + i * src_stride); + sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]); + sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]); + sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]); + sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]); + + } while (++i < h); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +#define SAD_WXH_4D_NEON_DOTPROD(w, h) \ + void vpx_sad##w##x##h##x4d_neon_dotprod( \ + const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], int ref_stride, \ + uint32_t sad_array[4]) { \ + sad##w##xhx4d_neon_dotprod(src_ptr, src_stride, ref_array, ref_stride, \ + sad_array, (h)); \ + } + +SAD_WXH_4D_NEON_DOTPROD(16, 8) +SAD_WXH_4D_NEON_DOTPROD(16, 16) +SAD_WXH_4D_NEON_DOTPROD(16, 32) + +SAD_WXH_4D_NEON_DOTPROD(32, 16) +SAD_WXH_4D_NEON_DOTPROD(32, 32) +SAD_WXH_4D_NEON_DOTPROD(32, 64) + +SAD_WXH_4D_NEON_DOTPROD(64, 32) +SAD_WXH_4D_NEON_DOTPROD(64, 64) + +#undef SAD_WXH_4D_NEON_DOTPROD + +#define SAD_SKIP_WXH_4D_NEON_DOTPROD(w, h) \ + void vpx_sad_skip_##w##x##h##x4d_neon_dotprod( \ + const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], int ref_stride, \ + uint32_t sad_array[4]) { \ + sad##w##xhx4d_neon_dotprod(src_ptr, 2 * src_stride, ref_array, \ + 2 * ref_stride, sad_array, ((h) >> 1)); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + +SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 8) +SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 16) +SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 32) + +SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 16) +SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 32) +SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 64) + +SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 32) +SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 64) + +#undef SAD_SKIP_WXH_4D_NEON_DOTPROD diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index d789353a1e..feb48ee73c 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -373,6 +373,7 @@ DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c DSP_SRCS-$(HAVE_MSA) += mips/sum_squares_msa.c DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/sad4d_neon_dotprod.c DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/sad_neon_dotprod.c DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 8383bdd4ca..ff97e68d31 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -928,28 +928,28 @@ () # Multi-block SAD, comparing a reference to N independent blocks # add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad64x64x4d avx512 avx2 neon neon_dotprod msa sse2 vsx mmi lsx/; add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad64x32x4d neon neon_dotprod msa sse2 vsx mmi lsx/; add_proto qw/void vpx_sad32x64x4d/, "const uint8_t 
*src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad32x64x4d neon neon_dotprod msa sse2 vsx mmi lsx/; add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad32x32x4d avx2 neon neon_dotprod msa sse2 vsx mmi lsx/; add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad32x16x4d neon msa sse2 vsx mmi/; +specialize qw/vpx_sad32x16x4d neon neon_dotprod msa sse2 vsx mmi/; add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad16x32x4d neon msa sse2 vsx mmi/; +specialize qw/vpx_sad16x32x4d neon neon_dotprod msa sse2 vsx mmi/; add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad16x16x4d neon msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad16x16x4d neon neon_dotprod msa sse2 vsx mmi lsx/; add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad16x8x4d neon msa sse2 vsx mmi/; +specialize qw/vpx_sad16x8x4d neon neon_dotprod msa sse2 vsx mmi/; add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad8x16x4d neon msa sse2 mmi/; @@ -967,28 +967,28 @@ () specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/; add_proto qw/void vpx_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_64x64x4d neon avx2 sse2/; +specialize qw/vpx_sad_skip_64x64x4d neon neon_dotprod avx2 sse2/; add_proto qw/void vpx_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_64x32x4d neon avx2 sse2/; +specialize qw/vpx_sad_skip_64x32x4d neon neon_dotprod avx2 sse2/; add_proto qw/void vpx_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_32x64x4d neon avx2 sse2/; +specialize qw/vpx_sad_skip_32x64x4d neon neon_dotprod avx2 sse2/; add_proto qw/void vpx_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_32x32x4d neon avx2 sse2/; +specialize qw/vpx_sad_skip_32x32x4d neon neon_dotprod avx2 sse2/; add_proto qw/void vpx_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_32x16x4d neon avx2 sse2/; +specialize qw/vpx_sad_skip_32x16x4d neon neon_dotprod avx2 sse2/; add_proto qw/void vpx_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_16x32x4d neon sse2/; +specialize qw/vpx_sad_skip_16x32x4d neon 
neon_dotprod sse2/;

add_proto qw/void vpx_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
-specialize qw/vpx_sad_skip_16x16x4d neon sse2/;
+specialize qw/vpx_sad_skip_16x16x4d neon neon_dotprod sse2/;

add_proto qw/void vpx_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
-specialize qw/vpx_sad_skip_16x8x4d neon sse2/;
+specialize qw/vpx_sad_skip_16x8x4d neon neon_dotprod sse2/;

add_proto qw/void vpx_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_sad_skip_8x16x4d neon sse2/;

From ad4f28abaa7ba1ab0482ab8e844d98845961be63 Mon Sep 17 00:00:00 2001
From: Jonathan Wright
Date: Sat, 19 Aug 2023 23:09:07 +0100
Subject: [PATCH 807/926] Use run-time feature detection for Neon DotProd
 variance

Arm Neon DotProd implementations of vpx_variance<w>x<h> currently need
to be enabled at compile time since they're guarded by #ifdef feature
macros. Now that run-time feature detection has been enabled for Arm
platforms, expose these implementations with distinct *neon_dotprod
names in a separate file and wire them up to the build system and
rtcd.pl. Also add new test cases for the new functions.

Remove the _neon suffix in functions making reference to
vpx_variance<w>x<h>_neon() (e.g. sub-pixel variance) - enabling use of
the appropriate *neon or *neon_dotprod version at run time.

Similar changes for the specialty variance and MSE functions will be
made in a subsequent commit.

Change-Id: I69a0ef0d622ecb2d15bd90b4ace53273a32ed22d
---
 test/variance_test.cc               |  18 +++
 vpx_dsp/arm/subpel_variance_neon.c  | 123 ++++++++--------
 vpx_dsp/arm/variance_neon.c         | 141 +------------------
 vpx_dsp/arm/variance_neon_dotprod.c | 211 ++++++++++++++++++++++++++++
 vpx_dsp/vpx_dsp.mk                  |   1 +
 vpx_dsp/vpx_dsp_rtcd_defs.pl        |  30 ++--
 6 files changed, 308 insertions(+), 216 deletions(-)
 create mode 100644 vpx_dsp/arm/variance_neon_dotprod.c

diff --git a/test/variance_test.cc b/test/variance_test.cc
index 6885252b82..5abbcb3647 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -1475,6 +1475,24 @@ INSTANTIATE_TEST_SUITE_P(
                      VarianceParams(2, 3, &vpx_variance4x8_neon),
                      VarianceParams(2, 2, &vpx_variance4x4_neon)));
+#if HAVE_NEON_DOTPROD
+INSTANTIATE_TEST_SUITE_P(
+    NEON_DOTPROD, VpxVarianceTest,
+    ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_neon_dotprod),
+                      VarianceParams(6, 5, &vpx_variance64x32_neon_dotprod),
+                      VarianceParams(5, 6, &vpx_variance32x64_neon_dotprod),
+                      VarianceParams(5, 5, &vpx_variance32x32_neon_dotprod),
+                      VarianceParams(5, 4, &vpx_variance32x16_neon_dotprod),
+                      VarianceParams(4, 5, &vpx_variance16x32_neon_dotprod),
+                      VarianceParams(4, 4, &vpx_variance16x16_neon_dotprod),
+                      VarianceParams(4, 3, &vpx_variance16x8_neon_dotprod),
+                      VarianceParams(3, 4, &vpx_variance8x16_neon_dotprod),
+                      VarianceParams(3, 3, &vpx_variance8x8_neon_dotprod),
+                      VarianceParams(3, 2, &vpx_variance8x4_neon_dotprod),
+                      VarianceParams(2, 3, &vpx_variance4x8_neon_dotprod),
+                      VarianceParams(2, 2, &vpx_variance4x4_neon_dotprod)));
+#endif // HAVE_NEON_DOTPROD
+
INSTANTIATE_TEST_SUITE_P(
    NEON, VpxSubpelVarianceTest,
    ::testing::Values(
diff --git a/vpx_dsp/arm/subpel_variance_neon.c b/vpx_dsp/arm/subpel_variance_neon.c
index 9328c3ed89..d92f1615d7 100644
--- a/vpx_dsp/arm/subpel_variance_neon.c
+++ b/vpx_dsp/arm/subpel_variance_neon.c
@@ -143,59 +143,58 @@ static
void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr, return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } -#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ - unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, unsigned int *sse) { \ - if (xoffset == 0) { \ - if (yoffset == 0) { \ - return vpx_variance##w##x##h##_neon(src, src_stride, ref, ref_stride, \ - sse); \ - } else if (yoffset == 4) { \ - uint8_t tmp[w * h]; \ - var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h); \ - return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ - } else { \ - uint8_t tmp[w * h]; \ - var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h, \ - yoffset); \ - return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ - } \ - } else if (xoffset == 4) { \ - uint8_t tmp0[w * (h + padding)]; \ - if (yoffset == 0) { \ - var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \ - return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ - } else if (yoffset == 4) { \ - uint8_t tmp1[w * (h + padding)]; \ - var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ - var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ - } else { \ - uint8_t tmp1[w * (h + padding)]; \ - var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ - var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ - } \ - } else { \ - uint8_t tmp0[w * (h + padding)]; \ - if (yoffset == 0) { \ - var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \ - return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ - } else if (yoffset == 4) { \ - uint8_t tmp1[w * h]; \ - var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ - xoffset); \ - var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ - } else { \ - uint8_t tmp1[w * h]; \ - var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ - xoffset); \ - var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ - } \ - } \ +#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse) { \ + if (xoffset == 0) { \ + if (yoffset == 0) { \ + return vpx_variance##w##x##h(src, src_stride, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp[w * h]; \ + var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h, \ + yoffset); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \ + return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ + var_filter_block2d_avg(tmp0, tmp1, w, 
w, w, h); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } \ + } else { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \ + return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } \ + } \ } // 4x blocks are processed two rows at a time, so require an extra row of @@ -418,53 +417,53 @@ static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, uint8_t tmp[w * h]; \ if (yoffset == 0) { \ avg_pred(src, tmp, source_stride, w, h, second_pred); \ - return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ avg_pred_var_filter_block2d_avg(src, tmp, source_stride, \ source_stride, w, h, second_pred); \ - return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ } else { \ avg_pred_var_filter_block2d_bil_w##w( \ src, tmp, source_stride, source_stride, h, yoffset, second_pred); \ - return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ } \ } else if (xoffset == 4) { \ uint8_t tmp0[w * (h + padding)]; \ if (yoffset == 0) { \ avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h, \ second_pred); \ - return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint8_t tmp1[w * (h + padding)]; \ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \ avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } else { \ uint8_t tmp1[w * (h + padding)]; \ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ second_pred); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } \ } else { \ uint8_t tmp0[w * (h + padding)]; \ if (yoffset == 0) { \ avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h, \ xoffset, second_pred); \ - return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint8_t tmp1[w * h]; \ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \ (h + padding), xoffset); \ avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \ - return vpx_variance##w##x##h##_neon(tmp1, w, 
ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } else { \ uint8_t tmp1[w * h]; \ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \ (h + padding), xoffset); \ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ second_pred); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } \ } \ } diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index f41249d4d5..84a6a761f9 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -19,143 +19,6 @@ #include "vpx_dsp/arm/sum_neon.h" #include "vpx_ports/mem.h" -#if defined(__ARM_FEATURE_DOTPROD) - -// Process a block of width 4 four rows at a time. -static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - int h, uint32_t *sse, int *sum) { - uint32x4_t src_sum = vdupq_n_u32(0); - uint32x4_t ref_sum = vdupq_n_u32(0); - uint32x4_t sse_u32 = vdupq_n_u32(0); - - int i = h; - do { - const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride); - - const uint8x16_t abs_diff = vabdq_u8(s, r); - sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); - - src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); - ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); - - src_ptr += 4 * src_stride; - ref_ptr += 4 * ref_stride; - i -= 4; - } while (i != 0); - - *sum = horizontal_add_int32x4( - vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); - *sse = horizontal_add_uint32x4(sse_u32); -} - -// Process a block of width 8 two rows at a time. -static INLINE void variance_8xh_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - int h, uint32_t *sse, int *sum) { - uint32x4_t src_sum = vdupq_n_u32(0); - uint32x4_t ref_sum = vdupq_n_u32(0); - uint32x4_t sse_u32 = vdupq_n_u32(0); - - int i = h; - do { - const uint8x16_t s = - vcombine_u8(vld1_u8(src_ptr), vld1_u8(src_ptr + src_stride)); - const uint8x16_t r = - vcombine_u8(vld1_u8(ref_ptr), vld1_u8(ref_ptr + ref_stride)); - - const uint8x16_t abs_diff = vabdq_u8(s, r); - sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); - - src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); - ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); - - src_ptr += 2 * src_stride; - ref_ptr += 2 * ref_stride; - i -= 2; - } while (i != 0); - - *sum = horizontal_add_int32x4( - vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); - *sse = horizontal_add_uint32x4(sse_u32); -} - -// Process a block of width 16 one row at a time. -static INLINE void variance_16xh_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - int h, uint32_t *sse, int *sum) { - uint32x4_t src_sum = vdupq_n_u32(0); - uint32x4_t ref_sum = vdupq_n_u32(0); - uint32x4_t sse_u32 = vdupq_n_u32(0); - - int i = h; - do { - const uint8x16_t s = vld1q_u8(src_ptr); - const uint8x16_t r = vld1q_u8(ref_ptr); - - const uint8x16_t abs_diff = vabdq_u8(s, r); - sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); - - src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); - ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); - - src_ptr += src_stride; - ref_ptr += ref_stride; - } while (--i != 0); - - *sum = horizontal_add_int32x4( - vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); - *sse = horizontal_add_uint32x4(sse_u32); -} - -// Process a block of any size where the width is divisible by 16. 
-static INLINE void variance_large_neon(const uint8_t *src_ptr, int src_stride,
-                                       const uint8_t *ref_ptr, int ref_stride,
-                                       int w, int h, uint32_t *sse, int *sum) {
-  uint32x4_t src_sum = vdupq_n_u32(0);
-  uint32x4_t ref_sum = vdupq_n_u32(0);
-  uint32x4_t sse_u32 = vdupq_n_u32(0);
-
-  int i = h;
-  do {
-    int j = 0;
-    do {
-      const uint8x16_t s = vld1q_u8(src_ptr + j);
-      const uint8x16_t r = vld1q_u8(ref_ptr + j);
-
-      const uint8x16_t abs_diff = vabdq_u8(s, r);
-      sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
-
-      src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
-      ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
-
-      j += 16;
-    } while (j < w);
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-  } while (--i != 0);
-
-  *sum = horizontal_add_int32x4(
-      vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum)));
-  *sse = horizontal_add_uint32x4(sse_u32);
-}
-
-static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride,
-                                      const uint8_t *ref, int ref_stride, int h,
-                                      uint32_t *sse, int *sum) {
-  variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum);
-}
-
-static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride,
-                                      const uint8_t *ref, int ref_stride, int h,
-                                      uint32_t *sse, int *sum) {
-  variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum);
-}
-
-#else  // !defined(__ARM_FEATURE_DOTPROD)
-
 // Process a block of width 4 two rows at a time.
 static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride,
                                      const uint8_t *ref_ptr, int ref_stride,
@@ -328,8 +191,6 @@ static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride,
   variance_large_neon(src, src_stride, ref, ref_stride, 64, h, 32, sse, sum);
 }
 
-#endif  // defined(__ARM_FEATURE_DOTPROD)
-
 void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride,
                         const uint8_t *ref_ptr, int ref_stride,
                         unsigned int *sse, int *sum) {
@@ -369,6 +230,8 @@ VARIANCE_WXH_NEON(32, 64, 11)
 VARIANCE_WXH_NEON(64, 32, 11)
 VARIANCE_WXH_NEON(64, 64, 12)
 
+#undef VARIANCE_WXH_NEON
+
 #if defined(__ARM_FEATURE_DOTPROD)
 
 static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr,
diff --git a/vpx_dsp/arm/variance_neon_dotprod.c b/vpx_dsp/arm/variance_neon_dotprod.c
new file mode 100644
index 0000000000..a47c355636
--- /dev/null
+++ b/vpx_dsp/arm/variance_neon_dotprod.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2021 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_ports/mem.h"
+
+// Process a block of width 4 four rows at a time.
+static INLINE void variance_4xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + src_ptr += 4 * src_stride; + ref_ptr += 4 * ref_stride; + i -= 4; + } while (i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +// Process a block of width 8 two rows at a time. +static INLINE void variance_8xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + const uint8x16_t s = + vcombine_u8(vld1_u8(src_ptr), vld1_u8(src_ptr + src_stride)); + const uint8x16_t r = + vcombine_u8(vld1_u8(ref_ptr), vld1_u8(ref_ptr + ref_stride)); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + i -= 2; + } while (i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +// Process a block of width 16 one row at a time. +static INLINE void variance_16xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + const uint8x16_t s = vld1q_u8(src_ptr); + const uint8x16_t r = vld1q_u8(ref_ptr); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +// Process a block of any size where the width is divisible by 16. 
+static INLINE void variance_large_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h, + uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + int j = 0; + do { + const uint8x16_t s = vld1q_u8(src_ptr + j); + const uint8x16_t r = vld1q_u8(ref_ptr + j); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + j += 16; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +static INLINE void variance_32xh_neon_dotprod(const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 32, h, sse, + sum); +} + +static INLINE void variance_64xh_neon_dotprod(const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 64, h, sse, + sum); +} + +void vpx_get8x8var_neon_dotprod(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_8xh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, + sum); +} + +void vpx_get16x16var_neon_dotprod(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_16xh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 16, sse, + sum); +} + +#define VARIANCE_WXH_NEON_DOTPROD(w, h, shift) \ + unsigned int vpx_variance##w##x##h##_neon_dotprod( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + variance_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h, sse, \ + &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ + } + +VARIANCE_WXH_NEON_DOTPROD(4, 4, 4) +VARIANCE_WXH_NEON_DOTPROD(4, 8, 5) + +VARIANCE_WXH_NEON_DOTPROD(8, 4, 5) +VARIANCE_WXH_NEON_DOTPROD(8, 8, 6) +VARIANCE_WXH_NEON_DOTPROD(8, 16, 7) + +VARIANCE_WXH_NEON_DOTPROD(16, 8, 7) +VARIANCE_WXH_NEON_DOTPROD(16, 16, 8) +VARIANCE_WXH_NEON_DOTPROD(16, 32, 9) + +VARIANCE_WXH_NEON_DOTPROD(32, 16, 9) +VARIANCE_WXH_NEON_DOTPROD(32, 32, 10) +VARIANCE_WXH_NEON_DOTPROD(32, 64, 11) + +VARIANCE_WXH_NEON_DOTPROD(64, 32, 11) +VARIANCE_WXH_NEON_DOTPROD(64, 64, 12) + +#undef VARIANCE_WXH_NEON_DOTPROD diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index feb48ee73c..84fd969daa 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -418,6 +418,7 @@ DSP_SRCS-yes += variance.h DSP_SRCS-$(HAVE_NEON) += arm/avg_pred_neon.c DSP_SRCS-$(HAVE_NEON) += arm/subpel_variance_neon.c DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/variance_neon_dotprod.c DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index ff97e68d31..94a821371e 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1254,52 +1254,52 @@ () # Variance # add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int src_stride, 
const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance64x64 sse2 avx2 neon msa mmi vsx lsx/; + specialize qw/vpx_variance64x64 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/; add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance64x32 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance64x32 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x64 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance32x64 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x32 sse2 avx2 neon msa mmi vsx lsx/; + specialize qw/vpx_variance32x32 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/; add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x16 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance32x16 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x32 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance16x32 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x16 sse2 avx2 neon msa mmi vsx lsx/; + specialize qw/vpx_variance16x16 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/; add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x8 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance16x8 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x16 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance8x16 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x8 sse2 avx2 neon msa mmi vsx lsx/; + specialize qw/vpx_variance8x8 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/; add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x4 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance8x4 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance4x8 sse2 neon msa mmi vsx/; + specialize qw/vpx_variance4x8 sse2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance4x4 sse2 neon msa mmi vsx/; + 
specialize qw/vpx_variance4x4 sse2 neon neon_dotprod msa mmi vsx/; # # Specialty Variance # add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_get16x16var sse2 avx2 neon msa vsx lsx/; + specialize qw/vpx_get16x16var sse2 avx2 neon neon_dotprod msa vsx lsx/; add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_get8x8var sse2 neon msa vsx/; + specialize qw/vpx_get8x8var sse2 neon neon_dotprod msa vsx/; add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi vsx lsx/; From 15d621571665d8731fd980282913d9454d75c870 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Sat, 19 Aug 2023 23:25:01 +0100 Subject: [PATCH 808/926] Use run-time feature detection for Neon DotProd specialty var. Enable Arm Neon DotProd implementations of vpx_get_var_sse_sum* specialty variance functions via run-time feature detection, wiring up the new *neon_dotprod names to rtcd.pl. Also add new test cases. Change-Id: I04ac3db87d32ee7f94702b6c0360254e5688f713 --- test/variance_test.cc | 11 ++++ vpx_dsp/arm/variance_neon.c | 79 +------------------------- vpx_dsp/arm/variance_neon_dotprod.c | 87 +++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 10 ++-- 4 files changed, 105 insertions(+), 82 deletions(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index 5abbcb3647..c32c919760 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1476,6 +1476,17 @@ INSTANTIATE_TEST_SUITE_P( VarianceParams(2, 2, &vpx_variance4x4_neon))); #if HAVE_NEON_DOTPROD +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, VpxSseTest, + ::testing::Values(SseParams(2, 2, &vpx_get4x4sse_cs_neon_dotprod))); + +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_neon_dotprod), + MseParams(4, 3, &vpx_mse16x8_neon_dotprod), + MseParams(3, 4, &vpx_mse8x16_neon_dotprod), + MseParams(3, 3, &vpx_mse8x8_neon_dotprod))); + INSTANTIATE_TEST_SUITE_P( NEON_DOTPROD, VpxVarianceTest, ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_neon_dotprod), diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index 84a6a761f9..efb2c1d8da 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -232,81 +232,6 @@ VARIANCE_WXH_NEON(64, 64, 12) #undef VARIANCE_WXH_NEON -#if defined(__ARM_FEATURE_DOTPROD) - -static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr, - int src_stride, - const unsigned char *ref_ptr, - int ref_stride, int h) { - uint32x2_t sse_u32[2] = { vdup_n_u32(0), vdup_n_u32(0) }; - - int i = h / 2; - do { - uint8x8_t s0, s1, r0, r1, diff0, diff1; - - s0 = vld1_u8(src_ptr); - src_ptr += src_stride; - s1 = vld1_u8(src_ptr); - src_ptr += src_stride; - r0 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - r1 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - - diff0 = vabd_u8(s0, r0); - diff1 = vabd_u8(s1, r1); - - sse_u32[0] = vdot_u32(sse_u32[0], diff0, diff0); - sse_u32[1] = vdot_u32(sse_u32[1], diff1, diff1); - } while (--i != 0); - - return horizontal_add_uint32x2(vadd_u32(sse_u32[0], sse_u32[1])); -} - -static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr, - int src_stride, - const unsigned char *ref_ptr, - int ref_stride, int h) { - uint32x4_t sse_u32[2] 
= { vdupq_n_u32(0), vdupq_n_u32(0) }; - - int i = h / 2; - do { - uint8x16_t s0, s1, r0, r1, diff0, diff1; - - s0 = vld1q_u8(src_ptr); - src_ptr += src_stride; - s1 = vld1q_u8(src_ptr); - src_ptr += src_stride; - r0 = vld1q_u8(ref_ptr); - ref_ptr += ref_stride; - r1 = vld1q_u8(ref_ptr); - ref_ptr += ref_stride; - - diff0 = vabdq_u8(s0, r0); - diff1 = vabdq_u8(s1, r1); - - sse_u32[0] = vdotq_u32(sse_u32[0], diff0, diff0); - sse_u32[1] = vdotq_u32(sse_u32[1], diff1, diff1); - } while (--i != 0); - - return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); -} - -unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, - const unsigned char *ref_ptr, - int ref_stride) { - uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride); - uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride); - - uint8x16_t abs_diff = vabdq_u8(s, r); - - uint32x4_t sse = vdotq_u32(vdupq_n_u32(0), abs_diff, abs_diff); - - return horizontal_add_uint32x4(sse); -} - -#else // !defined(__ARM_FEATURE_DOTPROD) - static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, @@ -391,8 +316,6 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, return horizontal_add_uint32x4(sse); } -#endif // defined(__ARM_FEATURE_DOTPROD) - #define VPX_MSE_WXH_NEON(w, h) \ unsigned int vpx_mse##w##x##h##_neon( \ const unsigned char *src_ptr, int src_stride, \ @@ -405,3 +328,5 @@ VPX_MSE_WXH_NEON(8, 8) VPX_MSE_WXH_NEON(8, 16) VPX_MSE_WXH_NEON(16, 8) VPX_MSE_WXH_NEON(16, 16) + +#undef VPX_MSE_WXH_NEON diff --git a/vpx_dsp/arm/variance_neon_dotprod.c b/vpx_dsp/arm/variance_neon_dotprod.c index a47c355636..ab843e9fca 100644 --- a/vpx_dsp/arm/variance_neon_dotprod.c +++ b/vpx_dsp/arm/variance_neon_dotprod.c @@ -209,3 +209,90 @@ VARIANCE_WXH_NEON_DOTPROD(64, 32, 11) VARIANCE_WXH_NEON_DOTPROD(64, 64, 12) #undef VARIANCE_WXH_NEON_DOTPROD + +static INLINE unsigned int vpx_mse8xh_neon_dotprod(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, int h) { + uint32x2_t sse_u32[2] = { vdup_n_u32(0), vdup_n_u32(0) }; + + int i = h / 2; + do { + uint8x8_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1_u8(src_ptr); + src_ptr += src_stride; + s1 = vld1_u8(src_ptr); + src_ptr += src_stride; + r0 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + r1 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + + diff0 = vabd_u8(s0, r0); + diff1 = vabd_u8(s1, r1); + + sse_u32[0] = vdot_u32(sse_u32[0], diff0, diff0); + sse_u32[1] = vdot_u32(sse_u32[1], diff1, diff1); + } while (--i != 0); + + return horizontal_add_uint32x2(vadd_u32(sse_u32[0], sse_u32[1])); +} + +static INLINE unsigned int vpx_mse16xh_neon_dotprod( + const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, + int ref_stride, int h) { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h / 2; + do { + uint8x16_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + s1 = vld1q_u8(src_ptr); + src_ptr += src_stride; + r0 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + r1 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + + diff0 = vabdq_u8(s0, r0); + diff1 = vabdq_u8(s1, r1); + + sse_u32[0] = vdotq_u32(sse_u32[0], diff0, diff0); + sse_u32[1] = vdotq_u32(sse_u32[1], diff1, diff1); + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); +} + +unsigned int vpx_get4x4sse_cs_neon_dotprod(const unsigned char *src_ptr, + int src_stride, + const unsigned char 
*ref_ptr, + int ref_stride) { + uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride); + uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride); + + uint8x16_t abs_diff = vabdq_u8(s, r); + + uint32x4_t sse = vdotq_u32(vdupq_n_u32(0), abs_diff, abs_diff); + + return horizontal_add_uint32x4(sse); +} + +#define VPX_MSE_WXH_NEON_DOTPROD(w, h) \ + unsigned int vpx_mse##w##x##h##_neon_dotprod( \ + const unsigned char *src_ptr, int src_stride, \ + const unsigned char *ref_ptr, int ref_stride, unsigned int *sse) { \ + *sse = vpx_mse##w##xh_neon_dotprod(src_ptr, src_stride, ref_ptr, \ + ref_stride, h); \ + return *sse; \ + } + +VPX_MSE_WXH_NEON_DOTPROD(8, 8) +VPX_MSE_WXH_NEON_DOTPROD(8, 16) +VPX_MSE_WXH_NEON_DOTPROD(16, 8) +VPX_MSE_WXH_NEON_DOTPROD(16, 16) + +#undef VPX_MSE_WXH_NEON_DOTPROD diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 94a821371e..c9cdc285f2 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1302,22 +1302,22 @@ () specialize qw/vpx_get8x8var sse2 neon neon_dotprod msa vsx/; add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi vsx lsx/; + specialize qw/vpx_mse16x16 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/; add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_mse16x8 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_mse16x8 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_mse8x16 sse2 neon msa mmi vsx/; + specialize qw/vpx_mse8x16 sse2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_mse8x8 sse2 neon msa mmi vsx/; + specialize qw/vpx_mse8x8 sse2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *"; specialize qw/vpx_get_mb_ss sse2 msa vsx/; add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride"; - specialize qw/vpx_get4x4sse_cs neon msa vsx/; + specialize qw/vpx_get4x4sse_cs neon neon_dotprod msa vsx/; add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; specialize qw/vpx_comp_avg_pred neon sse2 avx2 vsx lsx/; From 1a1f50a89d2ba6890024464742bf5a01e034fb45 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Sat, 19 Aug 2023 23:41:09 +0100 Subject: [PATCH 809/926] Use run-time feature detection for Neon DotProd HBD MSE Arm Neon DotProd implementations of vpx_highbd_8_msex currently need to be enabled at compile time since they're guarded by #ifdef feature macros. Now that run-time feature detection has been enabled for Arm platforms, expose these implementations with distinct *neon_dotprod names in a separate file and wire them up to the build system and rtcd.pl. Also add new test cases for the new functions. 
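For reference, the run-time dispatch this wires up ends up looking roughly
like the sketch below (simplified; the real glue is generated from
vpx_dsp_rtcd_defs.pl, and the setup function shown here is illustrative
rather than quoted from the generated header):

    // Function pointer assigned once at init time, best variant last.
    unsigned int (*vpx_highbd_8_mse16x16)(const uint8_t *src_ptr,
                                          int src_stride,
                                          const uint8_t *ref_ptr,
                                          int ref_stride, unsigned int *sse);

    static void setup_rtcd_internal(void) {
      int flags = arm_cpu_caps();  // run-time CPU feature query
      vpx_highbd_8_mse16x16 = vpx_highbd_8_mse16x16_c;
      if (flags & HAS_NEON) vpx_highbd_8_mse16x16 = vpx_highbd_8_mse16x16_neon;
      if (flags & HAS_NEON_DOTPROD)
        vpx_highbd_8_mse16x16 = vpx_highbd_8_mse16x16_neon_dotprod;
    }

This replaces the old compile-time #if defined(__ARM_FEATURE_DOTPROD)
selection, so a single binary can carry both paths and use DotProd only on
capable CPUs.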
Change-Id: I26be6fb587258c8fa9fbf03509b7602358a001a8 --- test/variance_test.cc | 10 +++ vpx_dsp/arm/highbd_variance_neon.c | 67 +-------------- vpx_dsp/arm/highbd_variance_neon_dotprod.c | 96 ++++++++++++++++++++++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 +- 5 files changed, 113 insertions(+), 69 deletions(-) create mode 100644 vpx_dsp/arm/highbd_variance_neon_dotprod.c diff --git a/test/variance_test.cc b/test/variance_test.cc index c32c919760..e231df0c6b 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1555,6 +1555,16 @@ INSTANTIATE_TEST_SUITE_P( MseParams(3, 4, &vpx_highbd_8_mse8x16_neon, VPX_BITS_8), MseParams(3, 3, &vpx_highbd_8_mse8x8_neon, VPX_BITS_8))); +#if HAVE_NEON_DOTPROD +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, VpxHBDMseTest, + ::testing::Values( + MseParams(4, 4, &vpx_highbd_8_mse16x16_neon_dotprod, VPX_BITS_8), + MseParams(4, 3, &vpx_highbd_8_mse16x8_neon_dotprod, VPX_BITS_8), + MseParams(3, 4, &vpx_highbd_8_mse8x16_neon_dotprod, VPX_BITS_8), + MseParams(3, 3, &vpx_highbd_8_mse8x8_neon_dotprod, VPX_BITS_8))); +#endif // HAVE_NEON_DOTPROD + INSTANTIATE_TEST_SUITE_P( NEON, VpxHBDVarianceTest, ::testing::Values( diff --git a/vpx_dsp/arm/highbd_variance_neon.c b/vpx_dsp/arm/highbd_variance_neon.c index e361f6f6f1..309ae7fd35 100644 --- a/vpx_dsp/arm/highbd_variance_neon.c +++ b/vpx_dsp/arm/highbd_variance_neon.c @@ -384,69 +384,6 @@ static INLINE uint32_t highbd_mse_wxh_neon(const uint16_t *src_ptr, return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); } -#if defined(__ARM_FEATURE_DOTPROD) - -static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr, - int src_stride, - const uint16_t *ref_ptr, - int ref_stride, int h) { - uint32x4_t sse_u32 = vdupq_n_u32(0); - - int i = h / 2; - do { - uint16x8_t s0, s1, r0, r1; - uint8x16_t s, r, diff; - - s0 = vld1q_u16(src_ptr); - src_ptr += src_stride; - s1 = vld1q_u16(src_ptr); - src_ptr += src_stride; - r0 = vld1q_u16(ref_ptr); - ref_ptr += ref_stride; - r1 = vld1q_u16(ref_ptr); - ref_ptr += ref_stride; - - s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1)); - r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1)); - - diff = vabdq_u8(s, r); - sse_u32 = vdotq_u32(sse_u32, diff, diff); - } while (--i != 0); - - return horizontal_add_uint32x4(sse_u32); -} - -static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr, - int src_stride, - const uint16_t *ref_ptr, - int ref_stride, int h) { - uint32x4_t sse_u32 = vdupq_n_u32(0); - - int i = h; - do { - uint16x8_t s0, s1, r0, r1; - uint8x16_t s, r, diff; - - s0 = vld1q_u16(src_ptr); - s1 = vld1q_u16(src_ptr + 8); - r0 = vld1q_u16(ref_ptr); - r1 = vld1q_u16(ref_ptr + 8); - - s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1)); - r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1)); - - diff = vabdq_u8(s, r); - sse_u32 = vdotq_u32(sse_u32, diff, diff); - - src_ptr += src_stride; - ref_ptr += ref_stride; - } while (--i != 0); - - return horizontal_add_uint32x4(sse_u32); -} - -#else // !defined(__ARM_FEATURE_DOTPROD) - static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, @@ -461,8 +398,6 @@ static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr, return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, h); } -#endif // defined(__ARM_FEATURE_DOTPROD) - #define HIGHBD_MSE_WXH_NEON(w, h) \ uint32_t vpx_highbd_8_mse##w##x##h##_neon( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ @@ -497,3 +432,5 @@ HIGHBD_MSE_WXH_NEON(16, 16) HIGHBD_MSE_WXH_NEON(16, 8) 
 HIGHBD_MSE_WXH_NEON(8, 16)
 HIGHBD_MSE_WXH_NEON(8, 8)
+
+#undef HIGHBD_MSE_WXH_NEON
diff --git a/vpx_dsp/arm/highbd_variance_neon_dotprod.c b/vpx_dsp/arm/highbd_variance_neon_dotprod.c
new file mode 100644
index 0000000000..1a88720172
--- /dev/null
+++ b/vpx_dsp/arm/highbd_variance_neon_dotprod.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_ports/mem.h"
+
+static INLINE uint32_t highbd_mse8_8xh_neon_dotprod(const uint16_t *src_ptr,
+                                                    int src_stride,
+                                                    const uint16_t *ref_ptr,
+                                                    int ref_stride, int h) {
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h / 2;
+  do {
+    uint16x8_t s0, s1, r0, r1;
+    uint8x16_t s, r, diff;
+
+    s0 = vld1q_u16(src_ptr);
+    src_ptr += src_stride;
+    s1 = vld1q_u16(src_ptr);
+    src_ptr += src_stride;
+    r0 = vld1q_u16(ref_ptr);
+    ref_ptr += ref_stride;
+    r1 = vld1q_u16(ref_ptr);
+    ref_ptr += ref_stride;
+
+    s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
+    r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
+
+    diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, diff, diff);
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(sse_u32);
+}
+
+static INLINE uint32_t highbd_mse8_16xh_neon_dotprod(const uint16_t *src_ptr,
+                                                     int src_stride,
+                                                     const uint16_t *ref_ptr,
+                                                     int ref_stride, int h) {
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    uint16x8_t s0, s1, r0, r1;
+    uint8x16_t s, r, diff;
+
+    s0 = vld1q_u16(src_ptr);
+    s1 = vld1q_u16(src_ptr + 8);
+    r0 = vld1q_u16(ref_ptr);
+    r1 = vld1q_u16(ref_ptr + 8);
+
+    s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
+    r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
+
+    diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, diff, diff);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(sse_u32);
+}
+
+#define HIGHBD_MSE_WXH_NEON_DOTPROD(w, h)                                      \
+  uint32_t vpx_highbd_8_mse##w##x##h##_neon_dotprod(                           \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,          \
+      int ref_stride, uint32_t *sse) {                                         \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                              \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                              \
+    *sse =                                                                     \
+        highbd_mse8_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h); \
+    return *sse;                                                               \
+  }
+
+HIGHBD_MSE_WXH_NEON_DOTPROD(16, 16)
+HIGHBD_MSE_WXH_NEON_DOTPROD(16, 8)
+HIGHBD_MSE_WXH_NEON_DOTPROD(8, 16)
+HIGHBD_MSE_WXH_NEON_DOTPROD(8, 8)
+
+#undef HIGHBD_MSE_WXH_NEON_DOTPROD
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 84fd969daa..5343088d1b 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -448,6 +448,7 @@ DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm
 DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_pred_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c
+DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/highbd_variance_neon_dotprod.c
 DSP_SRCS-$(HAVE_NEON) += arm/highbd_subpel_variance_neon.c
 endif  # CONFIG_VP9_HIGHBITDEPTH
 endif  # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
diff --git 
a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index c9cdc285f2..1012df11ec 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1534,14 +1534,14 @@ ()
   specialize qw/vpx_highbd_12_get8x8var sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_mse16x16 sse2 neon/;
+  specialize qw/vpx_highbd_8_mse16x16 sse2 neon neon_dotprod/;
 
   add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_mse16x8 neon/;
+  specialize qw/vpx_highbd_8_mse16x8 neon neon_dotprod/;
 
   add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_mse8x16 neon/;
+  specialize qw/vpx_highbd_8_mse8x16 neon neon_dotprod/;
 
   add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_mse8x8 sse2 neon/;
+  specialize qw/vpx_highbd_8_mse8x8 sse2 neon neon_dotprod/;
 
   add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vpx_highbd_10_mse16x16 sse2 neon/;

From d549cb74b9fc3f16aa3ed9f59459c791765cd723 Mon Sep 17 00:00:00 2001
From: Jerome Jiang
Date: Tue, 12 Sep 2023 15:28:06 -0400
Subject: [PATCH 810/926] Add missing headers for clang-tidy warnings

Change-Id: I97edec8ecffdcc79b8f3528deb60a3a0332ea0cc
---
 tools_common.c                           | 2 ++
 vp9/encoder/arm/neon/vp9_quantize_neon.c | 1 +
 vp9/encoder/vp9_encoder.c                | 2 ++
 3 files changed, 5 insertions(+)

diff --git a/tools_common.c b/tools_common.c
index 0fcab2cf29..5c13781513 100644
--- a/tools_common.c
+++ b/tools_common.c
@@ -24,6 +24,8 @@
 #include "vpx/vp8dx.h"
 #endif
 
+#include "vpx/vpx_codec.h"
+
 #if defined(_WIN32) || defined(__OS2__)
 #include <io.h>
 #include <fcntl.h>
diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c
index 968cdc6d11..96d0614367 100644
--- a/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -11,6 +11,7 @@
 #include <arm_neon.h>
 #include <assert.h>
 #include <math.h>
+#include <stdint.h>
 
 #include "./vpx_config.h"
 #include "vpx_mem/vpx_mem.h"
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 869d557dd3..69a4e3c314 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -81,6 +81,8 @@
 #include "vp9/encoder/vp9_tpl_model.h"
 #include "vp9/vp9_cx_iface.h"
 
+#include "vpx/vpx_ext_ratectrl.h"
+
 #define AM_SEGMENT_ID_INACTIVE 7
 #define AM_SEGMENT_ID_ACTIVE 0
 
From 391bb5604b85195468e73d576766252f6ce8e427 Mon Sep 17 00:00:00 2001
From: yuanhecai
Date: Sat, 15 Apr 2023 11:11:16 +0800
Subject: [PATCH 811/926] loongarch: simplify vpx_quantize_b/b_32x32_lsx args

Bug: webm:1755
Change-Id: I42fdb1c34f959dd1204b343b8192e3d9b49821b4
---
 test/vp9_quantize_test.cc        | 14 +++++-----
 vpx_dsp/loongarch/quantize_lsx.c | 46 +++++++++++++++-----------------
 2 files changed, 28 insertions(+), 32 deletions(-)

diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc
index f6984bd6fa..e00ab4022c 100644
--- a/test/vp9_quantize_test.cc
+++ b/test/vp9_quantize_test.cc
@@ -695,13 +695,13 @@ INSTANTIATE_TEST_SUITE_P(
 #endif  // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH
 
 #if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_SUITE_P(LSX, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_lsx, - &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_lsx, - &vpx_quantize_b_32x32_c, - VPX_BITS_8, 32, false))); +INSTANTIATE_TEST_SUITE_P( + LSX, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_lsx, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false))); #endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH // Only useful to compare "Speed" test results. diff --git a/vpx_dsp/loongarch/quantize_lsx.c b/vpx_dsp/loongarch/quantize_lsx.c index 77be0bb4fe..9bb1691e2e 100644 --- a/vpx_dsp/loongarch/quantize_lsx.c +++ b/vpx_dsp/loongarch/quantize_lsx.c @@ -11,6 +11,8 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_util/loongson_intrinsics.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" static INLINE __m128i calculate_qcoeff(__m128i coeff, __m128i coeff_abs, __m128i round, __m128i quant, @@ -88,15 +90,15 @@ static INLINE int16_t accumulate_eob(__m128i eob) { } #if !CONFIG_VP9_HIGHBITDEPTH + void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, - int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { __m128i zero = __lsx_vldi(0); int index = 16; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, quant_shift; __m128i coeff0, coeff1; @@ -104,13 +106,11 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, __m128i cmp_mask0, cmp_mask1; __m128i eob, eob0; - (void)scan; - - zbin = __lsx_vld(zbin_ptr, 0); - round = __lsx_vld(round_ptr, 0); - quant = __lsx_vld(quant_ptr, 0); + zbin = __lsx_vld(mb_plane->zbin, 0); + round = __lsx_vld(mb_plane->round, 0); + quant = __lsx_vld(mb_plane->quant, 0); dequant = __lsx_vld(dequant_ptr, 0); - quant_shift = __lsx_vld(quant_shift_ptr, 0); + quant_shift = __lsx_vld(mb_plane->quant_shift, 0); // Handle one DC and first 15 AC. 
DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1); qcoeff0 = __lsx_vabsd_h(coeff0, zero); @@ -167,31 +167,27 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob(eob); } -void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, +void vpx_quantize_b_32x32_lsx(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { __m128i zero = __lsx_vldi(0); int index; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, quant_shift; __m128i coeff0, coeff1, qcoeff0, qcoeff1, cmp_mask0, cmp_mask1; __m128i eob = zero, eob0; - (void)scan; - (void)n_coeffs; - - zbin = __lsx_vld(zbin_ptr, 0); + zbin = __lsx_vld(mb_plane->zbin, 0); zbin = __lsx_vsrari_h(zbin, 1); - round = __lsx_vld(round_ptr, 0); + round = __lsx_vld(mb_plane->round, 0); round = __lsx_vsrari_h(round, 1); - quant = __lsx_vld(quant_ptr, 0); + quant = __lsx_vld(mb_plane->quant, 0); dequant = __lsx_vld(dequant_ptr, 0); - quant_shift = __lsx_vld(quant_shift_ptr, 0); + quant_shift = __lsx_vld(mb_plane->quant_shift, 0); quant_shift = __lsx_vslli_h(quant_shift, 1); // Handle one DC and first 15 AC. DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1); From eb232b662aeb9ccc76b6dde3f50c3f806b7b58fa Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Tue, 12 Sep 2023 10:30:38 +0800 Subject: [PATCH 812/926] loongarch: Fix bugs from vp8_sixtap_predict4x4/16x16_lsx Bug: webm:1755 Change-Id: I7295e0f9a1551b8a418d5b65a2b7351df1fdc063 --- test/predict_test.cc | 8 ++++++++ test/test_intra_pred_speed.cc | 9 +++++++++ test/vp9_intrapred_test.cc | 9 +++++++++ vp8/common/loongarch/sixtap_filter_lsx.c | 23 ++++++++++++----------- 4 files changed, 38 insertions(+), 11 deletions(-) diff --git a/test/predict_test.cc b/test/predict_test.cc index fbf42077b3..474eab2cb5 100644 --- a/test/predict_test.cc +++ b/test/predict_test.cc @@ -350,6 +350,14 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(4, 4, &vp8_sixtap_predict4x4_mmi))); #endif +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P( + LSX, SixtapPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_lsx), + make_tuple(8, 8, &vp8_sixtap_predict8x8_lsx), + make_tuple(4, 4, &vp8_sixtap_predict4x4_lsx))); +#endif + class BilinearPredictTest : public PredictTestBase {}; TEST_P(BilinearPredictTest, TestWithRandomData) { diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index 15303816b9..b013e0bd5d 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -348,6 +348,15 @@ INTRA_PRED_TEST(VSX, TestIntraPred32, vpx_dc_predictor_32x32_vsx, vpx_tm_predictor_32x32_vsx) #endif // HAVE_VSX +#if HAVE_LSX +INTRA_PRED_TEST(LSX, TestIntraPred8, vpx_dc_predictor_8x8_lsx, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr) +INTRA_PRED_TEST(LSX, TestIntraPred16, vpx_dc_predictor_16x16_lsx, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr) +#endif // HAVE_LSX + // ----------------------------------------------------------------------------- #if CONFIG_VP9_HIGHBITDEPTH diff --git 
a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index daaf768699..c69d43efbc 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -489,6 +489,15 @@ INSTANTIATE_TEST_SUITE_P( &vpx_v_predictor_32x32_c, 32, 8))); #endif // HAVE_VSX +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P( + LSX, VP9IntraPredTest, + ::testing::Values(IntraPredParam(&vpx_dc_predictor_8x8_lsx, + &vpx_dc_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_predictor_16x16_lsx, + &vpx_dc_predictor_16x16_c, 16, 8))); +#endif // HAVE_LSX + #if CONFIG_VP9_HIGHBITDEPTH typedef void (*HighbdIntraPred)(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, diff --git a/vp8/common/loongarch/sixtap_filter_lsx.c b/vp8/common/loongarch/sixtap_filter_lsx.c index cd7ba54746..9867633415 100644 --- a/vp8/common/loongarch/sixtap_filter_lsx.c +++ b/vp8/common/loongarch/sixtap_filter_lsx.c @@ -1706,21 +1706,22 @@ void vp8_sixtap_predict4x4_lsx(uint8_t *RESTRICT src, int32_t src_stride, switch (xoffset) { case 0: { __m128i tp0; - tp0 = __lsx_vinsgr2vr_w(tp0, src, 0); - src += src_stride; - tp0 = __lsx_vinsgr2vr_w(tp0, src, 0); - src += src_stride; - tp0 = __lsx_vinsgr2vr_w(tp0, src, 0); - src += src_stride; - tp0 = __lsx_vinsgr2vr_w(tp0, src, 0); + tp0 = __lsx_vldrepl_w(src, 0); + src += src_stride; __lsx_vstelm_w(tp0, dst, 0, 0); dst += dst_stride; - __lsx_vstelm_w(tp0, dst, 0, 1); + tp0 = __lsx_vldrepl_w(src, 0); + src += src_stride; + __lsx_vstelm_w(tp0, dst, 0, 0); dst += dst_stride; - __lsx_vstelm_w(tp0, dst, 0, 2); + tp0 = __lsx_vldrepl_w(src, 0); + src += src_stride; + __lsx_vstelm_w(tp0, dst, 0, 0); dst += dst_stride; - __lsx_vstelm_w(tp0, dst, 0, 3); + tp0 = __lsx_vldrepl_w(src, 0); + __lsx_vstelm_w(tp0, dst, 0, 0); + break; } case 2: @@ -1865,7 +1866,7 @@ void vp8_sixtap_predict16x16_lsx(uint8_t *RESTRICT src, int32_t src_stride, case 1: Predict16x16Funcs1[3](src, src_stride, dst, dst_stride, - h_filter, v_filter + 1, 16); + h_filter + 1, v_filter + 1, 16); break; } break; From 9c2e33ff74a78a491c765478505e04c3c7be6849 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 14 Sep 2023 11:59:57 -0400 Subject: [PATCH 813/926] Set frame width height for 1st TPL GOP frame Change-Id: Ic92dfd232bf90e8cbe6c6233af523ed40d12097a --- vp9/encoder/vp9_tpl_model.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index 02318070c2..e98ee459f0 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -175,7 +175,8 @@ static void free_tpl_frame_stats_list(VpxTplGopStats *tpl_gop_stats) { static void init_tpl_stats_before_propagation( struct vpx_internal_error_info *error_info, VpxTplGopStats *tpl_gop_stats, - TplDepFrame *tpl_stats, int tpl_gop_frames) { + TplDepFrame *tpl_stats, int tpl_gop_frames, int frame_width, + int frame_height) { int frame_idx; free_tpl_frame_stats_list(tpl_gop_stats); CHECK_MEM_ERROR( @@ -192,6 +193,8 @@ static void init_tpl_stats_before_propagation( sizeof( *tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list))); tpl_gop_stats->frame_stats_list[frame_idx].num_blocks = mi_rows * mi_cols; + tpl_gop_stats->frame_stats_list[frame_idx].frame_width = frame_width; + tpl_gop_stats->frame_stats_list[frame_idx].frame_height = frame_height; } } @@ -1497,7 +1500,8 @@ void vp9_setup_tpl_stats(VP9_COMP *cpi) { init_tpl_stats(cpi); init_tpl_stats_before_propagation(&cpi->common.error, &cpi->tpl_gop_stats, - cpi->tpl_stats, tpl_group_frames); + cpi->tpl_stats, tpl_group_frames, + 
cpi->common.width, cpi->common.height); // Backward propagation from tpl_group_frames to 1. for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx) { From 8e61b3cd1bdd6db56cbd040fae8f7c318180f2a6 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 14 Sep 2023 13:27:39 -0400 Subject: [PATCH 814/926] Fix ref frame buffer in TPL stats for RC The original ref frame index was the index in the GF group; RC expects the index to be the one for ref frame buffer. Change-Id: I9a2b0e72b6332023fb2e8da131b557f82db02e39 --- vp9/encoder/vp9_tpl_model.c | 13 ++++++++----- vpx/vpx_tpl.h | 2 +- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index e98ee459f0..b8910370e0 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -393,7 +393,7 @@ static void tpl_store_before_propagation(VpxTplBlockStats *tpl_block_stats, TplDepStats *tpl_stats, int mi_row, int mi_col, BLOCK_SIZE bsize, int stride, int64_t recon_error, - int64_t rate_cost) { + int64_t rate_cost, int ref_frame_idx) { const int mi_height = num_8x8_blocks_high_lookup[bsize]; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col]; @@ -411,7 +411,7 @@ static void tpl_store_before_propagation(VpxTplBlockStats *tpl_block_stats, tpl_block_stats_ptr->recrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2; tpl_block_stats_ptr->mv_r = src_stats->mv.as_mv.row; tpl_block_stats_ptr->mv_c = src_stats->mv.as_mv.col; - tpl_block_stats_ptr->ref_frame_index = src_stats->ref_frame_index; + tpl_block_stats_ptr->ref_frame_index = ref_frame_idx; } } } @@ -576,7 +576,7 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size, YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor, int64_t *recon_error, int64_t *rate_cost, - int64_t *sse) { + int64_t *sse, int *ref_frame_idx) { VP9_COMMON *cm = &cpi->common; ThreadData *td = &cpi->td; @@ -723,6 +723,7 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, 1, (best_intra_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx]; tpl_stats->mv.as_int = best_mv.as_int; + *ref_frame_idx = best_rf_idx; } #if CONFIG_NON_GREEDY_MV @@ -1232,10 +1233,12 @@ static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, int64_t recon_error = 0; int64_t rate_cost = 0; int64_t sse = 0; + // Ref frame index in the ref frame buffer. + int ref_frame_idx = -1; mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame, src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize, tx_size, ref_frame, predictor, &recon_error, &rate_cost, - &sse); + &sse, &ref_frame_idx); // Motion flow dependency dispenser. 
tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride); @@ -1243,7 +1246,7 @@ static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, tpl_store_before_propagation( tpl_frame_stats_before_propagation->block_stats_list, tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride, - recon_error, rate_cost); + recon_error, rate_cost, ref_frame_idx); tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize); diff --git a/vpx/vpx_tpl.h b/vpx/vpx_tpl.h index 3828eb8f29..a250aada60 100644 --- a/vpx/vpx_tpl.h +++ b/vpx/vpx_tpl.h @@ -44,7 +44,7 @@ typedef struct VpxTplBlockStats { int16_t mv_c; /**< Motion vector col */ int64_t recrf_rate; /**< Rate from reconstructed ref frame */ int64_t recrf_dist; /**< Distortion from reconstructed ref frame */ - int ref_frame_index; /**< Ref frame index */ + int ref_frame_index; /**< Ref frame index in the ref frame buffer */ } VpxTplBlockStats; /*!\brief Temporal dependency model stats for each frame before propagation */ From 8f8e7414684e97ea9b94710ac7853565c8a11c3a Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 14 Sep 2023 16:04:19 -0400 Subject: [PATCH 815/926] Add max/min_gf_interval to vpx_rc_config_t Bug: b/300499738 Change-Id: Id32cb5e3ce667539c0d1efe1ff5fcc7a49e35329 --- vp9/vp9_cx_iface.c | 3 ++- vpx/vpx_ext_ratectrl.h | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 06c0ac1bfb..b1dfe992cf 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1960,7 +1960,8 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, ratectrl_config.frame_width = frame_info->frame_width; ratectrl_config.frame_height = frame_info->frame_height; ratectrl_config.show_frame_count = cpi->twopass.first_pass_info.num_frames; - + ratectrl_config.max_gf_interval = oxcf->max_gf_interval; + ratectrl_config.min_gf_interval = oxcf->min_gf_interval; // TODO(angiebird): Double check whether this is the proper way to set up // target_bitrate and frame_rate. ratectrl_config.target_bitrate_kbps = (int)(oxcf->target_bandwidth / 1000); diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index ef96be6fff..46d290dff4 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -307,6 +307,8 @@ typedef struct vpx_rc_config { int frame_width; /**< frame width */ int frame_height; /**< frame height */ int show_frame_count; /**< number of visible frames in the video */ + int max_gf_interval; /**< max GOP size in number of show frames */ + int min_gf_interval; /**< min GOP size in number of show frames */ /*! * Target bitrate in kilobytes per second */ From ad3301e6a3f70608b17fa8b61527a3bd2c711bbd Mon Sep 17 00:00:00 2001 From: Martin Storsjo Date: Fri, 15 Sep 2023 12:29:32 +0300 Subject: [PATCH 816/926] aarch64: Generalize Windows cpu detection to any Windows variant This cpu detection implementation doesn't do anything MSVC specific, it just calls the IsProcessorFeaturePresent function. This can be compiled with mingw compilers just as well. 
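For illustration, the detection routine being generalized follows this
pattern (a sketch assuming the DotProd bit is the feature of interest; the
real file checks the full set of features libvpx cares about):

    #include <windows.h>

    static int arm_get_cpu_caps(void) {
      int flags = 0;
      // IsProcessorFeaturePresent() is plain Win32 API (kernel32), so it is
      // available to mingw toolchains just as it is to MSVC.
      if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) {
        flags |= HAS_NEON_DOTPROD;
      }
      return flags;
    }

Hence guarding the implementation with _WIN32 rather than _MSC_VER is the
natural condition.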
Change-Id: I55e607a47c8f5b70d9f707ef96b2fa7553f2f79f
---
 vpx_ports/aarch64_cpudetect.c | 2 +-
 vpx_ports/arm_cpudetect.h     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vpx_ports/aarch64_cpudetect.c b/vpx_ports/aarch64_cpudetect.c
index a3054ad717..dad081c8ce 100644
--- a/vpx_ports/aarch64_cpudetect.c
+++ b/vpx_ports/aarch64_cpudetect.c
@@ -58,7 +58,7 @@ static int arm_get_cpu_caps(void) {
   return flags;
 }
 
-#elif defined(_MSC_VER)  // end __APPLE__
+#elif defined(_WIN32)  // end __APPLE__
 
 static int arm_get_cpu_caps(void) {
   int flags = 0;
diff --git a/vpx_ports/arm_cpudetect.h b/vpx_ports/arm_cpudetect.h
index 24095d1acf..881397abc2 100644
--- a/vpx_ports/arm_cpudetect.h
+++ b/vpx_ports/arm_cpudetect.h
@@ -14,7 +14,7 @@
 #include "vpx_config.h"
 #include "vpx_ports/arm.h"
 
-#if defined(_MSC_VER)
+#if defined(_WIN32)
 #undef WIN32_LEAN_AND_MEAN
 #define WIN32_LEAN_AND_MEAN
 #undef WIN32_EXTRA_LEAN
 
From 67bfb41ed8598edfb25bd6f245f9c39a68808548 Mon Sep 17 00:00:00 2001
From: Jerome Jiang
Date: Thu, 21 Sep 2023 15:26:01 -0400
Subject: [PATCH 817/926] Do not call WebM RC for new GOP at the end of seq

define_gf_group is called at the last frame of each GOP to get the GOP
size for the next one, which means it is also called at the last GOP of
the sequence; in that case the call into the WebM RC returns an error,
since the WebM RC does not have any more GOPs to return.

When gop_coding_frames from the encoder is 1, the encoder is running
out of firstpass stats, which indicates the end of the sequence.

Bug: b/299610956
Change-Id: I30e077a28fe41593ebabbc1dc0c2915a4bcbece3
---
 test/vp9_ext_ratectrl_test.cc |  4 ++--
 vp9/encoder/vp9_firstpass.c   | 17 +++++++++++------
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc
index e0107b2d26..b43430586d 100644
--- a/test/vp9_ext_ratectrl_test.cc
+++ b/test/vp9_ext_ratectrl_test.cc
@@ -338,7 +338,7 @@ vpx_rc_status_t rc_get_encodeframe_decision_gop_short(
     EXPECT_EQ(encode_frame_info->show_index, 3);
     EXPECT_EQ(encode_frame_info->gop_index, 0);
     EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeGolden);
-    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 2);
+    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
   }
 
   // When the model recommends an invalid q, valid range [0, 255],
@@ -398,7 +398,7 @@ vpx_rc_status_t rc_get_encodeframe_decision_gop_short_overlay(
     EXPECT_EQ(encode_frame_info->show_index, 3);
     EXPECT_EQ(encode_frame_info->gop_index, 0);
     EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay);
-    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 2);
+    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
   }
 
   // When the model recommends an invalid q, valid range [0, 255],
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 1e6f6f7b3b..3ec7ba5400 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -2504,7 +2504,7 @@ static int get_gop_coding_frame_num(
     int *use_alt_ref, const FRAME_INFO *frame_info,
     const TWO_PASS *const twopass, const RATE_CONTROL *rc,
     int gf_start_show_idx, const RANGE *active_gf_interval,
-    double gop_intra_factor, int lag_in_frames) {
+    double gop_intra_factor, int lag_in_frames, int *end_of_sequence) {
   const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info;
   double loop_decay_rate = 1.00;
   double mv_ratio_accumulator = 0.0;
@@ -2530,6 +2530,7 @@ static int get_gop_coding_frame_num(
     next_frame = fps_get_frame_stats(first_pass_info,
                                      gf_start_show_idx + gop_coding_frames);
     if (next_frame == NULL) {
+      *end_of_sequence = (gop_coding_frames ==
1); break; } @@ -2720,6 +2721,8 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { double gop_intra_factor; int gop_frames; RANGE active_gf_interval; + // Whether this is at the end of last GOP of this sequence. + int end_of_sequence = 0; // Reset the GF group data structures unless this is a key // frame in which case it will already have been done. @@ -2751,7 +2754,8 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { gop_coding_frames = get_gop_coding_frame_num( &use_alt_ref, frame_info, twopass, rc, gf_start_show_idx, - &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames); + &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames, + &end_of_sequence); use_alt_ref &= allow_alt_ref; #if CONFIG_RATE_CTRL // If the external gop_command is on, we will override the decisions @@ -2770,7 +2774,7 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { // will be overwritten. if (cpi->ext_ratectrl.ready && (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 && - cpi->ext_ratectrl.funcs.get_gop_decision != NULL) { + cpi->ext_ratectrl.funcs.get_gop_decision != NULL && !end_of_sequence) { vpx_codec_err_t codec_status; vpx_rc_gop_decision_t gop_decision; vpx_rc_gop_info_t gop_info; @@ -3805,6 +3809,7 @@ int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf, const int arf_active_or_kf = last_gop_use_alt_ref || first_is_key_frame; RANGE active_gf_interval; int arf_layers; + int end_of_sequence = 0; if (oxcf->use_simple_encode_api) { active_gf_interval = get_active_gf_inverval_range_simple( rc->min_gf_interval, arf_active_or_kf, rc->frames_to_key); @@ -3822,9 +3827,9 @@ int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf, gop_intra_factor = 1.0; } - frame_count = get_gop_coding_frame_num(use_alt_ref, frame_info, twopass, rc, - show_idx, &active_gf_interval, - gop_intra_factor, oxcf->lag_in_frames); + frame_count = get_gop_coding_frame_num( + use_alt_ref, frame_info, twopass, rc, show_idx, &active_gf_interval, + gop_intra_factor, oxcf->lag_in_frames, &end_of_sequence); *use_alt_ref &= allow_alt_ref; return frame_count; } From af6dedd715f4307669366944cca6e0417b290282 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 25 Sep 2023 18:53:41 -0700 Subject: [PATCH 818/926] encode_api_test: add ConfigResizeChangeThreadCount Update thread counts and resolution to ensure allocations are updated correctly. VP8 is disabled to avoid a crash. 
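The scenario exercised is roughly the following client call sequence
(sketch only, error checks omitted):

    vpx_codec_ctx_t ctx;
    vpx_codec_enc_cfg_t cfg;
    vpx_codec_enc_config_default(vpx_codec_vp9_cx(), &cfg, 0);
    cfg.g_w = 1024;
    cfg.g_h = 1024;
    cfg.g_threads = 4;  // start threaded so per-thread state is allocated
    vpx_codec_enc_init(&ctx, vpx_codec_vp9_cx(), &cfg, 0);
    // ... encode some frames ...
    cfg.g_w = 16;       // shrink the frame
    cfg.g_h = 720;
    cfg.g_threads = 1;  // and drop the thread count
    vpx_codec_enc_config_set(&ctx, &cfg);  // must resize allocations safely
    // ... encode again without crashing ...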
Bug: chromium:1486441 Change-Id: Ie89776d9818d27dc351eff298a44c699e850761b --- test/encode_api_test.cc | 50 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index c8bd7daa4a..a8a4df2ddf 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -308,7 +308,6 @@ TEST(EncodeAPI, SetRoi) { void InitCodec(const vpx_codec_iface_t &iface, int width, int height, vpx_codec_ctx_t *enc, vpx_codec_enc_cfg_t *cfg) { - ASSERT_EQ(vpx_codec_enc_config_default(&iface, cfg, 0), VPX_CODEC_OK); cfg->g_w = width; cfg->g_h = height; cfg->g_lag_in_frames = 0; @@ -346,6 +345,7 @@ TEST(EncodeAPI, ConfigChangeThreadCount) { vpx_codec_ctx_t ctx = {}; } enc; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); EXPECT_NO_FATAL_FAILURE( InitCodec(*iface, kWidth, kHeight, &enc.ctx, &cfg)); if (IsVP9(iface)) { @@ -364,6 +364,54 @@ TEST(EncodeAPI, ConfigChangeThreadCount) { } } +TEST(EncodeAPI, ConfigResizeChangeThreadCount) { + constexpr int kInitWidth = 1024; + constexpr int kInitHeight = 1024; + + for (const auto *iface : kCodecIfaces) { + SCOPED_TRACE(vpx_codec_iface_name(iface)); + if (!IsVP9(iface)) { + GTEST_SKIP() << "TODO(https://crbug.com/1486441) remove this condition " + "after VP8 is fixed."; + } + for (int i = 0; i < (IsVP9(iface) ? 2 : 1); ++i) { + vpx_codec_enc_cfg_t cfg = {}; + struct Encoder { + ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); } + vpx_codec_ctx_t ctx = {}; + } enc; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + // Start in threaded mode to ensure resolution and thread related + // allocations are updated correctly across changes in resolution and + // thread counts. See https://crbug.com/1486441. + cfg.g_threads = 4; + EXPECT_NO_FATAL_FAILURE( + InitCodec(*iface, kInitWidth, kInitHeight, &enc.ctx, &cfg)); + if (IsVP9(iface)) { + EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_TILE_COLUMNS, 6), + VPX_CODEC_OK); + EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_ROW_MT, i), + VPX_CODEC_OK); + } + + cfg.g_w = 1000; + cfg.g_h = 608; + EXPECT_EQ(vpx_codec_enc_config_set(&enc.ctx, &cfg), VPX_CODEC_OK) + << vpx_codec_error_detail(&enc.ctx); + + cfg.g_w = 16; + cfg.g_h = 720; + + for (const auto threads : { 1, 4, 8, 6, 2, 1 }) { + cfg.g_threads = threads; + EXPECT_NO_FATAL_FAILURE(EncodeWithConfig(cfg, &enc.ctx)) + << "iteration: " << i << " threads: " << threads; + } + } + } +} + #if CONFIG_VP9_ENCODER class EncodeApiGetTplStatsTest : public ::libvpx_test::EncoderTest, From 3fbd1dca6a4d2dad332a2110d646e4ffef36d590 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 25 Sep 2023 18:55:59 -0700 Subject: [PATCH 819/926] VP8: disallow thread count changes Currently allocations are done at encoder creation time. Going from threaded to non-threaded would cause a crash. Bug: chromium:1486441 Change-Id: Ie301c2a70847dff2f0daae408fbef1e4d42e73d4 --- test/encode_api_test.cc | 4 ---- vp8/encoder/onyx_if.c | 5 +++++ 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index a8a4df2ddf..f1c98b2c71 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -370,10 +370,6 @@ TEST(EncodeAPI, ConfigResizeChangeThreadCount) { for (const auto *iface : kCodecIfaces) { SCOPED_TRACE(vpx_codec_iface_name(iface)); - if (!IsVP9(iface)) { - GTEST_SKIP() << "TODO(https://crbug.com/1486441) remove this condition " - "after VP8 is fixed."; - } for (int i = 0; i < (IsVP9(iface) ? 
2 : 1); ++i) { vpx_codec_enc_cfg_t cfg = {}; struct Encoder { diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index c65afc643b..c5e9970c3c 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -1447,6 +1447,11 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { last_h = cpi->oxcf.Height; prev_number_of_layers = cpi->oxcf.number_of_layers; + if (cpi->initial_width) { + // TODO(https://crbug.com/1486441): Allow changing thread counts; the + // allocation is done once in vp8_create_compressor(). + oxcf->multi_threaded = cpi->oxcf.multi_threaded; + } cpi->oxcf = *oxcf; switch (cpi->oxcf.Mode) { From 61f868bcf78d167df63ac9e792871f75b6b6e1f0 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Thu, 21 Sep 2023 13:49:09 -0700 Subject: [PATCH 820/926] Modify vp9_c_vs_simd_enc_test script Applied James' change to the script. Enabled the test: vp9_c_vs_simd_enc_test BUG=webm:1800 Change-Id: If1e33e5ccb6ca9315004f3e3f5b910f8a8255317 --- test/examples.sh | 2 +- test/vp9_c_vs_simd_encode.sh | 74 ++++++++++++++++++------------------ 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/test/examples.sh b/test/examples.sh index c15a367f3c..629f04239c 100755 --- a/test/examples.sh +++ b/test/examples.sh @@ -15,7 +15,7 @@ example_tests=$(ls $(dirname $0)/*.sh) # List of script names to exclude. -exclude_list="examples stress tools_common vp9_c_vs_simd_encode" +exclude_list="examples stress tools_common" # Filter out the scripts in $exclude_list. for word in ${exclude_list}; do diff --git a/test/vp9_c_vs_simd_encode.sh b/test/vp9_c_vs_simd_encode.sh index e3d3624ed4..dfd4b93a1f 100755 --- a/test/vp9_c_vs_simd_encode.sh +++ b/test/vp9_c_vs_simd_encode.sh @@ -10,6 +10,8 @@ ## ## This script checks the bit exactness between C and SIMD ## implementations of VP9 encoder. +## +. $(dirname $0)/tools_common.sh TEST_BITRATES="1600 6400" PRESETS="good rt" @@ -17,7 +19,6 @@ TEST_CLIPS="yuv_raw_input y4m_360p_10bit_input yuv_480p_raw_input y4m_720p_input OUT_FILE_SUFFIX=".ivf" SCRIPT_DIR=$(dirname "$0") LIBVPX_SOURCE_DIR=$(cd ${SCRIPT_DIR}/..; pwd) -devnull='> /dev/null 2>&1' # Clips used in test. YUV_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/hantro_collage_w352h288.yuv" @@ -87,9 +88,11 @@ vp9_c_vs_simd_enc_verify_environment () { fi } -cleanup() { - rm -rf ${VPX_TEST_OUTPUT_DIR} -} +# This is not needed since tools_common.sh does the same cleanup. +# Keep the code here for our reference. +# cleanup() { +# rm -rf ${VPX_TEST_OUTPUT_DIR} +# } # Echo VPX_SIMD_CAPS_MASK for different instruction set architecture. avx512f() { @@ -143,10 +146,12 @@ y4m_360p_10bit_input() { has_x86_isa_extn() { instruction_set=$1 - grep -q "$instruction_set" /proc/cpuinfo - if [ $? -eq 1 ]; then + if ! grep -q "$instruction_set" /proc/cpuinfo; then + # This instruction_set is not supported. return 1 fi + # This instruction_set is supported. + return 0 } # Echo good encode params for use with VP9 encoder. @@ -238,9 +243,8 @@ compare_enc_output() { local clip=$3 local bitrate=$4 local preset=$5 - diff ${VPX_TEST_OUTPUT_DIR}/Out-generic-gnu-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \ - ${VPX_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} > /dev/null - if [ $? -eq 1 ]; then + if ! 
diff -q ${VPX_TEST_OUTPUT_DIR}/Out-generic-gnu-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \ + ${VPX_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX}; then elog "C vs ${target} encode mismatches for ${clip}, at ${bitrate} kbps, speed ${cpu}, ${preset} preset" return 1 fi @@ -281,8 +285,8 @@ vp9_enc_test() { ${devnull} if [ "${target}" != "generic-gnu" ]; then - compare_enc_output ${target} $cpu ${clip} $bitrate ${preset} - if [ $? -eq 1 ]; then + if ! compare_enc_output ${target} $cpu ${clip} $bitrate ${preset}; then + # Find the mismatch return 1 fi fi @@ -322,8 +326,7 @@ vp9_test_generic() { vp9_test_x86() { local arch=$1 - uname -m | grep -q "x86" - if [ $? -eq 1 ]; then + if ! uname -m | grep -q "x86"; then elog "Machine architecture is not x86 or x86_64" return 0 fi @@ -341,14 +344,14 @@ vp9_test_x86() { vp9_enc_build ${target} ${configure} local encoder="$(vp9_enc_tool_path "${target}")" for isa in $x86_isa_variants; do - has_x86_isa_extn $isa - if [ $? -eq 1 ]; then + # Note that if has_x86_isa_extn returns 1, it is false, and vice versa. + if ! has_x86_isa_extn $isa; then echo "${isa} is not supported in this machine" continue fi export VPX_SIMD_CAPS_MASK=$($isa) - vp9_enc_test $encoder ${target} - if [ $? -eq 1 ]; then + if ! vp9_enc_test $encoder ${target}; then + # Find the mismatch return 1 fi unset VPX_SIMD_CAPS_MASK @@ -363,8 +366,8 @@ vp9_test_arm() { vp9_enc_build ${target} "${configure}" local encoder="$(vp9_enc_tool_path "${target}")" - vp9_enc_test "qemu-aarch64 -L /usr/aarch64-linux-gnu ${encoder}" ${target} - if [ $? -eq 1 ]; then + if ! vp9_enc_test "qemu-aarch64 -L /usr/aarch64-linux-gnu ${encoder}" ${target}; then + # Find the mismatch return 1 fi } @@ -373,21 +376,23 @@ vp9_c_vs_simd_enc_test () { # Test Generic vp9_test_generic + # TODO(webm:1816): Enable x86 test once issue 1816 is fixed. + # Details: https://bugs.chromium.org/p/webm/issues/detail?id=1816 # Test x86 (32 bit) - echo "vp9 test for x86 (32 bit): Started." - vp9_test_x86 "x86" - if [ $? -eq 1 ]; then - echo "vp9 test for x86 (32 bit): Done, test failed." - else - echo "vp9 test for x86 (32 bit): Done, all tests passed." - fi + # echo "vp9 test for x86 (32 bit): Started." + # if ! vp9_test_x86 "x86"; then + # echo "vp9 test for x86 (32 bit): Done, test failed." + # return 1 + # else + # echo "vp9 test for x86 (32 bit): Done, all tests passed." + # fi # Test x86_64 (64 bit) if [ "$(eval uname -m)" = "x86_64" ]; then echo "vp9 test for x86_64 (64 bit): Started." - vp9_test_x86 "x86_64" - if [ $? -eq 1 ]; then + if ! vp9_test_x86 "x86_64"; then echo "vp9 test for x86_64 (64 bit): Done, test failed." + return 1 else echo "vp9 test for x86_64 (64 bit): Done, all tests passed." fi @@ -395,20 +400,15 @@ vp9_c_vs_simd_enc_test () { # Test ARM echo "vp9_test_arm: Started." - vp9_test_arm - if [ $? -eq 1 ]; then + if ! vp9_test_arm; then echo "vp9 test for arm: Done, test failed." + return 1 else echo "vp9 test for arm: Done, all tests passed." fi } # Setup a trap function to clean up build, and output files after tests complete. -trap cleanup EXIT +# trap cleanup EXIT -vp9_c_vs_simd_enc_verify_environment -if [ $? -eq 1 ]; then - echo "Environment check failed." 
- exit 1 -fi -vp9_c_vs_simd_enc_test +run_tests vp9_c_vs_simd_enc_verify_environment vp9_c_vs_simd_enc_test From 452199ca85a3c968d31115345109c6d00d2a485b Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 25 Sep 2023 18:53:41 -0700 Subject: [PATCH 821/926] encode_api_test: add ConfigResizeChangeThreadCount Update thread counts and resolution to ensure allocations are updated correctly. VP8 is disabled to avoid a crash. Bug: chromium:1486441 Change-Id: Ie89776d9818d27dc351eff298a44c699e850761b (cherry picked from commit af6dedd715f4307669366944cca6e0417b290282) --- test/encode_api_test.cc | 50 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index ecdf928343..02aedc0575 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -304,7 +304,6 @@ TEST(EncodeAPI, SetRoi) { void InitCodec(const vpx_codec_iface_t &iface, int width, int height, vpx_codec_ctx_t *enc, vpx_codec_enc_cfg_t *cfg) { - ASSERT_EQ(vpx_codec_enc_config_default(&iface, cfg, 0), VPX_CODEC_OK); cfg->g_w = width; cfg->g_h = height; cfg->g_lag_in_frames = 0; @@ -342,6 +341,7 @@ TEST(EncodeAPI, ConfigChangeThreadCount) { vpx_codec_ctx_t ctx = {}; } enc; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); EXPECT_NO_FATAL_FAILURE( InitCodec(*iface, kWidth, kHeight, &enc.ctx, &cfg)); if (IsVP9(iface)) { @@ -360,4 +360,52 @@ TEST(EncodeAPI, ConfigChangeThreadCount) { } } +TEST(EncodeAPI, ConfigResizeChangeThreadCount) { + constexpr int kInitWidth = 1024; + constexpr int kInitHeight = 1024; + + for (const auto *iface : kCodecIfaces) { + SCOPED_TRACE(vpx_codec_iface_name(iface)); + if (!IsVP9(iface)) { + GTEST_SKIP() << "TODO(https://crbug.com/1486441) remove this condition " + "after VP8 is fixed."; + } + for (int i = 0; i < (IsVP9(iface) ? 2 : 1); ++i) { + vpx_codec_enc_cfg_t cfg = {}; + struct Encoder { + ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); } + vpx_codec_ctx_t ctx = {}; + } enc; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + // Start in threaded mode to ensure resolution and thread related + // allocations are updated correctly across changes in resolution and + // thread counts. See https://crbug.com/1486441. + cfg.g_threads = 4; + EXPECT_NO_FATAL_FAILURE( + InitCodec(*iface, kInitWidth, kInitHeight, &enc.ctx, &cfg)); + if (IsVP9(iface)) { + EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_TILE_COLUMNS, 6), + VPX_CODEC_OK); + EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_ROW_MT, i), + VPX_CODEC_OK); + } + + cfg.g_w = 1000; + cfg.g_h = 608; + EXPECT_EQ(vpx_codec_enc_config_set(&enc.ctx, &cfg), VPX_CODEC_OK) + << vpx_codec_error_detail(&enc.ctx); + + cfg.g_w = 16; + cfg.g_h = 720; + + for (const auto threads : { 1, 4, 8, 6, 2, 1 }) { + cfg.g_threads = threads; + EXPECT_NO_FATAL_FAILURE(EncodeWithConfig(cfg, &enc.ctx)) + << "iteration: " << i << " threads: " << threads; + } + } + } +} + } // namespace From baed1218776fba096c05c1c683564ba4523d17e5 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 25 Sep 2023 18:55:59 -0700 Subject: [PATCH 822/926] VP8: disallow thread count changes Currently allocations are done at encoder creation time. Going from threaded to non-threaded would cause a crash. 
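For context, a minimal sketch of the call sequence this guards against
(illustrative only; error handling omitted, names not from the patch):

    vpx_codec_ctx_t ctx;
    vpx_codec_enc_cfg_t cfg;
    vpx_codec_enc_config_default(vpx_codec_vp8_cx(), &cfg, 0);
    cfg.g_threads = 4;  /* per-thread state is allocated once, at init */
    vpx_codec_enc_init(&ctx, vpx_codec_vp8_cx(), &cfg, 0);
    cfg.g_threads = 1;  /* threaded -> non-threaded */
    vpx_codec_enc_config_set(&ctx, &cfg);  /* previously left stale thread
                                              state behind, risking a crash
                                              on the next encode call */
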
Bug: chromium:1486441 Change-Id: Ie301c2a70847dff2f0daae408fbef1e4d42e73d4 (cherry picked from commit 3fbd1dca6a4d2dad332a2110d646e4ffef36d590) --- test/encode_api_test.cc | 4 ---- vp8/encoder/onyx_if.c | 5 +++++ 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 02aedc0575..e0e793b156 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -366,10 +366,6 @@ TEST(EncodeAPI, ConfigResizeChangeThreadCount) { for (const auto *iface : kCodecIfaces) { SCOPED_TRACE(vpx_codec_iface_name(iface)); - if (!IsVP9(iface)) { - GTEST_SKIP() << "TODO(https://crbug.com/1486441) remove this condition " - "after VP8 is fixed."; - } for (int i = 0; i < (IsVP9(iface) ? 2 : 1); ++i) { vpx_codec_enc_cfg_t cfg = {}; struct Encoder { diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 4bbeadef01..148a16cc49 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -1443,6 +1443,11 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { last_h = cpi->oxcf.Height; prev_number_of_layers = cpi->oxcf.number_of_layers; + if (cpi->initial_width) { + // TODO(https://crbug.com/1486441): Allow changing thread counts; the + // allocation is done once in vp8_create_compressor(). + oxcf->multi_threaded = cpi->oxcf.multi_threaded; + } cpi->oxcf = *oxcf; switch (cpi->oxcf.Mode) { From 263682c9a29395055f3b3afe2d97be1828a6223f Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 30 Jun 2022 13:48:56 -0400 Subject: [PATCH 823/926] Fix bug with smaller width bigger size Fixed previous patch that clusterfuzz failed on. Bug: webm:1642 Change-Id: If0e08e72abd2e042efe4dcfac21e4cc51afdfdb9 --- test/resize_test.cc | 11 +++-------- vp9/common/vp9_alloccommon.c | 13 ++++++------- vp9/encoder/vp9_encoder.c | 27 +++++++++++++++++++++++++-- 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/test/resize_test.cc b/test/resize_test.cc index fd1c2a92de..20ad2229b4 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -102,11 +102,8 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, if (frame < 30) { return; } - if (frame < 100) { - *w = initial_w * 7 / 10; - *h = initial_h * 16 / 10; - return; - } + *w = initial_w * 7 / 10; + *h = initial_h * 16 / 10; return; } if (frame < 10) { @@ -559,9 +556,7 @@ TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) { } } -// TODO(https://crbug.com/webm/1642): This causes a segfault in -// init_encode_frame_mb_context(). -TEST_P(ResizeRealtimeTest, DISABLED_TestExternalResizeSmallerWidthBiggerSize) { +TEST_P(ResizeRealtimeTest, TestExternalResizeSmallerWidthBiggerSize) { ResizingVideoSource video; video.flag_codec_ = true; video.smaller_width_larger_size_ = true; diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index e53883f621..9e73e40ea0 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -135,13 +135,6 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) { cm->free_mi(cm); if (cm->alloc_mi(cm, new_mi_size)) goto fail; } - - if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) { - // Create the segmentation map structure and set to 0. 
- free_seg_map(cm); - if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) goto fail; - } - if (cm->above_context_alloc_cols < cm->mi_cols) { vpx_free(cm->above_context); cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc( @@ -156,6 +149,12 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) { cm->above_context_alloc_cols = cm->mi_cols; } + if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) { + // Create the segmentation map structure and set to 0. + free_seg_map(cm); + if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) goto fail; + } + if (vp9_alloc_loop_filter(cm)) goto fail; return 0; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 69a4e3c314..e3ba294c32 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2047,6 +2047,17 @@ static void alloc_copy_partition_data(VP9_COMP *cpi) { } } +static void free_copy_partition_data(VP9_COMP *cpi) { + vpx_free(cpi->prev_partition); + cpi->prev_partition = NULL; + vpx_free(cpi->prev_segment_id); + cpi->prev_segment_id = NULL; + vpx_free(cpi->prev_variance_low); + cpi->prev_variance_low = NULL; + vpx_free(cpi->copied_frame_cnt); + cpi->copied_frame_cnt = NULL; +} + void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; @@ -2126,6 +2137,8 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows); if (cm->mi_alloc_size < new_mi_size) { vp9_free_context_buffers(cm); + vp9_free_pc_tree(&cpi->td); + vpx_free(cpi->mbmi_ext_base); alloc_compressor_data(cpi); realloc_segmentation_maps(cpi); cpi->initial_width = cpi->initial_height = 0; @@ -2144,8 +2157,18 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { update_frame_size(cpi); if (last_w != cpi->oxcf.width || last_h != cpi->oxcf.height) { - memset(cpi->consec_zero_mv, 0, - cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv)); + vpx_free(cpi->consec_zero_mv); + CHECK_MEM_ERROR( + &cm->error, cpi->consec_zero_mv, + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->consec_zero_mv))); + + vpx_free(cpi->skin_map); + CHECK_MEM_ERROR( + &cm->error, cpi->skin_map, + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0]))); + + free_copy_partition_data(cpi); + alloc_copy_partition_data(cpi); if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_reset_resize(cpi); rc->rc_1_frame = 0; From 58a854fb271f99a15f3d2687587f74a6d12c8512 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Wed, 27 Sep 2023 16:12:12 -0700 Subject: [PATCH 824/926] Skip the y4m_360p_10bit_input clip for armv8 It is a known mismatch. Bug: webm:1819 Change-Id: Ieb707a6f53ddf6c7b0d1202c6520599d3e45da76 --- test/vp9_c_vs_simd_encode.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/vp9_c_vs_simd_encode.sh b/test/vp9_c_vs_simd_encode.sh index dfd4b93a1f..22503545a1 100755 --- a/test/vp9_c_vs_simd_encode.sh +++ b/test/vp9_c_vs_simd_encode.sh @@ -277,6 +277,10 @@ vp9_enc_test() { for cpu in $(seq 0 $max_cpu_used); do for clip in ${TEST_CLIPS}; do + # TODO(webm:1819): Delete this if statement once issue 1819 is fixed. 
+ if [ "${clip}" = "y4m_360p_10bit_input" ] && [ "${target}" = "armv8-linux-gcc" ]; then + continue + fi for bitrate in ${TEST_BITRATES}; do eval "${encoder}" $($clip) $($test_params) \ "--limit=${VP9_ENCODE_C_VS_SIMD_TEST_FRAME_LIMIT}" \ From e462a0de03fc4daca2ba32a5ba29c4182263e43c Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 28 Sep 2023 08:58:36 -0700 Subject: [PATCH 825/926] vp9_c_vs_simd_encode: Restore cwd at end of test Restore the original current directory at the end of vp9_c_vs_simd_enc_test(). Bug: webm:1800 Change-Id: Iad64848a231e3c900149cc2b248055b02dda80a6 --- test/vp9_c_vs_simd_encode.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/test/vp9_c_vs_simd_encode.sh b/test/vp9_c_vs_simd_encode.sh index 22503545a1..82939470f6 100755 --- a/test/vp9_c_vs_simd_encode.sh +++ b/test/vp9_c_vs_simd_encode.sh @@ -65,7 +65,7 @@ vp9_enc_tool_path() { } # Environment check: Make sure input and source directories are available. -vp9_c_vs_simd_enc_verify_environment () { +vp9_c_vs_simd_enc_verify_environment() { if [ ! -e "${YUV_RAW_INPUT}" ]; then elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH." return 1 @@ -376,7 +376,9 @@ vp9_test_arm() { fi } -vp9_c_vs_simd_enc_test () { +vp9_c_vs_simd_enc_test() { + local save_dir=$(pwd) + # Test Generic vp9_test_generic @@ -410,6 +412,8 @@ vp9_c_vs_simd_enc_test () { else echo "vp9 test for arm: Done, all tests passed." fi + + cd ${save_dir} } # Setup a trap function to clean up build, and output files after tests complete. From fd2052d4c91ca9451563482817bbeab1da3f76e7 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Thu, 28 Sep 2023 17:47:54 -0700 Subject: [PATCH 826/926] Properly determine end of sequence When the next frame is null and the current frame is an overlay frame, which is equivalent to there is an active alt ref frame, we call this an end of sequence. Change-Id: I49c2cf7a001df98aff8b62ba034317e408274bd4 --- test/vp9_ext_ratectrl_test.cc | 2 +- vp9/encoder/vp9_firstpass.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index b43430586d..33fa05c65c 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -338,7 +338,7 @@ vpx_rc_status_t rc_get_encodeframe_decision_gop_short( EXPECT_EQ(encode_frame_info->show_index, 3); EXPECT_EQ(encode_frame_info->gop_index, 0); EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeGolden); - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 2); } // When the model recommends an invalid q, valid range [0, 255], diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 3ec7ba5400..a9cdf5353f 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -2530,7 +2530,7 @@ static int get_gop_coding_frame_num( next_frame = fps_get_frame_stats(first_pass_info, gf_start_show_idx + gop_coding_frames); if (next_frame == NULL) { - *end_of_sequence = (gop_coding_frames == 1); + *end_of_sequence = gop_coding_frames == 1 && rc->source_alt_ref_active; break; } From 4697b110ac3c4da19bfbd742432548820f0f38de Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Fri, 29 Sep 2023 10:35:20 -0700 Subject: [PATCH 827/926] Update 32-bit version of horizontal_add_uint32x2 The code was originally added in https://aomedia-review.googlesource.com/c/aom/+/162267 by Jonathan Wright. 
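For reference, the before/after of the AArch32 path with the rationale as
comments (a sketch; the diff below is authoritative):

    /* before: two moves from the vector register to general registers,
     * followed by a scalar add */
    return vget_lane_u32(a, 0) + vget_lane_u32(a, 1);

    /* after: one pairwise add-long keeps the sum in the SIMD register
     * file; widening to 64 bits does not change the truncated 32-bit
     * result */
    const uint64x1_t b = vpaddl_u32(a);
    return vget_lane_u32(vreinterpret_u32_u64(b), 0);
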
Change-Id: Iafd9e31d82abe22387e0d68f02c7ab81e85367ed --- vpx_dsp/arm/sum_neon.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index 48a2fc05ca..75c170df60 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -154,7 +154,8 @@ static INLINE uint32_t horizontal_add_uint32x2(const uint32x2_t a) { #if VPX_ARCH_AARCH64 return vaddv_u32(a); #else - return vget_lane_u32(a, 0) + vget_lane_u32(a, 1); + const uint64x1_t b = vpaddl_u32(a); + return vget_lane_u32(vreinterpret_u32_u64(b), 0); #endif } From a53700e4a3820ad929e2b8c79d10e46abef62575 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 26 Jun 2023 19:06:51 -0700 Subject: [PATCH 828/926] vp9_alloccommon: clear allocation sizes on free This fixes reallocations (and avoids potential crashes) if any allocations fails and the application continues to call vpx_codec_decode(). Found with vpx_dec_fuzzer_vp9 & Nallocfuzz (https://github.com/catenacyber/nallocfuzz). Bug: webm:1807 Change-Id: If5dc96b73c02efc94ec84c25eb50d10ad6b645a6 (cherry picked from commit 02ab555e992c191e5c509ed87b3cc48ed915b447) --- vp9/common/vp9_alloccommon.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index faad657a08..e53883f621 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -65,6 +65,7 @@ static void free_seg_map(VP9_COMMON *cm) { vpx_free(cm->seg_map_array[i]); cm->seg_map_array[i] = NULL; } + cm->seg_map_alloc_size = 0; cm->current_frame_seg_map = NULL; cm->last_frame_seg_map = NULL; @@ -108,6 +109,7 @@ void vp9_free_context_buffers(VP9_COMMON *cm) { cm->above_context = NULL; vpx_free(cm->above_seg_context); cm->above_seg_context = NULL; + cm->above_context_alloc_cols = 0; vpx_free(cm->lf.lfm); cm->lf.lfm = NULL; } From df9fd9d5b7325060b2b921558a1eb20ca7880937 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 30 Jun 2022 13:48:56 -0400 Subject: [PATCH 829/926] Fix bug with smaller width bigger size Fixed previous patch that clusterfuzz failed on. Local fuzzing passing overnight. Bug: webm:1642 Change-Id: If0e08e72abd2e042efe4dcfac21e4cc51afdfdb9 (cherry picked from commit 263682c9a29395055f3b3afe2d97be1828a6223f) --- test/resize_test.cc | 11 +++-------- vp9/common/vp9_alloccommon.c | 13 ++++++------- vp9/encoder/vp9_encoder.c | 27 +++++++++++++++++++++++++-- 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/test/resize_test.cc b/test/resize_test.cc index 715bb9d70f..d9420a4548 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -102,11 +102,8 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, if (frame < 30) { return; } - if (frame < 100) { - *w = initial_w * 7 / 10; - *h = initial_h * 16 / 10; - return; - } + *w = initial_w * 7 / 10; + *h = initial_h * 16 / 10; return; } if (frame < 10) { @@ -559,9 +556,7 @@ TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) { } } -// TODO(https://crbug.com/webm/1642): This causes a segfault in -// init_encode_frame_mb_context(). 
-TEST_P(ResizeRealtimeTest, DISABLED_TestExternalResizeSmallerWidthBiggerSize) { +TEST_P(ResizeRealtimeTest, TestExternalResizeSmallerWidthBiggerSize) { ResizingVideoSource video; video.flag_codec_ = true; video.smaller_width_larger_size_ = true; diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index e53883f621..9e73e40ea0 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -135,13 +135,6 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) { cm->free_mi(cm); if (cm->alloc_mi(cm, new_mi_size)) goto fail; } - - if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) { - // Create the segmentation map structure and set to 0. - free_seg_map(cm); - if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) goto fail; - } - if (cm->above_context_alloc_cols < cm->mi_cols) { vpx_free(cm->above_context); cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc( @@ -156,6 +149,12 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) { cm->above_context_alloc_cols = cm->mi_cols; } + if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) { + // Create the segmentation map structure and set to 0. + free_seg_map(cm); + if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) goto fail; + } + if (vp9_alloc_loop_filter(cm)) goto fail; return 0; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index b66fdc0bca..e385077545 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1973,6 +1973,17 @@ static void alloc_copy_partition_data(VP9_COMP *cpi) { } } +static void free_copy_partition_data(VP9_COMP *cpi) { + vpx_free(cpi->prev_partition); + cpi->prev_partition = NULL; + vpx_free(cpi->prev_segment_id); + cpi->prev_segment_id = NULL; + vpx_free(cpi->prev_variance_low); + cpi->prev_variance_low = NULL; + vpx_free(cpi->copied_frame_cnt); + cpi->copied_frame_cnt = NULL; +} + void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; @@ -2052,6 +2063,8 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows); if (cm->mi_alloc_size < new_mi_size) { vp9_free_context_buffers(cm); + vp9_free_pc_tree(&cpi->td); + vpx_free(cpi->mbmi_ext_base); alloc_compressor_data(cpi); realloc_segmentation_maps(cpi); cpi->initial_width = cpi->initial_height = 0; @@ -2070,8 +2083,18 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { update_frame_size(cpi); if (last_w != cpi->oxcf.width || last_h != cpi->oxcf.height) { - memset(cpi->consec_zero_mv, 0, - cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv)); + vpx_free(cpi->consec_zero_mv); + CHECK_MEM_ERROR( + cm, cpi->consec_zero_mv, + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->consec_zero_mv))); + + vpx_free(cpi->skin_map); + CHECK_MEM_ERROR( + cm, cpi->skin_map, + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0]))); + + free_copy_partition_data(cpi); + alloc_copy_partition_data(cpi); if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_reset_resize(cpi); rc->rc_1_frame = 0; From 490a7067e806092cb85a7f0b9534700c33b95199 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 29 Sep 2023 11:21:35 -0700 Subject: [PATCH 830/926] update version to 1.13.1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SO_VERSION_MAJOR = 8 SO_VERSION_MINOR = 0 SO_VERSION_PATCH = 1 The increase of the patch number corresponds to the 
revision number in the libtool text. 3. If the library source code has changed at all since the last update, then increment revision (‘c:r:a’ becomes ‘c:r+1:a’). Bug: webm:1818 Change-Id: Ia114368e9fd7a908e7fcf6e4d3142f142770e3f4 --- README | 2 +- libs.mk | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README b/README index e360df05f6..444289a237 100644 --- a/README +++ b/README @@ -1,4 +1,4 @@ -v1.13.0 Ugly Duckling +v1.13.1 Ugly Duckling Welcome to the WebM VP8/VP9 Codec SDK! diff --git a/libs.mk b/libs.mk index 1f7f03aa38..13d878901b 100644 --- a/libs.mk +++ b/libs.mk @@ -314,7 +314,7 @@ $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS) # (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1 SO_VERSION_MAJOR := 8 SO_VERSION_MINOR := 0 -SO_VERSION_PATCH := 0 +SO_VERSION_PATCH := 1 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS)) LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib SHARED_LIB_SUF := .dylib From 10b9492dcf05b652e2e4b370e205bd605d421972 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 29 Sep 2023 11:32:27 -0700 Subject: [PATCH 831/926] update CHANGELOG Bug: webm:1818 Change-Id: Ic0a943b5d1c69a3621ad3f91012fb5308a0c11f1 --- CHANGELOG | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index 3fb2d19bbe..f932f6bf4d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,13 @@ +2023-09-29 v1.13.1 "Ugly Duckling" + This release contains two security related fixes. One each for VP8 and VP9. + + - Upgrading: + This release is ABI compatible with the previous release. + + - Bug fixes: + https://crbug.com/1486441 (CVE-2023-5217) + Fix to a crash related to VP9 encoding (#1642) + 2023-01-31 v1.13.0 "Ugly Duckling" This release includes more Neon and AVX2 optimizations, adds a new codec control to set per frame QP, upgrades GoogleTest to v1.12.1, and includes From 7f568f98762547a22d4e82d0ddf64986c6407c34 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Fri, 22 Sep 2023 15:50:17 -0700 Subject: [PATCH 832/926] Fix a potential resource leak and add alloc checks Backport fixes from libaom: https://aomedia-review.googlesource.com/c/aom/+/109061 https://aomedia-review.googlesource.com/c/aom/+/158102 Change-Id: Ia9d42d474be2898f9ae2fdc28606273377da3e90 --- examples/resize_util.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/examples/resize_util.c b/examples/resize_util.c index 7e529b2e20..5fb63e1660 100644 --- a/examples/resize_util.c +++ b/examples/resize_util.c @@ -52,6 +52,7 @@ int main(int argc, char *argv[]) { uint8_t *inbuf_v, *outbuf_v; int f, frames; int width, height, target_width, target_height; + int failed = 0; exec_name = argv[0]; @@ -82,6 +83,7 @@ int main(int argc, char *argv[]) { } fpout = fopen(fout, "wb"); if (fpout == NULL) { + fclose(fpin); printf("Can't open file %s to write\n", fout); usage(); return 1; @@ -100,6 +102,11 @@ int main(int argc, char *argv[]) { inbuf = (uint8_t *)malloc(width * height * 3 / 2); outbuf = (uint8_t *)malloc(target_width * target_height * 3 / 2); + if (!(inbuf && outbuf)) { + printf("Failed to allocate buffers.\n"); + failed = 1; + goto Error; + } inbuf_u = inbuf + width * height; inbuf_v = inbuf_u + width * height / 4; outbuf_u = outbuf + target_width * target_height; @@ -114,10 +121,11 @@ int main(int argc, char *argv[]) { f++; } printf("%d frames processed\n", f); +Error: fclose(fpin); fclose(fpout); free(inbuf); free(outbuf); - return 0; + return failed; } From ec9e1ed41f49c4ca2d8b180ebdcdb249a479acdc Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 29 Sep 2023 
15:16:46 -0700 Subject: [PATCH 833/926] vp9_encoder: normalize sizeof() calls use sizeof(var) instead of sizeof(type) and sizeof(*var) instead of sizeof(var[0]) for consistency in some places. Change-Id: Ibd9a783cfef5ce1d06131df3831a4093821a502f --- vp9/encoder/vp9_encoder.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index e3ba294c32..993e6310eb 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -881,10 +881,11 @@ static int vp9_enc_alloc_mi(VP9_COMMON *cm, int mi_size) { if (!cm->prev_mip) return 1; cm->mi_alloc_size = mi_size; - cm->mi_grid_base = (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO *)); + cm->mi_grid_base = + (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->mi_grid_base)); if (!cm->mi_grid_base) return 1; cm->prev_mi_grid_base = - (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO *)); + (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->prev_mi_grid_base)); if (!cm->prev_mi_grid_base) return 1; return 0; @@ -2165,7 +2166,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { vpx_free(cpi->skin_map); CHECK_MEM_ERROR( &cm->error, cpi->skin_map, - vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0]))); + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->skin_map))); free_copy_partition_data(cpi); alloc_copy_partition_data(cpi); @@ -2379,7 +2380,7 @@ void vp9_update_compressor_with_img_fmt(VP9_COMP *cpi, vpx_img_fmt_t img_fmt) { VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, BufferPool *const pool) { unsigned int i; - VP9_COMP *volatile const cpi = vpx_memalign(32, sizeof(VP9_COMP)); + VP9_COMP *volatile const cpi = vpx_memalign(32, sizeof(*cpi)); VP9_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL; if (!cm) return NULL; @@ -2428,7 +2429,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, CHECK_MEM_ERROR( &cm->error, cpi->skin_map, - vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0]))); + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->skin_map))); #if !CONFIG_REALTIME_ONLY CHECK_MEM_ERROR(&cm->error, cpi->alt_ref_aq, vp9_alt_ref_aq_create()); From 6512f994da13e2f27e6a7bd449efee0a374b55b7 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Fri, 29 Sep 2023 16:58:26 -0700 Subject: [PATCH 834/926] Disable vpx_highbd_8_mse16x16_neon_dotprod, etc. These functions assume the uint16_t samples are <= 255 (bit depth 8), but vpx_highbd_8_mse16x16() is called for any bit depth, not just 8. A better fix is to port the libaom CL https://aomedia-review.googlesource.com/c/aom/+/175063 to libvpx, but that requires porting aom_sse() and aom_highbd_sse() to libvpx, which is quite involved. So disable vpx_highbd_8_mse16x16_neon_dotprod, etc. first. Bug: webm:1819 Change-Id: If495a5dedc58d9981317b9993c9fbb81ff3ab50c --- test/variance_test.cc | 4 ++++ test/vp9_c_vs_simd_encode.sh | 4 ---- vpx_dsp/vpx_dsp.mk | 1 - vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 ++++---- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index e231df0c6b..b8320e9ceb 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1555,6 +1555,9 @@ INSTANTIATE_TEST_SUITE_P( MseParams(3, 4, &vpx_highbd_8_mse8x16_neon, VPX_BITS_8), MseParams(3, 3, &vpx_highbd_8_mse8x8_neon, VPX_BITS_8))); +// TODO(webm:1819): Re-enable when vpx_highbd_8_mse16x16_neon_dotprod, etc. can +// be used again. 
+#if 0 #if HAVE_NEON_DOTPROD INSTANTIATE_TEST_SUITE_P( NEON_DOTPROD, VpxHBDMseTest, @@ -1564,6 +1567,7 @@ INSTANTIATE_TEST_SUITE_P( MseParams(3, 4, &vpx_highbd_8_mse8x16_neon_dotprod, VPX_BITS_8), MseParams(3, 3, &vpx_highbd_8_mse8x8_neon_dotprod, VPX_BITS_8))); #endif // HAVE_NEON_DOTPROD +#endif // 0 INSTANTIATE_TEST_SUITE_P( NEON, VpxHBDVarianceTest, diff --git a/test/vp9_c_vs_simd_encode.sh b/test/vp9_c_vs_simd_encode.sh index 82939470f6..42e5f3b589 100755 --- a/test/vp9_c_vs_simd_encode.sh +++ b/test/vp9_c_vs_simd_encode.sh @@ -277,10 +277,6 @@ vp9_enc_test() { for cpu in $(seq 0 $max_cpu_used); do for clip in ${TEST_CLIPS}; do - # TODO(webm:1819): Delete this if statement once issue 1819 is fixed. - if [ "${clip}" = "y4m_360p_10bit_input" ] && [ "${target}" = "armv8-linux-gcc" ]; then - continue - fi for bitrate in ${TEST_BITRATES}; do eval "${encoder}" $($clip) $($test_params) \ "--limit=${VP9_ENCODE_C_VS_SIMD_TEST_FRAME_LIMIT}" \ diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 5343088d1b..84fd969daa 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -448,7 +448,6 @@ DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_pred_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c -DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/highbd_variance_neon_dotprod.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_subpel_variance_neon.c endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 1012df11ec..c9cdc285f2 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1534,14 +1534,14 @@ () specialize qw/vpx_highbd_12_get8x8var sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_mse16x16 sse2 neon neon_dotprod/; + specialize qw/vpx_highbd_8_mse16x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_mse16x8 neon neon_dotprod/; + specialize qw/vpx_highbd_8_mse16x8 neon/; add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_mse8x16 neon neon_dotprod/; + specialize qw/vpx_highbd_8_mse8x16 neon/; add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_mse8x8 sse2 neon neon_dotprod/; + specialize qw/vpx_highbd_8_mse8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_mse16x16 sse2 neon/; From b863d8bb47c2fd6bb3ad9d502532d34a13575b23 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 28 Sep 2023 16:35:54 -0700 Subject: [PATCH 835/926] Have vp9_enc_build and vp9_enc_test restore cwd Use $PWD to get the current directory. Quote directory pathnames. Suggested by James Zern. 
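The pattern being applied, in miniature (illustrative; the local variable
names are assumptions, not quoted from the patch):

    local save_dir="$PWD"   # $PWD is a shell variable; $(pwd) would fork
    cd "$tmp_build_dir"     # quoted so paths with spaces survive
    ...                     # build or encode
    cd "$save_dir"          # restored on every return path
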
Bug: webm:1800 Change-Id: I51e922b24da0e89d936370f858eab55d193ebdcb --- test/vp9_c_vs_simd_encode.sh | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/test/vp9_c_vs_simd_encode.sh b/test/vp9_c_vs_simd_encode.sh index 82939470f6..5bb343bfaa 100755 --- a/test/vp9_c_vs_simd_encode.sh +++ b/test/vp9_c_vs_simd_encode.sh @@ -18,7 +18,7 @@ PRESETS="good rt" TEST_CLIPS="yuv_raw_input y4m_360p_10bit_input yuv_480p_raw_input y4m_720p_input" OUT_FILE_SUFFIX=".ivf" SCRIPT_DIR=$(dirname "$0") -LIBVPX_SOURCE_DIR=$(cd ${SCRIPT_DIR}/..; pwd) +LIBVPX_SOURCE_DIR=$(cd "${SCRIPT_DIR}/.."; pwd) # Clips used in test. YUV_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/hantro_collage_w352h288.yuv" @@ -214,14 +214,15 @@ vp9_encode_rt_params() { --error-resilient=0" } -# Configures for the given target in VPX_TEST_OUTPUT_DIR/build_target_${target} -# directory. +# Configures for the given target in the +# ${VPX_TEST_OUTPUT_DIR}/build_target_${target} directory. vp9_enc_build() { local target=$1 local configure="$2" local tmp_build_dir=${VPX_TEST_OUTPUT_DIR}/build_target_${target} - mkdir -p $tmp_build_dir - cd $tmp_build_dir + mkdir -p "$tmp_build_dir" + local save_dir="$PWD" + cd "$tmp_build_dir" echo "Building target: ${target}" local config_args="--disable-install-docs \ @@ -235,6 +236,7 @@ vp9_enc_build() { eval "$configure" --target="${target}" "${config_args}" ${devnull} eval make -j$(nproc) ${devnull} echo "Done building target: ${target}" + cd "${save_dir}" } compare_enc_output() { @@ -258,6 +260,9 @@ vp9_enc_test() { return 1 fi + local tmp_build_dir=${VPX_TEST_OUTPUT_DIR}/build_target_${target} + local save_dir="$PWD" + cd "$tmp_build_dir" for preset in ${PRESETS}; do if [ "${preset}" = "good" ]; then local max_cpu_used=5 @@ -267,6 +272,7 @@ vp9_enc_test() { local test_params=vp9_encode_rt_params else elog "Invalid preset" + cd "${save_dir}" return 1 fi @@ -291,6 +297,7 @@ vp9_enc_test() { if [ "${target}" != "generic-gnu" ]; then if ! compare_enc_output ${target} $cpu ${clip} $bitrate ${preset}; then # Find the mismatch + cd "${save_dir}" return 1 fi fi @@ -298,6 +305,7 @@ vp9_enc_test() { done done done + cd "${save_dir}" } vp9_test_generic() { @@ -377,8 +385,6 @@ vp9_test_arm() { } vp9_c_vs_simd_enc_test() { - local save_dir=$(pwd) - # Test Generic vp9_test_generic @@ -412,8 +418,6 @@ vp9_c_vs_simd_enc_test() { else echo "vp9 test for arm: Done, all tests passed." fi - - cd ${save_dir} } # Setup a trap function to clean up build, and output files after tests complete. From b729684b059d48b6bac0750045acbe6a4a9e9a6b Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Sat, 30 Sep 2023 19:45:35 -0700 Subject: [PATCH 836/926] Use big cfg.g_w in ConfigResizeChangeThreadCount vp8cx_create_encoder_threads() caps the thread count at (cm->mb_cols / cpi->mt_sync_range) - 1. If cfg.g_w is 16, cm->mb_cols is only 1 (see vp8_alloc_frame_buffers: mb_cols = width >> 4), so we won't be using multiple threads. To reproduce bug chromium:1486441, the test just needs to increase cfg.g_h sufficiently. 
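Working through the cap quoted above, assuming the usual mt_sync_range
selection (1 for widths under 640, 4 for widths under 1280):

    g_w = 16:   mb_cols = 16 >> 4   = 1;  threads <= (1 / 1) - 1  = 0
                (never threaded)
    g_w = 1000: mb_cols = 1000 >> 4 = 62; threads <= (62 / 4) - 1 = 14
                (threaded path exercised)
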
Bug: chromium:1486441 Change-Id: Ie6b2da2e31cfa1717a481f55eebc8c875db94d87 --- test/encode_api_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index f1c98b2c71..6b22febf6e 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -396,7 +396,7 @@ TEST(EncodeAPI, ConfigResizeChangeThreadCount) { EXPECT_EQ(vpx_codec_enc_config_set(&enc.ctx, &cfg), VPX_CODEC_OK) << vpx_codec_error_detail(&enc.ctx); - cfg.g_w = 16; + cfg.g_w = 1000; cfg.g_h = 720; for (const auto threads : { 1, 4, 8, 6, 2, 1 }) { From 0a3e2b4ca1540f4a52848b79fcfad62391fcdba8 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 3 Oct 2023 09:58:23 -0400 Subject: [PATCH 837/926] Factor out common code used in test binaries Bug: b/303112617 Change-Id: Icbe16e95ff334a9578a692cc51b4773393ad0005 --- test/init_vpx_test.cc | 74 ++++++++++++++++++++++++++++++++ test/init_vpx_test.h | 18 ++++++++ test/test.mk | 4 ++ test/test_intra_pred_speed.cc | 8 +++- test/test_libvpx.cc | 81 +---------------------------------- 5 files changed, 105 insertions(+), 80 deletions(-) create mode 100644 test/init_vpx_test.cc create mode 100644 test/init_vpx_test.h diff --git a/test/init_vpx_test.cc b/test/init_vpx_test.cc new file mode 100644 index 0000000000..5b40d9e4f7 --- /dev/null +++ b/test/init_vpx_test.cc @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "test/init_vpx_test.h" + +#include +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "./vpx_config.h" +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 +#include "vpx_ports/x86.h" +#endif +extern "C" { +#if CONFIG_VP8 +extern void vp8_rtcd(); +#endif // CONFIG_VP8 +#if CONFIG_VP9 +extern void vp9_rtcd(); +#endif // CONFIG_VP9 +extern void vpx_dsp_rtcd(); +extern void vpx_scale_rtcd(); +} + +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 +static void append_negative_gtest_filter(const char *str) { + std::string filter = GTEST_FLAG_GET(filter); + // Negative patterns begin with one '-' followed by a ':' separated list. 
+ if (filter.find('-') == std::string::npos) filter += '-'; + filter += str; + GTEST_FLAG_SET(filter, filter); +} +#endif // VPX_ARCH_X86 || VPX_ARCH_X86_64 + +namespace libvpx_test { +void init_vpx_test() { +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 + const int simd_caps = x86_simd_caps(); + if (!(simd_caps & HAS_MMX)) append_negative_gtest_filter(":MMX.*:MMX/*"); + if (!(simd_caps & HAS_SSE)) append_negative_gtest_filter(":SSE.*:SSE/*"); + if (!(simd_caps & HAS_SSE2)) append_negative_gtest_filter(":SSE2.*:SSE2/*"); + if (!(simd_caps & HAS_SSE3)) append_negative_gtest_filter(":SSE3.*:SSE3/*"); + if (!(simd_caps & HAS_SSSE3)) { + append_negative_gtest_filter(":SSSE3.*:SSSE3/*"); + } + if (!(simd_caps & HAS_SSE4_1)) { + append_negative_gtest_filter(":SSE4_1.*:SSE4_1/*"); + } + if (!(simd_caps & HAS_AVX)) append_negative_gtest_filter(":AVX.*:AVX/*"); + if (!(simd_caps & HAS_AVX2)) append_negative_gtest_filter(":AVX2.*:AVX2/*"); + if (!(simd_caps & HAS_AVX512)) { + append_negative_gtest_filter(":AVX512.*:AVX512/*"); + } +#endif // VPX_ARCH_X86 || VPX_ARCH_X86_64 + +#if !CONFIG_SHARED +// Shared library builds don't support whitebox tests +// that exercise internal symbols. +#if CONFIG_VP8 + vp8_rtcd(); +#endif // CONFIG_VP8 +#if CONFIG_VP9 + vp9_rtcd(); +#endif // CONFIG_VP9 + vpx_dsp_rtcd(); + vpx_scale_rtcd(); +#endif // !CONFIG_SHARED +} +} // namespace libvpx_test diff --git a/test/init_vpx_test.h b/test/init_vpx_test.h new file mode 100644 index 0000000000..39ed6525b3 --- /dev/null +++ b/test/init_vpx_test.h @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef TEST_TEST_COMMON_MAIN_H_ +#define TEST_TEST_COMMON_MAIN_H_ + +namespace libvpx_test { +void init_vpx_test(); +} + +#endif diff --git a/test/test.mk b/test/test.mk index b64e89bb43..d4521f08bf 100644 --- a/test/test.mk +++ b/test/test.mk @@ -7,6 +7,8 @@ LIBVPX_TEST_SRCS-yes += codec_factory.h LIBVPX_TEST_SRCS-yes += md5_helper.h LIBVPX_TEST_SRCS-yes += register_state_check.h LIBVPX_TEST_SRCS-yes += test.mk +LIBVPX_TEST_SRCS-yes += init_vpx_test.cc +LIBVPX_TEST_SRCS-yes += init_vpx_test.h LIBVPX_TEST_SRCS-yes += test_libvpx.cc LIBVPX_TEST_SRCS-yes += test_vectors.cc LIBVPX_TEST_SRCS-yes += test_vectors.h @@ -215,6 +217,8 @@ endif TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c +TEST_INTRA_PRED_SPEED_SRCS-yes += init_vpx_test.cc +TEST_INTRA_PRED_SPEED_SRCS-yes += init_vpx_test.h RC_INTERFACE_TEST_SRCS-yes := test_rc_interface.cc RC_INTERFACE_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ratectrl_rtc_test.cc diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index b013e0bd5d..4c464a262f 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -14,9 +14,11 @@ #include "third_party/googletest/src/include/gtest/gtest.h" +#include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "test/acm_random.h" #include "test/clear_system_state.h" +#include "test/init_vpx_test.h" #include "test/md5_helper.h" #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" @@ -607,4 +609,8 @@ HIGHBD_INTRA_PRED_TEST( #endif // CONFIG_VP9_HIGHBITDEPTH -#include "test/test_libvpx.cc" +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + ::libvpx_test::init_vpx_test(); + return RUN_ALL_TESTS(); +} diff --git a/test/test_libvpx.cc b/test/test_libvpx.cc index caab2dbd01..c1798b8b8b 100644 --- a/test/test_libvpx.cc +++ b/test/test_libvpx.cc @@ -7,89 +7,12 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#include +#include "test/init_vpx_test.h" #include "third_party/googletest/src/include/gtest/gtest.h" -#include "./vpx_config.h" -#if VPX_ARCH_ARM -#include "vpx_ports/arm.h" -#endif -#if VPX_ARCH_X86 || VPX_ARCH_X86_64 -#include "vpx_ports/x86.h" -#endif -extern "C" { -#if CONFIG_VP8 -extern void vp8_rtcd(); -#endif // CONFIG_VP8 -#if CONFIG_VP9 -extern void vp9_rtcd(); -#endif // CONFIG_VP9 -extern void vpx_dsp_rtcd(); -extern void vpx_scale_rtcd(); -} - -#if (!CONFIG_SHARED && VPX_ARCH_ARM) || VPX_ARCH_X86 || VPX_ARCH_X86_64 -static void append_negative_gtest_filter(const char *str) { - std::string filter = ::testing::FLAGS_gtest_filter; - // Negative patterns begin with one '-' followed by a ':' separated list. 
- if (filter.find('-') == std::string::npos) filter += '-'; - filter += str; - ::testing::FLAGS_gtest_filter = filter; -} -#endif // (!CONFIG_SHARED && VPX_ARCH_ARM) || VPX_ARCH_X86 || VPX_ARCH_X86_64 - int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); - -#if !CONFIG_SHARED -#if VPX_ARCH_AARCH64 - const int caps = arm_cpu_caps(); - if (!(caps & HAS_NEON_DOTPROD)) { - append_negative_gtest_filter(":NEON_DOTPROD.*:NEON_DOTPROD/*"); - } - if (!(caps & HAS_NEON_I8MM)) { - append_negative_gtest_filter(":NEON_I8MM.*:NEON_I8MM/*"); - } -#elif VPX_ARCH_ARM - const int caps = arm_cpu_caps(); - if (!(caps & HAS_NEON)) { - append_negative_gtest_filter(":NEON.*:NEON/*"); - } -#endif // VPX_ARCH_ARM -#endif // !CONFIG_SHARED - -#if VPX_ARCH_X86 || VPX_ARCH_X86_64 - const int simd_caps = x86_simd_caps(); - if (!(simd_caps & HAS_MMX)) append_negative_gtest_filter(":MMX.*:MMX/*"); - if (!(simd_caps & HAS_SSE)) append_negative_gtest_filter(":SSE.*:SSE/*"); - if (!(simd_caps & HAS_SSE2)) append_negative_gtest_filter(":SSE2.*:SSE2/*"); - if (!(simd_caps & HAS_SSE3)) append_negative_gtest_filter(":SSE3.*:SSE3/*"); - if (!(simd_caps & HAS_SSSE3)) { - append_negative_gtest_filter(":SSSE3.*:SSSE3/*"); - } - if (!(simd_caps & HAS_SSE4_1)) { - append_negative_gtest_filter(":SSE4_1.*:SSE4_1/*"); - } - if (!(simd_caps & HAS_AVX)) append_negative_gtest_filter(":AVX.*:AVX/*"); - if (!(simd_caps & HAS_AVX2)) append_negative_gtest_filter(":AVX2.*:AVX2/*"); - if (!(simd_caps & HAS_AVX512)) { - append_negative_gtest_filter(":AVX512.*:AVX512/*"); - } -#endif // VPX_ARCH_X86 || VPX_ARCH_X86_64 - -#if !CONFIG_SHARED -// Shared library builds don't support whitebox tests -// that exercise internal symbols. -#if CONFIG_VP8 - vp8_rtcd(); -#endif // CONFIG_VP8 -#if CONFIG_VP9 - vp9_rtcd(); -#endif // CONFIG_VP9 - vpx_dsp_rtcd(); - vpx_scale_rtcd(); -#endif // !CONFIG_SHARED - + ::libvpx_test::init_vpx_test(); return RUN_ALL_TESTS(); } From 5b6ceba996f08e2502737eec14f291ac8d46a5bc Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 3 Oct 2023 10:14:43 -0400 Subject: [PATCH 838/926] Include vpx_config.h for macros Clear some clang-tidy complaints Change-Id: I6690428d336c81709befd19a33e11c1367275df3 --- vpx_ports/aarch32_cpudetect.c | 1 + vpx_ports/aarch64_cpudetect.c | 1 + 2 files changed, 2 insertions(+) diff --git a/vpx_ports/aarch32_cpudetect.c b/vpx_ports/aarch32_cpudetect.c index 48bdc70f92..639f4ff8ea 100644 --- a/vpx_ports/aarch32_cpudetect.c +++ b/vpx_ports/aarch32_cpudetect.c @@ -9,6 +9,7 @@ */ // Feature detection code for Armv7-A / AArch32. +#include "./vpx_config.h" #include "arm_cpudetect.h" #if !CONFIG_RUNTIME_CPU_DETECT diff --git a/vpx_ports/aarch64_cpudetect.c b/vpx_ports/aarch64_cpudetect.c index dad081c8ce..ac68f44452 100644 --- a/vpx_ports/aarch64_cpudetect.c +++ b/vpx_ports/aarch64_cpudetect.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include "./vpx_config.h" #include "arm_cpudetect.h" #if defined(__APPLE__) From f73026c2ccc276ee1493a3cd733149fe8ed7df33 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 3 Oct 2023 22:00:02 -0400 Subject: [PATCH 839/926] Use correct include guards for init_vpx_test.h Bug: b/303112617 Change-Id: Ie18df33b2bcab91c18e920825f4ed3a29e18373b --- test/init_vpx_test.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/init_vpx_test.h b/test/init_vpx_test.h index 39ed6525b3..5e0dbb0e7e 100644 --- a/test/init_vpx_test.h +++ b/test/init_vpx_test.h @@ -8,11 +8,11 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_TEST_COMMON_MAIN_H_ -#define TEST_TEST_COMMON_MAIN_H_ +#ifndef TEST_INIT_VPX_TEST_H_ +#define TEST_INIT_VPX_TEST_H_ namespace libvpx_test { void init_vpx_test(); } -#endif +#endif // TEST_INIT_VPX_TEST_H_ From f67f9ce3469fba7585c1a9fc4e5f1700b5433a39 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Sat, 30 Sep 2023 11:17:30 -0700 Subject: [PATCH 840/926] Declare cur_row inside #if CONFIG_MULTITHREAD Fix the following compiler warning when libvpx is configured with the --disable-multithread option: vp9/common/vp9_thread_common.c:391:7: warning: variable 'cur_row' set but not used [-Wunused-but-set-variable] int cur_row; ^ Change-Id: I53aa279152715083df40990eb7fdcaeb77a66777 --- vp9/common/vp9_thread_common.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/vp9/common/vp9_thread_common.c b/vp9/common/vp9_thread_common.c index 1c6ecc0fe6..8df18af3b8 100644 --- a/vp9/common/vp9_thread_common.c +++ b/vp9/common/vp9_thread_common.c @@ -388,10 +388,10 @@ void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) { static int get_next_row(VP9_COMMON *cm, VP9LfSync *lf_sync) { int return_val = -1; - int cur_row; const int max_rows = cm->mi_rows; #if CONFIG_MULTITHREAD + int cur_row; const int tile_cols = 1 << cm->log2_tile_cols; pthread_mutex_lock(lf_sync->lf_mutex); @@ -428,14 +428,8 @@ static int get_next_row(VP9_COMMON *cm, VP9LfSync *lf_sync) { #else (void)lf_sync; if (cm->lf_row < max_rows) { - cur_row = cm->lf_row >> MI_BLOCK_SIZE_LOG2; return_val = cm->lf_row; cm->lf_row += MI_BLOCK_SIZE; - if (cm->lf_row < max_rows) { - /* If this is not the last row, make sure the next row is also decoded. - * This is because the intra predict has to happen before loop filter */ - cur_row += 1; - } } #endif // CONFIG_MULTITHREAD From ea67878f8c36f96c54947a72f759c955ce69cb80 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Tue, 3 Oct 2023 20:08:18 -0700 Subject: [PATCH 841/926] Clean up vp8cx_create/remove_encoder_threads() Make vp8cx_create_encoder_threads() undo everything cleanly before returning an error. Make vp8cx_remove_encoder_threads() reset pointer fields to NULL after freeing them, reset encoding_thread_count to 0, and reset b_lpf_running to 0 (false). This makes it safe to call vp8cx_create_encoder_threads() after calling vp8cx_remove_encoder_threads(). 
Change-Id: I586f06ce3d5b1c88ca46884bb4d6667ffc97e440 --- vp8/encoder/ethreading.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c index b7f1932c58..353496cc57 100644 --- a/vp8/encoder/ethreading.c +++ b/vp8/encoder/ethreading.c @@ -553,6 +553,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { /* shutdown other threads */ vpx_atomic_store_release(&cpi->b_multi_threaded, 0); for (--ithread; ithread >= 0; ithread--) { + sem_post(&cpi->h_event_start_encoding[ithread]); + sem_post(&cpi->h_event_end_encoding[ithread]); pthread_join(cpi->h_encoding_thread[ithread], 0); sem_destroy(&cpi->h_event_start_encoding[ithread]); sem_destroy(&cpi->h_event_end_encoding[ithread]); @@ -560,10 +562,16 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { /* free thread related resources */ vpx_free(cpi->h_event_start_encoding); + cpi->h_event_start_encoding = NULL; vpx_free(cpi->h_event_end_encoding); + cpi->h_event_end_encoding = NULL; vpx_free(cpi->h_encoding_thread); + cpi->h_encoding_thread = NULL; vpx_free(cpi->mb_row_ei); + cpi->mb_row_ei = NULL; vpx_free(cpi->en_thread_data); + cpi->en_thread_data = NULL; + cpi->encoding_thread_count = 0; return -1; } @@ -592,10 +600,16 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { /* free thread related resources */ vpx_free(cpi->h_event_start_encoding); + cpi->h_event_start_encoding = NULL; vpx_free(cpi->h_event_end_encoding); + cpi->h_event_end_encoding = NULL; vpx_free(cpi->h_encoding_thread); + cpi->h_encoding_thread = NULL; vpx_free(cpi->mb_row_ei); + cpi->mb_row_ei = NULL; vpx_free(cpi->en_thread_data); + cpi->en_thread_data = NULL; + cpi->encoding_thread_count = 0; return -2; } @@ -627,13 +641,20 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi) { sem_destroy(&cpi->h_event_end_lpf); sem_destroy(&cpi->h_event_start_lpf); + cpi->b_lpf_running = 0; /* free thread related resources */ vpx_free(cpi->h_event_start_encoding); + cpi->h_event_start_encoding = NULL; vpx_free(cpi->h_event_end_encoding); + cpi->h_event_end_encoding = NULL; vpx_free(cpi->h_encoding_thread); + cpi->h_encoding_thread = NULL; vpx_free(cpi->mb_row_ei); + cpi->mb_row_ei = NULL; vpx_free(cpi->en_thread_data); + cpi->en_thread_data = NULL; + cpi->encoding_thread_count = 0; } } #endif From 41caf8fef5dbfd56d1d79303964b91875d3925a3 Mon Sep 17 00:00:00 2001 From: Anupam Pandey Date: Tue, 3 Oct 2023 14:46:36 +0530 Subject: [PATCH 842/926] Add an emms instruction to vpx_subtract_block This CL adds an `emms` instruction at the end of the MMX assembly for the vpx_subtract_block function, to properly clear the register state. This resolves a mismatch between x86 build and C only build. BUG=webm:1816 Change-Id: I79d2947da7f587f3558a2ae17df214d2faf59e74 --- vpx_dsp/x86/subtract_sse2.asm | 1 + 1 file changed, 1 insertion(+) diff --git a/vpx_dsp/x86/subtract_sse2.asm b/vpx_dsp/x86/subtract_sse2.asm index 4273efb854..e3055ab292 100644 --- a/vpx_dsp/x86/subtract_sse2.asm +++ b/vpx_dsp/x86/subtract_sse2.asm @@ -124,4 +124,5 @@ INIT_MMX lea predq, [predq+pred_str*2] sub rowsd, 2 jg .loop_4 + emms RET From c23da380a386e2cae2c757f06a2aebfd72451413 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Tue, 3 Oct 2023 19:30:12 -0700 Subject: [PATCH 843/926] VP8: Allocate cpi->mt_current_mb_col array lazily Add the mt_current_mb_col_size field to VP8_COMP to record the size of the mt_current_mb_col array. 
Move the allocation of the mt_current_mb_col array from vp8_alloc_compressor_data() to vp8_encode_frame(), where the use of mt_current_mb_col starts. Allocate mt_current_mb_col right before use if mt_current_mb_col hasn't been allocated or if the current size is incorrect. Move the deallocation of the mt_current_mb_col array from dealloc_compressor_data() to vp8cx_remove_encoder_threads(). Move the TODO(https://crbug.com/1486441) comment from vp8/encoder/onyx_if.c to vp8/vp8_cx_iface.c. Change-Id: Ic5a0793278c2cc94876669aaa0dd732412876673 --- vp8/encoder/encodeframe.c | 9 +++++++++ vp8/encoder/ethreading.c | 3 +++ vp8/encoder/onyx_if.c | 21 --------------------- vp8/encoder/onyx_int.h | 1 + vp8/vp8_cx_iface.c | 2 ++ 5 files changed, 15 insertions(+), 21 deletions(-) diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index dc29945729..3f0319a54b 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -750,6 +750,15 @@ void vp8_encode_frame(VP8_COMP *cpi) { vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, cpi->encoding_thread_count); + if (cpi->mt_current_mb_col_size != cm->mb_rows) { + vpx_free(cpi->mt_current_mb_col); + cpi->mt_current_mb_col = NULL; + cpi->mt_current_mb_col_size = 0; + CHECK_MEM_ERROR( + &cpi->common.error, cpi->mt_current_mb_col, + vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows)); + cpi->mt_current_mb_col_size = cm->mb_rows; + } for (i = 0; i < cm->mb_rows; ++i) vpx_atomic_store_release(&cpi->mt_current_mb_col[i], -1); diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c index 353496cc57..3362755094 100644 --- a/vp8/encoder/ethreading.c +++ b/vp8/encoder/ethreading.c @@ -644,6 +644,9 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi) { cpi->b_lpf_running = 0; /* free thread related resources */ + vpx_free(cpi->mt_current_mb_col); + cpi->mt_current_mb_col = NULL; + cpi->mt_current_mb_col_size = 0; vpx_free(cpi->h_event_start_encoding); cpi->h_event_start_encoding = NULL; vpx_free(cpi->h_event_end_encoding); diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index c5e9970c3c..890237f7b2 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -442,11 +442,6 @@ static void dealloc_compressor_data(VP8_COMP *cpi) { vpx_free(cpi->mb.pip); cpi->mb.pip = 0; - -#if CONFIG_MULTITHREAD - vpx_free(cpi->mt_current_mb_col); - cpi->mt_current_mb_col = NULL; -#endif } static void enable_segmentation(VP8_COMP *cpi) { @@ -1224,17 +1219,6 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { } else { cpi->mt_sync_range = 16; } - - if (cpi->oxcf.multi_threaded > 1) { - int i; - - vpx_free(cpi->mt_current_mb_col); - CHECK_MEM_ERROR(&cpi->common.error, cpi->mt_current_mb_col, - vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows)); - for (i = 0; i < cm->mb_rows; ++i) - vpx_atomic_init(&cpi->mt_current_mb_col[i], 0); - } - #endif vpx_free(cpi->tplist); @@ -1447,11 +1431,6 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { last_h = cpi->oxcf.Height; prev_number_of_layers = cpi->oxcf.number_of_layers; - if (cpi->initial_width) { - // TODO(https://crbug.com/1486441): Allow changing thread counts; the - // allocation is done once in vp8_create_compressor(). 
- oxcf->multi_threaded = cpi->oxcf.multi_threaded; - } cpi->oxcf = *oxcf; switch (cpi->oxcf.Mode) { diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index 4304f054ca..2c6a55a845 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -526,6 +526,7 @@ typedef struct VP8_COMP { #if CONFIG_MULTITHREAD /* multithread data */ vpx_atomic_int *mt_current_mb_col; + int mt_current_mb_col_size; int mt_sync_range; vpx_atomic_int b_multi_threaded; int encoding_thread_count; diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 8950de0d8a..470fe9e6df 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -488,6 +488,8 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx, ctx->cfg = *cfg; set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); vp8_change_config(ctx->cpi, &ctx->oxcf); + // TODO(https://crbug.com/1486441): Change thread counts; + // vp8cx_create_encoder_threads() is called once in vp8_create_compressor(). ctx->cpi->common.error.setjmp = 0; return VPX_CODEC_OK; } From 8cb4544c21a221e04fe21222349431ba1779d884 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Sat, 30 Sep 2023 15:50:14 -0700 Subject: [PATCH 844/926] VP8: allow thread count changes Fix the TODO(https://crbug.com/1486441) comment in vp8/vp8_cx_iface.c. Make vp8cx_create_encoder_threads() work after it has been called before. If there are already the exact number of threads it needs to create, return immediately. Otherwise, shut down the existing threads (by calling vp8cx_remove_encoder_threads()) and create the required number of threads. Call vp8cx_create_encoder_threads() in vp8e_set_config() to respond to changes in g_threads or g_w (which also affects the number of threads through cm->mb_cols and cpi->mt_sync_range). Change-Id: I552eeca5b1f1f5313f59559eb1da396f270a2429 --- vp8/encoder/ethreading.c | 16 ++++++++-------- vp8/vp8_cx_iface.c | 8 ++++++-- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c index 3362755094..9993905567 100644 --- a/vp8/encoder/ethreading.c +++ b/vp8/encoder/ethreading.c @@ -487,15 +487,10 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x, int vp8cx_create_encoder_threads(VP8_COMP *cpi) { const VP8_COMMON *cm = &cpi->common; - - vpx_atomic_init(&cpi->b_multi_threaded, 0); - cpi->encoding_thread_count = 0; - cpi->b_lpf_running = 0; + int th_count = 0; if (cm->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1) { - int ithread; - int th_count = cpi->oxcf.multi_threaded - 1; - int rc = 0; + th_count = cpi->oxcf.multi_threaded - 1; /* don't allocate more threads than cores available */ if (cpi->oxcf.multi_threaded > cm->processor_core_count) { @@ -507,8 +502,13 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { if (th_count > ((cm->mb_cols / cpi->mt_sync_range) - 1)) { th_count = (cm->mb_cols / cpi->mt_sync_range) - 1; } + } + if (th_count == cpi->encoding_thread_count) return 0; - if (th_count == 0) return 0; + vp8cx_remove_encoder_threads(cpi); + if (th_count != 0) { + int ithread; + int rc = 0; CHECK_MEM_ERROR(&cpi->common.error, cpi->h_encoding_thread, vpx_malloc(sizeof(pthread_t) * th_count)); diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 470fe9e6df..20c44ff4e1 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -19,6 +19,9 @@ #include "vpx_ports/static_assert.h" #include "vpx_ports/system_state.h" #include "vpx_util/vpx_timestamp.h" +#if CONFIG_MULTITHREAD +#include "vp8/encoder/ethreading.h" +#endif #include "vp8/encoder/onyx_int.h" 
#include "vpx/vp8cx.h" #include "vp8/encoder/firstpass.h" @@ -488,8 +491,9 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx, ctx->cfg = *cfg; set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); vp8_change_config(ctx->cpi, &ctx->oxcf); - // TODO(https://crbug.com/1486441): Change thread counts; - // vp8cx_create_encoder_threads() is called once in vp8_create_compressor(). +#if CONFIG_MULTITHREAD + if (vp8cx_create_encoder_threads(ctx->cpi)) return VPX_CODEC_ERROR; +#endif ctx->cpi->common.error.setjmp = 0; return VPX_CODEC_OK; } From 7c31749387e0297a16289ba851559df4fd2f935d Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Sat, 30 Sep 2023 17:31:09 -0700 Subject: [PATCH 845/926] Declare some "VP8_CONFIG *oxcf" params as const Change-Id: Ia5e8445098e18da5978aacf17281f16252413f17 --- vp8/common/onyx.h | 2 +- vp8/encoder/onyx_if.c | 6 +++--- vp8/encoder/onyx_int.h | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h index 8c35e433e7..7f7f567c6a 100644 --- a/vp8/common/onyx.h +++ b/vp8/common/onyx.h @@ -240,7 +240,7 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf); void vp8_remove_compressor(struct VP8_COMP **comp); void vp8_init_config(struct VP8_COMP *onyx, VP8_CONFIG *oxcf); -void vp8_change_config(struct VP8_COMP *cpi, VP8_CONFIG *oxcf); +void vp8_change_config(struct VP8_COMP *cpi, const VP8_CONFIG *oxcf); int vp8_receive_raw_frame(struct VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 890237f7b2..12a99584f9 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -270,7 +270,7 @@ static int rescale(int val, int num, int denom) { return (int)(llval * llnum / llden); } -void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf, +void vp8_init_temporal_layer_context(VP8_COMP *cpi, const VP8_CONFIG *oxcf, const int layer, double prev_layer_framerate) { LAYER_CONTEXT *lc = &cpi->layer_context[layer]; @@ -328,7 +328,7 @@ void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf, // for any "new" layers. For "existing" layers, let them inherit the parameters // from the previous layer state (at the same layer #). In future we may want // to better map the previous layer state(s) to the "new" ones. 
-void vp8_reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf,
+void vp8_reset_temporal_layer_change(VP8_COMP *cpi, const VP8_CONFIG *oxcf,
                                      const int prev_num_layers) {
   int i;
   double prev_layer_framerate = 0;
@@ -1412,7 +1412,7 @@ void vp8_update_layer_contexts(VP8_COMP *cpi) {
   }
 }

-void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) {
+void vp8_change_config(VP8_COMP *cpi, const VP8_CONFIG *oxcf) {
   VP8_COMMON *cm = &cpi->common;
   int last_w, last_h;
   unsigned int prev_number_of_layers;
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 2c6a55a845..2f06702a1d 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -714,9 +714,9 @@ void vp8_initialize_enc(void);
 void vp8_alloc_compressor_data(VP8_COMP *cpi);
 int vp8_reverse_trans(int x);
-void vp8_reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf,
+void vp8_reset_temporal_layer_change(VP8_COMP *cpi, const VP8_CONFIG *oxcf,
                                      const int prev_num_layers);
-void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf,
+void vp8_init_temporal_layer_context(VP8_COMP *cpi, const VP8_CONFIG *oxcf,
                                      const int layer,
                                      double prev_layer_framerate);
 void vp8_update_layer_contexts(VP8_COMP *cpi);

From 9c377eafbedd3911e83ee80793b4fba80710d4e0 Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang
Date: Sat, 7 Oct 2023 07:06:36 -0700
Subject: [PATCH 846/926] Handle Arm/AArch64 runtime feature detection

Port the following libaom CLs to libvpx:
https://aomedia-review.googlesource.com/c/aom/+/178361
https://aomedia-review.googlesource.com/c/aom/+/180701
https://aomedia-review.googlesource.com/c/aom/+/181821

The tests themselves are not feature-gated in the same way that they
are used in the rest of the codebase since they are not controlled by
rtcd.pl. This means that tests that assume the existence of features
not present on the target can cause SIGILL to be thrown. This commit
extends init_vpx_test.cc to match the behaviour for other targets and
automatically disable testing for features that are not available on
the machine running the tests.

Call arm_cpu_caps() and x86_simd_caps() inside #if !CONFIG_SHARED. All
the SIMD-specialized functions (arm or x86) are internal functions, so
they are not exported from the libvpx shared library. If CONFIG_SHARED
is 1, it is not necessary to call arm_cpu_caps(), x86_simd_caps(), and
append_negative_gtest_filter() either.

Change-Id: I330631816bdb52842020c5aa2a1ad802865cc285
---
 test/init_vpx_test.cc | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/test/init_vpx_test.cc b/test/init_vpx_test.cc
index 5b40d9e4f7..e88c54f323 100644
--- a/test/init_vpx_test.cc
+++ b/test/init_vpx_test.cc
@@ -10,9 +10,14 @@
 #include "test/init_vpx_test.h"

+#include "./vpx_config.h"
+
+#if !CONFIG_SHARED
 #include <string>
 #include "third_party/googletest/src/include/gtest/gtest.h"
-#include "./vpx_config.h"
+#if VPX_ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
 #if VPX_ARCH_X86 || VPX_ARCH_X86_64
 #include "vpx_ports/x86.h"
 #endif
@@ -27,7 +32,7 @@ extern void vpx_dsp_rtcd();
 extern void vpx_scale_rtcd();
 }

-#if VPX_ARCH_X86 || VPX_ARCH_X86_64
+#if VPX_ARCH_ARM || VPX_ARCH_X86 || VPX_ARCH_X86_64
 static void append_negative_gtest_filter(const char *str) {
   std::string filter = GTEST_FLAG_GET(filter);
   // Negative patterns begin with one '-' followed by a ':' separated list.
@@ -35,10 +40,25 @@ static void append_negative_gtest_filter(const char *str) { filter += str; GTEST_FLAG_SET(filter, filter); } -#endif // VPX_ARCH_X86 || VPX_ARCH_X86_64 +#endif // VPX_ARCH_ARM || VPX_ARCH_X86 || VPX_ARCH_X86_64 +#endif // !CONFIG_SHARED namespace libvpx_test { void init_vpx_test() { +#if !CONFIG_SHARED +#if VPX_ARCH_AARCH64 + const int caps = arm_cpu_caps(); + if (!(caps & HAS_NEON_DOTPROD)) { + append_negative_gtest_filter(":NEON_DOTPROD.*:NEON_DOTPROD/*"); + } + if (!(caps & HAS_NEON_I8MM)) { + append_negative_gtest_filter(":NEON_I8MM.*:NEON_I8MM/*"); + } +#elif VPX_ARCH_ARM + const int caps = arm_cpu_caps(); + if (!(caps & HAS_NEON)) append_negative_gtest_filter(":NEON.*:NEON/*"); +#endif // VPX_ARCH_ARM + #if VPX_ARCH_X86 || VPX_ARCH_X86_64 const int simd_caps = x86_simd_caps(); if (!(simd_caps & HAS_MMX)) append_negative_gtest_filter(":MMX.*:MMX/*"); @@ -58,9 +78,8 @@ void init_vpx_test() { } #endif // VPX_ARCH_X86 || VPX_ARCH_X86_64 -#if !CONFIG_SHARED -// Shared library builds don't support whitebox tests -// that exercise internal symbols. + // Shared library builds don't support whitebox tests that exercise internal + // symbols. #if CONFIG_VP8 vp8_rtcd(); #endif // CONFIG_VP8 From 2ab7ba82511eceb24d45361a0ba81a1460f5dbfc Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 10 Oct 2023 17:43:16 -0400 Subject: [PATCH 847/926] Force mode search on 64x64 if no mode is selected A speed feature disable_split_mask (set to 63) could cause no mode and partition to be selected in rd_pick_partition because: -> thresh_mult_sub8x8 all INT_MAX -> All modes skipped for sub8x8 blocks -> found_best_rd is 0 -> break from the loop of 4 sub blocks -> sum_rdc is INT_MAX -> No rd update -> should_encode_sb is 0 -> Propagating to top of the tree -> No partition / mode is selected Bug: b/290499385 Change-Id: Ia655e262f3b32445347ae0aaf1a2d868cea997f3 --- vp9/encoder/vp9_encodeframe.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 0d03d01c80..67869596b1 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -4419,6 +4419,19 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); } + if (bsize == BLOCK_64X64 && best_rdc.rdcost == INT64_MAX) { + vp9_rd_cost_reset(&this_rdc); + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, BLOCK_64X64, + ctx, INT_MAX, INT64_MAX); + ctx->rdcost = this_rdc.rdcost; + vp9_rd_cost_update(partition_mul, x->rddiv, &this_rdc); + if (this_rdc.rdcost < best_rdc.rdcost) { + best_rdc = this_rdc; + should_encode_sb = 1; + pc_tree->partitioning = PARTITION_NONE; + } + } + *rd_cost = best_rdc; if (should_encode_sb && pc_tree->index != 3) { From 0129e64a65594a2af81df5df2ad474dd168a1519 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 16 Oct 2023 11:22:48 -0400 Subject: [PATCH 848/926] Fix ubsan failure caused by left shift of negative Bug: b/305642441 Change-Id: Iddb1572c284161140da48f61b04cf600e5b57ecc --- vp8/encoder/mcomp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index b92e2135e9..bc150e482b 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -1123,8 +1123,8 @@ int vp8_diamond_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, } } - this_mv.as_mv.row = best_mv->as_mv.row << 3; - this_mv.as_mv.col = best_mv->as_mv.col << 3; + this_mv.as_mv.row = best_mv->as_mv.row * 8; + 
this_mv.as_mv.col = best_mv->as_mv.col * 8;

   return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) +
          mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
@@ -1441,8 +1441,8 @@ int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
     }
   }

-  this_mv.as_mv.row = ref_mv->as_mv.row << 3;
-  this_mv.as_mv.col = ref_mv->as_mv.col << 3;
+  this_mv.as_mv.row = ref_mv->as_mv.row * 8;
+  this_mv.as_mv.col = ref_mv->as_mv.col * 8;

   return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) +
          mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);

From e4db6c3aacb3fbcbb939f132915234988f8617c1 Mon Sep 17 00:00:00 2001
From: Jerome Jiang
Date: Mon, 16 Oct 2023 16:26:18 -0400
Subject: [PATCH 849/926] Cap avg_frame_bandwidth at INT_MAX

avg_frame_bandwidth = target_bandwidth / framerate

If target_bandwidth is too big and/or framerate is too small (< 1),
avg_frame_bandwidth could overflow.

Bug: chromium:1492864
Change-Id: I32314da1414b472ae4bf2acdcd81b8a948286146
---
 test/encode_api_test.cc            | 23 +++++++++++++++++++++++
 vp9/encoder/vp9_ratectrl.c         |  3 ++-
 vp9/encoder/vp9_svc_layercontext.c | 12 ++++++++----
 3 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc
index 6b22febf6e..8e90af9113 100644
--- a/test/encode_api_test.cc
+++ b/test/encode_api_test.cc
@@ -409,6 +409,29 @@ TEST(EncodeAPI, ConfigResizeChangeThreadCount) {
 }

 #if CONFIG_VP9_ENCODER
+TEST(EncodeAPI, ConfigLargeTargetBitrateVp9) {
+  constexpr int kWidth = 16383;
+  constexpr int kHeight = 16383;
+  constexpr auto *iface = &vpx_codec_vp9_cx_algo;
+  SCOPED_TRACE(vpx_codec_iface_name(iface));
+  vpx_codec_enc_cfg_t cfg = {};
+  struct Encoder {
+    ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); }
+    vpx_codec_ctx_t ctx = {};
+  } enc;
+
+  ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+  // The following setting will cause avg_frame_bandwidth in rate control to be
+  // larger than INT_MAX
+  cfg.rc_target_bitrate = INT_MAX;
+  cfg.g_timebase.den = 1;
+  cfg.g_timebase.num = 10;
+  EXPECT_NO_FATAL_FAILURE(InitCodec(*iface, kWidth, kHeight, &enc.ctx, &cfg));
+  EXPECT_NO_FATAL_FAILURE(EncodeWithConfig(cfg, &enc.ctx))
+      << "target bitrate: " << cfg.rc_target_bitrate << " framerate: "
+      << static_cast<double>(cfg.g_timebase.den) / cfg.g_timebase.num;
+}
+
 class EncodeApiGetTplStatsTest
     : public ::libvpx_test::EncoderTest,
       public ::testing::TestWithParam {
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index fe7414687a..7f4761dfab 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -2643,7 +2643,8 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
   int vbr_max_bits;

-  rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / cpi->framerate);
+  rc->avg_frame_bandwidth =
+      (int)VPXMIN(oxcf->target_bandwidth / cpi->framerate, INT_MAX);
   rc->min_frame_bandwidth =
       (int)(rc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100);

diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index 24fd818133..0df34bf459 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -232,7 +232,8 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
         VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size);
     lrc->buffer_level = VPXMIN(lrc->buffer_level, lrc->maximum_buffer_size);
     lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[tl];
-    lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth
/ lc->framerate); + lrc->avg_frame_bandwidth = + (int)VPXMIN(lc->target_bandwidth / lc->framerate, INT_MAX); lrc->max_frame_bandwidth = rc->max_frame_bandwidth; lrc->worst_quality = rc->worst_quality; lrc->best_quality = rc->best_quality; @@ -272,7 +273,8 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, } else { lc->framerate = cpi->framerate; } - lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->avg_frame_bandwidth = + (int)VPXMIN(lc->target_bandwidth / lc->framerate, INT_MAX); lrc->max_frame_bandwidth = rc->max_frame_bandwidth; // Update qp-related quantities. lrc->worst_quality = rc->worst_quality; @@ -314,7 +316,8 @@ void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) { const int tl = svc->temporal_layer_id; lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[tl]; - lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->avg_frame_bandwidth = + (int)VPXMIN(lc->target_bandwidth / lc->framerate, INT_MAX); lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth; // Update the average layer frame size (non-cumulative per-frame-bw). if (tl == 0) { @@ -336,7 +339,8 @@ void vp9_update_spatial_layer_framerate(VP9_COMP *const cpi, double framerate) { RATE_CONTROL *const lrc = &lc->rc; lc->framerate = framerate; - lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->avg_frame_bandwidth = + (int)VPXMIN(lc->target_bandwidth / lc->framerate, INT_MAX); lrc->min_frame_bandwidth = (int)(lrc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100); lrc->max_frame_bandwidth = (int)(((int64_t)lrc->avg_frame_bandwidth * From 424723dc025ce451dab9568239a46b18d0919b4d Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 19 Oct 2023 10:06:50 -0400 Subject: [PATCH 850/926] Run bitrate overflow test only on 64bit systems Frame size caps the target bitrate internally, so the frame size needs to be large enough to reproduce the target bitrate overflow in the fuzzing test. 
However the frame size needed exceeds the max buffer allowed on 32bit system defined by VPX_MAX_ALLOCABLE_MEMORY Bug: chromium:1492864 Change-Id: Ia3a9a78cd35516373897039a7769b492e29e8450 --- test/encode_api_test.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 8e90af9113..cf89fd1f24 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -409,6 +409,7 @@ TEST(EncodeAPI, ConfigResizeChangeThreadCount) { } #if CONFIG_VP9_ENCODER +#if VPX_ARCH_X86_64 || VPX_ARCH_AARCH64 TEST(EncodeAPI, ConfigLargeTargetBitrateVp9) { constexpr int kWidth = 16383; constexpr int kHeight = 16383; @@ -431,6 +432,7 @@ TEST(EncodeAPI, ConfigLargeTargetBitrateVp9) { << "target bitrate: " << cfg.rc_target_bitrate << " framerate: " << static_cast(cfg.g_timebase.den) / cfg.g_timebase.num; } +#endif // VPX_ARCH_X86_64 || VPX_ARCH_AARCH64 class EncodeApiGetTplStatsTest : public ::libvpx_test::EncoderTest, From 9004ace97802e2e6bb3de67952fd140c95d43e6e Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 19 Oct 2023 18:28:48 -0700 Subject: [PATCH 851/926] Also test VPX_ARCH_AARCH64 for 64-bit platforms Change-Id: Ic11ccd791ff78801e0aba1d12ad2d99b9941ce9d --- test/realtime_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/realtime_test.cc b/test/realtime_test.cc index 88e510fd0d..a9870b3cbf 100644 --- a/test/realtime_test.cc +++ b/test/realtime_test.cc @@ -95,7 +95,7 @@ TEST_P(RealtimeTest, IntegerOverflow) { TestIntegerOverflow(2048, 2048); } TEST_P(RealtimeTest, IntegerOverflowLarge) { if (IsVP9()) { -#if VPX_ARCH_X86_64 +#if VPX_ARCH_AARCH64 || VPX_ARCH_X86_64 TestIntegerOverflow(16384, 16384); #else TestIntegerOverflow(4096, 4096); From 352f9f64df62c8673f54651a90cd6d3935c34c6f Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 20 Oct 2023 14:25:14 -0400 Subject: [PATCH 852/926] Reduce memory usage of test with large frame size - Use smaller frame size that still triggers the overflow - Do not run encoder as the encoder init also triggers the overflow Bug: chromium:1492864 Change-Id: I392549abf69f1cfb3754cc847a214513ec9bedc5 --- test/encode_api_test.cc | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index cf89fd1f24..270be3679e 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -409,10 +409,12 @@ TEST(EncodeAPI, ConfigResizeChangeThreadCount) { } #if CONFIG_VP9_ENCODER +// Frame size needed to trigger the overflow exceeds the max buffer allowed on +// 32-bit systems defined by VPX_MAX_ALLOCABLE_MEMORY #if VPX_ARCH_X86_64 || VPX_ARCH_AARCH64 TEST(EncodeAPI, ConfigLargeTargetBitrateVp9) { - constexpr int kWidth = 16383; - constexpr int kHeight = 16383; + constexpr int kWidth = 12383; + constexpr int kHeight = 8192; constexpr auto *iface = &vpx_codec_vp9_cx_algo; SCOPED_TRACE(vpx_codec_iface_name(iface)); vpx_codec_enc_cfg_t cfg = {}; @@ -425,10 +427,11 @@ TEST(EncodeAPI, ConfigLargeTargetBitrateVp9) { // The following setting will cause avg_frame_bandwidth in rate control to be // larger than INT_MAX cfg.rc_target_bitrate = INT_MAX; + // Framerate 0.1 (equivalent to timebase 10) is the smallest framerate allowed + // by libvpx cfg.g_timebase.den = 1; cfg.g_timebase.num = 10; - EXPECT_NO_FATAL_FAILURE(InitCodec(*iface, kWidth, kHeight, &enc.ctx, &cfg)); - EXPECT_NO_FATAL_FAILURE(EncodeWithConfig(cfg, &enc.ctx)) + EXPECT_NO_FATAL_FAILURE(InitCodec(*iface, kWidth, kHeight, &enc.ctx, &cfg)) << "target bitrate: " << 
cfg.rc_target_bitrate << " framerate: "
+      << static_cast<double>(cfg.g_timebase.den) / cfg.g_timebase.num;
 }

From 6457f065290f8114930204df33957388758c7a43 Mon Sep 17 00:00:00 2001
From: Jonathan Wright
Date: Tue, 24 Oct 2023 08:37:27 +0100
Subject: [PATCH 853/926] Add Arm SVE build flags and run-time CPU feature detection

Add 'sve' arch options to the configure, build and unit test files -
adding appropriate conditional options where necessary.

Arm SIMD extensions are treated as supersets in libvpx, so disable SVE
if either Neon DotProd or I8MM are unavailable.

Change-Id: I39dd24f2b209251084d1e28d7ac68099460309bb
---
 build/make/Makefile           |  2 ++
 build/make/rtcd.pl            |  2 +-
 configure                     |  1 +
 test/init_vpx_test.cc         |  3 +++
 vpx_ports/aarch64_cpudetect.c | 25 ++++++++++++++++++++++++-
 vpx_ports/arm.h               |  2 ++
 6 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/build/make/Makefile b/build/make/Makefile
index c2dc47ccff..199ed78058 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -148,6 +148,8 @@ $(BUILD_PFX)%_neon_dotprod.c.d: CFLAGS += -march=armv8.2-a+dotprod
 $(BUILD_PFX)%_neon_dotprod.c.o: CFLAGS += -march=armv8.2-a+dotprod
 $(BUILD_PFX)%_neon_i8mm.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm
 $(BUILD_PFX)%_neon_i8mm.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm
+$(BUILD_PFX)%_sve.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm+sve
+$(BUILD_PFX)%_sve.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm+sve

 # POWER
 $(BUILD_PFX)%_vsx.c.d: CFLAGS += -maltivec -mvsx
diff --git a/build/make/rtcd.pl b/build/make/rtcd.pl
index 1a6b93d5ae..0b9e16738e 100755
--- a/build/make/rtcd.pl
+++ b/build/make/rtcd.pl
@@ -487,7 +487,7 @@ ()
   @ALL_ARCHS = filter(qw/neon_asm neon/);
   arm;
 } elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) {
-  @ALL_ARCHS = filter(qw/neon neon_dotprod neon_i8mm/);
+  @ALL_ARCHS = filter(qw/neon neon_dotprod neon_i8mm sve/);
   @REQUIRES = filter(qw/neon/);
   &require(@REQUIRES);
   arm;
diff --git a/configure b/configure
index 2c638e5e5a..434c43792c 100755
--- a/configure
+++ b/configure
@@ -257,6 +257,7 @@ ARCH_EXT_LIST_AARCH64="
     neon
     neon_dotprod
     neon_i8mm
+    sve
 "

 ARCH_EXT_LIST_X86="
diff --git a/test/init_vpx_test.cc b/test/init_vpx_test.cc
index e88c54f323..f66f00b5c1 100644
--- a/test/init_vpx_test.cc
+++ b/test/init_vpx_test.cc
@@ -54,6 +54,9 @@ void init_vpx_test() {
   if (!(caps & HAS_NEON_I8MM)) {
     append_negative_gtest_filter(":NEON_I8MM.*:NEON_I8MM/*");
   }
+  if (!(caps & HAS_SVE)) {
+    append_negative_gtest_filter(":SVE.*:SVE/*");
+  }
 #elif VPX_ARCH_ARM
   const int caps = arm_cpu_caps();
   if (!(caps & HAS_NEON)) append_negative_gtest_filter(":NEON.*:NEON/*");
diff --git a/vpx_ports/aarch64_cpudetect.c b/vpx_ports/aarch64_cpudetect.c
index ac68f44452..f56d5888ba 100644
--- a/vpx_ports/aarch64_cpudetect.c
+++ b/vpx_ports/aarch64_cpudetect.c
@@ -77,7 +77,7 @@ static int arm_get_cpu_caps(void) {
   }
 #endif  // defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)
 #endif  // HAVE_NEON_DOTPROD
-  // No I8MM feature detection available on Windows at time of writing.
+  // No I8MM or SVE feature detection available on Windows at time of writing.
   return flags;
 }

@@ -98,6 +98,7 @@ static int arm_get_cpu_caps(void) {
 // Define hwcap values ourselves: building with an old auxv header where these
 // hwcap values are not defined should not prevent features from being enabled.
 #define VPX_AARCH64_HWCAP_ASIMDDP (1 << 20)
+#define VPX_AARCH64_HWCAP_SVE (1 << 22)
 #define VPX_AARCH64_HWCAP2_I8MM (1 << 13)

 static int arm_get_cpu_caps(void) {
@@ -117,6 +118,11 @@ static int arm_get_cpu_caps(void) {
     flags |= HAS_NEON_I8MM;
   }
 #endif  // HAVE_NEON_I8MM
+#if HAVE_SVE
+  if (hwcap & VPX_AARCH64_HWCAP_SVE) {
+    flags |= HAS_SVE;
+  }
+#endif  // HAVE_SVE
   return flags;
 }

@@ -129,6 +135,10 @@ static int arm_get_cpu_caps(void) {
 #ifndef ZX_ARM64_FEATURE_ISA_I8MM
 #define ZX_ARM64_FEATURE_ISA_I8MM ((uint32_t)(1u << 19))
 #endif
+// Added in https://fuchsia-review.googlesource.com/c/fuchsia/+/895083.
+#ifndef ZX_ARM64_FEATURE_ISA_SVE
+#define ZX_ARM64_FEATURE_ISA_SVE ((uint32_t)(1u << 20))
+#endif

 static int arm_get_cpu_caps(void) {
   int flags = 0;
@@ -150,6 +160,11 @@ static int arm_get_cpu_caps(void) {
     flags |= HAS_NEON_I8MM;
   }
 #endif  // HAVE_NEON_I8MM
+#if HAVE_SVE
+  if (features & ZX_ARM64_FEATURE_ISA_SVE) {
+    flags |= HAS_SVE;
+  }
+#endif  // HAVE_SVE
   return flags;
 }

@@ -170,5 +185,13 @@ int arm_cpu_caps(void) {
     flags &= ~HAS_NEON_I8MM;
   }

+  // Restrict flags: FEAT_SVE assumes that FEAT_{DotProd,I8MM} are available.
+  if (!(flags & HAS_NEON_DOTPROD)) {
+    flags &= ~HAS_SVE;
+  }
+  if (!(flags & HAS_NEON_I8MM)) {
+    flags &= ~HAS_SVE;
+  }
+
   return flags;
 }
diff --git a/vpx_ports/arm.h b/vpx_ports/arm.h
index 65909d8260..39365d18ee 100644
--- a/vpx_ports/arm.h
+++ b/vpx_ports/arm.h
@@ -23,6 +23,8 @@ extern "C" {
 #define HAS_NEON_DOTPROD (1 << 1)
 // Armv8.2-A optional Neon i8mm instructions, mandatory from Armv8.6-A.
 #define HAS_NEON_I8MM (1 << 2)
+// Armv8.2-A optional SVE instructions, mandatory from Armv9.0-A.
+#define HAS_SVE (1 << 3)

 int arm_cpu_caps(void);

From b759032a0ed2b57ea3412f6820eda377a2dad480 Mon Sep 17 00:00:00 2001
From: Marco Paniconi
Date: Fri, 27 Oct 2023 09:24:06 -0700
Subject: [PATCH 854/926] Clear some clang-tidy complaints on header includes

Change-Id: Id6f54dc4643172f6a5576dc4846c47c8eda31c0f
---
 vp8/encoder/encodeframe.c | 6 ++++--
 vp8/encoder/ethreading.c  | 1 +
 vp9/encoder/vp9_encoder.c | 2 ++
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 3f0319a54b..5c973940ec 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -7,6 +7,8 @@
  * in the file PATENTS. All contributing project authors may
  * be found in the AUTHORS file in the root of the source tree.
  */
+#include <limits.h>
+#include <stdio.h>

 #include "vpx_config.h"
 #include "vp8_rtcd.h"
@@ -29,9 +31,9 @@
 #include "rdopt.h"
 #include "pickinter.h"
 #include "vp8/common/findnearmv.h"
-#include <stdio.h>
-#include <limits.h>
 #include "vp8/common/invtrans.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/vpx_timer.h"
 #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
 #include "bitstream.h"
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index 9993905567..e2f8b89d46 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -7,6 +7,7 @@
  * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
+#include <stddef.h>
 #include "onyx_int.h"
 #include "vp8/common/threading.h"
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 993e6310eb..cac35a97ef 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -24,6 +24,7 @@
 #if CONFIG_INTERNAL_STATS
 #include "vpx_dsp/ssim.h"
 #endif
+#include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/system_state.h"
 #include "vpx_ports/vpx_once.h"
@@ -33,6 +34,7 @@
 #endif  // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG

 #include "vp9/common/vp9_alloccommon.h"
+#include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_filter.h"
 #include "vp9/common/vp9_idct.h"
 #if CONFIG_VP9_POSTPROC

From 61c927a4ede8b43b29091e3ee9f993eea3b3156a Mon Sep 17 00:00:00 2001
From: Xiahong Bao
Date: Sat, 28 Oct 2023 08:52:04 +0900
Subject: [PATCH 855/926] calc_pframe_target_size: fix integer overflow

The intermediate value in the target bandwidth calculation may exceed
integer bounds.

Bug: 308007926
Change-Id: I8288c5820db06a550d88bf91fccc86106996deaa
Signed-off-by: Xiahong Bao
---
 vp8/encoder/ratectrl.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index 6f14322fdc..fcd4eb04eb 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -719,7 +719,8 @@ static void calc_pframe_target_size(VP8_COMP *cpi) {
       }

       /* lower the target bandwidth for this frame. */
-      cpi->this_frame_target -= (cpi->this_frame_target * percent_low) / 200;
+      cpi->this_frame_target -=
+          (int)(((int64_t)cpi->this_frame_target * percent_low) / 200);

       /* Are we using allowing control of active_worst_allowed_q
        * according to buffer level.

From 3f3576098ffcc6cf5b44835e2fc1414c227de6cd Mon Sep 17 00:00:00 2001
From: Jonathan Wright
Date: Mon, 30 Oct 2023 15:24:12 +0000
Subject: [PATCH 856/926] Fix 'unused variable' warning when neon_i8mm is disabled

Guard hwcap2 feature interrogation on HAVE_NEON_I8MM so that it gets
disabled if neon_i8mm is disabled when configuring the build.

Bug: webm:1825
Change-Id: Ic6ff71f17387b96219591928a583d43560bb7c7a
---
 vpx_ports/aarch64_cpudetect.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vpx_ports/aarch64_cpudetect.c b/vpx_ports/aarch64_cpudetect.c
index f56d5888ba..539d09bb39 100644
--- a/vpx_ports/aarch64_cpudetect.c
+++ b/vpx_ports/aarch64_cpudetect.c
@@ -104,7 +104,9 @@ static int arm_get_cpu_caps(void) {
 static int arm_get_cpu_caps(void) {
   int flags = 0;
   unsigned long hwcap = getauxval(AT_HWCAP);
+#if HAVE_NEON_I8MM
   unsigned long hwcap2 = getauxval(AT_HWCAP2);
+#endif  // HAVE_NEON_I8MM
 #if HAVE_NEON
   flags |= HAS_NEON;  // Neon is mandatory in Armv8.0-A.
 #endif  // HAVE_NEON

From 0d3ef6ffd22bda0ba1ec1bf9c7a24852e4a1d111 Mon Sep 17 00:00:00 2001
From: Marco Paniconi
Date: Tue, 24 Oct 2023 11:03:56 -0700
Subject: [PATCH 857/926] vp9-RC: Add drop_frame support to external RC

Supports single layer and svc. For svc only the
framedrop_mode = FULL_SUPERFRAME_DROP is allowed for now.

Dropping frames due to overshoot is enabled by the
oxcf->drop_frames_water_mark, which is zero by default.

Note that this CL also allows for drop/skip encoding of enhancement
layers if that layer bitrate is zero.

max_consec_drop is also added, set to INT_MAX by default. Note that
max_consec_drop is only used for svc mode. It has not been added yet
for single layer in the libvpx encoder.

Tests added for the single layer and svc cases.
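To make the new ComputeQP() contract concrete, a caller-side sketch (hedged: the scaffolding and the rc_api/encoded_frame_size names are illustrative, not part of this patch; rc_api is assumed to come from VP9RateControlRTC::Create()):

    libvpx::VP9FrameParamsQpRTC params;
    params.frame_type = libvpx::RcFrameType::kInterFrame;
    params.spatial_layer_id = 0;
    params.temporal_layer_id = 0;
    if (rc_api->ComputeQP(params) == libvpx::FrameDropDecision::kDrop) {
      // Frame is dropped: GetQP() and PostEncodeUpdate() must not be called;
      // ComputeQP() has already done the post-encode drop bookkeeping.
    } else {
      const int qp = rc_api->GetQP();
      // ... encode the frame at `qp`, then feed the encoded size back:
      rc_api->PostEncodeUpdate(encoded_frame_size, params);
    }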
Change-Id: Ic12f6a0eb3fbf07d8eb8456c46cec27b2e1930d3 --- test/vp9_ratectrl_rtc_test.cc | 104 ++++++++++++++++++++++++++--- vp9/encoder/vp9_encoder.c | 21 +----- vp9/encoder/vp9_ratectrl.c | 4 +- vp9/encoder/vp9_ratectrl.h | 2 + vp9/encoder/vp9_svc_layercontext.c | 30 ++++++++- vp9/encoder/vp9_svc_layercontext.h | 2 + vp9/ratectrl_rtc.cc | 52 ++++++++++++++- vp9/ratectrl_rtc.h | 14 +++- vpx/internal/vpx_ratectrl_rtc.h | 2 + 9 files changed, 192 insertions(+), 39 deletions(-) diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc index b76fd3624c..ff718bbaa7 100644 --- a/test/vp9_ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -31,6 +31,7 @@ const int kTemporalId2Layer[2] = { 0, 1 }; const int kTemporalRateAllocation3Layer[3] = { 50, 70, 100 }; const int kTemporalRateAllocation2Layer[2] = { 60, 100 }; const int kSpatialLayerBitrate[3] = { 200, 400, 1000 }; +const int kSpatialLayerBitrateLow[3] = { 50, 100, 400 }; class RcInterfaceTest : public ::libvpx_test::EncoderTest, @@ -38,7 +39,7 @@ class RcInterfaceTest public: RcInterfaceTest() : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000), - encoder_exit_(false) {} + encoder_exit_(false), frame_drop_thresh_(0), num_drops_(0) {} ~RcInterfaceTest() override = default; @@ -76,9 +77,12 @@ class RcInterfaceTest int loopfilter_level, qp; encoder->Control(VP9E_GET_LOOPFILTER_LEVEL, &loopfilter_level); encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp); - rc_api_->ComputeQP(frame_params_); - ASSERT_EQ(rc_api_->GetQP(), qp); - ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level); + if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kOk) { + ASSERT_EQ(rc_api_->GetQP(), qp); + ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level); + } else { + num_drops_++; + } } void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { @@ -97,6 +101,29 @@ class RcInterfaceTest ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } + void RunOneLayerDropFramesCBR() { + if (GET_PARAM(2) != VPX_CBR) { + GTEST_SKIP() << "Frame dropping is only for CBR mode."; + } + frame_drop_thresh_ = 30; + SetConfig(GET_PARAM(2)); + // Use lower bitrate, lower max-q, and enable frame dropper. + rc_cfg_.target_bandwidth = 200; + cfg_.rc_target_bitrate = 200; + rc_cfg_.max_quantizer = 50; + cfg_.rc_max_quantizer = 50; + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + frame_params_.spatial_layer_id = 0; + frame_params_.temporal_layer_id = 0; + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Check that some frames were dropped, otherwise test has no value. + ASSERT_GE(num_drops_, 1); + } + void RunOneLayerVBRPeriodicKey() { if (GET_PARAM(2) != VPX_VBR) return; key_interval_ = 100; @@ -134,6 +161,7 @@ class RcInterfaceTest rc_cfg_.min_quantizers[0] = 2; rc_cfg_.rc_mode = rc_mode; rc_cfg_.aq_mode = aq_mode_; + rc_cfg_.frame_drop_thresh = frame_drop_thresh_; // Encoder settings for ground truth. 
cfg_.g_w = 1280; @@ -152,6 +180,7 @@ class RcInterfaceTest cfg_.rc_target_bitrate = 1000; cfg_.kf_min_dist = key_interval_; cfg_.kf_max_dist = key_interval_; + cfg_.rc_dropframe_thresh = frame_drop_thresh_; } std::unique_ptr rc_api_; @@ -160,6 +189,8 @@ class RcInterfaceTest int key_interval_; libvpx::VP9FrameParamsQpRTC frame_params_; bool encoder_exit_; + int frame_drop_thresh_; + int num_drops_; }; class RcInterfaceSvcTest @@ -169,7 +200,8 @@ class RcInterfaceSvcTest RcInterfaceSvcTest() : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000), dynamic_spatial_layers_(0), inter_layer_pred_off_(GET_PARAM(2)), - parallel_spatial_layers_(false) {} + parallel_spatial_layers_(false), frame_drop_thresh_(0), + max_consec_drop_(INT_MAX), num_drops_(0) {} ~RcInterfaceSvcTest() override = default; protected: @@ -181,6 +213,7 @@ class RcInterfaceSvcTest void PreEncodeFrameHook(libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { + current_superframe_ = 0; encoder->Control(VP8E_SET_CPUUSED, 7); encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); encoder->Control(VP9E_SET_TUNE_CONTENT, 0); @@ -192,12 +225,19 @@ class RcInterfaceSvcTest encoder->Control(VP9E_SET_SVC_INTER_LAYER_PRED, INTER_LAYER_PRED_OFF_NONKEY); } + if (frame_drop_thresh_ > 0) { + vpx_svc_frame_drop_t svc_drop_frame; + svc_drop_frame.framedrop_mode = FULL_SUPERFRAME_DROP; + for (int sl = 0; sl < rc_cfg_.ss_number_layers; ++sl) + svc_drop_frame.framedrop_thresh[sl] = frame_drop_thresh_; + svc_drop_frame.max_consec_drop = max_consec_drop_; + encoder->Control(VP9E_SET_SVC_FRAME_DROP_LAYER, &svc_drop_frame); + } } frame_params_.frame_type = video->frame() % key_interval_ == 0 ? libvpx::RcFrameType::kKeyFrame : libvpx::RcFrameType::kInterFrame; encoder_exit_ = video->frame() == kNumFrames; - current_superframe_ = video->frame(); if (dynamic_spatial_layers_ == 1) { if (video->frame() == 100) { // Go down to 2 spatial layers: set top SL to 0 bitrate. @@ -257,24 +297,38 @@ class RcInterfaceSvcTest } void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { + if (encoder_exit_) { + return; + } + int superframe_is_dropped = false; ::libvpx_test::CxDataIterator iter = encoder->GetCxData(); for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) sizes_[sl] = 0; std::vector rc_qp; + // For FULL_SUPERFRAME_DROP: the full superframe drop decision is + // determined on the base spatial layer. + SetFrameParamsSvc(0); + if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kDrop) { + superframe_is_dropped = true; + num_drops_++; + } while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { + ASSERT_EQ(superframe_is_dropped, false); ParseSuperframeSizes(static_cast(pkt->data.frame.buf), pkt->data.frame.sz); if (!parallel_spatial_layers_ || current_superframe_ == 0) { for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { if (sizes_[sl] > 0) { SetFrameParamsSvc(sl); - rc_api_->ComputeQP(frame_params_); + // For sl=0 ComputeQP() is already called above (line 310). + if (sl > 0) rc_api_->ComputeQP(frame_params_); rc_api_->PostEncodeUpdate(sizes_[sl], frame_params_); rc_qp.push_back(rc_api_->GetQP()); } } } else { for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { - if (sizes_[sl] > 0) { + // For sl=0 ComputeQP() is already called above (line 310). 
+ if (sizes_[sl] > 0 && sl > 0) { SetFrameParamsSvc(sl); rc_api_->ComputeQP(frame_params_); } @@ -288,7 +342,7 @@ class RcInterfaceSvcTest } } } - if (!encoder_exit_) { + if (!superframe_is_dropped) { int loopfilter_level; std::vector encoder_qp(VPX_SS_MAX_LAYERS, 0); encoder->Control(VP9E_GET_LOOPFILTER_LEVEL, &loopfilter_level); @@ -296,6 +350,7 @@ class RcInterfaceSvcTest encoder_qp.resize(rc_qp.size()); ASSERT_EQ(rc_qp, encoder_qp); ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level); + current_superframe_++; } } // This method needs to be overridden because non-reference frames are @@ -315,6 +370,21 @@ class RcInterfaceSvcTest ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } + void RunSvcDropFramesCBR() { + max_consec_drop_ = 10; + frame_drop_thresh_ = 30; + SetRCConfigSvc(3, 3); + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + SetEncoderConfigSvc(3, 3); + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Check that some frames were dropped, otherwise test has no value. + ASSERT_GE(num_drops_, 1); + } + void RunSvcPeriodicKey() { SetRCConfigSvc(3, 3); key_interval_ = 100; @@ -438,12 +508,14 @@ class RcInterfaceSvcTest cfg_.kf_max_dist = 9999; cfg_.rc_overshoot_pct = 50; cfg_.rc_undershoot_pct = 50; + cfg_.rc_dropframe_thresh = frame_drop_thresh_; cfg_.rc_target_bitrate = 0; for (int sl = 0; sl < number_spatial_layers; sl++) { int spatial_bitrate = 0; if (number_spatial_layers <= 3) - spatial_bitrate = kSpatialLayerBitrate[sl]; + spatial_bitrate = frame_drop_thresh_ > 0 ? kSpatialLayerBitrateLow[sl] + : kSpatialLayerBitrate[sl]; for (int tl = 0; tl < number_temporal_layers; tl++) { int layer = sl * number_temporal_layers + tl; if (number_temporal_layers == 3) @@ -478,6 +550,8 @@ class RcInterfaceSvcTest rc_cfg_.framerate = 30.0; rc_cfg_.rc_mode = VPX_CBR; rc_cfg_.aq_mode = aq_mode_; + rc_cfg_.frame_drop_thresh = frame_drop_thresh_; + rc_cfg_.max_consec_drop = max_consec_drop_; if (number_spatial_layers == 3) { rc_cfg_.scaling_factor_num[0] = 1; @@ -511,7 +585,8 @@ class RcInterfaceSvcTest for (int sl = 0; sl < number_spatial_layers; sl++) { int spatial_bitrate = 0; if (number_spatial_layers <= 3) - spatial_bitrate = kSpatialLayerBitrate[sl]; + spatial_bitrate = frame_drop_thresh_ > 0 ? kSpatialLayerBitrateLow[sl] + : kSpatialLayerBitrate[sl]; for (int tl = 0; tl < number_temporal_layers; tl++) { int layer = sl * number_temporal_layers + tl; if (number_temporal_layers == 3) @@ -548,14 +623,21 @@ class RcInterfaceSvcTest bool inter_layer_pred_off_; // ComputeQP() and PostEncodeUpdate() don't need to be sequential for KSVC. 
bool parallel_spatial_layers_; + int frame_drop_thresh_; + int max_consec_drop_; + int num_drops_; }; TEST_P(RcInterfaceTest, OneLayer) { RunOneLayer(); } +TEST_P(RcInterfaceTest, OneLayerDropFramesCBR) { RunOneLayerDropFramesCBR(); } + TEST_P(RcInterfaceTest, OneLayerVBRPeriodicKey) { RunOneLayerVBRPeriodicKey(); } TEST_P(RcInterfaceSvcTest, Svc) { RunSvc(); } +TEST_P(RcInterfaceSvcTest, SvcDropFramesCBR) { RunSvcDropFramesCBR(); } + TEST_P(RcInterfaceSvcTest, SvcParallelSpatialLayers) { RunSvcParallelSpatialLayers(); } diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index cac35a97ef..e1a4d986b7 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -5517,26 +5517,7 @@ static void encode_frame_to_data_rate( struct segmentation *const seg = &cm->seg; TX_SIZE t; - // SVC: skip encoding of enhancement layer if the layer target bandwidth = 0. - // No need to set svc.skip_enhancement_layer if whole superframe will be - // dropped. - if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 && - cpi->oxcf.target_bandwidth == 0 && - !(cpi->svc.framedrop_mode != LAYER_DROP && - (cpi->svc.framedrop_mode != CONSTRAINED_FROM_ABOVE_DROP || - cpi->svc - .force_drop_constrained_from_above[cpi->svc.number_spatial_layers - - 1]) && - cpi->svc.drop_spatial_layer[0])) { - cpi->svc.skip_enhancement_layer = 1; - vp9_rc_postencode_update_drop_frame(cpi); - cpi->ext_refresh_frame_flags_pending = 0; - cpi->last_frame_dropped = 1; - cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 1; - cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = 1; - vp9_inc_frame_in_layer(cpi); - return; - } + if (vp9_svc_check_skip_enhancement_layer(cpi)) return; set_ext_overrides(cpi); vpx_clear_system_state(); diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 7f4761dfab..e02b2892ac 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -260,7 +260,7 @@ void vp9_update_buffer_level_preencode(VP9_COMP *cpi) { // for the layered rate control which involves cumulative buffer levels for // the temporal layers. Allow for using the timestamp(pts) delta for the // framerate when the set_ref_frame_config is used. 
-static void update_buffer_level_svc_preencode(VP9_COMP *cpi) { +void vp9_update_buffer_level_svc_preencode(VP9_COMP *cpi) { SVC *const svc = &cpi->svc; int i; // Set this to 1 to use timestamp delta for "framerate" under @@ -2445,7 +2445,7 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { vp9_cyclic_refresh_update_parameters(cpi); vp9_rc_set_frame_target(cpi, target); - if (cm->show_frame) update_buffer_level_svc_preencode(cpi); + if (cm->show_frame) vp9_update_buffer_level_svc_preencode(cpi); if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && svc->single_layer_svc == 1 && svc->spatial_layer_id == svc->first_spatial_layer_to_encode && diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index 96a8fd3f1d..48c49e937e 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -350,6 +350,8 @@ void vp9_estimate_qp_gop(struct VP9_COMP *cpi); void vp9_compute_frame_low_motion(struct VP9_COMP *const cpi); +void vp9_update_buffer_level_svc_preencode(struct VP9_COMP *cpi); + #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 0df34bf459..fff6d25de0 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -223,11 +223,11 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; } lrc->starting_buffer_level = - (int64_t)(rc->starting_buffer_level * bitrate_alloc); + (int64_t)(rc->starting_buffer_level * bitrate_alloc + 0.5); lrc->optimal_buffer_level = - (int64_t)(rc->optimal_buffer_level * bitrate_alloc); + (int64_t)(rc->optimal_buffer_level * bitrate_alloc + 0.5); lrc->maximum_buffer_size = - (int64_t)(rc->maximum_buffer_size * bitrate_alloc); + (int64_t)(rc->maximum_buffer_size * bitrate_alloc + 0.5); lrc->bits_off_target = VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size); lrc->buffer_level = VPXMIN(lrc->buffer_level, lrc->maximum_buffer_size); @@ -1350,3 +1350,27 @@ void vp9_svc_adjust_avg_frame_qindex(VP9_COMP *const cpi) { } } } + +// SVC: skip encoding of enhancement layer if the layer target bandwidth = 0. +// No need to set svc.skip_enhancement_layer if whole superframe will be +// dropped. 
+int vp9_svc_check_skip_enhancement_layer(VP9_COMP *const cpi) { + if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 && + cpi->oxcf.target_bandwidth == 0 && + !(cpi->svc.framedrop_mode != LAYER_DROP && + (cpi->svc.framedrop_mode != CONSTRAINED_FROM_ABOVE_DROP || + cpi->svc + .force_drop_constrained_from_above[cpi->svc.number_spatial_layers - + 1]) && + cpi->svc.drop_spatial_layer[0])) { + cpi->svc.skip_enhancement_layer = 1; + vp9_rc_postencode_update_drop_frame(cpi); + cpi->ext_refresh_frame_flags_pending = 0; + cpi->last_frame_dropped = 1; + cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 1; + cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = 1; + vp9_inc_frame_in_layer(cpi); + return 1; + } + return 0; +} diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h index 90dec5e20a..388a02789d 100644 --- a/vp9/encoder/vp9_svc_layercontext.h +++ b/vp9/encoder/vp9_svc_layercontext.h @@ -281,6 +281,8 @@ void vp9_svc_update_ref_frame(struct VP9_COMP *const cpi); void vp9_svc_adjust_frame_rate(struct VP9_COMP *const cpi); void vp9_svc_adjust_avg_frame_qindex(struct VP9_COMP *const cpi); + +int vp9_svc_check_skip_enhancement_layer(struct VP9_COMP *const cpi); #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index d92b095714..d8239718a8 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -130,6 +130,7 @@ bool VP9RateControlRTC::UpdateRateControl( oxcf->maximum_buffer_size_ms = rc_cfg.buf_sz; oxcf->under_shoot_pct = rc_cfg.undershoot_pct; oxcf->over_shoot_pct = rc_cfg.overshoot_pct; + oxcf->drop_frames_water_mark = rc_cfg.frame_drop_thresh; oxcf->ss_number_layers = rc_cfg.ss_number_layers; oxcf->ts_number_layers = rc_cfg.ts_number_layers; oxcf->temporal_layering_mode = (VP9E_TEMPORAL_LAYERING_MODE)( @@ -172,9 +173,15 @@ bool VP9RateControlRTC::UpdateRateControl( vp9_new_framerate(cpi_, cpi_->framerate); if (cpi_->svc.number_temporal_layers > 1 || cpi_->svc.number_spatial_layers > 1) { - if (cm->current_video_frame == 0) vp9_init_layer_context(cpi_); + if (cm->current_video_frame == 0) { + vp9_init_layer_context(cpi_); + // svc->framedrop_mode is not currently exposed, so only allow for + // full superframe drop for now. + cpi_->svc.framedrop_mode = FULL_SUPERFRAME_DROP; + } vp9_update_layer_context_change_config(cpi_, (int)cpi_->oxcf.target_bandwidth); + cpi_->svc.max_consec_drop = rc_cfg.max_consec_drop; } vp9_check_reset_rc_flag(cpi_); @@ -182,7 +189,11 @@ bool VP9RateControlRTC::UpdateRateControl( return true; } -void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { +// Compute the QP for the frame. If the frame is dropped this function +// returns kDrop, and no QP is computed. If the frame is encoded (not dropped) +// the QP is computed and kOk is returned. +FrameDropDecision VP9RateControlRTC::ComputeQP( + const VP9FrameParamsQpRTC &frame_params) { VP9_COMMON *const cm = &cpi_->common; int width, height; cpi_->svc.spatial_layer_id = frame_params.spatial_layer_id; @@ -234,6 +245,36 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { vp9_restore_layer_context(cpi_); vp9_rc_get_svc_params(cpi_); } + if (cpi_->svc.spatial_layer_id == 0) vp9_zero(cpi_->svc.drop_spatial_layer); + // SVC: check for skip encoding of enhancement layer if the + // layer target bandwidth = 0. + if (vp9_svc_check_skip_enhancement_layer(cpi_)) + return FrameDropDecision::kDrop; + // Check for dropping this frame based on buffer level. 
+  // Never drop on key frame, or if base layer is key for svc,
+  if (!frame_is_intra_only(cm) &&
+      (!cpi_->use_svc ||
+       !cpi_->svc.layer_context[cpi_->svc.temporal_layer_id].is_key_frame)) {
+    if (vp9_rc_drop_frame(cpi_)) {
+      // For FULL_SUPERFRAME_DROP mode (the only mode considered here):
+      // if the superframe drop is decided we need to save the layer context for
+      // all spatial layers, and call update_buffer_level and postencode_drop
+      // for all spatial layers.
+      if (cpi_->svc.number_spatial_layers > 1 ||
+          cpi_->svc.number_temporal_layers > 1) {
+        vp9_save_layer_context(cpi_);
+        for (int sl = 1; sl < cpi_->svc.number_spatial_layers; sl++) {
+          cpi_->svc.spatial_layer_id = sl;
+          vp9_restore_layer_context(cpi_);
+          vp9_update_buffer_level_svc_preencode(cpi_);
+          vp9_rc_postencode_update_drop_frame(cpi_);
+          vp9_save_layer_context(cpi_);
+        }
+      }
+      return FrameDropDecision::kDrop;
+    }
+  }
+  // Compute the QP for the frame.
   int bottom_index, top_index;
   cpi_->common.base_qindex =
       vp9_rc_pick_q_and_bounds(cpi_, &bottom_index, &top_index);
@@ -242,6 +283,13 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) {
   if (cpi_->svc.number_spatial_layers > 1 ||
       cpi_->svc.number_temporal_layers > 1)
     vp9_save_layer_context(cpi_);
+
+  cpi_->last_frame_dropped = 0;
+  cpi_->svc.last_layer_dropped[cpi_->svc.spatial_layer_id] = 0;
+  if (cpi_->svc.spatial_layer_id == cpi_->svc.number_spatial_layers - 1)
+    cpi_->svc.num_encoded_top_layer++;
+
+  return FrameDropDecision::kOk;
 }

 int VP9RateControlRTC::GetQP() const { return cpi_->common.base_qindex; }
diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h
index d3876de875..a8dd5c42ff 100644
--- a/vp9/ratectrl_rtc.h
+++ b/vp9/ratectrl_rtc.h
@@ -38,6 +38,7 @@ struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig {
     scaling_factor_den[0] = 1;
     max_quantizers[0] = max_quantizer;
     min_quantizers[0] = min_quantizer;
+    max_consec_drop = INT_MAX;
   }

   // Number of spatial layers
@@ -46,6 +47,8 @@ struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig {
   int min_quantizers[VPX_MAX_LAYERS];
   int scaling_factor_num[VPX_SS_MAX_LAYERS];
   int scaling_factor_den[VPX_SS_MAX_LAYERS];
+  // This is only for SVC for now.
+  int max_consec_drop;
 };

 struct VP9FrameParamsQpRTC {
@@ -61,6 +64,11 @@ struct VP9SegmentationData {
   size_t delta_q_size;
 };

+enum class FrameDropDecision {
+  kOk,    // Frame is encoded.
+  kDrop,  // Frame is dropped.
+};
+
 // This interface allows using VP9 real-time rate control without initializing
 // the encoder. To use this interface, you need to link with libvpxrc.a.
 //
@@ -92,7 +100,11 @@ class VP9RateControlRTC {
   int GetQP() const;
   int GetLoopfilterLevel() const;
   bool GetSegmentationData(VP9SegmentationData *segmentation_data) const;
-  void ComputeQP(const VP9FrameParamsQpRTC &frame_params);
+  // ComputeQP returns the QP if the frame is not dropped (kOk return);
+  // otherwise it returns kDrop, and subsequent GetQP and PostEncodeUpdate
+  // are not to be called (vp9_rc_postencode_update_drop_frame is already
+  // called via ComputeQP if drop is decided).
+ FrameDropDecision ComputeQP(const VP9FrameParamsQpRTC &frame_params); // Feedback to rate control with the size of current encoded frame void PostEncodeUpdate(uint64_t encoded_frame_size, const VP9FrameParamsQpRTC &frame_params); diff --git a/vpx/internal/vpx_ratectrl_rtc.h b/vpx/internal/vpx_ratectrl_rtc.h index 33c57e219b..eb90cd1d0c 100644 --- a/vpx/internal/vpx_ratectrl_rtc.h +++ b/vpx/internal/vpx_ratectrl_rtc.h @@ -37,6 +37,7 @@ struct VpxRateControlRtcConfig { aq_mode = 0; layer_target_bitrate[0] = static_cast(target_bandwidth); ts_rate_decimator[0] = 1; + frame_drop_thresh = 0; } int width; @@ -60,6 +61,7 @@ struct VpxRateControlRtcConfig { // vbr, cbr enum vpx_rc_mode rc_mode; int aq_mode; + int frame_drop_thresh; }; } // namespace libvpx #endif // VPX_VPX_INTERNAL_VPX_RATECTRL_RTC_H_ From 1464d7738a49b2b6e2c8e8e9f03b565f4f6c0860 Mon Sep 17 00:00:00 2001 From: Anupam Pandey Date: Thu, 5 Oct 2023 11:19:19 +0530 Subject: [PATCH 858/926] Modify C vs SIMD test script - Enable C vs SIMD test for x86 32-bit platform - Correct a print message in run_tests() BUG=webm:1800 Change-Id: Ib1ccd3a87a64b5ec6cde524a14d5d1b7e200abfb --- test/tools_common.sh | 7 ++++++- test/vp9_c_vs_simd_encode.sh | 16 +++++++--------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/test/tools_common.sh b/test/tools_common.sh index 0e4a0a5c0e..d0dd24df36 100755 --- a/test/tools_common.sh +++ b/test/tools_common.sh @@ -280,7 +280,12 @@ run_tests() { test_end "${test}" done - local tested_config="$(test_configuration_target) @ $(current_hash)" + # C vs SIMD tests are run for x86 32-bit, 64-bit and ARM platform + if [ "${test_name}" = "vp9_c_vs_simd_encode" ]; then + local tested_config="$(current_hash)" + else + local tested_config="$(test_configuration_target) @ $(current_hash)" + fi echo "${test_name}: Done, all tests pass for ${tested_config}." } diff --git a/test/vp9_c_vs_simd_encode.sh b/test/vp9_c_vs_simd_encode.sh index 42e5f3b589..7cd60543cc 100755 --- a/test/vp9_c_vs_simd_encode.sh +++ b/test/vp9_c_vs_simd_encode.sh @@ -378,16 +378,14 @@ vp9_c_vs_simd_enc_test() { # Test Generic vp9_test_generic - # TODO(webm:1816): Enable x86 test once issue 1816 is fixed. - # Details: https://bugs.chromium.org/p/webm/issues/detail?id=1816 # Test x86 (32 bit) - # echo "vp9 test for x86 (32 bit): Started." - # if ! vp9_test_x86 "x86"; then - # echo "vp9 test for x86 (32 bit): Done, test failed." - # return 1 - # else - # echo "vp9 test for x86 (32 bit): Done, all tests passed." - # fi + echo "vp9 test for x86 (32 bit): Started." + if ! vp9_test_x86 "x86"; then + echo "vp9 test for x86 (32 bit): Done, test failed." + return 1 + else + echo "vp9 test for x86 (32 bit): Done, all tests passed." + fi # Test x86_64 (64 bit) if [ "$(eval uname -m)" = "x86_64" ]; then From 879c9bd9066527770d2999831501d9aeda0b79ac Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 3 Nov 2023 15:22:04 -0400 Subject: [PATCH 859/926] Check fragments count before use Bug: webm:1827 Change-Id: I8d603d5db92476222cbee1c61fece957ad03a49f --- vp8/vp8_dx_iface.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index 9e622e3b97..d4e06a7bce 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -249,14 +249,14 @@ static int update_fragments(vpx_codec_alg_priv_t *ctx, const uint8_t *data, /* Store a pointer to this fragment and return. We haven't * received the complete frame yet, so we will wait with decoding. 
*/ - ctx->fragments.ptrs[ctx->fragments.count] = data; - ctx->fragments.sizes[ctx->fragments.count] = data_sz; - ctx->fragments.count++; - if (ctx->fragments.count > (1 << EIGHT_PARTITION) + 1) { + if (ctx->fragments.count >= MAX_PARTITIONS) { ctx->fragments.count = 0; *res = VPX_CODEC_INVALID_PARAM; return -1; } + ctx->fragments.ptrs[ctx->fragments.count] = data; + ctx->fragments.sizes[ctx->fragments.count] = data_sz; + ctx->fragments.count++; return 0; } From 5b8d24f678560edb545beeffee7668761ad5fa7e Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 6 Nov 2023 10:21:56 -0800 Subject: [PATCH 860/926] configure: detect PIE and enable PIC Fixes the creation of DT_TEXTREL entries in binaries built with PIE enabled: /usr/bin/ld: warning: creating DT_TEXTREL in a PIE This matches the changes made in libaom: 1df26009da aom_configure: only override CONFIG_PIC if not set on cmd line 7235e65746 aom_configure.cmake: detect PIE and set CONFIG_PIC Change-Id: I0a43e964af2d8eb8c5e7811ce14ad39285eec3a8 --- build/make/configure.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/build/make/configure.sh b/build/make/configure.sh index 9d3cd80cb3..54fb1daf4d 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -863,8 +863,14 @@ process_common_toolchain() { ;; esac - # PIC is probably what we want when building shared libs + # Position independent code (PIC) is probably what we want when building + # shared libs or position independent executable (PIE) targets. enabled shared && soft_enable pic + check_cpp << EOF || soft_enable pic +#if !(__pie__ || __PIE__) +#error Neither __pie__ or __PIE__ are set +#endif +EOF # Minimum iOS version for all target platforms (darwin and iphonesimulator). # Shared library framework builds are only possible on iOS 8 and later. From c732fa70705a1563d1e92462d0157501efc85718 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Tue, 7 Nov 2023 15:47:43 -0800 Subject: [PATCH 861/926] Use symbolic constant VPX_CBR instead of 1 Change-Id: Idae94cfc6d7a882691deeb4fa3ce0015f80ed937 --- vpxenc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vpxenc.c b/vpxenc.c index 38d69a1923..d20bd3f967 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -883,7 +883,7 @@ static struct stream_state *new_stream(struct VpxEncoderConfig *global, /* Default lag_in_frames is 0 in realtime mode CBR mode*/ if (global->deadline == VPX_DL_REALTIME && - stream->config.cfg.rc_end_usage == 1) + stream->config.cfg.rc_end_usage == VPX_CBR) stream->config.cfg.g_lag_in_frames = 0; } From 7ab673a9f62a26035155bc4e26fc375fe483bb95 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Wed, 8 Nov 2023 15:16:13 -0800 Subject: [PATCH 862/926] Fix float-cast-overflow in vp8_change_config() Bug: b:309716574 Change-Id: I9c523d5e9211f895c7497a9e3674b55f6be6c742 --- test/encode_api_test.cc | 56 +++++++++++++++++++++++++++++++++++++++++ vp8/encoder/onyx_if.c | 8 +++--- 2 files changed, 60 insertions(+), 4 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 270be3679e..012e54a33d 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -122,6 +122,62 @@ TEST(EncodeAPI, ImageSizeSetting) { vpx_codec_destroy(&enc); } + +// Verifies the fix for a float-cast-overflow in vp8_change_config(). +// +// Causes cpi->framerate to become the largest possible value (10,000,000) in +// VP8 by setting cfg.g_timebase to 1/10000000 and passing a duration of 1 to +// vpx_codec_encode(). 
+TEST(EncodeAPI, HugeFramerateVp8) { + vpx_codec_iface_t *const iface = vpx_codec_vp8_cx(); + vpx_codec_enc_cfg_t cfg; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + cfg.g_w = 271; + cfg.g_h = 1080; + cfg.g_timebase.num = 1; + // Largest value (VP8's TICKS_PER_SEC) such that frame duration is nonzero (1 + // tick). + cfg.g_timebase.den = 10000000; + cfg.g_pass = VPX_RC_ONE_PASS; + cfg.g_lag_in_frames = 0; + cfg.rc_end_usage = VPX_CBR; + + vpx_codec_ctx_t enc; + // Before we encode the first frame, cpi->framerate is set to a guess (the + // reciprocal of cfg.g_timebase). If this guess doesn't seem reasonable + // (> 180), cpi->framerate is set to 30. + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, -12), VPX_CODEC_OK); + + vpx_image_t *const image = + vpx_img_alloc(nullptr, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h, 1); + ASSERT_NE(image, nullptr); + + for (unsigned int i = 0; i < image->d_h; ++i) { + memset(image->planes[0] + i * image->stride[0], 128, image->d_w); + } + const unsigned int uv_h = (image->d_h + 1) / 2; + const unsigned int uv_w = (image->d_w + 1) / 2; + for (unsigned int i = 0; i < uv_h; ++i) { + memset(image->planes[1] + i * image->stride[1], 128, uv_w); + memset(image->planes[2] + i * image->stride[2], 128, uv_w); + } + + // Encode a frame. + const unsigned long deadline = VPX_DL_REALTIME; + // Up to this point cpi->framerate is 30. Now pass a duration of only 1. This + // causes cpi->framerate to become 10,000,000. + ASSERT_EQ(vpx_codec_encode(&enc, image, 0, 1, 0, deadline), VPX_CODEC_OK); + + // Change to the same config. Since cpi->framerate is now huge, when it is + // used to calculate raw_target_rate (bit rate of uncompressed frames), the + // result is likely to overflow an unsigned int. 
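+  // (Concretely: 271 * 1080 * 8 * 3 * 10,000,000 / 1000 is roughly 7.0e10,
+  // while UINT_MAX is about 4.3e9, so the old cast of this double to
+  // unsigned int was undefined behavior.)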
+  ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK);
+
+  vpx_img_free(image);
+  ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+}
 #endif
 
 // Set up 2 spatial streams with 2 temporal layers per stream, and generate
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 12a99584f9..79140864e7 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -1416,7 +1416,7 @@ void vp8_change_config(VP8_COMP *cpi, const VP8_CONFIG *oxcf) {
   VP8_COMMON *cm = &cpi->common;
   int last_w, last_h;
   unsigned int prev_number_of_layers;
-  unsigned int raw_target_rate;
+  double raw_target_rate;
 
   if (!cpi) return;
 
@@ -1557,10 +1557,10 @@ void vp8_change_config(VP8_COMP *cpi, const VP8_CONFIG *oxcf) {
     cpi->oxcf.maximum_buffer_size_in_ms = 240000;
   }
 
-  raw_target_rate = (unsigned int)((int64_t)cpi->oxcf.Width * cpi->oxcf.Height *
-                                   8 * 3 * cpi->framerate / 1000);
+  raw_target_rate = ((int64_t)cpi->oxcf.Width * cpi->oxcf.Height * 8 * 3 *
+                     cpi->framerate / 1000.0);
   if (cpi->oxcf.target_bandwidth > raw_target_rate)
-    cpi->oxcf.target_bandwidth = raw_target_rate;
+    cpi->oxcf.target_bandwidth = (unsigned int)raw_target_rate;
 
   /* Convert target bandwidth from Kbit/s to Bit/s */
   cpi->oxcf.target_bandwidth *= 1000;

From 4e05c38c85fd3f72e167bbf8bb82816bc45393a6 Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang
Date: Thu, 9 Nov 2023 10:19:47 -0800
Subject: [PATCH 863/926] Document the units of VP8 target_bandwidth/bitrate

Change-Id: I6298a0acb4ef546ae198bb1f16dea50ed34b2dae
---
 vp8/common/onyx.h      | 10 +++++++++-
 vp8/encoder/onyx_int.h |  4 ++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h
index 7f7f567c6a..96cd2fe59e 100644
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h
@@ -83,7 +83,14 @@ typedef struct {
   int Width;
   int Height;
   struct vpx_rational timebase;
-  unsigned int target_bandwidth; /* kilobits per second */
+  /* In either kilobits per second or bits per second, depending on which
+   * copy of oxcf this is in.
+   * - ctx->oxcf.target_bandwidth is in kilobits per second. See
+   *   set_vp8e_config().
+   * - ctx->cpi->oxcf.target_bandwidth is in bits per second. See
+   *   vp8_change_config().
+   */
+  unsigned int target_bandwidth;
 
   /* Parameter used for applying denoiser.
    * For temporal denoiser: noise_sensitivity = 0 means off,
@@ -214,6 +221,7 @@ typedef struct {
 
   /* Temporal scaling parameters */
   unsigned int number_of_layers;
+  /* kilobits per second */
   unsigned int target_bitrate[VPX_TS_MAX_PERIODICITY];
   unsigned int rate_decimator[VPX_TS_MAX_PERIODICITY];
   unsigned int periodicity;
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 2f06702a1d..cdf94f4f23 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -215,7 +215,7 @@ enum {
 typedef struct {
   /* Layer configuration */
   double framerate;
-  int target_bandwidth;
+  int target_bandwidth; /* bits per second */
 
   /* Layer specific coding parameters */
   int64_t starting_buffer_level;
@@ -438,7 +438,7 @@ typedef struct VP8_COMP {
   int kf_boost;
   int last_boost;
 
-  int target_bandwidth;
+  int target_bandwidth; /* bits per second */
   struct vpx_codec_pkt_list *output_pkt_list;
 
 #if 0

From 296784c83afd2eacce61f6dc94a003b383c90d01 Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang
Date: Thu, 9 Nov 2023 12:42:58 -0800
Subject: [PATCH 864/926] Declare oxcf arg of vp8_create_compressor as const

Declare the oxcf parameters of vp8_create_compressor() and
init_config() as const. This helps code analysis.
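
For illustration, a hypothetical caller (not part of this change) that the
const-qualified prototype now supports, with the failure mode it prevents
noted in the comment:

    /* Read-only settings can be passed directly, and any accidental store
     * through oxcf inside vp8_create_compressor() or init_config() is now
     * a compile-time error rather than a silent mutation of the caller's
     * struct. */
    const VP8_CONFIG user_cfg = { 0 }; /* fields filled in by the caller */
    struct VP8_COMP *cpi = vp8_create_compressor(&user_cfg);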
Change-Id: I344ef3e6afc3adced2b2865b7e0057c6d4b1d3c0 --- vp8/common/onyx.h | 2 +- vp8/encoder/onyx_if.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h index 7f7f567c6a..c363c1c81b 100644 --- a/vp8/common/onyx.h +++ b/vp8/common/onyx.h @@ -236,7 +236,7 @@ typedef struct { void vp8_initialize(); -struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf); +struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf); void vp8_remove_compressor(struct VP8_COMP **comp); void vp8_init_config(struct VP8_COMP *onyx, VP8_CONFIG *oxcf); diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 12a99584f9..ed243ef951 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -1290,7 +1290,7 @@ void vp8_new_framerate(VP8_COMP *cpi, double framerate) { } } -static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { +static void init_config(VP8_COMP *cpi, const VP8_CONFIG *oxcf) { VP8_COMMON *cm = &cpi->common; cpi->oxcf = *oxcf; @@ -1739,7 +1739,7 @@ static void cal_mvsadcosts(int *mvsadcost[2]) { } while (++i <= mvfp_max); } -struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { +struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf) { int i; VP8_COMP *cpi; From f05122d35cf9b11d309c412157e0f250426f6de4 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 9 Nov 2023 18:35:51 -0800 Subject: [PATCH 865/926] Fix ClangTidy warnings Most are related to include-what-you-use. One is to avoid using the unsigned long type explicitly (by passing VPX_DL_REALTIME directly to vpx_codec_encode). Change-Id: Ieaf3418382ad8516cb4b172f7678893286fcb8cf --- test/encode_api_test.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 012e54a33d..770052c859 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -21,6 +21,9 @@ #include "./vpx_config.h" #include "vpx/vp8cx.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_encoder.h" +#include "vpx/vpx_image.h" #include "vpx/vpx_tpl.h" namespace { @@ -165,10 +168,10 @@ TEST(EncodeAPI, HugeFramerateVp8) { } // Encode a frame. - const unsigned long deadline = VPX_DL_REALTIME; // Up to this point cpi->framerate is 30. Now pass a duration of only 1. This // causes cpi->framerate to become 10,000,000. - ASSERT_EQ(vpx_codec_encode(&enc, image, 0, 1, 0, deadline), VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_encode(&enc, image, 0, 1, 0, VPX_DL_REALTIME), + VPX_CODEC_OK); // Change to the same config. Since cpi->framerate is now huge, when it is // used to calculate raw_target_rate (bit rate of uncompressed frames), the From d15a1970c153fe85761ccb88441832ba856aec1e Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Wed, 8 Nov 2023 21:05:15 -0800 Subject: [PATCH 866/926] Delete -Wdeclaration-after-statement Older versions of MSVC do not allow declarations after statements in C files. We don't need to support those versions of MSVC now. Use -std=gnu99 instead of -std=gnu89. Change-Id: I76ba962f5a2bca30d6a5b2b05c5786507398ad32 --- CHANGELOG | 5 +++++ configure | 6 ++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index f932f6bf4d..5a8605a73d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,8 @@ +20yy-mm-dd v1.14.0 "V Duck" + This release drops support for old C compilers, such as Visual Studio 2012 + and older, that disallow mixing variable declarations and statements (a C99 + feature). + 2023-09-29 v1.13.1 "Ugly Duckling" This release contains two security related fixes. 
  One each for VP8 and VP9.
diff --git a/configure b/configure
index 434c43792c..6b910160a8 100755
--- a/configure
+++ b/configure
@@ -643,7 +643,6 @@ process_toolchain() {
   if enabled gcc; then
     enabled werror && check_add_cflags -Werror
     check_add_cflags -Wall
-    check_add_cflags -Wdeclaration-after-statement
     check_add_cflags -Wdisabled-optimization
     check_add_cflags -Wextra-semi
     check_add_cflags -Wextra-semi-stmt
@@ -670,9 +669,8 @@ process_toolchain() {
     if enabled mips || [ -z "${INLINE}" ]; then
       enabled extra_warnings || check_add_cflags -Wno-unused-function
     fi
-    # Enforce c89 for c files. Don't be too strict about it though. Allow
-    # gnu extensions like "//" for comments.
-    check_cflags -std=gnu89 && add_cflags_only -std=gnu89
+    # Enforce C99 for C files. Allow GNU extensions.
+    check_cflags -std=gnu99 && add_cflags_only -std=gnu99
     # Avoid this warning for third_party C++ sources. Some reorganization
     # would be needed to apply this only to test/*.cc.
     check_cflags -Wshorten-64-to-32 && add_cflags_only -Wshorten-64-to-32

From e4127f591de35c2e5dd62704c345716a3e7e3706 Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang
Date: Fri, 10 Nov 2023 13:14:18 -0800
Subject: [PATCH 867/926] Document how VP9 treats a negative speed value

Change-Id: I12948b08a7bb5beb5024b8676de9dafc239f8e89
---
 vpx/vp8cx.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index 3c0278c848..d098c4c985 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -166,6 +166,7 @@ enum vp8e_enc_control_id {
    *
    * \note Valid range for VP8: -16..16
    * \note Valid range for VP9: -9..9
+   * \note A negative value (-n) is treated as its absolute value (n) in VP9.
    *
    * Supported in codecs: VP8, VP9
    */

From 81aaa7f04b7644ac80960d17442199607195b24c Mon Sep 17 00:00:00 2001
From: Marco Paniconi
Date: Sun, 12 Nov 2023 00:39:14 -0800
Subject: [PATCH 868/926] rtc: Add frame dropper to VP8 external RC

Move some internal drop_frame code to a separate function so the
external RC can use it. Add a new flag, set under
VP8E_SET_RTC_EXTERNAL_RATECTRL, to disable
vp8_drop_encodedframe_overshoot() when testing the external RC.

Unit tests added for single layer and 3 temporal layers.
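
A sketch of the intended calling sequence (EncodeFrame() and its encoded
size are hypothetical application hooks, not part of this change):

    // frame_params (frame type, temporal layer id) filled in by the caller.
    if (rc_api->ComputeQP(frame_params) == libvpx::FrameDropDecision::kDrop) {
      // Dropped: skip encoding and do not call GetQP() or
      // PostEncodeUpdate() for this frame.
    } else {
      const int qp = rc_api->GetQP();
      const uint64_t encoded_size = EncodeFrame(qp);  // application hook
      rc_api->PostEncodeUpdate(encoded_size);
    }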
Bug: b/280363228 Change-Id: Ibea2f627cc54e7156ff35259a64dd111d42d146c --- test/vp8_ratectrl_rtc_test.cc | 75 ++++++++++-- vp8/encoder/onyx_if.c | 215 ++++++++++++++++++---------------- vp8/encoder/onyx_int.h | 6 + vp8/vp8_cx_iface.c | 3 +- vp8/vp8_ratectrl_rtc.cc | 21 +++- vp8/vp8_ratectrl_rtc.h | 10 +- 6 files changed, 212 insertions(+), 118 deletions(-) diff --git a/test/vp8_ratectrl_rtc_test.cc b/test/vp8_ratectrl_rtc_test.cc index 81f06d90ad..9fbc1d4d98 100644 --- a/test/vp8_ratectrl_rtc_test.cc +++ b/test/vp8_ratectrl_rtc_test.cc @@ -52,7 +52,8 @@ class Vp8RcInterfaceTest public ::libvpx_test::CodecTestWith2Params { public: Vp8RcInterfaceTest() - : EncoderTest(GET_PARAM(0)), key_interval_(3000), encoder_exit_(false) {} + : EncoderTest(GET_PARAM(0)), key_interval_(3000), encoder_exit_(false), + frame_drop_thresh_(0) {} ~Vp8RcInterfaceTest() override = default; protected: @@ -145,8 +146,11 @@ class Vp8RcInterfaceTest } int qp; encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp); - rc_api_->ComputeQP(frame_params_); - ASSERT_EQ(rc_api_->GetQP(), qp); + if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kOk) { + ASSERT_EQ(rc_api_->GetQP(), qp); + } else { + num_drops_++; + } } void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { @@ -156,8 +160,6 @@ class Vp8RcInterfaceTest void RunOneLayer() { test_video_ = GET_PARAM(2); target_bitrate_ = GET_PARAM(1); - if (test_video_.width == 1280 && target_bitrate_ == 200) return; - if (test_video_.width == 640 && target_bitrate_ == 1000) return; SetConfig(); rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); @@ -169,12 +171,33 @@ class Vp8RcInterfaceTest ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } + void RunOneLayerDropFrames() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + frame_drop_thresh_ = 30; + num_drops_ = 0; + // Use lower target_bitrate and max_quantizer to trigger drops. + target_bitrate_ = target_bitrate_ >> 2; + SetConfig(); + rc_cfg_.max_quantizer = 56; + cfg_.rc_max_quantizer = 56; + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Check that some frames were dropped, otherwise test has no value. 
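+    // (The quartered bitrate together with the lowered max_quantizer cap of
+    // 56 keeps rate control from reaching the target through QP alone, so
+    // the dropper must engage.)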
+ ASSERT_GE(num_drops_, 1); + } + void RunPeriodicKey() { test_video_ = GET_PARAM(2); target_bitrate_ = GET_PARAM(1); - if (test_video_.width == 1280 && target_bitrate_ == 200) return; - if (test_video_.width == 640 && target_bitrate_ == 1000) return; key_interval_ = 100; + frame_drop_thresh_ = 30; SetConfig(); rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); @@ -189,8 +212,6 @@ class Vp8RcInterfaceTest void RunTemporalLayers2TL() { test_video_ = GET_PARAM(2); target_bitrate_ = GET_PARAM(1); - if (test_video_.width == 1280 && target_bitrate_ == 200) return; - if (test_video_.width == 640 && target_bitrate_ == 1000) return; SetConfigTemporalLayers(2); rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); @@ -205,8 +226,6 @@ class Vp8RcInterfaceTest void RunTemporalLayers3TL() { test_video_ = GET_PARAM(2); target_bitrate_ = GET_PARAM(1); - if (test_video_.width == 1280 && target_bitrate_ == 200) return; - if (test_video_.width == 640 && target_bitrate_ == 1000) return; SetConfigTemporalLayers(3); rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); @@ -218,6 +237,28 @@ class Vp8RcInterfaceTest ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } + void RunTemporalLayers3TLDropFrames() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + frame_drop_thresh_ = 30; + num_drops_ = 0; + // Use lower target_bitrate and max_quantizer to trigger drops. + target_bitrate_ = target_bitrate_ >> 2; + SetConfigTemporalLayers(3); + rc_cfg_.max_quantizer = 56; + cfg_.rc_max_quantizer = 56; + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Check that some frames were dropped, otherwise test has no value. + ASSERT_GE(num_drops_, 1); + } + private: void SetConfig() { rc_cfg_.width = test_video_.width; @@ -233,6 +274,7 @@ class Vp8RcInterfaceTest rc_cfg_.max_intra_bitrate_pct = 1000; rc_cfg_.framerate = 30.0; rc_cfg_.layer_target_bitrate[0] = target_bitrate_; + rc_cfg_.frame_drop_thresh = frame_drop_thresh_; // Encoder settings for ground truth. cfg_.g_w = test_video_.width; @@ -251,6 +293,7 @@ class Vp8RcInterfaceTest cfg_.rc_target_bitrate = target_bitrate_; cfg_.kf_min_dist = key_interval_; cfg_.kf_max_dist = key_interval_; + cfg_.rc_dropframe_thresh = frame_drop_thresh_; } void SetConfigTemporalLayers(int temporal_layers) { @@ -266,6 +309,7 @@ class Vp8RcInterfaceTest rc_cfg_.overshoot_pct = 50; rc_cfg_.max_intra_bitrate_pct = 1000; rc_cfg_.framerate = 30.0; + rc_cfg_.frame_drop_thresh = frame_drop_thresh_; if (temporal_layers == 2) { rc_cfg_.layer_target_bitrate[0] = 60 * target_bitrate_ / 100; rc_cfg_.layer_target_bitrate[1] = target_bitrate_; @@ -299,6 +343,7 @@ class Vp8RcInterfaceTest cfg_.rc_target_bitrate = target_bitrate_; cfg_.kf_min_dist = key_interval_; cfg_.kf_max_dist = key_interval_; + cfg_.rc_dropframe_thresh = frame_drop_thresh_; // 2 Temporal layers, no spatial layers, CBR mode. 
cfg_.ss_number_layers = 1; cfg_.ts_number_layers = temporal_layers; @@ -326,16 +371,24 @@ class Vp8RcInterfaceTest Vp8RCTestVideo test_video_; libvpx::VP8FrameParamsQpRTC frame_params_; bool encoder_exit_; + int frame_drop_thresh_; + int num_drops_; }; TEST_P(Vp8RcInterfaceTest, OneLayer) { RunOneLayer(); } +TEST_P(Vp8RcInterfaceTest, OneLayerDropFrames) { RunOneLayerDropFrames(); } + TEST_P(Vp8RcInterfaceTest, OneLayerPeriodicKey) { RunPeriodicKey(); } TEST_P(Vp8RcInterfaceTest, TemporalLayers2TL) { RunTemporalLayers2TL(); } TEST_P(Vp8RcInterfaceTest, TemporalLayers3TL) { RunTemporalLayers3TL(); } +TEST_P(Vp8RcInterfaceTest, TemporalLayers3TLDropFrames) { + RunTemporalLayers3TLDropFrames(); +} + VP8_INSTANTIATE_TEST_SUITE(Vp8RcInterfaceTest, ::testing::Values(200, 400, 1000), ::testing::ValuesIn(kVp8RCTestVectors)); diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 15cef5ed33..4e128e3c49 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -1899,6 +1899,7 @@ struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf) { cpi->force_maxqp = 0; cpi->frames_since_last_drop_overshoot = 0; cpi->rt_always_update_correction_factor = 0; + cpi->rt_drop_recode_on_overshoot = 1; cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; #if CONFIG_INTERNAL_STATS @@ -3183,6 +3184,113 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) { vp8_yv12_extend_frame_borders(cm->frame_to_show); } +// Return 1 if frame is to be dropped. Update frame drop decimation +// counters. +int vp8_check_drop_buffer(VP8_COMP *cpi) { + VP8_COMMON *cm = &cpi->common; + int drop_mark = (int)(cpi->oxcf.drop_frames_water_mark * + cpi->oxcf.optimal_buffer_level / 100); + int drop_mark75 = drop_mark * 2 / 3; + int drop_mark50 = drop_mark / 4; + int drop_mark25 = drop_mark / 8; + if (cpi->drop_frames_allowed) { + /* The reset to decimation 0 is only done here for one pass. + * Once it is set two pass leaves decimation on till the next kf. + */ + if (cpi->buffer_level > drop_mark && cpi->decimation_factor > 0) { + cpi->decimation_factor--; + } + + if (cpi->buffer_level > drop_mark75 && cpi->decimation_factor > 0) { + cpi->decimation_factor = 1; + + } else if (cpi->buffer_level < drop_mark25 && + (cpi->decimation_factor == 2 || cpi->decimation_factor == 3)) { + cpi->decimation_factor = 3; + } else if (cpi->buffer_level < drop_mark50 && + (cpi->decimation_factor == 1 || cpi->decimation_factor == 2)) { + cpi->decimation_factor = 2; + } else if (cpi->buffer_level < drop_mark75 && + (cpi->decimation_factor == 0 || cpi->decimation_factor == 1)) { + cpi->decimation_factor = 1; + } + } + + /* The following decimates the frame rate according to a regular + * pattern (i.e. to 1/2 or 2/3 frame rate) This can be used to help + * prevent buffer under-run in CBR mode. Alternatively it might be + * desirable in some situations to drop frame rate but throw more bits + * at each frame. + * + * Note that dropping a key frame can be problematic if spatial + * resampling is also active + */ + if (cpi->decimation_factor > 0 && cpi->drop_frames_allowed) { + switch (cpi->decimation_factor) { + case 1: + cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 3 / 2; + break; + case 2: + cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4; + break; + case 3: + cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4; + break; + } + + /* Note that we should not throw out a key frame (especially when + * spatial resampling is enabled). 
+ */ + if (cm->frame_type == KEY_FRAME) { + cpi->decimation_count = cpi->decimation_factor; + } else if (cpi->decimation_count > 0) { + cpi->decimation_count--; + + cpi->bits_off_target += cpi->av_per_frame_bandwidth; + if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size) { + cpi->bits_off_target = cpi->oxcf.maximum_buffer_size; + } + +#if CONFIG_MULTI_RES_ENCODING + vp8_store_drop_frame_info(cpi); +#endif + + cm->current_video_frame++; + cpi->frames_since_key++; + cpi->ext_refresh_frame_flags_pending = 0; + // We advance the temporal pattern for dropped frames. + cpi->temporal_pattern_counter++; + +#if CONFIG_INTERNAL_STATS + cpi->count++; +#endif + + cpi->buffer_level = cpi->bits_off_target; + + if (cpi->oxcf.number_of_layers > 1) { + unsigned int i; + + /* Propagate bits saved by dropping the frame to higher + * layers + */ + for (i = cpi->current_layer + 1; i < cpi->oxcf.number_of_layers; ++i) { + LAYER_CONTEXT *lc = &cpi->layer_context[i]; + lc->bits_off_target += (int)(lc->target_bandwidth / lc->framerate); + if (lc->bits_off_target > lc->maximum_buffer_size) { + lc->bits_off_target = lc->maximum_buffer_size; + } + lc->buffer_level = lc->bits_off_target; + } + } + return 1; + } else { + cpi->decimation_count = cpi->decimation_factor; + } + } else { + cpi->decimation_count = 0; + } + return 0; +} static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, unsigned char *dest, @@ -3208,12 +3316,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, int undershoot_seen = 0; #endif - int drop_mark = (int)(cpi->oxcf.drop_frames_water_mark * - cpi->oxcf.optimal_buffer_level / 100); - int drop_mark75 = drop_mark * 2 / 3; - int drop_mark50 = drop_mark / 4; - int drop_mark25 = drop_mark / 8; - /* Clear down mmx registers to allow floating point in what follows */ vpx_clear_system_state(); @@ -3427,102 +3529,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, update_rd_ref_frame_probs(cpi); - if (cpi->drop_frames_allowed) { - /* The reset to decimation 0 is only done here for one pass. - * Once it is set two pass leaves decimation on till the next kf. - */ - if ((cpi->buffer_level > drop_mark) && (cpi->decimation_factor > 0)) { - cpi->decimation_factor--; - } - - if (cpi->buffer_level > drop_mark75 && cpi->decimation_factor > 0) { - cpi->decimation_factor = 1; - - } else if (cpi->buffer_level < drop_mark25 && - (cpi->decimation_factor == 2 || cpi->decimation_factor == 3)) { - cpi->decimation_factor = 3; - } else if (cpi->buffer_level < drop_mark50 && - (cpi->decimation_factor == 1 || cpi->decimation_factor == 2)) { - cpi->decimation_factor = 2; - } else if (cpi->buffer_level < drop_mark75 && - (cpi->decimation_factor == 0 || cpi->decimation_factor == 1)) { - cpi->decimation_factor = 1; - } - } - - /* The following decimates the frame rate according to a regular - * pattern (i.e. to 1/2 or 2/3 frame rate) This can be used to help - * prevent buffer under-run in CBR mode. Alternatively it might be - * desirable in some situations to drop frame rate but throw more bits - * at each frame. 
- * - * Note that dropping a key frame can be problematic if spatial - * resampling is also active - */ - if (cpi->decimation_factor > 0 && cpi->drop_frames_allowed) { - switch (cpi->decimation_factor) { - case 1: - cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 3 / 2; - break; - case 2: - cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4; - break; - case 3: - cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4; - break; - } - - /* Note that we should not throw out a key frame (especially when - * spatial resampling is enabled). - */ - if (cm->frame_type == KEY_FRAME) { - cpi->decimation_count = cpi->decimation_factor; - } else if (cpi->decimation_count > 0) { - cpi->decimation_count--; - - cpi->bits_off_target += cpi->av_per_frame_bandwidth; - if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size) { - cpi->bits_off_target = cpi->oxcf.maximum_buffer_size; - } - -#if CONFIG_MULTI_RES_ENCODING - vp8_store_drop_frame_info(cpi); -#endif - - cm->current_video_frame++; - cpi->frames_since_key++; - cpi->ext_refresh_frame_flags_pending = 0; - // We advance the temporal pattern for dropped frames. - cpi->temporal_pattern_counter++; - -#if CONFIG_INTERNAL_STATS - cpi->count++; -#endif - - cpi->buffer_level = cpi->bits_off_target; - - if (cpi->oxcf.number_of_layers > 1) { - unsigned int i; - - /* Propagate bits saved by dropping the frame to higher - * layers - */ - for (i = cpi->current_layer + 1; i < cpi->oxcf.number_of_layers; ++i) { - LAYER_CONTEXT *lc = &cpi->layer_context[i]; - lc->bits_off_target += (int)(lc->target_bandwidth / lc->framerate); - if (lc->bits_off_target > lc->maximum_buffer_size) { - lc->bits_off_target = lc->maximum_buffer_size; - } - lc->buffer_level = lc->bits_off_target; - } - } - - return; - } else { - cpi->decimation_count = cpi->decimation_factor; - } - } else { - cpi->decimation_count = 0; + if (vp8_check_drop_buffer(cpi)) { + return; } /* Decide how big to make the frame */ @@ -3930,7 +3938,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, /* transform / motion compensation build reconstruction frame */ vp8_encode_frame(cpi); - if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && + cpi->rt_drop_recode_on_overshoot == 1) { if (vp8_drop_encodedframe_overshoot(cpi, Q)) { vpx_clear_system_state(); return; diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index cdf94f4f23..1451a27812 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -708,6 +708,10 @@ typedef struct VP8_COMP { // Always update correction factor used for rate control after each frame for // realtime encoding. int rt_always_update_correction_factor; + + // Flag to indicate frame may be dropped due to large expected overshoot, + // and re-encoded on next frame at max_qp. 
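+  // Defaults to 1 (on) in vp8_create_compressor(); cleared under
+  // VP8E_SET_RTC_EXTERNAL_RATECTRL so that an external rate controller can
+  // make the drop decision itself.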
+ int rt_drop_recode_on_overshoot; } VP8_COMP; void vp8_initialize_enc(void); @@ -732,6 +736,8 @@ void vp8_tokenize_mb(VP8_COMP *, MACROBLOCK *, TOKENEXTRA **); void vp8_set_speed_features(VP8_COMP *cpi); +int vp8_check_drop_buffer(VP8_COMP *cpi); + #ifdef __cplusplus } // extern "C" #endif diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 20c44ff4e1..a6f0b4cbcf 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -624,10 +624,11 @@ static vpx_codec_err_t set_screen_content_mode(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t ctrl_set_rtc_external_ratectrl(vpx_codec_alg_priv_t *ctx, va_list args) { VP8_COMP *cpi = ctx->cpi; - const unsigned int data = CAST(VP8E_SET_GF_CBR_BOOST_PCT, args); + const unsigned int data = CAST(VP8E_SET_RTC_EXTERNAL_RATECTRL, args); if (data) { cpi->cyclic_refresh_mode_enabled = 0; cpi->rt_always_update_correction_factor = 1; + cpi->rt_drop_recode_on_overshoot = 0; } return VPX_CODEC_OK; } diff --git a/vp8/vp8_ratectrl_rtc.cc b/vp8/vp8_ratectrl_rtc.cc index 60bc258a6f..dd3c8e623b 100644 --- a/vp8/vp8_ratectrl_rtc.cc +++ b/vp8/vp8_ratectrl_rtc.cc @@ -133,6 +133,8 @@ bool VP8RateControlRTC::UpdateRateControl( cpi_->buffered_mode = oxcf->optimal_buffer_level > 0; oxcf->under_shoot_pct = rc_cfg.undershoot_pct; oxcf->over_shoot_pct = rc_cfg.overshoot_pct; + oxcf->drop_frames_water_mark = rc_cfg.frame_drop_thresh; + if (oxcf->drop_frames_water_mark > 0) cpi_->drop_frames_allowed = 1; cpi_->oxcf.rc_max_intra_bitrate_pct = rc_cfg.max_intra_bitrate_pct; cpi_->framerate = rc_cfg.framerate; for (int i = 0; i < KEY_FRAME_CONTEXT; ++i) { @@ -208,7 +210,8 @@ bool VP8RateControlRTC::UpdateRateControl( return true; } -void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) { +FrameDropDecision VP8RateControlRTC::ComputeQP( + const VP8FrameParamsQpRTC &frame_params) { VP8_COMMON *const cm = &cpi_->common; vpx_clear_system_state(); if (cpi_->oxcf.number_of_layers > 1) { @@ -226,7 +229,20 @@ void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) { cpi_->common.frame_flags |= FRAMEFLAGS_KEY; } - vp8_pick_frame_size(cpi_); + cpi_->per_frame_bandwidth = static_cast( + round(cpi_->oxcf.target_bandwidth / cpi_->output_framerate)); + if (vp8_check_drop_buffer(cpi_)) { + if (cpi_->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi_); + return FrameDropDecision::kDrop; + } + + if (!vp8_pick_frame_size(cpi_)) { + cm->current_video_frame++; + cpi_->frames_since_key++; + cpi_->ext_refresh_frame_flags_pending = 0; + if (cpi_->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi_); + return FrameDropDecision::kDrop; + } if (cpi_->buffer_level >= cpi_->oxcf.optimal_buffer_level && cpi_->buffered_mode) { @@ -290,6 +306,7 @@ void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) { q_ = vp8_regulate_q(cpi_, cpi_->this_frame_target); vp8_set_quantizer(cpi_, q_); vpx_clear_system_state(); + return FrameDropDecision::kOk; } int VP8RateControlRTC::GetQP() const { return q_; } diff --git a/vp8/vp8_ratectrl_rtc.h b/vp8/vp8_ratectrl_rtc.h index 496ef9eaad..5ffe54c47c 100644 --- a/vp8/vp8_ratectrl_rtc.h +++ b/vp8/vp8_ratectrl_rtc.h @@ -33,6 +33,11 @@ struct VP8FrameParamsQpRTC { int temporal_layer_id; }; +enum class FrameDropDecision { + kOk, // Frame is encoded. + kDrop, // Frame is dropped. +}; + class VP8RateControlRTC { public: static std::unique_ptr Create( @@ -46,7 +51,10 @@ class VP8RateControlRTC { // level is calculated from frame qp. 
int GetLoopfilterLevel() const; // int GetLoopfilterLevel() const; - void ComputeQP(const VP8FrameParamsQpRTC &frame_params); + // ComputeQP returns the QP is the frame is not dropped (kOk return), + // otherwise it returns kDrop and subsequent GetQP and PostEncodeUpdate + // are not to be called. + FrameDropDecision ComputeQP(const VP8FrameParamsQpRTC &frame_params); // Feedback to rate control with the size of current encoded frame void PostEncodeUpdate(uint64_t encoded_frame_size); From 9142314c2cec2be364e6844d1630a056e7b0a3c8 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 14 Nov 2023 17:57:04 -0800 Subject: [PATCH 869/926] ratectrl_rtc.h: fix a few typos is -> if returns -> computes in the documentation for ComputeQP(). Change-Id: If70706736b0dc2ae56e45e2489dc208c61fd557a --- vp9/ratectrl_rtc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index a8dd5c42ff..7f624a5fe3 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -100,7 +100,7 @@ class VP9RateControlRTC { int GetQP() const; int GetLoopfilterLevel() const; bool GetSegmentationData(VP9SegmentationData *segmentation_data) const; - // ComputeQP returns the QP is the frame is not dropped (kOk return), + // ComputeQP computes the QP if the frame is not dropped (kOk return), // otherwise it returns kDrop and subsequent GetQP and PostEncodeUpdate // are not to be called (vp9_rc_postencode_update_drop_frame is already // called via ComputeQP if drop is decided). From 9f8776ff4af45a412c247c3ebafee7d002c1094d Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 20 Nov 2023 11:49:14 -0800 Subject: [PATCH 870/926] vp8_ratectrl_rtc.h: fix a few typos is -> if returns -> computes in the documentation for ComputeQP(). This is the same as: 9142314c2 ratectrl_rtc.h: fix a few typos + remove a duplicate, commented out, version of GetLoopfilterLevel() Change-Id: I8832e628b63b0b7dac6236631072f36ad55d90e8 --- vp8/vp8_ratectrl_rtc.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vp8/vp8_ratectrl_rtc.h b/vp8/vp8_ratectrl_rtc.h index 5ffe54c47c..4c174b1315 100644 --- a/vp8/vp8_ratectrl_rtc.h +++ b/vp8/vp8_ratectrl_rtc.h @@ -50,8 +50,7 @@ class VP8RateControlRTC { // GetLoopfilterLevel() needs to be called after ComputeQP() since loopfilter // level is calculated from frame qp. int GetLoopfilterLevel() const; - // int GetLoopfilterLevel() const; - // ComputeQP returns the QP is the frame is not dropped (kOk return), + // ComputeQP computes the QP if the frame is not dropped (kOk return), // otherwise it returns kDrop and subsequent GetQP and PostEncodeUpdate // are not to be called. 
FrameDropDecision ComputeQP(const VP8FrameParamsQpRTC &frame_params); From 1231fce45ecee61b8fc97e8a61b729d11a562897 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 20 Nov 2023 11:54:50 -0800 Subject: [PATCH 871/926] vp8_dx_iface.c: add include for MAX_PARTITIONS fixes clang-tidy warning: no header providing "MAX_PARTITIONS" is directly included Change-Id: Iba7a9d95df7f5bdee76e7975df764cd71461fc93 --- vp8/vp8_dx_iface.c | 1 + 1 file changed, 1 insertion(+) diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index d4e06a7bce..2e5d6dcfe8 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -20,6 +20,7 @@ #include "vpx_version.h" #include "common/alloccommon.h" #include "common/common.h" +#include "common/onyxc_int.h" #include "common/onyxd.h" #include "decoder/onyxd_int.h" #include "vpx_dsp/vpx_dsp_common.h" From a8db542b24a50060e996feda00b0538c7c334909 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Fri, 29 Sep 2023 10:45:32 -0700 Subject: [PATCH 872/926] Add vpx_sse and vpx_highbd_sse The code is ported from libaom's aom_sse and aom_highbd_sse at commit 1e20d2da96515524864b21010dbe23809cff2e9b. The vpx_sse and vpx_highbd_sse functions will be used by vpx_dsp/psnr.c. Bug: webm:1819 Change-Id: I4fbffa9000ab92755de5387b1ddd4370cb7020f7 --- test/sum_squares_test.cc | 211 +++++++++++++++++ vpx_dsp/arm/highbd_sse_neon.c | 288 +++++++++++++++++++++++ vpx_dsp/arm/sse_neon.c | 210 +++++++++++++++++ vpx_dsp/arm/sse_neon_dotprod.c | 223 ++++++++++++++++++ vpx_dsp/arm/sum_neon.h | 51 +++++ vpx_dsp/sse.c | 58 +++++ vpx_dsp/vpx_dsp.mk | 6 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 + vpx_dsp/x86/sse_avx2.c | 401 +++++++++++++++++++++++++++++++++ vpx_dsp/x86/sse_sse4.c | 359 +++++++++++++++++++++++++++++ 10 files changed, 1813 insertions(+) create mode 100644 vpx_dsp/arm/highbd_sse_neon.c create mode 100644 vpx_dsp/arm/sse_neon.c create mode 100644 vpx_dsp/arm/sse_neon_dotprod.c create mode 100644 vpx_dsp/sse.c create mode 100644 vpx_dsp/x86/sse_avx2.c create mode 100644 vpx_dsp/x86/sse_sse4.c diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc index 5abb464dc0..725d5eb853 100644 --- a/test/sum_squares_test.cc +++ b/test/sum_squares_test.cc @@ -21,9 +21,14 @@ #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "test/util.h" +#include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#include "vpx_ports/vpx_timer.h" using libvpx_test::ACMRandom; +using ::testing::Combine; +using ::testing::Range; +using ::testing::ValuesIn; namespace { const int kNumIterations = 10000; @@ -126,4 +131,210 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c, &vpx_sum_squares_2d_i16_msa))); #endif // HAVE_MSA + +typedef int64_t (*SSEFunc)(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height); + +struct TestSSEFuncs { + TestSSEFuncs(SSEFunc ref = nullptr, SSEFunc tst = nullptr, int depth = 0) + : ref_func(ref), tst_func(tst), bit_depth(depth) {} + SSEFunc ref_func; // Pointer to reference function + SSEFunc tst_func; // Pointer to tested function + int bit_depth; +}; + +typedef std::tuple SSETestParam; + +class SSETest : public ::testing::TestWithParam { + public: + ~SSETest() override = default; + void SetUp() override { + params_ = GET_PARAM(0); + width_ = GET_PARAM(1); + is_hbd_ = +#if CONFIG_VP9_HIGHBITDEPTH + params_.ref_func == vpx_highbd_sse_c; +#else + false; +#endif + rnd_.Reset(ACMRandom::DeterministicSeed()); + src_ = reinterpret_cast(vpx_memalign(32, 256 * 256 * 2)); + ref_ = 
reinterpret_cast(vpx_memalign(32, 256 * 256 * 2)); + ASSERT_NE(src_, nullptr); + ASSERT_NE(ref_, nullptr); + } + + void TearDown() override { + vpx_free(src_); + vpx_free(ref_); + } + void RunTest(bool is_random, int width, int height, int run_times); + + void GenRandomData(int width, int height, int stride) { + uint16_t *src16 = reinterpret_cast(src_); + uint16_t *ref16 = reinterpret_cast(ref_); + const int msb = 11; // Up to 12 bit input + const int limit = 1 << (msb + 1); + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + if (!is_hbd_) { + src_[ii * stride + jj] = rnd_.Rand8(); + ref_[ii * stride + jj] = rnd_.Rand8(); + } else { + src16[ii * stride + jj] = rnd_(limit); + ref16[ii * stride + jj] = rnd_(limit); + } + } + } + } + + void GenExtremeData(int width, int height, int stride, uint8_t *data, + int16_t val) { + uint16_t *data16 = reinterpret_cast(data); + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + if (!is_hbd_) { + data[ii * stride + jj] = static_cast(val); + } else { + data16[ii * stride + jj] = val; + } + } + } + } + + protected: + bool is_hbd_; + int width_; + TestSSEFuncs params_; + uint8_t *src_; + uint8_t *ref_; + ACMRandom rnd_; +}; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SSETest); + +void SSETest::RunTest(bool is_random, int width, int height, int run_times) { + int failed = 0; + vpx_usec_timer ref_timer, test_timer; + for (int k = 0; k < 3; k++) { + int stride = 4 << rnd_(7); // Up to 256 stride + while (stride < width) { // Make sure it's valid + stride = 4 << rnd_(7); + } + if (is_random) { + GenRandomData(width, height, stride); + } else { + const int msb = is_hbd_ ? 12 : 8; // Up to 12 bit input + const int limit = (1 << msb) - 1; + if (k == 0) { + GenExtremeData(width, height, stride, src_, 0); + GenExtremeData(width, height, stride, ref_, limit); + } else { + GenExtremeData(width, height, stride, src_, limit); + GenExtremeData(width, height, stride, ref_, 0); + } + } + int64_t res_ref, res_tst; + uint8_t *src = src_; + uint8_t *ref = ref_; +#if CONFIG_VP9_HIGHBITDEPTH + if (is_hbd_) { + src = CONVERT_TO_BYTEPTR(src_); + ref = CONVERT_TO_BYTEPTR(ref_); + } +#endif + res_ref = params_.ref_func(src, stride, ref, stride, width, height); + res_tst = params_.tst_func(src, stride, ref, stride, width, height); + if (run_times > 1) { + vpx_usec_timer_start(&ref_timer); + for (int j = 0; j < run_times; j++) { + params_.ref_func(src, stride, ref, stride, width, height); + } + vpx_usec_timer_mark(&ref_timer); + const int elapsed_time_c = + static_cast(vpx_usec_timer_elapsed(&ref_timer)); + + vpx_usec_timer_start(&test_timer); + for (int j = 0; j < run_times; j++) { + params_.tst_func(src, stride, ref, stride, width, height); + } + vpx_usec_timer_mark(&test_timer); + const int elapsed_time_simd = + static_cast(vpx_usec_timer_elapsed(&test_timer)); + + printf( + "c_time=%d \t simd_time=%d \t " + "gain=%d\n", + elapsed_time_c, elapsed_time_simd, + (elapsed_time_c / elapsed_time_simd)); + } else { + if (!failed) { + failed = res_ref != res_tst; + EXPECT_EQ(res_ref, res_tst) + << "Error:" << (is_hbd_ ? 
"hbd " : " ") << k << " SSE Test [" + << width << "x" << height + << "] C output does not match optimized output."; + } + } + } +} + +TEST_P(SSETest, OperationCheck) { + for (int height = 4; height <= 128; height += 4) { + RunTest(true, width_, height, 1); // GenRandomData + } +} + +TEST_P(SSETest, ExtremeValues) { + for (int height = 4; height <= 128; height += 4) { + RunTest(false, width_, height, 1); + } +} + +TEST_P(SSETest, DISABLED_Speed) { + for (int height = 4; height <= 128; height += 4) { + RunTest(true, width_, height, 100); + } +} + +#if HAVE_NEON +TestSSEFuncs sse_neon[] = { + TestSSEFuncs(&vpx_sse_c, &vpx_sse_neon), +#if CONFIG_VP9_HIGHBITDEPTH + TestSSEFuncs(&vpx_highbd_sse_c, &vpx_highbd_sse_neon) +#endif +}; +INSTANTIATE_TEST_SUITE_P(NEON, SSETest, + Combine(ValuesIn(sse_neon), Range(4, 129, 4))); +#endif // HAVE_NEON + +#if HAVE_NEON_DOTPROD +TestSSEFuncs sse_neon_dotprod[] = { + TestSSEFuncs(&vpx_sse_c, &vpx_sse_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SSETest, + Combine(ValuesIn(sse_neon_dotprod), Range(4, 129, 4))); +#endif // HAVE_NEON_DOTPROD + +#if HAVE_SSE4_1 +TestSSEFuncs sse_sse4[] = { + TestSSEFuncs(&vpx_sse_c, &vpx_sse_sse4_1), +#if CONFIG_VP9_HIGHBITDEPTH + TestSSEFuncs(&vpx_highbd_sse_c, &vpx_highbd_sse_sse4_1) +#endif +}; +INSTANTIATE_TEST_SUITE_P(SSE4_1, SSETest, + Combine(ValuesIn(sse_sse4), Range(4, 129, 4))); +#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 + +TestSSEFuncs sse_avx2[] = { + TestSSEFuncs(&vpx_sse_c, &vpx_sse_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + TestSSEFuncs(&vpx_highbd_sse_c, &vpx_highbd_sse_avx2) +#endif +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SSETest, + Combine(ValuesIn(sse_avx2), Range(4, 129, 4))); +#endif // HAVE_AVX2 } // namespace diff --git a/vpx_dsp/arm/highbd_sse_neon.c b/vpx_dsp/arm/highbd_sse_neon.c new file mode 100644 index 0000000000..717ad6b19a --- /dev/null +++ b/vpx_dsp/arm/highbd_sse_neon.c @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE void highbd_sse_8x1_init_neon(const uint16_t *src, + const uint16_t *ref, + uint32x4_t *sse_acc0, + uint32x4_t *sse_acc1) { + uint16x8_t s = vld1q_u16(src); + uint16x8_t r = vld1q_u16(ref); + + uint16x8_t abs_diff = vabdq_u16(s, r); + uint16x4_t abs_diff_lo = vget_low_u16(abs_diff); + uint16x4_t abs_diff_hi = vget_high_u16(abs_diff); + + *sse_acc0 = vmull_u16(abs_diff_lo, abs_diff_lo); + *sse_acc1 = vmull_u16(abs_diff_hi, abs_diff_hi); +} + +static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref, + uint32x4_t *sse_acc0, + uint32x4_t *sse_acc1) { + uint16x8_t s = vld1q_u16(src); + uint16x8_t r = vld1q_u16(ref); + + uint16x8_t abs_diff = vabdq_u16(s, r); + uint16x4_t abs_diff_lo = vget_low_u16(abs_diff); + uint16x4_t abs_diff_hi = vget_high_u16(abs_diff); + + *sse_acc0 = vmlal_u16(*sse_acc0, abs_diff_lo, abs_diff_lo); + *sse_acc1 = vmlal_u16(*sse_acc1, abs_diff_hi, abs_diff_hi); +} + +static INLINE int64_t highbd_sse_128xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[16]; + highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + highbd_sse_8x1_init_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]); + highbd_sse_8x1_init_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]); + highbd_sse_8x1_init_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]); + highbd_sse_8x1_init_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]); + highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]); + highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]); + highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]); + highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]); + highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]); + highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]); + highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]); + highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]); + highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]); + highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]); + highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]); + highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]); + highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]); + + src += src_stride; + ref += ref_stride; + } + + return 
horizontal_long_add_uint32x4_x16(sse); +} + +static INLINE int64_t highbd_sse_64xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[8]; + highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4_x8(sse); +} + +static INLINE int64_t highbd_sse_32xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[8]; + highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4_x8(sse); +} + +static INLINE int64_t highbd_sse_16xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[4]; + highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4_x4(sse); +} + +static INLINE int64_t highbd_sse_8xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2]; + highbd_sse_8x1_init_neon(src, ref, &sse[0], &sse[1]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src, ref, &sse[0], &sse[1]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4_x2(sse); +} + +static INLINE int64_t highbd_sse_4xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + // Peel the first 
loop iteration. + uint16x4_t s = vld1_u16(src); + uint16x4_t r = vld1_u16(ref); + + uint16x4_t abs_diff = vabd_u16(s, r); + uint32x4_t sse = vmull_u16(abs_diff, abs_diff); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + s = vld1_u16(src); + r = vld1_u16(ref); + + abs_diff = vabd_u16(s, r); + sse = vmlal_u16(sse, abs_diff, abs_diff); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4(sse); +} + +static INLINE int64_t highbd_sse_wxh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int width, int height) { + // { 0, 1, 2, 3, 4, 5, 6, 7 } + uint16x8_t k01234567 = vmovl_u8(vcreate_u8(0x0706050403020100)); + uint16x8_t remainder_mask = vcltq_u16(k01234567, vdupq_n_u16(width & 7)); + uint64_t sse = 0; + + do { + int w = width; + int offset = 0; + + do { + uint16x8_t s = vld1q_u16(src + offset); + uint16x8_t r = vld1q_u16(ref + offset); + uint16x8_t abs_diff; + uint16x4_t abs_diff_lo; + uint16x4_t abs_diff_hi; + uint32x4_t sse_u32; + + if (w < 8) { + // Mask out-of-range elements. + s = vandq_u16(s, remainder_mask); + r = vandq_u16(r, remainder_mask); + } + + abs_diff = vabdq_u16(s, r); + abs_diff_lo = vget_low_u16(abs_diff); + abs_diff_hi = vget_high_u16(abs_diff); + + sse_u32 = vmull_u16(abs_diff_lo, abs_diff_lo); + sse_u32 = vmlal_u16(sse_u32, abs_diff_hi, abs_diff_hi); + + sse += horizontal_long_add_uint32x4(sse_u32); + + offset += 8; + w -= 8; + } while (w > 0); + + src += src_stride; + ref += ref_stride; + } while (--height != 0); + + return sse; +} + +int64_t vpx_highbd_sse_neon(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, int width, + int height) { + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + + switch (width) { + case 4: + return highbd_sse_4xh_neon(src, src_stride, ref, ref_stride, height); + case 8: + return highbd_sse_8xh_neon(src, src_stride, ref, ref_stride, height); + case 16: + return highbd_sse_16xh_neon(src, src_stride, ref, ref_stride, height); + case 32: + return highbd_sse_32xh_neon(src, src_stride, ref, ref_stride, height); + case 64: + return highbd_sse_64xh_neon(src, src_stride, ref, ref_stride, height); + case 128: + return highbd_sse_128xh_neon(src, src_stride, ref, ref_stride, height); + default: + return highbd_sse_wxh_neon(src, src_stride, ref, ref_stride, width, + height); + } +} diff --git a/vpx_dsp/arm/sse_neon.c b/vpx_dsp/arm/sse_neon.c new file mode 100644 index 0000000000..0b4a6e504a --- /dev/null +++ b/vpx_dsp/arm/sse_neon.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE void sse_16x1_neon(const uint8_t *src, const uint8_t *ref, + uint32x4_t *sse) { + uint8x16_t s = vld1q_u8(src); + uint8x16_t r = vld1q_u8(ref); + + uint8x16_t abs_diff = vabdq_u8(s, r); + uint8x8_t abs_diff_lo = vget_low_u8(abs_diff); + uint8x8_t abs_diff_hi = vget_high_u8(abs_diff); + + *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_lo, abs_diff_lo)); + *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_hi, abs_diff_hi)); +} + +static INLINE void sse_8x1_neon(const uint8_t *src, const uint8_t *ref, + uint32x4_t *sse) { + uint8x8_t s = vld1_u8(src); + uint8x8_t r = vld1_u8(ref); + + uint8x8_t abs_diff = vabd_u8(s, r); + + *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff)); +} + +static INLINE void sse_4x2_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + uint32x4_t *sse) { + uint8x8_t s = load_unaligned_u8(src, src_stride); + uint8x8_t r = load_unaligned_u8(ref, ref_stride); + + uint8x8_t abs_diff = vabd_u8(s, r); + + *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff)); +} + +static INLINE uint32_t sse_wxh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int width, int height) { + uint32x4_t sse = vdupq_n_u32(0); + + if ((width & 0x07) && ((width & 0x07) < 5)) { + int i = height; + do { + int j = 0; + do { + sse_8x1_neon(src + j, ref + j, &sse); + sse_8x1_neon(src + j + src_stride, ref + j + ref_stride, &sse); + j += 8; + } while (j + 4 < width); + + sse_4x2_neon(src + j, src_stride, ref + j, ref_stride, &sse); + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + } else { + int i = height; + do { + int j = 0; + do { + sse_8x1_neon(src + j, ref + j, &sse); + j += 8; + } while (j < width); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + } + return horizontal_add_uint32x4(sse); +} + +static INLINE uint32_t sse_128xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon(src, ref, &sse[0]); + sse_16x1_neon(src + 16, ref + 16, &sse[1]); + sse_16x1_neon(src + 32, ref + 32, &sse[0]); + sse_16x1_neon(src + 48, ref + 48, &sse[1]); + sse_16x1_neon(src + 64, ref + 64, &sse[0]); + sse_16x1_neon(src + 80, ref + 80, &sse[1]); + sse_16x1_neon(src + 96, ref + 96, &sse[0]); + sse_16x1_neon(src + 112, ref + 112, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_64xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon(src, ref, &sse[0]); + sse_16x1_neon(src + 16, ref + 16, &sse[1]); + sse_16x1_neon(src + 32, ref + 32, &sse[0]); + sse_16x1_neon(src + 48, ref + 48, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_32xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon(src, ref, &sse[0]); + sse_16x1_neon(src + 16, ref + 16, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return 
horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_16xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon(src, ref, &sse[0]); + src += src_stride; + ref += ref_stride; + sse_16x1_neon(src, ref, &sse[1]); + src += src_stride; + ref += ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_8xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse = vdupq_n_u32(0); + + int i = height; + do { + sse_8x1_neon(src, ref, &sse); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(sse); +} + +static INLINE uint32_t sse_4xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse = vdupq_n_u32(0); + + int i = height; + do { + sse_4x2_neon(src, src_stride, ref, ref_stride, &sse); + + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x4(sse); +} + +int64_t vpx_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref, + int ref_stride, int width, int height) { + switch (width) { + case 4: return sse_4xh_neon(src, src_stride, ref, ref_stride, height); + case 8: return sse_8xh_neon(src, src_stride, ref, ref_stride, height); + case 16: return sse_16xh_neon(src, src_stride, ref, ref_stride, height); + case 32: return sse_32xh_neon(src, src_stride, ref, ref_stride, height); + case 64: return sse_64xh_neon(src, src_stride, ref, ref_stride, height); + case 128: return sse_128xh_neon(src, src_stride, ref, ref_stride, height); + default: + return sse_wxh_neon(src, src_stride, ref, ref_stride, width, height); + } +} diff --git a/vpx_dsp/arm/sse_neon_dotprod.c b/vpx_dsp/arm/sse_neon_dotprod.c new file mode 100644 index 0000000000..0f11b7cbb2 --- /dev/null +++ b/vpx_dsp/arm/sse_neon_dotprod.c @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE void sse_16x1_neon_dotprod(const uint8_t *src, const uint8_t *ref, + uint32x4_t *sse) { + uint8x16_t s = vld1q_u8(src); + uint8x16_t r = vld1q_u8(ref); + + uint8x16_t abs_diff = vabdq_u8(s, r); + + *sse = vdotq_u32(*sse, abs_diff, abs_diff); +} + +static INLINE void sse_8x1_neon_dotprod(const uint8_t *src, const uint8_t *ref, + uint32x2_t *sse) { + uint8x8_t s = vld1_u8(src); + uint8x8_t r = vld1_u8(ref); + + uint8x8_t abs_diff = vabd_u8(s, r); + + *sse = vdot_u32(*sse, abs_diff, abs_diff); +} + +static INLINE void sse_4x2_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + uint32x2_t *sse) { + uint8x8_t s = load_unaligned_u8(src, src_stride); + uint8x8_t r = load_unaligned_u8(ref, ref_stride); + + uint8x8_t abs_diff = vabd_u8(s, r); + + *sse = vdot_u32(*sse, abs_diff, abs_diff); +} + +static INLINE uint32_t sse_wxh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int width, int height) { + uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) }; + + if ((width & 0x07) && ((width & 0x07) < 5)) { + int i = height; + do { + int j = 0; + do { + sse_8x1_neon_dotprod(src + j, ref + j, &sse[0]); + sse_8x1_neon_dotprod(src + j + src_stride, ref + j + ref_stride, + &sse[1]); + j += 8; + } while (j + 4 < width); + + sse_4x2_neon_dotprod(src + j, src_stride, ref + j, ref_stride, &sse[0]); + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + } else { + int i = height; + do { + int j = 0; + do { + sse_8x1_neon_dotprod(src + j, ref + j, &sse[0]); + sse_8x1_neon_dotprod(src + j + src_stride, ref + j + ref_stride, + &sse[1]); + j += 8; + } while (j < width); + + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + } + return horizontal_add_uint32x4(vcombine_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_128xh_neon_dotprod(const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride, int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon_dotprod(src, ref, &sse[0]); + sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]); + sse_16x1_neon_dotprod(src + 32, ref + 32, &sse[0]); + sse_16x1_neon_dotprod(src + 48, ref + 48, &sse[1]); + sse_16x1_neon_dotprod(src + 64, ref + 64, &sse[0]); + sse_16x1_neon_dotprod(src + 80, ref + 80, &sse[1]); + sse_16x1_neon_dotprod(src + 96, ref + 96, &sse[0]); + sse_16x1_neon_dotprod(src + 112, ref + 112, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_64xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon_dotprod(src, ref, &sse[0]); + sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]); + sse_16x1_neon_dotprod(src + 32, ref + 32, &sse[0]); + sse_16x1_neon_dotprod(src + 48, ref + 48, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_32xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + 
sse_16x1_neon_dotprod(src, ref, &sse[0]); + sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_16xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon_dotprod(src, ref, &sse[0]); + src += src_stride; + ref += ref_stride; + sse_16x1_neon_dotprod(src, ref, &sse[1]); + src += src_stride; + ref += ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_8xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) }; + + int i = height; + do { + sse_8x1_neon_dotprod(src, ref, &sse[0]); + src += src_stride; + ref += ref_stride; + sse_8x1_neon_dotprod(src, ref, &sse[1]); + src += src_stride; + ref += ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x4(vcombine_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_4xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x2_t sse = vdup_n_u32(0); + + int i = height; + do { + sse_4x2_neon_dotprod(src, src_stride, ref, ref_stride, &sse); + + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x2(sse); +} + +int64_t vpx_sse_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int width, + int height) { + switch (width) { + case 4: + return sse_4xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 8: + return sse_8xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 16: + return sse_16xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 32: + return sse_32xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 64: + return sse_64xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 128: + return sse_128xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + default: + return sse_wxh_neon_dotprod(src, src_stride, ref, ref_stride, width, + height); + } +} diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index 75c170df60..11821dc10e 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -221,4 +221,55 @@ static INLINE uint64_t horizontal_add_uint64x2(const uint64x2_t a) { #endif } +static INLINE uint64_t horizontal_long_add_uint32x4_x2(const uint32x4_t a[2]) { + return horizontal_long_add_uint32x4(a[0]) + + horizontal_long_add_uint32x4(a[1]); +} + +static INLINE uint64_t horizontal_long_add_uint32x4_x4(const uint32x4_t a[4]) { + uint64x2_t sum = vpaddlq_u32(a[0]); + sum = vpadalq_u32(sum, a[1]); + sum = vpadalq_u32(sum, a[2]); + sum = vpadalq_u32(sum, a[3]); + + return horizontal_add_uint64x2(sum); +} + +static INLINE uint64_t horizontal_long_add_uint32x4_x8(const uint32x4_t a[8]) { + uint64x2_t sum[2]; + sum[0] = vpaddlq_u32(a[0]); + sum[1] = vpaddlq_u32(a[1]); + sum[0] = vpadalq_u32(sum[0], a[2]); + sum[1] = vpadalq_u32(sum[1], a[3]); + sum[0] = vpadalq_u32(sum[0], a[4]); + sum[1] = vpadalq_u32(sum[1], a[5]); + sum[0] = vpadalq_u32(sum[0], a[6]); + sum[1] = vpadalq_u32(sum[1], a[7]); + + return horizontal_add_uint64x2(vaddq_u64(sum[0], sum[1])); +} + +static INLINE uint64_t 
+horizontal_long_add_uint32x4_x16(const uint32x4_t a[16]) { + uint64x2_t sum[2]; + sum[0] = vpaddlq_u32(a[0]); + sum[1] = vpaddlq_u32(a[1]); + sum[0] = vpadalq_u32(sum[0], a[2]); + sum[1] = vpadalq_u32(sum[1], a[3]); + sum[0] = vpadalq_u32(sum[0], a[4]); + sum[1] = vpadalq_u32(sum[1], a[5]); + sum[0] = vpadalq_u32(sum[0], a[6]); + sum[1] = vpadalq_u32(sum[1], a[7]); + sum[0] = vpadalq_u32(sum[0], a[8]); + sum[1] = vpadalq_u32(sum[1], a[9]); + sum[0] = vpadalq_u32(sum[0], a[10]); + sum[1] = vpadalq_u32(sum[1], a[11]); + sum[0] = vpadalq_u32(sum[0], a[12]); + sum[1] = vpadalq_u32(sum[1], a[13]); + sum[0] = vpadalq_u32(sum[0], a[14]); + sum[1] = vpadalq_u32(sum[1], a[15]); + + return horizontal_add_uint64x2(vaddq_u64(sum[0], sum[1])); +} + #endif // VPX_VPX_DSP_ARM_SUM_NEON_H_ diff --git a/vpx_dsp/sse.c b/vpx_dsp/sse.c new file mode 100644 index 0000000000..6cb4b705f8 --- /dev/null +++ b/vpx_dsp/sse.c @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* + * Sum the square of the difference between every corresponding element of the + * buffers. + */ + +#include <stdlib.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" + +int64_t vpx_sse_c(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int y, x; + int64_t sse = 0; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + const int32_t diff = abs(a[x] - b[x]); + sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } + return sse; +} + +#if CONFIG_VP9_HIGHBITDEPTH +int64_t vpx_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { + int y, x; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + const int32_t diff = (int32_t)(a[x]) - (int32_t)(b[x]); + sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } + return sse; +} +#endif diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 84fd969daa..93abf39ff6 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -31,10 +31,15 @@ DSP_SRCS-yes += bitwriter_buffer.c DSP_SRCS-yes += bitwriter_buffer.h DSP_SRCS-yes += psnr.c DSP_SRCS-yes += psnr.h +DSP_SRCS-yes += sse.c DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.c DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.h DSP_SRCS-$(CONFIG_INTERNAL_STATS) += psnrhvs.c DSP_SRCS-$(CONFIG_INTERNAL_STATS) += fastssim.c +DSP_SRCS-$(HAVE_NEON) += arm/sse_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/sse_neon_dotprod.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/sse_sse4.c +DSP_SRCS-$(HAVE_AVX2) += x86/sse_avx2.c endif ifeq ($(CONFIG_DECODERS),yes) @@ -447,6 +452,7 @@ DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_pred_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_sse_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_subpel_variance_neon.c endif # CONFIG_VP9_HIGHBITDEPTH
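(For orientation, a minimal caller-side sketch of the new helper. vpx_sse is the RTCD-dispatched symbol whose prototype is declared in vpx_dsp_rtcd_defs.pl below; the wrapper and buffer names here are illustrative only, not part of the patch:)

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"

/* Sum of squared differences between an 8-bit source plane and its
 * reconstruction; the two buffers may have independent strides. */
static int64_t plane_distortion(const uint8_t *src, int src_stride,
                                const uint8_t *recon, int recon_stride,
                                int width, int height) {
  return vpx_sse(src, src_stride, recon, recon_stride, width, height);
}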
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index c9cdc285f2..e9d63f6ef2 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -744,6 +744,9 @@ () add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; specialize qw/vpx_subtract_block neon msa mmi sse2 avx2 vsx lsx/; +add_proto qw/int64_t/, "vpx_sse", "const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height"; +specialize qw/vpx_sse sse4_1 avx2 neon neon_dotprod/; + # # Single block SAD # @@ -1026,6 +1029,9 @@ () add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd"; specialize qw/vpx_highbd_subtract_block neon avx2/; + add_proto qw/int64_t/, "vpx_highbd_sse", "const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height"; + specialize qw/vpx_highbd_sse sse4_1 avx2 neon/; + # # Single block SAD # diff --git a/vpx_dsp/x86/sse_avx2.c b/vpx_dsp/x86/sse_avx2.c new file mode 100644 index 0000000000..975446775e --- /dev/null +++ b/vpx_dsp/x86/sse_avx2.c @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <immintrin.h> +#include <smmintrin.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_ports/mem.h" +#include "vpx_dsp/x86/mem_sse2.h" + +static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a, + const uint8_t *b) { + const __m256i v_a0 = _mm256_loadu_si256((const __m256i *)a); + const __m256i v_b0 = _mm256_loadu_si256((const __m256i *)b); + const __m256i zero = _mm256_setzero_si256(); + const __m256i v_a00_w = _mm256_unpacklo_epi8(v_a0, zero); + const __m256i v_a01_w = _mm256_unpackhi_epi8(v_a0, zero); + const __m256i v_b00_w = _mm256_unpacklo_epi8(v_b0, zero); + const __m256i v_b01_w = _mm256_unpackhi_epi8(v_b0, zero); + const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w); + const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w)); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w)); +} + +static INLINE int64_t summary_all_avx2(const __m256i *sum_all) { + int64_t sum; + __m256i zero = _mm256_setzero_si256(); + const __m256i sum0_4x64 = _mm256_unpacklo_epi32(*sum_all, zero); + const __m256i sum1_4x64 = _mm256_unpackhi_epi32(*sum_all, zero); + const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64); + const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64), + _mm256_extracti128_si256(sum_4x64, 1)); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + _mm_storel_epi64((__m128i *)&sum, sum_1x64); + return sum; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void summary_32_avx2(const __m256i *sum32, __m256i *sum) { + const __m256i sum0_4x64 = + _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum32)); + const __m256i sum1_4x64 = + _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum32, 1)); + const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, 
sum1_4x64); + *sum = _mm256_add_epi64(*sum, sum_4x64); +} + +static INLINE int64_t summary_4x64_avx2(const __m256i sum_4x64) { + int64_t sum; + const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64), + _mm256_extracti128_si256(sum_4x64, 1)); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + + _mm_storel_epi64((__m128i *)&sum, sum_1x64); + return sum; +} +#endif + +static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m256i *sum) { + const __m128i v_a0 = load_unaligned_u32(a); + const __m128i v_a1 = load_unaligned_u32(a + a_stride); + const __m128i v_a2 = load_unaligned_u32(a + a_stride * 2); + const __m128i v_a3 = load_unaligned_u32(a + a_stride * 3); + const __m128i v_b0 = load_unaligned_u32(b); + const __m128i v_b1 = load_unaligned_u32(b + b_stride); + const __m128i v_b2 = load_unaligned_u32(b + b_stride * 2); + const __m128i v_b3 = load_unaligned_u32(b + b_stride * 3); + const __m128i v_a0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_a0, v_a1), + _mm_unpacklo_epi32(v_a2, v_a3)); + const __m128i v_b0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_b0, v_b1), + _mm_unpacklo_epi32(v_b2, v_b3)); + const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123); + const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m256i *sum) { + const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride)); + const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride)); + const __m256i v_a_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1)); + const __m256i v_b_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1)); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +int64_t vpx_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int32_t y = 0; + int64_t sse = 0; + __m256i sum = _mm256_setzero_si256(); + __m256i zero = _mm256_setzero_si256(); + switch (width) { + case 4: + do { + sse_w4x4_avx2(a, a_stride, b, b_stride, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 8: + do { + sse_w8x2_avx2(a, a_stride, b, b_stride, &sum); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 16: + do { + const __m128i v_a0 = _mm_loadu_si128((const __m128i *)a); + const __m128i v_a1 = _mm_loadu_si128((const __m128i *)(a + a_stride)); + const __m128i v_b0 = _mm_loadu_si128((const __m128i *)b); + const __m128i v_b1 = _mm_loadu_si128((const __m128i *)(b + b_stride)); + const __m256i v_a = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_a0), v_a1, 0x01); + const __m256i v_b = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_b0), v_b1, 0x01); + const __m256i v_al = _mm256_unpacklo_epi8(v_a, zero); + const __m256i v_au = _mm256_unpackhi_epi8(v_a, zero); + const __m256i v_bl = _mm256_unpacklo_epi8(v_b, zero); + const __m256i v_bu = _mm256_unpackhi_epi8(v_b, zero); + const __m256i v_asub = _mm256_sub_epi16(v_al, v_bl); + const __m256i v_bsub = _mm256_sub_epi16(v_au, v_bu); + const __m256i temp = + 
_mm256_add_epi32(_mm256_madd_epi16(v_asub, v_asub), + _mm256_madd_epi16(v_bsub, v_bsub)); + sum = _mm256_add_epi32(sum, temp); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 32: + do { + sse_w32_avx2(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 64: + do { + sse_w32_avx2(&sum, a, b); + sse_w32_avx2(&sum, a + 32, b + 32); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 128: + do { + sse_w32_avx2(&sum, a, b); + sse_w32_avx2(&sum, a + 32, b + 32); + sse_w32_avx2(&sum, a + 64, b + 64); + sse_w32_avx2(&sum, a + 96, b + 96); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + default: + if ((width & 0x07) == 0) { + do { + int i = 0; + do { + sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum); + i += 8; + } while (i < width); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + } else { + do { + int i = 0; + do { + const uint8_t *a2; + const uint8_t *b2; + sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum); + a2 = a + i + (a_stride << 1); + b2 = b + i + (b_stride << 1); + sse_w8x2_avx2(a2, a_stride, b2, b_stride, &sum); + i += 8; + } while (i + 4 < width); + sse_w4x4_avx2(a + i, a_stride, b + i, b_stride, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + } + sse = summary_all_avx2(&sum); + break; + } + + return sse; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a, + const uint16_t *b) { + const __m256i v_a_w = _mm256_loadu_si256((const __m256i *)a); + const __m256i v_b_w = _mm256_loadu_si256((const __m256i *)b); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w4x4_avx2(__m256i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride)); + const __m128i v_a2 = _mm_loadl_epi64((const __m128i *)(a + a_stride * 2)); + const __m128i v_a3 = _mm_loadl_epi64((const __m128i *)(a + a_stride * 3)); + const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride)); + const __m128i v_b2 = _mm_loadl_epi64((const __m128i *)(b + b_stride * 2)); + const __m128i v_b3 = _mm_loadl_epi64((const __m128i *)(b + b_stride * 3)); + const __m128i v_a_hi = _mm_unpacklo_epi64(v_a0, v_a1); + const __m128i v_a_lo = _mm_unpacklo_epi64(v_a2, v_a3); + const __m256i v_a_w = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_a_lo), v_a_hi, 1); + const __m128i v_b_hi = _mm_unpacklo_epi64(v_b0, v_b1); + const __m128i v_b_lo = _mm_unpacklo_epi64(v_b2, v_b3); + const __m256i v_b_w = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_b_lo), v_b_hi, 1); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w8x2_avx2(__m256i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a_hi = _mm_loadu_si128((const __m128i *)(a + a_stride)); + const __m128i v_a_lo = _mm_loadu_si128((const __m128i *)a); + const __m256i v_a_w = + 
_mm256_insertf128_si256(_mm256_castsi128_si256(v_a_lo), v_a_hi, 1); + const __m128i v_b_hi = _mm_loadu_si128((const __m128i *)(b + b_stride)); + const __m128i v_b_lo = _mm_loadu_si128((const __m128i *)b); + const __m256i v_b_w = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_b_lo), v_b_hi, 1); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +int64_t vpx_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { + int32_t y = 0; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + __m256i sum = _mm256_setzero_si256(); + switch (width) { + case 4: + do { + highbd_sse_w4x4_avx2(&sum, a, a_stride, b, b_stride); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 8: + do { + highbd_sse_w8x2_avx2(&sum, a, a_stride, b, b_stride); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 16: + do { + highbd_sse_w16_avx2(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 32: + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16, b + 16); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 64 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 64; + } while (y < height); + sse = summary_4x64_avx2(sum); + break; + case 64: + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1); + highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2); + highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 32 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 32; + } while (y < height); + sse = summary_4x64_avx2(sum); + break; + case 128: + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1); + highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2); + highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3); + highbd_sse_w16_avx2(&sum32, a + 16 * 4, b + 16 * 4); + highbd_sse_w16_avx2(&sum32, a + 16 * 5, b + 16 * 5); + highbd_sse_w16_avx2(&sum32, a + 16 * 6, b + 16 * 6); + highbd_sse_w16_avx2(&sum32, a + 16 * 7, b + 16 * 7); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 16 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 16; + } while (y < height); + sse = summary_4x64_avx2(sum); + break; + default: + if (width & 0x7) { + do { + int i = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + const uint16_t *a2; + const uint16_t *b2; + highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride); + a2 = a + i + (a_stride << 1); + b2 = b + i + (b_stride << 1); + highbd_sse_w8x2_avx2(&sum32, a2, a_stride, b2, b_stride); + i += 8; + } while (i + 4 < width); + highbd_sse_w4x4_avx2(&sum32, a + i, a_stride, b + i, b_stride); + summary_32_avx2(&sum32, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + } else { + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + int i = 0; + do { + highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride); + i += 8; + } while (i < width); + a += 
a_stride << 1; + b += b_stride << 1; + l += 2; + } while (l < 8 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 8; + } while (y < height); + } + sse = summary_4x64_avx2(sum); + break; + } + return sse; +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/sse_sse4.c b/vpx_dsp/x86/sse_sse4.c new file mode 100644 index 0000000000..1c2744e2fa --- /dev/null +++ b/vpx_dsp/x86/sse_sse4.c @@ -0,0 +1,359 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <emmintrin.h> +#include <smmintrin.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_ports/mem.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/mem_sse2.h" + +static INLINE int64_t summary_all_sse4(const __m128i *sum_all) { + int64_t sum; + const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all); + const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8)); + const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + _mm_storel_epi64((__m128i *)&sum, sum_1x64); + return sum; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void summary_32_sse4(const __m128i *sum32, __m128i *sum64) { + const __m128i sum0 = _mm_cvtepu32_epi64(*sum32); + const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum32, 8)); + *sum64 = _mm_add_epi64(sum0, *sum64); + *sum64 = _mm_add_epi64(sum1, *sum64); +} +#endif + +static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a, + const uint8_t *b) { + const __m128i v_a0 = _mm_loadu_si128((const __m128i *)a); + const __m128i v_b0 = _mm_loadu_si128((const __m128i *)b); + const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0); + const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8)); + const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0); + const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8)); + const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w); + const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w)); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w)); +} + +static INLINE void sse4x2_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m128i *sum) { + const __m128i v_a0 = load_unaligned_u32(a); + const __m128i v_a1 = load_unaligned_u32(a + a_stride); + const __m128i v_b0 = load_unaligned_u32(b); + const __m128i v_b1 = load_unaligned_u32(b + b_stride); + const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1)); + const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1)); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void sse8_sse4_1(const uint8_t *a, const uint8_t *b, + __m128i *sum) { + const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0); + const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +int64_t vpx_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int 
width, int height) { + int y = 0; + int64_t sse = 0; + __m128i sum = _mm_setzero_si128(); + switch (width) { + case 4: + do { + sse4x2_sse4_1(a, a_stride, b, b_stride, &sum); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 8: + do { + sse8_sse4_1(a, b, &sum); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 16: + do { + sse_w16_sse4_1(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 32: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16, b + 16); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 64: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1); + sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); + sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 128: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1); + sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); + sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); + sse_w16_sse4_1(&sum, a + 16 * 4, b + 16 * 4); + sse_w16_sse4_1(&sum, a + 16 * 5, b + 16 * 5); + sse_w16_sse4_1(&sum, a + 16 * 6, b + 16 * 6); + sse_w16_sse4_1(&sum, a + 16 * 7, b + 16 * 7); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + default: + if (width & 0x07) { + do { + int i = 0; + do { + sse8_sse4_1(a + i, b + i, &sum); + sse8_sse4_1(a + i + a_stride, b + i + b_stride, &sum); + i += 8; + } while (i + 4 < width); + sse4x2_sse4_1(a + i, a_stride, b + i, b_stride, &sum); + a += (a_stride << 1); + b += (b_stride << 1); + y += 2; + } while (y < height); + } else { + do { + int i = 0; + do { + sse8_sse4_1(a + i, b + i, &sum); + i += 8; + } while (i < width); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + } + sse = summary_all_sse4(&sum); + break; + } + + return sse; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void highbd_sse_w4x2_sse4_1(__m128i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride)); + const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride)); + const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1); + const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a, + const uint16_t *b) { + const __m128i v_a_w = _mm_loadu_si128((const __m128i *)a); + const __m128i v_b_w = _mm_loadu_si128((const __m128i *)b); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +int64_t vpx_highbd_sse_sse4_1(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int width, + int height) { + int32_t y = 0; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + __m128i sum = _mm_setzero_si128(); + switch (width) { + case 4: + do { + highbd_sse_w4x2_sse4_1(&sum, a, a_stride, b, b_stride); + a += a_stride << 
1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 8: + do { + highbd_sse_w8_sse4_1(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 16: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8, b + 8); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 64 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 64; + } while (y < height); + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + case 32: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 32 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 32; + } while (y < height); + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + case 64: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 16 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 16; + } while (y < height); + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + case 128: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 8, b + 8 * 8); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 9, b + 8 * 9); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 10, b + 8 * 10); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 11, b + 8 * 11); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 12, b + 8 * 12); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 13, b + 8 * 13); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 14, b + 8 * 14); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 15, b + 8 * 15); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 8 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 8; + } while (y < height); + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + default: + if (width & 0x7) { + do { + __m128i sum32 = _mm_setzero_si128(); + int i = 0; + do { + highbd_sse_w8_sse4_1(&sum32, a + i, b + i); + highbd_sse_w8_sse4_1(&sum32, a + i + a_stride, b + i + b_stride); + i += 8; + } while (i + 4 < width); + highbd_sse_w4x2_sse4_1(&sum32, a + i, a_stride, b + i, b_stride); + a += (a_stride << 1); + b += (b_stride << 1); + y += 2; + summary_32_sse4(&sum32, &sum); + } while (y < height); + } else { + do { 
+ int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + int i = 0; + do { + highbd_sse_w8_sse4_1(&sum32, a + i, b + i); + i += 8; + } while (i < width); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 8 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 8; + } while (y < height); + } + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + } + return sse; +} +#endif // CONFIG_VP9_HIGHBITDEPTH From 56c78a68b0a881b7f039c0de662cc758ab40d08c Mon Sep 17 00:00:00 2001 From: Hirokazu Honda Date: Tue, 21 Nov 2023 17:03:53 +0900 Subject: [PATCH 873/926] ratectrl_rtc: Remove duplicated FrameDropDecision enum class FrameDropDecision is declared in two places. This moves it to the common place. Change-Id: I04c16db4a49135588edff7e1746dcf9172750bb9 --- vp8/vp8_ratectrl_rtc.h | 5 ----- vp9/ratectrl_rtc.h | 5 ----- vpx/internal/vpx_ratectrl_rtc.h | 5 +++++ 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/vp8/vp8_ratectrl_rtc.h b/vp8/vp8_ratectrl_rtc.h index 4c174b1315..59fb607526 100644 --- a/vp8/vp8_ratectrl_rtc.h +++ b/vp8/vp8_ratectrl_rtc.h @@ -33,11 +33,6 @@ struct VP8FrameParamsQpRTC { int temporal_layer_id; }; -enum class FrameDropDecision { - kOk, // Frame is encoded. - kDrop, // Frame is dropped. -}; - class VP8RateControlRTC { public: static std::unique_ptr<VP8RateControlRTC> Create( diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index 7f624a5fe3..85005c5474 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -64,11 +64,6 @@ struct VP9SegmentationData { size_t delta_q_size; }; -enum class FrameDropDecision { - kOk, // Frame is encoded. - kDrop, // Frame is dropped. -}; - // This interface allows using VP9 real-time rate control without initializing // the encoder. To use this interface, you need to link with libvpxrc.a. // diff --git a/vpx/internal/vpx_ratectrl_rtc.h b/vpx/internal/vpx_ratectrl_rtc.h index eb90cd1d0c..6ffd798eb2 100644 --- a/vpx/internal/vpx_ratectrl_rtc.h +++ b/vpx/internal/vpx_ratectrl_rtc.h @@ -17,6 +17,11 @@ namespace libvpx { enum class RcFrameType { kKeyFrame = 0, kInterFrame = 1 }; +enum class FrameDropDecision { + kOk, // Frame is encoded. + kDrop, // Frame is dropped. +}; + struct VpxRateControlRtcConfig { public: VpxRateControlRtcConfig() {
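(A brief consumer-side sketch of the now-shared enum; the rc_api object and its ComputeQP/GetQP/PostEncodeUpdate methods come from the rate-control patches earlier in this series, and this fragment is illustrative rather than part of the patch:)

// The controller decides per frame whether to encode or drop.
libvpx::FrameDropDecision decision = rc_api->ComputeQP(frame_params);
if (decision == libvpx::FrameDropDecision::kDrop) {
  // Skip encoding this frame; the controller already accounted for the drop.
} else {
  int qp = rc_api->GetQP();
  // ... encode the frame with qp, then report its size back:
  // rc_api->PostEncodeUpdate(encoded_frame_size);
}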
From 741b8f6228984e888c99849d7675ea4132eaf268 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 21 Nov 2023 11:18:54 -0500 Subject: [PATCH 874/926] Check null ptr before use prev_mi is a pointer to a pointer Bug: b/310401647 Bug: b/310590556 Change-Id: Ic3c39a7eec14693357bd2485a5451d4b7f031b5e --- test/encode_api_test.cc | 85 +++++++++++++++++++++++++++++++++++ vp9/encoder/vp9_encodeframe.c | 16 +++---- 2 files changed, 92 insertions(+), 9 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 770052c859..d2469572a6 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -496,6 +496,91 @@ TEST(EncodeAPI, ConfigLargeTargetBitrateVp9) { } #endif // VPX_ARCH_X86_64 || VPX_ARCH_AARCH64 +vpx_image_t *CreateImage(const unsigned int width, const unsigned int height) { + vpx_image_t *image = + vpx_img_alloc(nullptr, VPX_IMG_FMT_I420, width, height, 1); + if (!image) return image; + + for (unsigned int i = 0; i < image->d_h; ++i) { + memset(image->planes[0] + i * image->stride[0], 128, image->d_w); + } + const unsigned int uv_h = (image->d_h + 1) / 2; + const unsigned int uv_w = (image->d_w + 1) / 2; + for (unsigned int i = 0; i < uv_h; ++i) { + memset(image->planes[1] + i * image->stride[1], 128, uv_w); + memset(image->planes[2] + i * image->stride[2], 128, uv_w); + } + + return image; +} + +// This is a test case from clusterfuzz. +TEST(EncodeAPI, PrevMiCheckNullptr) { + vpx_codec_iface_t *const iface = vpx_codec_vp9_cx(); + vpx_codec_enc_cfg_t cfg; + + struct Config { + unsigned int thread; + unsigned int width; + unsigned int height; + vpx_rc_mode end_usage; + unsigned long deadline; + }; + struct Config init_config = { 0, 1554, 644, VPX_VBR, 1 }; + unsigned long deadline = init_config.deadline; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, /*usage=*/0), + VPX_CODEC_OK); + cfg.g_threads = init_config.thread; + cfg.g_w = init_config.width; + cfg.g_h = init_config.height; + cfg.g_timebase.num = 1; + cfg.g_timebase.den = 1000 * 1000; // microseconds + cfg.g_pass = VPX_RC_ONE_PASS; + cfg.g_lag_in_frames = 0; + cfg.rc_end_usage = init_config.end_usage; + cfg.rc_min_quantizer = 2; + cfg.rc_max_quantizer = 58; + + vpx_codec_ctx_t enc; + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, 0), VPX_CODEC_OK); + + const vpx_codec_cx_pkt_t *pkt; + + int frame_index = 0; + // First step: encode, without forcing KF. 
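+ // (The three steps below replay the fuzzer scenario: encode once at the + // initial size, shrink the frame via vpx_codec_enc_config_set(), then + // encode again. After the resize the prev_mi grid base is non-null while + // individual entries can still be null, hence the prev_mi[0] check below.)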
+ vpx_image_t *image = CreateImage(cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + ASSERT_EQ(vpx_codec_encode(&enc, image, frame_index, 1, 0, deadline), + VPX_CODEC_OK); + frame_index++; + vpx_codec_iter_t iter = nullptr; + while ((pkt = vpx_codec_get_cx_data(&enc, &iter)) != nullptr) { + ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); + } + vpx_img_free(image); + // Second step: change config + struct Config encode_config = { 0, 1131, 644, VPX_CBR, 1000000 }; + cfg.g_threads = encode_config.thread; + cfg.g_w = encode_config.width; + cfg.g_h = encode_config.height; + cfg.rc_end_usage = encode_config.end_usage; + deadline = encode_config.deadline; + ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK) + << vpx_codec_error_detail(&enc); + // Third step: encode, without forcing KF + image = CreateImage(cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + ASSERT_EQ(vpx_codec_encode(&enc, image, frame_index, 1, 0, deadline), + VPX_CODEC_OK); + frame_index++; + while ((pkt = vpx_codec_get_cx_data(&enc, &iter)) != nullptr) { + ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); + } + vpx_img_free(image); + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); +} + class EncodeApiGetTplStatsTest : public ::libvpx_test::EncoderTest, public ::testing::TestWithParam<const ::libvpx_test::CodecFactory *> { diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 67869596b1..63306ac381 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -3052,14 +3052,12 @@ static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd, int mi_row, min_size = BLOCK_64X64; max_size = BLOCK_4X4; - if (prev_mi) { - for (idy = 0; idy < mi_height; ++idy) { - for (idx = 0; idx < mi_width; ++idx) { - mi = prev_mi[idy * cm->mi_stride + idx]; - bs = mi ? mi->sb_type : bsize; - min_size = VPXMIN(min_size, bs); - max_size = VPXMAX(max_size, bs); - } + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + mi = prev_mi[idy * cm->mi_stride + idx]; + bs = mi ? mi->sb_type : bsize; + min_size = VPXMIN(min_size, bs); + max_size = VPXMAX(max_size, bs); + } } @@ -3205,7 +3203,7 @@ static int ml_pruning_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd, left_par = 1; } - if (prev_mi) { + if (prev_mi[0]) { context_size = prev_mi[0]->sb_type; if (context_size < bsize) last_par = 2; From 79257fd4595299415236d57fc750dcbb2c276ccb Mon Sep 17 00:00:00 2001 From: Jingning Han Date: Wed, 22 Nov 2023 15:06:08 -0800 Subject: [PATCH 875/926] Conditionally skip using inter frames in speed features When the reference frame's scaling factor is not in the supported range, skip using it for motion compensation prediction in the partition speed features. BUG=b/312517065 Change-Id: Ie3687186521ad2616be258e80d3e5b16e5f2d5e9 --- vp9/encoder/vp9_encodeframe.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 63306ac381..d0863b723b 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -1301,6 +1301,13 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, (frame_is_intra_only(cm) || (is_one_pass_svc(cpi) && cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)); + + if (!is_key_frame) { + if (cm->frame_refs[LAST_FRAME - 1].sf.x_scale_fp == REF_INVALID_SCALE || + cm->frame_refs[LAST_FRAME - 1].sf.y_scale_fp == REF_INVALID_SCALE) + is_key_frame = 1; + } + + // Always use 4x4 partition for key frame. 
const int use_4x4_partition = frame_is_intra_only(cm); const int low_res = (cm->width <= 352 && cm->height <= 288); From b562fdd4e6833110b0c749c8aaf690e5fc16f1cd Mon Sep 17 00:00:00 2001 From: Jingning Han Date: Wed, 22 Nov 2023 15:07:04 -0800 Subject: [PATCH 876/926] Remove invalid reference frames Remove the reference frames whose scaling factor is not in the supported range. BUG=b/312517065 Change-Id: Iaf8610ff7a95cd4a433bf529f741459d820d4f8b --- vp9/encoder/vp9_encodeframe.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index d0863b723b..b98fd84579 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -6144,6 +6144,15 @@ static void encode_frame_internal(VP9_COMP *cpi) { cpi->rd.r0 = (double)intra_cost_base / mc_dep_cost_base; } + for (MV_REFERENCE_FRAME ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; + ++ref_frame) { + if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) { + if (cm->frame_refs[ref_frame - 1].sf.x_scale_fp == REF_INVALID_SCALE || + cm->frame_refs[ref_frame - 1].sf.y_scale_fp == REF_INVALID_SCALE) + cpi->ref_frame_flags &= ~ref_frame_to_flag(ref_frame); + } + } + // Frame segmentation if (cpi->oxcf.aq_mode == PERCEPTUAL_AQ) build_kmeans_segmentation(cpi); From 635eba3319cb6e20c2a72902b6fb3dd2fc8e93ef Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Wed, 22 Nov 2023 15:38:04 -0800 Subject: [PATCH 877/926] Adding "const" to vpx_codec_iface_t is redundant vpx_codec_iface_t is defined as follows: typedef const struct vpx_codec_iface vpx_codec_iface_t; Since vpx_codec_iface_t is already a const struct, it is redundant to add "const" to vpx_codec_iface_t. Note: I think vpx_codec_iface_t should not have been defined as a const struct, but it is too late to change that now. Change-Id: Ifbd3f8a63c1d48e9169ff77fa0b505ea1e65519d --- test/decode_api_test.cc | 6 +++--- test/encode_api_test.cc | 8 ++++---- test/level_test.cc | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/test/decode_api_test.cc b/test/decode_api_test.cc index 9e82ace1b8..44e4397726 100644 --- a/test/decode_api_test.cc +++ b/test/decode_api_test.cc @@ -20,7 +20,7 @@ namespace { #define NELEMENTS(x) static_cast(sizeof(x) / sizeof(x[0])) TEST(DecodeAPI, InvalidParams) { - static const vpx_codec_iface_t *kCodecs[] = { + static vpx_codec_iface_t *kCodecs[] = { #if CONFIG_VP8_DECODER &vpx_codec_vp8_dx_algo, #endif @@ -120,7 +120,7 @@ void TestVp9Controls(vpx_codec_ctx_t *dec) { } TEST(DecodeAPI, Vp9InvalidDecode) { - const vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo; + vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo; const char filename[] = "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf"; libvpx_test::IVFVideoSource video(filename); @@ -147,7 +147,7 @@ TEST(DecodeAPI, Vp9InvalidDecode) { void TestPeekInfo(const uint8_t *const data, uint32_t data_sz, uint32_t peek_size) { - const vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo; + vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo; // Verify behavior of vpx_codec_decode. vpx_codec_decode doesn't even get // to decoder_peek_si_internal on frames of size < 8. 
if (data_sz >= 8) { diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index d2469572a6..cbbbb4ddce 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -28,7 +28,7 @@ namespace { -const vpx_codec_iface_t *kCodecIfaces[] = { +vpx_codec_iface_t *kCodecIfaces[] = { #if CONFIG_VP8_ENCODER &vpx_codec_vp8_cx_algo, #endif @@ -37,7 +37,7 @@ const vpx_codec_iface_t *kCodecIfaces[] = { #endif }; -bool IsVP9(const vpx_codec_iface_t *iface) { +bool IsVP9(vpx_codec_iface_t *iface) { static const char kVP9Name[] = "WebM Project VP9"; return strncmp(kVP9Name, vpx_codec_iface_name(iface), sizeof(kVP9Name) - 1) == 0; @@ -259,7 +259,7 @@ TEST(EncodeAPI, MultiResEncode) { TEST(EncodeAPI, SetRoi) { static struct { - const vpx_codec_iface_t *iface; + vpx_codec_iface_t *iface; int ctrl_id; } kCodecs[] = { #if CONFIG_VP8_ENCODER @@ -365,7 +365,7 @@ TEST(EncodeAPI, SetRoi) { } } -void InitCodec(const vpx_codec_iface_t &iface, int width, int height, +void InitCodec(vpx_codec_iface_t &iface, int width, int height, vpx_codec_ctx_t *enc, vpx_codec_enc_cfg_t *cfg) { cfg->g_w = width; cfg->g_h = height; diff --git a/test/level_test.cc b/test/level_test.cc index 3f1cf9f1c5..36cfd645c9 100644 --- a/test/level_test.cc +++ b/test/level_test.cc @@ -120,7 +120,7 @@ TEST_P(LevelTest, TestTargetLevel255) { TEST_P(LevelTest, TestTargetLevelApi) { ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, 1); - static const vpx_codec_iface_t *codec = &vpx_codec_vp9_cx_algo; + static vpx_codec_iface_t *codec = &vpx_codec_vp9_cx_algo; vpx_codec_ctx_t enc; vpx_codec_enc_cfg_t cfg; EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(codec, &cfg, 0)); From 3bd54a37d0f820970352326941224afb618af808 Mon Sep 17 00:00:00 2001 From: Jingning Han Date: Wed, 22 Nov 2023 15:38:27 -0800 Subject: [PATCH 878/926] Disable intra mode search speed features conditionally When all the inter reference frames are invalid, disable the speed features that bypass intra mode search. BUG=b/312517065 Change-Id: I246c953fad3be61b9d307da11c752a21a36b90ff --- vp9/encoder/vp9_rdopt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index fc06967105..974e43c90f 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -3606,7 +3606,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, mode_skip_mask[GOLDEN_FRAME] |= INTER_ALL; } - if (bsize > sf->max_intra_bsize) { + if (bsize > sf->max_intra_bsize && cpi->ref_frame_flags != 0) { ref_frame_skip_mask[0] |= (1 << INTRA_FRAME); ref_frame_skip_mask[1] |= (1 << INTRA_FRAME); } From 366425079ba685bcd78511297dacb327ec363abe Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Wed, 22 Nov 2023 14:49:56 -0800 Subject: [PATCH 879/926] Tests kf_max_dist in one-pass zero-lag encoding The test shows that the comment for kf_max_dist in vpx/vpx_encoder.h differs from its behavior by one. We should modify the comment to match the encoding behavior. Bug: webm:1829 Change-Id: Icdc58b8f6b25353f10ce8ecc481c862bd3fe86df --- test/keyframe_test.cc | 107 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) diff --git a/test/keyframe_test.cc b/test/keyframe_test.cc index dabf88e415..5292bb188d 100644 --- a/test/keyframe_test.cc +++ b/test/keyframe_test.cc @@ -8,12 +8,18 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ #include <climits> +#include <cstring> #include "third_party/googletest/src/include/gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" #include "test/util.h" +#include "./vpx_config.h" +#include "vpx/vp8cx.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_encoder.h" +#include "vpx/vpx_image.h" namespace { @@ -146,4 +152,105 @@ TEST_P(KeyframeTest, TestAutoKeyframe) { } VP8_INSTANTIATE_TEST_SUITE(KeyframeTest, ALL_TEST_MODES); + +bool IsVP9(vpx_codec_iface_t *iface) { + static const char kVP9Name[] = "WebM Project VP9"; + return strncmp(kVP9Name, vpx_codec_iface_name(iface), sizeof(kVP9Name) - 1) == + 0; +} + +vpx_image_t *CreateGrayImage(vpx_img_fmt_t fmt, unsigned int w, + unsigned int h) { + vpx_image_t *const image = vpx_img_alloc(nullptr, fmt, w, h, 1); + if (!image) return image; + + for (unsigned int i = 0; i < image->d_h; ++i) { + memset(image->planes[0] + i * image->stride[0], 128, image->d_w); + } + const unsigned int uv_h = (image->d_h + 1) / 2; + const unsigned int uv_w = (image->d_w + 1) / 2; + for (unsigned int i = 0; i < uv_h; ++i) { + memset(image->planes[1] + i * image->stride[1], 128, uv_w); + memset(image->planes[2] + i * image->stride[2], 128, uv_w); + } + return image; +} + +// Tests kf_max_dist in one-pass encoding with zero lag. +void TestKeyframeMaximumInterval(vpx_codec_iface_t *iface, + unsigned long deadline, + unsigned int kf_max_dist) { + vpx_codec_enc_cfg_t cfg; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, /*usage=*/0), + VPX_CODEC_OK); + cfg.g_w = 320; + cfg.g_h = 240; + cfg.g_pass = VPX_RC_ONE_PASS; + cfg.g_lag_in_frames = 0; + cfg.kf_mode = VPX_KF_AUTO; + cfg.kf_min_dist = 0; + cfg.kf_max_dist = kf_max_dist; + + vpx_codec_ctx_t enc; + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + const int speed = IsVP9(iface) ? 9 : -12; + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, speed), VPX_CODEC_OK); + + vpx_image_t *image = CreateGrayImage(VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode frames. + const vpx_codec_cx_pkt_t *pkt; + const unsigned int num_frames = kf_max_dist == 0 ? 4 : 3 * kf_max_dist + 1; + for (unsigned int i = 0; i < num_frames; ++i) { + ASSERT_EQ(vpx_codec_encode(&enc, image, i, 1, 0, deadline), VPX_CODEC_OK); + vpx_codec_iter_t iter = nullptr; + while ((pkt = vpx_codec_get_cx_data(&enc, &iter)) != nullptr) { + ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); + if (kf_max_dist == 0 || i % kf_max_dist == 0) { + ASSERT_EQ(pkt->data.frame.flags & VPX_FRAME_IS_KEY, VPX_FRAME_IS_KEY); + } else { + ASSERT_EQ(pkt->data.frame.flags & VPX_FRAME_IS_KEY, 0u); + } + } + } + + // Flush the encoder. 
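+ // (Passing a null image signals end-of-stream; with g_lag_in_frames = 0 + // nothing should be pending, but draining until no packet is returned is + // still the documented way to finish an encode.)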
+ bool got_data; + do { + ASSERT_EQ(vpx_codec_encode(&enc, nullptr, 0, 1, 0, deadline), VPX_CODEC_OK); + got_data = false; + vpx_codec_iter_t iter = nullptr; + while ((pkt = vpx_codec_get_cx_data(&enc, &iter)) != nullptr) { + ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); + got_data = true; + } + } while (got_data); + + vpx_img_free(image); + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); +} + +TEST(KeyframeIntervalTest, KeyframeMaximumInterval) { + std::vector<vpx_codec_iface_t *> ifaces; +#if CONFIG_VP8_ENCODER + ifaces.push_back(vpx_codec_vp8_cx()); +#endif +#if CONFIG_VP9_ENCODER + ifaces.push_back(vpx_codec_vp9_cx()); +#endif + for (vpx_codec_iface_t *iface : ifaces) { + for (unsigned long deadline : + { VPX_DL_REALTIME, VPX_DL_GOOD_QUALITY, VPX_DL_BEST_QUALITY }) { + // Test 0 and 1 (both mean all intra), some powers of 2, some multiples + // of 10, and some prime numbers. + for (unsigned int kf_max_dist : + { 0, 1, 2, 3, 4, 7, 10, 13, 16, 20, 23, 29, 32 }) { + TestKeyframeMaximumInterval(iface, deadline, kf_max_dist); + } + } + } +} + } // namespace From 9b729500d5a257d4694b9d50237ebf0e77b5fff9 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 23 Nov 2023 10:13:46 +0000 Subject: [PATCH 880/926] Delete redundant code in Neon SDOT/USDOT vertical convolutions Delete redundant transpose/permute code in the Neon dot-product vertical convolution paths. Variable values were assigned but never used before subsequent assignment. Change-Id: I15b29d0c993f56599e0d18ac1d5787e6385d2a3a --- vpx_dsp/arm/vpx_convolve8_neon_dotprod.c | 30 --------------------- vpx_dsp/arm/vpx_convolve8_neon_i8mm.c | 34 ------------------------ 2 files changed, 64 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c b/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c index bf01364cf7..75e9d03929 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c +++ b/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c @@ -397,9 +397,6 @@ void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. @@ -408,9 +405,6 @@ void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); do { uint8x8_t t7, t8, t9, t10; @@ -478,9 +472,6 @@ void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. 
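 * (The deletions in this and the following hunks are all the same fix: s7,
 * s8 and s9 were zero-initialised, and s4567/s5678/s6789 derived from them,
 * before the loop, yet the first loop iteration loads fresh rows and
 * re-derives all of those values before anything reads them, so the
 * pre-loop assignments were dead stores.)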
@@ -493,12 +484,6 @@ void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, tran_concat_tbl); transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); do { uint8x8_t t7, t8, t9, t10; @@ -601,9 +586,6 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. @@ -612,9 +594,6 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); do { uint8x8_t t7, t8, t9, t10; @@ -688,9 +667,6 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. @@ -703,12 +679,6 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, tran_concat_tbl); transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); do { uint8x8_t t7, t8, t9, t10; diff --git a/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c b/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c index e0e482e3f5..2ea587e622 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c +++ b/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c @@ -375,10 +375,6 @@ void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); src += 7 * src_stride; - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. 
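 * (Each SDOT instruction multiplies four adjacent byte lanes against four
 * filter taps, but the four samples a vertical filter needs live in four
 * different rows; transposing 4x4 blocks of rows lines those cross-row
 * samples up contiguously so that one dot product evaluates four taps per
 * output pixel.)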
*/ @@ -386,9 +382,6 @@ void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); do { load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); @@ -441,10 +434,6 @@ void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. */ @@ -456,12 +445,6 @@ void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, tran_concat_tbl); transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); do { load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); @@ -544,10 +527,6 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); src += 7 * src_stride; - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. */ @@ -555,9 +534,6 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); do { load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); @@ -616,10 +592,6 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. */ @@ -631,12 +603,6 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, tran_concat_tbl); transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); do { load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); From 1b3ec0676cfbb79cc1ceb17add1af543ad8d4f4c Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 23 Nov 2023 15:11:11 +0000 Subject: [PATCH 881/926] Make reporting of filter sizes more granular vpx_get_filter_taps() currently reports either 8-tap or 2-tap. 
However, many 8-tap filters are actually 0-padded, resulting in a lot
of redundant work (multiplying by, and adding, 0) when processed with
an 8-tap convolution function.

In preparation for adding 2- and 4-tap SIMD implementations for the
convolution paths, make the filter size reporting more granular,
stripping any 0 padding. Filter sizes can now be reported as 2-, 4-,
6- or 8-tap.

Change-Id: I100133aac7173134af34b918c9ad3007d98d6060
---
 vpx_dsp/vpx_filter.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/vpx_dsp/vpx_filter.h b/vpx_dsp/vpx_filter.h
index 54357ee6ca..0cddcb6991 100644
--- a/vpx_dsp/vpx_filter.h
+++ b/vpx_dsp/vpx_filter.h
@@ -29,10 +29,16 @@ typedef int16_t InterpKernel[SUBPEL_TAPS];
 
 static INLINE int vpx_get_filter_taps(const int16_t *const filter) {
   assert(filter[3] != 128);
-  if (!filter[0] && !filter[1] && !filter[2])
-    return 2;
-  else
+  if (filter[0] | filter[7]) {
     return 8;
+  }
+  if (filter[1] | filter[6]) {
+    return 6;
+  }
+  if (filter[2] | filter[5]) {
+    return 4;
+  }
+  return 2;
 }
 
 #ifdef __cplusplus

From bdc9e1c9d4e8ac536e5e8e9a6c905c05fad8a03a Mon Sep 17 00:00:00 2001
From: Jonathan Wright
Date: Wed, 22 Nov 2023 17:38:32 +0000
Subject: [PATCH 882/926] Specialise Armv8.4 Neon vert convolution for 4-tap
 filters

Add an Armv8.4 SDOT Neon implementation of vertical convolution
specialised for executing with 4-tap filters (the most common filter
size for settings --good --cpu-used=1.) This new path is also used
when executing with bilinear (2-tap) filters.

Change-Id: I3eb00b5a34f5676b68bda60a2a29be56e3d7d0cd
---
 vpx_dsp/arm/vpx_convolve8_neon.h         |  27 +++
 vpx_dsp/arm/vpx_convolve8_neon_dotprod.c | 221 ++++++++++++++++++++---
 vpx_dsp/arm/vpx_convolve_neon_dotprod.c  |  26 ++-
 3 files changed, 240 insertions(+), 34 deletions(-)

diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h
index 025e943cc4..52df15a121 100644
--- a/vpx_dsp/arm/vpx_convolve8_neon.h
+++ b/vpx_dsp/arm/vpx_convolve8_neon.h
@@ -26,6 +26,33 @@ void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src,
                                          int x_step_q4, int y0_q4,
                                          int y_step_q4, int w, int h);
 
+static INLINE int16x4_t convolve4_4_sdot_partial(const int8x16_t samples,
+                                                 const int32x4_t correction,
+                                                 const int8x8_t filters) {
+  /* Accumulate dot product into 'correction' to account for range clamp. */
+  int32x4_t sum = vdotq_lane_s32(correction, samples, filters, 0);
+
+  /* Further narrowing and packing is performed by the caller. */
+  return vmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve4_8_sdot_partial(const int8x16_t samples_lo,
+                                                 const int8x16_t samples_hi,
+                                                 const int32x4_t correction,
+                                                 const int8x8_t filters) {
+  /* Sample range-clamping and permutation are performed by the caller. */
+  /* Accumulate dot product into 'correction' to account for range clamp. */
+  /* First 4 output values. */
+  int32x4_t sum0 = vdotq_lane_s32(correction, samples_lo, filters, 0);
+  /* Second 4 output values. */
+  int32x4_t sum1 = vdotq_lane_s32(correction, samples_hi, filters, 0);
+
+  /* Narrow and re-pack. */
+  int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
+  /* We halved the filter values so -1 from right shift.
*/ + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, const int8x16_t samples_hi, const int32x4_t correction, diff --git a/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c b/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c index 75e9d03929..cdb410bd58 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c +++ b/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c @@ -356,29 +356,166 @@ static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); } -void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, - int w, int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x8_t range_limit = vdup_n_u8(128); +static INLINE void vpx_convolve_4tap_vert_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, + const int32x4_t correction, const uint8x8_t range_limit) { const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); uint8x8_t t0, t1, t2, t3, t4, t5, t6; int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; int8x16x2_t samples_LUT; - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(y_step_q4 == 16); + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + int8x16_t s0123, s1234, s2345, s3456, s78910; + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; - (void)x0_q4; - (void)x_step_q4; - (void)y_step_q4; + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + src += 7 * src_stride; - src -= 3 * src_stride; + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + d0 = convolve4_4_sdot_partial(s0123, correction, filter); + d1 = convolve4_4_sdot_partial(s1234, correction, filter); + d2 = convolve4_4_sdot_partial(s2345, correction, filter); + d3 = convolve4_4_sdot_partial(s3456, correction, filter); + /* We halved the filter values so -1 from right shift. 
*/ + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s0123 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s1234 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s2345 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s += 7 * src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + d0 = convolve4_8_sdot_partial(s0123_lo, s0123_hi, correction, filter); + d1 = convolve4_8_sdot_partial(s1234_lo, s1234_hi, correction, filter); + d2 = convolve4_8_sdot_partial(s2345_lo, s2345_hi, correction, filter); + d3 = convolve4_8_sdot_partial(s3456_lo, s3456_hi, correction, filter); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Merge new data into block from previous iteration. 
*/ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s0123_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s1234_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s2345_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + s3456_lo = s78910_lo; + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s0123_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s1234_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s2345_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void vpx_convolve_8tap_vert_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, + const int32x4_t correction, const uint8x8_t range_limit) { + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int8x16x2_t samples_LUT; if (w == 4) { const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); @@ -425,10 +562,10 @@ void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters); - d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); - d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); - d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); + d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter); + d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter); + d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter); + d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filter); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); @@ -512,13 +649,13 @@ void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - correction, filters); + correction, filter); d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - correction, filters); + correction, filter); d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - correction, filters); + correction, filter); d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - correction, filters); + correction, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -544,6 +681,42 @@ void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, } } +void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4])); + const int32x4_t correction_8tap = + vdupq_n_s32(vaddlvq_s16(vshll_n_s8(y_filter_8tap, FILTER_BITS))); + const uint8x8_t range_limit = vdup_n_u8(128); + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[y0_q4]) <= 4) { + /* All 4-tap and bilinear 
filter values are even, so halve them to reduce
+     * intermediate precision requirements. Also slide the filter values so
+     * that the 4 taps exist in the first 4 elements of the vector.
+     */
+    const int8x8_t y_filter_4tap =
+        vext_s8(vshr_n_s8(y_filter_8tap, 1), vdup_n_s8(0), 2);
+    const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1);
+    vpx_convolve_4tap_vert_neon_dotprod(src - src_stride, src_stride, dst,
+                                        dst_stride, w, h, y_filter_4tap,
+                                        correction_4tap, range_limit);
+  } else {
+    vpx_convolve_8tap_vert_neon_dotprod(src - 3 * src_stride, src_stride, dst,
+                                        dst_stride, w, h, y_filter_8tap,
+                                        correction_8tap, range_limit);
+  }
+}
+
 void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
                                          ptrdiff_t src_stride, uint8_t *dst,
                                          ptrdiff_t dst_stride,
diff --git a/vpx_dsp/arm/vpx_convolve_neon_dotprod.c b/vpx_dsp/arm/vpx_convolve_neon_dotprod.c
index 400e26b30a..9d754fde17 100644
--- a/vpx_dsp/arm/vpx_convolve_neon_dotprod.c
+++ b/vpx_dsp/arm/vpx_convolve_neon_dotprod.c
@@ -13,6 +13,7 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/arm/vpx_convolve8_neon.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
 #include "vpx_ports/mem.h"
 
 void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
@@ -20,24 +21,26 @@ void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
                                 const InterpKernel *filter, int x0_q4,
                                 int x_step_q4, int y0_q4, int y_step_q4,
                                 int w, int h) {
-  /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
+  /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
    * maximum buffer size to 64 * (64 + 7). */
   uint8_t temp[64 * 71];
 
-  /* Account for the vertical phase needing 3 lines prior and 4 lines post. */
-  const int intermediate_height = h + 7;
+  const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
+  /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines
+   * prior and vert_filter_taps / 2 lines post. */
+  const int intermediate_height = h + vert_filter_taps - 1;
+  const ptrdiff_t border_offset = vert_filter_taps / 2 - 1;
 
   assert(y_step_q4 == 16);
   assert(x_step_q4 == 16);
 
-  /* Filter starting 3 lines back. */
-  vpx_convolve8_2d_horiz_neon_dotprod(src - src_stride * 3, src_stride, temp, w,
-                                      filter, x0_q4, x_step_q4, y0_q4,
-                                      y_step_q4, w, intermediate_height);
+  vpx_convolve8_2d_horiz_neon_dotprod(
+      src - src_stride * border_offset, src_stride, temp, w, filter, x0_q4,
+      x_step_q4, y0_q4, y_step_q4, w, intermediate_height);
 
-  /* Step into the temp buffer 3 lines to get the actual frame data. */
-  vpx_convolve8_vert_neon_dotprod(temp + w * 3, w, dst, dst_stride, filter,
-                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+  vpx_convolve8_vert_neon_dotprod(temp + w * border_offset, w, dst, dst_stride,
+                                  filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
+                                  h);
 }
 
 void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
@@ -46,6 +49,9 @@ void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
                                     int x_step_q4, int y0_q4, int y_step_q4,
                                     int w, int h) {
   uint8_t temp[64 * 71];
+
+  /* Averaging convolution always uses an 8-tap filter. */
+  /* Account for the vertical phase needing 3 lines prior and 4 lines post.
*/ const int intermediate_height = h + 7; assert(y_step_q4 == 16); From 2f8e94715df1beddcd02df35f55d043d4fdabafc Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 23 Nov 2023 17:23:13 +0000 Subject: [PATCH 883/926] Specialise Armv8.6 Neon vert convolution for 4-tap filters Add an Armv8.6 USDOT Neon implementation of vertical convolution specialised for executing with 4-tap filters (the most common filter size for settings --good --cpu-used=1.) This new path is also used when executing with bilinear (2-tap) filters. Change-Id: Ic893b25541e3317c5d5c270c338f868f080aed7c --- vpx_dsp/arm/vpx_convolve8_neon.h | 23 ++++ vpx_dsp/arm/vpx_convolve8_neon_i8mm.c | 179 +++++++++++++++++++++++--- vpx_dsp/arm/vpx_convolve_neon_i8mm.c | 26 ++-- 3 files changed, 197 insertions(+), 31 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index 52df15a121..4f4fe4c81c 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -159,6 +159,29 @@ void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +static INLINE int16x4_t convolve4_4_usdot_partial(const uint8x16_t samples, + const int8x8_t filters) { + int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples, filters, 0); + + /* Further narrowing and packing is performed by the caller. */ + return vmovn_s32(sum); +} + +static INLINE uint8x8_t convolve4_8_usdot_partial(const uint8x16_t samples_lo, + const uint8x16_t samples_hi, + const int8x8_t filters) { + /* Sample permutation is performed by the caller. */ + /* First 4 output values. */ + int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0); + /* Second 4 output values. */ + int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples_hi, filters, 0); + + /* Narrow and re-pack. */ + int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); + /* We halved the filter values so -1 from right shift. 
*/ + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, const uint8x16_t samples_hi, const int8x8_t filters) { diff --git a/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c b/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c index 2ea587e622..7c394a937b 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c +++ b/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c @@ -346,25 +346,132 @@ static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]); } -void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, - int w, int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); +static INLINE void vpx_convolve_4tap_vert_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; uint8x16x2_t samples_LUT; - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(y_step_q4 == 16); + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + uint8x16_t s0123, s1234, s2345, s3456, s78910; + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; - (void)x0_q4; - (void)x_step_q4; - (void)y_step_q4; + load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src += 7 * src_stride; - src -= 3 * src_stride; + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + + do { + load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + d0 = convolve4_4_usdot_partial(s0123, filter); + d1 = convolve4_4_usdot_partial(s1234, filter); + d2 = convolve4_4_usdot_partial(s2345, filter); + d3 = convolve4_4_usdot_partial(s3456, filter); + /* We halved the filter values so -1 from right shift. */ + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Merge new data into block from previous iteration. 
*/ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s0123 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s1234 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s2345 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + + do { + load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + d0 = convolve4_8_usdot_partial(s0123_lo, s0123_hi, filter); + d1 = convolve4_8_usdot_partial(s1234_lo, s1234_hi, filter); + d2 = convolve4_8_usdot_partial(s2345_lo, s2345_hi, filter); + d3 = convolve4_8_usdot_partial(s3456_lo, s3456_hi, filter); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s0123_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s1234_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s2345_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + s3456_lo = s78910_lo; + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s0123_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s1234_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s2345_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void vpx_convolve_8tap_vert_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint8x16x2_t samples_LUT; if (w == 4) { const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); @@ -395,10 +502,10 @@ void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_4_usdot_partial(s0123, s4567, filters); - d1 = convolve8_4_usdot_partial(s1234, s5678, filters); - d2 = convolve8_4_usdot_partial(s2345, s6789, filters); - d3 = convolve8_4_usdot_partial(s3456, s78910, filters); + d0 = convolve8_4_usdot_partial(s0123, s4567, filter); + d1 = convolve8_4_usdot_partial(s1234, s5678, filter); + d2 = convolve8_4_usdot_partial(s2345, s6789, filter); + d3 = convolve8_4_usdot_partial(s3456, s78910, filter); d01 = 
vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
       d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
 
@@ -466,13 +573,13 @@ void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
         s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
 
         d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
-                                       filters);
+                                       filter);
         d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
-                                       filters);
+                                       filter);
         d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
-                                       filters);
+                                       filter);
         d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
-                                       filters);
+                                       filter);
 
         store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
 
@@ -498,6 +605,36 @@ void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
   }
 }
 
+void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const InterpKernel *filter, int x0_q4,
+                                  int x_step_q4, int y0_q4, int y_step_q4,
+                                  int w, int h) {
+  const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4]));
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(y_step_q4 == 16);
+
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
+    /* All 4-tap and bilinear filter values are even, so halve them to reduce
+     * intermediate precision requirements. Also slide the filter values so
+     * that the 4 taps exist in the first 4 elements of the vector.
+     */
+    const int8x8_t y_filter_4tap =
+        vext_s8(vshr_n_s8(y_filter_8tap, 1), vdup_n_s8(0), 2);
+    vpx_convolve_4tap_vert_neon_i8mm(src - src_stride, src_stride, dst,
+                                     dst_stride, w, h, y_filter_4tap);
+  } else {
+    vpx_convolve_8tap_vert_neon_i8mm(src - 3 * src_stride, src_stride, dst,
+                                     dst_stride, w, h, y_filter_8tap);
+  }
+}
+
 void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
                                       uint8_t *dst, ptrdiff_t dst_stride,
                                       const InterpKernel *filter, int x0_q4,
diff --git a/vpx_dsp/arm/vpx_convolve_neon_i8mm.c b/vpx_dsp/arm/vpx_convolve_neon_i8mm.c
index 4d94bb79b7..d7cbb09ea6 100644
--- a/vpx_dsp/arm/vpx_convolve_neon_i8mm.c
+++ b/vpx_dsp/arm/vpx_convolve_neon_i8mm.c
@@ -13,6 +13,7 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/arm/vpx_convolve8_neon.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
 #include "vpx_ports/mem.h"
 
 void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
@@ -20,24 +21,26 @@ void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
                              const InterpKernel *filter, int x0_q4,
                              int x_step_q4, int y0_q4, int y_step_q4,
                              int w, int h) {
-  /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
+  /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
    * maximum buffer size to 64 * (64 + 7). */
   uint8_t temp[64 * 71];
 
-  /* Account for the vertical phase needing 3 lines prior and 4 lines post. */
-  const int intermediate_height = h + 7;
+  const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
+  /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines
+   * prior and vert_filter_taps / 2 lines post. */
+  const int intermediate_height = h + vert_filter_taps - 1;
+  const ptrdiff_t border_offset = vert_filter_taps / 2 - 1;
 
   assert(y_step_q4 == 16);
   assert(x_step_q4 == 16);
 
-  /* Filter starting 3 lines back.
*/ - vpx_convolve8_2d_horiz_neon_i8mm(src - src_stride * 3, src_stride, temp, w, - filter, x0_q4, x_step_q4, y0_q4, y_step_q4, - w, intermediate_height); + vpx_convolve8_2d_horiz_neon_i8mm(src - src_stride * border_offset, src_stride, + temp, w, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, intermediate_height); - /* Step into the temp buffer 3 lines to get the actual frame data. */ - vpx_convolve8_vert_neon_i8mm(temp + w * 3, w, dst, dst_stride, filter, x0_q4, - x_step_q4, y0_q4, y_step_q4, w, h); + vpx_convolve8_vert_neon_i8mm(temp + w * border_offset, w, dst, dst_stride, + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, + h); } void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, @@ -46,6 +49,9 @@ void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { uint8_t temp[64 * 71]; + + /* Averaging convolution always uses an 8-tap filter. */ + /* Account for the vertical phase needing 3 lines prior and 4 lines post. */ const int intermediate_height = h + 7; assert(y_step_q4 == 16); From 68ef57f997e8505dbfceb1bc146373dfd035087a Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Tue, 21 Nov 2023 17:39:28 +0000 Subject: [PATCH 884/926] Specialise Armv8.4 Neon horiz convolution for 4-tap filters Add an Armv8.4 SDOT Neon implementation of horizontal convolution specialised for executing with 4-tap filters (the most common filter size for settings --good --cpu-used=1.) This new path is also used when executing with bilinear (2-tap) filters. Change-Id: Ib396681b3f7b8b0eeba94381fbe33a06cf7b4a13 --- vpx_dsp/arm/vpx_convolve8_neon.h | 48 +++++++++ vpx_dsp/arm/vpx_convolve8_neon_dotprod.c | 131 ++++++++++++++++++----- 2 files changed, 155 insertions(+), 24 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index 4f4fe4c81c..ca9f816bc4 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -36,6 +36,26 @@ static INLINE int16x4_t convolve4_4_sdot_partial(const int8x16_t samples, return vmovn_s32(sum); } +static INLINE int16x4_t convolve4_4_sdot(const uint8x16_t samples, + const int8x8_t filters, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16_t permute_tbl) { + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + int8x16_t clamped_samples = + vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl); + + /* Accumulate dot product into 'correction' to account for range clamp. */ + int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, filters, 0); + + /* Further narrowing and packing is performed by the caller. */ + return vmovn_s32(sum); +} + static INLINE uint8x8_t convolve4_8_sdot_partial(const int8x16_t samples_lo, const int8x16_t samples_hi, const int32x4_t correction, @@ -53,6 +73,34 @@ static INLINE uint8x8_t convolve4_8_sdot_partial(const int8x16_t samples_lo, return vqrshrun_n_s16(sum, FILTER_BITS - 1); } +static INLINE uint8x8_t convolve4_8_sdot(const uint8x16_t samples, + const int8x8_t filters, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x2_t permute_tbl) { + int8x16_t clamped_samples, permuted_samples[2]; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. 
*/ + clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); + /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ + permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); + + /* Accumulate dot product into 'correction' to account for range clamp. */ + /* First 4 output values. */ + int32x4_t sum0 = vdotq_lane_s32(correction, permuted_samples[0], filters, 0); + /* Second 4 output values. */ + int32x4_t sum1 = vdotq_lane_s32(correction, permuted_samples[1], filters, 0); + + /* Narrow and re-pack. */ + int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); + /* We halved the filter values so -1 from right shift. */ + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, const int8x16_t samples_hi, const int32x4_t correction, diff --git a/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c b/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c index cdb410bd58..6970bb1cd2 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c +++ b/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c @@ -149,26 +149,72 @@ void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src, } } -void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, - int w, int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x16_t range_limit = vdupq_n_u8(128); +static INLINE void vpx_convolve_4tap_horiz_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, + const int32x4_t correction, const uint8x16_t range_limit) { uint8x16_t s0, s1, s2, s3; - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(x_step_q4 == 16); + if (w == 4) { + const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl); + do { + int16x4_t t0, t1, t2, t3; + uint8x8_t d01, d23; - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - src -= 3; + t0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl); + t1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl); + t2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl); + t3 = convolve4_4_sdot(s3, filter, correction, range_limit, perm_tbl); + /* We halved the filter values so -1 from right shift. 
*/ + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl); + d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl); + d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl); + d3 = convolve4_8_sdot(s3, filter, correction, range_limit, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE void vpx_convolve_8tap_horiz_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, + const int32x4_t correction, const uint8x16_t range_limit) { + uint8x16_t s0, s1, s2, s3; if (w == 4) { const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); @@ -178,10 +224,10 @@ void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); - t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); - t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); - t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl); + t0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl); + t1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl); + t2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl); + t3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl); d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); @@ -206,10 +252,10 @@ void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, do { load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); - d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); - d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); - d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); + d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl); + d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -224,6 +270,43 @@ void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, } } +void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); + const int32x4_t correction_8tap = + vdupq_n_s32(vaddlvq_s16(vshll_n_s8(x_filter_8tap, FILTER_BITS))); + const uint8x16_t 
range_limit = vdupq_n_u8(128);
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(x_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
+    /* All 4-tap and bilinear filter values are even, so halve them to reduce
+     * intermediate precision requirements. Also slide the filter values so
+     * that the 4 taps exist in the first 4 elements of the vector.
+     */
+    const int8x8_t x_filter_4tap =
+        vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2);
+    const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1);
+    vpx_convolve_4tap_horiz_neon_dotprod(src - 1, src_stride, dst, dst_stride,
+                                         w, h, x_filter_4tap, correction_4tap,
+                                         range_limit);
+
+  } else {
+    vpx_convolve_8tap_horiz_neon_dotprod(src - 3, src_stride, dst, dst_stride,
+                                         w, h, x_filter_8tap, correction_8tap,
+                                         range_limit);
+  }
+}
+
 void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src,
                                           ptrdiff_t src_stride, uint8_t *dst,
                                           ptrdiff_t dst_stride,

From 9cdb68891943a4cba72dcd0c2bf776a609cf7cb6 Mon Sep 17 00:00:00 2001
From: Jonathan Wright
Date: Wed, 22 Nov 2023 11:33:07 +0000
Subject: [PATCH 885/926] Specialise Armv8.4 Neon 2D horiz convolution for
 4-tap filters

Add an Armv8.4 SDOT Neon path for the horizontal portion of 2D
convolution, specialised for executing with 4-tap filters (the most
common filter size for settings --good --cpu-used=1.) This new path is
also used when executing with bilinear (2-tap) filters.

Change-Id: I5116d10ddb371ac2cf302ef905d06f2140dc7600
---
 vpx_dsp/arm/vpx_convolve8_neon_dotprod.c | 178 +++++++++++++++++++----
 1 file changed, 146 insertions(+), 32 deletions(-)

diff --git a/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c b/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c
index 6970bb1cd2..00bac3b9cf 100644
--- a/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c
+++ b/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c
@@ -40,28 +40,104 @@ DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
   3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
 };
 
-void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src,
-                                         ptrdiff_t src_stride, uint8_t *dst,
-                                         ptrdiff_t dst_stride,
-                                         const InterpKernel *filter, int x0_q4,
-                                         int x_step_q4, int y0_q4,
-                                         int y_step_q4, int w, int h) {
-  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
-  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128);
-  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
-  const uint8x16_t range_limit = vdupq_n_u8(128);
+static INLINE void vpx_convolve_4tap_2d_horiz_neon_dotprod(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
+    const int32x4_t correction, const uint8x16_t range_limit) {
   uint8x16_t s0, s1, s2, s3;
 
-  assert((intptr_t)dst % 4 == 0);
-  assert(dst_stride % 4 == 0);
-  assert(x_step_q4 == 16);
-  assert(h % 4 == 3);
+  if (w == 4) {
+    const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl);
+    int16x4_t d0, d1, d2, d3;
+    uint8x8_t d01, d23;
 
-  (void)x_step_q4;
-  (void)y0_q4;
-  (void)y_step_q4;
+    do {
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
 
-  src -= 3;
+      d0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl);
+      d1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl);
+      d2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl);
+      d3 = convolve4_4_sdot(s3, filter, correction, range_limit, perm_tbl);
+      /* We halved the filter values so -1 from right shift.
*/ + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + load_u8_16x3(src, src_stride, &s0, &s1, &s2); + + d0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl); + d1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl); + d2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8_4x1(dst + 2 * dst_stride, d23); + } else { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl); + d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl); + d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl); + d3 = convolve4_8_sdot(s3, filter, correction, range_limit, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + width = w; + s = src; + d = dst; + do { + load_u8_16x3(s, src_stride, &s0, &s1, &s2); + + d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl); + d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl); + d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl); + + store_u8_8x3(d, dst_stride, d0, d1, d2); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + } +} + +static INLINE void vpx_convolve_8tap_2d_horiz_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, + const int32x4_t correction, const uint8x16_t range_limit) { + uint8x16_t s0, s1, s2, s3; if (w == 4) { const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); @@ -71,10 +147,10 @@ void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src, do { load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); - d1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); - d2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); - d3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl); + d0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl); + d1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl); + d2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl); + d3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); @@ -90,9 +166,9 @@ void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src, * further details on possible values of block height. 
*/
     load_u8_16x3(src, src_stride, &s0, &s1, &s2);
 
-    d0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl);
-    d1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl);
-    d2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl);
+    d0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl);
+    d1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl);
+    d2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl);
     d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
     d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS);
 
@@ -112,10 +188,10 @@ void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src,
       do {
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
 
-        d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl);
-        d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl);
-        d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl);
-        d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl);
+        d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl);
+        d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl);
+        d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl);
+        d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl);
 
         store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
 
@@ -136,9 +212,9 @@ void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src,
     do {
       load_u8_16x3(s, src_stride, &s0, &s1, &s2);
 
-      d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl);
-      d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl);
-      d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl);
+      d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl);
+      d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl);
+      d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl);
 
       store_u8_8x3(d, dst_stride, d0, d1, d2);
 
@@ -149,6 +225,44 @@ void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src,
   }
 }
 
+void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src,
+                                         ptrdiff_t src_stride, uint8_t *dst,
+                                         ptrdiff_t dst_stride,
+                                         const InterpKernel *filter, int x0_q4,
+                                         int x_step_q4, int y0_q4,
+                                         int y_step_q4, int w, int h) {
+  const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+  const int32x4_t correction_8tap =
+      vdupq_n_s32(vaddlvq_s16(vshll_n_s8(x_filter_8tap, FILTER_BITS)));
+  const uint8x16_t range_limit = vdupq_n_u8(128);
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(x_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
+    /* All 4-tap and bilinear filter values are even, so halve them to reduce
+     * intermediate precision requirements. Also slide the filter values so
+     * that the 4 taps exist in the first 4 elements of the vector.
+ */ + const int8x8_t x_filter_4tap = + vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2); + const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1); + vpx_convolve_4tap_2d_horiz_neon_dotprod(src - 1, src_stride, dst, + dst_stride, w, h, x_filter_4tap, + correction_4tap, range_limit); + + } else { + vpx_convolve_8tap_2d_horiz_neon_dotprod(src - 3, src_stride, dst, + dst_stride, w, h, x_filter_8tap, + correction_8tap, range_limit); + } +} + static INLINE void vpx_convolve_4tap_horiz_neon_dotprod( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, From 0dc67ecf54e11c7fa68dad6e79916b0d24c02f3a Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Mon, 27 Nov 2023 14:47:15 +0000 Subject: [PATCH 886/926] Specialise Armv8.6 Neon horiz convolution for 4-tap filters Add an Armv8.6 USDOT Neon implementation of horizontal convolution specialised for executing with 4-tap filters (the most common filter size for settings --good --cpu-used=1.) This new path is also used when executing with bilinear (2-tap) filters. Change-Id: I8f7633d9852ebfe8feb9b4a055715f849cccf297 --- vpx_dsp/arm/vpx_convolve8_neon.h | 38 ++++++++ vpx_dsp/arm/vpx_convolve8_neon_i8mm.c | 120 +++++++++++++++++++++----- 2 files changed, 137 insertions(+), 21 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index ca9f816bc4..031b9eb852 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -215,6 +215,20 @@ static INLINE int16x4_t convolve4_4_usdot_partial(const uint8x16_t samples, return vmovn_s32(sum); } +static INLINE int16x4_t convolve4_4_usdot(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16_t permute_tbl) { + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); + + int32x4_t sum = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples, filters, 0); + + /* Further narrowing and packing is performed by the caller. */ + return vmovn_s32(sum); +} + static INLINE uint8x8_t convolve4_8_usdot_partial(const uint8x16_t samples_lo, const uint8x16_t samples_hi, const int8x8_t filters) { @@ -230,6 +244,30 @@ static INLINE uint8x8_t convolve4_8_usdot_partial(const uint8x16_t samples_lo, return vqrshrun_n_s16(sum, FILTER_BITS - 1); } +static INLINE uint8x8_t convolve4_8_usdot(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + uint8x16_t permuted_samples[2]; + + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); + /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ + permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + + /* First 4 output values. */ + int32x4_t sum0 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + /* Second 4 output values. */ + int32x4_t sum1 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0); + + /* Narrow and re-pack. */ + int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); + /* We halved the filter values so -1 from right shift. 
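
For illustration, a minimal sketch of the tap-halving set-up this
series of patches relies on; the kernel values and variable names below
are assumed for the example rather than taken from vpx_dsp:

    /* Requires <arm_neon.h>. Hypothetical even-valued 4-tap kernel,
     * zero-padded to 8 taps and summing to 128 (1 << FILTER_BITS), as
     * VP9 sub-pel kernels do. */
    const int16_t kernel[8] = { 0, 0, -12, 108, 36, -4, 0, 0 };
    const int8x8_t filter_8tap = vmovn_s16(vld1q_s16(kernel));
    /* Halve the even taps and slide them down into lanes 0-3. */
    const int8x8_t filter_4tap =
        vext_s8(vshr_n_s8(filter_8tap, 1), vdup_n_s8(0), 2);
    /* filter_4tap == { -6, 54, 18, -2, 0, 0, 0, 0 }; the halving is
     * undone later by narrowing with FILTER_BITS - 1 instead of
     * FILTER_BITS. */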
*/ + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, const uint8x16_t samples_hi, const int8x8_t filters) { diff --git a/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c b/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c index 7c394a937b..dd60fa5169 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c +++ b/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c @@ -145,23 +145,70 @@ void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, } } -void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, - int w, int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); +static INLINE void vpx_convolve_4tap_horiz_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { uint8x16_t s0, s1, s2, s3; - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(x_step_q4 == 16); + if (w == 4) { + const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl); + do { + int16x4_t t0, t1, t2, t3; + uint8x8_t d01, d23; - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - src -= 3; + t0 = convolve4_4_usdot(s0, filter, perm_tbl); + t1 = convolve4_4_usdot(s1, filter, perm_tbl); + t2 = convolve4_4_usdot(s2, filter, perm_tbl); + t3 = convolve4_4_usdot(s3, filter, perm_tbl); + /* We halved the filter values so -1 from right shift. */ + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve4_8_usdot(s0, filter, perm_tbl); + d1 = convolve4_8_usdot(s1, filter, perm_tbl); + d2 = convolve4_8_usdot(s2, filter, perm_tbl); + d3 = convolve4_8_usdot(s3, filter, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE void vpx_convolve_8tap_horiz_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { + uint8x16_t s0, s1, s2, s3; if (w == 4) { const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); @@ -171,10 +218,10 @@ void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_usdot(s0, filters, perm_tbl); - t1 = convolve8_4_usdot(s1, filters, perm_tbl); - t2 = convolve8_4_usdot(s2, filters, perm_tbl); - t3 = convolve8_4_usdot(s3, filters, perm_tbl); + t0 = convolve8_4_usdot(s0, filter, perm_tbl); + t1 = convolve8_4_usdot(s1, filter, perm_tbl); + t2 = convolve8_4_usdot(s2, filter, perm_tbl); + t3 = convolve8_4_usdot(s3, filter, perm_tbl); d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); @@ -199,10 +246,10 @@ void vpx_convolve8_horiz_neon_i8mm(const 
uint8_t *src, ptrdiff_t src_stride, do { load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_usdot(s0, filters, perm_tbl); - d1 = convolve8_8_usdot(s1, filters, perm_tbl); - d2 = convolve8_8_usdot(s2, filters, perm_tbl); - d3 = convolve8_8_usdot(s3, filters, perm_tbl); + d0 = convolve8_8_usdot(s0, filter, perm_tbl); + d1 = convolve8_8_usdot(s1, filter, perm_tbl); + d2 = convolve8_8_usdot(s2, filter, perm_tbl); + d3 = convolve8_8_usdot(s3, filter, perm_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -217,6 +264,37 @@ void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, } } +void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { + /* All 4-tap and bilinear filter values are even, so halve them to reduce + * intermediate precision requirements. Also slide the filter values so the + * the 4 taps exist in the first 4 elements of the vector. + */ + const int8x8_t x_filter_4tap = + vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2); + vpx_convolve_4tap_horiz_neon_i8mm(src - 1, src_stride, dst, dst_stride, w, + h, x_filter_4tap); + + } else { + vpx_convolve_8tap_horiz_neon_i8mm(src - 3, src_stride, dst, dst_stride, w, + h, x_filter_8tap); + } +} + void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, From cc89450a4821dda0aef22ea36c18298b84de93f4 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Mon, 27 Nov 2023 14:55:55 +0000 Subject: [PATCH 887/926] Specialise Armv8.6 Neon 2D horiz convolution for 4-tap filters Add an Armv8.6 USDOT Neon path for the horizontal portion of 2D convolution, specialised for executing with 4-tap filters (the most common filter size for settings --good --cpu-used=1.) This new path is also used when executing with bilinear (2-tap) filters. 
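For reference, the tap-narrowing step shared by these specialised paths (a sketch assembled from the hunks below; all 4-tap and bilinear filter values are even, so halving them is lossless):

  const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
  /* Halve the even taps, then slide them down two lanes so the four
   * non-zero taps land in elements 0-3:
   * { f2/2, f3/2, f4/2, f5/2, 0, 0, 0, 0 }. */
  const int8x8_t x_filter_4tap =
      vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2);

The halving is undone later by narrowing with FILTER_BITS - 1 rather than
FILTER_BITS in the final rounding shift.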
Change-Id: I455e5a94bdcea1358025bd8e4d4c8c62e373aa5d --- vpx_dsp/arm/vpx_convolve8_neon_i8mm.c | 166 +++++++++++++++++++++----- 1 file changed, 138 insertions(+), 28 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c b/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c index dd60fa5169..bcad1dd121 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c +++ b/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c @@ -40,24 +40,103 @@ DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; -void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, - int w, int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); +static INLINE void vpx_convolve_4tap_2d_horiz_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { uint8x16_t s0, s1, s2, s3; - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(x_step_q4 == 16); - assert(h % 4 == 3); + if (w == 4) { + const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl); + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; + do { + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - src -= 3; + d0 = convolve4_4_usdot(s0, filter, perm_tbl); + d1 = convolve4_4_usdot(s1, filter, perm_tbl); + d2 = convolve4_4_usdot(s2, filter, perm_tbl); + d3 = convolve4_4_usdot(s3, filter, perm_tbl); + /* We halved the filter values so -1 from right shift. */ + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + load_u8_16x3(src, src_stride, &s0, &s1, &s2); + + d0 = convolve4_4_usdot(s0, filter, perm_tbl); + d1 = convolve4_4_usdot(s1, filter, perm_tbl); + d2 = convolve4_4_usdot(s2, filter, perm_tbl); + /* We halved the filter values so -1 from right shift. */ + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8_4x1(dst + 2 * dst_stride, d23); + } else { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve4_8_usdot(s0, filter, perm_tbl); + d1 = convolve4_8_usdot(s1, filter, perm_tbl); + d2 = convolve4_8_usdot(s2, filter, perm_tbl); + d3 = convolve4_8_usdot(s3, filter, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. 
*/ + width = w; + s = src; + d = dst; + do { + load_u8_16x3(s, src_stride, &s0, &s1, &s2); + + d0 = convolve4_8_usdot(s0, filter, perm_tbl); + d1 = convolve4_8_usdot(s1, filter, perm_tbl); + d2 = convolve4_8_usdot(s2, filter, perm_tbl); + + store_u8_8x3(d, dst_stride, d0, d1, d2); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + } +} + +static INLINE void vpx_convolve_8tap_2d_horiz_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { + uint8x16_t s0, s1, s2, s3; if (w == 4) { const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); @@ -67,10 +146,10 @@ void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, do { load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_4_usdot(s0, filters, perm_tbl); - d1 = convolve8_4_usdot(s1, filters, perm_tbl); - d2 = convolve8_4_usdot(s2, filters, perm_tbl); - d3 = convolve8_4_usdot(s3, filters, perm_tbl); + d0 = convolve8_4_usdot(s0, filter, perm_tbl); + d1 = convolve8_4_usdot(s1, filter, perm_tbl); + d2 = convolve8_4_usdot(s2, filter, perm_tbl); + d3 = convolve8_4_usdot(s3, filter, perm_tbl); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); @@ -86,9 +165,9 @@ void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, * further details on possible values of block height. */ load_u8_16x3(src, src_stride, &s0, &s1, &s2); - d0 = convolve8_4_usdot(s0, filters, perm_tbl); - d1 = convolve8_4_usdot(s1, filters, perm_tbl); - d2 = convolve8_4_usdot(s2, filters, perm_tbl); + d0 = convolve8_4_usdot(s0, filter, perm_tbl); + d1 = convolve8_4_usdot(s1, filter, perm_tbl); + d2 = convolve8_4_usdot(s2, filter, perm_tbl); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS); @@ -108,10 +187,10 @@ void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, do { load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_usdot(s0, filters, perm_tbl); - d1 = convolve8_8_usdot(s1, filters, perm_tbl); - d2 = convolve8_8_usdot(s2, filters, perm_tbl); - d3 = convolve8_8_usdot(s3, filters, perm_tbl); + d0 = convolve8_8_usdot(s0, filter, perm_tbl); + d1 = convolve8_8_usdot(s1, filter, perm_tbl); + d2 = convolve8_8_usdot(s2, filter, perm_tbl); + d3 = convolve8_8_usdot(s3, filter, perm_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -132,9 +211,9 @@ void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, do { load_u8_16x3(s, src_stride, &s0, &s1, &s2); - d0 = convolve8_8_usdot(s0, filters, perm_tbl); - d1 = convolve8_8_usdot(s1, filters, perm_tbl); - d2 = convolve8_8_usdot(s2, filters, perm_tbl); + d0 = convolve8_8_usdot(s0, filter, perm_tbl); + d1 = convolve8_8_usdot(s1, filter, perm_tbl); + d2 = convolve8_8_usdot(s2, filter, perm_tbl); store_u8_8x3(d, dst_stride, d0, d1, d2); @@ -145,6 +224,37 @@ void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, } } +void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if 
(vpx_get_filter_taps(filter[x0_q4]) <= 4) { + /* All 4-tap and bilinear filter values are even, so halve them to reduce + * intermediate precision requirements. Also slide the filter values so the + * the 4 taps exist in the first 4 elements of the vector. + */ + const int8x8_t x_filter_4tap = + vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2); + vpx_convolve_4tap_2d_horiz_neon_i8mm(src - 1, src_stride, dst, dst_stride, + w, h, x_filter_4tap); + + } else { + vpx_convolve_8tap_2d_horiz_neon_i8mm(src - 3, src_stride, dst, dst_stride, + w, h, x_filter_8tap); + } +} + static INLINE void vpx_convolve_4tap_horiz_neon_i8mm( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { From d7358ed53a6b344f6f374b3a5e8fe3d207bcc720 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Sun, 26 Nov 2023 20:38:01 -0800 Subject: [PATCH 888/926] Unitest for issue: b/310477034 Fix is made here: https://chromium-review.googlesource.com/c/webm/libvpx/+/5055827 Bug: b/310477034 Change-Id: Id1cc7a6a95e1ea5d1a022f36d7971915c9918339 --- test/encode_api_test.cc | 86 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index d2469572a6..aa2d28b6da 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -514,6 +514,22 @@ vpx_image_t *CreateImage(const unsigned int width, const unsigned int height) { return image; } +void CodecEncodeFrame(vpx_codec_ctx_t *enc, const int width, const int height, + const int frame_index, unsigned int deadline, + const bool is_key) { + const vpx_codec_cx_pkt_t *pkt; + vpx_image_t *image = CreateImage(width, height); + vpx_enc_frame_flags_t frame_flags = is_key ? VPX_EFLAG_FORCE_KF : 0; + ASSERT_NE(image, nullptr); + ASSERT_EQ(vpx_codec_encode(enc, image, frame_index, 1, frame_flags, deadline), + VPX_CODEC_OK); + vpx_codec_iter_t iter = nullptr; + while ((pkt = vpx_codec_get_cx_data(enc, &iter)) != nullptr) { + ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); + } + vpx_img_free(image); +} + // This is a test case from clusterfuzz. TEST(EncodeAPI, PrevMiCheckNullptr) { vpx_codec_iface_t *const iface = vpx_codec_vp9_cx(); @@ -581,6 +597,76 @@ TEST(EncodeAPI, PrevMiCheckNullptr) { ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); } +// This is a test case from clusterfuzz: based on 310477034. +// Encode a few frames with multiple change config call +// with different frame size. +TEST(EncodeAPI, MultipleChangeConfigResize) { + vpx_codec_iface_t *const iface = vpx_codec_vp9_cx(); + vpx_codec_enc_cfg_t cfg; + + struct Config { + unsigned int thread; + unsigned int width; + unsigned int height; + vpx_rc_mode end_usage; + unsigned long deadline; + }; + + // Set initial config. + struct Config config = { 3, 41, 1, VPX_VBR, 1 }; + unsigned long deadline = config.deadline; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, /*usage=*/0), + VPX_CODEC_OK); + cfg.g_threads = config.thread; + cfg.g_w = config.width; + cfg.g_h = config.height; + cfg.g_timebase.num = 1; + cfg.g_timebase.den = 1000 * 1000; // microseconds + cfg.g_pass = VPX_RC_ONE_PASS; + cfg.g_lag_in_frames = 0; + cfg.rc_end_usage = config.end_usage; + cfg.rc_min_quantizer = 2; + cfg.rc_max_quantizer = 58; + + vpx_codec_ctx_t enc; + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, 3), VPX_CODEC_OK); + int frame_index = 0; + + // Encode first frame. 
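+  // (CodecEncodeFrame() is the helper defined above: it allocates an image
+  // of the requested size, submits it with VPX_EFLAG_FORCE_KF when is_key
+  // is true, and drains every pending output packet before freeing the
+  // image.)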
+ CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, true); + frame_index++; + + // Change config. + config = { 16, 31, 1, VPX_VBR, 1000000 }; + cfg.g_threads = config.thread; + cfg.g_w = config.width; + cfg.g_h = config.height; + cfg.rc_end_usage = config.end_usage; + ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK) + << vpx_codec_error_detail(&enc); + + // Change config again. + config = { 0, 17, 1, VPX_CBR, 1 }; + cfg.g_threads = config.thread; + cfg.g_w = config.width; + cfg.g_h = config.height; + cfg.rc_end_usage = config.end_usage; + deadline = config.deadline; + ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK) + << vpx_codec_error_detail(&enc); + + // Encode 2nd frame with new config, set delta frame. + CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, false); + frame_index++; + + // Encode 3rd frame with same config, set delta frame. + CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, false); + frame_index++; + + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); +} + class EncodeApiGetTplStatsTest : public ::libvpx_test::EncoderTest, public ::testing::TestWithParam { From adebf364cb7ea098a366108acae8cf01595388fa Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Tue, 21 Nov 2023 14:00:16 -0800 Subject: [PATCH 889/926] rtc: Set nonrd keyframe under dynamic change of deadline For realtime mode: if the deadline mode (good/best/realtime) is changed on the fly (via codec_encode() call), force a key frame and set the speed feature nonrd_keyframe = 1 to avoid entering the rd pickmode. nonrd_pickmode=0/off is the only feature in realtime mode that involves rd pickmode, so by forcing it on/1 we can cleanly separate nonrd (realtime) from rd (good/best), so we can avoid possible issues on this dynamic mode switch, such as in bug listed below. Dynamic change of deadline, in particular for realtime mode, involves a lot of coding/speed feature changes, so best to also force reset with keyframe. Added unitest that triggers the issue in the bug. Bug: b/310663186 Change-Id: Idf8fd7c9ee54b301968184be5481ee9faa06468d --- test/encode_api_test.cc | 94 ++++++++++++++++++++++++++++++++ vp9/encoder/vp9_encoder.c | 5 ++ vp9/encoder/vp9_encoder.h | 4 ++ vp9/encoder/vp9_ratectrl.c | 15 +++-- vp9/encoder/vp9_speed_features.c | 5 ++ 5 files changed, 118 insertions(+), 5 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index aa2d28b6da..f48c9a106f 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -667,6 +667,100 @@ TEST(EncodeAPI, MultipleChangeConfigResize) { ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); } +// This is a test case from clusterfuzz: based on b/310663186. +// Encode set of frames while varying the deadline on the fly from +// good to realtime to best and back to realtime. +TEST(EncodeAPI, DynamicDeadlineChange) { + vpx_codec_iface_t *const iface = vpx_codec_vp9_cx(); + vpx_codec_enc_cfg_t cfg; + + struct Config { + unsigned int thread; + unsigned int width; + unsigned int height; + vpx_rc_mode end_usage; + unsigned long deadline; + }; + + // Set initial config, in particular set deadline to GOOD mode. 
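+  // (VPX_DL_GOOD_QUALITY, VPX_DL_REALTIME and VPX_DL_BEST_QUALITY are the
+  // deadline constants from vpx/vpx_encoder.h; the encoder maps them to the
+  // GOOD, REALTIME and BEST encoding modes respectively.)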
+ struct Config config = { 0, 1, 1, VPX_VBR, VPX_DL_GOOD_QUALITY }; + unsigned long deadline = config.deadline; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, /*usage=*/0), + VPX_CODEC_OK); + cfg.g_threads = config.thread; + cfg.g_w = config.width; + cfg.g_h = config.height; + cfg.g_timebase.num = 1; + cfg.g_timebase.den = 1000 * 1000; // microseconds + cfg.g_pass = VPX_RC_ONE_PASS; + cfg.g_lag_in_frames = 0; + cfg.rc_end_usage = config.end_usage; + cfg.rc_min_quantizer = 2; + cfg.rc_max_quantizer = 58; + + vpx_codec_ctx_t enc; + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + // Use realtime speed: 5 to 9. + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, 5), VPX_CODEC_OK); + int frame_index = 0; + + // Encode 1st frame. + CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, true); + frame_index++; + + // Encode 2nd frame, delta frame. + CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, false); + frame_index++; + + // Change config: change deadline to REALTIME. + config = { 0, 1, 1, VPX_VBR, VPX_DL_REALTIME }; + deadline = config.deadline; + ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK) + << vpx_codec_error_detail(&enc); + + // Encode 3rd frame with new config, set key frame. + CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, true); + frame_index++; + + // Encode 4th frame with same config, delta frame. + CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, false); + frame_index++; + + // Encode 5th frame with same config, key frame. + CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, true); + frame_index++; + + // Change config: change deadline to BEST. + config = { 0, 1, 1, VPX_VBR, VPX_DL_BEST_QUALITY }; + deadline = config.deadline; + ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK) + << vpx_codec_error_detail(&enc); + + // Encode 6th frame with new config, set delta frame. + CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, false); + frame_index++; + + // Change config: change deadline to REALTIME. + config = { 0, 1, 1, VPX_VBR, VPX_DL_REALTIME }; + deadline = config.deadline; + ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK) + << vpx_codec_error_detail(&enc); + + // Encode 7th frame with new config, set delta frame. + CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, false); + frame_index++; + + // Encode 8th frame with new config, set key frame. + CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, true); + frame_index++; + + // Encode 9th frame with new config, set delta frame. + CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, false); + frame_index++; + + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); +} + class EncodeApiGetTplStatsTest : public ::libvpx_test::EncoderTest, public ::testing::TestWithParam { diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index e1a4d986b7..e27a77e2e1 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -5535,6 +5535,11 @@ static void encode_frame_to_data_rate( set_ref_sign_bias(cpi); } + // On the very first frame set the deadline_mode_previous_frame to + // the current mode. + if (cpi->common.current_video_frame == 0) + cpi->deadline_mode_previous_frame = cpi->oxcf.mode; + // Set default state for segment based loop filter update flags. 
cm->lf.mode_ref_delta_update = 0; diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 7b02fe7f6b..171489358b 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1037,6 +1037,10 @@ typedef struct VP9_COMP { int fixed_qp_onepass; + // Flag to keep track of dynamic change in deadline mode + // (good/best/realtime). + MODE deadline_mode_previous_frame; + #if CONFIG_COLLECT_COMPONENT_TIMING /*! * component_time[] are initialized to zero while encoder starts. diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index e02b2892ac..aa77b7cbad 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -1991,6 +1991,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { rc->last_avg_frame_bandwidth = rc->avg_frame_bandwidth; if (cpi->use_svc && svc->spatial_layer_id < svc->number_spatial_layers - 1) svc->lower_layer_qindex = cm->base_qindex; + cpi->deadline_mode_previous_frame = cpi->oxcf.mode; } void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { @@ -2011,6 +2012,7 @@ void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { cpi->rc.buffer_level = cpi->rc.optimal_buffer_level; cpi->rc.bits_off_target = cpi->rc.optimal_buffer_level; } + cpi->deadline_mode_previous_frame = cpi->oxcf.mode; } int vp9_calc_pframe_target_size_one_pass_vbr(const VP9_COMP *cpi) { @@ -2118,7 +2120,8 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { int target; if (!cpi->refresh_alt_ref_frame && (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) || - rc->frames_to_key == 0)) { + rc->frames_to_key == 0 || + (cpi->oxcf.mode != cpi->deadline_mode_previous_frame))) { cm->frame_type = KEY_FRAME; rc->this_key_frame_forced = cm->current_video_frame != 0 && rc->frames_to_key == 0; @@ -2284,14 +2287,15 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { // Periodic key frames is based on the super-frame counter // (svc.current_superframe), also only base spatial layer is key frame. // Key frame is set for any of the following: very first frame, frame flags - // indicates key, superframe counter hits key frequency, or (non-intra) sync - // flag is set for spatial layer 0. + // indicates key, superframe counter hits key frequency,(non-intra) sync + // flag is set for spatial layer 0, or deadline mode changes. 
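+  // (deadline_mode_previous_frame is refreshed in vp9_rc_postencode_update()
+  // and vp9_rc_postencode_update_drop_frame(), so the check below triggers
+  // only on the first frame after a mode switch.)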
if ((cm->current_video_frame == 0 && !svc->previous_frame_is_intra_only) || (cpi->frame_flags & FRAMEFLAGS_KEY) || (cpi->oxcf.auto_key && (svc->current_superframe % cpi->oxcf.key_freq == 0) && !svc->previous_frame_is_intra_only && svc->spatial_layer_id == 0) || - (svc->spatial_layer_sync[0] == 1 && svc->spatial_layer_id == 0)) { + (svc->spatial_layer_sync[0] == 1 && svc->spatial_layer_id == 0) || + (cpi->oxcf.mode != cpi->deadline_mode_previous_frame)) { cm->frame_type = KEY_FRAME; rc->source_alt_ref_active = 0; if (is_one_pass_svc(cpi)) { @@ -2490,7 +2494,8 @@ void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; int target; if ((cm->current_video_frame == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY) || - (cpi->oxcf.auto_key && rc->frames_to_key == 0)) { + (cpi->oxcf.auto_key && rc->frames_to_key == 0) || + (cpi->oxcf.mode != cpi->deadline_mode_previous_frame)) { cm->frame_type = KEY_FRAME; rc->frames_to_key = cpi->oxcf.key_freq; rc->kf_boost = DEFAULT_KF_BOOST; diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 4a7172118c..56fb5f94f4 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -859,6 +859,11 @@ static void set_rt_speed_feature_framesize_independent( // off for now. if (speed <= 3 && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) cpi->oxcf.aq_mode = 0; + // For all speeds for rt mode: if the deadline mode changed (was good/best + // quality on previous frame and now is realtime) set nonrd_keyframe to 1 to + // avoid entering rd pickmode. This causes issues, such as: b/310663186. + if (cpi->oxcf.mode != cpi->deadline_mode_previous_frame) + sf->nonrd_keyframe = 1; } void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi, int speed) { From 57b72fe80782ad294c04a7c3c278325f7b5bde54 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Wed, 29 Nov 2023 14:22:10 -0800 Subject: [PATCH 890/926] Add VP9Encoder class to simplify fuzz test cases Bug: b:306422625 Change-Id: I8344cb7fb4e1aee87d46f683746517cdcddf5c5d --- test/encode_api_test.cc | 274 ++++++++++++++-------------------------- 1 file changed, 93 insertions(+), 181 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 739e61a6e6..da3676f6bf 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -514,17 +514,75 @@ vpx_image_t *CreateImage(const unsigned int width, const unsigned int height) { return image; } -void CodecEncodeFrame(vpx_codec_ctx_t *enc, const int width, const int height, - const int frame_index, unsigned int deadline, - const bool is_key) { +// Emulates the WebCodecs VideoEncoder interface. 
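+// The first Configure() call initializes the codec (vpx_codec_enc_init()
+// plus VP8E_SET_CPUUSED); subsequent calls only update the active config via
+// vpx_codec_enc_config_set(). Encode() submits one frame of the configured
+// size and drains all pending output packets.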
+class VP9Encoder { + public: + VP9Encoder(int speed) : speed_(speed) {} + ~VP9Encoder(); + + void Configure(unsigned int threads, unsigned int width, unsigned int height, + vpx_rc_mode end_usage, unsigned long deadline); + void Encode(bool key_frame); + + private: + const int speed_; + bool initialized_ = false; + vpx_codec_enc_cfg_t cfg_; + vpx_codec_ctx_t enc_; + int frame_index_ = 0; + unsigned long deadline_ = 0; +}; + +VP9Encoder::~VP9Encoder() { + if (initialized_) { + EXPECT_EQ(vpx_codec_destroy(&enc_), VPX_CODEC_OK); + } +} + +void VP9Encoder::Configure(unsigned int threads, unsigned int width, + unsigned int height, vpx_rc_mode end_usage, + unsigned long deadline) { + deadline_ = deadline; + + if (!initialized_) { + vpx_codec_iface_t *const iface = vpx_codec_vp9_cx(); + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg_, /*usage=*/0), + VPX_CODEC_OK); + cfg_.g_threads = threads; + cfg_.g_w = width; + cfg_.g_h = height; + cfg_.g_timebase.num = 1; + cfg_.g_timebase.den = 1000 * 1000; // microseconds + cfg_.g_pass = VPX_RC_ONE_PASS; + cfg_.g_lag_in_frames = 0; + cfg_.rc_end_usage = end_usage; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 58; + ASSERT_EQ(vpx_codec_enc_init(&enc_, iface, &cfg_, 0), VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc_, VP8E_SET_CPUUSED, speed_), VPX_CODEC_OK); + initialized_ = true; + return; + } + + cfg_.g_threads = threads; + cfg_.g_w = width; + cfg_.g_h = height; + cfg_.rc_end_usage = end_usage; + ASSERT_EQ(vpx_codec_enc_config_set(&enc_, &cfg_), VPX_CODEC_OK) + << vpx_codec_error_detail(&enc_); +} + +void VP9Encoder::Encode(bool key_frame) { const vpx_codec_cx_pkt_t *pkt; - vpx_image_t *image = CreateImage(width, height); - vpx_enc_frame_flags_t frame_flags = is_key ? VPX_EFLAG_FORCE_KF : 0; + vpx_image_t *image = CreateImage(cfg_.g_w, cfg_.g_h); ASSERT_NE(image, nullptr); - ASSERT_EQ(vpx_codec_encode(enc, image, frame_index, 1, frame_flags, deadline), - VPX_CODEC_OK); + const vpx_enc_frame_flags_t frame_flags = key_frame ? VPX_EFLAG_FORCE_KF : 0; + ASSERT_EQ( + vpx_codec_encode(&enc_, image, frame_index_, 1, frame_flags, deadline_), + VPX_CODEC_OK); + frame_index_++; vpx_codec_iter_t iter = nullptr; - while ((pkt = vpx_codec_get_cx_data(enc, &iter)) != nullptr) { + while ((pkt = vpx_codec_get_cx_data(&enc_, &iter)) != nullptr) { ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); } vpx_img_free(image); @@ -532,233 +590,87 @@ void CodecEncodeFrame(vpx_codec_ctx_t *enc, const int width, const int height, // This is a test case from clusterfuzz. 
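+// It encodes one frame at 1554x644 (VBR, realtime deadline), reconfigures
+// the encoder to 1131x644 (CBR, good-quality deadline), then encodes a
+// second frame, again without forcing a key frame.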
TEST(EncodeAPI, PrevMiCheckNullptr) { - vpx_codec_iface_t *const iface = vpx_codec_vp9_cx(); - vpx_codec_enc_cfg_t cfg; + VP9Encoder encoder(0); + encoder.Configure(0, 1554, 644, VPX_VBR, VPX_DL_REALTIME); - struct Config { - unsigned int thread; - unsigned int width; - unsigned int height; - vpx_rc_mode end_usage; - unsigned long deadline; - }; - struct Config init_config = { 0, 1554, 644, VPX_VBR, 1 }; - unsigned long deadline = init_config.deadline; - ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, /*usage=*/0), - VPX_CODEC_OK); - cfg.g_threads = init_config.thread; - cfg.g_w = init_config.width; - cfg.g_h = init_config.height; - cfg.g_timebase.num = 1; - cfg.g_timebase.den = 1000 * 1000; // microseconds - cfg.g_pass = VPX_RC_ONE_PASS; - cfg.g_lag_in_frames = 0; - cfg.rc_end_usage = init_config.end_usage; - cfg.rc_min_quantizer = 2; - cfg.rc_max_quantizer = 58; - - vpx_codec_ctx_t enc; - ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); - ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, 0), VPX_CODEC_OK); - - const vpx_codec_cx_pkt_t *pkt; - - int frame_index = 0; // First step: encode, without forcing KF. - vpx_image_t *image = CreateImage(cfg.g_w, cfg.g_h); - ASSERT_NE(image, nullptr); - ASSERT_EQ(vpx_codec_encode(&enc, image, frame_index, 1, 0, deadline), - VPX_CODEC_OK); - frame_index++; - vpx_codec_iter_t iter = nullptr; - while ((pkt = vpx_codec_get_cx_data(&enc, &iter)) != nullptr) { - ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); - } - vpx_img_free(image); + encoder.Encode(false); // Second step: change config - struct Config encode_config = { 0, 1131, 644, VPX_CBR, 1000000 }; - cfg.g_threads = encode_config.thread; - cfg.g_w = encode_config.width; - cfg.g_h = encode_config.height; - cfg.rc_end_usage = encode_config.end_usage; - deadline = encode_config.deadline; - ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK) - << vpx_codec_error_detail(&enc); + encoder.Configure(0, 1131, 644, VPX_CBR, VPX_DL_GOOD_QUALITY); // Third step: encode, without forcing KF - image = CreateImage(cfg.g_w, cfg.g_h); - ASSERT_NE(image, nullptr); - ASSERT_EQ(vpx_codec_encode(&enc, image, frame_index, 1, 0, deadline), - VPX_CODEC_OK); - frame_index++; - while ((pkt = vpx_codec_get_cx_data(&enc, &iter)) != nullptr) { - ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); - } - vpx_img_free(image); - ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); + encoder.Encode(false); } // This is a test case from clusterfuzz: based on 310477034. // Encode a few frames with multiple change config call // with different frame size. TEST(EncodeAPI, MultipleChangeConfigResize) { - vpx_codec_iface_t *const iface = vpx_codec_vp9_cx(); - vpx_codec_enc_cfg_t cfg; - - struct Config { - unsigned int thread; - unsigned int width; - unsigned int height; - vpx_rc_mode end_usage; - unsigned long deadline; - }; + VP9Encoder encoder(3); // Set initial config. 
- struct Config config = { 3, 41, 1, VPX_VBR, 1 }; - unsigned long deadline = config.deadline; - ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, /*usage=*/0), - VPX_CODEC_OK); - cfg.g_threads = config.thread; - cfg.g_w = config.width; - cfg.g_h = config.height; - cfg.g_timebase.num = 1; - cfg.g_timebase.den = 1000 * 1000; // microseconds - cfg.g_pass = VPX_RC_ONE_PASS; - cfg.g_lag_in_frames = 0; - cfg.rc_end_usage = config.end_usage; - cfg.rc_min_quantizer = 2; - cfg.rc_max_quantizer = 58; - - vpx_codec_ctx_t enc; - ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); - ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, 3), VPX_CODEC_OK); - int frame_index = 0; + encoder.Configure(3, 41, 1, VPX_VBR, VPX_DL_REALTIME); // Encode first frame. - CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, true); - frame_index++; + encoder.Encode(true); // Change config. - config = { 16, 31, 1, VPX_VBR, 1000000 }; - cfg.g_threads = config.thread; - cfg.g_w = config.width; - cfg.g_h = config.height; - cfg.rc_end_usage = config.end_usage; - ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK) - << vpx_codec_error_detail(&enc); + encoder.Configure(16, 31, 1, VPX_VBR, VPX_DL_GOOD_QUALITY); // Change config again. - config = { 0, 17, 1, VPX_CBR, 1 }; - cfg.g_threads = config.thread; - cfg.g_w = config.width; - cfg.g_h = config.height; - cfg.rc_end_usage = config.end_usage; - deadline = config.deadline; - ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK) - << vpx_codec_error_detail(&enc); + encoder.Configure(0, 17, 1, VPX_CBR, VPX_DL_REALTIME); // Encode 2nd frame with new config, set delta frame. - CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, false); - frame_index++; + encoder.Encode(false); // Encode 3rd frame with same config, set delta frame. - CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, false); - frame_index++; - - ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); + encoder.Encode(false); } // This is a test case from clusterfuzz: based on b/310663186. // Encode set of frames while varying the deadline on the fly from // good to realtime to best and back to realtime. TEST(EncodeAPI, DynamicDeadlineChange) { - vpx_codec_iface_t *const iface = vpx_codec_vp9_cx(); - vpx_codec_enc_cfg_t cfg; - - struct Config { - unsigned int thread; - unsigned int width; - unsigned int height; - vpx_rc_mode end_usage; - unsigned long deadline; - }; + // Use realtime speed: 5 to 9. + VP9Encoder encoder(5); // Set initial config, in particular set deadline to GOOD mode. - struct Config config = { 0, 1, 1, VPX_VBR, VPX_DL_GOOD_QUALITY }; - unsigned long deadline = config.deadline; - ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, /*usage=*/0), - VPX_CODEC_OK); - cfg.g_threads = config.thread; - cfg.g_w = config.width; - cfg.g_h = config.height; - cfg.g_timebase.num = 1; - cfg.g_timebase.den = 1000 * 1000; // microseconds - cfg.g_pass = VPX_RC_ONE_PASS; - cfg.g_lag_in_frames = 0; - cfg.rc_end_usage = config.end_usage; - cfg.rc_min_quantizer = 2; - cfg.rc_max_quantizer = 58; - - vpx_codec_ctx_t enc; - ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); - // Use realtime speed: 5 to 9. - ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, 5), VPX_CODEC_OK); - int frame_index = 0; + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_GOOD_QUALITY); // Encode 1st frame. - CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, true); - frame_index++; + encoder.Encode(true); // Encode 2nd frame, delta frame. 
- CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, false); - frame_index++; + encoder.Encode(false); // Change config: change deadline to REALTIME. - config = { 0, 1, 1, VPX_VBR, VPX_DL_REALTIME }; - deadline = config.deadline; - ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK) - << vpx_codec_error_detail(&enc); + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_REALTIME); // Encode 3rd frame with new config, set key frame. - CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, true); - frame_index++; + encoder.Encode(true); // Encode 4th frame with same config, delta frame. - CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, false); - frame_index++; + encoder.Encode(false); // Encode 5th frame with same config, key frame. - CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, true); - frame_index++; + encoder.Encode(true); // Change config: change deadline to BEST. - config = { 0, 1, 1, VPX_VBR, VPX_DL_BEST_QUALITY }; - deadline = config.deadline; - ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK) - << vpx_codec_error_detail(&enc); + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_BEST_QUALITY); // Encode 6th frame with new config, set delta frame. - CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, false); - frame_index++; + encoder.Encode(false); // Change config: change deadline to REALTIME. - config = { 0, 1, 1, VPX_VBR, VPX_DL_REALTIME }; - deadline = config.deadline; - ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK) - << vpx_codec_error_detail(&enc); + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_REALTIME); // Encode 7th frame with new config, set delta frame. - CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, false); - frame_index++; + encoder.Encode(false); // Encode 8th frame with new config, set key frame. - CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, true); - frame_index++; + encoder.Encode(true); // Encode 9th frame with new config, set delta frame. 
- CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, false); - frame_index++; - - ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); + encoder.Encode(false); } class EncodeApiGetTplStatsTest From fe5dc9f7fc2f98709184dace2308d51265f2800d Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 10 Nov 2023 20:38:31 -0500 Subject: [PATCH 891/926] vp9 rc: support screen content Bug: b/281463780 Change-Id: I23668550257b28031bdca0537459f93ec63f1e2e --- test/vp9_ratectrl_rtc_test.cc | 21 ++++++++++++++++++++- vp9/encoder/vp9_encoder.c | 1 + vp9/encoder/vp9_encoder.h | 3 +++ vp9/encoder/vp9_ratectrl.c | 3 ++- vp9/ratectrl_rtc.cc | 1 + vp9/vp9_cx_iface.c | 1 + vpx/internal/vpx_ratectrl_rtc.h | 2 ++ 7 files changed, 30 insertions(+), 2 deletions(-) diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc index ff718bbaa7..f7be47542c 100644 --- a/test/vp9_ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -54,7 +54,11 @@ class RcInterfaceTest if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, 7); encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); - encoder->Control(VP9E_SET_TUNE_CONTENT, 0); + if (rc_cfg_.is_screen) { + encoder->Control(VP9E_SET_TUNE_CONTENT, VP9E_CONTENT_SCREEN); + } else { + encoder->Control(VP9E_SET_TUNE_CONTENT, VP9E_CONTENT_DEFAULT); + } encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000); encoder->Control(VP9E_SET_RTC_EXTERNAL_RATECTRL, 1); } @@ -101,6 +105,19 @@ class RcInterfaceTest ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } + void RunOneLayerScreen() { + SetConfig(GET_PARAM(2)); + rc_cfg_.is_screen = true; + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + frame_params_.spatial_layer_id = 0; + frame_params_.temporal_layer_id = 0; + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + void RunOneLayerDropFramesCBR() { if (GET_PARAM(2) != VPX_CBR) { GTEST_SKIP() << "Frame dropping is only for CBR mode."; @@ -632,6 +649,8 @@ TEST_P(RcInterfaceTest, OneLayer) { RunOneLayer(); } TEST_P(RcInterfaceTest, OneLayerDropFramesCBR) { RunOneLayerDropFramesCBR(); } +TEST_P(RcInterfaceTest, OneLayerScreen) { RunOneLayerScreen(); } + TEST_P(RcInterfaceTest, OneLayerVBRPeriodicKey) { RunOneLayerVBRPeriodicKey(); } TEST_P(RcInterfaceSvcTest, Svc) { RunSvc(); } diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index e27a77e2e1..be55150140 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4065,6 +4065,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, cpi->rc.hybrid_intra_scene_change = 0; cpi->rc.re_encode_maxq_scene_change = 0; if (cm->show_frame && cpi->oxcf.mode == REALTIME && + !cpi->disable_scene_detection_rtc_ratectrl && (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.content == VP9E_CONTENT_SCREEN || (cpi->oxcf.speed >= 5 && cpi->oxcf.speed < 8))) diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 171489358b..160de0064f 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1041,6 +1041,9 @@ typedef struct VP9_COMP { // (good/best/realtime). MODE deadline_mode_previous_frame; + // Flag to disable scene detection when rtc rate control library is used. + int disable_scene_detection_rtc_ratectrl; + #if CONFIG_COLLECT_COMPONENT_TIMING /*! * component_time[] are initialized to zero while encoder starts. 
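For context, a minimal sketch of how a caller drives the new flag (type and
field names as introduced in this patch; error handling omitted):

  libvpx::VpxRateControlRtcConfig rc_cfg;
  rc_cfg.is_screen = true;  // mapped to oxcf->content = VP9E_CONTENT_SCREEN
  auto rc_api = libvpx::VP9RateControlRTC::Create(rc_cfg);

On the encoder side this pairs with VP9E_SET_TUNE_CONTENT set to
VP9E_CONTENT_SCREEN, as the updated test/vp9_ratectrl_rtc_test.cc above does.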
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index aa77b7cbad..6452e349df 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -680,7 +680,8 @@ static int adjust_q_cbr(const VP9_COMP *cpi, int q) { else q = qclamp; } - if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && + cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_limit_q(cpi, &q); return VPXMAX(VPXMIN(q, cpi->rc.worst_quality), cpi->rc.best_quality); } diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index d8239718a8..fd81bce7b5 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -131,6 +131,7 @@ bool VP9RateControlRTC::UpdateRateControl( oxcf->under_shoot_pct = rc_cfg.undershoot_pct; oxcf->over_shoot_pct = rc_cfg.overshoot_pct; oxcf->drop_frames_water_mark = rc_cfg.frame_drop_thresh; + oxcf->content = rc_cfg.is_screen ? VP9E_CONTENT_SCREEN : VP9E_CONTENT_DEFAULT; oxcf->ss_number_layers = rc_cfg.ss_number_layers; oxcf->ts_number_layers = rc_cfg.ts_number_layers; oxcf->temporal_layering_mode = (VP9E_TEMPORAL_LAYERING_MODE)( diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index b1dfe992cf..b75fc18fd2 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1077,6 +1077,7 @@ static vpx_codec_err_t ctrl_set_rtc_external_ratectrl(vpx_codec_alg_priv_t *ctx, cpi->compute_frame_low_motion_onepass = 0; cpi->rc.constrain_gf_key_freq_onepass_vbr = 0; cpi->cyclic_refresh->content_mode = 0; + cpi->disable_scene_detection_rtc_ratectrl = 1; } return VPX_CODEC_OK; } diff --git a/vpx/internal/vpx_ratectrl_rtc.h b/vpx/internal/vpx_ratectrl_rtc.h index 6ffd798eb2..01d64b14b7 100644 --- a/vpx/internal/vpx_ratectrl_rtc.h +++ b/vpx/internal/vpx_ratectrl_rtc.h @@ -43,6 +43,7 @@ struct VpxRateControlRtcConfig { layer_target_bitrate[0] = static_cast(target_bandwidth); ts_rate_decimator[0] = 1; frame_drop_thresh = 0; + is_screen = false; } int width; @@ -67,6 +68,7 @@ struct VpxRateControlRtcConfig { enum vpx_rc_mode rc_mode; int aq_mode; int frame_drop_thresh; + bool is_screen; }; } // namespace libvpx #endif // VPX_VPX_INTERNAL_VPX_RATECTRL_RTC_H_ From 97a0d139ce077d02a64a5bba1870104a43a0217f Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 29 Nov 2023 16:47:34 -0800 Subject: [PATCH 892/926] psnr.h,cosmetics: fix a typo (PNSR -> PSNR) Change-Id: I2adea9f150852c106acc57e5aeeac571d6bd15fb --- vpx_dsp/psnr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vpx_dsp/psnr.h b/vpx_dsp/psnr.h index 9ebb64dd52..7c57aa429f 100644 --- a/vpx_dsp/psnr.h +++ b/vpx_dsp/psnr.h @@ -26,7 +26,7 @@ typedef struct vpx_psnr_pkt PSNR_STATS; /*!\brief Converts SSE to PSNR * - * Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PNSR). + * Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PSNR). * * \param[in] samples Number of samples * \param[in] peak Max sample value From b027590c30d9bd5dda3f72aff7993fd94fd28813 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Wed, 29 Nov 2023 12:18:08 -0800 Subject: [PATCH 893/926] Define vpx_enc_deadline_t type for encode deadline The deadline parameter of vpx_codec_encode() is of the unsigned long type. The cpplint runtime/int check and the clang-tidy google-runtime-int warn about the use of the unsigned long type. Adding a type alias works around this issue. Note: vpx_codec_decode() also has a deadline parameter, but it is of the long type. 
So unfortunately this type alias cannot be
simply named vpx_codec_deadline_t and the
name must suggest it is encoder-specific.

Change-Id: I27b6b25730b620b328422ec3f91e63fdc55b377a
---
 test/codec_factory.h              | 10 +++++-----
 test/encode_api_test.cc           |  6 +++---
 test/encode_test_driver.h         |  8 ++++----
 test/frame_size_tests.cc          |  3 +--
 test/keyframe_test.cc             |  4 ++--
 vp8/vp8_cx_iface.c                |  6 +++---
 vp9/vp9_cx_iface.c                |  4 ++--
 vpx/internal/vpx_codec_internal.h |  2 +-
 vpx/src/vpx_encoder.c             |  2 +-
 vpx/vpx_encoder.h                 |  8 +++++++-
 10 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/test/codec_factory.h b/test/codec_factory.h
index d00563df1c..c7e8f54847 100644
--- a/test/codec_factory.h
+++ b/test/codec_factory.h
@@ -40,7 +40,7 @@ class CodecFactory {
                          const vpx_codec_flags_t flags) const = 0;

   virtual Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg,
-                                 unsigned long deadline,
+                                 vpx_enc_deadline_t deadline,
                                  const unsigned long init_flags,
                                  TwopassStatsStore *stats) const = 0;

@@ -95,7 +95,7 @@ class VP8Decoder : public Decoder {

 class VP8Encoder : public Encoder {
  public:
-  VP8Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline,
+  VP8Encoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline,
              const unsigned long init_flags, TwopassStatsStore *stats)
       : Encoder(cfg, deadline, init_flags, stats) {}

@@ -128,7 +128,7 @@ class VP8CodecFactory : public CodecFactory {
 #endif
   }

-  Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline,
+  Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline,
                          const unsigned long init_flags,
                          TwopassStatsStore *stats) const override {
 #if CONFIG_VP8_ENCODER
@@ -190,7 +190,7 @@ class VP9Decoder : public Decoder {

 class VP9Encoder : public Encoder {
  public:
-  VP9Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline,
+  VP9Encoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline,
              const unsigned long init_flags, TwopassStatsStore *stats)
       : Encoder(cfg, deadline, init_flags, stats) {}

@@ -223,7 +223,7 @@ class VP9CodecFactory : public CodecFactory {
 #endif
   }

-  Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline,
+  Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline,
                          const unsigned long init_flags,
                          TwopassStatsStore *stats) const override {
 #if CONFIG_VP9_ENCODER
diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc
index da3676f6bf..462427fe15 100644
--- a/test/encode_api_test.cc
+++ b/test/encode_api_test.cc
@@ -521,7 +521,7 @@ class VP9Encoder {
   ~VP9Encoder();

   void Configure(unsigned int threads, unsigned int width, unsigned int height,
-                 vpx_rc_mode end_usage, unsigned long deadline);
+                 vpx_rc_mode end_usage, vpx_enc_deadline_t deadline);
   void Encode(bool key_frame);

  private:
@@ -530,7 +530,7 @@ class VP9Encoder {
   vpx_codec_enc_cfg_t cfg_;
   vpx_codec_ctx_t enc_;
   int frame_index_ = 0;
-  unsigned long deadline_ = 0;
+  vpx_enc_deadline_t deadline_ = 0;
 };

 VP9Encoder::~VP9Encoder() {
@@ -541,7 +541,7 @@ VP9Encoder::~VP9Encoder() {

 void VP9Encoder::Configure(unsigned int threads, unsigned int width,
                            unsigned int height, vpx_rc_mode end_usage,
-                           unsigned long deadline) {
+                           vpx_enc_deadline_t deadline) {
   deadline_ = deadline;

   if (!initialized_) {
diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h
index c7974894c7..7dd80d6664 100644
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -86,7 +86,7 @@ class TwopassStatsStore {
 // level of abstraction will be fleshed out as more tests are written.
class Encoder { public: - Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline, + Encoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline, const unsigned long init_flags, TwopassStatsStore *stats) : cfg_(cfg), deadline_(deadline), init_flags_(init_flags), stats_(stats) { memset(&encoder_, 0, sizeof(encoder_)); @@ -177,7 +177,7 @@ class Encoder { cfg_ = *cfg; } - void set_deadline(unsigned long deadline) { deadline_ = deadline; } + void set_deadline(vpx_enc_deadline_t deadline) { deadline_ = deadline; } protected: virtual vpx_codec_iface_t *CodecInterface() const = 0; @@ -196,7 +196,7 @@ class Encoder { vpx_codec_ctx_t encoder_; vpx_codec_enc_cfg_t cfg_; - unsigned long deadline_; + vpx_enc_deadline_t deadline_; unsigned long init_flags_; TwopassStatsStore *stats_; }; @@ -291,7 +291,7 @@ class EncoderTest { vpx_codec_enc_cfg_t cfg_; vpx_codec_dec_cfg_t dec_cfg_; unsigned int passes_; - unsigned long deadline_; + vpx_enc_deadline_t deadline_; TwopassStatsStore stats_; unsigned long init_flags_; vpx_enc_frame_flags_t frame_flags_; diff --git a/test/frame_size_tests.cc b/test/frame_size_tests.cc index 7b6c29a88f..eea5647a78 100644 --- a/test/frame_size_tests.cc +++ b/test/frame_size_tests.cc @@ -18,8 +18,7 @@ namespace { class EncoderWithExpectedError : public ::libvpx_test::Encoder { public: - EncoderWithExpectedError(vpx_codec_enc_cfg_t cfg, - unsigned long deadline, // NOLINT + EncoderWithExpectedError(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline, const unsigned long init_flags, // NOLINT ::libvpx_test::TwopassStatsStore *stats) : ::libvpx_test::Encoder(cfg, deadline, init_flags, stats) {} diff --git a/test/keyframe_test.cc b/test/keyframe_test.cc index 5292bb188d..6a1c99cbe2 100644 --- a/test/keyframe_test.cc +++ b/test/keyframe_test.cc @@ -178,7 +178,7 @@ vpx_image_t *CreateGrayImage(vpx_img_fmt_t fmt, unsigned int w, // Tests kf_max_dist in one-pass encoding with zero lag. void TestKeyframeMaximumInterval(vpx_codec_iface_t *iface, - unsigned long deadline, + vpx_enc_deadline_t deadline, unsigned int kf_max_dist) { vpx_codec_enc_cfg_t cfg; ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, /*usage=*/0), @@ -241,7 +241,7 @@ TEST(KeyframeIntervalTest, KeyframeMaximumInterval) { ifaces.push_back(vpx_codec_vp9_cx()); #endif for (vpx_codec_iface_t *iface : ifaces) { - for (unsigned long deadline : + for (vpx_enc_deadline_t deadline : { VPX_DL_REALTIME, VPX_DL_GOOD_QUALITY, VPX_DL_BEST_QUALITY }) { // Test 0 and 1 (both mean all intra), some powers of 2, some multiples // of 10, and some prime numbers. 
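For illustration, a minimal sketch of a caller using the new alias (assuming
ctx, img and pts were set up as usual; constants as declared in the
vpx/vpx_encoder.h hunk below):

  vpx_enc_deadline_t deadline = VPX_DL_GOOD_QUALITY;
  vpx_codec_err_t res =
      vpx_codec_encode(&ctx, img, pts, /*duration=*/1, /*flags=*/0, deadline);

Since the alias is unsigned long, existing callers continue to compile
unchanged; only the spelling in the signatures changes.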
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index a6f0b4cbcf..c42a837ebe 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -12,7 +12,7 @@ #include "./vp8_rtcd.h" #include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" -#include "vpx/vpx_codec.h" +#include "vpx/vpx_encoder.h" #include "vpx/internal/vpx_codec_internal.h" #include "vpx_version.h" #include "vpx_mem/vpx_mem.h" @@ -776,7 +776,7 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, unsigned long duration, - unsigned long deadline) { + vpx_enc_deadline_t deadline) { int new_qc; #if !(CONFIG_REALTIME_ONLY) @@ -866,7 +866,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, const vpx_image_t *img, vpx_codec_pts_t pts, unsigned long duration, vpx_enc_frame_flags_t enc_flags, - unsigned long deadline) { + vpx_enc_deadline_t deadline) { volatile vpx_codec_err_t res = VPX_CODEC_OK; // Make a copy as volatile to avoid -Wclobbered with longjmp. volatile vpx_enc_frame_flags_t flags = enc_flags; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index b75fc18fd2..568a21c2a1 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1153,7 +1153,7 @@ static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) { static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, unsigned long duration, - unsigned long deadline) { + vpx_enc_deadline_t deadline) { MODE new_mode = BEST; #if CONFIG_REALTIME_ONLY @@ -1298,7 +1298,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, vpx_codec_pts_t pts_val, unsigned long duration, vpx_enc_frame_flags_t enc_flags, - unsigned long deadline) { + vpx_enc_deadline_t deadline) { volatile vpx_codec_err_t res = VPX_CODEC_OK; volatile vpx_enc_frame_flags_t flags = enc_flags; volatile vpx_codec_pts_t pts = pts_val; diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h index aae3218738..60f90aa111 100644 --- a/vpx/internal/vpx_codec_internal.h +++ b/vpx/internal/vpx_codec_internal.h @@ -255,7 +255,7 @@ typedef vpx_codec_err_t (*vpx_codec_encode_fn_t)(vpx_codec_alg_priv_t *ctx, vpx_codec_pts_t pts, unsigned long duration, vpx_enc_frame_flags_t flags, - unsigned long deadline); + vpx_enc_deadline_t deadline); typedef const vpx_codec_cx_pkt_t *(*vpx_codec_get_cx_data_fn_t)( vpx_codec_alg_priv_t *ctx, vpx_codec_iter_t *iter); diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c index 0d6e48015a..017525aeee 100644 --- a/vpx/src/vpx_encoder.c +++ b/vpx/src/vpx_encoder.c @@ -191,7 +191,7 @@ static void FLOATING_POINT_RESTORE() {} vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img, vpx_codec_pts_t pts, unsigned long duration, vpx_enc_frame_flags_t flags, - unsigned long deadline) { + vpx_enc_deadline_t deadline) { vpx_codec_err_t res = VPX_CODEC_OK; if (!ctx || (img && !duration)) diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index c45d1a2ba5..efb1be6f12 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -979,6 +979,12 @@ vpx_codec_err_t vpx_codec_enc_config_set(vpx_codec_ctx_t *ctx, */ vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx); +/*!\brief Encode Deadline + * + * This type indicates a deadline, in microseconds, to be passed to + * vpx_codec_encode(). + */ +typedef unsigned long vpx_enc_deadline_t; /*!\brief deadline parameter analogous to VPx REALTIME mode. */ #define VPX_DL_REALTIME (1) /*!\brief deadline parameter analogous to VPx GOOD QUALITY mode. 
*/ @@ -1024,7 +1030,7 @@ vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx); vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img, vpx_codec_pts_t pts, unsigned long duration, vpx_enc_frame_flags_t flags, - unsigned long deadline); + vpx_enc_deadline_t deadline); /*!\brief Set compressed data output buffer * From a33ac12dc0e6d99ee47f30edd37c4ca5e71b5b1d Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 29 Nov 2023 15:50:56 +0000 Subject: [PATCH 894/926] Specialise Armv8.0 Neon vert convolution for 4-tap filters Add an Armv8.0 MLA Neon implementation of vertical convolution specialised for executing with 4-tap filters (the most common filter size for settings --good --cpu-used=1.) This new path is also used when executing with bilinear (2-tap) filters. Change-Id: I027eaf2d1bb9711c2217cc8aa6b1e379d3e66b26 --- vpx_dsp/arm/mem_neon.h | 10 ++ vpx_dsp/arm/vpx_convolve8_neon.c | 161 +++++++++++++++++++++++++++---- vpx_dsp/arm/vpx_convolve8_neon.h | 21 ++++ vpx_dsp/arm/vpx_convolve_neon.c | 29 +++--- 4 files changed, 187 insertions(+), 34 deletions(-) diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index 586bfb85af..38b0b6c1a9 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -273,6 +273,16 @@ static INLINE void store_u8_8x3(uint8_t *s, const ptrdiff_t p, vst1_u8(s, s2); } +static INLINE void load_u8_8x3(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); +} + static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p, uint8x8_t *const s0, uint8x8_t *const s1, uint8x8_t *const s2, uint8x8_t *const s3) { diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index 8b89862ba9..790c8d8352 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -482,23 +482,115 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } -void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int16x8_t filters = vld1q_s16(filter[y0_q4]); +static INLINE void vpx_convolve_4tap_vert_neon(const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, int w, + int h, const int16x4_t filter) { + if (w == 4) { + uint8x8_t t0, t1, t2, t3, d01, d23; + int16x4_t s0, s1, s2, s3, s4, s5, s6, d0, d1, d2, d3; - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(y_step_q4 == 16); + load_u8_8x3(src, src_stride, &t0, &t1, &t2); + s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); - (void)x0_q4; - (void)x_step_q4; - (void)y_step_q4; + src += 3 * src_stride; - src -= 3 * src_stride; + do { + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * 
src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + + d0 = convolve4_4(s0, s1, s2, s3, filter); + d1 = convolve4_4(s1, s2, s3, s4, filter); + d2 = convolve4_4(s2, s3, s4, s5, filter); + d3 = convolve4_4(s3, s4, s5, s6, filter); + /* We halved the filter values so -1 from right shift. */ + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + int height; + const uint8_t *s; + uint8_t *d; + uint8x8_t t0, t1, t2, t3, d0, d1, d2, d3; + int16x8_t s0, s1, s2, s3, s4, s5, s6; + + do { + load_u8_8x3(src, src_stride, &t0, &t1, &t2); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + + s = src + 3 * src_stride; + d = dst; + height = h; + + do { + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + s3 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + + d0 = convolve4_8(s0, s1, s2, s3, filter); + d1 = convolve4_8(s1, s2, s3, s4, filter); + d2 = convolve4_8(s2, s3, s4, s5, filter); + d3 = convolve4_8(s3, s4, s5, s6, filter); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + s0 = s4; + s1 = s5; + s2 = s6; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void vpx_convolve_8tap_vert_neon(const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, int w, + int h, const int16x8_t filter) { if (w == 4) { uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; @@ -530,10 +622,10 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); @@ -596,10 +688,10 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); 
+ d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -621,6 +713,33 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, } } +void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[y0_q4]) <= 4) { + /* All 4-tap and bilinear filter values are even, so halve them to reduce + * intermediate precision requirements. + */ + const int16x4_t y_filter_4tap = vshr_n_s16(vld1_s16(filter[y0_q4] + 2), 1); + vpx_convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, + w, h, y_filter_4tap); + } else { + const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]); + vpx_convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst, + dst_stride, w, h, y_filter_8tap); + } +} + void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index 031b9eb852..f01d4f6a42 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -351,6 +351,27 @@ static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples, #endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8) +static INLINE int16x4_t convolve4_4(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t filters) { + int16x4_t sum = vmul_lane_s16(s0, filters, 0); + sum = vmla_lane_s16(sum, s1, filters, 1); + sum = vmla_lane_s16(sum, s2, filters, 2); + sum = vmla_lane_s16(sum, s3, filters, 3); + return sum; +} + +static INLINE uint8x8_t convolve4_8(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x4_t filters) { + int16x8_t sum = vmulq_lane_s16(s0, filters, 0); + sum = vmlaq_lane_s16(sum, s1, filters, 1); + sum = vmlaq_lane_s16(sum, s2, filters, 2); + sum = vmlaq_lane_s16(sum, s3, filters, 3); + /* We halved the filter values so -1 from right shift. */ + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, diff --git a/vpx_dsp/arm/vpx_convolve_neon.c b/vpx_dsp/arm/vpx_convolve_neon.c index 830f3176d7..57772ea668 100644 --- a/vpx_dsp/arm/vpx_convolve_neon.c +++ b/vpx_dsp/arm/vpx_convolve_neon.c @@ -12,35 +12,38 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" #include "vpx_ports/mem.h" void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the + /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4). 
*/ uint8_t temp[64 * 72]; - // Account for the vertical phase needing 3 lines prior and 4 lines post - // (+ 1 to make it divisible by 4). - const int intermediate_height = h + 8; + const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8; + /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior + * and vert_filter_taps / 2 lines post. (+1 to make total divisible by 4.) */ + const int intermediate_height = h + vert_filter_taps; + const ptrdiff_t border_offset = vert_filter_taps / 2 - 1; assert(y_step_q4 == 16); assert(x_step_q4 == 16); - /* Filter starting 3 lines back. The neon implementation will ignore the given - * height and filter a multiple of 4 lines. Since this goes in to the temp - * buffer which has lots of extra room and is subsequently discarded this is - * safe if somewhat less than ideal. */ - vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter, - x0_q4, x_step_q4, y0_q4, y_step_q4, w, + /* Filter starting border_offset lines back. The Neon implementation will + * ignore the given height and filter a multiple of 4 lines. Since this goes + * in to the temp buffer which has lots of extra room and is subsequently + * discarded this is safe if somewhat less than ideal. */ + vpx_convolve8_horiz_neon(src - src_stride * border_offset, src_stride, temp, + w, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, intermediate_height); - /* Step into the temp buffer 3 lines to get the actual frame data */ - vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4, - x_step_q4, y0_q4, y_step_q4, w, h); + /* Step into the temp buffer border_offset lines to get actual frame data. */ + vpx_convolve8_vert_neon(temp + w * border_offset, w, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, From 15c2a9a02fa9049430c010c9dea2339440a03add Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 30 Nov 2023 15:37:12 -0800 Subject: [PATCH 895/926] Add a test for b/312517065 Bug: b/312517065 Change-Id: I6b5529a8e034fb0468f110e420fafb4944a19d0f --- test/encode_api_test.cc | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index da3676f6bf..7ca2b5941f 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -601,7 +601,7 @@ TEST(EncodeAPI, PrevMiCheckNullptr) { encoder.Encode(false); } -// This is a test case from clusterfuzz: based on 310477034. +// This is a test case from clusterfuzz: based on b/310477034. // Encode a few frames with multiple change config call // with different frame size. TEST(EncodeAPI, MultipleChangeConfigResize) { @@ -673,6 +673,18 @@ TEST(EncodeAPI, DynamicDeadlineChange) { encoder.Encode(false); } +// This is a test case from clusterfuzz: based on b/312517065. 
+TEST(EncodeAPI, Buganizer312517065) { + VP9Encoder encoder(4); + encoder.Configure(0, 1060, 437, VPX_CBR, VPX_DL_REALTIME); + encoder.Encode(true); + encoder.Configure(10, 33, 437, VPX_VBR, VPX_DL_GOOD_QUALITY); + encoder.Encode(false); + encoder.Configure(6, 327, 269, VPX_VBR, VPX_DL_GOOD_QUALITY); + encoder.Configure(15, 1060, 437, VPX_CBR, VPX_DL_REALTIME); + encoder.Encode(false); +} + class EncodeApiGetTplStatsTest : public ::libvpx_test::EncoderTest, public ::testing::TestWithParam { From d144e6e95d7cb11a0161ddf2b141a0813f0b665e Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 30 Nov 2023 15:02:47 +0000 Subject: [PATCH 896/926] Specialise Armv8.0 Neon horiz convolution for 4-tap filters Add an Armv8.0 MLA Neon implementation of horizontal convolution specialised for executing with 4-tap filters (the most common filter size for settings --good --cpu-used=1.) This new path is also used when executing with bilinear (2-tap) filters. Change-Id: Ic2c3cb307b95964cd0ba86f1c42eece3a8ab7cf4 --- vpx_dsp/arm/vpx_convolve8_neon.c | 162 ++++++++++++++++++++++--------- 1 file changed, 118 insertions(+), 44 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index 790c8d8352..65fb67c984 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -32,33 +32,88 @@ // instructions. This optimization is much faster in speed unit test, but slowed // down the whole decoder by 5%. -void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int16x8_t filters = vld1q_s16(filter[x0_q4]); - uint8x8_t t0, t1, t2, t3; - - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(x_step_q4 == 16); +static INLINE void vpx_convolve_4tap_horiz_neon(const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, int w, + int h, const int16x4_t filter) { + if (w == 4) { + do { + int16x4_t s0[4], s1[4]; + + int16x8_t t0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src))); + s0[0] = vget_low_s16(vextq_s16(t0, t0, 0)); + s0[1] = vget_low_s16(vextq_s16(t0, t0, 1)); + s0[2] = vget_low_s16(vextq_s16(t0, t0, 2)); + s0[3] = vget_low_s16(vextq_s16(t0, t0, 3)); + + int16x8_t t1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + src_stride))); + s1[0] = vget_low_s16(vextq_s16(t1, t1, 0)); + s1[1] = vget_low_s16(vextq_s16(t1, t1, 1)); + s1[2] = vget_low_s16(vextq_s16(t1, t1, 2)); + s1[3] = vget_low_s16(vextq_s16(t1, t1, 3)); + + int16x4_t d0 = convolve4_4(s0[0], s0[1], s0[2], s0[3], filter); + int16x4_t d1 = convolve4_4(s1[0], s1[1], s1[2], s1[3], filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + + store_u8(dst, dst_stride, d01); + + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h > 0); + } else { + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; + do { + int16x8_t t0[2], t1[2]; + int16x8_t s0[4], s1[4]; + + t0[0] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + t0[1] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 8))); + s0[0] = vextq_s16(t0[0], t0[1], 0); + s0[1] = vextq_s16(t0[0], t0[1], 1); + s0[2] = vextq_s16(t0[0], t0[1], 2); + s0[3] = vextq_s16(t0[0], t0[1], 3); + + t1[0] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + src_stride))); + t1[1] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + src_stride + 8))); + s1[0] = vextq_s16(t1[0], t1[1], 
0); + s1[1] = vextq_s16(t1[0], t1[1], 1); + s1[2] = vextq_s16(t1[0], t1[1], 2); + s1[3] = vextq_s16(t1[0], t1[1], 3); + + uint8x8_t d0 = convolve4_8(s0[0], s0[1], s0[2], s0[3], filter); + uint8x8_t d1 = convolve4_8(s1[0], s1[1], s1[2], s1[3], filter); + + vst1_u8(d, d0); + vst1_u8(d + dst_stride, d1); + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h > 0); + } +} - src -= 3; +static INLINE void vpx_convolve_8tap_horiz_neon(const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, int w, + int h, const int16x8_t filter) { + uint8x8_t t0, t1, t2, t3; if (h == 4) { uint8x8_t d01, d23; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); @@ -83,10 +138,10 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); @@ -149,10 +204,10 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); __builtin_prefetch(src + 7 * src_stride); - d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); transpose_u8_8x4(&d04, &d15, &d26, &d37); @@ -170,14 +225,6 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, int16x8_t s11, s12, s13, s14; do { - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - __builtin_prefetch(src + 7 * src_stride); load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); @@ -212,14 +259,14 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = 
convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); - d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); - d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); - d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); - d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); + d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter); + d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter); + d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter); + d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter); transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); @@ -244,6 +291,33 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } +void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { + /* All 4-tap and bilinear filter values are even, so halve them to reduce + * intermediate precision requirements. + */ + const int16x4_t x_filter_4tap = vshr_n_s16(vld1_s16(filter[x0_q4] + 2), 1); + vpx_convolve_4tap_horiz_neon(src - 1, src_stride, dst, dst_stride, w, h, + x_filter_4tap); + } else { + const int16x8_t x_filter_8tap = vld1q_s16(filter[x0_q4]); + vpx_convolve_8tap_horiz_neon(src - 3, src_stride, dst, dst_stride, w, h, + x_filter_8tap); + } +} + void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, From 845a817c056c05e8fe7ae9298be47b949d8aceee Mon Sep 17 00:00:00 2001 From: Bohan Li Date: Thu, 30 Nov 2023 15:49:38 -0800 Subject: [PATCH 897/926] Fix scaled reference offsets. Since the reference frame is already scaled, do not scale the offsets. BUG: b/311489136, b/312656387 Change-Id: Ib346242e7ec8c4d3ed26668fa4094271218278ed --- test/encode_api_test.cc | 59 +++++++++++++++++++++++++++++++++++ vp9/encoder/vp9_encodeframe.c | 3 +- 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 7ca2b5941f..6205a56ce0 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -685,6 +685,65 @@ TEST(EncodeAPI, Buganizer312517065) { encoder.Encode(false); } +// This is a test case from clusterfuzz: based on b/311489136. +// Encode a few frames with multiple change config call +// with different frame size. +TEST(EncodeAPI, Buganizer311489136) { + VP9Encoder encoder(1); + + // Set initial config. + encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode first frame. + encoder.Encode(true); + + // Change config. + encoder.Configure(3, 1678, 202, VPX_CBR, VPX_DL_GOOD_QUALITY); + + // Encode 2nd frame with new config, set delta frame. + encoder.Encode(false); + + // Change config again. + encoder.Configure(8, 1037, 476, VPX_CBR, VPX_DL_REALTIME); + + // Encode 3rd frame with new config, set delta frame. 
+ encoder.Encode(false); + + // Change config again. + encoder.Configure(0, 580, 620, VPX_CBR, VPX_DL_GOOD_QUALITY); + + // Encode 4th frame with same config, set delta frame. + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/312656387. +// Encode a few frames with multiple change config call +// with different frame size. +TEST(EncodeAPI, Buganizer312656387) { + VP9Encoder encoder(1); + + // Set initial config. + encoder.Configure(16, 1, 1024, VPX_CBR, VPX_DL_REALTIME); + + // Change config. + encoder.Configure(15, 1, 1024, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(true); + + // Change config again. + encoder.Configure(14, 1, 595, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 2nd frame with new config. + encoder.Encode(true); + + // Change config again. + encoder.Configure(2, 1, 1024, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 3rd frame with new config, set delta frame. + encoder.Encode(false); +} + class EncodeApiGetTplStatsTest : public ::libvpx_test::EncoderTest, public ::testing::TestWithParam { diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index b98fd84579..ddf47c1288 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -3451,8 +3451,7 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x, assert(yv12 != NULL); if (!yv12) return; - vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, - &cm->frame_refs[ref - 1].sf); + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, NULL); mi->ref_frame[0] = ref; mi->ref_frame[1] = NO_REF_FRAME; mi->sb_type = bsize; From 070d7e5cf339845b0c280687697f7e8dd0444098 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 30 Nov 2023 10:08:18 -0800 Subject: [PATCH 898/926] Document vpx_codec_decode() ignores deadline param The changes in this CL show that both the VP8 and VP9 implementations of the decode function eventually discard the deadline parameter. Change the code to ignore the deadline parameter in vpx_codec_decode() without passing it to the decode function, and document that the deadline parameter is ignored and 0 should be passed. 
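For illustration, here is a minimal caller-side sketch of the API after this change (decode_frame() is a hypothetical wrapper, not part of this CL): the deadline argument is simply passed as 0.

#include "vpx/vpx_decoder.h"

/* Hypothetical wrapper for illustration only: since the deadline parameter
 * is now ignored, callers pass 0 as documented. */
static vpx_codec_err_t decode_frame(vpx_codec_ctx_t *ctx, const uint8_t *data,
                                    unsigned int data_sz) {
  return vpx_codec_decode(ctx, data, data_sz, /*user_priv=*/NULL,
                          /*deadline=*/0);
}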
Change-Id: Ia977e16cdbdf97901207aa2d749887980137c4c0 --- vp8/common/onyxd.h | 3 +-- vp8/decoder/onyxd_if.c | 6 +----- vp8/decoder/onyxd_int.h | 1 - vp8/vp8_dx_iface.c | 8 +++----- vp9/vp9_dx_iface.c | 10 ++++------ vpx/internal/vpx_codec_internal.h | 3 +-- vpx/src/vpx_decoder.c | 7 +++---- vpx/vpx_decoder.h | 2 ++ 8 files changed, 15 insertions(+), 25 deletions(-) diff --git a/vp8/common/onyxd.h b/vp8/common/onyxd.h index e4e81aaac5..217a598de7 100644 --- a/vp8/common/onyxd.h +++ b/vp8/common/onyxd.h @@ -41,9 +41,8 @@ void vp8dx_set_setting(struct VP8D_COMP *comp, VP8D_SETTING oxst, int x); int vp8dx_get_setting(struct VP8D_COMP *comp, VP8D_SETTING oxst); -int vp8dx_receive_compressed_data(struct VP8D_COMP *pbi, int64_t time_stamp); +int vp8dx_receive_compressed_data(struct VP8D_COMP *pbi); int vp8dx_get_raw_frame(struct VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, - int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags); int vp8dx_references_buffer(struct VP8Common *oci, int ref_frame); diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c index 765d2ec83e..2248345ba2 100644 --- a/vp8/decoder/onyxd_if.c +++ b/vp8/decoder/onyxd_if.c @@ -302,7 +302,7 @@ static int check_fragments_for_errors(VP8D_COMP *pbi) { return 1; } -int vp8dx_receive_compressed_data(VP8D_COMP *pbi, int64_t time_stamp) { +int vp8dx_receive_compressed_data(VP8D_COMP *pbi) { VP8_COMMON *cm = &pbi->common; int retcode = -1; @@ -368,14 +368,12 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, int64_t time_stamp) { #endif pbi->ready_for_new_data = 0; - pbi->last_time_stamp = time_stamp; decode_exit: vpx_clear_system_state(); return retcode; } int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, - int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags) { int ret = -1; @@ -385,8 +383,6 @@ int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, if (pbi->common.show_frame == 0) return ret; pbi->ready_for_new_data = 1; - *time_stamp = pbi->last_time_stamp; - *time_end_stamp = 0; #if CONFIG_POSTPROC ret = vp8_post_proc_frame(&pbi->common, sd, flags); diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h index 56500a8506..1070849620 100644 --- a/vp8/decoder/onyxd_int.h +++ b/vp8/decoder/onyxd_int.h @@ -99,7 +99,6 @@ typedef struct VP8D_COMP { /* end of threading data */ #endif - int64_t last_time_stamp; int ready_for_new_data; vp8_prob prob_intra; diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index 2e5d6dcfe8..e81deaf4ea 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -276,7 +276,7 @@ static int update_fragments(vpx_codec_alg_priv_t *ctx, const uint8_t *data, static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, const uint8_t *data, unsigned int data_sz, - void *user_priv, long deadline) { + void *user_priv) { volatile vpx_codec_err_t res; volatile unsigned int resolution_change = 0; volatile unsigned int w, h; @@ -508,7 +508,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, pbi->restart_threads = 0; #endif ctx->user_priv = user_priv; - if (vp8dx_receive_compressed_data(pbi, deadline)) { + if (vp8dx_receive_compressed_data(pbi)) { res = update_error_state(ctx, &pbi->common.error); } @@ -529,7 +529,6 @@ static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t *ctx, */ if (!(*iter) && ctx->yv12_frame_buffers.pbi[0]) { YV12_BUFFER_CONFIG sd; - int64_t time_stamp = 0, time_end_stamp = 0; vp8_ppflags_t flags; vp8_zero(flags); @@ -539,8 +538,7 @@ static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t *ctx, flags.noise_level = 
ctx->postproc_cfg.noise_level; } - if (0 == vp8dx_get_raw_frame(ctx->yv12_frame_buffers.pbi[0], &sd, - &time_stamp, &time_end_stamp, &flags)) { + if (0 == vp8dx_get_raw_frame(ctx->yv12_frame_buffers.pbi[0], &sd, &flags)) { yuvconfig2image(&ctx->img, &sd, ctx->user_priv); img = &ctx->img; diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index a242c776cd..860f721dc5 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -305,9 +305,7 @@ static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx, static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, const uint8_t **data, unsigned int data_sz, - void *user_priv, int64_t deadline) { - (void)deadline; - + void *user_priv) { // Determine the stream parameters. Note that we rely on peek_si to // validate that we have a buffer that does not wrap around the top // of the heap. @@ -342,7 +340,7 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, const uint8_t *data, unsigned int data_sz, - void *user_priv, long deadline) { + void *user_priv) { const uint8_t *data_start = data; vpx_codec_err_t res; uint32_t frame_sizes[8]; @@ -382,7 +380,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_CORRUPT_FRAME; } - res = decode_one(ctx, &data_start_copy, frame_size, user_priv, deadline); + res = decode_one(ctx, &data_start_copy, frame_size, user_priv); if (res != VPX_CODEC_OK) return res; data_start += frame_size; @@ -391,7 +389,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, const uint8_t *const data_end = data + data_sz; while (data_start < data_end) { const uint32_t frame_size = (uint32_t)(data_end - data_start); - res = decode_one(ctx, &data_start, frame_size, user_priv, deadline); + res = decode_one(ctx, &data_start, frame_size, user_priv); if (res != VPX_CODEC_OK) return res; // Account for suboptimal termination by the encoder. diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h index aae3218738..99c1b3d110 100644 --- a/vpx/internal/vpx_codec_internal.h +++ b/vpx/internal/vpx_codec_internal.h @@ -198,8 +198,7 @@ typedef const struct vpx_codec_ctrl_fn_map { typedef vpx_codec_err_t (*vpx_codec_decode_fn_t)(vpx_codec_alg_priv_t *ctx, const uint8_t *data, unsigned int data_sz, - void *user_priv, - long deadline); + void *user_priv); /*!\brief Decoded frames iterator * diff --git a/vpx/src/vpx_decoder.c b/vpx/src/vpx_decoder.c index 427cd1bf43..c79cc708cd 100644 --- a/vpx/src/vpx_decoder.c +++ b/vpx/src/vpx_decoder.c @@ -105,6 +105,7 @@ vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, const uint8_t *data, unsigned int data_sz, void *user_priv, long deadline) { vpx_codec_err_t res; + (void)deadline; /* Sanity checks */ /* NULL data ptr allowed if data_sz is 0 too */ @@ -112,10 +113,8 @@ vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, const uint8_t *data, res = VPX_CODEC_INVALID_PARAM; else if (!ctx->iface || !ctx->priv) res = VPX_CODEC_ERROR; - else { - res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv, - deadline); - } + else + res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv); return SAVE_STATUS(ctx, res); } diff --git a/vpx/vpx_decoder.h b/vpx/vpx_decoder.h index 99dd8cf694..3242692e7e 100644 --- a/vpx/vpx_decoder.h +++ b/vpx/vpx_decoder.h @@ -205,6 +205,8 @@ vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx, * this frame. 
* \param[in] deadline Soft deadline the decoder should attempt to meet, * in us. Set to zero for unlimited. + * NOTE: The deadline parameter is ignored. Always + * pass 0. * * \return Returns #VPX_CODEC_OK if the coded data was processed completely * and future pictures can be decoded without error. Otherwise, From 5cad6fdc9280ee656e91905250e7675e694a1c24 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 1 Dec 2023 12:13:01 -0800 Subject: [PATCH 899/926] CHANGELOG: add CVE for issue #1642 CVE-2023-6349 was reserved for this issue. It's not yet published. Bug: webm:1642, b:302710624 Change-Id: Iaab2a0bcae449a45e35678f5c049413fe0a4d2a4 --- CHANGELOG | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG b/CHANGELOG index 5a8605a73d..21070785ed 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -11,7 +11,7 @@ - Bug fixes: https://crbug.com/1486441 (CVE-2023-5217) - Fix to a crash related to VP9 encoding (#1642) + Fix to a crash related to VP9 encoding (#1642, CVE-2023-6349) 2023-01-31 v1.13.0 "Ugly Duckling" This release includes more Neon and AVX2 optimizations, adds a new codec From a9f1bfdb8e93a742da9a14d4a9d3b1d847edd70d Mon Sep 17 00:00:00 2001 From: Bohan Li Date: Thu, 30 Nov 2023 16:18:25 -0800 Subject: [PATCH 900/926] Fix edge case when downsizing to one. BUG: b/310329177 Change-Id: I2ebf4165adbc7351d6cc73554827812dedc4d362 --- test/encode_api_test.cc | 19 +++++++++++++++++++ vp9/encoder/vp9_resize.c | 6 ++++++ 2 files changed, 25 insertions(+) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 6205a56ce0..030b376949 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -744,6 +744,25 @@ TEST(EncodeAPI, Buganizer312656387) { encoder.Encode(false); } +// This is a test case from clusterfuzz: based on b/310329177. +// Encode a few frames with multiple change config call +// with different frame size. +TEST(EncodeAPI, Buganizer310329177) { + VP9Encoder encoder(6); + + // Set initial config. + encoder.Configure(10, 41, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(true); + + // Change config. + encoder.Configure(16, 1, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode 2nd frame with new config, set delta frame. + encoder.Encode(false); +} + class EncodeApiGetTplStatsTest : public ::libvpx_test::EncoderTest, public ::testing::TestWithParam { diff --git a/vp9/encoder/vp9_resize.c b/vp9/encoder/vp9_resize.c index 7486dee25b..ca55ec9886 100644 --- a/vp9/encoder/vp9_resize.c +++ b/vp9/encoder/vp9_resize.c @@ -360,6 +360,12 @@ static int get_down2_steps(int in_length, int out_length) { while ((proj_in_length = get_down2_length(in_length, 1)) >= out_length) { ++steps; in_length = proj_in_length; + if (in_length == 1) { + // Special case: we break because any further calls to get_down2_length() + // will be with length == 1, which returns 1, resulting in an infinite + // loop. + break; + } } return steps; } From bf0755418357237f6ea4794dfab3c474d06a0937 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 30 Nov 2023 14:27:06 -0800 Subject: [PATCH 901/926] Add the needed Android API level predicates. fseeko and ftello are available on Android only from API level 24. Add the needed guards for these functions. Suggested by Yifan Yang.
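For reference, the guard below roughly reduces to the following decision tree (an illustrative sketch of the preprocessor logic, not the literal patch text):

/* Sketch only: on 32-bit Android before API level 24 with
 * _FILE_OFFSET_BITS == 64, fseeko/ftello do not exist, so fall back to
 * fseek/ftell with long offsets; otherwise use the POSIX.1 interfaces. */
#if defined(__ANDROID__) && __ANDROID_API__ < 24 && !defined(__LP64__) && \
    defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS == 64
#define fseeko fseek
#define ftello ftell
typedef long FileOffset;
#else
#include <sys/types.h> /* POSIX.1 provides off_t, fseeko, ftello */
typedef off_t FileOffset;
#endif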
Change-Id: I3a6721d31e1d961ab10b434ea6e92959bd5a70ab --- tools_common.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools_common.h b/tools_common.h index 9850907c15..e2942d04b8 100644 --- a/tools_common.h +++ b/tools_common.h @@ -32,7 +32,12 @@ typedef int64_t FileOffset; #define fseeko fseeko64 #define ftello ftello64 typedef off64_t FileOffset; -#elif CONFIG_OS_SUPPORT +#elif CONFIG_OS_SUPPORT && \ + !(defined(__ANDROID__) && __ANDROID_API__ < 24 && !defined(__LP64__) && \ + defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS == 64) +/* POSIX.1 has fseeko and ftello. fseeko and ftello are not available before + * Android API level 24. See + * https://android.googlesource.com/platform/bionic/+/main/docs/32-bit-abi.md */ #include <sys/types.h> /* NOLINT */ typedef off_t FileOffset; /* Use 32-bit file operations in WebM file format when building ARM From 5dcb4c17402ddae13afe5cb115bad09935fdd3d1 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Wed, 29 Nov 2023 12:42:38 -0800 Subject: [PATCH 902/926] Define VPX_DL_* macros as unsigned long constants Define the VPX_DL_REALTIME, VPX_DL_GOOD_QUALITY, and VPX_DL_BEST_QUALITY macros as unsigned long, because the deadline parameter of vpx_codec_encode() is of the unsigned long type. This enables C++ templates to deduce the unsigned long type from these macros. Change-Id: I2173e3bbf5e15c84c11843790df93a497a35ed7d --- vpx/vpx_encoder.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index efb1be6f12..f3799d674c 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -986,11 +986,11 @@ vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx); */ typedef unsigned long vpx_enc_deadline_t; /*!\brief deadline parameter analogous to VPx REALTIME mode. */ -#define VPX_DL_REALTIME (1) +#define VPX_DL_REALTIME 1ul /*!\brief deadline parameter analogous to VPx GOOD QUALITY mode. */ -#define VPX_DL_GOOD_QUALITY (1000000) +#define VPX_DL_GOOD_QUALITY 1000000ul /*!\brief deadline parameter analogous to VPx BEST QUALITY mode. */ -#define VPX_DL_BEST_QUALITY (0) +#define VPX_DL_BEST_QUALITY 0ul /*!\brief Encode a frame * * Encodes a video frame at the given "presentation time." The presentation From f10481dc0a49b5d53428560de2a2eee43f9ed60f Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Wed, 22 Nov 2023 17:09:14 -0800 Subject: [PATCH 903/926] Set skip_recode=0 in nonrd_pick_sb_modes Need to set skip_recode properly so that vp9_encode_block_intra() can work properly when it is called by block_rd_txfm(). We cannot skip "recode" because it is still at the rd search stage. Bug: b/310340241 Change-Id: I7d7600ef72addd341636549c2dad1868ad90e1cb --- test/encode_api_test.cc | 19 +++++++++++++++++++ vp9/encoder/vp9_encodeframe.c | 2 ++ 2 files changed, 21 insertions(+) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 785875c229..4ef83d75b3 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -673,6 +673,25 @@ TEST(EncodeAPI, DynamicDeadlineChange) { encoder.Encode(false); } +TEST(EncodeAPI, Buganizer310340241) { + VP9Encoder encoder(-6); + + // Set initial config, in particular set deadline to GOOD mode. + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 1st frame. + encoder.Encode(true); + + // Encode 2nd frame, delta frame. + encoder.Encode(false); + + // Change config: change deadline to REALTIME. + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode 3rd frame with new config, set key frame.
+ encoder.Encode(true); +} + // This is a test case from clusterfuzz: based on b/312517065. TEST(EncodeAPI, Buganizer312517065) { VP9Encoder encoder(4); diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index ddf47c1288..d6ac04400d 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -4714,6 +4714,8 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, set_segment_index(cpi, x, mi_row, mi_col, bsize, 0); + x->skip_recode = 0; + mi = xd->mi[0]; mi->sb_type = bsize; From db83435afbeee1a31a9e6cdeca38407ebc724bc8 Mon Sep 17 00:00:00 2001 From: Gerda Zsejke More Date: Tue, 28 Nov 2023 11:32:42 +0100 Subject: [PATCH 904/926] configure: Add darwin23 support Add target arm64-darwin23-gcc, x86_64-darwin23-gcc for MacOS 14. Change-Id: I6b68a6a61d51aaa78ec11a5055bb95ce77a81d9c --- build/make/configure.sh | 2 +- configure | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/build/make/configure.sh b/build/make/configure.sh index 54fb1daf4d..869793a296 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -792,7 +792,7 @@ process_common_toolchain() { tgt_isa=x86_64 tgt_os=`echo $gcctarget | sed 's/.*\(darwin1[0-9]\).*/\1/'` ;; - *darwin2[0-2]*) + *darwin2[0-3]*) tgt_isa=`uname -m` tgt_os=`echo $gcctarget | sed 's/.*\(darwin2[0-9]\).*/\1/'` ;; diff --git a/configure b/configure index 6b910160a8..b212e0709d 100755 --- a/configure +++ b/configure @@ -102,6 +102,7 @@ all_platforms="${all_platforms} arm64-darwin-gcc" all_platforms="${all_platforms} arm64-darwin20-gcc" all_platforms="${all_platforms} arm64-darwin21-gcc" all_platforms="${all_platforms} arm64-darwin22-gcc" +all_platforms="${all_platforms} arm64-darwin23-gcc" all_platforms="${all_platforms} arm64-linux-gcc" all_platforms="${all_platforms} arm64-win64-gcc" all_platforms="${all_platforms} arm64-win64-vs15" @@ -165,6 +166,7 @@ all_platforms="${all_platforms} x86_64-darwin19-gcc" all_platforms="${all_platforms} x86_64-darwin20-gcc" all_platforms="${all_platforms} x86_64-darwin21-gcc" all_platforms="${all_platforms} x86_64-darwin22-gcc" +all_platforms="${all_platforms} x86_64-darwin23-gcc" all_platforms="${all_platforms} x86_64-iphonesimulator-gcc" all_platforms="${all_platforms} x86_64-linux-gcc" all_platforms="${all_platforms} x86_64-linux-icc" From 9ad598f249ee5af0ad211797063fba6de8d1ff80 Mon Sep 17 00:00:00 2001 From: Bohan Li Date: Mon, 4 Dec 2023 13:12:46 -0800 Subject: [PATCH 905/926] Improve test comments. Change-Id: I42dddb946193e30cf07e39b43eaad051c5da479a --- test/encode_api_test.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 4ef83d75b3..65501ee8ca 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -602,8 +602,8 @@ TEST(EncodeAPI, PrevMiCheckNullptr) { } // This is a test case from clusterfuzz: based on b/310477034. -// Encode a few frames with multiple change config call -// with different frame size. +// Encode a few frames with multiple change config calls +// with different frame sizes. TEST(EncodeAPI, MultipleChangeConfigResize) { VP9Encoder encoder(3); @@ -705,8 +705,8 @@ TEST(EncodeAPI, Buganizer312517065) { } // This is a test case from clusterfuzz: based on b/311489136. -// Encode a few frames with multiple change config call -// with different frame size. +// Encode a few frames with multiple change config calls +// with different frame sizes. 
TEST(EncodeAPI, Buganizer311489136) { VP9Encoder encoder(1); @@ -736,8 +736,8 @@ TEST(EncodeAPI, Buganizer311489136) { } // This is a test case from clusterfuzz: based on b/312656387. -// Encode a few frames with multiple change config call -// with different frame size. +// Encode a few frames with multiple change config calls +// with different frame sizes. TEST(EncodeAPI, Buganizer312656387) { VP9Encoder encoder(1); @@ -764,8 +764,8 @@ TEST(EncodeAPI, Buganizer312656387) { } // This is a test case from clusterfuzz: based on b/310329177. -// Encode a few frames with multiple change config call -// with different frame size. +// Encode a few frames with multiple change config calls +// with different frame sizes. TEST(EncodeAPI, Buganizer310329177) { VP9Encoder encoder(6); From 8bf3649d410cd68076e532e697f34dcec3f87ce7 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Mon, 4 Dec 2023 17:32:51 -0800 Subject: [PATCH 906/926] Fix a bug in frame scaling This change fixed a corner case bug revealed by b/311394513. During frame scaling, vpx_highbd_convolve8() and vpx_scaled_2d() require that both x_step_q4 and y_step_q4 be less than or equal to a defined value. Otherwise, it needs to call vp9_scale_and_extend_frame_nonnormative(), which supports arbitrary scaling. The fix was done in both the LBD and HBD functions. Bug: b/311394513 Change-Id: Id0d34e7910ec98859030ef968ac19331488046d4 --- test/encode_api_test.cc | 19 +++++++++++++++++++ vp9/encoder/vp9_encoder.c | 31 ++++++++++++++++++++++++------- vp9/encoder/vp9_encoder.h | 8 ++++++++ vp9/encoder/vp9_frame_scale.c | 18 ++++++++++++++++++ 4 files changed, 69 insertions(+), 7 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 65501ee8ca..f046a9db39 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -782,6 +782,25 @@ TEST(EncodeAPI, Buganizer310329177) { encoder.Encode(false); } +// This is a test case from clusterfuzz: based on b/311394513. +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, Buganizer311394513) { + VP9Encoder encoder(-7); + + // Set initial config. + encoder.Configure(0, 5, 9, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(false); + + // Change config. + encoder.Configure(5, 2, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode 2nd frame with new config.
+ encoder.Encode(true); +} + class EncodeApiGetTplStatsTest : public ::libvpx_test::EncoderTest, public ::testing::TestWithParam { diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index be55150140..20e35077cd 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -3079,12 +3079,11 @@ void vp9_write_yuv_rec_frame(VP9_COMMON *cm) { #endif #if CONFIG_VP9_HIGHBITDEPTH -static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, - int bd) { +void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int bd) { #else -static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst) { +void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst) { #endif // CONFIG_VP9_HIGHBITDEPTH // TODO(dkovalev): replace YV12_BUFFER_CONFIG with vpx_image_t int i; @@ -3129,6 +3128,23 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, const int src_h = src->y_crop_height; const int dst_w = dst->y_crop_width; const int dst_h = dst->y_crop_height; + + // The issue b/311394513 reveals a corner case bug. + // For bd = 8, vpx_scaled_2d() requires both x_step_q4 and y_step_q4 are less + // than or equal to 64. For bd >= 10, vpx_highbd_convolve8() requires both + // x_step_q4 and y_step_q4 are less than or equal to 32. If this condition + // isn't met, it needs to call vp9_scale_and_extend_frame_nonnormative() that + // supports arbitrary scaling. + const int x_step_q4 = 16 * src_w / dst_w; + const int y_step_q4 = 16 * src_h / dst_h; + const int is_arbitrary_scaling = + (bd == 8 && (x_step_q4 > 64 || y_step_q4 > 64)) || + (bd >= 10 && (x_step_q4 > 32 || y_step_q4 > 32)); + if (is_arbitrary_scaling) { + vp9_scale_and_extend_frame_nonnormative(src, dst, bd); + return; + } + const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer, src->v_buffer }; const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; @@ -4993,13 +5009,14 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required( scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth, filter_type, phase_scaler); else - scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth); + vp9_scale_and_extend_frame_nonnormative(unscaled, scaled, + (int)cm->bit_depth); #else if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) && unscaled->y_height <= (scaled->y_height << 1)) vp9_scale_and_extend_frame(unscaled, scaled, filter_type, phase_scaler); else - scale_and_extend_frame_nonnormative(unscaled, scaled); + vp9_scale_and_extend_frame_nonnormative(unscaled, scaled); #endif // CONFIG_VP9_HIGHBITDEPTH return scaled; } else { diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 160de0064f..83b7081e7b 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1382,6 +1382,14 @@ void vp9_get_ref_frame_info(FRAME_UPDATE_TYPE update_type, int ref_frame_flags, void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv); +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int bd); +#else +void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst); +#endif // CONFIG_VP9_HIGHBITDEPTH + YV12_BUFFER_CONFIG *vp9_svc_twostage_scale( VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, YV12_BUFFER_CONFIG *scaled_temp, 
INTERP_FILTER filter_type, diff --git a/vp9/encoder/vp9_frame_scale.c b/vp9/encoder/vp9_frame_scale.c index a410d0407f..22b3f05579 100644 --- a/vp9/encoder/vp9_frame_scale.c +++ b/vp9/encoder/vp9_frame_scale.c @@ -12,6 +12,7 @@ #include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" #include "vp9/common/vp9_blockd.h" +#include "vp9/encoder/vp9_encoder.h" #include "vpx_dsp/vpx_filter.h" #include "vpx_scale/yv12config.h" @@ -91,6 +92,23 @@ void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, { const int dst_w = dst->y_crop_width; const int dst_h = dst->y_crop_height; + + // The issue b/311394513 reveals a corner case bug. vpx_scaled_2d() requires + // both x_step_q4 and y_step_q4 are less than or equal to 64. Otherwise, it + // needs to call vp9_scale_and_extend_frame_nonnormative() that supports + // arbitrary scaling. + const int x_step_q4 = 16 * src_w / dst_w; + const int y_step_q4 = 16 * src_h / dst_h; + if (x_step_q4 > 64 || y_step_q4 > 64) { + // This function is only called while cm->bit_depth is VPX_BITS_8. +#if CONFIG_VP9_HIGHBITDEPTH + vp9_scale_and_extend_frame_nonnormative(src, dst, (int)VPX_BITS_8); +#else + vp9_scale_and_extend_frame_nonnormative(src, dst); +#endif // CONFIG_VP9_HIGHBITDEPTH + return; + } + for (i = 0; i < MAX_MB_PLANE; ++i) { const int factor = (i == 0 || i == 3 ? 1 : 2); const int src_stride = src_strides[i]; From 7cfc58de4894153b20a144ed142956ac2f2e7aa6 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 5 Dec 2023 13:45:05 -0500 Subject: [PATCH 907/926] RTC RC: add screen content support for vp8 Bug: b/281463780 Change-Id: I446c00bf8d794aa9134e4fe37960dd8a465448a4 --- test/vp8_ratectrl_rtc_test.cc | 21 +++++++++++++++++++++ vp8/vp8_ratectrl_rtc.cc | 2 +- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/test/vp8_ratectrl_rtc_test.cc b/test/vp8_ratectrl_rtc_test.cc index 9fbc1d4d98..50478f7635 100644 --- a/test/vp8_ratectrl_rtc_test.cc +++ b/test/vp8_ratectrl_rtc_test.cc @@ -45,6 +45,7 @@ struct Vp8RCTestVideo { const Vp8RCTestVideo kVp8RCTestVectors[] = { Vp8RCTestVideo("niklas_640_480_30.yuv", 640, 480, 470), Vp8RCTestVideo("desktop_office1.1280_720-020.yuv", 1280, 720, 300), + Vp8RCTestVideo("hantro_collage_w352h288.yuv", 352, 288, 100), }; class Vp8RcInterfaceTest @@ -128,6 +129,9 @@ class Vp8RcInterfaceTest encoder->Control(VP8E_SET_CPUUSED, -6); encoder->Control(VP8E_SET_RTC_EXTERNAL_RATECTRL, 1); encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000); + if (rc_cfg_.is_screen) { + encoder->Control(VP8E_SET_SCREEN_CONTENT_MODE, 1); + } } else if (frame_params_.frame_type == libvpx::RcFrameType::kInterFrame) { // Disable golden frame update.
frame_flags_ |= VP8_EFLAG_NO_UPD_GF; @@ -171,6 +175,21 @@ class Vp8RcInterfaceTest ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } + void RunOneLayerScreen() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + SetConfig(); + rc_cfg_.is_screen = true; + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + void RunOneLayerDropFrames() { test_video_ = GET_PARAM(2); target_bitrate_ = GET_PARAM(1); @@ -377,6 +396,8 @@ class Vp8RcInterfaceTest TEST_P(Vp8RcInterfaceTest, OneLayer) { RunOneLayer(); } +TEST_P(Vp8RcInterfaceTest, OneLayerScreen) { RunOneLayerScreen(); } + TEST_P(Vp8RcInterfaceTest, OneLayerDropFrames) { RunOneLayerDropFrames(); } TEST_P(Vp8RcInterfaceTest, OneLayerPeriodicKey) { RunPeriodicKey(); } diff --git a/vp8/vp8_ratectrl_rtc.cc b/vp8/vp8_ratectrl_rtc.cc index dd3c8e623b..261c316fd1 100644 --- a/vp8/vp8_ratectrl_rtc.cc +++ b/vp8/vp8_ratectrl_rtc.cc @@ -141,7 +141,7 @@ bool VP8RateControlRTC::UpdateRateControl( cpi_->prior_key_frame_distance[i] = static_cast<int>(cpi_->output_framerate); } - + oxcf->screen_content_mode = rc_cfg.is_screen; if (oxcf->number_of_layers > 1 || prev_number_of_layers > 1) { memcpy(oxcf->target_bitrate, rc_cfg.layer_target_bitrate, sizeof(rc_cfg.layer_target_bitrate)); From 97184161d5c93e5e69d0de6b064b005e5c82d342 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Tue, 5 Dec 2023 10:55:16 -0800 Subject: [PATCH 908/926] Add "IWYU pragma: export" to some public headers vpx/vpx_integer.h is clearly intended as the facade header for the Standard C Library headers <inttypes.h>, <stddef.h>, and <stdint.h>. It is reasonable to expect that vpx/vpx_decoder.h and vpx/vpx_encoder.h should provide the symbols from vpx/vpx_codec.h.
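As a consumer-side sketch (count_nonzero() is a hypothetical function, not part of this change), a translation unit can now include only vpx/vpx_integer.h and use the fixed-width types without include-what-you-use demanding direct <stdint.h>/<stddef.h> includes:

#include "vpx/vpx_integer.h" /* re-exports <inttypes.h>, <stddef.h>, <stdint.h> */

/* Illustrative helper: uint8_t and size_t come via vpx_integer.h. */
static size_t count_nonzero(const uint8_t *buf, size_t n) {
  size_t i, count = 0;
  for (i = 0; i < n; ++i) count += (buf[i] != 0);
  return count;
}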
Change-Id: I220797e63b2efc3dd9e2ac197fe2f918bf80d247 --- vpx/vpx_decoder.h | 2 +- vpx/vpx_encoder.h | 2 +- vpx/vpx_integer.h | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vpx/vpx_decoder.h b/vpx/vpx_decoder.h index 3242692e7e..0e9611e31f 100644 --- a/vpx/vpx_decoder.h +++ b/vpx/vpx_decoder.h @@ -29,7 +29,7 @@ extern "C" { #endif -#include "./vpx_codec.h" +#include "./vpx_codec.h" // IWYU pragma: export #include "./vpx_frame_buffer.h" /*!\brief Current ABI version number diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index f3799d674c..18e3862bd7 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -29,7 +29,7 @@ extern "C" { #endif -#include "./vpx_codec.h" +#include "./vpx_codec.h" // IWYU pragma: export #include "./vpx_ext_ratectrl.h" #include "./vpx_tpl.h" diff --git a/vpx/vpx_integer.h b/vpx/vpx_integer.h index 4129d156f8..34e3796411 100644 --- a/vpx/vpx_integer.h +++ b/vpx/vpx_integer.h @@ -12,7 +12,7 @@ #define VPX_VPX_VPX_INTEGER_H_ /* get ptrdiff_t, size_t, wchar_t, NULL */ -#include <stddef.h> +#include <stddef.h> // IWYU pragma: export #if defined(_MSC_VER) #define VPX_FORCE_INLINE __forceinline @@ -34,7 +34,7 @@ #endif #endif // __cplusplus -#include <inttypes.h> -#include <stdint.h> +#include <inttypes.h> // IWYU pragma: export +#include <stdint.h> // IWYU pragma: export #endif // VPX_VPX_VPX_INTEGER_H_ From 12e928cb342d1d4cc8ac3c71b26da45e4488cb88 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Tue, 5 Dec 2023 11:59:15 -0800 Subject: [PATCH 909/926] Add unittest for issue b/314857577 Bug: b/314857577 Change-Id: I591036c1ad3362023686d395adb4783c51baa62d --- test/encode_api_test.cc | 49 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index f046a9db39..08e7539272 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -801,6 +801,55 @@ TEST(EncodeAPI, Buganizer311394513) { encoder.Encode(true); } +// This is a test case from clusterfuzz: based on b/314857577. +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, Buganizer314857577) { + VP9Encoder encoder(4); + + // Set initial config. + encoder.Configure(12, 1060, 437, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(false); + + // Change config. + encoder.Configure(16, 1060, 1, VPX_CBR, VPX_DL_REALTIME); + + // Encode 2nd frame with new config. + encoder.Encode(false); + + // Encode 3rd frame with new config. + encoder.Encode(true); + + // Change config. + encoder.Configure(15, 33, 437, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 4th frame with new config. + encoder.Encode(true); + + // Encode 5th frame with new config. + encoder.Encode(false); + + // Change config. + encoder.Configure(5, 327, 269, VPX_VBR, VPX_DL_REALTIME); + + // Change config. + encoder.Configure(15, 1060, 437, VPX_CBR, VPX_DL_REALTIME); + + // Encode 6th frame with new config. + encoder.Encode(false); + + // Encode 7th frame with new config. + encoder.Encode(false); + + // Change config. + encoder.Configure(4, 1060, 437, VPX_VBR, VPX_DL_REALTIME); + + // Encode 8th frame with new config.
+ encoder.Encode(false); +} + class EncodeApiGetTplStatsTest : public ::libvpx_test::EncoderTest, public ::testing::TestWithParam { From 4c2435c33e12a72640e96262f982a9f5f5c513cd Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 5 Dec 2023 10:46:52 -0500 Subject: [PATCH 910/926] Fix several clang-tidy complaints Change-Id: I78721d6b7ed692ad9363b5cac4e3324a3136d5b6 --- test/encode_api_test.cc | 2 +- test/sum_squares_test.cc | 1 + vp9/encoder/vp9_encodeframe.c | 1 + vpx_dsp/arm/highbd_sse_neon.c | 1 + vpx_dsp/arm/sse_neon.c | 2 ++ vpx_dsp/sse.c | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- vpx_dsp/x86/sse_avx2.c | 3 ++- 8 files changed, 10 insertions(+), 3 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 08e7539272..99f38c3af9 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -517,7 +517,7 @@ vpx_image_t *CreateImage(const unsigned int width, const unsigned int height) { // Emulates the WebCodecs VideoEncoder interface. class VP9Encoder { public: - VP9Encoder(int speed) : speed_(speed) {} + explicit VP9Encoder(int speed) : speed_(speed) {} ~VP9Encoder(); void Configure(unsigned int threads, unsigned int width, unsigned int height, diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc index 725d5eb853..d3c76a34d2 100644 --- a/test/sum_squares_test.cc +++ b/test/sum_squares_test.cc @@ -9,6 +9,7 @@ */ #include +#include #include #include #include diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index d6ac04400d..7ab183e28a 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -48,6 +48,7 @@ #include "vp9/encoder/vp9_encodeframe.h" #include "vp9/encoder/vp9_encodemb.h" #include "vp9/encoder/vp9_encodemv.h" +#include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_ethread.h" #include "vp9/encoder/vp9_extend.h" #include "vp9/encoder/vp9_multi_thread.h" diff --git a/vpx_dsp/arm/highbd_sse_neon.c b/vpx_dsp/arm/highbd_sse_neon.c index 717ad6b19a..8e389df8b8 100644 --- a/vpx_dsp/arm/highbd_sse_neon.c +++ b/vpx_dsp/arm/highbd_sse_neon.c @@ -9,6 +9,7 @@ */ #include +#include #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/sum_neon.h" diff --git a/vpx_dsp/arm/sse_neon.c b/vpx_dsp/arm/sse_neon.c index 0b4a6e504a..75ee7e7816 100644 --- a/vpx_dsp/arm/sse_neon.c +++ b/vpx_dsp/arm/sse_neon.c @@ -9,7 +9,9 @@ */ #include +#include +#include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" diff --git a/vpx_dsp/sse.c b/vpx_dsp/sse.c index 6cb4b705f8..c9d751859d 100644 --- a/vpx_dsp/sse.c +++ b/vpx_dsp/sse.c @@ -19,6 +19,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" int64_t vpx_sse_c(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height) { diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index e9d63f6ef2..18087e25d9 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -744,7 +744,7 @@ () add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; specialize qw/vpx_subtract_block neon msa mmi sse2 avx2 vsx lsx/; -add_proto qw/int64_t/, "vpx_sse", "const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height"; +add_proto qw/int64_t/, "vpx_sse", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height"; 
specialize qw/vpx_sse sse4_1 avx2 neon neon_dotprod/; # diff --git a/vpx_dsp/x86/sse_avx2.c b/vpx_dsp/x86/sse_avx2.c index 975446775e..381c1a1e6c 100644 --- a/vpx_dsp/x86/sse_avx2.c +++ b/vpx_dsp/x86/sse_avx2.c @@ -8,8 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include #include +#include +#include #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" From 7dfe343199381bddddc5eaa648e947876979b61b Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 28 Sep 2023 09:26:58 -0700 Subject: [PATCH 911/926] Use vpx_sse instead of vpx_mse to compute SSE Use vpx_sse and vpx_highbd_sse instead of vpx_mse16x16 and vpx_highbd_8_mse16x16 respectively to compute SSE for PSNR calculations. This solves an issue whereby vpx_highbd_8_mse16x16 was being used to calculate SSE for 10- and 12-bit input. This is a port of the libaom CL https://aomedia-review.googlesource.com/c/aom/+/175063 by Jonathan Wright . Bug: webm:1819 Change-Id: I37e3ac72835e67ccb44ac89a4ed16df62c2169a7 --- vpx_dsp/psnr.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/vpx_dsp/psnr.c b/vpx_dsp/psnr.c index f0d4e927ae..4ee4130a21 100644 --- a/vpx_dsp/psnr.c +++ b/vpx_dsp/psnr.c @@ -45,14 +45,14 @@ static int64_t encoder_sse(const uint8_t *a, int a_stride, const uint8_t *b, } #if CONFIG_VP9_HIGHBITDEPTH -static int64_t encoder_highbd_8_sse(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, - int h) { +static int64_t encoder_highbd_sse(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, + int h) { int i, j; int64_t sse = 0; - uint16_t *a = CONVERT_TO_SHORTPTR(a8); - uint16_t *b = CONVERT_TO_SHORTPTR(b8); + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); for (i = 0; i < h; i++) { for (j = 0; j < w; j++) { @@ -88,10 +88,8 @@ static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b, for (y = 0; y < height / 16; ++y) { const uint8_t *pa = a; const uint8_t *pb = b; - unsigned int sse; for (x = 0; x < width / 16; ++x) { - vpx_mse16x16(pa, a_stride, pb, b_stride, &sse); - total_sse += sse; + total_sse += vpx_sse(pa, a_stride, pb, b_stride, 16, 16); pa += 16; pb += 16; @@ -131,21 +129,19 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b, const int dw = width % 16; const int dh = height % 16; if (dw > 0) { - total_sse += encoder_highbd_8_sse(&a[width - dw], a_stride, &b[width - dw], - b_stride, dw, height); + total_sse += encoder_highbd_sse(&a[width - dw], a_stride, &b[width - dw], + b_stride, dw, height); } if (dh > 0) { - total_sse += encoder_highbd_8_sse(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, - width - dw, dh); + total_sse += encoder_highbd_sse(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, + width - dw, dh); } for (y = 0; y < height / 16; ++y) { const uint8_t *pa = a; const uint8_t *pb = b; - unsigned int sse; for (x = 0; x < width / 16; ++x) { - vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse); - total_sse += sse; + total_sse += vpx_highbd_sse(pa, a_stride, pb, b_stride, 16, 16); pa += 16; pb += 16; } From c4c92080545970899488ab27944792a95c7131a2 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Tue, 5 Dec 2023 14:29:37 -0800 Subject: [PATCH 912/926] Remove SSE code for 128x* blocks The maximum block size is 64x64 in VP9. 
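For context, the dispatch that remains has roughly this shape (a sketch reusing the helper names visible in the Neon file; dispatch_sse() itself is a hypothetical stand-in for vpx_sse_neon()):

/* Widths above 64 now simply take the generic fallback, which VP9 never
 * exercises because blocks are capped at 64x64. */
static int64_t dispatch_sse(const uint8_t *src, int src_stride,
                            const uint8_t *ref, int ref_stride, int width,
                            int height) {
  switch (width) {
    case 16: return sse_16xh_neon(src, src_stride, ref, ref_stride, height);
    case 32: return sse_32xh_neon(src, src_stride, ref, ref_stride, height);
    case 64: return sse_64xh_neon(src, src_stride, ref, ref_stride, height);
    default:
      /* The removed 128-wide special case was unreachable dead code. */
      return sse_wxh_neon(src, src_stride, ref, ref_stride, width, height);
  }
}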
Bug: webm:1819 Change-Id: If9802be9f81b51dbcdbc8a68d5afe48ca6d3d0e7 --- vpx_dsp/arm/highbd_sse_neon.c | 51 ---------------------------------- vpx_dsp/arm/sse_neon.c | 24 ---------------- vpx_dsp/arm/sse_neon_dotprod.c | 26 ----------------- vpx_dsp/x86/sse_avx2.c | 34 ----------------------- vpx_dsp/x86/sse_sse4.c | 47 ------------------------------- 5 files changed, 182 deletions(-) diff --git a/vpx_dsp/arm/highbd_sse_neon.c b/vpx_dsp/arm/highbd_sse_neon.c index 8e389df8b8..91dfebf900 100644 --- a/vpx_dsp/arm/highbd_sse_neon.c +++ b/vpx_dsp/arm/highbd_sse_neon.c @@ -43,55 +43,6 @@ static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref, *sse_acc1 = vmlal_u16(*sse_acc1, abs_diff_hi, abs_diff_hi); } -static INLINE int64_t highbd_sse_128xh_neon(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - int height) { - uint32x4_t sse[16]; - highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); - highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); - highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); - highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); - highbd_sse_8x1_init_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]); - highbd_sse_8x1_init_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]); - highbd_sse_8x1_init_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]); - highbd_sse_8x1_init_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]); - highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]); - highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]); - highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]); - highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]); - highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]); - highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]); - highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]); - highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]); - - src += src_stride; - ref += ref_stride; - - while (--height != 0) { - highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); - highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); - highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); - highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); - highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]); - highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]); - highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]); - highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]); - highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]); - highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]); - highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]); - highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]); - highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]); - highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]); - highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]); - highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]); - - src += src_stride; - ref += ref_stride; - } - - return horizontal_long_add_uint32x4_x16(sse); -} - static INLINE int64_t highbd_sse_64xh_neon(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int height) { @@ -280,8 +231,6 @@ int64_t vpx_highbd_sse_neon(const uint8_t *src8, int src_stride, return highbd_sse_32xh_neon(src, 
src_stride, ref, ref_stride, height); case 64: return highbd_sse_64xh_neon(src, src_stride, ref, ref_stride, height); - case 128: - return highbd_sse_128xh_neon(src, src_stride, ref, ref_stride, height); default: return highbd_sse_wxh_neon(src, src_stride, ref, ref_stride, width, height); diff --git a/vpx_dsp/arm/sse_neon.c b/vpx_dsp/arm/sse_neon.c index 75ee7e7816..2dd57e596c 100644 --- a/vpx_dsp/arm/sse_neon.c +++ b/vpx_dsp/arm/sse_neon.c @@ -86,29 +86,6 @@ static INLINE uint32_t sse_wxh_neon(const uint8_t *src, int src_stride, return horizontal_add_uint32x4(sse); } -static INLINE uint32_t sse_128xh_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - int height) { - uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; - - int i = height; - do { - sse_16x1_neon(src, ref, &sse[0]); - sse_16x1_neon(src + 16, ref + 16, &sse[1]); - sse_16x1_neon(src + 32, ref + 32, &sse[0]); - sse_16x1_neon(src + 48, ref + 48, &sse[1]); - sse_16x1_neon(src + 64, ref + 64, &sse[0]); - sse_16x1_neon(src + 80, ref + 80, &sse[1]); - sse_16x1_neon(src + 96, ref + 96, &sse[0]); - sse_16x1_neon(src + 112, ref + 112, &sse[1]); - - src += src_stride; - ref += ref_stride; - } while (--i != 0); - - return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); -} - static INLINE uint32_t sse_64xh_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int height) { @@ -205,7 +182,6 @@ int64_t vpx_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref, case 16: return sse_16xh_neon(src, src_stride, ref, ref_stride, height); case 32: return sse_32xh_neon(src, src_stride, ref, ref_stride, height); case 64: return sse_64xh_neon(src, src_stride, ref, ref_stride, height); - case 128: return sse_128xh_neon(src, src_stride, ref, ref_stride, height); default: return sse_wxh_neon(src, src_stride, ref, ref_stride, width, height); } diff --git a/vpx_dsp/arm/sse_neon_dotprod.c b/vpx_dsp/arm/sse_neon_dotprod.c index 0f11b7cbb2..8777773918 100644 --- a/vpx_dsp/arm/sse_neon_dotprod.c +++ b/vpx_dsp/arm/sse_neon_dotprod.c @@ -85,30 +85,6 @@ static INLINE uint32_t sse_wxh_neon_dotprod(const uint8_t *src, int src_stride, return horizontal_add_uint32x4(vcombine_u32(sse[0], sse[1])); } -static INLINE uint32_t sse_128xh_neon_dotprod(const uint8_t *src, - int src_stride, - const uint8_t *ref, - int ref_stride, int height) { - uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; - - int i = height; - do { - sse_16x1_neon_dotprod(src, ref, &sse[0]); - sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]); - sse_16x1_neon_dotprod(src + 32, ref + 32, &sse[0]); - sse_16x1_neon_dotprod(src + 48, ref + 48, &sse[1]); - sse_16x1_neon_dotprod(src + 64, ref + 64, &sse[0]); - sse_16x1_neon_dotprod(src + 80, ref + 80, &sse[1]); - sse_16x1_neon_dotprod(src + 96, ref + 96, &sse[0]); - sse_16x1_neon_dotprod(src + 112, ref + 112, &sse[1]); - - src += src_stride; - ref += ref_stride; - } while (--i != 0); - - return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); -} - static INLINE uint32_t sse_64xh_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int height) { @@ -214,8 +190,6 @@ int64_t vpx_sse_neon_dotprod(const uint8_t *src, int src_stride, return sse_32xh_neon_dotprod(src, src_stride, ref, ref_stride, height); case 64: return sse_64xh_neon_dotprod(src, src_stride, ref, ref_stride, height); - case 128: - return sse_128xh_neon_dotprod(src, src_stride, ref, ref_stride, height); default: return sse_wxh_neon_dotprod(src, src_stride, ref, ref_stride, width, height); 
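As a reference point between the NEON and x86 halves of this patch: every width-specialized kernel trimmed here computes the same quantity as the plain C fallback vpx_sse_c() (touched in patch 910 above). A condensed sketch of that scalar form:

#include <stdint.h>

/* Sum of squared differences over a width x height block. The
 * NEON/AVX2/SSE4.1 kernels in this patch are width-specialized versions
 * of this loop; with no block larger than 64x64 in VP9, the 128-wide
 * specializations were unreachable. */
static int64_t sse_ref(const uint8_t *a, int a_stride, const uint8_t *b,
                       int b_stride, int width, int height) {
  int64_t sse = 0;
  int y, x;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      const int diff = a[x] - b[x];
      sse += (int64_t)diff * diff;
    }
    a += a_stride;
    b += b_stride;
  }
  return sse;
}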
diff --git a/vpx_dsp/x86/sse_avx2.c b/vpx_dsp/x86/sse_avx2.c index 381c1a1e6c..dfe45b6115 100644 --- a/vpx_dsp/x86/sse_avx2.c +++ b/vpx_dsp/x86/sse_avx2.c @@ -169,18 +169,6 @@ int64_t vpx_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, } while (y < height); sse = summary_all_avx2(&sum); break; - case 128: - do { - sse_w32_avx2(&sum, a, b); - sse_w32_avx2(&sum, a + 32, b + 32); - sse_w32_avx2(&sum, a + 64, b + 64); - sse_w32_avx2(&sum, a + 96, b + 96); - a += a_stride; - b += b_stride; - y += 1; - } while (y < height); - sse = summary_all_avx2(&sum); - break; default: if ((width & 0x07) == 0) { do { @@ -334,28 +322,6 @@ int64_t vpx_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8, } while (y < height); sse = summary_4x64_avx2(sum); break; - case 128: - do { - int l = 0; - __m256i sum32 = _mm256_setzero_si256(); - do { - highbd_sse_w16_avx2(&sum32, a, b); - highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1); - highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2); - highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3); - highbd_sse_w16_avx2(&sum32, a + 16 * 4, b + 16 * 4); - highbd_sse_w16_avx2(&sum32, a + 16 * 5, b + 16 * 5); - highbd_sse_w16_avx2(&sum32, a + 16 * 6, b + 16 * 6); - highbd_sse_w16_avx2(&sum32, a + 16 * 7, b + 16 * 7); - a += a_stride; - b += b_stride; - l += 1; - } while (l < 16 && l < (height - y)); - summary_32_avx2(&sum32, &sum); - y += 16; - } while (y < height); - sse = summary_4x64_avx2(sum); - break; default: if (width & 0x7) { do { diff --git a/vpx_dsp/x86/sse_sse4.c b/vpx_dsp/x86/sse_sse4.c index 1c2744e2fa..4a7585c57e 100644 --- a/vpx_dsp/x86/sse_sse4.c +++ b/vpx_dsp/x86/sse_sse4.c @@ -128,22 +128,6 @@ int64_t vpx_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, } while (y < height); sse = summary_all_sse4(&sum); break; - case 128: - do { - sse_w16_sse4_1(&sum, a, b); - sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1); - sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); - sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); - sse_w16_sse4_1(&sum, a + 16 * 4, b + 16 * 4); - sse_w16_sse4_1(&sum, a + 16 * 5, b + 16 * 5); - sse_w16_sse4_1(&sum, a + 16 * 6, b + 16 * 6); - sse_w16_sse4_1(&sum, a + 16 * 7, b + 16 * 7); - a += a_stride; - b += b_stride; - y += 1; - } while (y < height); - sse = summary_all_sse4(&sum); - break; default: if (width & 0x07) { do { @@ -285,37 +269,6 @@ int64_t vpx_highbd_sse_sse4_1(const uint8_t *a8, int a_stride, _mm_storel_epi64((__m128i *)&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); break; - case 128: - do { - int l = 0; - __m128i sum32 = _mm_setzero_si128(); - do { - highbd_sse_w8_sse4_1(&sum32, a, b); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 8, b + 8 * 8); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 9, b + 8 * 9); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 10, b + 8 * 10); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 11, b + 8 * 11); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 12, b + 8 * 12); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 13, b + 8 * 13); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 14, b + 8 * 14); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 15, b + 8 * 15); - a += a_stride; - b += b_stride; - l += 1; - } while (l < 8 && l < (height - y)); - 
summary_32_sse4(&sum32, &sum); - y += 8; - } while (y < height); - _mm_storel_epi64((__m128i *)&sse, - _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); - break; default: if (width & 0x7) { do { From f9b7c857683cb1d033b9d6a13e92843e6d8740a3 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 4 Dec 2023 13:25:01 -0800 Subject: [PATCH 913/926] README: update target list Change-Id: I001179ce34b2bf2350dce5f0197b6be175ab1c37 --- README | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README b/README index 5ccc9c3fce..4c25b15d81 100644 --- a/README +++ b/README @@ -66,6 +66,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: arm64-darwin20-gcc arm64-darwin21-gcc arm64-darwin22-gcc + arm64-darwin23-gcc arm64-linux-gcc arm64-win64-gcc arm64-win64-vs15 @@ -81,6 +82,8 @@ COMPILING THE APPLICATIONS/LIBRARIES: armv7-win32-gcc armv7-win32-vs14 armv7-win32-vs15 + armv7-win32-vs16 + armv7-win32-vs17 armv7s-darwin-gcc armv8-linux-gcc loongarch32-linux-gcc @@ -127,6 +130,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: x86_64-darwin20-gcc x86_64-darwin21-gcc x86_64-darwin22-gcc + x86_64-darwin23-gcc x86_64-iphonesimulator-gcc x86_64-linux-gcc x86_64-linux-icc From 476d02a2d206f959613afc0832da8656e26c8602 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Wed, 6 Dec 2023 10:03:51 -0800 Subject: [PATCH 914/926] Fix two clang-tidy misc-include-cleaner warnings no header providing "CONFIG_VP9_HIGHBITDEPTH" is directly included no header providing "VPX_BITS_8" is directly included Change-Id: Ie6d78c79ab462501417f2b451bbe808a1fdce931 --- vp9/encoder/vp9_frame_scale.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vp9/encoder/vp9_frame_scale.c b/vp9/encoder/vp9_frame_scale.c index 22b3f05579..e6c375413d 100644 --- a/vp9/encoder/vp9_frame_scale.c +++ b/vp9/encoder/vp9_frame_scale.c @@ -9,10 +9,12 @@ */ #include "./vp9_rtcd.h" +#include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" #include "vp9/common/vp9_blockd.h" #include "vp9/encoder/vp9_encoder.h" +#include "vpx/vpx_codec.h" #include "vpx_dsp/vpx_filter.h" #include "vpx_scale/yv12config.h" From 2f258fdee1b2dc276d973cde6bd2f81c63f13155 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 6 Dec 2023 10:54:21 -0800 Subject: [PATCH 915/926] vp9_frame_scale.c,cosmetics: funnction -> function Change-Id: I8ecbd52037ff096f5c84c834b193b0a34c55a8b7 --- vp9/encoder/vp9_frame_scale.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_frame_scale.c b/vp9/encoder/vp9_frame_scale.c index e6c375413d..c74d523246 100644 --- a/vp9/encoder/vp9_frame_scale.c +++ b/vp9/encoder/vp9_frame_scale.c @@ -102,7 +102,7 @@ void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, const int x_step_q4 = 16 * src_w / dst_w; const int y_step_q4 = 16 * src_h / dst_h; if (x_step_q4 > 64 || y_step_q4 > 64) { - // This funnction is only called while cm->bit_depth is VPX_BITS_8. + // This function is only called while cm->bit_depth is VPX_BITS_8. 
#if CONFIG_VP9_HIGHBITDEPTH vp9_scale_and_extend_frame_nonnormative(src, dst, (int)VPX_BITS_8); #else From 585798f756d60bef3761d76700f3a14e8d5d46d9 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 6 Dec 2023 10:35:18 -0500 Subject: [PATCH 916/926] Set pred buffer stride correctly Bug: b/312875957 Change-Id: I2eb5ab86d5fe30079b3ed1cbdb8b45bb2dc72a1d --- test/encode_api_test.cc | 13 +++++++++++++ vp9/common/vp9_reconinter.c | 13 +++++++------ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 99f38c3af9..596edb229c 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -850,6 +850,19 @@ TEST(EncodeAPI, Buganizer314857577) { encoder.Encode(false); } +TEST(EncodeAPI, Buganizer312875957PredBufferStride) { + VP9Encoder encoder(-1); + + encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_REALTIME); + encoder.Encode(true); + encoder.Encode(false); + encoder.Configure(0, 456, 486, VPX_VBR, VPX_DL_REALTIME); + encoder.Encode(true); + encoder.Configure(0, 1678, 620, VPX_CBR, 1000000); + encoder.Encode(false); + encoder.Encode(false); +} + class EncodeApiGetTplStatsTest : public ::libvpx_test::EncoderTest, public ::testing::TestWithParam { diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c index ff59ff5042..4878dc15ee 100644 --- a/vp9/common/vp9_reconinter.c +++ b/vp9/common/vp9_reconinter.c @@ -158,18 +158,19 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, // Co-ordinate of containing block to pixel precision. const int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)); const int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)); + const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf; + uint8_t *buf_array[] = { ref_buf->y_buffer, ref_buf->u_buffer, + ref_buf->v_buffer }; + const int stride_array[] = { ref_buf->y_stride, ref_buf->uv_stride, + ref_buf->uv_stride }; #if 0 // CONFIG_BETTER_HW_COMPATIBILITY assert(xd->mi[0]->sb_type != BLOCK_4X8 && xd->mi[0]->sb_type != BLOCK_8X4); assert(mv_q4.row == mv.row * (1 << (1 - pd->subsampling_y)) && mv_q4.col == mv.col * (1 << (1 - pd->subsampling_x))); #endif - if (plane == 0) - pre_buf->buf = xd->block_refs[ref]->buf->y_buffer; - else if (plane == 1) - pre_buf->buf = xd->block_refs[ref]->buf->u_buffer; - else - pre_buf->buf = xd->block_refs[ref]->buf->v_buffer; + pre_buf->buf = buf_array[plane]; + pre_buf->stride = stride_array[plane]; pre_buf->buf += scaled_buffer_offset(x_start + x, y_start + y, pre_buf->stride, sf); From 50ed636e49db2b8fa2436413480f11ab1f2a2d1a Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Wed, 6 Dec 2023 14:24:07 -0800 Subject: [PATCH 917/926] Fix a bug in simple motion search This change fixed a bug revealed by b/311294795. In simple motion search, the reference buffer pointer needs to be restored after the search. Otherwise, it causes problems while the reference frame scaling happens. This CL fixes the bug. 
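The fix (diff below) is the save/mutate/restore pattern: snapshot the per-plane prediction buffer pointers that the search overwrites with the scaled reference, run the search, then put the originals back so later code sees the unscaled buffers. A generic sketch, with a hypothetical run_search() standing in for the actual call sequence:

/* Illustrative only; the concrete version is in the diff below. */
struct buf_2d backup_pre[MAX_MB_PLANE];
int i;
for (i = 0; i < MAX_MB_PLANE; i++) backup_pre[i] = xd->plane[i].pre[0];
run_search(cpi, x); /* hypothetical; mutates xd->plane[i].pre[0] */
for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_pre[i];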
Bug: b/311294795
Change-Id: I093722d5888de3cc6a6542de82a6ec9d601f897d
---
 test/encode_api_test.cc       | 31 +++++++++++++++++++++++++++++++
 vp9/encoder/vp9_encodeframe.c | 15 +++++++++++++--
 2 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc
index 596edb229c..5c421e3d70 100644
--- a/test/encode_api_test.cc
+++ b/test/encode_api_test.cc
@@ -863,6 +863,37 @@ TEST(EncodeAPI, Buganizer312875957PredBufferStride) {
   encoder.Encode(false);
 }
 
+// This is a test case from clusterfuzz: based on b/311294795
+// Encode a few frames with multiple change config calls
+// with different frame sizes.
+TEST(EncodeAPI, Buganizer311294795) {
+  VP9Encoder encoder(1);
+
+  // Set initial config.
+  encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_REALTIME);
+
+  // Encode first frame.
+  encoder.Encode(false);
+
+  // Change config.
+  encoder.Configure(16, 632, 620, VPX_VBR, VPX_DL_GOOD_QUALITY);
+
+  // Encode 2nd frame with new config
+  encoder.Encode(true);
+
+  // Change config.
+  encoder.Configure(16, 1678, 342, VPX_VBR, VPX_DL_GOOD_QUALITY);
+
+  // Encode 3rd frame with new config.
+  encoder.Encode(false);
+
+  // Change config.
+  encoder.Configure(0, 1574, 618, VPX_VBR, VPX_DL_REALTIME);
+  // Encode more frames with new config.
+  encoder.Encode(false);
+  encoder.Encode(false);
+}
+
 class EncodeApiGetTplStatsTest
     : public ::libvpx_test::EncoderTest,
       public ::testing::TestWithParam {
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 7ab183e28a..46291f4868 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -3444,11 +3444,17 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x,
   MV ref_mv_full = { ref_mv.row >> 3, ref_mv.col >> 3 };
   MV best_mv = { 0, 0 };
   int cost_list[5];
+  struct buf_2d backup_pre[MAX_MB_PLANE] = { { 0, 0 } };
 
-  if (scaled_ref_frame)
+  if (scaled_ref_frame) {
     yv12 = scaled_ref_frame;
-  else
+    // As reported in b/311294795, the reference buffer pointer needs to be
+    // saved and restored after the search. Otherwise, it causes problems while
+    // the reference frame scaling happens.
+    for (int i = 0; i < MAX_MB_PLANE; i++) backup_pre[i] = xd->plane[i].pre[0];
+  } else {
     yv12 = get_ref_frame_buffer(cpi, ref);
+  }
 
   assert(yv12 != NULL);
   if (!yv12) return;
@@ -3465,6 +3471,11 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x,
   x->mv_limits = tmp_mv_limits;
   mi->mv[0].as_mv = best_mv;
 
+  // Restore reference buffer pointer.
+  if (scaled_ref_frame) {
+    for (int i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_pre[i];
+  }
+
   set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
   xd->plane[0].dst.buf = pred_buf;
   xd->plane[0].dst.stride = 64;

From 1ed56a46b3f6b18e1fb89a091e60d80ae20eec01 Mon Sep 17 00:00:00 2001
From: Cheng Chen
Date: Mon, 4 Dec 2023 15:34:50 -0800
Subject: [PATCH 918/926] Update frame size in actual encoding

Issue explanation:
The unit test calls the set_config function twice after encoding the
first frame. The first call reduces the frame width, but it is still
within half of the first frame's width. The second call reduces the
frame width even more, making it less than half of the first frame's
width, which means that, according to the encoder logic, there are no
valid reference frames and this frame should be set as a forced key
frame. This leads to a null pointer access in scale_factors later.
Solution: To make sure the correct detection of a forced key frame, we need to update the frame width and height only when the actual encoding is performed. Bug: b/311985118 Change-Id: Ie2cd3b760d4a4b399845693d7421c4eb11a12775 --- test/encode_api_test.cc | 19 +++++++++++++++++++ vp9/encoder/vp9_encoder.c | 7 +++++++ vp9/encoder/vp9_encoder.h | 3 +++ vp9/vp9_cx_iface.c | 19 +++++++++++++++++-- 4 files changed, 46 insertions(+), 2 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 5c421e3d70..eb8c456de7 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -801,6 +801,25 @@ TEST(EncodeAPI, Buganizer311394513) { encoder.Encode(true); } +TEST(EncodeAPI, Buganizer311985118) { + VP9Encoder encoder(0); + + // Set initial config, in particular set deadline to GOOD mode. + encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 1st frame. + encoder.Encode(false); + + // Change config: change threads and width. + encoder.Configure(0, 1574, 620, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Change config: change threads, width and height. + encoder.Configure(16, 837, 432, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 2nd frame. + encoder.Encode(false); +} + // This is a test case from clusterfuzz: based on b/314857577. // Encode a few frames with multiple change config calls // with different frame sizes. diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 20e35077cd..152d42bc9a 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -3888,6 +3888,7 @@ static void set_frame_size(VP9_COMP *cpi) { alloc_util_frame_buffers(cpi); init_motion_estimation(cpi); + int has_valid_ref_frame = 0; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1]; const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); @@ -3906,11 +3907,17 @@ static void set_frame_size(VP9_COMP *cpi) { buf->y_crop_height, cm->width, cm->height); #endif // CONFIG_VP9_HIGHBITDEPTH + has_valid_ref_frame |= vp9_is_valid_scale(&ref_buf->sf); if (vp9_is_scaled(&ref_buf->sf)) vpx_extend_frame_borders(buf); } else { ref_buf->buf = NULL; } } + if (!frame_is_intra_only(cm) && !has_valid_ref_frame) { + vpx_internal_error( + &cm->error, VPX_CODEC_CORRUPT_FRAME, + "Can't find at least one reference frame with valid size"); + } set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME); } diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 83b7081e7b..7136f7faa3 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -921,6 +921,9 @@ typedef struct VP9_COMP { // number of MBs in the current frame when the frame is // scaled. + int last_coded_width; + int last_coded_height; + int use_svc; SVC svc; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 568a21c2a1..e611a6e863 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -797,10 +797,22 @@ static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) { if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS) ERROR("Cannot change width or height after initialization"); - if (!valid_ref_frame_size(ctx->cfg.g_w, ctx->cfg.g_h, cfg->g_w, cfg->g_h) || + // Note: function encoder_set_config() is allowed to be called multiple + // times. However, when the original frame width or height is less than two + // times of the new frame width or height, a forced key frame should be + // used. 
To make sure the correct detection of a forced key frame, we need
+  // to update the frame width and height only when the actual encoding is
+  // performed. cpi->last_coded_width and cpi->last_coded_height are used to
+  // track the actual coded frame size.
+  if ((ctx->cpi->last_coded_width && ctx->cpi->last_coded_height &&
+       !valid_ref_frame_size(ctx->cpi->last_coded_width,
+                             ctx->cpi->last_coded_height, cfg->g_w,
+                             cfg->g_h)) ||
     (ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) ||
-        (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height))
+        (ctx->cpi->initial_height &&
+         (int)cfg->g_h > ctx->cpi->initial_height)) {
       force_key = 1;
+    }
   }
 
   // Prevent increasing lag_in_frames. This check is stricter than it needs
@@ -1310,6 +1322,9 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
 
   if (cpi == NULL) return VPX_CODEC_INVALID_PARAM;
 
+  cpi->last_coded_width = ctx->oxcf.width;
+  cpi->last_coded_height = ctx->oxcf.height;
+
   if (img != NULL) {
     res = validate_img(ctx, img);
     if (res == VPX_CODEC_OK) {

From fa60c7d9c16f0e7ce1daa2030e9920e1a64525d8 Mon Sep 17 00:00:00 2001
From: Jerome Jiang
Date: Thu, 7 Dec 2023 12:20:59 -0500
Subject: [PATCH 919/926] IWYU: include yv12config.h for YV12_BUFFER_CONFIG

Fix clang-tidy warning

Change-Id: Ic4d6739cb933a37168176f6b481afdfd2562acfc
---
 vp9/common/vp9_reconinter.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 4878dc15ee..0a60b853d8 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -13,12 +13,13 @@
 #include "./vpx_scale_rtcd.h"
 #include "./vpx_config.h"
 
-#include "vpx/vpx_integer.h"
-
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
 
+#include "vpx/vpx_integer.h"
+#include "vpx_scale/yv12config.h"
+
 #if CONFIG_VP9_HIGHBITDEPTH
 void vp9_highbd_build_inter_predictor(
     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,

From 3a88c0c2046870e73f51bbd75d590e735da1f661 Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang
Date: Fri, 8 Dec 2023 14:39:18 -0800
Subject: [PATCH 920/926] Avoid dangling pointers in vp9_encode_free_mt_data

Set cpi->tile_thr_data and cpi->workers to NULL after freeing them.
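This is the free-and-null idiom: clearing the pointer immediately after freeing makes a later double free harmless (vpx_free(NULL), like free(NULL), is a no-op) and turns a use-after-free into an immediate NULL dereference instead of a read through stale memory. A hypothetical helper macro, not part of libvpx, that captures the idiom:

#define VPX_FREE_AND_NULL(p) \
  do {                       \
    vpx_free(p);             \
    (p) = NULL;              \
  } while (0)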
Change-Id: I46fec5f08a6dd034c8d76828f4d546630442f216 --- vp9/encoder/vp9_ethread.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c index 681996d334..a8d1cb7a7a 100644 --- a/vp9/encoder/vp9_ethread.c +++ b/vp9/encoder/vp9_ethread.c @@ -187,7 +187,9 @@ void vp9_encode_free_mt_data(struct VP9_COMP *cpi) { } } vpx_free(cpi->tile_thr_data); + cpi->tile_thr_data = NULL; vpx_free(cpi->workers); + cpi->workers = NULL; cpi->num_workers = 0; } From 7e735cdf4328361169596f4ddab7bce32930c87f Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Fri, 8 Dec 2023 14:49:47 -0800 Subject: [PATCH 921/926] IWYU: include vp9_scale.h and vpx_codec.h Fix the following clang-tidy misc-include-cleaner warnings: vp9/encoder/vp9_encoder.c: no header providing "vp9_is_valid_scale" is directly included no header providing "VPX_CODEC_CORRUPT_FRAME" is directly included vp9/vp9_cx_iface.c: no header providing "valid_ref_frame_size" is directly included Change-Id: I20e846f5b14c42c72aaefec0718b4ae9c7eea44a --- vp9/encoder/vp9_encoder.c | 5 +++-- vp9/vp9_cx_iface.c | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 152d42bc9a..fd213f1e6b 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -18,6 +18,8 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_ext_ratectrl.h" #include "vpx_dsp/psnr.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/vpx_filter.h" @@ -42,6 +44,7 @@ #endif #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_scale.h" #include "vp9/common/vp9_tile_common.h" #if !CONFIG_REALTIME_ONLY @@ -83,8 +86,6 @@ #include "vp9/encoder/vp9_tpl_model.h" #include "vp9/vp9_cx_iface.h" -#include "vpx/vpx_ext_ratectrl.h" - #define AM_SEGMENT_ID_INACTIVE 7 #define AM_SEGMENT_ID_ACTIVE 0 diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index e611a6e863..5fa2d7c196 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -23,6 +23,7 @@ #include "vp9/encoder/vp9_encoder.h" #include "vpx/vp8cx.h" #include "vp9/common/vp9_alloccommon.h" +#include "vp9/common/vp9_scale.h" #include "vp9/vp9_cx_iface.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/encoder/vp9_lookahead.h" From a75859c439cb4d4af92e4e894f7e2c43758d6cd5 Mon Sep 17 00:00:00 2001 From: Hari Limaye Date: Mon, 27 Nov 2023 17:55:25 +0000 Subject: [PATCH 922/926] Remove redundant comment in convolve8_4_usdot The function convolve8_4_usdot contains a comment relating to the SDOT implementation of convolve8, which requires addition of a correction constant to account for range clamp of the input values. This is not performed in the i8mm USDOT implementation - so remove the comment. Also add some const qualifiers to function arguments. 
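Background on the removed comment: the SDOT (signed x signed) path must rebase unsigned 8-bit samples into signed range before multiplying, and the filter's response to that constant shift is pre-added back as a "correction"; the USDOT (unsigned x signed) path multiplies the samples directly, so there is no correction for the comment to describe. A scalar sketch of the arithmetic the SDOT path relies on (assumes const uint8_t src[8] and const int8_t filter[8]; the real code vectorizes this):

/* SDOT needs signed inputs, so each u8 sample s becomes (s - 128):
 *   sum((s - 128) * f[k]) = sum(s * f[k]) - 128 * sum(f[k])
 * Seeding the accumulator with 128 * sum(f[k]) recovers the true
 * convolution. */
int32_t correction = 0, sum;
int k;
for (k = 0; k < 8; ++k) correction += 128 * filter[k];
sum = correction;
for (k = 0; k < 8; ++k) sum += (src[k] - 128) * filter[k];
/* sum now equals the plain dot product of src[] and filter[]. */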
Change-Id: I10aff560d20403897f708ee293bf873be9c35761 --- vpx_dsp/arm/vpx_convolve8_neon.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index f01d4f6a42..4ecaee0f99 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -116,7 +116,7 @@ static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, return vqmovn_s32(sum); } -static INLINE int16x4_t convolve8_4_sdot(uint8x16_t samples, +static INLINE int16x4_t convolve8_4_sdot(const uint8x16_t samples, const int8x8_t filters, const int32x4_t correction, const uint8x16_t range_limit, @@ -164,7 +164,7 @@ static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo, return vqrshrun_n_s16(sum, FILTER_BITS); } -static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples, +static INLINE uint8x8_t convolve8_8_sdot(const uint8x16_t samples, const int8x8_t filters, const int32x4_t correction, const uint8x16_t range_limit, @@ -281,7 +281,7 @@ static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, return vqmovn_s32(sum); } -static INLINE int16x4_t convolve8_4_usdot(uint8x16_t samples, +static INLINE int16x4_t convolve8_4_usdot(const uint8x16_t samples, const int8x8_t filters, const uint8x16x2_t permute_tbl) { uint8x16_t permuted_samples[2]; @@ -293,7 +293,6 @@ static INLINE int16x4_t convolve8_4_usdot(uint8x16_t samples, /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); - /* Accumulate dot product into 'correction' to account for range clamp. */ sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1); @@ -322,7 +321,7 @@ static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo, return vqrshrun_n_s16(sum, FILTER_BITS); } -static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples, +static INLINE uint8x8_t convolve8_8_usdot(const uint8x16_t samples, const int8x8_t filters, const uint8x16x3_t permute_tbl) { uint8x16_t permuted_samples[3]; From 193b1511956f1732a8d54041a26ca9633a92abf9 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Mon, 11 Dec 2023 21:02:36 -0800 Subject: [PATCH 923/926] Fix to integer overflow in vp8 encodeframe.c Unit test added. Bug:webm:1831 Change-Id: Ib85f4f0fbdbebc0b49555f206a36376cea687df6 --- test/encode_api_test.cc | 32 ++++++++++++++++++++++++++++++++ vp8/encoder/encodeframe.c | 12 ++++++++++-- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index eb8c456de7..928f1ede17 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -14,6 +14,7 @@ #include #include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/acm_random.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" @@ -181,6 +182,37 @@ TEST(EncodeAPI, HugeFramerateVp8) { vpx_img_free(image); ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); } + +// A test that reproduces https://crbug.com/webm/1831. 
+TEST(EncodeAPI, RandomPixelsVp8) { + // Initialize libvpx encoder + vpx_codec_iface_t *const iface = vpx_codec_vp8_cx(); + vpx_codec_enc_cfg_t cfg; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + + cfg.rc_target_bitrate = 2000; + cfg.g_w = 1280; + cfg.g_h = 720; + + vpx_codec_ctx_t enc; + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + // Generate random frame data and encode + uint8_t img[1280 * 720 * 3 / 2]; + libvpx_test::ACMRandom rng; + for (size_t i = 0; i < sizeof(img); ++i) { + img[i] = rng.Rand8(); + } + vpx_image_t img_wrapper; + ASSERT_EQ( + vpx_img_wrap(&img_wrapper, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h, 1, img), + &img_wrapper); + ASSERT_EQ(vpx_codec_encode(&enc, &img_wrapper, 0, 1, 0, VPX_DL_BEST_QUALITY), + VPX_CODEC_OK); + + // Destroy libvpx encoder + vpx_codec_destroy(&enc); +} #endif // Set up 2 spatial streams with 2 temporal layers per stream, and generate diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index 5c973940ec..82c48b13a7 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -447,13 +447,21 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row, x->active_ptr = cpi->active_map + map_index + mb_col; if (cm->frame_type == KEY_FRAME) { - *totalrate += vp8cx_encode_intra_macroblock(cpi, x, tp); + const int intra_rate_cost = vp8cx_encode_intra_macroblock(cpi, x, tp); + if (INT_MAX - *totalrate > intra_rate_cost) + *totalrate += intra_rate_cost; + else + *totalrate = INT_MAX; #ifdef MODE_STATS y_modes[xd->mbmi.mode]++; #endif } else { - *totalrate += vp8cx_encode_inter_macroblock( + const int inter_rate_cost = vp8cx_encode_inter_macroblock( cpi, x, tp, recon_yoffset, recon_uvoffset, mb_row, mb_col); + if (INT_MAX - *totalrate > inter_rate_cost) + *totalrate += inter_rate_cost; + else + *totalrate = INT_MAX; #ifdef MODE_STATS inter_y_modes[xd->mbmi.mode]++; From 4fe07a0c411d8ed0722c979f13248fd63ff69a73 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Wed, 13 Dec 2023 10:56:28 -0800 Subject: [PATCH 924/926] Return correct error after longjmp in vp8e_encode After a longjmp() call in vp8e_encode(), call update_error_state() so that we return the error code and error detail set by the vpx_internal_error() call. Change-Id: I1f2428eb1b1f61e46c02604e16a5d44dcf162479 --- vp8/vp8_cx_iface.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index c42a837ebe..e7da15d16b 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -930,8 +930,9 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, if (setjmp(ctx->cpi->common.error.jmp)) { ctx->cpi->common.error.setjmp = 0; + res = update_error_state(ctx, &ctx->cpi->common.error); vpx_clear_system_state(); - return VPX_CODEC_CORRUPT_FRAME; + return res; } ctx->cpi->common.error.setjmp = 1; From df655cf4fb6c2a23b964544acd015cc715752830 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Tue, 12 Dec 2023 16:32:29 -0800 Subject: [PATCH 925/926] Clarify the comment for update_error_state() Explain why the encoder init functions cannot call update_error_state(). In vp8/vp8_cx_iface.c, this comment should have been added in https://chromium-review.googlesource.com/c/webm/libvpx/+/4506609. Rewrite update_error_state() in vp8/vp8_cx_iface.c to look like the versions in vp9/vp9_cx_iface.c and av1/av1_cx_iface.c (in libaom). 
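A sketch of the round trip these two error-handling patches complete: code deep in the encoder raises an error with vpx_internal_error(), which records the code and detail string in vpx_internal_error_info and then longjmp()s back to the setjmp() installed in vp8e_encode(); after patch 924 that handler forwards the recorded code via update_error_state() instead of always returning VPX_CODEC_CORRUPT_FRAME. A hypothetical raising site for illustration (the function and buffer are invented; the vpx_internal_error() call is the real mechanism):

#include "vpx/internal/vpx_codec_internal.h"
#include "vpx_mem/vpx_mem.h"

static void allocate_example(struct vpx_internal_error_info *error) {
  void *const buf = vpx_memalign(32, 4096);
  if (buf == NULL) {
    /* Records VPX_CODEC_MEM_ERROR plus the detail text, then longjmp()s
     * to the handler; this call does not return. */
    vpx_internal_error(error, VPX_CODEC_MEM_ERROR,
                       "Failed to allocate example buffer");
  }
  vpx_free(buf);
}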
Change-Id: I3f153d67b8c549ca5ac8ea0cfbcaad4ae705c8e6
---
 vp8/vp8_cx_iface.c | 9 ++++++---
 vp9/vp9_cx_iface.c | 4 +++-
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index e7da15d16b..a10e08975c 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -95,13 +95,16 @@ struct vpx_codec_alg_priv {
   vpx_enc_frame_flags_t control_frame_flags;
 };
 
+// Called by vp8e_set_config() and vp8e_encode() only. Must not be called
+// by vp8e_init() because the `error` parameter (cpi->common.error) will be
+// destroyed by vpx_codec_enc_init_ver() after vp8e_init() returns an error.
+// See the "IMPORTANT" comment in vpx_codec_enc_init_ver().
 static vpx_codec_err_t update_error_state(
     vpx_codec_alg_priv_t *ctx, const struct vpx_internal_error_info *error) {
-  vpx_codec_err_t res;
+  const vpx_codec_err_t res = error->error_code;
 
-  if ((res = error->error_code)) {
+  if (res != VPX_CODEC_OK)
     ctx->base.err_detail = error->has_detail ? error->detail : NULL;
-  }
 
   return res;
 }

diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 5fa2d7c196..4899b1ed12 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -133,7 +133,9 @@ struct vpx_codec_alg_priv {
 };
 
 // Called by encoder_set_config() and encoder_encode() only. Must not be called
-// by encoder_init().
+// by encoder_init() because the `error` parameter (cpi->common.error) will be
+// destroyed by vpx_codec_enc_init_ver() after encoder_init() returns an error.
+// See the "IMPORTANT" comment in vpx_codec_enc_init_ver().
 static vpx_codec_err_t update_error_state(
     vpx_codec_alg_priv_t *ctx, const struct vpx_internal_error_info *error) {
   const vpx_codec_err_t res = error->error_code;

From 41ced868a69625372c95ff0b2bd5f90987516c3b Mon Sep 17 00:00:00 2001
From: Jerome Jiang
Date: Fri, 15 Dec 2023 15:49:01 -0500
Subject: [PATCH 926/926] Remove VP9E_GET_TPL_STATS

This is never used. A callback in the external rate control interface
was added and is used instead.

Change-Id: Iade6f361072f0c28af98904baf457d2f0e9ca904
---
 test/encode_api_test.cc | 132 ----------------------------------------
 vp9/vp9_cx_iface.c      |  19 ------
 vpx/vp8cx.h             |  14 -----
 3 files changed, 165 deletions(-)

diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc
index 928f1ede17..508083673a 100644
--- a/test/encode_api_test.cc
+++ b/test/encode_api_test.cc
@@ -944,138 +944,6 @@ TEST(EncodeAPI, Buganizer311294795) {
   encoder.Encode(false);
   encoder.Encode(false);
 }
-
-class EncodeApiGetTplStatsTest
-    : public ::libvpx_test::EncoderTest,
-      public ::testing::TestWithParam {
- public:
-  EncodeApiGetTplStatsTest() : EncoderTest(GetParam()), test_io_(false) {}
-  ~EncodeApiGetTplStatsTest() override = default;
-
- protected:
-  void SetUp() override {
-    InitializeConfig();
-    SetMode(::libvpx_test::kTwoPassGood);
-  }
-
-  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                          ::libvpx_test::Encoder *encoder) override {
-    if (video->frame() == 0) {
-      encoder->Control(VP9E_SET_TPL, 1);
-    }
-  }
-
-  vpx_codec_err_t AllocateTplList(VpxTplGopStats *data) {
-    // Allocate MAX_ARF_GOP_SIZE (50) * sizeof(VpxTplFrameStats) that will be
-    // filled by VP9E_GET_TPL_STATS.
-    // MAX_ARF_GOP_SIZE is used here because the test doesn't know the size of
-    // each GOP before getting TPL stats from the encoder.
- data->size = 50; - data->frame_stats_list = - static_cast(calloc(50, sizeof(VpxTplFrameStats))); - if (data->frame_stats_list == nullptr) return VPX_CODEC_MEM_ERROR; - return VPX_CODEC_OK; - } - - void CompareTplGopStats(const VpxTplGopStats &ref_gop_stats, - const VpxTplGopStats &test_gop_stats) { - ASSERT_EQ(ref_gop_stats.size, test_gop_stats.size); - for (int frame = 0; frame < ref_gop_stats.size; frame++) { - const VpxTplFrameStats &ref_frame_stats = - ref_gop_stats.frame_stats_list[frame]; - const VpxTplFrameStats &test_frame_stats = - test_gop_stats.frame_stats_list[frame]; - ASSERT_EQ(ref_frame_stats.num_blocks, test_frame_stats.num_blocks); - ASSERT_EQ(ref_frame_stats.frame_width, test_frame_stats.frame_width); - ASSERT_EQ(ref_frame_stats.frame_height, test_frame_stats.frame_height); - for (int block = 0; block < ref_frame_stats.num_blocks; block++) { - const VpxTplBlockStats &ref_block_stats = - ref_frame_stats.block_stats_list[block]; - const VpxTplBlockStats &test_block_stats = - test_frame_stats.block_stats_list[block]; - ASSERT_EQ(ref_block_stats.inter_cost, test_block_stats.inter_cost); - ASSERT_EQ(ref_block_stats.intra_cost, test_block_stats.intra_cost); - ASSERT_EQ(ref_block_stats.mv_c, test_block_stats.mv_c); - ASSERT_EQ(ref_block_stats.mv_r, test_block_stats.mv_r); - ASSERT_EQ(ref_block_stats.recrf_dist, test_block_stats.recrf_dist); - ASSERT_EQ(ref_block_stats.recrf_rate, test_block_stats.recrf_rate); - ASSERT_EQ(ref_block_stats.ref_frame_index, - test_block_stats.ref_frame_index); - } - } - } - - void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { - ::libvpx_test::CxDataIterator iter = encoder->GetCxData(); - while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { - switch (pkt->kind) { - case VPX_CODEC_CX_FRAME_PKT: { - VpxTplGopStats tpl_stats; - EXPECT_EQ(AllocateTplList(&tpl_stats), VPX_CODEC_OK); - encoder->Control(VP9E_GET_TPL_STATS, &tpl_stats); - bool stats_not_all_zero = false; - for (int i = 0; i < tpl_stats.size; i++) { - VpxTplFrameStats *frame_stats_list = tpl_stats.frame_stats_list; - if (frame_stats_list[i].frame_width != 0) { - ASSERT_EQ(frame_stats_list[i].frame_width, width_); - ASSERT_EQ(frame_stats_list[i].frame_height, height_); - ASSERT_GT(frame_stats_list[i].num_blocks, 0); - ASSERT_NE(frame_stats_list[i].block_stats_list, nullptr); - stats_not_all_zero = true; - } - } - ASSERT_TRUE(stats_not_all_zero); - if (test_io_ && tpl_stats.size > 0) { - libvpx_test::TempOutFile *temp_out_file = - new (std::nothrow) libvpx_test::TempOutFile("w+"); - ASSERT_NE(temp_out_file, nullptr); - ASSERT_NE(temp_out_file->file(), nullptr); - vpx_write_tpl_gop_stats(temp_out_file->file(), &tpl_stats); - rewind(temp_out_file->file()); - VpxTplGopStats gop_stats_io; - ASSERT_EQ( - vpx_read_tpl_gop_stats(temp_out_file->file(), &gop_stats_io), - VPX_CODEC_OK); - CompareTplGopStats(gop_stats_io, tpl_stats); - vpx_free_tpl_gop_stats(&gop_stats_io); - delete temp_out_file; - } - free(tpl_stats.frame_stats_list); - break; - } - default: break; - } - } - } - - int width_; - int height_; - bool test_io_; -}; - -TEST_P(EncodeApiGetTplStatsTest, GetTplStats) { - cfg_.g_lag_in_frames = 25; - width_ = 352; - height_ = 288; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", width_, - height_, 30, 1, 0, 50); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); -} - -TEST_P(EncodeApiGetTplStatsTest, GetTplStatsIO) { - cfg_.g_lag_in_frames = 25; - width_ = 352; - height_ = 288; - test_io_ = true; - ::libvpx_test::I420VideoSource 
video("hantro_collage_w352h288.yuv", width_, - height_, 30, 1, 0, 50); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); -} - -INSTANTIATE_TEST_SUITE_P( - VP9, EncodeApiGetTplStatsTest, - ::testing::Values( - static_cast(&libvpx_test::kVP9))); #endif // CONFIG_VP9_ENCODER } // namespace diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 4899b1ed12..2e2c8176d4 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1812,24 +1812,6 @@ static vpx_codec_err_t ctrl_get_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } -static vpx_codec_err_t ctrl_get_tpl_stats(vpx_codec_alg_priv_t *ctx, - va_list args) { - VP9_COMP *const cpi = ctx->cpi; - VpxTplGopStats *data = va_arg(args, VpxTplGopStats *); - VpxTplFrameStats *frame_stats_list = cpi->tpl_gop_stats.frame_stats_list; - int i; - if (data == NULL) { - return VPX_CODEC_INVALID_PARAM; - } - data->size = cpi->tpl_gop_stats.size; - - for (i = 0; i < data->size; i++) { - data->frame_stats_list[i] = frame_stats_list[i]; - } - - return VPX_CODEC_OK; -} - static vpx_codec_err_t ctrl_set_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, va_list args) { VP9_COMP *const cpi = ctx->cpi; @@ -2089,7 +2071,6 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP9E_GET_ACTIVEMAP, ctrl_get_active_map }, { VP9E_GET_LEVEL, ctrl_get_level }, { VP9E_GET_SVC_REF_FRAME_CONFIG, ctrl_get_svc_ref_frame_config }, - { VP9E_GET_TPL_STATS, ctrl_get_tpl_stats }, { -1, NULL }, }; diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index d098c4c985..2875e185e6 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -768,18 +768,6 @@ enum vp8e_enc_control_id { * */ VP9E_SET_QUANTIZER_ONE_PASS, - - /*!\brief Codec control to get TPL stats for the current GOP. - * - * Allocation and free of memory of size MAX_ARF_GOP_SIZE (50) * - * sizeof(VpxTplFrameStats) should be done by applications. - * - * VPX_CODEC_INVALID_PARAM will be returned if the pointer passed in is NULL. - * - * Supported in codecs: VP9 - * - */ - VP9E_GET_TPL_STATS, }; /*!\brief vpx 1-D scaling mode @@ -1110,8 +1098,6 @@ VPX_CTRL_USE_TYPE(VP8E_SET_RTC_EXTERNAL_RATECTRL, int) #define VPX_CTRL_VP8E_SET_RTC_EXTERNAL_RATECTRL VPX_CTRL_USE_TYPE(VP9E_SET_QUANTIZER_ONE_PASS, int) #define VPX_CTRL_VP9E_SET_QUANTIZER_ONE_PASS -VPX_CTRL_USE_TYPE(VP9E_GET_TPL_STATS, void *) -#define VPX_CTRL_VP9E_GET_TPL_STATS /*!\endcond */ /*! @} - end defgroup vp8_encoder */